#pragma once

#include <algorithm> // std::min
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iterator>
#include <utility>
#include <vector>

#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#endif

// Wait strategies for controlling thread blocking behavior when no work is
// available.
enum class WaitStrategy {
  // Never block - threads busy-wait (spin) when no work is available.
  // Stage threads will always use 100% CPU even when idle.
  // Requires dedicated CPU cores to avoid scheduler thrashing.
  // Use when: latency is critical and you have spare cores.
  Never,

  // Block only when all upstream stages are idle (no new work entering the
  // pipeline). Downstream threads busy-wait if upstream has work, even if
  // none of it is for their stage yet. Eliminates futex notifications between
  // stages and drops to 0% CPU when the whole pipeline is idle.
  // Requires dedicated cores to avoid priority inversion while the pipeline
  // has work.
  // Use when: high throughput with spare cores and sustained workloads.
  WaitIfUpstreamIdle,

  // Block when individual stages are empty (original behavior).
  // Each stage waits independently on its input sources.
  // Safe for shared CPU environments, works well with variable workloads.
  // Use when: general purpose, shared cores, or unpredictable workloads.
  WaitIfStageEmpty,
};
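
// Illustrative sketch (assumes the ThreadPipeline<T> defined later in this
// header and a hypothetical payload type Item); the strategy is fixed at
// construction:
//
//   // Dedicated cores, latency-critical:
//   ThreadPipeline<Item> fast(WaitStrategy::Never, {1, 2}, 10);
//   // Shared cores, general purpose:
//   ThreadPipeline<Item> general(WaitStrategy::WaitIfStageEmpty, {1, 2}, 10);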

// Core thread state
struct ThreadState {
  alignas(128) std::atomic<uint32_t> pops{0};
  uint32_t local_pops{0};
  std::vector<uint32_t> last_push_read;
  bool last_stage;
};

// Runtime topology configuration for dynamic pipelines
//
// This class defines a pipeline topology at runtime:
// - Stage and thread calculations done at runtime
// - Flexible configuration: topology can be set via constructor
// - Dynamic arrays with runtime bounds checking
// - Single implementation works for any topology
//
// Example: PipelineTopology({1, 4, 2}) creates:
// - Stage 0: 1 thread (index 0)
// - Stage 1: 4 threads (indices 1-4)
// - Stage 2: 2 threads (indices 5-6)
// - Total: 7 threads across 3 stages
struct PipelineTopology {
  const std::vector<int> threads_per_stage;
  const int num_stages;
  const std::vector<int> stage_offsets;
  const int total_threads;

  explicit PipelineTopology(std::vector<int> threads_per_stage_)
      : threads_per_stage(validate_and_move(std::move(threads_per_stage_))),
        num_stages(static_cast<int>(threads_per_stage.size())),
        stage_offsets(build_stage_offsets(threads_per_stage)),
        total_threads(build_total_threads(threads_per_stage)) {}

  // Runtime stage offset calculation
  int stage_offset(int stage) const {
    if (stage < 0 || stage >= num_stages) {
      std::abort(); // Stage index out of bounds
    }
    return stage_offsets[stage];
  }

  // Runtime thread index calculation
  int thread_index(int stage, int thread) const {
    if (stage < 0 || stage >= num_stages) {
      std::abort(); // Stage index out of bounds
    }
    if (thread < 0 || thread >= threads_per_stage[stage]) {
      std::abort(); // Thread index out of bounds
    }
    return stage_offsets[stage] + thread;
  }

  // Runtime previous stage thread count
  int prev_stage_thread_count(int stage) const {
    if (stage < 0 || stage >= num_stages) {
      std::abort(); // Stage index out of bounds
    }
    if (stage == 0) {
      return 1;
    } else {
      return threads_per_stage[stage - 1];
    }
  }

private:
  static std::vector<int> validate_and_move(std::vector<int> threads) {
    if (threads.empty()) {
      std::abort(); // Must specify at least one stage
    }
    for (int count : threads) {
      if (count <= 0) {
        std::abort(); // All stages must have at least one thread
      }
    }
    return threads;
  }

  static std::vector<int>
  build_stage_offsets(const std::vector<int> &threads_per_stage) {
    std::vector<int> offsets(threads_per_stage.size());
    int offset = 0;
    for (size_t i = 0; i < threads_per_stage.size(); ++i) {
      offsets[i] = offset;
      offset += threads_per_stage[i];
    }
    return offsets;
  }

  static int build_total_threads(const std::vector<int> &threads_per_stage) {
    int total = 0;
    for (int count : threads_per_stage) {
      total += count;
    }
    return total;
  }
};
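
// Illustrative check of the index math for the {1, 4, 2} example above:
//   PipelineTopology t({1, 4, 2});
//   t.stage_offset(2);            // == 5: stage 2 starts after 1 + 4 threads
//   t.thread_index(1, 3);         // == 4: 4th thread of stage 1 -> global index 4
//   t.prev_stage_thread_count(0); // == 1: stage 0 reads from the single push counter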

// Pipeline algorithms - runtime configurable versions
namespace PipelineAlgorithms {

inline uint32_t calculate_safe_len(WaitStrategy wait_strategy,
                                   const PipelineTopology &topology, int stage,
                                   int thread_in_stage,
                                   std::vector<ThreadState> &all_threads,
                                   std::atomic<uint32_t> &pushes,
                                   bool may_block) {
  int thread_idx = topology.thread_index(stage, thread_in_stage);
  auto &thread = all_threads[thread_idx];
  uint32_t safe_len = UINT32_MAX;

  int prev_stage_threads = topology.prev_stage_thread_count(stage);

  // Runtime loop over previous stage threads
  for (int i = 0; i < prev_stage_threads; ++i) {
    std::atomic<uint32_t> &last_push = [&]() -> std::atomic<uint32_t> & {
      if (stage == 0) {
        return pushes;
      } else {
        int prev_thread_idx = topology.thread_index(stage - 1, i);
        return all_threads[prev_thread_idx].pops;
      }
    }();

    if (thread.last_push_read[i] == thread.local_pops) {
      thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
      if (thread.last_push_read[i] == thread.local_pops) {
        if (!may_block) {
          safe_len = 0;
          return safe_len;
        }

        if (wait_strategy == WaitStrategy::Never) {
          // Empty - busy wait
        } else if (wait_strategy == WaitStrategy::WaitIfUpstreamIdle) {
          // We're allowed to spin as long as we eventually go to 0% cpu
          // usage on idle
          uint32_t push;
          bool should_wait = true;
          for (int j = 0; j < 100000; ++j) {
            push = pushes.load(std::memory_order_relaxed);
            if (push != thread.local_pops) {
              should_wait = false;
              break;
            }
#if defined(__x86_64__) || defined(_M_X64)
            _mm_pause();
#endif
          }
          if (should_wait) {
            pushes.wait(push, std::memory_order_relaxed);
          }
        } else { // WaitStrategy::WaitIfStageEmpty
          last_push.wait(thread.last_push_read[i], std::memory_order_relaxed);
        }

        thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
      }
    }
    safe_len = std::min(safe_len, thread.last_push_read[i] - thread.local_pops);
  }

  return safe_len;
}
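
// Worked example for the unsigned counter arithmetic above (illustrative): the
// 32-bit counters are allowed to wrap. If a consumer has local_pops ==
// 4294967290 and an upstream source has published ten more items (so its
// counter now reads 4 after wrapping), then last_push_read[i] - local_pops
// wraps back to 10: ten items are safely readable despite the overflow. The
// result is then min-reduced across all upstream sources.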

inline void update_thread_pops(WaitStrategy wait_strategy,
                               const PipelineTopology &topology, int stage,
                               int thread_in_stage,
                               std::vector<ThreadState> &all_threads,
                               uint32_t local_pops) {
  int thread_idx = topology.thread_index(stage, thread_in_stage);
  auto &thread_state = all_threads[thread_idx];

  if (wait_strategy == WaitStrategy::WaitIfStageEmpty) {
    // Downstream consumers may be blocked on this counter, so publish with a
    // full barrier and wake them.
    thread_state.pops.store(local_pops, std::memory_order_seq_cst);
    thread_state.pops.notify_all();
  } else if (stage == topology.num_stages - 1) { // last stage
    // Producers block on last-stage pops when the ring is full, so they must
    // always be notified.
    thread_state.pops.store(local_pops, std::memory_order_seq_cst);
    thread_state.pops.notify_all();
  } else {
    // Under the other strategies nobody sleeps on intermediate counters; a
    // release store is enough.
    thread_state.pops.store(local_pops, std::memory_order_release);
  }
}

inline int check_producer_capacity(const PipelineTopology &topology,
                                   std::vector<ThreadState> &all_threads,
                                   uint32_t slot, uint32_t size,
                                   uint32_t slot_count, bool block) {
  int last_stage = topology.num_stages - 1;
  int last_stage_offset = topology.stage_offset(last_stage);
  int last_stage_thread_count = topology.threads_per_stage[last_stage];

  for (int i = 0; i < last_stage_thread_count; ++i) {
    auto &thread = all_threads[last_stage_offset + i];
    uint32_t pops = thread.pops.load(std::memory_order_acquire);
    if (slot + size - pops > slot_count) {
      if (!block) {
        return 2; // Cannot proceed
      }
      thread.pops.wait(pops, std::memory_order_relaxed);
      return 1; // Should retry
    }
  }
  return 0; // Can proceed
}
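
// Worked example (illustrative): with slot_count == 1024, suppose a producer
// holds slot == 2040 and wants size == 16 while the slowest last-stage
// consumer reports pops == 1030. Then slot + size - pops == 1026 > 1024, so
// accepting the reservation would overwrite unconsumed slots; the producer
// either waits and retries (return 1) or, when block == false, gives up
// (return 2).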

} // namespace PipelineAlgorithms

// Multi-stage lock-free pipeline for inter-thread communication
// with runtime-configurable topology and wait strategy.
//
// Overview:
// - Items flow from producers through multiple processing stages (stage 0 ->
//   stage 1 -> ... -> final stage)
// - Each stage can have multiple worker threads processing items in parallel
// - Uses a shared ring buffer with atomic counters for lock-free coordination
// - Supports batch processing for efficiency
// - Runtime-configurable topology and wait strategy via constructor parameters
//
// Architecture:
// - Producers: External threads that add items to the pipeline via push()
// - Stages: Processing stages numbered 0, 1, 2, ... that consume items via
//   acquire(stage, thread)
// - Items flow: Producers -> Stage 0 -> Stage 1 -> ... -> Final Stage
//
// Usage Pattern:
//   ThreadPipeline<Item> pipeline(WaitStrategy::WaitIfStageEmpty, {1, 4, 2},
//                                 lgSlotCount);
//
//   // Producer threads (add items for stage 0 to consume):
//   auto guard = pipeline.push(batchSize, /*block=*/true);
//   for (auto& item : guard.batch) {
//     // Initialize item data
//   }
//   // Guard destructor publishes batch to stage 0 consumers
//
//   // Stage worker threads (process items and pass to next stage):
//   auto guard = pipeline.acquire(stage, thread, maxBatch, /*may_block=*/true);
//   for (auto& item : guard.batch) {
//     // Process item
//   }
//   // Guard destructor marks items as consumed and available to next stage
//
// Multi-Thread Stage Processing:
// When a stage has multiple threads (e.g., {1, 1, 1, 2} = 2 threads in
// stage 3):
//
// OVERLAPPING BATCHES - EACH THREAD SEES EVERY ENTRY:
// - Multiple threads in the same stage get OVERLAPPING batches from the ring
//   buffer
// - Thread 0: calls acquire(3, 0) - gets batch from ring positions 100-110
// - Thread 1: calls acquire(3, 1) - gets batch from ring positions 100-110
//   (SAME)
// - Both threads see the same entries and must coordinate processing
//
// PARTITIONING STRATEGIES:
// Choose your partitioning approach based on your use case:
//
// 1. Ring buffer position-based partitioning:
//      for (auto it = batch.begin(); it != batch.end(); ++it) {
//        if (it.index() % 2 != thread_index) continue; // Skip other threads' entries
//        process(*it); // Process only entries assigned to this thread
//      }
//
// 2. Entry content-based partitioning:
//      for (auto& item : guard.batch) {
//        if (hash(item.connection_id) % 2 != thread_index) continue;
//        process(item); // Process based on entry properties
//      }
//
// 3. Process all entries (when each thread does different work):
//      for (auto& item : guard.batch) {
//        process(item); // Both threads process all items, but differently
//      }
//
// Common Partitioning Patterns:
// - Position-based: it.index() % num_threads == thread_index
// - Hash-based: hash(item.key) % num_threads == thread_index
// - Type-based: item.type == MY_THREAD_TYPE
// - Load balancing: assign work based on thread load
// - All entries: each thread processes all items but performs different
//   operations
//
// Note: it.index() returns the position in the ring buffer (0 to buffer_size-1)
//
// Memory Model:
// - Ring buffer size must be a power of 2 for efficient masking
// - Actual ring slots accessed via: index & (slotCount - 1)
// - 128-byte aligned atomics prevent false sharing between CPU cache lines
//
// Thread Safety:
// - Fully lock-free using atomic operations with acquire/release memory
//   ordering
// - Uses C++20 atomic wait/notify for efficient blocking when no work is
//   available
// - RAII guards ensure proper cleanup even with exceptions
template <class T> struct ThreadPipeline {
  // Constructor
  // wait_strategy: blocking behavior when no work is available
  // threads_per_stage: number of threads in each stage (e.g., {1, 4, 2})
  // lgSlotCount: log2 of ring buffer size (e.g., 10 -> 1024 slots)
  // Note: Producer threads are external to the pipeline and not counted in
  // threads_per_stage
  explicit ThreadPipeline(WaitStrategy wait_strategy,
                          std::vector<int> threads_per_stage, int lgSlotCount)
      : wait_strategy_(wait_strategy), topology_(std::move(threads_per_stage)),
        slot_count(1 << lgSlotCount), slot_count_mask(slot_count - 1),
        ring(slot_count), all_threads(topology_.total_threads) {
    // Otherwise we can't tell the difference between full and empty.
    assert(!(slot_count_mask & 0x80000000));
    initialize_all_threads();
  }

  ThreadPipeline(ThreadPipeline const &) = delete;
  ThreadPipeline &operator=(ThreadPipeline const &) = delete;
  ThreadPipeline(ThreadPipeline &&) = delete;
  ThreadPipeline &operator=(ThreadPipeline &&) = delete;

  struct Batch {
    Batch() : ring(), begin_(), end_() {}

    struct Iterator {
      using iterator_category = std::random_access_iterator_tag;
      using difference_type = std::ptrdiff_t;
      using value_type = T;
      using pointer = value_type *;
      using reference = value_type &;

      reference operator*() const {
        return (*ring)[index_ & (ring->size() - 1)];
      }
      pointer operator->() const {
        return &(*ring)[index_ & (ring->size() - 1)];
      }
      Iterator &operator++() {
        ++index_;
        return *this;
      }
      Iterator operator++(int) {
        auto tmp = *this;
        ++(*this);
        return tmp;
      }
      Iterator &operator--() {
        --index_;
        return *this;
      }
      Iterator operator--(int) {
        auto tmp = *this;
        --(*this);
        return tmp;
      }
      Iterator &operator+=(difference_type n) {
        index_ += n;
        return *this;
      }
      Iterator &operator-=(difference_type n) {
        index_ -= n;
        return *this;
      }
      Iterator operator+(difference_type n) const {
        return Iterator(index_ + n, ring);
      }
      Iterator operator-(difference_type n) const {
        return Iterator(index_ - n, ring);
      }
      difference_type operator-(const Iterator &rhs) const {
        assert(ring == rhs.ring);
        return static_cast<difference_type>(index_) -
               static_cast<difference_type>(rhs.index_);
      }
      friend Iterator operator+(difference_type n, const Iterator &iter) {
        return iter + n;
      }
      friend bool operator==(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return lhs.index_ == rhs.index_;
      }
      friend bool operator!=(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return lhs.index_ != rhs.index_;
      }
      friend bool operator<(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) < 0;
      }
      friend bool operator<=(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) <= 0;
      }
      friend bool operator>(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) > 0;
      }
      friend bool operator>=(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) >= 0;
      }

      uint32_t index() const { return index_ & (ring->size() - 1); }

    private:
      Iterator(uint32_t index, std::vector<T> *const ring)
          : index_(index), ring(ring) {}
      friend struct Batch;
      uint32_t index_;
      std::vector<T> *const ring;
    };

    [[nodiscard]] Iterator begin() { return Iterator(begin_, ring); }
    [[nodiscard]] Iterator end() { return Iterator(end_, ring); }

    [[nodiscard]] size_t size() const { return end_ - begin_; }
    [[nodiscard]] bool empty() const { return end_ == begin_; }
    T &operator[](uint32_t n) {
      return (*ring)[(begin_ + n) & (ring->size() - 1)];
    }

  private:
    friend struct ThreadPipeline;
    Batch(std::vector<T> *const ring, uint32_t begin_, uint32_t end_)
        : ring(ring), begin_(begin_), end_(end_) {}
    std::vector<T> *const ring;
    uint32_t begin_;
    uint32_t end_;
  };

private:
  WaitStrategy wait_strategy_;
  PipelineTopology topology_;

  alignas(128) std::atomic<uint32_t> slots{0};
  alignas(128) std::atomic<uint32_t> pushes{0};
  const uint32_t slot_count;
  const uint32_t slot_count_mask;

  std::vector<T> ring;
  std::vector<ThreadState> all_threads;

  void initialize_all_threads() {
    for (int stage = 0; stage < topology_.num_stages; ++stage) {
      init_stage_threads(stage);
    }
  }

  void init_stage_threads(int stage) {
    int stage_offset = topology_.stage_offset(stage);
    int stage_thread_count = topology_.threads_per_stage[stage];
    int prev_stage_threads = topology_.prev_stage_thread_count(stage);
    bool is_last_stage = (stage == topology_.num_stages - 1);

    for (int thread = 0; thread < stage_thread_count; ++thread) {
      auto &thread_state = all_threads[stage_offset + thread];
      thread_state.last_stage = is_last_stage;
      thread_state.last_push_read = std::vector<uint32_t>(prev_stage_threads);
    }
  }

  Batch acquire_helper(int stage, int thread, uint32_t maxBatch,
                       bool may_block) {
    int thread_idx = topology_.thread_index(stage, thread);
    auto &thread_state = all_threads[thread_idx];

    uint32_t begin = thread_state.local_pops & slot_count_mask;
    uint32_t len = PipelineAlgorithms::calculate_safe_len(
        wait_strategy_, topology_, stage, thread, all_threads, pushes,
        may_block);

    if (maxBatch != 0) {
      len = std::min(len, maxBatch);
    }
    if (len == 0) {
      return Batch{};
    }

    auto result = Batch{&ring, begin, begin + len};
    thread_state.local_pops += len;
    return result;
  }

public:
  struct StageGuard {
    Batch batch;

    ~StageGuard() {
      // A moved-from guard has pipeline == nullptr and must not publish.
      if (pipeline != nullptr && !batch.empty()) {
        PipelineAlgorithms::update_thread_pops(
            pipeline->wait_strategy_, pipeline->topology_, stage, thread,
            pipeline->all_threads, local_pops);
      }
    }

    StageGuard(StageGuard const &) = delete;
    StageGuard &operator=(StageGuard const &) = delete;
    StageGuard(StageGuard &&other) noexcept
        : batch(other.batch), local_pops(other.local_pops), stage(other.stage),
          thread(other.thread),
          pipeline(std::exchange(other.pipeline, nullptr)) {}
    StageGuard &operator=(StageGuard &&other) noexcept {
      batch = other.batch;
      local_pops = other.local_pops;
      stage = other.stage;
      thread = other.thread;
      pipeline = std::exchange(other.pipeline, nullptr);
      return *this;
    }

  private:
    friend struct ThreadPipeline;
    uint32_t local_pops;
    int stage;
    int thread;
    ThreadPipeline *pipeline;

    StageGuard(Batch batch, uint32_t local_pops, int stage, int thread,
               ThreadPipeline *pipeline)
        : batch(batch), local_pops(local_pops), stage(stage), thread(thread),
          pipeline(batch.empty() ? nullptr : pipeline) {}
  };

  struct ProducerGuard {
    Batch batch;

    ~ProducerGuard() {
      if (tp == nullptr) {
        return;
      }
      // Publish in slot order: wait until all earlier reservations have been
      // published, then advance pushes past this batch and wake consumers.
      for (;;) {
        uint32_t p = tp->pushes.load(std::memory_order_acquire);
        if (p == old_slot) {
          break;
        }
        tp->pushes.wait(p, std::memory_order_relaxed);
      }
      tp->pushes.store(new_slot, std::memory_order_seq_cst);
      tp->pushes.notify_all();
    }

  private:
    friend struct ThreadPipeline;
    ProducerGuard() : batch(), tp(), old_slot(), new_slot() {}
    ProducerGuard(Batch batch, ThreadPipeline *tp, uint32_t old_slot,
                  uint32_t new_slot)
        : batch(batch), tp(tp), old_slot(old_slot), new_slot(new_slot) {}
    ThreadPipeline *const tp;
    uint32_t old_slot;
    uint32_t new_slot;
  };

  // Acquire a batch of items for processing by a consumer thread.
  // stage: which processing stage (0 = first consumer stage after producers)
  // thread: thread ID within the stage (0 to threads_per_stage[stage]-1)
  // maxBatch: maximum items to acquire (0 = no limit)
  // may_block: whether to block waiting for items (false = return an empty
  //            batch if none are available)
  // Returns: StageGuard with batch of items to process
  [[nodiscard]] StageGuard acquire(int stage, int thread, int maxBatch = 0,
                                   bool may_block = true) {
    auto batch = acquire_helper(stage, thread, maxBatch, may_block);

    int thread_idx = topology_.thread_index(stage, thread);
    uint32_t local_pops = all_threads[thread_idx].local_pops;

    return StageGuard{std::move(batch), local_pops, stage, thread, this};
  }
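
  // Illustrative call (hypothetical stage/worker IDs): poll stage 1, worker 0
  // for at most 64 items without blocking, and back off if nothing is ready.
  //
  //   auto guard = pipeline.acquire(/*stage=*/1, /*thread=*/0, /*maxBatch=*/64,
  //                                 /*may_block=*/false);
  //   if (guard.batch.empty()) {
  //     // No work available right now; do something else and retry later.
  //   }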

  // Reserve slots in the ring buffer for a producer thread to fill with items.
  // This is used by producer threads to add new items to stage 0 of the
  // pipeline.
  //
  // size: number of slots to reserve (must be > 0 and <= ring buffer capacity)
  // block: if true, blocks when the ring buffer is full; if false, returns an
  //        empty guard
  // Returns: ProducerGuard with exclusive access to the reserved slots
  //
  // Usage: Fill items in the returned batch, then let the guard destructor
  // publish them. The guard destructor ensures items are published in the
  // correct order.
  //
  // Preconditions:
  // - size > 0 (must request at least one slot)
  // - size <= slotCount (cannot request more slots than ring buffer capacity)
  // Violating preconditions results in program termination via abort().
  [[nodiscard]] ProducerGuard push(uint32_t const size, bool block) {
    if (size == 0) {
      std::abort();
    }
    if (size > slot_count) {
      std::abort();
    }

    uint32_t slot;
    uint32_t begin;
    for (;;) {
      slot = slots.load(std::memory_order_relaxed);
      begin = slot & slot_count_mask;

      int capacity_result = PipelineAlgorithms::check_producer_capacity(
          topology_, all_threads, slot, size, slot_count, block);
      if (capacity_result == 1) {
        continue; // Waited for consumers; re-check capacity.
      }
      if (capacity_result == 2) {
        return ProducerGuard{}; // Ring buffer full and block == false.
      }

      if (slots.compare_exchange_weak(slot, slot + size,
                                      std::memory_order_relaxed,
                                      std::memory_order_relaxed)) {
        break;
      }
    }
    return ProducerGuard{Batch{&ring, begin, begin + size}, this, slot,
                         slot + size};
  }
};
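
// End-to-end sketch (illustrative, not part of the API): a two-stage pipeline
// with one external producer thread and one worker per stage. "Item" and the
// processing steps are hypothetical; requires <thread> if compiled.
//
//   struct Item { int value; };
//   ThreadPipeline<Item> pipeline(WaitStrategy::WaitIfStageEmpty, {1, 1}, 10);
//
//   std::thread producer([&] {
//     for (int i = 0; i < 1000; ++i) {
//       auto guard = pipeline.push(/*size=*/1, /*block=*/true);
//       guard.batch[0].value = i;
//     } // each guard destructor publishes the slot to stage 0
//   });
//
//   std::thread stage0([&] {
//     for (int seen = 0; seen < 1000;) {
//       auto guard = pipeline.acquire(/*stage=*/0, /*thread=*/0);
//       for (auto &item : guard.batch) { item.value *= 2; }
//       seen += static_cast<int>(guard.batch.size());
//     } // guard destructor hands the items to stage 1
//   });
//
//   std::thread stage1([&] {
//     for (int seen = 0; seen < 1000;) {
//       auto guard = pipeline.acquire(/*stage=*/1, /*thread=*/0);
//       seen += static_cast<int>(guard.batch.size());
//     } // guard destructor frees the slots for the producer to reuse
//   });
//
//   producer.join(); stage0.join(); stage1.join();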