#pragma once

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <utility>
#include <vector>

#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#endif

// Wait strategies for controlling thread blocking behavior when no work is
// available
enum class WaitStrategy {
  // Never block - threads busy-wait (spin) when no work is available.
  // Stage threads always use 100% CPU, even when idle.
  // Requires dedicated CPU cores to avoid scheduler thrashing.
  // Use when: latency is critical and you have spare cores.
  Never,

  // Block only when all upstream stages are idle (no new work entering the
  // pipeline).
  // Downstream threads busy-wait if upstream has work, even if none of it has
  // reached their stage yet.
  // Eliminates futex notifications between stages and drops to 0% CPU when
  // idle.
  // Requires dedicated cores to avoid priority inversion while the pipeline
  // has work.
  // Use when: high throughput with spare cores and sustained workloads.
  WaitIfUpstreamIdle,

  // Block when individual stages are empty (original behavior).
  // Each stage waits independently on its input sources.
  // Safe for shared CPU environments, works well with variable workloads.
  // Use when: general purpose, shared cores, or unpredictable workloads.
  WaitIfStageEmpty,
};

// Core thread state
struct ThreadState {
  alignas(128) std::atomic<uint32_t> pops{0};
  uint32_t local_pops{0};
  std::vector<uint32_t> last_push_read;
  bool last_stage;
};

// Runtime topology configuration for dynamic pipelines
//
// This class defines a pipeline topology at runtime:
// - Stage and thread calculations are done at runtime
// - Flexible configuration: the topology can be set via the constructor
// - Dynamic arrays with runtime bounds checking
// - A single implementation works for any topology
//
// Example: PipelineTopology({1, 4, 2}) creates:
// - Stage 0: 1 thread (index 0)
// - Stage 1: 4 threads (indices 1-4)
// - Stage 2: 2 threads (indices 5-6)
// - Total: 7 threads across 3 stages
struct PipelineTopology {
  const std::vector<int> threads_per_stage;
  const int num_stages;
  const std::vector<int> stage_offsets;
  const int total_threads;

  explicit PipelineTopology(std::vector<int> threads_per_stage_)
      : threads_per_stage(validate_and_move(std::move(threads_per_stage_))),
        num_stages(static_cast<int>(threads_per_stage.size())),
        stage_offsets(build_stage_offsets(threads_per_stage)),
        total_threads(build_total_threads(threads_per_stage)) {}

  // Runtime stage offset calculation
  int stage_offset(int stage) const {
    if (stage < 0 || stage >= num_stages) {
      std::abort(); // Stage index out of bounds
    }
    return stage_offsets[stage];
  }

  // Runtime thread index calculation
  int thread_index(int stage, int thread) const {
    if (stage < 0 || stage >= num_stages) {
      std::abort(); // Stage index out of bounds
    }
    if (thread < 0 || thread >= threads_per_stage[stage]) {
      std::abort(); // Thread index out of bounds
    }
    return stage_offsets[stage] + thread;
  }

  // Runtime previous stage thread count (stage 0 sees the external producers
  // as a single source)
  int prev_stage_thread_count(int stage) const {
    if (stage < 0 || stage >= num_stages) {
      std::abort(); // Stage index out of bounds
    }
    if (stage == 0) {
      return 1;
    } else {
      return threads_per_stage[stage - 1];
    }
  }

private:
  static std::vector<int> validate_and_move(std::vector<int> threads) {
    if (threads.empty()) {
      std::abort(); // Must specify at least one stage
    }
    for (int count : threads) {
      if (count <= 0) {
        std::abort(); // All stages must have at least one thread
      }
    }
    return threads;
  }

  static std::vector<int>
  build_stage_offsets(const std::vector<int> &threads_per_stage) {
    std::vector<int> offsets(threads_per_stage.size());
    int offset = 0;
    for (size_t i = 0; i < threads_per_stage.size(); ++i) {
      offsets[i] = offset;
      offset += threads_per_stage[i];
    }
    return offsets;
  }

  static int build_total_threads(const std::vector<int> &threads_per_stage) {
    int total = 0;
    for (int count : threads_per_stage) {
      total += count;
    }
    return total;
  }
};
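// A minimal sketch of the index math above, assuming the {1, 4, 2} topology
// from the example comment (the variable name is illustrative; the values
// follow directly from the definitions):
//
//   PipelineTopology topo({1, 4, 2});
//   assert(topo.num_stages == 3);
//   assert(topo.total_threads == 7);
//   assert(topo.stage_offset(1) == 1);            // stage 1 starts at index 1
//   assert(topo.thread_index(2, 1) == 6);         // last thread overall
//   assert(topo.prev_stage_thread_count(0) == 1); // producers count as one
//   assert(topo.prev_stage_thread_count(2) == 4); // stage 1 has 4 threads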
// Pipeline algorithms - runtime-configurable versions
namespace PipelineAlgorithms {

inline uint32_t calculate_safe_len(WaitStrategy wait_strategy,
                                   const PipelineTopology &topology, int stage,
                                   int thread_in_stage,
                                   std::vector<ThreadState> &all_threads,
                                   std::atomic<uint32_t> &pushes,
                                   bool may_block) {
  int thread_idx = topology.thread_index(stage, thread_in_stage);
  auto &thread = all_threads[thread_idx];
  uint32_t safe_len = UINT32_MAX;
  int prev_stage_threads = topology.prev_stage_thread_count(stage);

  // Runtime loop over previous stage threads
  for (int i = 0; i < prev_stage_threads; ++i) {
    std::atomic<uint32_t> &last_push = [&]() -> std::atomic<uint32_t> & {
      if (stage == 0) {
        return pushes;
      } else {
        int prev_thread_idx = topology.thread_index(stage - 1, i);
        return all_threads[prev_thread_idx].pops;
      }
    }();
    if (thread.last_push_read[i] == thread.local_pops) {
      thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
      if (thread.last_push_read[i] == thread.local_pops) {
        if (!may_block) {
          safe_len = 0;
          return safe_len;
        }
        if (wait_strategy == WaitStrategy::Never) {
          // Empty - busy wait
        } else if (wait_strategy == WaitStrategy::WaitIfUpstreamIdle) {
          // We're allowed to spin as long as we eventually drop to 0% CPU
          // usage when idle.
          uint32_t push;
          bool should_wait = true;
          for (int j = 0; j < 100000; ++j) {
            push = pushes.load(std::memory_order_relaxed);
            if (push != thread.local_pops) {
              should_wait = false;
              break;
            }
#if defined(__x86_64__) || defined(_M_X64)
            _mm_pause();
#endif
          }
          if (should_wait) {
            pushes.wait(push, std::memory_order_relaxed);
          }
        } else { // WaitStrategy::WaitIfStageEmpty
          last_push.wait(thread.last_push_read[i], std::memory_order_relaxed);
        }
        thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
      }
    }
    safe_len = std::min(safe_len, thread.last_push_read[i] - thread.local_pops);
  }
  return safe_len;
}

inline void update_thread_pops(WaitStrategy wait_strategy,
                               const PipelineTopology &topology, int stage,
                               int thread_in_stage,
                               std::vector<ThreadState> &all_threads,
                               uint32_t local_pops) {
  int thread_idx = topology.thread_index(stage, thread_in_stage);
  auto &thread_state = all_threads[thread_idx];
  if (wait_strategy == WaitStrategy::WaitIfStageEmpty) {
    thread_state.pops.store(local_pops, std::memory_order_seq_cst);
    thread_state.pops.notify_all();
  } else if (stage == topology.num_stages - 1) { // last stage
    thread_state.pops.store(local_pops, std::memory_order_seq_cst);
    thread_state.pops.notify_all();
  } else {
    thread_state.pops.store(local_pops, std::memory_order_release);
  }
}

inline int check_producer_capacity(const PipelineTopology &topology,
                                   std::vector<ThreadState> &all_threads,
                                   uint32_t slot, uint32_t size,
                                   uint32_t slot_count, bool block) {
  int last_stage = topology.num_stages - 1;
  int last_stage_offset = topology.stage_offset(last_stage);
  int last_stage_thread_count = topology.threads_per_stage[last_stage];
  for (int i = 0; i < last_stage_thread_count; ++i) {
    auto &thread = all_threads[last_stage_offset + i];
    uint32_t pops = thread.pops.load(std::memory_order_acquire);
    if (slot + size - pops > slot_count) {
      if (!block) {
        return 2; // Cannot proceed
      }
      thread.pops.wait(pops, std::memory_order_relaxed);
      return 1; // Should retry
    }
  }
  return 0; // Can proceed
}

} // namespace PipelineAlgorithms
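// A worked example of the check_producer_capacity test above, with
// illustrative numbers: the counters grow monotonically and only their
// differences are masked into the ring, so unsigned subtraction handles
// wraparound.
//
//   // slot_count = 1024, producer at slot = 1030 requests size = 4,
//   // slowest final-stage thread has pops = 10:
//   //   1030 + 4 - 10 == 1024  ->  not > 1024, the reservation fits
//   // with pops == 9:
//   //   1030 + 4 - 9  == 1025  ->  > 1024, ring full: wait (block == true)
//   //                              or return an empty guard (block == false)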
// Multi-stage lock-free pipeline for inter-thread communication with
// runtime-configurable topology and wait strategy.
//
// Overview:
// - Items flow from producers through multiple processing stages (stage 0 ->
//   stage 1 -> ... -> final stage)
// - Each stage can have multiple worker threads processing items in parallel
// - Uses a shared ring buffer with atomic counters for lock-free coordination
// - Supports batch processing for efficiency
// - Runtime-configurable topology and wait strategy via constructor parameters
//
// Architecture:
// - Producers: External threads that add items to the pipeline via push()
// - Stages: Processing stages numbered 0, 1, 2, ... that consume items via
//   acquire(stage, thread)
// - Items flow: Producers -> Stage 0 -> Stage 1 -> ... -> Final Stage
//
// Usage Pattern:
//   ThreadPipeline<Item> pipeline(WaitStrategy::WaitIfStageEmpty, {1, 4, 2},
//                                 lgSlotCount);
//
//   // Producer threads (add items for stage 0 to consume):
//   auto guard = pipeline.push(batchSize, /*block=*/true);
//   for (auto& item : guard.batch) {
//     // Initialize item data
//   }
//   // Guard destructor publishes the batch to stage 0 consumers
//
//   // Stage worker threads (process items and pass to the next stage):
//   auto guard = pipeline.acquire(stage, thread, maxBatch, /*may_block=*/true);
//   for (auto& item : guard.batch) {
//     // Process item
//   }
//   // Guard destructor marks items as consumed and available to the next
//   // stage
//
// Multi-Thread Stage Processing:
// When a stage has multiple threads (e.g., {1, 1, 1, 2} = 2 threads in stage
// 3):
//
// OVERLAPPING BATCHES - EACH THREAD SEES EVERY ENTRY:
// - Multiple threads in the same stage get OVERLAPPING batches from the ring
//   buffer
// - Thread 0: calls acquire(3, 0) - gets a batch from ring positions 100-110
// - Thread 1: calls acquire(3, 1) - gets a batch from ring positions 100-110
//   (SAME)
// - Both threads see the same entries and must coordinate processing
//
// PARTITIONING STRATEGIES:
// Choose your partitioning approach based on your use case:
//
// 1. Ring buffer position-based partitioning:
//      for (auto it = batch.begin(); it != batch.end(); ++it) {
//        if (it.index() % 2 != thread_index) continue; // Skip entries for
//                                                      // other threads
//        process(*it); // Process only entries assigned to this thread
//      }
//
// 2. Entry content-based partitioning:
//      for (auto& item : guard.batch) {
//        if (hash(item.connection_id) % 2 != thread_index) continue;
//        process(item); // Process based on entry properties
//      }
//
// 3. Process all entries (when each thread does different work):
//      for (auto& item : guard.batch) {
//        process(item); // Both threads process all items, but differently
//      }
//
// Common Partitioning Patterns:
// - Position-based: it.index() % num_threads == thread_index
// - Hash-based: hash(item.key) % num_threads == thread_index
// - Type-based: item.type == MY_THREAD_TYPE
// - Load balancing: assign work based on thread load
// - All entries: each thread processes all items but performs different
//   operations
//
// Note: it.index() returns the position in the ring buffer
// (0 to buffer_size - 1)
//
// Memory Model:
// - Ring buffer size must be a power of 2 for efficient masking
// - Actual ring slots are accessed via: index & (slot_count - 1)
// - 128-byte aligned atomics prevent false sharing between CPU cache lines
//
// Thread Safety:
// - Fully lock-free using atomic operations with acquire/release memory
//   ordering
// - Uses C++20 atomic wait/notify for efficient blocking when no work is
//   available
// - RAII guards ensure proper cleanup even with exceptions
template <typename T>
struct ThreadPipeline {
  // Constructor
  // wait_strategy: blocking behavior when no work is available
  // threads_per_stage: number of threads in each stage (e.g., {1, 4, 2})
  // lgSlotCount: log2 of ring buffer size (e.g., 10 -> 1024 slots)
  // Note: Producer threads are external to the pipeline and not counted in
  // threads_per_stage
  explicit ThreadPipeline(WaitStrategy wait_strategy,
                          std::vector<int> threads_per_stage, int lgSlotCount)
      : wait_strategy_(wait_strategy), topology_(std::move(threads_per_stage)),
        slot_count(1 << lgSlotCount), slot_count_mask(slot_count - 1),
        ring(slot_count), all_threads(topology_.total_threads) {
    // Otherwise we can't tell the difference between full and empty.
    assert(!(slot_count_mask & 0x80000000));
    initialize_all_threads();
  }

  ThreadPipeline(ThreadPipeline const &) = delete;
  ThreadPipeline &operator=(ThreadPipeline const &) = delete;
  ThreadPipeline(ThreadPipeline &&) = delete;
  ThreadPipeline &operator=(ThreadPipeline &&) = delete;

  struct Batch {
    Batch() : ring(), begin_(), end_() {}

    struct Iterator {
      using iterator_category = std::random_access_iterator_tag;
      using difference_type = std::ptrdiff_t;
      using value_type = T;
      using pointer = value_type *;
      using reference = value_type &;

      reference operator*() const {
        return (*ring)[index_ & (ring->size() - 1)];
      }
      pointer operator->() const {
        return &(*ring)[index_ & (ring->size() - 1)];
      }

      Iterator &operator++() {
        ++index_;
        return *this;
      }
      Iterator operator++(int) {
        auto tmp = *this;
        ++(*this);
        return tmp;
      }
      Iterator &operator--() {
        --index_;
        return *this;
      }
      Iterator operator--(int) {
        auto tmp = *this;
        --(*this);
        return tmp;
      }
      Iterator &operator+=(difference_type n) {
        index_ += n;
        return *this;
      }
      Iterator &operator-=(difference_type n) {
        index_ -= n;
        return *this;
      }
      Iterator operator+(difference_type n) const {
        return Iterator(index_ + n, ring);
      }
      Iterator operator-(difference_type n) const {
        return Iterator(index_ - n, ring);
      }
      difference_type operator-(const Iterator &rhs) const {
        assert(ring == rhs.ring);
        return static_cast<difference_type>(index_) -
               static_cast<difference_type>(rhs.index_);
      }
      friend Iterator operator+(difference_type n, const Iterator &iter) {
        return iter + n;
      }
      friend bool operator==(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return lhs.index_ == rhs.index_;
      }
      friend bool operator!=(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return lhs.index_ != rhs.index_;
      }
      friend bool operator<(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) < 0;
      }
      friend bool operator<=(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) <= 0;
      }
      friend bool operator>(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) > 0;
      }
      friend bool operator>=(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) >= 0;
      }

      // Position in the ring buffer (0 to ring size - 1).
      uint32_t index() const { return index_ & (ring->size() - 1); }

    private:
      Iterator(uint32_t index, std::vector<T> *ring)
          : index_(index), ring(ring) {}
      friend struct Batch;

      uint32_t index_;
      std::vector<T> *ring;
    };

    [[nodiscard]] Iterator begin() { return Iterator(begin_, ring); }
    [[nodiscard]] Iterator end() { return Iterator(end_, ring); }
    [[nodiscard]] size_t size() const { return end_ - begin_; }
    [[nodiscard]] bool empty() const { return end_ == begin_; }
    T &operator[](uint32_t n) {
      return (*ring)[(begin_ + n) & (ring->size() - 1)];
    }

  private:
    friend struct ThreadPipeline;
    Batch(std::vector<T> *ring, uint32_t begin_, uint32_t end_)
        : ring(ring), begin_(begin_), end_(end_) {}

    std::vector<T> *ring;
    uint32_t begin_;
    uint32_t end_;
  };
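  // A minimal sketch of draining a Batch through its iterator interface
  // (process() and the two-way split are illustrative assumptions, not part
  // of this header). A batch may wrap around the end of the ring, which is
  // why iteration and operator[] mask positions instead of exposing raw
  // offsets:
  //
  //   auto guard = pipeline.acquire(/*stage=*/1, /*thread=*/thread_in_stage);
  //   for (auto it = guard.batch.begin(); it != guard.batch.end(); ++it) {
  //     if (it.index() % 2 != static_cast<uint32_t>(thread_in_stage)) {
  //       continue; // position-based split across the stage's two threads
  //     }
  //     process(*it);
  //   }
  //   // guard.batch.size() is the number of visible items; guard.batch[0]
  //   // is the first of them, regardless of where the ring wrapped.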
private:
  WaitStrategy wait_strategy_;
  PipelineTopology topology_;
  alignas(128) std::atomic<uint32_t> slots{0};
  alignas(128) std::atomic<uint32_t> pushes{0};
  const uint32_t slot_count;
  const uint32_t slot_count_mask;
  std::vector<T> ring;
  std::vector<ThreadState> all_threads;

  void initialize_all_threads() {
    for (int stage = 0; stage < topology_.num_stages; ++stage) {
      init_stage_threads(stage);
    }
  }

  void init_stage_threads(int stage) {
    int stage_offset = topology_.stage_offset(stage);
    int stage_thread_count = topology_.threads_per_stage[stage];
    int prev_stage_threads = topology_.prev_stage_thread_count(stage);
    bool is_last_stage = (stage == topology_.num_stages - 1);
    for (int thread = 0; thread < stage_thread_count; ++thread) {
      auto &thread_state = all_threads[stage_offset + thread];
      thread_state.last_stage = is_last_stage;
      thread_state.last_push_read = std::vector<uint32_t>(prev_stage_threads);
    }
  }

  Batch acquire_helper(int stage, int thread, uint32_t maxBatch,
                       bool may_block) {
    int thread_idx = topology_.thread_index(stage, thread);
    auto &thread_state = all_threads[thread_idx];
    uint32_t begin = thread_state.local_pops & slot_count_mask;
    uint32_t len = PipelineAlgorithms::calculate_safe_len(
        wait_strategy_, topology_, stage, thread, all_threads, pushes,
        may_block);
    if (maxBatch != 0) {
      len = std::min(len, maxBatch);
    }
    if (len == 0) {
      return Batch{};
    }
    auto result = Batch{&ring, begin, begin + len};
    thread_state.local_pops += len;
    return result;
  }

public:
  struct StageGuard {
    Batch batch;

    ~StageGuard() {
      // A moved-from guard (or one constructed with an empty batch) has
      // pipeline == nullptr and must not publish.
      if (pipeline != nullptr) {
        PipelineAlgorithms::update_thread_pops(
            pipeline->wait_strategy_, pipeline->topology_, stage, thread,
            pipeline->all_threads, local_pops);
      }
    }

    StageGuard(StageGuard const &) = delete;
    StageGuard &operator=(StageGuard const &) = delete;
    StageGuard(StageGuard &&other) noexcept
        : batch(other.batch), local_pops(other.local_pops), stage(other.stage),
          thread(other.thread),
          pipeline(std::exchange(other.pipeline, nullptr)) {}
    StageGuard &operator=(StageGuard &&other) noexcept {
      batch = other.batch;
      local_pops = other.local_pops;
      stage = other.stage;
      thread = other.thread;
      pipeline = std::exchange(other.pipeline, nullptr);
      return *this;
    }

  private:
    friend struct ThreadPipeline;

    uint32_t local_pops;
    int stage;
    int thread;
    ThreadPipeline *pipeline;

    StageGuard(Batch batch, uint32_t local_pops, int stage, int thread,
               ThreadPipeline *pipeline)
        : batch(batch), local_pops(local_pops), stage(stage), thread(thread),
          pipeline(batch.empty() ? nullptr : pipeline) {}
  };
  struct ProducerGuard {
    Batch batch;

    ~ProducerGuard() {
      if (tp == nullptr) {
        return;
      }
      // Publish in reservation order: wait until all earlier reservations
      // have been published before advancing the shared push counter.
      for (;;) {
        uint32_t p = tp->pushes.load(std::memory_order_acquire);
        if (p == old_slot) {
          break;
        }
        tp->pushes.wait(p, std::memory_order_relaxed);
      }
      tp->pushes.store(new_slot, std::memory_order_seq_cst);
      tp->pushes.notify_all();
    }

  private:
    friend struct ThreadPipeline;

    ProducerGuard() : batch(), tp() {}
    ProducerGuard(Batch batch, ThreadPipeline *tp, uint32_t old_slot,
                  uint32_t new_slot)
        : batch(batch), tp(tp), old_slot(old_slot), new_slot(new_slot) {}

    ThreadPipeline *const tp;
    uint32_t old_slot;
    uint32_t new_slot;
  };

  // Acquire a batch of items for processing by a consumer thread.
  // stage: which processing stage (0 = first consumer stage after producers)
  // thread: thread ID within the stage (0 to threads_per_stage[stage]-1)
  // maxBatch: maximum items to acquire (0 = no limit)
  // may_block: whether to block waiting for items (false = return an empty
  // batch if none are available)
  // Returns: StageGuard with a batch of items to process
  [[nodiscard]] StageGuard acquire(int stage, int thread, int maxBatch = 0,
                                   bool may_block = true) {
    auto batch = acquire_helper(stage, thread, maxBatch, may_block);
    int thread_idx = topology_.thread_index(stage, thread);
    uint32_t local_pops = all_threads[thread_idx].local_pops;
    return StageGuard{std::move(batch), local_pops, stage, thread, this};
  }

  // Reserve slots in the ring buffer for a producer thread to fill with items.
  // This is used by producer threads to add new items to stage 0 of the
  // pipeline.
  //
  // size: number of slots to reserve (must be > 0 and <= ring buffer capacity)
  // block: if true, blocks when the ring buffer is full; if false, returns an
  // empty guard
  // Returns: ProducerGuard with exclusive access to the reserved slots
  //
  // Usage: Fill items in the returned batch, then let the guard destructor
  // publish them. The guard destructor ensures items are published in the
  // correct order.
  //
  // Preconditions:
  // - size > 0 (must request at least one slot)
  // - size <= slot_count (cannot request more slots than ring buffer capacity)
  // Violating the preconditions results in program termination via abort().
  [[nodiscard]] ProducerGuard push(uint32_t const size, bool block) {
    if (size == 0) {
      std::abort();
    }
    if (size > slot_count) {
      std::abort();
    }
    uint32_t slot;
    uint32_t begin;
    for (;;) {
      slot = slots.load(std::memory_order_relaxed);
      begin = slot & slot_count_mask;
      int capacity_result = PipelineAlgorithms::check_producer_capacity(
          topology_, all_threads, slot, size, slot_count, block);
      if (capacity_result == 1) {
        continue; // Woke up after waiting; re-check capacity
      }
      if (capacity_result == 2) {
        return ProducerGuard{}; // Ring full and non-blocking: empty guard
      }
      if (slots.compare_exchange_weak(slot, slot + size,
                                      std::memory_order_relaxed,
                                      std::memory_order_relaxed)) {
        break;
      }
    }
    return ProducerGuard{Batch{&ring, begin, begin + size}, this, slot,
                         slot + size};
  }
};
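// A minimal end-to-end sketch, assuming a user-defined Item type and a
// two-stage {1, 1} topology (Item, example(), and the thread bodies are
// illustrative, not part of this header):
//
//   #include <thread>
//
//   struct Item { uint64_t value; };
//
//   void example() {
//     ThreadPipeline<Item> pipeline(WaitStrategy::WaitIfStageEmpty, {1, 1},
//                                   /*lgSlotCount=*/10);
//
//     std::thread producer([&] {
//       for (uint64_t v = 0; v < 1000; ++v) {
//         auto guard = pipeline.push(/*size=*/1, /*block=*/true);
//         guard.batch[0].value = v;
//       } // each guard destructor publishes the slot to stage 0
//     });
//
//     std::thread stage0([&] {
//       for (uint64_t seen = 0; seen < 1000;) {
//         auto guard = pipeline.acquire(/*stage=*/0, /*thread=*/0);
//         for (auto &item : guard.batch) {
//           item.value *= 2; // transform in place for stage 1
//           ++seen;
//         }
//       } // guard destructor hands the batch to stage 1
//     });
//
//     std::thread stage1([&] {
//       for (uint64_t seen = 0; seen < 1000;) {
//         auto guard = pipeline.acquire(/*stage=*/1, /*thread=*/0);
//         for (auto &item : guard.batch) {
//           // consume item.value
//           ++seen;
//         }
//       }
//     });
//
//     producer.join();
//     stage0.join();
//     stage1.join();
//   }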