Files
weaseldb/src/thread_pipeline.hpp

593 lines
21 KiB
C++

#pragma once
#include <algorithm>
#include <array>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iterator>
#include <utility>
#include <vector>
// Policy selecting whether, and on what, a stage thread blocks when it finds
// no work to process.
enum class WaitStrategy {
  // Stage threads never block; an idle thread keeps polling (spinning), so
  // it occupies a core at 100% CPU even when there is nothing to do.
  // Needs dedicated CPU cores to avoid scheduler thrashing.
  // Choose this when latency is critical and spare cores are available.
  Never,

  // Stage threads block only once nothing new is entering the pipeline
  // (all upstream stages idle). While upstream still has work that has not
  // yet reached a thread's stage, that thread busy-waits instead.
  // Avoids futex notifications between stages and drops to 0% CPU when the
  // whole pipeline is idle. Needs dedicated cores to avoid priority
  // inversion while the pipeline has work.
  // Choose this for high throughput with spare cores and sustained load.
  WaitIfUpstreamIdle,

  // Each stage thread blocks independently as soon as its own input is
  // empty (the original behavior).
  // Safe on shared CPUs and well suited to variable workloads.
  // Choose this for general-purpose use, shared cores, or unpredictable load.
  WaitIfStageEmpty,
};
// Core thread state: the bookkeeping kept for one stage worker thread.
struct ThreadState {
  // Published progress counter: the number of items this thread has finished
  // processing. Stored when a stage guard is released, and loaded by the
  // next stage's threads and (for last-stage threads) by producers checking
  // ring capacity. Over-aligned to 128 bytes to keep it away from other
  // threads' counters (false-sharing avoidance).
  alignas(128) std::atomic<uint32_t> pops{0};
  // Private running count of items this thread has acquired so far; advanced
  // when a batch is handed out and later copied into `pops`.
  uint32_t local_pops{0};
  // Cached copy of each upstream counter (one entry per thread of the
  // previous stage, or a single entry for the producer counter at stage 0),
  // so the shared atomics are only re-read when the cache shows no work.
  std::vector<uint32_t> last_push_read;
  // True for threads belonging to the final stage. Initialized to false so a
  // default-constructed ThreadState never carries an indeterminate value;
  // the pipeline overwrites it during construction.
  bool last_stage{false};
};
// Compile-time description of a static pipeline's shape.
//
// The parameter pack lists how many worker threads each stage owns, in stage
// order. Everything derived from it is a constant expression:
// - the number of stages and the total number of threads
// - the flat index of any (stage, thread) pair, bounds-checked by
//   static_assert
// - the number of upstream counters a stage has to watch
//
// Example: StaticPipelineTopology<1, 4, 2> describes
// - Stage 0: 1 thread (flat index 0)
// - Stage 1: 4 threads (flat indices 1-4)
// - Stage 2: 2 threads (flat indices 5-6)
// - Total: 7 threads across 3 stages
template <int... ThreadsPerStage> struct StaticPipelineTopology {
  static_assert(sizeof...(ThreadsPerStage) > 0,
                "Must specify at least one stage");
  static_assert(((ThreadsPerStage > 0) && ...),
                "All stages must have at least one thread");

  static constexpr int num_stages = sizeof...(ThreadsPerStage);
  static constexpr std::array<int, num_stages> threads_per_stage = {
      ThreadsPerStage...};
  static constexpr int total_threads = (ThreadsPerStage + ...);

  // Flat index of the first thread of `Stage`: the sum of the thread counts
  // of every earlier stage.
  template <int Stage> static constexpr int stage_offset() {
    static_assert(Stage >= 0 && Stage < num_stages,
                  "Stage index out of bounds");
    int threads_before = 0;
    for (int earlier = 0; earlier < Stage; ++earlier) {
      threads_before += threads_per_stage[earlier];
    }
    return threads_before;
  }

  // Flat index of thread `Thread` within stage `Stage`.
  template <int Stage, int Thread> static constexpr int thread_index() {
    static_assert(Stage >= 0 && Stage < num_stages,
                  "Stage index out of bounds");
    static_assert(Thread >= 0 && Thread < threads_per_stage[Stage],
                  "Thread index out of bounds");
    constexpr int first_of_stage = stage_offset<Stage>();
    return first_of_stage + Thread;
  }

  // Number of upstream counters feeding `Stage`: the thread count of the
  // preceding stage, or 1 for stage 0 (which has a single upstream counter).
  template <int Stage> static constexpr int prev_stage_thread_count() {
    static_assert(Stage >= 0 && Stage < num_stages,
                  "Stage index out of bounds");
    if constexpr (Stage > 0) {
      return threads_per_stage[Stage - 1];
    } else {
      return 1;
    }
  }
};
// Static pipeline algorithms - compile-time specialized versions
namespace StaticPipelineAlgorithms {
// Computes how many items, starting at this thread's `local_pops` position,
// thread <Stage, ThreadInStage> may process right now.
//
// The result is the minimum over all upstream counters of
// (counter - local_pops), evaluated in wrapping uint32_t arithmetic. The
// upstream counters are `pushes` for stage 0, otherwise the `pops` counter of
// every thread of stage Stage - 1.
//
// all_threads: state of every stage thread; this thread's cached upstream
//              values (last_push_read) are refreshed as a side effect.
// pushes:      producer publish counter.
// may_block:   if false, the result is 0 as soon as one upstream counter
//              shows nothing new; if true, wait_strategy decides whether to
//              block before re-reading that counter.
// Returns:     number of items safe to process. May be 0 even when may_block
//              is true (e.g. WaitStrategy::Never does not wait at all).
template <WaitStrategy wait_strategy, typename Topology, int Stage,
int ThreadInStage>
uint32_t calculate_safe_len(
std::array<ThreadState, Topology::total_threads> &all_threads,
std::atomic<uint32_t> &pushes, bool may_block) {
constexpr int thread_idx =
Topology::template thread_index<Stage, ThreadInStage>();
auto &thread = all_threads[thread_idx];
// Start from "unbounded"; each upstream counter can only lower the bound.
uint32_t safe_len = UINT32_MAX;
constexpr int prev_stage_threads =
Topology::template prev_stage_thread_count<Stage>();
// Compile-time loop over previous stage threads: the fold expression below
// runs the inner lambda once per index Is in [0, prev_stage_threads).
[&]<std::size_t... Is>(std::index_sequence<Is...>) {
(
[&] {
// Select the upstream counter for index Is: the producer counter for
// stage 0, otherwise the pops counter of thread Is of the previous stage.
auto &last_push = [&]() -> std::atomic<uint32_t> & {
if constexpr (Stage == 0) {
return pushes;
} else {
constexpr int prev_thread_idx =
Topology::template thread_index<Stage - 1, Is>();
return all_threads[prev_thread_idx].pops;
}
}();
// Only touch the shared atomic when the cached value says there is nothing
// left to consume from this upstream counter.
if (thread.last_push_read[Is] == thread.local_pops) {
thread.last_push_read[Is] =
last_push.load(std::memory_order_acquire);
if (thread.last_push_read[Is] == thread.local_pops) {
if (!may_block) {
// Non-blocking caller: report "nothing available". Later indices can only
// take min(0, ...), so the overall result stays 0.
safe_len = 0;
return;
}
if constexpr (wait_strategy == WaitStrategy::Never) {
// Empty - busy wait: no blocking here, just fall through to the re-read.
} else if constexpr (wait_strategy ==
WaitStrategy::WaitIfUpstreamIdle) {
// We're allowed to spin as long as we eventually go to 0% cpu
// usage on idle
// Poll the producer counter up to 100000 times; if it differs from
// local_pops, new items have entered the pipeline, so skip blocking.
uint32_t push;
for (int i = 0; i < 100000; ++i) {
push = pushes.load(std::memory_order_relaxed);
if (push != thread.local_pops) {
goto dont_wait;
}
}
// Producer counter still equals local_pops: block until it changes.
pushes.wait(push, std::memory_order_relaxed);
dont_wait:;
} else {
static_assert(wait_strategy == WaitStrategy::WaitIfStageEmpty);
// Block on this specific upstream counter until it changes.
last_push.wait(thread.last_push_read[Is],
std::memory_order_relaxed);
}
// Re-read after waiting (or after skipping the wait); the value may still
// equal local_pops, in which case this index contributes 0 below.
thread.last_push_read[Is] =
last_push.load(std::memory_order_acquire);
}
}
// Unsigned subtraction yields the distance even if the counters wrapped.
safe_len =
std::min(safe_len, thread.last_push_read[Is] - thread.local_pops);
}(),
...);
}(std::make_index_sequence<prev_stage_threads>{});
return safe_len;
}
// Publishes `local_pops` as the progress counter (`pops`) of thread
// <Stage, ThreadInStage>.
//
// Waiters on that counter are woken when somebody may be blocked on it: under
// WaitStrategy::WaitIfStageEmpty (any stage), or for threads of the last
// stage under every strategy. In those cases the store is seq_cst and is
// followed by notify_all(); otherwise a plain release store is performed and
// nobody is notified.
template <WaitStrategy wait_strategy, typename Topology, int Stage,
          int ThreadInStage>
void update_thread_pops(
    std::array<ThreadState, Topology::total_threads> &all_threads,
    uint32_t local_pops) {
  constexpr bool wake_waiters =
      wait_strategy == WaitStrategy::WaitIfStageEmpty ||
      Stage == Topology::num_stages - 1;
  auto &published =
      all_threads[Topology::template thread_index<Stage, ThreadInStage>()]
          .pops;
  if constexpr (wake_waiters) {
    published.store(local_pops, std::memory_order_seq_cst);
    published.notify_all();
  } else {
    published.store(local_pops, std::memory_order_release);
  }
}
// Checks whether a producer may reserve `size` slots starting at counter
// value `slot` without overtaking any thread of the last stage.
//
// all_threads: state of every stage thread; only last-stage `pops` counters
//              are read.
// slot:        current value of the slot reservation counter.
// size:        number of slots the producer wants.
// slot_count:  ring buffer capacity.
// block:       whether to wait on a lagging last-stage thread.
//
// Returns:
//   0 - every last-stage thread is far enough along; the caller can proceed
//   1 - a lagging thread was found and waited on; the caller should retry
//   2 - a lagging thread was found and `block` is false; cannot proceed
template <typename Topology>
int check_producer_capacity(
    std::array<ThreadState, Topology::total_threads> &all_threads,
    uint32_t slot, uint32_t size, uint32_t slot_count, bool block) {
  constexpr int final_stage = Topology::num_stages - 1;
  constexpr int first_idx = Topology::template stage_offset<final_stage>();
  constexpr int end_idx = first_idx + Topology::threads_per_stage[final_stage];

  // One past the last slot the reservation would occupy (wrapping uint32_t).
  const uint32_t reservation_end = slot + size;

  for (int idx = first_idx; idx != end_idx; ++idx) {
    auto &consumed = all_threads[idx].pops;
    const uint32_t seen = consumed.load(std::memory_order_acquire);
    // Unsigned distance between the reservation end and what this thread has
    // released; within capacity means this thread is not in the way.
    if (reservation_end - seen <= slot_count) {
      continue;
    }
    if (block) {
      consumed.wait(seen, std::memory_order_relaxed);
      return 1; // Should retry
    }
    return 2; // Cannot proceed
  }
  return 0; // Can proceed
}
} // namespace StaticPipelineAlgorithms
// Static multi-stage lock-free pipeline for inter-thread communication
// with compile-time topology specification.
//
// Overview:
// - Items flow from producers through multiple processing stages (stage 0 ->
// stage 1 -> ... -> final stage)
// - Each stage can have multiple worker threads processing items in parallel
// - Uses a shared ring buffer with atomic counters for lock-free coordination
// - Supports batch processing for efficiency
// - Compile-time topology specification via template parameters
//
// Architecture:
// - Producers: External threads that add items to the pipeline via push()
// - Stages: Processing stages numbered 0, 1, 2, ... that consume items via
// acquire<Stage, Thread>()
// - Items flow: Producers -> Stage 0 -> Stage 1 -> ... -> Final Stage
//
// Differences from Dynamic Version:
// - Template parameters specify topology at compile-time (e.g., <Item,
// WaitStrategy::Never, 1, 4, 2>)
// - Stage and thread indices are template parameters, validated at compile-time
// - Fixed-size arrays replace dynamic vectors
// - Specialized algorithms for each stage/thread combination
// - Type-safe guards prevent runtime indexing errors
//
// Usage Pattern:
// using Pipeline = StaticThreadPipeline<Item, WaitStrategy::WaitIfStageEmpty,
//                                       1, 4, 2>;
// Pipeline pipeline(lgSlotCount);
//
// // Producer threads (add items for stage 0 to consume):
// auto guard = pipeline.push(batchSize, /*block=*/true);
// for (auto& item : guard.batch) {
// // Initialize item data
// }
// // Guard destructor publishes batch to stage 0 consumers
//
// // Stage worker threads (process items and pass to next stage):
// auto guard = pipeline.acquire<Stage, Thread>(maxBatch, /*may_block=*/true);
// for (auto& item : guard.batch) {
// // Process item
// }
// // Guard destructor marks items as consumed and available to next stage
//
// Memory Model:
// - Ring buffer size must be power of 2 for efficient masking
// - Actual ring slots accessed via: index & (slotCount - 1)
// - 128-byte aligned atomics prevent false sharing between CPU cache lines
//
// Thread Safety:
// - Fully lock-free using atomic operations with acquire/release memory
// ordering
// - Uses C++20 atomic wait/notify for efficient blocking when no work available
// - RAII guards ensure proper cleanup even with exceptions
template <class T, WaitStrategy wait_strategy, int... ThreadsPerStage>
struct StaticThreadPipeline {
  using Topology = StaticPipelineTopology<ThreadsPerStage...>;

  // Constructor
  // lgSlotCount: log2 of ring buffer size (e.g., 10 -> 1024 slots). Must be
  //   in [0, 31]; the ring is created with that many value-initialized items.
  // Template parameters specify the pipeline topology (e.g., <Item,
  // WaitStrategy::Never, 1, 4, 2>).
  // Note: Producer threads are external to the pipeline and not counted in
  // ThreadsPerStage.
  explicit StaticThreadPipeline(int lgSlotCount)
      // Shift an unsigned 1 so lgSlotCount == 31 never shifts into the sign
      // bit of an int.
      : slot_count(uint32_t{1} << lgSlotCount),
        slot_count_mask(slot_count - 1), ring(slot_count) {
    // Otherwise we can't tell the difference between full and empty.
    assert(!(slot_count_mask & 0x80000000));
    initialize_all_threads();
  }

  // The pipeline is referenced by address from outstanding guards, so it can
  // be neither copied nor moved.
  StaticThreadPipeline(StaticThreadPipeline const &) = delete;
  StaticThreadPipeline &operator=(StaticThreadPipeline const &) = delete;
  StaticThreadPipeline(StaticThreadPipeline &&) = delete;
  StaticThreadPipeline &operator=(StaticThreadPipeline &&) = delete;

  // A contiguous run [begin_, end_) of ring positions. Positions are mapped
  // onto ring slots by masking with (ring size - 1), so a batch may wrap
  // around the end of the underlying vector. A default-constructed Batch is
  // empty and refers to no ring.
  struct Batch {
    Batch() : ring(), begin_(), end_() {}

    // Random-access iterator over the items of a batch. Dereferencing masks
    // the position into the ring, so iteration transparently wraps.
    struct Iterator {
      using iterator_category = std::random_access_iterator_tag;
      using difference_type = std::ptrdiff_t;
      using value_type = T;
      using pointer = value_type *;
      using reference = value_type &;

      // Default construction yields a singular iterator (no ring); it may
      // only be assigned to or destroyed. Iterators are required to be
      // default-constructible and assignable by standard algorithms.
      Iterator() : index_(), ring() {}

      reference operator*() const {
        return (*ring)[index_ & (ring->size() - 1)];
      }
      pointer operator->() const {
        return &(*ring)[index_ & (ring->size() - 1)];
      }
      // Element n positions away from this iterator.
      reference operator[](difference_type n) const { return *(*this + n); }
      Iterator &operator++() {
        ++index_;
        return *this;
      }
      Iterator operator++(int) {
        auto tmp = *this;
        ++(*this);
        return tmp;
      }
      Iterator &operator--() {
        --index_;
        return *this;
      }
      Iterator operator--(int) {
        auto tmp = *this;
        --(*this);
        return tmp;
      }
      Iterator &operator+=(difference_type n) {
        index_ += n;
        return *this;
      }
      Iterator &operator-=(difference_type n) {
        index_ -= n;
        return *this;
      }
      Iterator operator+(difference_type n) const {
        return Iterator(index_ + n, ring);
      }
      Iterator operator-(difference_type n) const {
        return Iterator(index_ - n, ring);
      }
      difference_type operator-(const Iterator &rhs) const {
        assert(ring == rhs.ring);
        return static_cast<difference_type>(index_) -
               static_cast<difference_type>(rhs.index_);
      }
      friend Iterator operator+(difference_type n, const Iterator &iter) {
        return iter + n;
      }
      friend bool operator==(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return lhs.index_ == rhs.index_;
      }
      friend bool operator!=(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return lhs.index_ != rhs.index_;
      }
      // Ordering compares the wrapped distance as a signed 32-bit value so
      // that it stays correct across uint32_t wrap-around.
      friend bool operator<(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) < 0;
      }
      friend bool operator<=(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) <= 0;
      }
      friend bool operator>(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) > 0;
      }
      friend bool operator>=(const Iterator &lhs, const Iterator &rhs) {
        assert(lhs.ring == rhs.ring);
        return static_cast<int32_t>(lhs.index_ - rhs.index_) >= 0;
      }
      // Ring slot (already masked) this iterator currently refers to.
      uint32_t index() const { return index_ & (ring->size() - 1); }

    private:
      Iterator(uint32_t index, std::vector<T> *ring)
          : index_(index), ring(ring) {}
      friend struct Batch;
      uint32_t index_;
      // Deliberately not a const pointer: a const data member would delete
      // copy assignment and make the iterator unusable with algorithms that
      // assign iterators.
      std::vector<T> *ring;
    };

    [[nodiscard]] Iterator begin() { return Iterator(begin_, ring); }
    [[nodiscard]] Iterator end() { return Iterator(end_, ring); }
    [[nodiscard]] size_t size() const { return end_ - begin_; }
    [[nodiscard]] bool empty() const { return end_ == begin_; }
    // n-th item of the batch (no bounds check).
    T &operator[](uint32_t n) {
      return (*ring)[(begin_ + n) & (ring->size() - 1)];
    }

  private:
    friend struct StaticThreadPipeline;
    Batch(std::vector<T> *ring, uint32_t begin_, uint32_t end_)
        : ring(ring), begin_(begin_), end_(end_) {}
    // Deliberately not a const pointer so that Batch stays copy-assignable;
    // the guards' move operations assign batches.
    std::vector<T> *ring;
    uint32_t begin_;
    uint32_t end_;
  };

  // Static thread storage - fixed size array
  std::array<ThreadState, Topology::total_threads> all_threads;

private:
  // Next ring position to hand out to a producer (reservation counter).
  alignas(128) std::atomic<uint32_t> slots{0};
  // Ring position up to which producers have published items.
  alignas(128) std::atomic<uint32_t> pushes{0};
  const uint32_t slot_count;
  const uint32_t slot_count_mask;
  std::vector<T> ring;

  // Runs init_stage_threads<S>() for every stage S of the topology.
  void initialize_all_threads() {
    [&]<std::size_t... StageIndices>(std::index_sequence<StageIndices...>) {
      (init_stage_threads<StageIndices>(), ...);
    }(std::make_index_sequence<Topology::num_stages>{});
  }

  // Initializes the ThreadState of every thread in `Stage`: records whether
  // it is the last stage and sizes the upstream-counter cache (zero-filled)
  // to one entry per upstream counter.
  template <int Stage> void init_stage_threads() {
    constexpr int stage_offset = Topology::template stage_offset<Stage>();
    constexpr int stage_thread_count = Topology::threads_per_stage[Stage];
    constexpr int prev_stage_threads =
        Topology::template prev_stage_thread_count<Stage>();
    constexpr bool is_last_stage = (Stage == Topology::num_stages - 1);
    for (int thread = 0; thread < stage_thread_count; ++thread) {
      auto &thread_state = all_threads[stage_offset + thread];
      thread_state.last_stage = is_last_stage;
      thread_state.last_push_read = std::vector<uint32_t>(prev_stage_threads);
    }
  }

  // Hands out the next batch for thread <Stage, Thread> and advances its
  // local_pops by the batch length. maxBatch == 0 means "no limit". Returns
  // an empty Batch when nothing is available.
  template <int Stage, int Thread>
  Batch acquire_helper(uint32_t maxBatch, bool mayBlock) {
    constexpr int thread_idx = Topology::template thread_index<Stage, Thread>();
    auto &thread_state = all_threads[thread_idx];
    uint32_t begin = thread_state.local_pops & slot_count_mask;
    uint32_t len = StaticPipelineAlgorithms::calculate_safe_len<
        wait_strategy, Topology, Stage, Thread>(all_threads, pushes, mayBlock);
    if (maxBatch != 0) {
      len = std::min(len, maxBatch);
    }
    if (len == 0) {
      return Batch{};
    }
    auto result = Batch{&ring, begin, begin + len};
    thread_state.local_pops += len;
    return result;
  }

public:
  // Move-only handle returned by acquire(). While it is alive the holder may
  // work on `batch`; destroying it publishes the thread's progress so the
  // items become visible to the next stage (or to producers, for the last
  // stage). A guard with an empty batch, or one that has been moved from,
  // publishes nothing.
  template <int Stage, int Thread> struct StageGuard {
    Batch batch;

    ~StageGuard() { release(); }

    StageGuard(StageGuard const &) = delete;
    StageGuard &operator=(StageGuard const &) = delete;

    // Transfers the batch and the duty to publish it. The source is left
    // with an empty batch and no pipeline, so its destructor is a no-op.
    StageGuard(StageGuard &&other) noexcept
        : batch(std::exchange(other.batch, Batch{})),
          local_pops(other.local_pops),
          pipeline(std::exchange(other.pipeline, nullptr)) {}

    // Publishes the batch currently held (if any) before taking over the
    // one owned by `other`; self-assignment is a no-op.
    StageGuard &operator=(StageGuard &&other) noexcept {
      if (this != &other) {
        release();
        batch = std::exchange(other.batch, Batch{});
        local_pops = other.local_pops;
        pipeline = std::exchange(other.pipeline, nullptr);
      }
      return *this;
    }

  private:
    friend struct StaticThreadPipeline;
    // Value to publish as this thread's pops counter on release.
    uint32_t local_pops;
    // Owning pipeline, or nullptr when there is nothing to publish.
    StaticThreadPipeline *pipeline;

    StageGuard(Batch batch, uint32_t local_pops, StaticThreadPipeline *pipeline)
        : batch(batch), local_pops(local_pops),
          pipeline(batch.empty() ? nullptr : pipeline) {}

    // Publishes progress exactly once. Keyed on `pipeline` rather than on
    // the batch so that a moved-from guard never dereferences a null
    // pipeline.
    void release() noexcept {
      if (pipeline != nullptr) {
        StaticPipelineAlgorithms::update_thread_pops<wait_strategy, Topology,
                                                     Stage, Thread>(
            pipeline->all_threads, local_pops);
        pipeline = nullptr;
      }
    }
  };

  // Handle returned by push(). The holder fills `batch`; the destructor
  // publishes the reserved slots to stage 0. A default-constructed guard
  // (returned when a non-blocking push finds no room) and a moved-from guard
  // publish nothing.
  //
  // The guard is movable but not copyable: a copy would publish the same
  // reservation twice, and the second publication would wait for a `pushes`
  // value that has already passed.
  struct ProducerGuard {
    Batch batch;

    // Waits until all earlier reservations have been published
    // (pushes == old_slot), then publishes this one by storing new_slot and
    // waking every waiter on `pushes`.
    ~ProducerGuard() {
      if (tp == nullptr) {
        return;
      }
      for (;;) {
        uint32_t p = tp->pushes.load(std::memory_order_acquire);
        if (p == old_slot) {
          break;
        }
        tp->pushes.wait(p, std::memory_order_relaxed);
      }
      tp->pushes.store(new_slot, std::memory_order_seq_cst);
      tp->pushes.notify_all();
    }

    ProducerGuard(ProducerGuard const &) = delete;
    ProducerGuard &operator=(ProducerGuard const &) = delete;

    // Transfers the reservation; the source no longer publishes anything.
    ProducerGuard(ProducerGuard &&other) noexcept
        : batch(std::exchange(other.batch, Batch{})),
          tp(std::exchange(other.tp, nullptr)), old_slot(other.old_slot),
          new_slot(other.new_slot) {}
    ProducerGuard &operator=(ProducerGuard &&) = delete;

  private:
    friend struct StaticThreadPipeline;
    ProducerGuard() : batch(), tp(), old_slot(), new_slot() {}
    ProducerGuard(Batch batch, StaticThreadPipeline *tp, uint32_t old_slot,
                  uint32_t new_slot)
        : batch(batch), tp(tp), old_slot(old_slot), new_slot(new_slot) {}
    // Owning pipeline, or nullptr when there is nothing to publish.
    StaticThreadPipeline *tp;
    // Value `pushes` must reach before this reservation may be published.
    uint32_t old_slot;
    // Value stored into `pushes` when this reservation is published.
    uint32_t new_slot;
  };

  // Acquire a batch of items for processing by a consumer thread.
  //
  // Stage:     which processing stage (0 = first consumer stage after
  //            producers) - compile-time parameter
  // Thread:    thread ID within the stage (0 to ThreadsPerStage[Stage]-1) -
  //            compile-time parameter
  // maxBatch:  maximum items to acquire (0 = no limit)
  // may_block: whether to block waiting for items (false = return empty
  //            batch if none available)
  // Returns:   StageGuard<Stage, Thread> with the batch of items to process.
  //            The batch can be empty, so callers should be prepared to
  //            retry.
  template <int Stage, int Thread>
  [[nodiscard]] StageGuard<Stage, Thread> acquire(int maxBatch = 0,
                                                  bool may_block = true) {
    static_assert(Stage >= 0 && Stage < Topology::num_stages,
                  "Stage index out of bounds");
    static_assert(Thread >= 0 && Thread < Topology::threads_per_stage[Stage],
                  "Thread index out of bounds");
    auto batch = acquire_helper<Stage, Thread>(maxBatch, may_block);
    constexpr int thread_idx = Topology::template thread_index<Stage, Thread>();
    // Read after acquire_helper so the guard publishes the advanced count.
    uint32_t local_pops = all_threads[thread_idx].local_pops;
    return StageGuard<Stage, Thread>{std::move(batch), local_pops, this};
  }

  // Reserve slots in the ring buffer for a producer thread to fill with items.
  // This is used by producer threads to add new items to stage 0 of the
  // pipeline.
  //
  // size:    number of slots to reserve (must be > 0 and <= ring buffer
  //          capacity)
  // block:   if true, blocks when the ring buffer is full; if false, returns
  //          an empty guard
  // Returns: ProducerGuard with exclusive access to the reserved slots
  //
  // Usage: Fill items in the returned batch, then let guard destructor publish
  // them. The guard destructor ensures items are published in the correct
  // order.
  //
  // Preconditions:
  // - size > 0 (must request at least one slot)
  // - size <= slotCount (cannot request more slots than ring buffer capacity)
  // Violating preconditions results in program termination via abort().
  [[nodiscard]] ProducerGuard push(uint32_t const size, bool block) {
    if (size == 0) {
      std::abort();
    }
    if (size > slot_count) {
      std::abort();
    }
    uint32_t slot;
    uint32_t begin;
    for (;;) {
      slot = slots.load(std::memory_order_relaxed);
      begin = slot & slot_count_mask;
      int capacity_result =
          StaticPipelineAlgorithms::check_producer_capacity<Topology>(
              all_threads, slot, size, slot_count, block);
      if (capacity_result == 1) {
        continue; // waited on a lagging consumer; re-read slots and retry
      }
      if (capacity_result == 2) {
        return ProducerGuard{}; // no room and not allowed to block
      }
      // Claim [slot, slot + size); fails if another producer moved `slots`
      // since it was read above, in which case the loop starts over.
      if (slots.compare_exchange_weak(slot, slot + size,
                                      std::memory_order_relaxed,
                                      std::memory_order_relaxed)) {
        break;
      }
    }
    return ProducerGuard{Batch{&ring, begin, begin + size}, this, slot,
                         slot + size};
  }
};