Make pipeline policy/topology configurable
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
@@ -48,151 +47,174 @@ struct ThreadState {
|
||||
bool last_stage;
|
||||
};
|
||||
|
||||
// Compile-time topology configuration for static pipelines
|
||||
// Runtime topology configuration for dynamic pipelines
|
||||
//
|
||||
// This template defines a pipeline topology at compile-time:
|
||||
// - Stage and thread calculations done at compile-time
|
||||
// - Type-safe indexing: Stage and thread indices validated at compile-time
|
||||
// - Fixed-size arrays with known bounds
|
||||
// - Code specialization for each topology
|
||||
// This class defines a pipeline topology at runtime:
|
||||
// - Stage and thread calculations done at runtime
|
||||
// - Flexible configuration: topology can be set via constructor
|
||||
// - Dynamic arrays with runtime bounds checking
|
||||
// - Single implementation works for any topology
|
||||
//
|
||||
// Example: StaticPipelineTopology<1, 4, 2> creates:
|
||||
// Example: PipelineTopology({1, 4, 2}) creates:
|
||||
// - Stage 0: 1 thread (index 0)
|
||||
// - Stage 1: 4 threads (indices 1-4)
|
||||
// - Stage 2: 2 threads (indices 5-6)
|
||||
// - Total: 7 threads across 3 stages
|
||||
template <int... ThreadsPerStage> struct StaticPipelineTopology {
|
||||
static_assert(sizeof...(ThreadsPerStage) > 0,
|
||||
"Must specify at least one stage");
|
||||
static_assert(((ThreadsPerStage > 0) && ...),
|
||||
"All stages must have at least one thread");
|
||||
struct PipelineTopology {
  // Number of worker threads in each stage, in stage order. Every entry is > 0.
  const std::vector<int> threads_per_stage;
  // Number of stages; equals threads_per_stage.size().
  const int num_stages;
  // stage_offsets[s] is the flat index of the first thread of stage s.
  const std::vector<int> stage_offsets;
  // Sum of all entries of threads_per_stage.
  const int total_threads;

  // Builds a topology from the per-stage thread counts.
  // Aborts if the list is empty, if any count is <= 0, or if the total number
  // of threads does not fit in an int.
  explicit PipelineTopology(std::vector<int> threads_per_stage_)
      : threads_per_stage(validate_and_move(std::move(threads_per_stage_))),
        num_stages(static_cast<int>(threads_per_stage.size())),
        stage_offsets(build_stage_offsets(threads_per_stage)),
        total_threads(build_total_threads(threads_per_stage)) {}

  // Runtime stage offset calculation.
  // Returns the flat thread index of the first thread in `stage`.
  // Aborts if `stage` is out of bounds.
  int stage_offset(int stage) const {
    if (stage < 0 || stage >= num_stages) {
      std::abort(); // Stage index out of bounds
    }
    return stage_offsets[stage];
  }

  // Runtime thread index calculation.
  // Maps (stage, thread-within-stage) to a flat thread index.
  // Aborts if either index is out of bounds.
  int thread_index(int stage, int thread) const {
    if (stage < 0 || stage >= num_stages) {
      std::abort(); // Stage index out of bounds
    }
    if (thread < 0 || thread >= threads_per_stage[stage]) {
      std::abort(); // Thread index out of bounds
    }
    return stage_offsets[stage] + thread;
  }

  // Runtime previous stage thread count.
  // Returns how many upstream counters `stage` has to observe: 1 for stage 0,
  // otherwise the thread count of stage - 1.
  // Aborts if `stage` is out of bounds.
  int prev_stage_thread_count(int stage) const {
    if (stage < 0 || stage >= num_stages) {
      std::abort(); // Stage index out of bounds
    }
    if (stage == 0) {
      return 1;
    }
    return threads_per_stage[stage - 1];
  }

private:
  // Checks the configuration and hands the vector back for member
  // initialization. Aborts on an empty list, a non-positive count, or a total
  // that would overflow int (the sums below are computed in int).
  static std::vector<int> validate_and_move(std::vector<int> threads) {
    if (threads.empty()) {
      std::abort(); // Must specify at least one stage
    }
    // Largest value an int can hold, derived without extra headers.
    constexpr unsigned int max_total = ~0u >> 1;
    // Both operands of each addition are <= max_total, so the unsigned sum
    // cannot wrap before it is checked.
    unsigned int running_total = 0;
    for (int count : threads) {
      if (count <= 0) {
        std::abort(); // All stages must have at least one thread
      }
      running_total += static_cast<unsigned int>(count);
      if (running_total > max_total) {
        std::abort(); // Total thread count must fit in an int
      }
    }
    return threads;
  }

  // Exclusive prefix sum of the per-stage thread counts.
  static std::vector<int>
  build_stage_offsets(const std::vector<int> &threads_per_stage) {
    std::vector<int> offsets(threads_per_stage.size());
    int offset = 0;
    for (size_t i = 0; i < threads_per_stage.size(); ++i) {
      offsets[i] = offset;
      offset += threads_per_stage[i];
    }
    return offsets;
  }

  // Sum of the per-stage thread counts.
  static int build_total_threads(const std::vector<int> &threads_per_stage) {
    int total = 0;
    for (int count : threads_per_stage) {
      total += count;
    }
    return total;
  }
};
|
||||
|
||||
// Static pipeline algorithms - compile-time specialized versions
|
||||
namespace StaticPipelineAlgorithms {
|
||||
// Pipeline algorithms - runtime configurable versions
|
||||
namespace PipelineAlgorithms {
|
||||
|
||||
template <WaitStrategy wait_strategy, typename Topology, int Stage,
|
||||
int ThreadInStage>
|
||||
uint32_t calculate_safe_len(
|
||||
std::array<ThreadState, Topology::total_threads> &all_threads,
|
||||
std::atomic<uint32_t> &pushes, bool may_block) {
|
||||
constexpr int thread_idx =
|
||||
Topology::template thread_index<Stage, ThreadInStage>();
|
||||
inline uint32_t calculate_safe_len(WaitStrategy wait_strategy,
|
||||
const PipelineTopology &topology, int stage,
|
||||
int thread_in_stage,
|
||||
std::vector<ThreadState> &all_threads,
|
||||
std::atomic<uint32_t> &pushes,
|
||||
bool may_block) {
|
||||
int thread_idx = topology.thread_index(stage, thread_in_stage);
|
||||
auto &thread = all_threads[thread_idx];
|
||||
uint32_t safe_len = UINT32_MAX;
|
||||
|
||||
constexpr int prev_stage_threads =
|
||||
Topology::template prev_stage_thread_count<Stage>();
|
||||
int prev_stage_threads = topology.prev_stage_thread_count(stage);
|
||||
|
||||
// Compile-time loop over previous stage threads
|
||||
[&]<std::size_t... Is>(std::index_sequence<Is...>) {
|
||||
(
|
||||
[&] {
|
||||
auto &last_push = [&]() -> std::atomic<uint32_t> & {
|
||||
if constexpr (Stage == 0) {
|
||||
return pushes;
|
||||
} else {
|
||||
constexpr int prev_thread_idx =
|
||||
Topology::template thread_index<Stage - 1, Is>();
|
||||
return all_threads[prev_thread_idx].pops;
|
||||
// Runtime loop over previous stage threads
|
||||
for (int i = 0; i < prev_stage_threads; ++i) {
|
||||
std::atomic<uint32_t> &last_push = [&]() -> std::atomic<uint32_t> & {
|
||||
if (stage == 0) {
|
||||
return pushes;
|
||||
} else {
|
||||
int prev_thread_idx = topology.thread_index(stage - 1, i);
|
||||
return all_threads[prev_thread_idx].pops;
|
||||
}
|
||||
}();
|
||||
|
||||
if (thread.last_push_read[i] == thread.local_pops) {
|
||||
thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
|
||||
if (thread.last_push_read[i] == thread.local_pops) {
|
||||
if (!may_block) {
|
||||
safe_len = 0;
|
||||
return safe_len;
|
||||
}
|
||||
|
||||
if (wait_strategy == WaitStrategy::Never) {
|
||||
// Empty - busy wait
|
||||
} else if (wait_strategy == WaitStrategy::WaitIfUpstreamIdle) {
|
||||
// We're allowed to spin as long as we eventually go to 0% cpu
|
||||
// usage on idle
|
||||
uint32_t push;
|
||||
bool should_wait = true;
|
||||
for (int j = 0; j < 100000; ++j) {
|
||||
push = pushes.load(std::memory_order_relaxed);
|
||||
if (push != thread.local_pops) {
|
||||
should_wait = false;
|
||||
break;
|
||||
}
|
||||
}();
|
||||
|
||||
if (thread.last_push_read[Is] == thread.local_pops) {
|
||||
thread.last_push_read[Is] =
|
||||
last_push.load(std::memory_order_acquire);
|
||||
if (thread.last_push_read[Is] == thread.local_pops) {
|
||||
if (!may_block) {
|
||||
safe_len = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (wait_strategy == WaitStrategy::Never) {
|
||||
// Empty - busy wait
|
||||
} else if constexpr (wait_strategy ==
|
||||
WaitStrategy::WaitIfUpstreamIdle) {
|
||||
// We're allowed to spin as long as we eventually go to 0% cpu
|
||||
// usage on idle
|
||||
uint32_t push;
|
||||
for (int i = 0; i < 100000; ++i) {
|
||||
push = pushes.load(std::memory_order_relaxed);
|
||||
if (push != thread.local_pops) {
|
||||
goto dont_wait;
|
||||
}
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
_mm_pause();
|
||||
_mm_pause();
|
||||
#endif
|
||||
}
|
||||
pushes.wait(push, std::memory_order_relaxed);
|
||||
dont_wait:;
|
||||
} else {
|
||||
static_assert(wait_strategy == WaitStrategy::WaitIfStageEmpty);
|
||||
last_push.wait(thread.last_push_read[Is],
|
||||
std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
thread.last_push_read[Is] =
|
||||
last_push.load(std::memory_order_acquire);
|
||||
}
|
||||
}
|
||||
safe_len =
|
||||
std::min(safe_len, thread.last_push_read[Is] - thread.local_pops);
|
||||
}(),
|
||||
...);
|
||||
}(std::make_index_sequence<prev_stage_threads>{});
|
||||
if (should_wait) {
|
||||
pushes.wait(push, std::memory_order_relaxed);
|
||||
}
|
||||
} else { // WaitStrategy::WaitIfStageEmpty
|
||||
last_push.wait(thread.last_push_read[i], std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
|
||||
}
|
||||
}
|
||||
safe_len = std::min(safe_len, thread.last_push_read[i] - thread.local_pops);
|
||||
}
|
||||
|
||||
return safe_len;
|
||||
}
|
||||
|
||||
template <WaitStrategy wait_strategy, typename Topology, int Stage,
|
||||
int ThreadInStage>
|
||||
void update_thread_pops(
|
||||
std::array<ThreadState, Topology::total_threads> &all_threads,
|
||||
uint32_t local_pops) {
|
||||
constexpr int thread_idx =
|
||||
Topology::template thread_index<Stage, ThreadInStage>();
|
||||
inline void update_thread_pops(WaitStrategy wait_strategy,
|
||||
const PipelineTopology &topology, int stage,
|
||||
int thread_in_stage,
|
||||
std::vector<ThreadState> &all_threads,
|
||||
uint32_t local_pops) {
|
||||
int thread_idx = topology.thread_index(stage, thread_in_stage);
|
||||
auto &thread_state = all_threads[thread_idx];
|
||||
|
||||
if constexpr (wait_strategy == WaitStrategy::WaitIfStageEmpty) {
|
||||
if (wait_strategy == WaitStrategy::WaitIfStageEmpty) {
|
||||
thread_state.pops.store(local_pops, std::memory_order_seq_cst);
|
||||
thread_state.pops.notify_all();
|
||||
} else if constexpr (Stage == Topology::num_stages - 1) { // last stage
|
||||
} else if (stage == topology.num_stages - 1) { // last stage
|
||||
thread_state.pops.store(local_pops, std::memory_order_seq_cst);
|
||||
thread_state.pops.notify_all();
|
||||
} else {
|
||||
@@ -200,15 +222,13 @@ void update_thread_pops(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Topology>
|
||||
int check_producer_capacity(
|
||||
std::array<ThreadState, Topology::total_threads> &all_threads,
|
||||
uint32_t slot, uint32_t size, uint32_t slot_count, bool block) {
|
||||
constexpr int last_stage = Topology::num_stages - 1;
|
||||
constexpr int last_stage_offset =
|
||||
Topology::template stage_offset<last_stage>();
|
||||
constexpr int last_stage_thread_count =
|
||||
Topology::threads_per_stage[last_stage];
|
||||
inline int check_producer_capacity(const PipelineTopology &topology,
|
||||
std::vector<ThreadState> &all_threads,
|
||||
uint32_t slot, uint32_t size,
|
||||
uint32_t slot_count, bool block) {
|
||||
int last_stage = topology.num_stages - 1;
|
||||
int last_stage_offset = topology.stage_offset(last_stage);
|
||||
int last_stage_thread_count = topology.threads_per_stage[last_stage];
|
||||
|
||||
for (int i = 0; i < last_stage_thread_count; ++i) {
|
||||
auto &thread = all_threads[last_stage_offset + i];
|
||||
@@ -223,10 +243,10 @@ int check_producer_capacity(
|
||||
}
|
||||
return 0; // Can proceed
|
||||
}
|
||||
} // namespace StaticPipelineAlgorithms
|
||||
} // namespace PipelineAlgorithms
|
||||
|
||||
// Static multi-stage lock-free pipeline for inter-thread communication
|
||||
// with compile-time topology specification.
|
||||
// Multi-stage lock-free pipeline for inter-thread communication
|
||||
// with runtime-configurable topology and wait strategy.
|
||||
//
|
||||
// Overview:
|
||||
// - Items flow from producers through multiple processing stages (stage 0 ->
|
||||
@@ -234,25 +254,17 @@ int check_producer_capacity(
|
||||
// - Each stage can have multiple worker threads processing items in parallel
|
||||
// - Uses a shared ring buffer with atomic counters for lock-free coordination
|
||||
// - Supports batch processing for efficiency
|
||||
// - Compile-time topology specification via template parameters
|
||||
// - Runtime-configurable topology and wait strategy via constructor parameters
|
||||
//
|
||||
// Architecture:
|
||||
// - Producers: External threads that add items to the pipeline via push()
|
||||
// - Stages: Processing stages numbered 0, 1, 2, ... that consume items via
|
||||
// acquire<Stage, Thread>()
|
||||
// acquire(stage, thread)
|
||||
// - Items flow: Producers -> Stage 0 -> Stage 1 -> ... -> Final Stage
|
||||
//
|
||||
// Differences from Dynamic Version:
|
||||
// - Template parameters specify topology at compile-time (e.g., <Item,
|
||||
// WaitStrategy::Never, 1, 4, 2>)
|
||||
// - Stage and thread indices are template parameters, validated at compile-time
|
||||
// - Fixed-size arrays replace dynamic vectors
|
||||
// - Specialized algorithms for each stage/thread combination
|
||||
// - Type-safe guards prevent runtime indexing errors
|
||||
//
|
||||
// Usage Pattern:
|
||||
// using Pipeline = StaticThreadPipeline<Item, WaitStrategy::WaitIfStageEmpty,
|
||||
// 1, 4, 2>; Pipeline pipeline(lgSlotCount);
|
||||
// ThreadPipeline<Item> pipeline(WaitStrategy::WaitIfStageEmpty, {1, 4, 2},
|
||||
// lgSlotCount);
|
||||
//
|
||||
// // Producer threads (add items for stage 0 to consume):
|
||||
// auto guard = pipeline.push(batchSize, /*block=*/true);
|
||||
@@ -262,20 +274,21 @@ int check_producer_capacity(
|
||||
// // Guard destructor publishes batch to stage 0 consumers
|
||||
//
|
||||
// // Stage worker threads (process items and pass to next stage):
|
||||
// auto guard = pipeline.acquire<Stage, Thread>(maxBatch, /*may_block=*/true);
|
||||
// auto guard = pipeline.acquire(stage, thread, maxBatch, /*may_block=*/true);
|
||||
// for (auto& item : guard.batch) {
|
||||
// // Process item
|
||||
// }
|
||||
// // Guard destructor marks items as consumed and available to next stage
|
||||
//
|
||||
// Multi-Thread Stage Processing:
|
||||
// When a stage has multiple threads (e.g., 1, 1, 1, 2 = 2 threads in stage 3):
|
||||
// When a stage has multiple threads (e.g., {1, 1, 1, 2} = 2 threads in stage
|
||||
// 3):
|
||||
//
|
||||
// OVERLAPPING BATCHES - EACH THREAD SEES EVERY ENTRY:
|
||||
// - Multiple threads in the same stage get OVERLAPPING batches from the ring
|
||||
// buffer
|
||||
// - Thread 0: calls acquire<3, 0>() - gets batch from ring positions 100-110
|
||||
// - Thread 1: calls acquire<3, 1>() - gets batch from ring positions 100-110
|
||||
// - Thread 0: calls acquire(3, 0) - gets batch from ring positions 100-110
|
||||
// - Thread 1: calls acquire(3, 1) - gets batch from ring positions 100-110
|
||||
// (SAME)
|
||||
// - Both threads see the same entries and must coordinate processing
|
||||
//
|
||||
@@ -319,27 +332,27 @@ int check_producer_capacity(
|
||||
// ordering
|
||||
// - Uses C++20 atomic wait/notify for efficient blocking when no work available
|
||||
// - RAII guards ensure proper cleanup even with exceptions
|
||||
template <class T, WaitStrategy wait_strategy, int... ThreadsPerStage>
|
||||
struct StaticThreadPipeline {
|
||||
using Topology = StaticPipelineTopology<ThreadsPerStage...>;
|
||||
|
||||
template <class T> struct ThreadPipeline {
|
||||
// Constructor
|
||||
// wait_strategy: blocking behavior when no work is available
|
||||
// threads_per_stage: number of threads in each stage (e.g., {1, 4, 2})
|
||||
// lgSlotCount: log2 of ring buffer size (e.g., 10 -> 1024 slots)
|
||||
// Template parameters specify pipeline topology (e.g., <Item, Never, 1, 4,
|
||||
// 2>) Note: Producer threads are external to the pipeline and not counted in
|
||||
// ThreadsPerStage
|
||||
explicit StaticThreadPipeline(int lgSlotCount)
|
||||
: slot_count(1 << lgSlotCount), slot_count_mask(slot_count - 1),
|
||||
ring(slot_count) {
|
||||
// Note: Producer threads are external to the pipeline and not counted in
|
||||
// threads_per_stage
|
||||
explicit ThreadPipeline(WaitStrategy wait_strategy,
|
||||
std::vector<int> threads_per_stage, int lgSlotCount)
|
||||
: wait_strategy_(wait_strategy), topology_(std::move(threads_per_stage)),
|
||||
slot_count(1 << lgSlotCount), slot_count_mask(slot_count - 1),
|
||||
ring(slot_count), all_threads(topology_.total_threads) {
|
||||
// Otherwise we can't tell the difference between full and empty.
|
||||
assert(!(slot_count_mask & 0x80000000));
|
||||
initialize_all_threads();
|
||||
}
|
||||
|
||||
StaticThreadPipeline(StaticThreadPipeline const &) = delete;
|
||||
StaticThreadPipeline &operator=(StaticThreadPipeline const &) = delete;
|
||||
StaticThreadPipeline(StaticThreadPipeline &&) = delete;
|
||||
StaticThreadPipeline &operator=(StaticThreadPipeline &&) = delete;
|
||||
ThreadPipeline(ThreadPipeline const &) = delete;
|
||||
ThreadPipeline &operator=(ThreadPipeline const &) = delete;
|
||||
ThreadPipeline(ThreadPipeline &&) = delete;
|
||||
ThreadPipeline &operator=(ThreadPipeline &&) = delete;
|
||||
|
||||
struct Batch {
|
||||
Batch() : ring(), begin_(), end_() {}
|
||||
@@ -442,7 +455,7 @@ struct StaticThreadPipeline {
|
||||
}
|
||||
|
||||
private:
|
||||
friend struct StaticThreadPipeline;
|
||||
friend struct ThreadPipeline;
|
||||
Batch(std::vector<T> *const ring, uint32_t begin_, uint32_t end_)
|
||||
: ring(ring), begin_(begin_), end_(end_) {}
|
||||
std::vector<T> *const ring;
|
||||
@@ -450,29 +463,29 @@ struct StaticThreadPipeline {
|
||||
uint32_t end_;
|
||||
};
|
||||
|
||||
// Static thread storage - fixed size array
|
||||
std::array<ThreadState, Topology::total_threads> all_threads;
|
||||
|
||||
private:
|
||||
WaitStrategy wait_strategy_;
|
||||
PipelineTopology topology_;
|
||||
|
||||
alignas(128) std::atomic<uint32_t> slots{0};
|
||||
alignas(128) std::atomic<uint32_t> pushes{0};
|
||||
const uint32_t slot_count;
|
||||
const uint32_t slot_count_mask;
|
||||
|
||||
std::vector<T> ring;
|
||||
std::vector<ThreadState> all_threads;
|
||||
|
||||
void initialize_all_threads() {
  // Set up the per-thread state of every stage, in stage order.
  for (int stage = 0; stage < topology_.num_stages; ++stage) {
    init_stage_threads(stage);
  }
}
|
||||
|
||||
template <int Stage> void init_stage_threads() {
|
||||
constexpr int stage_offset = Topology::template stage_offset<Stage>();
|
||||
constexpr int stage_thread_count = Topology::threads_per_stage[Stage];
|
||||
constexpr int prev_stage_threads =
|
||||
Topology::template prev_stage_thread_count<Stage>();
|
||||
constexpr bool is_last_stage = (Stage == Topology::num_stages - 1);
|
||||
void init_stage_threads(int stage) {
|
||||
int stage_offset = topology_.stage_offset(stage);
|
||||
int stage_thread_count = topology_.threads_per_stage[stage];
|
||||
int prev_stage_threads = topology_.prev_stage_thread_count(stage);
|
||||
bool is_last_stage = (stage == topology_.num_stages - 1);
|
||||
|
||||
for (int thread = 0; thread < stage_thread_count; ++thread) {
|
||||
auto &thread_state = all_threads[stage_offset + thread];
|
||||
@@ -481,14 +494,15 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
template <int Stage, int Thread>
|
||||
Batch acquire_helper(uint32_t maxBatch, bool mayBlock) {
|
||||
constexpr int thread_idx = Topology::template thread_index<Stage, Thread>();
|
||||
Batch acquire_helper(int stage, int thread, uint32_t maxBatch,
|
||||
bool may_block) {
|
||||
int thread_idx = topology_.thread_index(stage, thread);
|
||||
auto &thread_state = all_threads[thread_idx];
|
||||
|
||||
uint32_t begin = thread_state.local_pops & slot_count_mask;
|
||||
uint32_t len = StaticPipelineAlgorithms::calculate_safe_len<
|
||||
wait_strategy, Topology, Stage, Thread>(all_threads, pushes, mayBlock);
|
||||
uint32_t len = PipelineAlgorithms::calculate_safe_len(
|
||||
wait_strategy_, topology_, stage, thread, all_threads, pushes,
|
||||
may_block);
|
||||
|
||||
if (maxBatch != 0) {
|
||||
len = std::min(len, maxBatch);
|
||||
@@ -503,13 +517,13 @@ private:
|
||||
}
|
||||
|
||||
public:
|
||||
template <int Stage, int Thread> struct StageGuard {
|
||||
struct StageGuard {
|
||||
Batch batch;
|
||||
|
||||
// Records this guard's local_pops through update_thread_pops when the guard
// still owns a non-empty batch.
~StageGuard() {
  // A moved-from guard keeps a copy of its batch but has pipeline == nullptr
  // (the move operations null it out), so the pipeline pointer must be
  // checked before it is dereferenced.
  if (pipeline != nullptr && !batch.empty()) {
    PipelineAlgorithms::update_thread_pops(
        pipeline->wait_strategy_, pipeline->topology_, stage, thread,
        pipeline->all_threads, local_pops);
  }
}
|
||||
@@ -517,22 +531,28 @@ public:
|
||||
StageGuard(StageGuard const &) = delete;
|
||||
StageGuard &operator=(StageGuard const &) = delete;
|
||||
StageGuard(StageGuard &&other) noexcept
|
||||
: batch(other.batch), local_pops(other.local_pops),
|
||||
: batch(other.batch), local_pops(other.local_pops), stage(other.stage),
|
||||
thread(other.thread),
|
||||
pipeline(std::exchange(other.pipeline, nullptr)) {}
|
||||
// Move assignment: releases whatever this guard currently holds, then takes
// over other's batch. other is left with pipeline == nullptr.
StageGuard &operator=(StageGuard &&other) noexcept {
  if (this != &other) {
    // Overwriting a live guard without recording its pops would leave its
    // batch unreleased, so do here what the destructor would have done.
    if (pipeline != nullptr && !batch.empty()) {
      PipelineAlgorithms::update_thread_pops(
          pipeline->wait_strategy_, pipeline->topology_, stage, thread,
          pipeline->all_threads, local_pops);
    }
    batch = other.batch;
    local_pops = other.local_pops;
    stage = other.stage;
    thread = other.thread;
    pipeline = std::exchange(other.pipeline, nullptr);
  }
  return *this;
}
|
||||
|
||||
private:
|
||||
friend struct StaticThreadPipeline;
|
||||
friend struct ThreadPipeline;
|
||||
uint32_t local_pops;
|
||||
StaticThreadPipeline *pipeline;
|
||||
int stage;
|
||||
int thread;
|
||||
ThreadPipeline *pipeline;
|
||||
|
||||
StageGuard(Batch batch, uint32_t local_pops, StaticThreadPipeline *pipeline)
|
||||
: batch(batch), local_pops(local_pops),
|
||||
StageGuard(Batch batch, uint32_t local_pops, int stage, int thread,
|
||||
ThreadPipeline *pipeline)
|
||||
: batch(batch), local_pops(local_pops), stage(stage), thread(thread),
|
||||
pipeline(batch.empty() ? nullptr : pipeline) {}
|
||||
};
|
||||
|
||||
@@ -555,37 +575,30 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
friend struct StaticThreadPipeline;
|
||||
friend struct ThreadPipeline;
|
||||
ProducerGuard() : batch(), tp() {}
|
||||
ProducerGuard(Batch batch, StaticThreadPipeline *tp, uint32_t old_slot,
|
||||
ProducerGuard(Batch batch, ThreadPipeline *tp, uint32_t old_slot,
|
||||
uint32_t new_slot)
|
||||
: batch(batch), tp(tp), old_slot(old_slot), new_slot(new_slot) {}
|
||||
StaticThreadPipeline *const tp;
|
||||
ThreadPipeline *const tp;
|
||||
uint32_t old_slot;
|
||||
uint32_t new_slot;
|
||||
};
|
||||
|
||||
// Acquire a batch of items for processing by a consumer thread.
|
||||
// Stage: which processing stage (0 = first consumer stage after producers) -
|
||||
// compile-time parameter Thread: thread ID within the stage (0 to
|
||||
// ThreadsPerStage[Stage]-1) - compile-time parameter maxBatch: maximum items
|
||||
// to acquire (0 = no limit) may_block: whether to block waiting for items
|
||||
// (false = return empty batch if none available) Returns: StageGuard<Stage,
|
||||
// Thread> with batch of items to process and compile-time type safety
|
||||
template <int Stage, int Thread>
|
||||
[[nodiscard]] StageGuard<Stage, Thread> acquire(int maxBatch = 0,
|
||||
bool may_block = true) {
|
||||
static_assert(Stage >= 0 && Stage < Topology::num_stages,
|
||||
"Stage index out of bounds");
|
||||
static_assert(Thread >= 0 && Thread < Topology::threads_per_stage[Stage],
|
||||
"Thread index out of bounds");
|
||||
// stage: which processing stage (0 = first consumer stage after producers)
|
||||
// thread: thread ID within the stage (0 to threads_per_stage[stage]-1)
|
||||
// maxBatch: maximum items to acquire (0 = no limit)
|
||||
// may_block: whether to block waiting for items (false = return empty batch
|
||||
// if none available) Returns: StageGuard with batch of items to process
|
||||
[[nodiscard]] StageGuard acquire(int stage, int thread, int maxBatch = 0,
                                 bool may_block = true) {
  // acquire_helper takes the limit as uint32_t; the conversion is spelled out
  // here instead of happening implicitly at the call.
  auto batch = acquire_helper(stage, thread, static_cast<uint32_t>(maxBatch),
                              may_block);

  // Snapshot this thread's pop counter so the guard can record it on release.
  const int thread_idx = topology_.thread_index(stage, thread);
  const uint32_t local_pops = all_threads[thread_idx].local_pops;

  return StageGuard{std::move(batch), local_pops, stage, thread, this};
}
|
||||
|
||||
// Reserve slots in the ring buffer for a producer thread to fill with items.
|
||||
@@ -618,9 +631,8 @@ public:
|
||||
slot = slots.load(std::memory_order_relaxed);
|
||||
begin = slot & slot_count_mask;
|
||||
|
||||
int capacity_result =
|
||||
StaticPipelineAlgorithms::check_producer_capacity<Topology>(
|
||||
all_threads, slot, size, slot_count, block);
|
||||
int capacity_result = PipelineAlgorithms::check_producer_capacity(
|
||||
topology_, all_threads, slot, size, slot_count, block);
|
||||
if (capacity_result == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user