Make pipeline policy/topology configurable at runtime instead of via template parameters

2025-11-06 15:55:27 -05:00
parent 9f8562e30f
commit f458c6b249
8 changed files with 396 additions and 311 deletions
--- a/benchmarks/bench_thread_pipeline.cpp
+++ b/benchmarks/bench_thread_pipeline.cpp
@@ -24,15 +24,15 @@ int main() {
      }
    });

-    StaticThreadPipeline<std::latch *, WaitStrategy::WaitIfStageEmpty, 1>
-        pipeline(LOG_PIPELINE_SIZE);
+    ThreadPipeline<std::latch *> pipeline(WaitStrategy::WaitIfStageEmpty, {1},
+                                          LOG_PIPELINE_SIZE);

    std::latch done{0};

    // Stage 0 consumer thread
    std::thread stage0_thread([&pipeline, &done]() {
      for (;;) {
-        auto guard = pipeline.acquire<0, 0>();
+        auto guard = pipeline.acquire(0, 0);

        for (auto &item : guard.batch) {
          spend_cpu_cycles(BUSY_ITERS);
@@ -89,15 +89,15 @@ int main() {
                     .warmup(100);

    for (int batch_size : {1, 4, 16, 64, 256}) {
-      StaticThreadPipeline<std::latch *, WaitStrategy::WaitIfStageEmpty, 1>
-          pipeline(LOG_PIPELINE_SIZE);
+      ThreadPipeline<std::latch *> pipeline(WaitStrategy::WaitIfStageEmpty, {1},
+                                            LOG_PIPELINE_SIZE);

      std::latch done{0};

      // Stage 0 consumer thread
      std::thread stage0_thread([&pipeline, &done]() {
        for (;;) {
-          auto guard = pipeline.acquire<0, 0>();
+          auto guard = pipeline.acquire(0, 0);

          for (auto &item : guard.batch) {
            spend_cpu_cycles(BUSY_ITERS);
@@ -142,74 +142,73 @@ int main() {
  }

  // Helper function for wait strategy benchmarks
-  auto benchmark_wait_strategy =
-      []<WaitStrategy strategy>(const std::string &name,
-                                ankerl::nanobench::Bench &bench) {
-        constexpr int LOG_PIPELINE_SIZE =
-            8; // Smaller buffer to increase contention
-        constexpr int NUM_ITEMS = 50'000;
-        constexpr int BATCH_SIZE = 4; // Small batches to increase coordination
-        constexpr int BUSY_ITERS =
-            10; // Light work to emphasize coordination overhead
+  auto benchmark_wait_strategy = [](WaitStrategy strategy,
+                                    const std::string &name,
+                                    ankerl::nanobench::Bench &bench) {
+    constexpr int LOG_PIPELINE_SIZE =
+        8; // Smaller buffer to increase contention
+    constexpr int NUM_ITEMS = 50'000;
+    constexpr int BATCH_SIZE = 4; // Small batches to increase coordination
+    constexpr int BUSY_ITERS =
+        10; // Light work to emphasize coordination overhead

-        StaticThreadPipeline<std::latch *, strategy, 1, 1> pipeline(
-            LOG_PIPELINE_SIZE);
+    ThreadPipeline<std::latch *> pipeline(strategy, {1, 1}, LOG_PIPELINE_SIZE);

-        std::latch done{0};
+    std::latch done{0};

-        // Stage 0 worker
-        std::thread stage0_thread([&pipeline, &done]() {
-          for (;;) {
-            auto guard = pipeline.template acquire<0, 0>();
-            for (auto &item : guard.batch) {
-              spend_cpu_cycles(BUSY_ITERS);
-              if (item == &done)
-                return;
-            }
-          }
-        });
-
-        // Stage 1 worker (final stage - always calls futex wake)
-        std::thread stage1_thread([&pipeline, &done]() {
-          for (;;) {
-            auto guard = pipeline.template acquire<1, 0>();
-            for (auto &item : guard.batch) {
-              spend_cpu_cycles(BUSY_ITERS);
-              if (item == &done)
-                return;
-              if (item)
-                item->count_down();
-            }
-          }
-        });
-
-        bench.run(name, [&] {
-          int items_pushed = 0;
-          while (items_pushed < NUM_ITEMS - 1) {
-            auto guard = pipeline.push(
-                std::min(NUM_ITEMS - 1 - items_pushed, BATCH_SIZE), true);
-            auto it = guard.batch.begin();
-            items_pushed += guard.batch.size();
-            for (size_t i = 0; i < guard.batch.size(); ++i, ++it) {
-              *it = nullptr;
-            }
-          }
-          std::latch finish{1};
-          {
-            auto guard = pipeline.push(1, true);
-            guard.batch[0] = &finish;
-          }
-          finish.wait();
-        });
-
-        // Shutdown
-        {
-          auto guard = pipeline.push(1, true);
-          guard.batch[0] = &done;
+    // Stage 0 worker
+    std::thread stage0_thread([&pipeline, &done]() {
+      for (;;) {
+        auto guard = pipeline.acquire(0, 0);
+        for (auto &item : guard.batch) {
+          spend_cpu_cycles(BUSY_ITERS);
+          if (item == &done)
+            return;
        }
-        stage0_thread.join();
-        stage1_thread.join();
-      };
+      }
+    });
+
+    // Stage 1 worker (final stage - always calls futex wake)
+    std::thread stage1_thread([&pipeline, &done]() {
+      for (;;) {
+        auto guard = pipeline.acquire(1, 0);
+        for (auto &item : guard.batch) {
+          spend_cpu_cycles(BUSY_ITERS);
+          if (item == &done)
+            return;
+          if (item)
+            item->count_down();
+        }
+      }
+    });
+
+    bench.run(name, [&] {
+      int items_pushed = 0;
+      while (items_pushed < NUM_ITEMS - 1) {
+        auto guard = pipeline.push(
+            std::min(NUM_ITEMS - 1 - items_pushed, BATCH_SIZE), true);
+        auto it = guard.batch.begin();
+        items_pushed += guard.batch.size();
+        for (size_t i = 0; i < guard.batch.size(); ++i, ++it) {
+          *it = nullptr;
+        }
+      }
+      std::latch finish{1};
+      {
+        auto guard = pipeline.push(1, true);
+        guard.batch[0] = &finish;
+      }
+      finish.wait();
+    });
+
+    // Shutdown
+    {
+      auto guard = pipeline.push(1, true);
+      guard.batch[0] = &done;
+    }
+    stage0_thread.join();
+    stage1_thread.join();
+  };

  // Wait strategy comparison benchmark - multiple stages to trigger futex wakes
  {
@@ -220,12 +219,11 @@ int main() {
                     .relative(true)
                     .warmup(50);

-    benchmark_wait_strategy.template operator()<WaitStrategy::WaitIfStageEmpty>(
-        "WaitIfStageEmpty", bench);
-    benchmark_wait_strategy.template
-    operator()<WaitStrategy::WaitIfUpstreamIdle>("WaitIfUpstreamIdle", bench);
-    benchmark_wait_strategy.template operator()<WaitStrategy::Never>("Never",
-                                                                     bench);
+    benchmark_wait_strategy(WaitStrategy::WaitIfStageEmpty, "WaitIfStageEmpty",
+                            bench);
+    benchmark_wait_strategy(WaitStrategy::WaitIfUpstreamIdle,
+                            "WaitIfUpstreamIdle", bench);
+    benchmark_wait_strategy(WaitStrategy::Never, "Never", bench);
  }

  // TODO: Add more benchmarks for: