Benchmark different batches and wait strategies

2025-08-26 13:33:16 -04:00
parent 4e9e4d634c
commit 3c3555da7a
1 changed files with 166 additions and 0 deletions
@@ -80,6 +80,172 @@ int main() {
    stage0_thread.join();
  }
  // Batch size comparison benchmark
  {
    constexpr int LOG_PIPELINE_SIZE = 10; // 2^10 = 1024 slots
    constexpr int NUM_ITEMS = 100'000;
    constexpr int BUSY_ITERS = 100;
    auto bench = ankerl::nanobench::Bench()
                     .title("Batch Size Impact")
                     .unit("item")
                     .batch(NUM_ITEMS)
                     .relative(true)
                     .warmup(100);
    for (int batch_size : {1, 4, 16, 64, 256}) {
      std::vector<int> threads_per_stage = {1};
      ThreadPipeline<std::latch *> pipeline(LOG_PIPELINE_SIZE,
                                            threads_per_stage);
      std::latch done{0};
      // Stage 0 consumer thread
      std::thread stage0_thread([&pipeline, &done]() {
        const int stage = 0;
        const int thread_id = 0;
        for (;;) {
          auto guard = pipeline.acquire(stage, thread_id);
          for (auto &item : guard.batch) {
            for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) {
            }
            if (item == &done) {
              return;
            }
            if (item) {
              item->count_down();
            }
          }
        }
      });
      bench.run("Batch size " + std::to_string(batch_size), [&] {
        // Producer (main thread)
        int items_pushed = 0;
        while (items_pushed < NUM_ITEMS - 1) {
          auto guard = pipeline.push(
              std::min(NUM_ITEMS - 1 - items_pushed, batch_size), true);
          auto it = guard.batch.begin();
          items_pushed += guard.batch.size();
          for (size_t i = 0; i < guard.batch.size(); ++i, ++it) {
            *it = nullptr;
          }
        }
        std::latch finish{1};
        {
          auto guard = pipeline.push(1, true);
          *guard.batch.begin() = &finish;
        }
        finish.wait();
      });
      {
        auto guard = pipeline.push(1, true);
        *guard.batch.begin() = &done;
      }
      stage0_thread.join();
    }
  }
  // Helper function for wait strategy benchmarks
  auto benchmark_wait_strategy =
      []<WaitStrategy strategy>(const std::string &name,
                                ankerl::nanobench::Bench &bench) {
        constexpr int LOG_PIPELINE_SIZE =
            8; // Smaller buffer to increase contention
        constexpr int NUM_ITEMS = 50'000;
        constexpr int BATCH_SIZE = 4; // Small batches to increase coordination
        constexpr int BUSY_ITERS =
            10; // Light work to emphasize coordination overhead
        std::vector<int> threads_per_stage = {1, 1}; // Two stages
        ThreadPipeline<std::latch *, strategy> pipeline(LOG_PIPELINE_SIZE,
                                                        threads_per_stage);
        std::latch done{0};
        // Stage 0 worker
        std::thread stage0_thread([&pipeline, &done]() {
          const int stage = 0;
          const int thread_id = 0;
          for (;;) {
            auto guard = pipeline.acquire(stage, thread_id);
            for (auto &item : guard.batch) {
              for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) {
              }
              if (item == &done)
                return;
            }
          }
        });
        // Stage 1 worker (final stage - always calls futex wake)
        std::thread stage1_thread([&pipeline, &done]() {
          const int stage = 1;
          const int thread_id = 0;
          for (;;) {
            auto guard = pipeline.acquire(stage, thread_id);
            for (auto &item : guard.batch) {
              for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) {
              }
              if (item == &done)
                return;
              if (item)
                item->count_down();
            }
          }
        });
        bench.run(name, [&] {
          int items_pushed = 0;
          while (items_pushed < NUM_ITEMS - 1) {
            auto guard = pipeline.push(
                std::min(NUM_ITEMS - 1 - items_pushed, BATCH_SIZE), true);
            auto it = guard.batch.begin();
            items_pushed += guard.batch.size();
            for (size_t i = 0; i < guard.batch.size(); ++i, ++it) {
              *it = nullptr;
            }
          }
          std::latch finish{1};
          {
            auto guard = pipeline.push(1, true);
            *guard.batch.begin() = &finish;
          }
          finish.wait();
        });
        // Shutdown
        {
          auto guard = pipeline.push(1, true);
          auto it = guard.batch.begin();
          *it = &done;
        }
        stage0_thread.join();
        stage1_thread.join();
      };
  // Wait strategy comparison benchmark - multiple stages to trigger futex wakes
  {
    auto bench = ankerl::nanobench::Bench()
                     .title("Wait Strategy Comparison")
                     .unit("item")
                     .batch(50'000)
                     .relative(true)
                     .warmup(50);
    benchmark_wait_strategy.template operator()<WaitStrategy::WaitIfStageEmpty>(
        "WaitIfStageEmpty", bench);
    benchmark_wait_strategy.template
    operator()<WaitStrategy::WaitIfUpstreamIdle>("WaitIfUpstreamIdle", bench);
    benchmark_wait_strategy.template operator()<WaitStrategy::Never>("Never",
                                                                     bench);
  }
  // TODO: Add more benchmarks for:
  // - Multi-stage pipelines (3+ stages)
  // - Multiple threads per stage