From c915fed4ccb959527ee56dc74c4542bbe2dcc71b Mon Sep 17 00:00:00 2001
From: Andrew Noyes
Date: Tue, 26 Aug 2025 16:54:29 -0400
Subject: [PATCH] Change max batch to one

---
 src/http_handler.hpp            |  6 ++++--
 src/loop_iterations.h           |  2 +-
 src/thread_pipeline.hpp         |  7 +++++++
 threading_performance_report.md | 23 ++++++++++++++++++-----
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/http_handler.hpp b/src/http_handler.hpp
index 5930d90..a3a7242 100644
--- a/src/http_handler.hpp
+++ b/src/http_handler.hpp
@@ -103,10 +103,12 @@ struct HttpHandler : ConnectionHandler {
     });
     stage0Thread = std::thread{[this]() {
       pthread_setname_np(pthread_self(), "stage-0");
+      int nulls = 0;
       for (;;) {
-        auto guard = pipeline.acquire<0, 0>();
+        auto guard = pipeline.acquire<0, 0>(1);
         for (auto &c : guard.batch) {
-          if (!c) {
+          nulls += !c;
+          if (nulls == 2) {
             return;
           }
           for (volatile int i = 0; i < loopIterations; i = i + 1)
diff --git a/src/loop_iterations.h b/src/loop_iterations.h
index 0eaf3f2..7fa7453 100644
--- a/src/loop_iterations.h
+++ b/src/loop_iterations.h
@@ -1,3 +1,3 @@
 #pragma once
 
-constexpr int loopIterations = 1550;
+constexpr int loopIterations = 1725;
diff --git a/src/thread_pipeline.hpp b/src/thread_pipeline.hpp
index 5434689..c153e18 100644
--- a/src/thread_pipeline.hpp
+++ b/src/thread_pipeline.hpp
@@ -11,6 +11,10 @@
 #include
 #include
 
+#if defined(__x86_64__) || defined(_M_X64)
+#include <emmintrin.h>
+#endif
+
 // Wait strategies for controlling thread blocking behavior when no work is
 // available
 enum class WaitStrategy {
@@ -151,6 +155,9 @@ uint32_t calculate_safe_len(
       if (push != thread.local_pops) {
         goto dont_wait;
       }
+#if defined(__x86_64__) || defined(_M_X64)
+      _mm_pause();
+#endif
     }
     pushes.wait(push, std::memory_order_relaxed);
   dont_wait:;
diff --git a/threading_performance_report.md b/threading_performance_report.md
index 42ebcd3..1d45663 100644
--- a/threading_performance_report.md
+++ b/threading_performance_report.md
@@ -2,7 +2,7 @@
 
 ## Summary
 
-WeaselDB achieved 1.3M requests/second throughput using a two-stage ThreadPipeline with futex wake optimization, delivering 488ns serial CPU time per request while maintaining 0% CPU usage when idle. Higher serial CPU time means more CPU budget available for serial processing.
+WeaselDB achieved 1.3M requests/second throughput using a two-stage ThreadPipeline with futex wake optimization, delivering 550ns serial CPU time per request while maintaining 0% CPU usage when idle. Higher serial CPU time means more CPU budget available for serial processing.
 
 ## Performance Metrics
 
@@ -23,23 +23,36 @@ WeaselDB achieved 1.3M requests/second throughput using a two-stage ThreadPipeli
 **Optimized Pipeline Mode**:
 - **Throughput**: 1.3M requests/second
-- **Serial CPU time per request**: 488ns (validated with nanobench)
+- **Serial CPU time per request**: 550ns (validated with nanobench)
 - **Theoretical maximum serial CPU time**: 769ns (1,000,000,000ns ÷ 1,300,000 req/s)
-- **Serial efficiency**: 63.4% (488ns ÷ 769ns)
+- **Serial efficiency**: 71.5% (550ns ÷ 769ns)
 - **CPU usage when idle**: 0%
 
-### Key Optimization: Futex Wake Reduction
+### Key Optimizations
+
+**Futex Wake Reduction**:
 - **Previous approach**: Futex wake at every pipeline stage (10% CPU overhead)
 - **Optimized approach**: Futex wake only at final stage to wake producers. Stages now do their futex wait on the beginning of the pipeline instead of the previous stage.
 - **Result**: 23% increase in serial CPU budget (396ns → 488ns)
 - **Benefits**: Higher throughput per CPU cycle + idle efficiency
 
+**CPU-Friendly Spin Loop**:
+- **Added**: `_mm_pause()` intrinsics in polling loop to reduce power consumption and improve hyperthreading efficiency
+- **Maintained**: 100,000 spin iterations necessary to prevent thread descheduling
+- **Result**: Same throughput with more efficient spinning
+
+**Stage-0 Batch Size Optimization**:
+- **Changed**: Stage-0 max batch size from unlimited to 1
+- **Result**: Additional 12.7% increase in serial CPU budget (488ns → 550ns)
+- **Overall improvement**: 38.9% increase from baseline (396ns → 550ns)
+
 ### Request Flow
 
 ```
 I/O Threads (8) → HttpHandler::on_batch_complete() → ThreadPipeline
       ↑                                                   ↓
       |                                         Stage 0: Noop thread
-      |                                         (488ns serial CPU per request)
+      |                                         (550ns serial CPU per request)
+      |                                         (batch size: 1)
       |                                                   ↓
       |                                         Stage 1: Connection return
       |                                         (optimized futex wake)
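
For reference, the wait-loop pattern this patch extends is a bounded spin followed by a blocking wait: poll for new pushes, issue `_mm_pause()` on x86 to ease hyperthread contention and power draw while spinning, and only fall back to a blocking `std::atomic::wait` once the spin budget is exhausted. The sketch below is illustrative only; the names `spin_then_wait` and `kSpinIterations` are hypothetical and are not part of ThreadPipeline's API, which also re-checks `thread.local_pops` and uses a `goto dont_wait` fast path as shown in the hunk above.

```cpp
// Illustrative sketch, not the ThreadPipeline implementation: bounded spin
// with _mm_pause(), then a futex-backed wait via std::atomic::wait (C++20).
#include <atomic>
#include <cstdint>

#if defined(__x86_64__) || defined(_M_X64)
#include <emmintrin.h>
#endif

// Roughly the spin budget the report cites as necessary to keep the stage
// thread from being descheduled under load (hypothetical constant name).
constexpr int kSpinIterations = 100'000;

// Wait until `pushes` advances past `last_seen`; spin first, then block.
inline uint64_t spin_then_wait(std::atomic<uint64_t> &pushes,
                               uint64_t last_seen) {
  for (int i = 0; i < kSpinIterations; ++i) {
    uint64_t current = pushes.load(std::memory_order_acquire);
    if (current != last_seen) {
      return current;  // New work arrived while spinning.
    }
#if defined(__x86_64__) || defined(_M_X64)
    _mm_pause();  // Hint to the core that this is a spin-wait loop.
#endif
  }
  // Spin budget exhausted: block until a producer wakes this value.
  pushes.wait(last_seen, std::memory_order_relaxed);
  return pushes.load(std::memory_order_acquire);
}
```

The ordering matters: the pause only reduces the cost of the spin phase, while the final-stage-only futex wake described in the report keeps the blocking phase cheap when the pipeline goes idle.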