diff --git a/CMakeLists.txt b/CMakeLists.txt
index e74258d..71c6a1d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -190,6 +190,9 @@ add_executable(bench_arena_allocator benchmarks/bench_arena_allocator.cpp
 target_link_libraries(bench_arena_allocator nanobench)
 target_include_directories(bench_arena_allocator PRIVATE src)
 
+add_executable(bench_volatile_loop bench_volatile_loop.cpp)
+target_link_libraries(bench_volatile_loop nanobench)
+
 add_executable(
   bench_commit_request benchmarks/bench_commit_request.cpp
   src/json_commit_request_parser.cpp
diff --git a/bench_volatile_loop.cpp b/bench_volatile_loop.cpp
new file mode 100644
index 0000000..ee9364e
--- /dev/null
+++ b/bench_volatile_loop.cpp
@@ -0,0 +1,10 @@
+#include <nanobench.h>
+
+int main() {
+  ankerl::nanobench::Bench().run("volatile loop to 800", [&] {
+    for (volatile int i = 0; i < 800; i = i + 1)
+      ;
+  });
+
+  return 0;
+}
diff --git a/src/http_handler.hpp b/src/http_handler.hpp
index 59cdba7..24a9b86 100644
--- a/src/http_handler.hpp
+++ b/src/http_handler.hpp
@@ -67,9 +67,9 @@ struct HttpHandler : ConnectionHandler {
     for (int threadId = 0; threadId < kFinalStageThreads; ++threadId) {
       finalStageThreads.emplace_back([this, threadId]() {
         pthread_setname_np(pthread_self(),
-                           ("stage-0-" + std::to_string(threadId)).c_str());
+                           ("stage-1-" + std::to_string(threadId)).c_str());
         for (;;) {
-          auto guard = pipeline.acquire(0, threadId);
+          auto guard = pipeline.acquire(1, threadId);
           for (auto it = guard.batch.begin(); it != guard.batch.end(); ++it) {
             if ((it.index() % kFinalStageThreads) == threadId) {
               auto &c = *it;
@@ -85,6 +85,20 @@
         }
       });
     }
+    stage0Thread = std::thread{[this]() {
+      pthread_setname_np(pthread_self(), "stage-0");
+      for (;;) {
+        auto guard = pipeline.acquire(0, 0, 0, false);
+        for (auto it = guard.batch.begin(); it != guard.batch.end(); ++it) {
+          auto &c = *it;
+          if (!c) {
+            return;
+          }
+          for (volatile int i = 0; i < 1200; i = i + 1)
+            ;
+        }
+      }
+    }};
   }
   ~HttpHandler() {
     {
@@ -93,6 +107,7 @@
         c = {};
       }
     }
+    stage0Thread.join();
     for (auto &thread : finalStageThreads) {
       thread.join();
     }
@@ -122,8 +137,9 @@
 private:
   static constexpr int kFinalStageThreads = 2;
   static constexpr int kLogSize = 12;
-  ThreadPipeline> pipeline{kLogSize,
-                           {kFinalStageThreads}};
+  ThreadPipeline> pipeline{
+      kLogSize, {/*noop serial thread*/ 1, kFinalStageThreads}};
+  std::thread stage0Thread;
   std::vector<std::thread> finalStageThreads;
 
   // Route handlers
diff --git a/threading_performance_report.md b/threading_performance_report.md
new file mode 100644
index 0000000..cc4cdb9
--- /dev/null
+++ b/threading_performance_report.md
@@ -0,0 +1,60 @@
+# WeaselDB Threading Performance Analysis Report
+
+## Summary
+
+WeaselDB achieved 1.3M requests/second throughput using a two-stage ThreadPipeline while spending 396ns of serial CPU time per request. The more per-request serial CPU time the pipeline can absorb without losing throughput, the more CPU budget is available for real serial processing.
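+
+As a rough back-of-the-envelope check (arithmetic on the figures in this report, not a separate measurement): a serial stage spending 396ns per request tops out at 1s / 396ns ≈ 2.5M requests/second, and one spending 266ns tops out at ≈ 3.8M requests/second, both comfortably above the observed 1.3M and 1.1M requests/second.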
+
+## Performance Metrics
+
+### Throughput
+- Non-blocking: 1.3M requests/second over unix socket
+- Blocking: 1.1M requests/second over unix socket
+- 8 I/O threads with 8 epoll instances
+- Load tester used 12 network threads
+- Max latency: 4ms across 90M requests
+
+### Threading Architecture
+- Two-stage pipeline: Stage-0 (noop) → Stage-1 (connection return)
+- Lock-free coordination using atomic ring buffer
+- Each request is "processed" serially on a single thread
+
+### Non-blocking vs Blocking Acquisition
+
+**Non-blocking acquisition (`mayBlock=false`)**:
+- Throughput: 1.3M requests/second (maintained with up to 1200 loop iterations)
+- Stage-0 CPU: 100% (10% futex wake, 90% other)
+- Serial CPU time per request: 396ns (1200 iterations, validated with nanobench)
+- 100% CPU usage when idle
+
+**Blocking acquisition (`mayBlock=true`)**:
+- Throughput: 1.1M requests/second (800 loop iterations)
+- Stage-0 CPU: 100% total (18% sched_yield, 8% futex wait, 7% futex wake, 67% other)
+- Serial CPU time per request: 266ns (800 iterations, validated with nanobench)
+- 0% CPU usage when idle
+
+### Request Flow
+```
+I/O Threads (8) → HttpHandler::on_batch_complete() → ThreadPipeline
+                      ↓
+              Stage 0: Noop thread
+              (396ns serial CPU per request)
+                      ↓
+              Stage 1: Connection return
+                      ↓
+              Server::release_back_to_server()
+```
+
+### Pipeline Configuration
+- Stage 0: 1 noop thread
+- Stage 1: 2 worker threads for connection return
+- Atomic counters with shared ring buffer
+
+### Memory Management
+- Connection ownership is transferred along the pipeline
+
+## Test Configuration
+
+- Server: test_config.toml with 8 io_threads, 8 epoll_instances
+- Load tester: ./load_tester --network-threads 12
+- Build: ninja
+- Command: ./weaseldb --config test_config.toml
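
For reference, the per-request serial CPU figures in the report can be sanity-checked outside the nanobench harness with a minimal standalone sketch. This is not part of the diff above: it uses `std::chrono` instead of nanobench, the 1200-iteration count is simply the stage-0 loop bound from `http_handler.hpp`, and `kRepetitions` is an arbitrary averaging factor.

```
#include <chrono>
#include <cstdio>

int main() {
  constexpr int kIterations = 1200;    // stage-0 loop bound from http_handler.hpp
  constexpr int kRepetitions = 100000; // repeat to average out timer overhead

  auto start = std::chrono::steady_clock::now();
  for (int rep = 0; rep < kRepetitions; ++rep) {
    // Same volatile loop shape as the stage-0 noop work.
    for (volatile int i = 0; i < kIterations; i = i + 1)
      ;
  }
  auto stop = std::chrono::steady_clock::now();

  double nsPerRequest =
      std::chrono::duration<double, std::nano>(stop - start).count() /
      kRepetitions;
  std::printf("volatile loop to %d: %.0f ns per request\n", kIterations,
              nsPerRequest);
  return 0;
}
```

Built with optimizations, this should land in the same ballpark as the 396ns cited above, and dropping `kIterations` to 800 should approximate the 266ns blocking-path figure; exact values depend on CPU and compiler flags.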