Compare commits

...

55 Commits

Author SHA1 Message Date
f458c6b249 Make pipeline policy/topology configurable 2025-11-06 15:55:27 -05:00
9f8562e30f Fix histogram thread death bug 2025-09-18 12:39:47 -04:00
cb66d65479 Add a bit more precision to docs, plus philosophy 2025-09-16 00:57:20 -04:00
4d015fa3dc Make recording metrics never block 2025-09-15 23:34:30 -04:00
0659319906 Prepare for try_lock optimization
So histogram observations never block
2025-09-15 23:04:54 -04:00
6f421629aa Add comments to avoid blocking in sequence and resolve stages 2025-09-15 22:51:41 -04:00
ba59a992dd Document per-connection locking strategy 2025-09-15 22:33:52 -04:00
0d76c73077 Only shut down write side for http 2025-09-15 21:42:36 -04:00
4ecbc07367 Fix flaky connection shutdown test 2025-09-15 21:39:48 -04:00
5e625197aa Fix url accumulation bug 2025-09-15 21:36:49 -04:00
5dda7353fa Add test for url accumulation bug 2025-09-15 21:36:49 -04:00
345d8e21b2 Use WaitIfUpstreamIdle 2025-09-15 20:48:12 -04:00
917066d8c0 Move thread local state into stack 2025-09-15 20:33:45 -04:00
5a88047b9f Two release threads 2025-09-15 20:09:15 -04:00
1acdc1e753 Put Arena destructor and move constructors in header 2025-09-15 17:01:38 -04:00
ae0c014298 Don't clear write interest if pending_response_queue_ non empty 2025-09-15 16:35:37 -04:00
e2115152c8 Add tests for shutdown vs close 2025-09-15 15:48:10 -04:00
55f6ebc02b Implement shutting down the write-side only 2025-09-15 15:39:28 -04:00
6b52c4289c Prevent queueing of messages on connection after it will be closed 2025-09-15 15:25:40 -04:00
7ee5ca2a9b Remove dead code, use proper send_ordered_response
And prepare to try to close a connection gracefully
2025-09-15 15:08:19 -04:00
9120c05847 Use spend_cpu_cycles instead of volatile loop 2025-09-15 14:07:38 -04:00
528a467518 Make test names match binary names 2025-09-15 13:06:15 -04:00
a67d7a8531 Consolidate into one send_ordered_response 2025-09-15 13:01:56 -04:00
6717b70772 Remove HttpConnectionState::response_queue_mutex 2025-09-15 12:32:19 -04:00
34accb9d80 Move GetVersion to commit pipeline 2025-09-15 12:30:02 -04:00
f3c3f77a24 Extract commit pipeline to its own module 2025-09-15 11:51:01 -04:00
afd240dba7 Remove vestigial "round-robin" code 2025-09-15 11:22:14 -04:00
1cb7a4c301 Remove has_pending_responses_ 2025-09-15 11:10:47 -04:00
1b220d0d1c WIP 2025-09-15 10:28:17 -04:00
ec2ad27e33 Add explanatory comments 2025-09-15 00:07:44 -04:00
eb98e51867 We expect to get valid fds to close in ~Connection in ~Server 2025-09-15 00:07:09 -04:00
022a79bf5b Separate HttpRequestState and HttpConnectionState
Now HttpConnectionState has a queue of HttpRequestState
2025-09-14 23:49:32 -04:00
fac6b8de88 Add test that shows parsing issue
It's meant to show the pipelining issue. I guess we'll solve the
newly-discovered parsing issue first.
2025-09-14 22:24:02 -04:00
1f61f91bf5 Reset connection state after finishing with it in http_handler 2025-09-14 21:16:41 -04:00
632113f792 Add test for pipeline request parsing bug 2025-09-14 20:53:19 -04:00
f62770c4ab Add copying utility methods to Arena 2025-09-14 20:38:54 -04:00
147edf5c93 More cleanup 2025-09-14 20:27:14 -04:00
f39149d516 Update documentation with new networking model 2025-09-14 19:03:56 -04:00
0389fd2c9f Consistently use state->arena for http handling 2025-09-14 17:16:05 -04:00
7ef54a2d08 Call epoll_ctl in release stage 2025-09-14 16:28:12 -04:00
16c7ee0408 Separate Connection and Request lifetimes 2025-09-14 15:04:37 -04:00
cf0c1b7cc2 Add echo test for server 2025-09-14 12:56:22 -04:00
bd06798fd3 Remove test_http_handler and test_server_connection_return 2025-09-14 11:38:43 -04:00
e96a493835 Remove release_back_to_server 2025-09-14 09:03:05 -04:00
e887906da8 Remove some unused/indirectly used headers 2025-09-13 17:25:46 -04:00
de6f38694f std::unique_ptr<Connection> -> Ref<Connection> 2025-09-13 17:25:46 -04:00
1fa3381e4b Use send/sendmsg and don't ignore SIGPIPE 2025-09-13 17:25:20 -04:00
cd2e15677a Remove epoll instances config 2025-09-12 18:05:07 -04:00
2b8f095d27 Fix minor issues 2025-09-12 12:13:50 -04:00
543447971f Fix polymorphic WeakRef bug 2025-09-12 12:08:46 -04:00
f89868058a Require explicit copies for Ref/WeakRef 2025-09-12 11:59:56 -04:00
674ff581e7 Update comments/docs to match code 2025-09-12 11:40:38 -04:00
be5a0c6d8e Update some inaccuracies in markdown files 2025-09-12 11:31:22 -04:00
bf90b8856a Add mdformat pre-commit hook 2025-09-12 11:24:16 -04:00
9d48caca76 add end-of-file-fixer 2025-09-12 11:21:00 -04:00
56 changed files with 3463 additions and 2427 deletions

View File

@@ -3,11 +3,13 @@ repos:
rev: 3e8a8703264a2f4a69428a0aa4dcb512790b2c8c # frozen: v6.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
exclude: ".*third_party/.*"
- id: check-added-large-files
- id: check-merge-conflict
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: 182152eb8c5ce1cf5299b956b04392c86bd8a126 # frozen: v20.1.8
rev: 86fdcc9bd34d6afbbd29358b97436c8ffe3aa3b2 # frozen: v21.1.0
hooks:
- id: clang-format
exclude: ".*third_party/.*"
@@ -23,6 +25,11 @@ repos:
- id: black
language_version: python3
- repo: https://github.com/executablebooks/mdformat
rev: ff29be1a1ba8029d9375882aa2c812b62112a593 # frozen: 0.7.22
hooks:
- id: mdformat
- repo: local
hooks:
- id: snake-case-enforcement

View File

@@ -139,6 +139,7 @@ target_link_libraries(nanobench_impl PUBLIC nanobench)
# Define all source files in one place
set(WEASELDB_SOURCES
src/arena.cpp
src/commit_pipeline.cpp
src/cpu_work.cpp
src/format.cpp
src/metric.cpp
@@ -188,6 +189,11 @@ add_executable(test_arena tests/test_arena.cpp)
target_link_libraries(test_arena doctest_impl weaseldb_sources_debug)
target_compile_options(test_arena PRIVATE -UNDEBUG)
add_executable(test_server tests/test_server.cpp)
target_link_libraries(test_server doctest_impl weaseldb_sources_debug)
target_compile_options(test_server PRIVATE -UNDEBUG)
add_test(NAME test_server COMMAND test_server)
add_executable(
test_commit_request
tests/test_commit_request.cpp tests/nlohmann_reference_parser.cpp
@@ -197,23 +203,19 @@ target_link_libraries(test_commit_request doctest_impl weaseldb_sources_debug
target_include_directories(test_commit_request PRIVATE tests)
target_compile_options(test_commit_request PRIVATE -UNDEBUG)
add_executable(test_http_handler tests/test_http_handler.cpp)
target_link_libraries(test_http_handler doctest_impl weaseldb_sources_debug)
target_compile_options(test_http_handler PRIVATE -UNDEBUG)
add_executable(test_server_connection_return
tests/test_server_connection_return.cpp)
target_link_libraries(test_server_connection_return doctest_impl
weaseldb_sources_debug)
target_compile_options(test_server_connection_return PRIVATE -UNDEBUG)
# Metrics system test
add_executable(test_metric tests/test_metric.cpp)
target_link_libraries(test_metric doctest_impl weaseldb_sources_debug)
target_compile_options(test_metric PRIVATE -UNDEBUG)
# HTTP handler test
add_executable(test_http_handler tests/test_http_handler.cpp)
target_link_libraries(test_http_handler doctest_impl weaseldb_sources_debug)
target_compile_options(test_http_handler PRIVATE -UNDEBUG)
add_test(NAME test_http_handler COMMAND test_http_handler)
# Register with CTest
add_test(NAME metric_tests COMMAND test_metric)
add_test(NAME test_metric COMMAND test_metric)
add_executable(bench_arena benchmarks/bench_arena.cpp)
target_link_libraries(bench_arena nanobench_impl weaseldb_sources)
@@ -231,7 +233,8 @@ target_link_libraries(bench_parser_comparison nanobench_impl weaseldb_sources
target_include_directories(bench_parser_comparison
PRIVATE ${rapidjson_SOURCE_DIR}/include)
add_executable(bench_thread_pipeline benchmarks/bench_thread_pipeline.cpp)
add_executable(bench_thread_pipeline benchmarks/bench_thread_pipeline.cpp
src/cpu_work.cpp)
target_link_libraries(bench_thread_pipeline nanobench_impl Threads::Threads)
target_include_directories(bench_thread_pipeline PRIVATE src)
@@ -253,11 +256,8 @@ target_link_libraries(debug_arena weaseldb_sources)
add_executable(load_tester tools/load_tester.cpp)
target_link_libraries(load_tester Threads::Threads llhttp_static perfetto)
add_test(NAME arena_tests COMMAND test_arena)
add_test(NAME commit_request_tests COMMAND test_commit_request)
add_test(NAME http_handler_tests COMMAND test_http_handler)
add_test(NAME server_connection_return_tests
COMMAND test_server_connection_return)
add_test(NAME test_arena COMMAND test_arena)
add_test(NAME test_commit_request COMMAND test_commit_request)
add_test(NAME arena_benchmarks COMMAND bench_arena)
add_test(NAME commit_request_benchmarks COMMAND bench_commit_request)
add_test(NAME parser_comparison_benchmarks COMMAND bench_parser_comparison)
@@ -267,14 +267,14 @@ add_test(NAME format_comparison_benchmarks COMMAND bench_format_comparison)
add_executable(test_api_url_parser tests/test_api_url_parser.cpp)
target_link_libraries(test_api_url_parser doctest_impl weaseldb_sources_debug)
target_compile_options(test_api_url_parser PRIVATE -UNDEBUG)
add_test(NAME api_url_parser_tests COMMAND test_api_url_parser)
add_test(NAME test_api_url_parser COMMAND test_api_url_parser)
# Reference counting tests and benchmarks
add_executable(test_reference tests/test_reference.cpp)
target_link_libraries(test_reference doctest_impl)
target_include_directories(test_reference PRIVATE src)
target_compile_options(test_reference PRIVATE -UNDEBUG)
add_test(NAME reference_tests COMMAND test_reference)
add_test(NAME test_reference COMMAND test_reference)
add_executable(bench_reference benchmarks/bench_reference.cpp)
target_link_libraries(bench_reference doctest_impl nanobench_impl

52
api.md
View File

@@ -2,7 +2,7 @@
> **Note:** This is a design for the API of the write-side of a database system where writing and reading are decoupled. The read-side of the system is expected to use the `/v1/subscribe` endpoint to maintain a queryable representation of the key-value data. In other words, reading from this "database" is left as an exercise for the reader. Authentication and authorization are out of scope for this design.
-----
______________________________________________________________________
## `GET /v1/version`
@@ -20,16 +20,16 @@ Retrieves the latest known committed version and the current leader.
}
```
-----
______________________________________________________________________
## `POST /v1/commit`
Submits a transaction to be committed. The transaction consists of read preconditions, writes, and deletes.
* Clients may receive a **`413 Content Too Large`** response if the request exceeds a configurable limit.
* A malformed request will result in a **`400 Bad Request`** response.
* Keys are sorted by a lexicographical comparison of their raw byte values.
* All binary data for keys and values must be encoded using the standard base64 scheme defined in [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-4), with padding included.
- Clients may receive a **`413 Content Too Large`** response if the request exceeds a configurable limit.
- A malformed request will result in a **`400 Bad Request`** response.
- Keys are sorted by a lexicographical comparison of their raw byte values.
- All binary data for keys and values must be encoded using the standard base64 scheme defined in [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-4), with padding included.
### Request
@@ -91,7 +91,9 @@ Submits a transaction to be committed. The transaction consists of read precondi
// If not committed, a more recent version that the client can use to retry.
"version": 123456,
// The unique ID of the leader at this version.
"leader_id": "abcdefg"
"leader_id": "abcdefg",
// Echo back the request_id if it was provided in the original request
"request_id": "abcdefg"
}
```
@@ -99,16 +101,16 @@ Submits a transaction to be committed. The transaction consists of read precondi
1. **`request_id`**: Optional field that can be used with `/v1/status` to determine the outcome if no reply is received. If omitted, a UUID will be automatically generated by the server, and clients will not be able to determine commit status if there's no response. When provided, the request_id must meet the minimum length requirement (configurable, default 20 characters) to ensure sufficient entropy for collision avoidance. This ID must not be reused in a commit request. For idempotency, if a response is not received, the client must use `/v1/status` to determine the request's outcome. The original `request_id` should not be reused for a new commit attempt; instead, a retry should be sent with a new `request_id`. The alternative design would require the leader to store every request ID in memory.
2. **`preconditions` (Guarantees and Usage)**: The condition is satisfied if the server verifies that the range has not changed since the specified version. Clients can achieve serializable isolation by including all reads that influenced their writes. By default, clients should assume that any read they perform influences their writes. Omitting reads is an expert-level optimization and should generally be avoided.
1. **`preconditions` (Guarantees and Usage)**: The condition is satisfied if the server verifies that the range has not changed since the specified version. Clients can achieve serializable isolation by including all reads that influenced their writes. By default, clients should assume that any read they perform influences their writes. Omitting reads is an expert-level optimization and should generally be avoided.
3. **`preconditions` (False Positives & Leader Changes)**: Precondition checks are conservative and best-effort; it's possible to reject a transaction where the range hasn't actually changed. In all such cases, clients should retry with a more recent read version. Two examples of false positives are:
1. **`preconditions` (False Positives & Leader Changes)**: Precondition checks are conservative and best-effort; it's possible to reject a transaction where the range hasn't actually changed. In all such cases, clients should retry with a more recent read version. Two examples of false positives are:
* **Implementation Detail:** The leader may use partitioned conflict history for performance. A conflict in one partition (even from a transaction that later aborts) can cause a rejection.
* **Leader Changes:** A version is only valid within the term of the leader that issued it. Since conflict history is stored in memory, a leadership change invalidates all previously issued read versions. Any transaction using such a version will be rejected.
- **Implementation Detail:** The leader may use partitioned conflict history for performance. A conflict in one partition (even from a transaction that later aborts) can cause a rejection.
- **Leader Changes:** A version is only valid within the term of the leader that issued it. Since conflict history is stored in memory, a leadership change invalidates all previously issued read versions. Any transaction using such a version will be rejected.
The versions in the precondition checks need not be the same.
-----
______________________________________________________________________
## `GET /v1/status`
@@ -125,7 +127,7 @@ Gets the status of a previous commit request by its `request_id`.
| `request_id` | string | Yes | The `request_id` from the original `/v1/commit` request. |
| `min_version` | integer | Yes | An optimization that constrains the log scan. This value should be the latest version the client knew to be committed *before* sending the original request. |
> **Warning\!** If the provided `min_version` is later than the transaction's actual commit version, the server might not find the record in the scanned portion of the log. This can result in an `id_not_found` status, even if the transaction actually committed.
> **Warning!** If the provided `min_version` is later than the transaction's actual commit version, the server might not find the record in the scanned portion of the log. This can result in an `id_not_found` status, even if the transaction actually committed.
### Response
@@ -144,7 +146,7 @@ A response from this endpoint guarantees the original request is no longer in fl
> **Note on `log_truncated` status:** This indicates the `request_id` log has been truncated after `min_version`, making it impossible to determine the original request's outcome. There is no way to avoid this without storing an arbitrarily large number of request IDs. Clients must treat this as an indeterminate outcome. Retrying the transaction is unsafe unless the client has an external method to verify the original transaction's status. This error should be propagated to the caller. `request_id`s are retained for a configurable minimum time and number of versions so this should be extremely rare.
-----
______________________________________________________________________
## `GET /v1/subscribe`
@@ -169,7 +171,7 @@ The response is a stream of events compliant with the SSE protocol.
```
event: transaction
data: {"request_id":"abcdefg","version":123456,"timestamp":"2025-08-07T20:27:42.555Z","leader_id":"abcdefg","operations":[...]}
data: {"request_id":"abcdefg","version":123456,"prev_version":123455,"timestamp":"2025-08-07T20:27:42.555Z","leader_id":"abcdefg","operations":[...]}
```
@@ -192,9 +194,11 @@ data: {"committed_version":123456,"leader_id":"abcdefg"}
1. **Data Guarantees**: When `durable=false`, this endpoint streams *accepted*, but not necessarily *durable/committed*, transactions. *Accepted* transactions will eventually commit unless the current leader changes.
2. **Leader Changes & Reconnection**: When `durable=false`, if the leader changes, clients **must** discard all of that leader's `transaction` events received after their last-seen `checkpoint` event. They must then manually reconnect (as the server connection will likely be terminated) and restart the subscription by setting the `after` query parameter to the version specified in that last-known checkpoint. Clients should implement a randomized exponential backoff strategy (backoff with jitter) when reconnecting.
1. **Leader Changes & Reconnection**: When `durable=false`, if the leader changes, clients **must** discard all of that leader's `transaction` events received after their last-seen `checkpoint` event. They must then manually reconnect (as the server connection will likely be terminated) and restart the subscription by setting the `after` query parameter to the version specified in that last-known checkpoint. Clients should implement a randomized exponential backoff strategy (backoff with jitter) when reconnecting.
3. **Connection Handling & Errors**: The server may periodically send `keepalive` comments to prevent idle timeouts on network proxies. The server will buffer unconsumed data up to a configurable limit; if the client falls too far behind, the connection will be closed. If the `after` version has been truncated from the log, this endpoint will return a standard `410 Gone` HTTP error instead of an event stream.
1. **Gap Detection**: Each `transaction` event includes a `prev_version` field linking to the previous transaction version, forming a linked list. Clients can detect gaps in the transaction stream by checking that each transaction's `prev_version` matches the previous transaction's `version`. This ensures gapless transitions between historical data from S3 and live events from the server.
1. **Connection Handling & Errors**: The server may periodically send `keepalive` comments to prevent idle timeouts on network proxies. The server will buffer unconsumed data up to a configurable limit; if the client falls too far behind, the connection will be closed. If the `after` version has been truncated from the log, this endpoint will return a standard `410 Gone` HTTP error instead of an event stream.
## `PUT /v1/retention/<policy_id>`
@@ -211,10 +215,10 @@ Creates or updates a retention policy.
### Response
* `201 Created` if the policy was created.
* `200 OK` if the policy was updated.
- `201 Created` if the policy was created.
- `200 OK` if the policy was updated.
-----
______________________________________________________________________
## `GET /v1/retention/<policy_id>`
@@ -228,7 +232,7 @@ Retrieves a retention policy by ID.
}
```
-----
______________________________________________________________________
## `GET /v1/retention/`
@@ -245,7 +249,7 @@ Retrieves all retention policies.
]
```
-----
______________________________________________________________________
## `DELETE /v1/retention/<policy_id>`
@@ -255,7 +259,7 @@ Removes a retention policy, which may allow the log to be truncated.
`204 No Content`
-----
______________________________________________________________________
## `GET /ok`
@@ -265,7 +269,7 @@ Simple health check endpoint.
Returns `200 OK` with minimal content for basic health monitoring.
-----
______________________________________________________________________
## `GET /metrics`

View File

@@ -76,13 +76,6 @@ private:
Precondition current_precondition;
Operation current_operation;
// Helper to store string in arena and return string_view
std::string_view store_string(const char *str, size_t length) {
char *stored = arena.allocate<char>(length);
std::memcpy(stored, str, length);
return std::string_view(stored, length);
}
public:
explicit CommitRequestArenaHandler()
: preconditions(ArenaStlAllocator<Precondition>(&arena)),
@@ -109,7 +102,7 @@ public:
bool RawNumber(const char *, rapidjson::SizeType, bool) { abort(); }
bool String(const char *str, rapidjson::SizeType length, bool) {
std::string_view value = store_string(str, length);
std::string_view value = arena.copy_string({str, length});
if (state == State::Root) {
if (current_key == "request_id") {

View File

@@ -27,6 +27,18 @@ template <typename T> struct PointerTraits<std::shared_ptr<T>> {
return std::make_shared<T>(std::forward<Args>(args)...);
}
static pointer_type copy(const pointer_type &ptr) {
return ptr; // std::shared_ptr copies implicitly
}
static weak_type as_weak(const pointer_type &ptr) {
return ptr; // std::weak_ptr converts implicitly from std::shared_ptr
}
static weak_type copy_weak(const weak_type &weak) {
return weak; // std::weak_ptr copies implicitly
}
static const char *name() { return "std::shared_ptr"; }
static const char *weak_name() { return "std::weak_ptr"; }
};
@@ -39,6 +51,18 @@ template <typename T> struct PointerTraits<Ref<T>> {
return make_ref<T>(std::forward<Args>(args)...);
}
static pointer_type copy(const pointer_type &ptr) {
return ptr.copy(); // Ref requires explicit copy
}
static weak_type as_weak(const pointer_type &ptr) {
return ptr.as_weak(); // Ref requires explicit as_weak
}
static weak_type copy_weak(const weak_type &weak) {
return weak.copy(); // WeakRef requires explicit copy
}
static const char *name() { return "Ref"; }
static const char *weak_name() { return "WeakRef"; }
};
@@ -67,7 +91,7 @@ void benchmark_copy(ankerl::nanobench::Bench &bench) {
auto original = Traits::make(TestObject{123});
bench.run(std::string(Traits::name()) + " copy", [&] {
auto copy = original;
auto copy = Traits::copy(original);
ankerl::nanobench::doNotOptimizeAway(copy);
});
}
@@ -91,9 +115,9 @@ void benchmark_weak_copy(ankerl::nanobench::Bench &bench) {
force_multithreaded();
auto strong_ptr = Traits::make(TestObject{123});
typename Traits::weak_type weak_original = strong_ptr;
typename Traits::weak_type weak_original = Traits::as_weak(strong_ptr);
bench.run(std::string(Traits::weak_name()) + " copy", [&] {
auto weak_copy = weak_original;
auto weak_copy = Traits::copy_weak(weak_original);
ankerl::nanobench::doNotOptimizeAway(weak_copy);
});
}
@@ -103,7 +127,7 @@ void benchmark_weak_move(ankerl::nanobench::Bench &bench) {
using Traits = PointerTraits<PtrType>;
auto strong_ptr = Traits::make(TestObject{123});
typename Traits::weak_type weak_original = strong_ptr;
typename Traits::weak_type weak_original = Traits::as_weak(strong_ptr);
bench.run(std::string(Traits::weak_name()) + " move", [&] {
auto weak_moved = std::move(weak_original);
ankerl::nanobench::doNotOptimizeAway(weak_moved);
@@ -126,7 +150,7 @@ void benchmark_weak_lock_success(ankerl::nanobench::Bench &bench) {
using Traits = PointerTraits<PtrType>;
auto strong_ptr = Traits::make(TestObject{789});
typename Traits::weak_type weak_ptr = strong_ptr;
typename Traits::weak_type weak_ptr = Traits::as_weak(strong_ptr);
bench.run(std::string(Traits::weak_name()) + " lock success", [&] {
auto locked = weak_ptr.lock();
ankerl::nanobench::doNotOptimizeAway(locked);
@@ -140,7 +164,7 @@ void benchmark_weak_lock_failure(ankerl::nanobench::Bench &bench) {
typename Traits::weak_type weak_ptr;
{
auto strong_ptr = Traits::make(TestObject{999});
weak_ptr = strong_ptr;
weak_ptr = Traits::as_weak(strong_ptr);
}
bench.run(std::string(Traits::weak_name()) + " lock failure", [&] {
auto locked = weak_ptr.lock();
@@ -163,7 +187,7 @@ void benchmark_multithreaded_copy(ankerl::nanobench::Bench &bench,
for (int i = 0; i < num_threads - 1; ++i) {
background_threads.emplace_back([&]() {
while (keep_running.load(std::memory_order_relaxed)) {
auto copy = ptr;
auto copy = Traits::copy(ptr);
ankerl::nanobench::doNotOptimizeAway(copy);
}
});
@@ -171,7 +195,7 @@ void benchmark_multithreaded_copy(ankerl::nanobench::Bench &bench,
// Benchmark the foreground thread under contention
bench.run(std::string(Traits::name()) + " copy under contention", [&] {
auto copy = ptr;
auto copy = Traits::copy(ptr);
ankerl::nanobench::doNotOptimizeAway(copy);
});
@@ -189,7 +213,7 @@ void benchmark_multithreaded_weak_lock(ankerl::nanobench::Bench &bench,
// Create the shared object and weak reference outside the benchmark
auto strong_ptr = Traits::make(TestObject{789});
typename Traits::weak_type weak_ptr = strong_ptr;
typename Traits::weak_type weak_ptr = Traits::as_weak(strong_ptr);
// Create background threads that will create contention
std::atomic<bool> keep_running{true};
@@ -224,7 +248,7 @@ void benchmark_weak_copy_with_strong_contention(ankerl::nanobench::Bench &bench,
// Create the shared object and weak reference outside the benchmark
auto strong_ptr = Traits::make(TestObject{456});
typename Traits::weak_type weak_ptr = strong_ptr;
typename Traits::weak_type weak_ptr = Traits::as_weak(strong_ptr);
// Create background threads copying the strong pointer
std::atomic<bool> keep_running{true};
@@ -233,7 +257,7 @@ void benchmark_weak_copy_with_strong_contention(ankerl::nanobench::Bench &bench,
for (int i = 0; i < num_threads - 1; ++i) {
background_threads.emplace_back([&]() {
while (keep_running.load(std::memory_order_relaxed)) {
auto copy = strong_ptr;
auto copy = Traits::copy(strong_ptr);
ankerl::nanobench::doNotOptimizeAway(copy);
}
});
@@ -242,7 +266,7 @@ void benchmark_weak_copy_with_strong_contention(ankerl::nanobench::Bench &bench,
// Benchmark weak reference copying under strong reference contention
bench.run(std::string(Traits::weak_name()) + " copy with strong contention",
[&] {
auto weak_copy = weak_ptr;
auto weak_copy = Traits::copy_weak(weak_ptr);
ankerl::nanobench::doNotOptimizeAway(weak_copy);
});
@@ -260,7 +284,7 @@ void benchmark_strong_copy_with_weak_contention(ankerl::nanobench::Bench &bench,
// Create the shared object and weak reference outside the benchmark
auto strong_ptr = Traits::make(TestObject{789});
typename Traits::weak_type weak_ptr = strong_ptr;
typename Traits::weak_type weak_ptr = Traits::as_weak(strong_ptr);
// Create background threads copying the weak pointer
std::atomic<bool> keep_running{true};
@@ -269,7 +293,7 @@ void benchmark_strong_copy_with_weak_contention(ankerl::nanobench::Bench &bench,
for (int i = 0; i < num_threads - 1; ++i) {
background_threads.emplace_back([&]() {
while (keep_running.load(std::memory_order_relaxed)) {
auto weak_copy = weak_ptr;
auto weak_copy = Traits::copy_weak(weak_ptr);
ankerl::nanobench::doNotOptimizeAway(weak_copy);
}
});
@@ -277,7 +301,7 @@ void benchmark_strong_copy_with_weak_contention(ankerl::nanobench::Bench &bench,
// Benchmark strong reference copying under weak reference contention
bench.run(std::string(Traits::name()) + " copy with weak contention", [&] {
auto strong_copy = strong_ptr;
auto strong_copy = Traits::copy(strong_ptr);
ankerl::nanobench::doNotOptimizeAway(strong_copy);
});

View File

@@ -1,3 +1,4 @@
#include "cpu_work.hpp"
#include "thread_pipeline.hpp"
#include <latch>
@@ -19,24 +20,22 @@ int main() {
.warmup(100);
bench.run("Zero stage pipeline", [&] {
for (int i = 0; i < NUM_ITEMS; ++i) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) {
}
spend_cpu_cycles(BUSY_ITERS);
}
});
StaticThreadPipeline<std::latch *, WaitStrategy::WaitIfStageEmpty, 1>
pipeline(LOG_PIPELINE_SIZE);
ThreadPipeline<std::latch *> pipeline(WaitStrategy::WaitIfStageEmpty, {1},
LOG_PIPELINE_SIZE);
std::latch done{0};
// Stage 0 consumer thread
std::thread stage0_thread([&pipeline, &done]() {
for (;;) {
auto guard = pipeline.acquire<0, 0>();
auto guard = pipeline.acquire(0, 0);
for (auto &item : guard.batch) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) {
}
spend_cpu_cycles(BUSY_ITERS);
if (item == &done) {
return;
}
@@ -90,19 +89,18 @@ int main() {
.warmup(100);
for (int batch_size : {1, 4, 16, 64, 256}) {
StaticThreadPipeline<std::latch *, WaitStrategy::WaitIfStageEmpty, 1>
pipeline(LOG_PIPELINE_SIZE);
ThreadPipeline<std::latch *> pipeline(WaitStrategy::WaitIfStageEmpty, {1},
LOG_PIPELINE_SIZE);
std::latch done{0};
// Stage 0 consumer thread
std::thread stage0_thread([&pipeline, &done]() {
for (;;) {
auto guard = pipeline.acquire<0, 0>();
auto guard = pipeline.acquire(0, 0);
for (auto &item : guard.batch) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) {
}
spend_cpu_cycles(BUSY_ITERS);
if (item == &done) {
return;
}
@@ -144,8 +142,8 @@ int main() {
}
// Helper function for wait strategy benchmarks
auto benchmark_wait_strategy =
[]<WaitStrategy strategy>(const std::string &name,
auto benchmark_wait_strategy = [](WaitStrategy strategy,
const std::string &name,
ankerl::nanobench::Bench &bench) {
constexpr int LOG_PIPELINE_SIZE =
8; // Smaller buffer to increase contention
@@ -154,18 +152,16 @@ int main() {
constexpr int BUSY_ITERS =
10; // Light work to emphasize coordination overhead
StaticThreadPipeline<std::latch *, strategy, 1, 1> pipeline(
LOG_PIPELINE_SIZE);
ThreadPipeline<std::latch *> pipeline(strategy, {1, 1}, LOG_PIPELINE_SIZE);
std::latch done{0};
// Stage 0 worker
std::thread stage0_thread([&pipeline, &done]() {
for (;;) {
auto guard = pipeline.template acquire<0, 0>();
auto guard = pipeline.acquire(0, 0);
for (auto &item : guard.batch) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) {
}
spend_cpu_cycles(BUSY_ITERS);
if (item == &done)
return;
}
@@ -175,10 +171,9 @@ int main() {
// Stage 1 worker (final stage - always calls futex wake)
std::thread stage1_thread([&pipeline, &done]() {
for (;;) {
auto guard = pipeline.template acquire<1, 0>();
auto guard = pipeline.acquire(1, 0);
for (auto &item : guard.batch) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) {
}
spend_cpu_cycles(BUSY_ITERS);
if (item == &done)
return;
if (item)
@@ -224,12 +219,11 @@ int main() {
.relative(true)
.warmup(50);
benchmark_wait_strategy.template operator()<WaitStrategy::WaitIfStageEmpty>(
"WaitIfStageEmpty", bench);
benchmark_wait_strategy.template
operator()<WaitStrategy::WaitIfUpstreamIdle>("WaitIfUpstreamIdle", bench);
benchmark_wait_strategy.template operator()<WaitStrategy::Never>("Never",
benchmark_wait_strategy(WaitStrategy::WaitIfStageEmpty, "WaitIfStageEmpty",
bench);
benchmark_wait_strategy(WaitStrategy::WaitIfUpstreamIdle,
"WaitIfUpstreamIdle", bench);
benchmark_wait_strategy(WaitStrategy::Never, "Never", bench);
}
// TODO: Add more benchmarks for:

View File

@@ -15,10 +15,10 @@ HTTP I/O Threads → [Sequence] → [Resolve] → [Persist] → [Release] → HT
### Pipeline Flow
1. **HTTP I/O Threads**: Parse and validate incoming commit requests
2. **Sequence Stage**: Assign sequential version numbers to commits
3. **Resolve Stage**: Validate preconditions and check for conflicts
4. **Persist Stage**: Write commits to durable storage and notify subscribers
5. **Release Stage**: Return connections to HTTP I/O threads for response handling
1. **Sequence Stage**: Assign sequential version numbers to commits
1. **Resolve Stage**: Validate preconditions and check for conflicts
1. **Persist Stage**: Write commits to durable storage and notify subscribers
1. **Release Stage**: Return connections to HTTP I/O threads for response handling
## Stage Details
@@ -29,21 +29,25 @@ HTTP I/O Threads → [Sequence] → [Resolve] → [Persist] → [Release] → HT
**Serialization**: **Required** - Must be single-threaded
**Responsibilities**:
- **For CommitEntry**: Check request_id against banned list, assign sequential version number if not banned, forward to resolve stage
- **For StatusEntry**: Add request_id to banned list, note current highest assigned version as upper bound, transfer connection to status threadpool
- **For StatusEntry**: Add request_id to banned list, note current highest assigned version as upper bound for version range scanning
- Record version assignments for transaction tracking
**Why Serialization is Required**:
- Version numbers must be strictly sequential without gaps
- Banned list updates must be atomic with version assignment
- Status requests must get accurate upper bound on potential commit versions
**Request ID Banned List**:
- Purpose: Ensure banned transactions are no longer in-flight and establish version upper bounds for status queries
- Lifecycle: Grows indefinitely until process restart (leader change)
- Removal: Only on process restart/leader change, which invalidates all old request IDs
**Current Implementation**:
```cpp
bool HttpHandler::process_sequence_batch(BatchType &batch) {
for (auto &entry : batch) {
@@ -64,20 +68,24 @@ bool HttpHandler::process_sequence_batch(BatchType &batch) {
**Serialization**: **Required** - Must be single-threaded
**Responsibilities**:
- **For CommitEntry**: Check preconditions against in-memory recent writes set, add writes to recent writes set if accepted
- **For StatusEntry**: N/A (transferred to status threadpool after sequence stage)
- Mark failed commits with failure information (including which preconditions failed)
**Why Serialization is Required**:
- Must maintain consistent view of in-memory recent writes set
- Conflict detection requires atomic evaluation of all preconditions against recent writes
- Recent writes set updates must be synchronized
**Transaction State Transitions**:
- **Assigned Version** (from sequence) → **Semi-committed** (resolve accepts) → **Committed** (persist completes)
- Failed transactions continue through pipeline with failure information for client response
**Current Implementation**:
```cpp
bool HttpHandler::process_resolve_batch(BatchType &batch) {
// TODO: Implement precondition resolution logic:
@@ -95,55 +103,62 @@ bool HttpHandler::process_resolve_batch(BatchType &batch) {
**Serialization**: **Required** - Must mark batches durable in order
**Responsibilities**:
- **For CommitEntry**: Apply operations to persistent storage, update committed version high water mark
- **For StatusEntry**: N/A (transferred to status threadpool after sequence stage)
- **For CommitEntry**: Apply operations to persistent storage, update committed version high water mark, generate success response JSON
- **For StatusEntry**: N/A (empty husk, connection transferred to status threadpool after sequence stage)
- Generate durability events for `/v1/subscribe` when committed version advances
- Batch multiple commits for efficient persistence operations
**Why Serialization is Required**:
- Batches must be marked durable in sequential version order
- High water mark updates must reflect strict ordering of committed versions
- Ensures consistency guarantees across all endpoints
**Committed Version High Water Mark**:
- Global atomic value tracking highest durably committed version
- Updated after each batch commits: set to highest version in the batch
- Read by `/v1/version` endpoint using atomic seq_cst reads
- Enables `/v1/subscribe` durability events when high water mark advances
**Batching Strategy**:
- Multiple semi-committed transactions can be persisted in a single batch
- High water mark updated once per batch to highest version in that batch
- See `persistence.md` for detailed persistence design
**Current Implementation**:
```cpp
bool HttpHandler::process_persist_batch(BatchType &batch) {
// TODO: Implement actual persistence logic:
// 1. For CommitEntry: Apply operations to persistent storage
// 2. Update committed version high water mark to highest version in batch
// 3. Generate durability events for /v1/subscribe
// 4. For StatusEntry: N/A (already transferred to status threadpool)
// For CommitEntry: Apply operations to persistent storage, update high water mark, generate response JSON
// For StatusEntry: N/A (empty husk, connection transferred to status threadpool)
// Generate durability events for /v1/subscribe when committed version advances
// Semi-committed transactions are retried until durable or leader fails
}
```
### Stage 3: Connection Release
**Thread**: `txn-release`
**Threads**: Multiple `txn-release` threads (configurable)
**Purpose**: Return connections to HTTP server for client response
**Serialization**: Not required - Independent connection handling
**Responsibilities**:
- Return processed connections to HTTP server for all request types
- Connection carries response data (success/failure) and status information
- Trigger response transmission to clients
**Response Handling**:
- **CommitRequests**: Response generated by persist stage (success with version, or failure with conflicting preconditions)
- **StatusRequests**: Response generated by separate status lookup logic (not part of pipeline)
- **CommitRequests**: Response JSON generated by persist stage (success with version, or failure with conflicting preconditions from resolve stage)
- **StatusRequests**: Response generated by separate status threadpool (connection transferred after sequence stage)
- Failed transactions carry failure information through entire pipeline for proper client response
**Implementation**:
```cpp
bool HttpHandler::process_release_batch(BatchType &batch) {
// Stage 3: Connection release
@@ -151,8 +166,9 @@ bool HttpHandler::process_release_batch(BatchType &batch) {
if (!conn) {
return true; // Shutdown signal
}
// Return connection to server for further processing or cleanup
Server::release_back_to_server(std::move(conn));
// Connection is server-owned - respond to client and connection
// remains managed by server's connection registry
// TODO: Implement response sending with new server-owned connection model
}
return false; // Continue processing
}
@@ -164,12 +180,12 @@ bool HttpHandler::process_release_batch(BatchType &batch) {
```cpp
// 4-stage pipeline: sequence -> resolve -> persist -> release
// TODO: Update pipeline type from std::unique_ptr<Connection> to PipelineEntry variant
// Pipeline with PipelineEntry variant instead of connection ownership transfer
StaticThreadPipeline<PipelineEntry, // Was: std::unique_ptr<Connection>
WaitStrategy::WaitIfUpstreamIdle, 1, 1, 1, 1>
commitPipeline{lg_size};
// Pipeline entry type (to be implemented)
// Pipeline entry type for server-owned connection model
using PipelineEntry = std::variant<CommitEntry, StatusEntry, ShutdownEntry>;
```
@@ -212,7 +228,7 @@ for (auto &conn : guard.batch) {
Commit requests enter the pipeline via `HttpHandler::on_batch_complete()`:
```cpp
void HttpHandler::on_batch_complete(std::span<std::unique_ptr<Connection>> batch) {
void HttpHandler::on_batch_complete(std::span<Connection*> batch) {
// Collect commit requests that passed basic validation for 4-stage pipeline processing
int commit_count = 0;
for (auto &conn : batch) {
@@ -237,7 +253,10 @@ void HttpHandler::on_batch_complete(std::span<std::unique_ptr<Connection>> batch
### Backpressure Handling
The pipeline implements natural backpressure:
- Each stage blocks if downstream stages are full
- Fixed-size pipeline buffer causes I/O threads to block when pipeline is full
- This prevents unbounded memory growth under high load
- Blocking I/O threads may reduce the accept() rate, but this provides system-wide flow control
- `WaitIfUpstreamIdle` strategy balances latency vs throughput
- Ring buffer size (`lg_size = 16`) controls maximum queued batches
@@ -288,7 +307,7 @@ std::visit([&](auto&& entry) {
- Failed CommitEntries are passed through the pipeline with error information
- Downstream stages skip processing for error connections but forward them
- Error responses are sent when connection reaches release stage
- Connection ownership is always transferred to ensure cleanup
- Server-owned connections ensure proper cleanup and response handling
### Pipeline Integrity
@@ -310,7 +329,7 @@ std::visit([&](auto&& entry) {
- **Single-Pass Processing**: Each connection flows through all stages once
- **Streaming Design**: Stages process concurrently
- **Minimal Copying**: Connection ownership transfer, not data copying
- **Minimal Copying**: Request processing with server-owned connections
- **Direct Response**: Release stage triggers immediate response transmission
### Scalability Characteristics
@@ -328,7 +347,7 @@ private:
static constexpr int lg_size = 16; // Ring buffer size = 2^16 entries
// 4-stage pipeline configuration
StaticThreadPipeline<std::unique_ptr<Connection>,
StaticThreadPipeline<PipelineEntry,
WaitStrategy::WaitIfUpstreamIdle, 1, 1, 1, 1>
commitPipeline{lg_size};
```
@@ -344,8 +363,9 @@ private:
The pipeline processes different types of entries using a variant/union type system instead of `std::unique_ptr<Connection>`:
### Pipeline Entry Variants
- **CommitEntry**: Contains `std::unique_ptr<Connection>` with CommitRequest and connection state
- **StatusEntry**: Contains `std::unique_ptr<Connection>` with StatusRequest (transferred to status threadpool after sequence)
- **CommitEntry**: Contains connection reference/ID with CommitRequest and connection state
- **StatusEntry**: Contains connection reference/ID with StatusRequest (transferred to status threadpool after sequence)
- **ShutdownEntry**: Signals pipeline shutdown to all stages
- **Future types**: Pipeline design supports additional entry types
@@ -354,9 +374,9 @@ The pipeline processes different types of entries using a variant/union type sys
| Stage | CommitEntry | StatusEntry | ShutdownEntry | Serialization |
|-------|-------------|-------------|---------------|---------------|
| **Sequence** | Check banned list, assign version | Add to banned list, transfer to status threadpool | Return true (shutdown) | **Required** |
| **Resolve** | Check preconditions, update recent writes | N/A (transferred) | Return true (shutdown) | **Required** |
| **Persist** | Apply operations, update high water mark | N/A (transferred) | Return true (shutdown) | **Required** |
| **Release** | Return connection to HTTP threads | N/A (transferred) | Return true (shutdown) | Not required |
| **Resolve** | Check preconditions, update recent writes | N/A (empty husk) | Return true (shutdown) | **Required** |
| **Persist** | Apply operations, update high water mark | N/A (empty husk) | Return true (shutdown) | **Required** |
| **Release** | Return connection to HTTP threads | N/A (empty husk) | Return true (shutdown) | Not required (multiple threads) |
## API Endpoint Integration
@@ -367,6 +387,7 @@ The pipeline processes different types of entries using a variant/union type sys
#### Request Processing Flow
1. **HTTP I/O Thread Processing** (`src/http_handler.cpp:210-273`):
```cpp
void HttpHandler::handlePostCommit(Connection &conn, HttpConnectionState &state) {
// Parse and validate anything that doesn't need serialization:
@@ -379,47 +400,57 @@ The pipeline processes different types of entries using a variant/union type sys
}
```
2. **Pipeline Entry**: Successfully parsed connections enter pipeline as CommitEntry (containing the connection with CommitRequest)
1. **Pipeline Entry**: Successfully parsed connections enter pipeline as CommitEntry (containing the connection with CommitRequest)
1. **Pipeline Processing**:
3. **Pipeline Processing**:
- **Sequence**: Check banned list → assign version (or reject)
- **Resolve**: Check preconditions against in-memory recent writes → mark semi-committed (or failed with conflict details)
- **Persist**: Apply operations → mark committed, update high water mark
- **Release**: Return connection with response data
4. **Response Generation**: Based on pipeline results
1. **Response Generation**: Based on pipeline results
- **Success**: `{"status": "committed", "version": N, "leader_id": "...", "request_id": "..."}`
- **Failure**: `{"status": "not_committed", "conflicts": [...], "version": N, "leader_id": "..."}`
### `/v1/status` - Commit Status Lookup
**Pipeline Interaction**: StatusEntry through sequence stage, then transfer to status threadpool
**Pipeline Interaction**: StatusEntry through sequence stage only
#### Request Processing Flow
1. **HTTP I/O Thread Processing**:
```cpp
void HttpHandler::handleGetStatus(Connection &conn, const HttpConnectionState &state) {
// TODO: Extract request_id from URL and min_version from query params
// Current: Returns placeholder static response
// Extract request_id from URL and min_version from query params
// Create StatusEntry for pipeline processing
}
```
2. **Two-Phase Processing**:
- **Phase 1 - Sequence Stage**: StatusEntry enters pipeline to add request_id to banned list and get version upper bound
- **Phase 2 - Status Threadpool**: Connection transferred from sequence stage to dedicated status threadpool for actual status lookup logic
1. **Pipeline Processing**:
3. **Status Lookup Logic**: Performed in status threadpool - scan transaction log to determine actual commit status of the now-banned request_id
- **Sequence Stage**: StatusEntry adds request_id to banned list, establishes version scanning range, transfers connection to status threadpool
- **Subsequent Stages**: Empty StatusEntry husk flows through resolve/persist/release as no-op
1. **Status Lookup Logic**:
- Version range determined in sequence stage (min_version parameter to version upper bound)
- Actual S3 scanning performed by separate status threadpool outside the pipeline
- Return "committed" with version if found, "not_found" if not found in scanned range
### `/v1/subscribe` - Real-time Transaction Stream
**Pipeline Integration**: Consumes events from resolve and persist stages
#### Event Sources
- **Resolve Stage**: Semi-committed transactions (accepted preconditions) for low-latency streaming
- **Persist Stage**: Durability events when committed version high water mark advances
#### Current Implementation
```cpp
void HttpHandler::handleGetSubscribe(Connection &conn, const HttpConnectionState &state) {
// TODO: Parse query parameters (after, durable)
@@ -447,11 +478,12 @@ void HttpHandler::handleGetSubscribe(Connection &conn, const HttpConnectionState
The pipeline integrates with the HTTP handler at two points:
1. **Entry**: `on_batch_complete()` feeds connections into sequence stage
2. **Exit**: Release stage calls `Server::release_back_to_server()`
1. **Exit**: Release stage responds to clients with server-owned connections
### Persistence Layer Integration
The persist stage interfaces with:
- **S3 Backend**: Batch writes for durability (see `persistence.md`)
- **Subscriber System**: Real-time change stream notifications
- **Metrics System**: Transaction throughput and latency tracking
@@ -467,17 +499,17 @@ The persist stage interfaces with:
### Potential Enhancements
1. **Dynamic Thread Counts**: Make resolve and release thread counts configurable
2. **NUMA Optimization**: Pin pipeline threads to specific CPU cores
3. **Batch Size Tuning**: Dynamic batch size based on load
4. **Stage Bypassing**: Skip resolve stage for transactions without preconditions
5. **Persistence Batching**: Aggregate multiple commits into larger S3 writes
1. **NUMA Optimization**: Pin pipeline threads to specific CPU cores
1. **Batch Size Tuning**: Dynamic batch size based on load
1. **Stage Bypassing**: Skip resolve stage for transactions without preconditions
1. **Persistence Batching**: Aggregate multiple commits into larger S3 writes
### Monitoring and Observability
1. **Stage Metrics**: Throughput, latency, and queue depth per stage
2. **Error Tracking**: Error rates and types by stage
3. **Resource Utilization**: CPU and memory usage per pipeline thread
4. **Flow Control Events**: Backpressure and stall detection
1. **Error Tracking**: Error rates and types by stage
1. **Resource Utilization**: CPU and memory usage per pipeline thread
1. **Flow Control Events**: Backpressure and stall detection
## Implementation Status

View File

@@ -28,13 +28,15 @@ Controls server networking, threading, and request handling behavior.
### Commit Configuration (`[commit]`)
Controls behavior of the `/v1/commit` endpoint and request ID management.
Controls behavior of the `/v1/commit` endpoint, request ID management, and commit pipeline threading.
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `min_request_id_length` | integer | `20` | Minimum length required for client-provided `request_id` fields to ensure sufficient entropy for collision avoidance |
| `request_id_retention_hours` | integer | `24` | How long to retain request IDs in memory for `/v1/status` queries. Longer retention reduces the chance of `log_truncated` responses |
| `request_id_retention_versions` | integer | `100000000` | Minimum number of versions to retain request IDs for, regardless of time. Provides additional protection against `log_truncated` responses |
| `pipeline_wait_strategy` | string | `"WaitIfUpstreamIdle"` | Wait strategy for the commit pipeline. `"WaitIfStageEmpty"` = block when individual stages are empty (safe for shared CPUs), `"WaitIfUpstreamIdle"` = block only when all upstream stages are idle (requires dedicated cores, highest throughput), `"Never"` = never block, busy-wait continuously (requires dedicated cores, lowest latency) |
| `pipeline_release_threads` | integer | `1` | Number of threads in the release stage (final stage of commit pipeline). Higher values increase parallelism for connection release and response transmission |
### Subscription Configuration (`[subscription]`)
@@ -77,6 +79,8 @@ read_buffer_size = 32768 # 32KB
min_request_id_length = 32
request_id_retention_hours = 48
request_id_retention_versions = 50000
pipeline_wait_strategy = "WaitIfUpstreamIdle" # Options: "WaitIfStageEmpty", "WaitIfUpstreamIdle", "Never"
pipeline_release_threads = 4 # Default: 1, increase for higher throughput
[subscription]
max_buffer_size_bytes = 52428800 # 50MB
@@ -101,18 +105,27 @@ WeaselDB uses the `toml11` library for configuration parsing with robust error h
These configuration parameters directly affect server and API behavior:
**Server Performance:**
- **`io_threads`**: Controls parallelism for both accepting new connections and I/O processing. Should typically match CPU core count for optimal performance
- **`event_batch_size`**: Larger batches reduce syscall overhead but may increase latency under light load
- **`max_connections`**: Prevents resource exhaustion by limiting concurrent connections
**Request Handling:**
- **`max_request_size_bytes`**: Determines when `/v1/commit` returns `413 Content Too Large`
- **`min_request_id_length`**: Validates `request_id` fields in `/v1/commit` requests for sufficient entropy
**Request ID Management:**
- **`request_id_retention_*`**: Affects availability of data for `/v1/status` queries and likelihood of `log_truncated` responses
**Commit Pipeline Performance:**
- **`pipeline_wait_strategy`**: Controls CPU usage vs latency tradeoff in commit processing. `WaitIfStageEmpty` is safest for shared CPUs, `WaitIfUpstreamIdle` provides highest throughput with dedicated cores, `Never` provides lowest latency but uses 100% CPU
- **`pipeline_release_threads`**: Determines parallelism in the final stage of commit processing. More threads can improve throughput when processing many concurrent requests
**Subscription Streaming:**
- **`max_buffer_size_bytes`**: Controls when `/v1/subscribe` connections are terminated due to slow consumption
- **`keepalive_interval_seconds`**: Frequency of keepalive comments in `/v1/subscribe` streams
@@ -121,6 +134,7 @@ These configuration parameters directly affect server and API behavior:
The configuration system includes comprehensive validation with specific bounds checking:
### Server Configuration Limits
- **`port`**: Must be between 1 and 65535
- **`max_request_size_bytes`**: Must be > 0 and ≤ 100MB
- **`io_threads`**: Must be between 1 and 1000
@@ -128,26 +142,33 @@ The configuration system includes comprehensive validation with specific bounds
- **`max_connections`**: Must be between 0 and 100000 (0 = unlimited)
### Commit Configuration Limits
- **`min_request_id_length`**: Must be between 8 and 256 characters
- **`request_id_retention_hours`**: Must be between 1 and 8760 hours (1 year)
- **`request_id_retention_versions`**: Must be > 0
- **`pipeline_wait_strategy`**: Must be one of: `"WaitIfStageEmpty"`, `"WaitIfUpstreamIdle"`, or `"Never"`
- **`pipeline_release_threads`**: Must be between 1 and 64
### Subscription Configuration Limits
- **`max_buffer_size_bytes`**: Must be > 0 and ≤ 1GB
- **`keepalive_interval_seconds`**: Must be between 1 and 3600 seconds (1 hour)
### Cross-Validation
- Warns if `max_request_size_bytes` > `max_buffer_size_bytes` (potential buffering issues)
## Configuration Management
### Code Integration
- **Configuration Structure**: Defined in `src/config.hpp` with structured types
- **Parser Implementation**: Located in `src/config.cpp` using template-based parsing
- **Default Values**: Embedded as struct defaults for compile-time initialization
- **Runtime Usage**: Configuration passed to server components during initialization
### Development Guidelines
- **New Parameters**: Add to appropriate struct in `src/config.hpp`
- **Validation**: Include bounds checking in `ConfigParser::validate_config()`
- **Documentation**: Update this file when adding new configuration options

267
design.md
View File

@@ -3,15 +3,15 @@
## Table of Contents
1. [Project Overview](#project-overview)
2. [Quick Start](#quick-start)
3. [Architecture](#architecture)
4. [Development Guidelines](#development-guidelines)
5. [Common Patterns](#common-patterns)
6. [Reference](#reference)
1. [Quick Start](#quick-start)
1. [Architecture](#architecture)
1. [Development Guidelines](#development-guidelines)
1. [Common Patterns](#common-patterns)
1. [Reference](#reference)
**IMPORTANT:** Read [style.md](style.md) first - contains mandatory C++ coding standards, threading rules, and testing guidelines that must be followed for all code changes.
---
______________________________________________________________________
## Project Overview
@@ -22,11 +22,21 @@ WeaselDB is a high-performance write-side database component designed for system
- **Ultra-fast arena allocation** (~1ns vs ~20-270ns for malloc)
- **High-performance JSON parsing** with streaming support and SIMD optimization
- **Multi-threaded networking** using multiple epoll instances with unified I/O thread pool
- **Multi-stage commit pipeline** with serial processing for consistency and parallel I/O for performance
- **Non-blocking metrics system** with try-lock optimization preventing pipeline stalls
- **Configurable epoll instances** to eliminate kernel-level contention
- **Optimized memory management** with arena allocation and efficient copying
- **Factory pattern safety** ensuring correct object lifecycle management
---
### Design Philosophy
**"Two machines once you've mastered one"** - Optimize aggressively for single-machine performance before distributing. Most systems prematurely scale horizontally and never fully utilize their hardware. How are you supposed to horizontally scale strict serializability anyway?
**Boring formats, fast implementations** - Use standard data formats (JSON, HTTP, base64) with heavily optimized parsing. Universal compatibility without sacrificing performance.
**Read/write separation** - Fan out reads from the single write stream (persist stage to many subscribers), with true horizontal scaling via S3 for historical data. Keep writes simple and fast.
______________________________________________________________________
## Quick Start
@@ -43,39 +53,49 @@ ninja
### Testing & Development
**Run all tests:**
```bash
ninja test # or ctest
```
**Individual targets:**
- `./test_arena` - Arena allocator unit tests
- `./test_commit_request` - JSON parsing and validation tests
- `./test_http_handler` - HTTP protocol handling tests
- `./test_metric` - Metrics system tests
- `./test_api_url_parser` - API URL parsing tests
- `./test_reference` - Reference counting system tests
- `./test_server_connection_return` - Connection lifecycle tests
**Benchmarking:**
- `./bench_arena` - Memory allocation performance
- `./bench_commit_request` - JSON parsing performance
- `./bench_parser_comparison` - Compare vs nlohmann::json and RapidJSON
- `./bench_metric` - Metrics system performance
- `./bench_thread_pipeline` - Lock-free pipeline performance
- `./bench_cpu_work` - CPU work benchmarking utility
- `./bench_format_comparison` - String formatting performance
- `./bench_metric` - Metrics system performance
- `./bench_parser_comparison` - Compare vs nlohmann::json and RapidJSON
- `./bench_reference` - Reference counting performance
- `./bench_thread_pipeline` - Lock-free pipeline performance
**Debug tools:**
- `./debug_arena` - Analyze arena allocator behavior
**Load Testing:**
- `./load_tester` - A tool to generate load against the server for performance and stability analysis.
### Dependencies
**System requirements:**
- **weaseljson** - Must be installed system-wide (high-performance JSON parser)
- **gperf** - System requirement for perfect hash generation
**Auto-fetched:**
- **simdutf** - SIMD base64 encoding/decoding
- **toml11** - TOML configuration parsing
- **doctest** - Testing framework
@@ -84,7 +104,7 @@ ninja test # or ctest
- **RapidJSON** - High-performance JSON library (used in benchmarks)
- **llhttp** - Fast HTTP parser
---
______________________________________________________________________
## Architecture
@@ -106,17 +126,19 @@ Ultra-fast memory allocator optimized for request/response patterns:
#### **Networking Layer**
**Server** (`src/server.{hpp,cpp}`):
- **High-performance multi-threaded networking** using multiple epoll instances with unified I/O thread pool
- **Configurable epoll instances** to eliminate kernel-level epoll_ctl contention (default: 2, max: io_threads)
- **Round-robin thread-to-epoll assignment** distributes I/O threads across epoll instances
- **Connection distribution** keeps accepted connections on same epoll, returns via round-robin
- **Factory pattern construction** via `Server::create()` ensures proper shared_ptr semantics
- **Factory pattern construction** via `Server::create()` ensures you can only get a `Ref<Server>`
- **Safe shutdown mechanism** with async-signal-safe shutdown() method
- **Connection ownership management** with automatic cleanup on server destruction
- **Pluggable protocol handlers** via ConnectionHandler interface
- **EPOLL_EXCLUSIVE** on listen socket across all epoll instances prevents thundering herd
**Connection** (`src/connection.{hpp,cpp}`):
- **Efficient per-connection state management** with arena-based memory allocation
- **Safe ownership transfer** between server threads and protocol handlers
- **Automatic cleanup** on connection closure or server shutdown
@@ -124,6 +146,7 @@ Ultra-fast memory allocator optimized for request/response patterns:
- **Protocol-specific data:** `user_data` `void*` for custom handler data
**ConnectionHandler Interface** (`src/connection_handler.hpp`):
- **Abstract protocol interface** decoupling networking from application logic
- **Ownership transfer support** allowing handlers to take connections for async processing
- **Streaming data processing** with partial message handling
@@ -141,6 +164,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
#### **Parsing Layer**
**JSON Commit Request Parser** (`src/json_commit_request_parser.{hpp,cpp}`):
- **High-performance JSON parser** using `weaseljson` library
- **Streaming parser support** for incremental parsing of network data
- **gperf-optimized token recognition** for fast JSON key parsing
@@ -150,6 +174,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
- **Zero hash collisions** for known JSON tokens eliminates branching
**Parser Interface** (`src/commit_request_parser.hpp`):
- **Abstract base class** for commit request parsers
- **Format-agnostic parsing interface** supporting multiple serialization formats
- **Streaming and one-shot parsing modes**
@@ -158,6 +183,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
#### **Data Model**
**Commit Request Data Model** (`src/commit_request.hpp`):
- **Format-agnostic data structure** for representing transactional commits
- **Arena-backed string storage** with efficient memory management
- **Move-only semantics** for optimal performance
@@ -167,18 +193,21 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
#### **Metrics System** (`src/metric.{hpp,cpp}`)
**High-Performance Metrics Implementation:**
- **Thread-local counters/histograms** with single writer for performance
- **Global gauges** with lock-free atomic CAS operations for multi-writer scenarios
- **SIMD-optimized histogram bucket updates** using AVX instructions for high throughput
- **Arena allocator integration** for efficient memory management during rendering
**Threading Model:**
- **Counters**: Per-thread storage, single writer, atomic write in `Counter::inc()`, atomic read in render thread
- **Histograms**: Per-thread storage, single writer, per-histogram mutex serializes all access (observe and render)
- **Gauges**: Lock-free atomic operations using `std::bit_cast` for double precision
- **Thread cleanup**: Automatic accumulation of thread-local state into global state on destruction
**Prometheus Compatibility:**
- **Standard metric types** with proper label handling and validation
- **Bucket generation helpers** for linear/exponential histogram distributions
- **Callback-based metrics** for dynamic values
@@ -187,6 +216,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
#### **Configuration & Optimization**
**Configuration System** (`src/config.{hpp,cpp}`):
- **TOML-based configuration** using `toml11` library
- **Structured configuration** with server, commit, and subscription sections
- **Default fallback values** for all configuration options
@@ -194,6 +224,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
- See `config.md` for complete configuration documentation
**JSON Token Optimization** (`src/json_tokens.gperf`, `src/json_token_enum.hpp`):
- **Perfect hash table** generated by gperf for O(1) JSON key lookup
- **Compile-time token enumeration** for type-safe key identification
- **Minimal perfect hash** reduces memory overhead and improves cache locality
@@ -202,6 +233,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
### Transaction Data Model
#### CommitRequest Structure
```
CommitRequest {
- request_id: Optional unique identifier
@@ -220,48 +252,54 @@ CommitRequest {
### Memory Management Model
#### Connection Ownership Lifecycle
1. **Creation**: Accept threads create connections, transfer to epoll as raw pointers
2. **Processing**: Network threads claim ownership by wrapping in unique_ptr
3. **Handler Transfer**: Handlers can take ownership for async processing via unique_ptr.release()
4. **Return Path**: Handlers use Server::release_back_to_server() to return connections
5. **Safety**: All transfers use weak_ptr to server for safe cleanup
6. **Cleanup**: RAII ensures proper resource cleanup in all scenarios
1. **Creation**: Server creates connections and stores them in registry
1. **Processing**: I/O threads access connections via registry lookup
1. **Handler Access**: Handlers receive Connection& references, server retains ownership
1. **Async Processing**: Handlers use WeakRef<Connection> for safe async access
1. **Safety**: Connection mutex synchronizes concurrent access between threads
1. **Cleanup**: RAII ensures proper resource cleanup when connections are destroyed
#### Arena Memory Lifecycle
1. **Request Processing**: Handler uses `conn->get_arena()` to allocate memory for parsing request data
2. **Response Generation**: Handler uses arena for temporary response construction (headers, JSON, etc.)
3. **Response Queuing**: Handler calls `conn->append_message()` which copies data to arena-backed message queue
4. **Response Writing**: Server writes all queued messages to socket via `writeBytes()`
1. **Request Processing**: Handler creates request-scoped arena for parsing request data
1. **Response Generation**: Handler uses same arena for response construction (headers, JSON, etc.)
1. **Response Queuing**: Handler calls `conn->append_message()` passing span + arena ownership
1. **Response Writing**: I/O thread writes messages to socket, arena freed after completion
> **Note**: Call `conn->reset()` periodically to reclaim arena memory. Best practice is after all outgoing bytes have been written.
#### Threading Model and EPOLLONESHOT
#### Threading Model and Server-Owned Connections
**EPOLLONESHOT Design Rationale:**
WeaselDB uses `EPOLLONESHOT` for all connection file descriptors to enable safe multi-threaded ownership transfer without complex synchronization:
**Server-Owned Connection Design:**
WeaselDB uses a server-owned connection model where the server retains ownership of all connections while providing safe concurrent access to handlers:
**Key Benefits:**
1. **Automatic fd disarming** - When epoll triggers an event, the fd is automatically removed from epoll monitoring
2. **Race-free ownership transfer** - Handlers can safely take connection ownership and move to other threads
3. **Zero-coordination async processing** - No manual synchronization needed between network threads and handler threads
1. **Simplified ownership** - Server always owns connections, eliminating complex ownership transfers
1. **Safe concurrent access** - Connection mutexes synchronize access between I/O threads and handlers
1. **WeakRef pattern** - Handlers use WeakRef<Connection> for safe async processing without ownership
**Threading Flow:**
1. **Event Trigger**: Network thread gets epoll event → connection auto-disarmed via ONESHOT
2. **Safe Transfer**: Handler can take ownership (`std::move(conn_ptr)`) with no epoll interference
3. **Async Processing**: Connection processed on handler thread while epoll cannot trigger spurious events
4. **Return & Re-arm**: `Server::receiveConnectionBack()` re-arms fd with `epoll_ctl(EPOLL_CTL_MOD)`
**Performance Trade-off:**
- **Cost**: One `epoll_ctl(MOD)` syscall per connection return (~100-200ns)
- **Benefit**: Eliminates complex thread synchronization and prevents race conditions
- **Alternative cost**: Manual `EPOLL_CTL_DEL`/`ADD` + locking would be significantly higher
1. **Event Trigger**: Network thread gets epoll event and processes data
1. **Handler Invocation**: Handler receives Connection& reference - server retains ownership
1. **Async Processing**: Handler obtains WeakRef<Connection> for safe background processing
1. **Connection Cleanup**: Server manages connection lifecycle including file descriptor operations
**Without EPOLLONESHOT risks:**
- Multiple threads processing same fd simultaneously
- Use-after-move when network thread accesses transferred connection
- Complex synchronization between epoll events and ownership transfers
**Performance Benefits:**
This design enables the async handler pattern where connections can be safely moved between threads for background processing while maintaining high performance and thread safety.
- **Reduced syscalls**: Eliminates epoll_ctl(MOD) calls needed for ownership transfer
- **Simplified synchronization**: Connection mutexes provide clear concurrent access patterns
- **Memory efficiency**: No unique_ptr overhead for ownership management
**Safe Async Processing:**
- WeakRef<Connection> prevents use-after-free in background threads
- Connection mutex ensures thread-safe access to connection state
- Server handles all file descriptor management automatically
This design provides high performance concurrent processing while maintaining thread safety through clear ownership boundaries and synchronization primitives.
### API Endpoints
@@ -270,14 +308,14 @@ The system implements a RESTful API. See [api.md](api.md) for comprehensive API
### Design Principles
1. **Performance-first** - Every component optimized for high throughput
2. **Scalable concurrency** - Multiple epoll instances eliminate kernel contention
3. **Memory efficiency** - Arena allocation eliminates fragmentation
4. **Efficient copying** - Minimize unnecessary copies while accepting required ones
5. **Streaming-ready** - Support incremental processing
6. **Type safety** - Compile-time validation where possible
7. **Resource management** - RAII and move semantics throughout
1. **Scalable concurrency** - Multiple epoll instances eliminate kernel contention
1. **Memory efficiency** - Arena allocation eliminates fragmentation
1. **Efficient copying** - Minimize unnecessary copies while accepting required ones
1. **Streaming-ready** - Support incremental processing
1. **Type safety** - Compile-time validation where possible
1. **Resource management** - RAII and move semantics throughout
---
______________________________________________________________________
## Development Guidelines
@@ -289,13 +327,13 @@ See [style.md](style.md) for comprehensive C++ coding standards and conventions.
- **Server Creation**: Always use `Server::create()` factory method - direct construction is impossible
- **Connection Creation**: Only the Server can create connections - no public constructor or factory method
- **Connection Ownership**: Use unique_ptr semantics for safe ownership transfer between components
- **Connection Ownership**: Server retains ownership, handlers use Connection& references
- **Arena Allocator Pattern**: Always use `Arena` for temporary allocations within request processing
- **String View Usage**: Prefer `std::string_view` over `std::string` when pointing to arena-allocated memory
- **Ownership Transfer**: Use `Server::release_back_to_server()` for returning connections to server from handlers
- **Async Processing**: Use `conn.get_weak_ref()` for safe background processing without ownership
- **JSON Token Lookup**: Use the gperf-generated perfect hash table in `json_tokens.hpp` for O(1) key recognition
- **Base64 Handling**: Always use simdutf for base64 encoding/decoding for performance
- **Thread Safety**: Connection ownership transfers are designed to be thread-safe with proper RAII cleanup
- **Thread Safety**: Connection mutexes provide safe concurrent access between threads
### Project Structure
@@ -308,20 +346,22 @@ See [style.md](style.md) for comprehensive C++ coding standards and conventions.
### Extension Points
#### Adding New Protocol Handlers
1. Inherit from `ConnectionHandler` in `src/connection_handler.hpp`
2. Implement `on_data_arrived()` with proper ownership semantics
3. Use connection's arena allocator for temporary allocations: `conn->get_arena()`
4. Handle partial messages and streaming protocols appropriately
5. Use `Server::release_back_to_server()` if taking ownership for async processing
6. Add corresponding test cases and integration tests
7. Consider performance implications of ownership transfers
1. Implement `on_data_arrived()` using Connection& reference parameter
1. Use connection's arena allocator for temporary allocations: `conn.get_arena()`
1. Handle partial messages and streaming protocols appropriately
1. Use `conn.get_weak_ref()` for safe async processing without ownership transfer
1. Add corresponding test cases and integration tests
1. Consider performance implications of concurrent access patterns
#### Adding New Parsers
1. Inherit from `CommitRequestParser` in `src/commit_request_parser.hpp`
2. Implement both streaming and one-shot parsing modes
3. Use arena allocation for all temporary string storage
4. Add corresponding test cases in `tests/`
5. Add benchmark comparisons in `benchmarks/`
1. Implement both streaming and one-shot parsing modes
1. Use arena allocation for all temporary string storage
1. Add corresponding test cases in `tests/`
1. Add benchmark comparisons in `benchmarks/`
### Performance Guidelines
@@ -329,6 +369,7 @@ See [style.md](style.md) for comprehensive C++ coding standards and conventions.
- **CPU**: Perfect hashing and SIMD operations are critical paths - avoid alternatives
- **I/O**: Streaming parser design supports incremental network data processing
- **Cache**: String views avoid copying, keeping data cache-friendly
- **Pipeline**: Serial stages must never block - only parallel release stage can take locks
### Configuration & Testing
@@ -337,13 +378,14 @@ See [style.md](style.md) for comprehensive C++ coding standards and conventions.
- **Build System**: CMake generates gperf hash tables at build time
- **Testing Guidelines**: See [style.md](style.md) for comprehensive testing standards including synchronization rules
---
______________________________________________________________________
## Common Patterns
### Factory Method Patterns
#### Server Creation
```cpp
// Server must be created via factory method
auto server = Server::create(config, handler);
@@ -354,59 +396,51 @@ auto server = Server::create(config, handler);
```
#### Connection Creation (Server-Only)
```cpp
// Only Server can create connections (using private friend method)
class Server {
private:
auto conn = Connection::createForServer(addr, fd, id, handler, weak_from_this());
};
// No public way to create connections - all these fail:
// auto conn = Connection::create(...); // ERROR: no such method
// Connection conn(addr, fd, id, handler, server); // ERROR: private constructor
// auto conn = std::make_unique<Connection>(...); // ERROR: private constructor
```
Only Server can create connections (using private constructor via friend access)
### ConnectionHandler Implementation Patterns
#### Simple Synchronous Handler
```cpp
class HttpHandler : public ConnectionHandler {
class HttpHandler : public ConnectionHandler {
public:
void on_data_arrived(std::string_view data, std::unique_ptr<Connection>& conn_ptr) override {
void on_data_arrived(std::string_view data, Connection& conn) override {
// Parse HTTP request using connection's arena
Arena& arena = conn_ptr->get_arena();
Arena& arena = conn.get_arena();
// Generate response
conn_ptr->append_message("HTTP/1.1 200 OK\r\n\r\nHello World");
conn.append_message("HTTP/1.1 200 OK\r\n\r\nHello World");
// Server retains ownership
}
};
```
#### Async Handler with Ownership Transfer
#### Async Handler with WeakRef
```cpp
class AsyncHandler : public ConnectionHandler {
class AsyncHandler : public ConnectionHandler {
public:
void on_data_arrived(std::string_view data, std::unique_ptr<Connection>& conn_ptr) override {
// Take ownership for async processing
auto connection = std::move(conn_ptr); // conn_ptr is now null
void on_data_arrived(std::string_view data, Connection& conn) override {
// Get weak reference for async processing
auto weak_conn = conn.get_weak_ref();
work_queue.push([connection = std::move(connection)](std::string_view data) mutable {
// Process asynchronously
connection->append_message("Async response");
// Return ownership to server when done
Server::release_back_to_server(std::move(connection));
work_queue.push([weak_conn, data = std::string(data)]() {
// Process asynchronously - connection may be closed
if (auto conn_ref = weak_conn.lock()) {
conn_ref->append_message("Async response");
}
});
}
};
```
#### Batching Handler with User Data
```cpp
class BatchingHandler : public ConnectionHandler {
class BatchingHandler : public ConnectionHandler {
public:
void on_connection_established(Connection &conn) override {
// Allocate some protocol-specific data and attach it to the connection
@@ -418,21 +452,20 @@ public:
delete static_cast<MyProtocolData*>(conn.user_data);
}
void on_data_arrived(std::string_view data,
std::unique_ptr<Connection> &conn_ptr) override {
void on_data_arrived(std::string_view data, Connection& conn) override {
// Process data and maybe store some results in the user_data
auto* proto_data = static_cast<MyProtocolData*>(conn_ptr->user_data);
auto* proto_data = static_cast<MyProtocolData*>(conn.user_data);
proto_data->process(data);
}
void on_batch_complete(std::span<std::unique_ptr<Connection>> batch) override {
void on_batch_complete(std::span<Connection *const> batch) override {
// Process a batch of connections
for (auto& conn_ptr : batch) {
if (conn_ptr) {
auto* proto_data = static_cast<MyProtocolData*>(conn_ptr->user_data);
for (auto* conn : batch) {
if (conn) {
auto* proto_data = static_cast<MyProtocolData*>(conn->user_data);
if (proto_data->is_ready()) {
// This connection is ready for the next stage, move it to the pipeline
pipeline_.push(std::move(conn_ptr));
// This connection is ready for the next stage, get weak ref for pipeline
pipeline_.push(conn->get_weak_ref());
}
}
}
@@ -444,20 +477,21 @@ private:
```
#### Streaming "yes" Handler
```cpp
class YesHandler : public ConnectionHandler {
class YesHandler : public ConnectionHandler {
public:
void on_connection_established(Connection &conn) override {
// Write an initial "y\n"
conn.append_message("y\n");
}
void on_write_progress(std::unique_ptr<Connection> &conn) override {
if (conn->outgoingBytesQueued() == 0) {
void on_write_progress(Connection &conn) override {
if (conn.outgoing_bytes_queued() == 0) {
// Don't use an unbounded amount of memory
conn->reset();
conn.reset();
// Write "y\n" repeatedly
conn->append_message("y\n");
conn.append_message("y\n");
}
}
};
@@ -466,6 +500,7 @@ public:
### Memory Management Patterns
#### Arena-Based String Handling
```cpp
// Preferred: String view with arena allocation to minimize copying
std::string_view process_json_key(const char* data, Arena& arena);
@@ -474,24 +509,26 @@ std::string_view process_json_key(const char* data, Arena& arena);
std::string process_json_key(const char* data);
```
#### Safe Connection Ownership Transfer
#### Safe Async Connection Processing
```cpp
// In handler - take ownership for background processing
Connection* raw_conn = conn_ptr.release();
// In handler - get weak reference for background processing
auto weak_conn = conn.get_weak_ref();
// Process on worker thread
background_processor.submit([raw_conn]() {
background_processor.submit([weak_conn]() {
// Do work...
raw_conn->append_message("Background result");
// Return to server safely (handles server destruction)
Server::release_back_to_server(std::unique_ptr<Connection>(raw_conn));
if (auto conn_ref = weak_conn.lock()) {
conn_ref->append_message("Background result");
}
// Connection automatically cleaned up by server
});
```
### Data Construction Patterns
#### Builder Pattern Usage
```cpp
CommitRequest request = CommitRequestBuilder(arena)
.request_id("example-id")
@@ -501,41 +538,47 @@ CommitRequest request = CommitRequestBuilder(arena)
```
#### Error Handling Pattern
```cpp
enum class ParseResult { Success, InvalidJson, MissingField };
ParseResult parse_commit_request(const char* json, CommitRequest& out);
```
---
______________________________________________________________________
## Reference
### Build Targets
**Test Executables:**
- `test_arena` - Arena allocator functionality tests
- `test_commit_request` - JSON parsing and validation tests
- `test_metric` - Metrics system functionality tests
- Main server executable (compiled from `src/main.cpp`)
**Benchmark Executables:**
- `bench_arena` - Arena allocator performance benchmarks
- `bench_commit_request` - JSON parsing performance benchmarks
- `bench_parser_comparison` - Comparison benchmarks vs nlohmann::json and RapidJSON
- `bench_metric` - Metrics system performance benchmarks
**Debug Tools:**
- `debug_arena` - Debug tool for arena allocator analysis
### Performance Characteristics
**Memory Allocation:**
- **~1ns allocation time** vs standard allocators
- **Bulk deallocation** eliminates individual free() calls
- **Optimized geometric growth** uses current block size for doubling strategy
- **Alignment-aware** allocation prevents performance penalties
**JSON Parsing:**
- **Streaming parser** handles large payloads efficiently
- **Incremental processing** suitable for network protocols
- **Arena storage** eliminates string allocation overhead

View File

@@ -16,7 +16,7 @@ The persistence thread receives commit batches from the main processing pipeline
The persistence thread collects commits into batches using two trigger conditions:
1. **Time Trigger**: `batch_timeout_ms` elapsed since batch collection started
2. **Size Trigger**: `batch_size_threshold` commits collected (can be exceeded by final commit)
1. **Size Trigger**: `batch_size_threshold` commits collected (can be exceeded by final commit)
**Flow Control**: When `max_in_flight_requests` reached, block until responses received. Batches in retry backoff count toward the in-flight limit, creating natural backpressure during failures.
@@ -25,10 +25,12 @@ The persistence thread collects commits into batches using two trigger condition
### 1. Batch Collection
**No In-Flight Requests** (no I/O to pump):
- Use blocking acquire to get first commit batch (can afford to wait)
- Process immediately (no batching delay)
**With In-Flight Requests** (I/O to pump in event loop):
- Check flow control: if at `max_in_flight_requests`, block for responses
- Collect commits using non-blocking acquire until trigger condition:
- Check for available commits (non-blocking)
@@ -97,9 +99,10 @@ The persistence thread collects commits into batches using two trigger condition
## Configuration Validation
**Required Constraints**:
- `batch_size_threshold` > 0 (must process at least one commit per batch)
- `max_in_flight_requests` > 0 (must allow at least one concurrent request)
- `max_in_flight_requests` <= 1000 (required for single-call recovery guarantee)
- `max_in_flight_requests` \<= 1000 (required for single-call recovery guarantee)
- `batch_timeout_ms` > 0 (timeout must be positive)
- `max_retry_attempts` >= 0 (zero disables retries)
- `retry_base_delay_ms` > 0 (delay must be positive if retries enabled)
@@ -123,16 +126,19 @@ WeaselDB's batched persistence design enables efficient recovery while maintaini
WeaselDB uses a **sequential batch numbering** scheme with **S3 atomic operations** to provide efficient crash recovery and split-brain prevention without external coordination services.
**Batch Numbering Scheme**:
- Batch numbers start at `2^64 - 1` and count downward: `18446744073709551615, 18446744073709551614, 18446744073709551613, ...`
- Each batch is stored as S3 object `batches/{batch_number:020d}` with zero-padding
- S3 lexicographic ordering on zero-padded numbers returns batches in ascending numerical order (latest batches first)
**Terminology**: Since batch numbers decrease over time, we use numerical ordering:
- "Older" batches = higher numbers (written first in time)
- "Newer" batches = lower numbers (written more recently)
- "Most recent" batches = lowest numbers (most recently written)
**Example**: If batches 100, 99, 98, 97 are written, S3 LIST returns them as:
```
batches/00000000000000000097 (newest, lowest batch number)
batches/00000000000000000098
@@ -142,6 +148,7 @@ batches/00000000000000000100 (oldest, highest batch number)
```
**Leadership and Split-Brain Prevention**:
- New persistence thread instances scan S3 to find the highest (oldest) available batch number
- Each batch write uses `If-None-Match="*"` to atomically claim the sequential batch number
- Only one instance can successfully claim each batch number, preventing split-brain scenarios
@@ -150,28 +157,32 @@ batches/00000000000000000100 (oldest, highest batch number)
**Recovery Scenarios**:
**Clean Shutdown**:
- All in-flight batches are drained to completion before termination
- Durability watermark accurately reflects all durable state
- No recovery required on restart
**Crash Recovery**:
1. **S3 Scan with Bounded Cost**: List S3 objects with prefix `batches/` and limit of 1000 objects
2. **Gap Detection**: Check for missing sequential batch numbers. WeaselDB never puts more than 1000 batches in flight concurrently, so a limit of 1000 is sufficient.
3. **Watermark Reconstruction**: Set durability watermark to the latest consecutive batch (scanning from highest numbers downward, until a gap)
4. **Leadership Transition**: Begin writing batches starting from next available batch number. Skip past any batch numbers already claimed in the durability watermark scan.
1. **Gap Detection**: Check for missing sequential batch numbers. WeaselDB never puts more than 1000 batches in flight concurrently, so a limit of 1000 is sufficient.
1. **Watermark Reconstruction**: Set durability watermark to the latest consecutive batch (scanning from highest numbers downward, until a gap)
1. **Leadership Transition**: Begin writing batches starting from next available batch number. Skip past any batch numbers already claimed in the durability watermark scan.
**Bounded Recovery Guarantee**: Since at most 1000 batches can be in-flight during a crash, any gap in the sequential numbering (indicating the durability watermark) must appear within the first 1000 S3 objects. This is because:
1. At most 1000 batches can be incomplete when crash occurs
2. S3 LIST returns objects in ascending numerical order (most recent batches first due to countdown numbering)
3. The first gap found represents the boundary between durable and potentially incomplete batches
4. S3 LIST operations have a maximum limit of 1000 objects per request
5. Therefore, scanning 1000 objects (the maximum S3 allows in one request) is sufficient to find this boundary
1. S3 LIST returns objects in ascending numerical order (most recent batches first due to countdown numbering)
1. The first gap found represents the boundary between durable and potentially incomplete batches
1. S3 LIST operations have a maximum limit of 1000 objects per request
1. Therefore, scanning 1000 objects (the maximum S3 allows in one request) is sufficient to find this boundary
This ensures **O(1) recovery time** regardless of database size, with at most **one S3 LIST operation** required.
**Recovery Protocol Detail**: Even with exactly 1000 batches in-flight, recovery works correctly:
**Example Scenario**: Batches 2000 down to 1001 (1000 batches) are in-flight when crash occurs
- Previous successful run had written through batch 2001
- Worst case: batch 2000 (oldest in-flight) fails, batches 1999 down to 1001 (newer) all succeed
- S3 LIST(limit=1000) returns: 1001, 1002, ..., 1998, 1999, 2001 (ascending numerical order)

View File

@@ -1,39 +1,10 @@
#include "arena.hpp"
#include <cassert>
#include <iomanip>
#include <limits>
#include <vector>
// Destructor: walk the intrusive block list from newest to oldest,
// freeing every block allocated by this arena.
Arena::~Arena() {
  for (Block *b = current_block_; b != nullptr;) {
    Block *older = b->prev;
    std::free(b);
    b = older;
  }
  current_block_ = nullptr;
}
// Move constructor: adopt the other arena's block list and leave the
// source in a valid empty state (equivalent to a freshly built Arena).
Arena::Arena(Arena &&other) noexcept
    : initial_block_size_(other.initial_block_size_) {
  current_block_ = other.current_block_;
  other.current_block_ = nullptr;
}
// Move assignment: release everything this arena owns, then steal the
// other arena's blocks. Self-assignment is a no-op.
Arena &Arena::operator=(Arena &&other) noexcept {
  if (this == &other) {
    return *this;
  }
  // Free our own blocks before adopting the other list (no leak).
  for (Block *b = current_block_; b != nullptr;) {
    Block *older = b->prev;
    std::free(b);
    b = older;
  }
  initial_block_size_ = other.initial_block_size_;
  current_block_ = other.current_block_;
  other.current_block_ = nullptr;
  return *this;
}
void Arena::reset() {
if (!current_block_) {
return;

View File

@@ -59,27 +59,24 @@
*
* ### Safe Usage Patterns in WeaselDB:
* - **Per-Connection Instances**: Each Connection owns its own Arena
* instance, accessed only by the thread that currently owns the connection
* - **Single Owner Principle**: Connection ownership transfers atomically
* between threads using unique_ptr, ensuring only one thread accesses the arena
* at a time
* instance, accessed by its I/O thread
* - **Server Ownership**: Server retains connection ownership, handlers access
* arenas through Connection& references with proper mutex protection
*
* ### Thread Ownership Model:
* 1. **Network Thread**: Claims connection ownership, accesses arena for I/O
* buffers
* 2. **Handler Thread**: Can take ownership via unique_ptr.release(), uses
* arena for request parsing and response generation
* 3. **Background Thread**: Can receive ownership for async processing, uses
* arena for temporary data structures
* 4. **Return Path**: Connection (and its arena) safely returned via
* Server::release_back_to_server()
* 1. **I/O Thread**: Server owns connections, processes socket I/O events
* 2. **Handler Thread**: Receives Connection& reference, creates request-scoped
* arenas for parsing and response generation
* 3. **Pipeline Thread**: Can use WeakRef<Connection> for async processing,
* creates own arenas for temporary data structures
* 4. **Arena Lifecycle**: Request-scoped arenas moved to message queue, freed
* after I/O completion without holding connection mutex
*
* ### Why This Design is Thread-Safe:
* - **Exclusive Access**: Only the current owner thread should access the arena
* - **Transfer Points**: Ownership transfers happen at well-defined
* synchronization points with proper memory barriers.
* - **No Shared State**: Each arena is completely isolated - no shared data
* between different arena instances
* - **Request-Scoped**: Each request gets its own Arena instance for isolation
* - **Move Semantics**: Arenas transferred via move, avoiding shared access
* - **Deferred Cleanup**: Arena destruction deferred to avoid malloc contention
* while holding connection mutex
*
* @warning Do not share Arena instances between threads. Use separate
* instances per thread or per logical unit of work.
@@ -157,7 +154,13 @@ public:
* Traverses the intrusive linked list backwards from current_block_,
* freeing each block. This ensures no memory leaks.
*/
~Arena();
~Arena() {
  // Free every block, newest-first, following the intrusive prev links.
  for (Block *b = current_block_; b != nullptr;) {
    Block *older = b->prev;
    std::free(b);
    b = older;
  }
}
/// Copy construction is not allowed (would be expensive and error-prone)
Arena(const Arena &) = delete;
@@ -166,9 +169,21 @@ public:
/**
* @brief Move constructor - transfers ownership of all blocks.
* @param other The Arena to move from (will be left empty)
*
* @param other The Arena to move from (will be left in a valid, empty state)
*
* @note Post-move state: The moved-from Arena is left in a valid state
* equivalent to a newly constructed Arena. All operations remain safe:
* - allocate_raw(), allocate(), construct() work normally
* - reset() is safe and well-defined (no-op on empty arena)
* - used_bytes(), total_bytes() return 0
* - Destructor is safe to call
*/
Arena(Arena &&other) noexcept;
Arena(Arena &&other) noexcept
    : initial_block_size_(other.initial_block_size_) {
  // Steal the block list; the moved-from arena is left empty, as if
  // newly constructed, so all of its operations remain safe.
  current_block_ = other.current_block_;
  other.current_block_ = nullptr;
}
/**
* @brief Move assignment operator - transfers ownership of all blocks.
@@ -176,10 +191,31 @@ public:
* Frees any existing blocks in this allocator before taking ownership
* of blocks from the other allocator.
*
* @param other The Arena to move from (will be left empty)
* @param other The Arena to move from (will be left in a valid, empty state)
* @return Reference to this allocator
*
* @note Post-move state: The moved-from Arena is left in a valid state
* equivalent to a newly constructed Arena. All operations remain safe:
* - allocate_raw(), allocate(), construct() work normally
* - reset() is safe and well-defined (no-op on empty arena)
* - used_bytes(), total_bytes() return 0
* - Destructor is safe to call
*/
Arena &operator=(Arena &&other) noexcept;
Arena &operator=(Arena &&other) noexcept {
  // Self-assignment guard: nothing to do.
  if (this == &other) {
    return *this;
  }
  // Release our own blocks before taking ownership of the other list.
  for (Block *b = current_block_; b != nullptr;) {
    Block *older = b->prev;
    std::free(b);
    b = older;
  }
  initial_block_size_ = other.initial_block_size_;
  current_block_ = other.current_block_;
  other.current_block_ = nullptr;
  return *this;
}
/**
* @brief Allocate raw memory with the specified size and alignment.
@@ -429,6 +465,72 @@ public:
return static_cast<T *>(ptr);
}
/**
* @brief Allocate an array of type T and return it as a std::span<T>.
*
* This method provides bounds-safe allocation by returning a std::span
* that knows its size, improving safety over raw pointer allocation.
*
* @tparam T The type to allocate (must be trivially destructible)
* @param count The number of elements to allocate
* @return std::span<T> A span covering the allocated array
*
* ## Safety:
* The returned span is valid for the lifetime of the arena and until
* the next reset() call. The span provides bounds checking in debug
* builds and clear size information.
*
* ## Usage:
* ```cpp
* auto buffer = arena.allocate_span<char>(1024);
* auto strings = arena.allocate_span<std::string_view>(10);
* ```
*
* ## Note:
* Returns an empty span (nullptr, 0) if count is 0.
* This method only allocates memory - it does not construct objects.
*/
template <typename T> std::span<T> allocate_span(uint32_t count) {
  // Zero-count requests yield an empty (nullptr, 0) span without allocating.
  if (count == 0) {
    return {};
  }
  T *first = allocate<T>(count);
  return std::span<T>{first, count};
}
/**
* @brief Copy a string into arena memory and return a string_view.
*
* This method provides a safe way to copy string data into arena-allocated
* memory, ensuring the data remains valid for the arena's lifetime.
*
* @param str The string to copy into arena memory
* @return std::string_view pointing to the arena-allocated copy
*
* ## Safety:
* The returned string_view is valid for the lifetime of the arena and until
* the next reset() call. The string data is guaranteed to be null-terminated
* only if the input string was null-terminated.
*
* ## Usage:
* ```cpp
* Arena arena;
* std::string_view copy = arena.copy_string("Hello World");
* std::string_view copy2 = arena.copy_string(some_string_view);
* ```
*
* ## Note:
* Returns an empty string_view if the input string is empty.
* This method allocates exactly str.size() bytes (no null terminator added).
*/
std::string_view copy_string(std::string_view str) {
  // Empty input: return an empty view without touching the arena.
  if (str.empty()) {
    return {};
  }
  // Allocate exactly str.size() bytes (no null terminator) and copy.
  auto len = str.size();
  char *dest = allocate<char>(len);
  std::memcpy(dest, str.data(), len);
  return std::string_view{dest, len};
}
/**
* @brief Reset the allocator to reuse the first block, freeing all others.
*

389
src/commit_pipeline.cpp Normal file
View File

@@ -0,0 +1,389 @@
#include "commit_pipeline.hpp"
#include <cstring>
#include <pthread.h>
#include <unordered_set>
#include "commit_request.hpp"
#include "cpu_work.hpp"
#include "format.hpp"
#include "metric.hpp"
#include "pipeline_entry.hpp"
// Gauge reporting the bytes held by the banned-request-ID arena used in the
// sequence stage for request-ID deduplication. Created once at static-init
// time with an empty label set.
auto banned_request_ids_memory_gauge =
    metric::create_gauge("weaseldb_banned_request_ids_memory_bytes",
                         "Memory used by banned request IDs arena")
        .create({});
// Constructs the commit pipeline and launches all stage worker threads.
//
// Stage thread topology is {1, 1, 1, N}: sequence, resolve, and persist are
// single-threaded, while the release stage runs
// config.commit.pipeline_release_threads workers in parallel. The wait
// strategy for inter-stage handoff comes from the configuration.
CommitPipeline::CommitPipeline(const weaseldb::Config &config)
    : config_(config),
      pipeline_(config.commit.pipeline_wait_strategy,
                {1, 1, 1, config.commit.pipeline_release_threads}, lg_size) {
  // Stage 0: Sequence assignment thread
  sequence_thread_ = std::thread{[this]() {
    pthread_setname_np(pthread_self(), "txn-sequence");
    run_sequence_stage();
  }};
  // Stage 1: Precondition resolution thread
  resolve_thread_ = std::thread{[this]() {
    pthread_setname_np(pthread_self(), "txn-resolve");
    run_resolve_stage();
  }};
  // Stage 2: Transaction persistence thread
  persist_thread_ = std::thread{[this]() {
    pthread_setname_np(pthread_self(), "txn-persist");
    run_persist_stage();
  }};
  // Stage 3: Connection return to server threads (configurable count)
  release_threads_.reserve(config.commit.pipeline_release_threads);
  for (int i = 0; i < config.commit.pipeline_release_threads; ++i) {
    release_threads_.emplace_back([this, i]() {
      // Fixed 16-byte buffer: Linux caps thread names at 16 bytes
      // including the NUL terminator.
      char name[16];
      std::snprintf(name, sizeof(name), "txn-release-%d", i);
      pthread_setname_np(pthread_self(), name);
      run_release_stage(i);
    });
  }
}
/**
 * Shut down the pipeline and join all stage threads.
 *
 * Pushes one ShutdownEntry per release thread in a SINGLE batch so the
 * entries occupy adjacent ring slots. The release stage partitions entries
 * by ring index modulo the release-thread count, so adjacency guarantees
 * each release worker receives exactly one shutdown; the three serial
 * stages see all of them and exit after counting
 * pipeline_release_threads shutdowns.
 */
CommitPipeline::~CommitPipeline() {
  // Send shutdown signals for all release threads (adjacent in same batch)
  {
    int num_release_threads = static_cast<int>(release_threads_.size());
    auto guard = pipeline_.push(num_release_threads, true);
    for (int i = 0; i < num_release_threads; ++i) {
      guard.batch[i] = ShutdownEntry{};
    }
    // Guard destructor publishes the shutdown batch to stage 0
  }
  // Join all pipeline threads; each exits its loop once the shutdown
  // entries have flowed through its stage.
  sequence_thread_.join();
  resolve_thread_.join();
  persist_thread_.join();
  for (auto &thread : release_threads_) {
    thread.join();
  }
}
/**
 * Submit a batch of pipeline entries to stage 0.
 *
 * Blocks when the ring is at capacity (backpressure). Entries are moved
 * into the ring slots; the push guard's destructor publishes them, so the
 * batch is visible to the sequence stage by the time this returns.
 *
 * @param entries Entries to move into the pipeline (left moved-from)
 */
void CommitPipeline::submit_batch(std::span<PipelineEntry> entries) {
  if (!entries.empty()) {
    auto slot_guard = pipeline_.push(entries.size(), /*block=*/true);
    auto dest = slot_guard.batch.begin();
    for (auto &entry : entries) {
      *dest++ = std::move(entry);
    }
    // slot_guard destructor publishes the batch to stage 0
  }
}
// AVOID BLOCKING IN THIS STAGE!
/**
 * Stage 0: sequence assignment (single thread).
 *
 * Performs ONLY work that requires serial processing:
 * - Version/sequence number assignment (must be sequential)
 * - Request ID banned-list management for status queries
 *
 * The banned set's string_views point into banned_request_arena, which is
 * never reset and therefore only grows for the lifetime of this thread.
 */
void CommitPipeline::run_sequence_stage() {
  int64_t next_version = 1;
  // Request ID deduplication (sequence stage only)
  Arena banned_request_arena;
  using BannedRequestIdSet =
      std::unordered_set<std::string_view, std::hash<std::string_view>,
                         std::equal_to<std::string_view>,
                         ArenaStlAllocator<std::string_view>>;
  BannedRequestIdSet banned_request_ids{
      ArenaStlAllocator<std::string_view>(&banned_request_arena)};
  int expected_shutdowns = config_.commit.pipeline_release_threads;
  for (int shutdowns_received = 0; shutdowns_received < expected_shutdowns;) {
    auto guard = pipeline_.acquire(0, 0);
    auto &batch = guard.batch;
    for (auto &entry : batch) {
      // Pattern match on pipeline entry variant
      std::visit(
          [&](auto &&e) {
            using T = std::decay_t<decltype(e)>;
            if constexpr (std::is_same_v<T, ShutdownEntry>) {
              ++shutdowns_received;
            } else if constexpr (std::is_same_v<T, CommitEntry>) {
              // Process commit entry: check banned list, assign version
              auto &commit_entry = e;
              assert(commit_entry.commit_request);
              // Check if request_id is banned (for status queries)
              // Only check CommitRequest request_id, not HTTP header
              if (commit_entry.commit_request &&
                  commit_entry.commit_request->request_id().has_value()) {
                auto commit_request_id =
                    commit_entry.commit_request->request_id().value();
                if (banned_request_ids.contains(commit_request_id)) {
                  // Request ID is banned, this commit should fail
                  commit_entry.response_json =
                      R"({"status": "not_committed", "error": "request_id_banned"})";
                  return;
                }
              }
              // Assign sequential version number
              commit_entry.assigned_version = next_version++;
            } else if constexpr (std::is_same_v<T, StatusEntry>) {
              // Process status entry: add request_id to banned list, get
              // version upper bound.
              auto &status_entry = e;
              // Check membership BEFORE copying into the arena: copying
              // first would permanently grow the never-reset arena on every
              // duplicate status query for an already-banned ID.
              if (!banned_request_ids.contains(
                      status_entry.status_request_id)) {
                std::string_view request_id_view =
                    banned_request_arena.copy_string(
                        status_entry.status_request_id);
                banned_request_ids.insert(request_id_view);
                // Update memory usage metric (only changes on insert)
                banned_request_ids_memory_gauge.set(
                    banned_request_arena.total_allocated());
              }
              // Set version upper bound to current highest assigned version
              status_entry.version_upper_bound = next_version - 1;
            } else if constexpr (std::is_same_v<T, HealthCheckEntry>) {
              // Process health check entry: noop in sequence stage
            }
          },
          entry);
    }
  }
}
// AVOID BLOCKING IN THIS STAGE!
/**
 * Stage 1: precondition resolution (single thread).
 *
 * Must be serialized to maintain a consistent database-state view while
 * validating preconditions and detecting conflicts. Exits after seeing one
 * ShutdownEntry per release thread.
 */
void CommitPipeline::run_resolve_stage() {
  const int expected_shutdowns = config_.commit.pipeline_release_threads;
  int seen_shutdowns = 0;
  while (seen_shutdowns < expected_shutdowns) {
    // maxBatch of 1 keeps per-entry latency low in this serialized stage.
    auto guard = pipeline_.acquire(1, 0, /*maxBatch*/ 1);
    for (auto &entry : guard.batch) {
      // Dispatch on the concrete entry type held by the variant.
      std::visit(
          [&](auto &&e) {
            using T = std::decay_t<decltype(e)>;
            if constexpr (std::is_same_v<T, ShutdownEntry>) {
              ++seen_shutdowns;
            } else if constexpr (std::is_same_v<T, CommitEntry>) {
              // Simplified implementation: every commit passes resolution.
              e.resolve_success = true;
            } else if constexpr (std::is_same_v<T, StatusEntry>) {
              // Status entries were fully handled in the sequence stage.
            } else if constexpr (std::is_same_v<T, HealthCheckEntry>) {
              // Configurable CPU burn for benchmarking this stage.
              spend_cpu_cycles(config_.benchmark.ok_resolve_iterations);
            }
          },
          entry);
    }
  }
}
/**
 * Stage 2: transaction persistence (single thread).
 *
 * Marks entries durable immediately (simplified implementation — a real one
 * would batch S3 writes, notify subscribers, etc.), publishes the committed
 * version high-water mark, and renders each entry's JSON response into its
 * request arena for the release stage.
 */
void CommitPipeline::run_persist_stage() {
  int expected_shutdowns = config_.commit.pipeline_release_threads;
  for (int shutdowns_received = 0; shutdowns_received < expected_shutdowns;) {
    auto guard = pipeline_.acquire(2, 0);
    auto &batch = guard.batch;
    for (auto &entry : batch) {
      // Pattern match on pipeline entry variant
      std::visit(
          [&](auto &&e) {
            using T = std::decay_t<decltype(e)>;
            if constexpr (std::is_same_v<T, ShutdownEntry>) {
              ++shutdowns_received;
            } else if constexpr (std::is_same_v<T, CommitEntry>) {
              // Process commit entry: mark as durable, generate response
              auto &commit_entry = e;
              // Skip if resolve failed or the request is already gone
              if (!commit_entry.commit_request ||
                  !commit_entry.resolve_success) {
                return;
              }
              // Mark as persisted and update committed version high water mark
              commit_entry.persist_success = true;
              committed_version_.store(commit_entry.assigned_version,
                                       std::memory_order_seq_cst);
              const CommitRequest &commit_request =
                  *commit_entry.commit_request;
              // Generate success JSON response with actual assigned version.
              // format() renders into the request arena; copy_string pins
              // the final view in the same arena for the release stage
              // (replaces a hand-rolled allocate<char> + memcpy — see
              // Arena::copy_string).
              std::string_view response_json;
              if (commit_request.request_id().has_value()) {
                response_json = format(
                    commit_entry.request_arena,
                    R"({"request_id":"%.*s","status":"committed","version":%ld,"leader_id":"leader123"})",
                    static_cast<int>(
                        commit_request.request_id().value().size()),
                    commit_request.request_id().value().data(),
                    commit_entry.assigned_version);
              } else {
                response_json = format(
                    commit_entry.request_arena,
                    R"({"status":"committed","version":%ld,"leader_id":"leader123"})",
                    commit_entry.assigned_version);
              }
              commit_entry.response_json =
                  commit_entry.request_arena.copy_string(response_json);
              return; // Continue processing
            } else if constexpr (std::is_same_v<T, StatusEntry>) {
              // Process status entry: generate not_committed response
              e.response_json = R"({"status": "not_committed"})";
            } else if constexpr (std::is_same_v<T, HealthCheckEntry>) {
              // Process health check entry: plain-text OK for release stage
              e.response_json = "OK";
            } else if constexpr (std::is_same_v<T, GetVersionEntry>) {
              auto &get_version_entry = e;
              // TODO validate we're still the leader at some version > the
              // proposed version for external consistency.
              // TODO include leader in response
              get_version_entry.response_json = format(
                  get_version_entry.request_arena,
                  R"({"version":%ld,"leader":""})", get_version_entry.version);
            }
          },
          entry);
    }
  }
}
/**
 * Stage 3: connection release (one of pipeline_release_threads workers).
 *
 * Entries are statically partitioned by ring index modulo the release-thread
 * count, so each entry is delivered by exactly one worker with no locking
 * between workers. Each worker exits after consuming the single
 * ShutdownEntry that lands in its partition (the destructor pushes one
 * adjacent ShutdownEntry per worker).
 *
 * @param thread_index This worker's partition index [0, release_threads)
 */
void CommitPipeline::run_release_stage(int thread_index) {
  // All non-shutdown entry types share the same delivery shape
  // (connection / protocol_context / response_json / request_arena), so one
  // generic lambda replaces four identical branches. Drops the entry
  // silently when the connection has already gone away.
  auto deliver = [](auto &e) {
    auto conn_ref = e.connection.lock();
    if (!conn_ref) {
      return; // Connection is gone, drop the entry silently
    }
    // Protocol-agnostic send; HTTP formatting happens later in
    // on_preprocess_writes() on the I/O thread.
    conn_ref->send_response(e.protocol_context, e.response_json,
                            std::move(e.request_arena));
  };
  for (int shutdowns_received = 0; shutdowns_received < 1;) {
    auto guard = pipeline_.acquire(3, thread_index);
    auto &batch = guard.batch;
    for (auto it = batch.begin(); it != batch.end(); ++it) {
      // Partition work: this worker handles only entries whose ring index
      // maps to thread_index modulo the configured release-thread count.
      if (static_cast<int>(it.index() %
                           config_.commit.pipeline_release_threads) !=
          thread_index) {
        continue;
      }
      std::visit(
          [&](auto &&e) {
            using T = std::decay_t<decltype(e)>;
            if constexpr (std::is_same_v<T, ShutdownEntry>) {
              // Our partition's shutdown: ends this worker's loop.
              ++shutdowns_received;
            } else {
              // CommitEntry, StatusEntry, HealthCheckEntry, GetVersionEntry
              deliver(e);
            }
          },
          *it);
    }
  }
}

120
src/commit_pipeline.hpp Normal file
View File

@@ -0,0 +1,120 @@
#pragma once
#include <atomic>
#include <span>
#include <thread>
#include "config.hpp"
#include "pipeline_entry.hpp"
#include "thread_pipeline.hpp"
/**
* High-performance 4-stage commit processing pipeline.
*
* Provides protocol-agnostic transaction processing through a lock-free
* multi-stage pipeline optimized for high throughput and low latency.
*
* Pipeline Stages:
* 1. Sequence: Version assignment and request ID deduplication
* 2. Resolve: Precondition validation and conflict detection
* 3. Persist: Transaction durability and response generation
* 4. Release: Connection return and response transmission
*
* Thread Safety:
* - submit_batch() is thread-safe for concurrent producers
* - Internal pipeline uses lock-free algorithms
* - Each stage runs on dedicated threads for optimal performance
*
* Usage:
* ```cpp
* CommitPipeline pipeline(config);
*
* // Build pipeline entries
* std::vector<PipelineEntry> entries;
* entries.emplace_back(CommitEntry(connection, context, request, arena));
*
* // Submit for processing
* pipeline.submit_batch(entries);
* ```
*/
struct CommitPipeline {
  /**
   * Create pipeline with 4 processing stages.
   *
   * @param config Server configuration for pipeline tuning
   */
  explicit CommitPipeline(const weaseldb::Config &config);
  /**
   * Destructor ensures clean shutdown and thread join.
   * Sends shutdown signal through pipeline and waits for all stages to
   * complete.
   */
  ~CommitPipeline();
  /**
   * Submit batch of pipeline entries for processing.
   *
   * Thread-safe method for submitting work to the pipeline. Entries flow
   * through all 4 stages in order with proper synchronization.
   *
   * @param entries Span of pipeline entries to process
   *
   * Entry types:
   * - CommitEntry: Full transaction processing through all stages
   * - StatusEntry: Request status lookup with sequence stage processing
   * - HealthCheckEntry: Health check with configurable CPU work
   * - ShutdownEntry: Coordinated pipeline shutdown signal
   *
   * @note Thread Safety: Safe for concurrent calls from multiple threads
   * @note Performance: Batching reduces pipeline contention - prefer larger
   * batches
   * @note Blocking: May block if pipeline is at capacity (backpressure)
   */
  void submit_batch(std::span<PipelineEntry> entries);
  /**
   * Get the highest committed version number.
   *
   * @return Current committed version (persist thread writes, other threads
   * read)
   * @note Thread Safety: Safe to read from any thread
   * @note [[nodiscard]]: ignoring the result is always a bug; noexcept
   * because an atomic load cannot throw.
   */
  [[nodiscard]] int64_t get_committed_version() const noexcept {
    return committed_version_.load(std::memory_order_seq_cst);
  }

private:
  // Configuration reference
  const weaseldb::Config &config_;
  // Pipeline state (persist thread writes, other threads read)
  std::atomic<int64_t> committed_version_{0}; // Highest committed version
  // Lock-free pipeline configuration
  static constexpr int lg_size = 16; // Ring buffer size (2^16 slots)
  // 4-stage pipeline: sequence -> resolve -> persist -> release
  ThreadPipeline<PipelineEntry> pipeline_;
  // Stage processing threads
  std::thread sequence_thread_;
  std::thread resolve_thread_;
  std::thread persist_thread_;
  std::vector<std::thread> release_threads_;
  // Pipeline stage main loops
  void run_sequence_stage();
  void run_resolve_stage();
  void run_persist_stage();
  void run_release_stage(int thread_index);
  // Pipeline batch type alias
  using BatchType = ThreadPipeline<PipelineEntry>::Batch;
  // Make non-copyable and non-movable
  CommitPipeline(const CommitPipeline &) = delete;
  CommitPipeline &operator=(const CommitPipeline &) = delete;
  CommitPipeline(CommitPipeline &&) = delete;
  CommitPipeline &operator=(CommitPipeline &&) = delete;
};

View File

@@ -244,14 +244,7 @@ public:
* @return String view pointing to arena-allocated memory
*/
std::string_view copy_to_arena(std::string_view str) {
if (str.empty()) {
return {};
}
char *arena_str = arena_.allocate<char>(str.size());
std::memcpy(arena_str, str.data(), str.size());
return std::string_view(arena_str, str.size());
return arena_.copy_string(str);
}
/**

View File

@@ -1,5 +1,4 @@
#include "config.hpp"
#include <fstream>
#include <iostream>
#include <toml.hpp>
@@ -109,22 +108,13 @@ void ConfigParser::parse_server_config(const auto &toml_data,
parse_field(srv, "max_request_size_bytes", config.max_request_size_bytes);
parse_field(srv, "io_threads", config.io_threads);
// Set epoll_instances default to io_threads if not explicitly configured
bool epoll_instances_specified = srv.contains("epoll_instances");
if (!epoll_instances_specified) {
config.epoll_instances = config.io_threads;
} else {
parse_field(srv, "epoll_instances", config.epoll_instances);
}
// epoll_instances removed - now 1:1 with io_threads
parse_field(srv, "event_batch_size", config.event_batch_size);
parse_field(srv, "max_connections", config.max_connections);
parse_field(srv, "read_buffer_size", config.read_buffer_size);
// Clamp epoll_instances to not exceed io_threads
if (config.epoll_instances > config.io_threads) {
config.epoll_instances = config.io_threads;
}
// epoll_instances validation removed - now always equals io_threads
});
}
@@ -136,6 +126,25 @@ void ConfigParser::parse_commit_config(const auto &toml_data,
config.request_id_retention_hours);
parse_field(commit, "request_id_retention_versions",
config.request_id_retention_versions);
// Parse wait strategy
if (commit.contains("pipeline_wait_strategy")) {
std::string strategy_str =
toml::get<std::string>(commit.at("pipeline_wait_strategy"));
if (strategy_str == "WaitIfStageEmpty") {
config.pipeline_wait_strategy = WaitStrategy::WaitIfStageEmpty;
} else if (strategy_str == "WaitIfUpstreamIdle") {
config.pipeline_wait_strategy = WaitStrategy::WaitIfUpstreamIdle;
} else if (strategy_str == "Never") {
config.pipeline_wait_strategy = WaitStrategy::Never;
} else {
std::cerr << "Warning: Unknown pipeline_wait_strategy '" << strategy_str
<< "', using default (WaitIfUpstreamIdle)" << std::endl;
}
}
parse_field(commit, "pipeline_release_threads",
config.pipeline_release_threads);
});
}
@@ -213,15 +222,7 @@ bool ConfigParser::validate_config(const Config &config) {
valid = false;
}
if (config.server.epoll_instances < 1 ||
config.server.epoll_instances > config.server.io_threads) {
std::cerr
<< "Configuration error: server.epoll_instances must be between 1 "
"and io_threads ("
<< config.server.io_threads << "), got "
<< config.server.epoll_instances << std::endl;
valid = false;
}
// epoll_instances validation removed - now always 1:1 with io_threads
if (config.server.event_batch_size < 1 ||
config.server.event_batch_size > 10000) {
@@ -271,6 +272,14 @@ bool ConfigParser::validate_config(const Config &config) {
valid = false;
}
if (config.commit.pipeline_release_threads < 1 ||
config.commit.pipeline_release_threads > 64) {
std::cerr << "Configuration error: commit.pipeline_release_threads must be "
"between 1 and 64, got "
<< config.commit.pipeline_release_threads << std::endl;
valid = false;
}
// Validate subscription configuration
if (config.subscription.max_buffer_size_bytes == 0) {
std::cerr << "Configuration error: subscription.max_buffer_size_bytes must "

View File

@@ -5,6 +5,8 @@
#include <string>
#include <vector>
#include "thread_pipeline.hpp"
namespace weaseldb {
/**
@@ -40,10 +42,8 @@ struct ServerConfig {
/// Maximum size in bytes for incoming HTTP requests (default: 1MB)
int64_t max_request_size_bytes = 1024 * 1024;
/// Number of I/O threads for handling connections and network events
/// Each I/O thread gets its own dedicated epoll instance
int io_threads = 1;
/// Number of epoll instances to reduce epoll_ctl contention (default:
/// io_threads, max: io_threads)
int epoll_instances = 1;
/// Event batch size for epoll processing
int event_batch_size = 32;
/// Maximum number of concurrent connections (0 = unlimited)
@@ -62,6 +62,16 @@ struct CommitConfig {
std::chrono::hours request_id_retention_hours{24};
/// Minimum number of commit versions to retain request IDs for
int64_t request_id_retention_versions = 100000000;
/// Wait strategy for the commit pipeline
/// - WaitIfStageEmpty: Block when individual stages are empty (default, safe
/// for shared CPUs)
/// - WaitIfUpstreamIdle: Block only when all upstream stages are idle
/// (requires dedicated cores)
/// - Never: Never block, busy-wait continuously (requires dedicated cores)
WaitStrategy pipeline_wait_strategy = WaitStrategy::WaitIfUpstreamIdle;
/// Number of threads in the release stage (final stage of commit pipeline)
/// Default: 1 thread for simplicity (can increase for higher throughput)
int pipeline_release_threads = 1;
};
/**

View File

@@ -4,9 +4,10 @@
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <sys/epoll.h>
#include "metric.hpp"
#include "server.hpp" // Need this for release_back_to_server implementation
#include "server.hpp" // Need this for server reference
namespace {
// Thread-local metric instances
@@ -35,15 +36,16 @@ thread_local auto write_eagain_failures =
// Static thread-local storage for iovec buffer
static thread_local std::vector<struct iovec> g_iovec_buffer{IOV_MAX};
// Thread-local storage for arenas to be freed after unlocking
static thread_local std::vector<Arena> g_arenas_to_free;
Connection::Connection(struct sockaddr_storage addr, int fd, int64_t id,
size_t epoll_index, ConnectionHandler *handler,
WeakRef<Server> server)
: fd_(fd), id_(id), epoll_index_(epoll_index), addr_(addr), arena_(),
handler_(handler), server_(std::move(server)) {
: id_(id), epoll_index_(epoll_index), addr_(addr), handler_(handler),
server_(std::move(server)), fd_(fd) {
auto server_ref = server_.lock();
// This should only be called from a member of Server itself, so I should
// hope it's alive.
// Should only be called from the io thread
assert(server_ref);
server_ref->active_connections_.fetch_add(1, std::memory_order_relaxed);
@@ -56,34 +58,98 @@ Connection::Connection(struct sockaddr_storage addr, int fd, int64_t id,
}
Connection::~Connection() {
if (handler_) {
handler_->on_connection_closed(*this);
}
// Server may legitimately be gone now
if (auto server_ptr = server_.lock()) {
server_ptr->active_connections_.fetch_sub(1, std::memory_order_relaxed);
}
// Decrement active connections gauge
connections_active.dec();
int e = close(fd_);
if (fd_ >= 0) {
int e = ::close(fd_);
if (e == -1 && errno != EINTR) {
perror("close");
std::abort();
}
// EINTR ignored - fd is guaranteed closed on Linux
}
void Connection::append_message(std::string_view s, bool copy_to_arena) {
if (copy_to_arena) {
char *arena_str = arena_.allocate<char>(s.size());
std::memcpy(arena_str, s.data(), s.size());
messages_.emplace_back(arena_str, s.size());
} else {
messages_.push_back(s);
}
outgoing_bytes_queued_ += s.size();
void Connection::close() {
std::lock_guard lock{mutex_};
auto server_ptr = server_.lock();
// Should only be called from the io thread
assert(server_ptr);
server_ptr->active_connections_.fetch_sub(1, std::memory_order_relaxed);
assert(fd_ >= 0);
int e = ::close(fd_);
if (e == -1 && errno != EINTR) {
perror("close");
std::abort();
}
// EINTR ignored - fd is guaranteed closed on Linux
fd_ = -1;
// Decrement active connections gauge
connections_active.dec();
}
// Called from I/O thread only
void Connection::append_bytes(std::span<std::string_view> data_parts,
Arena arena, ConnectionShutdown shutdown_mode) {
// Prevent queueing messages after shutdown has been requested
if (shutdown_requested_ != ConnectionShutdown::None) {
return;
}
// Check if queue was empty to determine if we need to enable EPOLLOUT
bool was_empty = message_queue_.empty();
// Set shutdown mode if requested
if (shutdown_mode != ConnectionShutdown::None) {
shutdown_requested_ = shutdown_mode;
}
// Add message to queue
// TODO this allocates while holding the connection lock
message_queue_.emplace_back(Message{std::move(arena), data_parts});
// If queue was empty, we need to add EPOLLOUT interest.
if (was_empty) {
auto server = server_.lock();
if (fd_ >= 0 && server) {
// Add EPOLLOUT interest - pipeline thread manages epoll
struct epoll_event event;
event.data.fd = fd_;
event.events = EPOLLIN | EPOLLOUT;
tsan_release();
// I think we have to call epoll_ctl while holding mutex_. Otherwise a
// call that clears the write interest could get reordered with one that
// sets it and we would hang.
epoll_ctl(server->epoll_fds_[epoll_index_], EPOLL_CTL_MOD, fd_, &event);
}
}
}
// May be called from a foreign thread!
void Connection::send_response(void *protocol_context,
std::string_view response_json, Arena arena) {
std::unique_lock lock(mutex_);
// Prevent queueing responses after shutdown has been requested
if (shutdown_requested_ != ConnectionShutdown::None) {
return;
}
// Store response in queue for protocol handler processing
pending_response_queue_.emplace_back(
PendingResponse{protocol_context, response_json, std::move(arena)});
// Trigger epoll interest if this is the first pending response
if (pending_response_queue_.size() == 1) {
auto server = server_.lock();
if (fd_ >= 0 && server) {
// Add EPOLLOUT interest to trigger on_preprocess_writes
struct epoll_event event;
event.data.fd = fd_;
event.events = EPOLLIN | EPOLLOUT;
tsan_release();
epoll_ctl(server->epoll_fds_[epoll_index_], EPOLL_CTL_MOD, fd_, &event);
}
}
}
int Connection::readBytes(char *buf, size_t buffer_size) {
@@ -105,36 +171,61 @@ int Connection::readBytes(char *buf, size_t buffer_size) {
}
// Increment bytes read metric
if (r > 0) {
assert(r > 0);
bytes_read.inc(r);
}
return r;
}
}
bool Connection::writeBytes() {
uint32_t Connection::write_bytes() {
ssize_t total_bytes_written = 0;
while (!messages_.empty()) {
uint32_t result = 0;
while (true) {
// Build iovec array while holding mutex using thread-local buffer
int iov_count = 0;
{
std::lock_guard lock(mutex_);
if (message_queue_.empty()) {
break;
}
// Build iovec array up to IOV_MAX limit using thread-local vector
assert(g_iovec_buffer.size() == IOV_MAX);
struct iovec *iov = g_iovec_buffer.data();
int iov_count = 0;
for (auto it = messages_.begin();
it != messages_.end() && iov_count < IOV_MAX; ++it) {
const auto &msg = *it;
for (auto &message : message_queue_) {
if (iov_count >= IOV_MAX)
break;
for (const auto &part : message.data_parts) {
if (iov_count >= IOV_MAX)
break;
if (part.empty())
continue;
iov[iov_count] = {
const_cast<void *>(static_cast<const void *>(msg.data())),
msg.size()};
const_cast<void *>(static_cast<const void *>(part.data())),
part.size()};
iov_count++;
}
}
assert(iov_count > 0);
if (iov_count == 0)
break;
} // Release mutex during I/O
// Perform I/O without holding mutex
ssize_t w;
for (;;) {
w = writev(fd_, iov, iov_count);
struct msghdr msg = {};
msg.msg_iov = g_iovec_buffer.data();
msg.msg_iovlen = iov_count;
w = sendmsg(fd_, &msg, MSG_NOSIGNAL);
if (w == -1) {
if (errno == EINTR) {
continue; // Standard practice: retry on signal interruption
@@ -142,45 +233,87 @@ bool Connection::writeBytes() {
if (errno == EAGAIN) {
// Increment EAGAIN failure metric
write_eagain_failures.inc();
// Increment bytes written metric before returning
if (total_bytes_written > 0) {
bytes_written.inc(total_bytes_written);
return result;
}
return false;
}
perror("writev");
return true;
perror("sendmsg");
result |= Error;
return result;
}
break;
}
result |= Progress;
assert(w > 0);
total_bytes_written += w;
// Handle partial writes by updating string_view data/size
size_t bytes_written = static_cast<size_t>(w);
outgoing_bytes_queued_ -= bytes_written;
while (bytes_written > 0 && !messages_.empty()) {
auto &front = messages_.front();
// Handle partial writes by updating message data_parts
{
std::lock_guard lock(mutex_);
size_t bytes_remaining = static_cast<size_t>(w);
if (bytes_written >= front.size()) {
// This message is completely written
bytes_written -= front.size();
messages_.pop_front();
while (bytes_remaining > 0 && !message_queue_.empty()) {
auto &front_message = message_queue_.front();
for (auto &part : front_message.data_parts) {
if (part.empty())
continue;
if (bytes_remaining >= part.size()) {
// This part is completely written
bytes_remaining -= part.size();
part = std::string_view(); // Mark as consumed
} else {
// Partial write of this message - update string_view
front = std::string_view(front.data() + bytes_written,
front.size() - bytes_written);
bytes_written = 0;
// Partial write of this part
part = std::string_view(part.data() + bytes_remaining,
part.size() - bytes_remaining);
bytes_remaining = 0;
break;
}
}
// Move arena to thread-local vector for deferred cleanup
g_arenas_to_free.emplace_back(std::move(front_message.arena));
message_queue_.pop_front();
if (result & Close) {
break;
}
}
}
}
// Check if queue is empty and remove EPOLLOUT interest
{
std::lock_guard lock(mutex_);
if (message_queue_.empty() && pending_response_queue_.empty()) {
auto server = server_.lock();
if (server) {
struct epoll_event event;
event.data.fd = fd_;
event.events = EPOLLIN; // Remove EPOLLOUT
tsan_release();
// I think we have to call epoll_ctl while holding mutex_. Otherwise a
// call that clears the write interest could get reordered with one that
// sets it and we would hang.
epoll_ctl(server->epoll_fds_[epoll_index_], EPOLL_CTL_MOD, fd_, &event);
}
// Handle shutdown modes after all messages are sent
if (shutdown_requested_ == ConnectionShutdown::WriteOnly) {
// Shutdown write side but keep connection alive for reading
shutdown(fd_, SHUT_WR);
} else if (shutdown_requested_ == ConnectionShutdown::Full) {
result |= Close;
}
}
}
assert(messages_.empty());
// Increment bytes written metric
if (total_bytes_written > 0) {
bytes_written.inc(total_bytes_written);
}
return false;
// Clean up arenas after all mutex operations are complete
// This avoids holding the connection mutex while calling free()
g_arenas_to_free.clear();
return result;
}

View File

@@ -3,6 +3,8 @@
#include <cassert>
#include <cstring>
#include <deque>
#include <mutex>
#include <span>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>
@@ -15,34 +17,76 @@
#define __has_feature(x) 0
#endif
/**
* Represents a single client connection with efficient memory management.
*
* Connection ownership model:
* - Created by I/O thread, processed immediately, then transferred to epoll via
* raw pointer
* - I/O threads claim ownership by wrapping raw pointer in unique_ptr
* - I/O thread optionally passes ownership to a thread pipeline
* - Owner eventually transfers back to epoll by releasing unique_ptr to raw
* pointer
* - RAII cleanup happens if I/O thread doesn't transfer back
*
* Arena allocator thread safety:
* Each Connection contains its own Arena instance that is accessed
* exclusively by the thread that currently owns the connection. This ensures
* thread safety without requiring locks:
* - Arena is used by the owning thread for I/O buffers, request parsing, and
* response generation
* - Arena memory is automatically freed when the connection is destroyed
* - reset() should only be called by the current owner thread
*
* Only the handler interface methods are public - all networking details are
* private.
*/
// Forward declaration
struct Server;
struct Connection {
/**
* Shutdown modes for connection termination.
*/
enum class ConnectionShutdown {
None, // Normal operation - no shutdown requested
WriteOnly, // shutdown(SHUT_WR) after sending queued data
Full // close() after sending queued data
};
/**
* Base interface for sending messages to a connection.
* This restricted interface is safe for use by pipeline threads,
* containing only the append_message method needed for responses.
* Pipeline threads should use WeakRef<MessageSender> to safely
* send responses without accessing other connection functionality
* that should only be used by the I/O thread.
*/
struct MessageSender {
/**
* @brief Send response with protocol-specific context for ordering.
*
* Thread-safe method for pipeline threads to send responses back to clients.
* Delegates to the connection's protocol handler for ordering logic.
* The protocol handler may queue the response or send it immediately.
*
* @param protocol_context Arena-allocated protocol-specific context
* @param data Response data parts (may be empty for deferred serialization)
* @param arena Arena containing response data and context
*
* Example usage:
* ```cpp
* auto* ctx = arena.allocate<HttpResponseContext>();
* ctx->sequence_id = 42;
* auto response_data = format_response(arena);
* conn.send_response(ctx, response_data, std::move(arena));
* ```
*/
virtual void send_response(void *protocol_context,
std::string_view response_json, Arena arena) = 0;
virtual ~MessageSender() = default;
};
/**
* Represents a single client connection - the full interface available to the
* io thread and connection handler.
*
* Connection ownership model:
* - Server owns all connections
* - Handlers receive Connection& references, and can keep a WeakRef to
* MessageSender for async responses.
* - Multiple pipeline threads can safely access the MessageSender concurrently
* - I/O thread has exclusive access to socket operations
*
* Threading model:
* - Single mutex protects state shared with pipeline threads
* - Pipeline threads call Connection methods (append_message, etc.)
* - I/O thread processes socket events and message queue
* - Pipeline threads register epoll write interest via append_message
* - Connection tracks closed state to prevent EBADF errors
*
* Arena allocator usage:
* - Request-scoped arenas created by handlers for each request
* - No connection-owned arena for parsing/response generation
* - Message queue stores spans + owning arenas until I/O completion
*/
struct Connection : MessageSender {
// No public constructor or factory method - only Server can create
// connections
@@ -64,90 +108,72 @@ struct Connection {
// Handler interface - public methods that handlers can use
/**
* @brief Queue a message to be sent to the client.
* @brief Queue an atomic message to be sent to the client.
*
* Adds data to the connection's outgoing message queue. The data will be sent
* asynchronously by the server's I/O threads using efficient vectored
* I/O.
* Adds a complete message with all associated data to the connection's
* outgoing byte queue with guaranteed ordering.
*
* @param s The data to send (string view parameter for efficiency)
* @param copy_to_arena If true (default), copies data to the connection's
* arena for safe storage. If false, the caller must ensure the data remains
* valid until all queued messages are sent.
* I/O thread only method for protocol handlers to queue bytes for sending.
* Bytes are queued in order and sent using efficient vectored I/O.
*
* @warning Thread Safety: Only call from the thread that currently owns this
* connection. The arena allocator is not thread-safe.
* @param data_parts Span of string_views pointing to arena-allocated data
* @param arena Arena that owns all the memory referenced by data_parts
* @param shutdown_mode Shutdown mode to apply after sending all queued data
*
* @note Performance: Use copy_to_arena=false for static strings or data with
* guaranteed lifetime, copy_to_arena=true for temporary/dynamic data.
* @note Thread Safety: Must be called from I/O thread only.
* @note Ordering: Bytes are sent in the order calls are made.
* @note The memory referenced by the data_parts span, must outlive @p arena.
* @note Shutdown Request: To request connection shutdown without sending
* data, pass empty data_parts span with desired shutdown_mode. This ensures
* all previously queued messages are sent before shutdown.
*
* Example usage (from ConnectionHandler::on_preprocess_writes):
* ```cpp
* Arena arena;
* auto parts = arena.allocate_span<std::string_view>(2);
* parts[0] = build_header(arena);
* parts[1] = build_body(arena);
* conn.append_bytes({parts, 2}, std::move(arena), ConnectionShutdown::None);
* ```
*/
void
append_bytes(std::span<std::string_view> data_parts, Arena arena,
ConnectionShutdown shutdown_mode = ConnectionShutdown::None);
void send_response(void *protocol_context, std::string_view response_json,
Arena arena) override;
/**
* @brief Get a WeakRef to this connection for async operations.
*
* Returns a WeakRef that can be safely used to access this connection
* from other threads, such as pipeline processing threads. The WeakRef
* allows safe access even if the connection might be destroyed by the
* time the async operation executes.
*
* @return WeakRef to this connection
*
* @note Thread Safety: This method is thread-safe.
*
* @note The WeakRef should be used with lock() to safely access the
* connection. If lock() returns null, the connection has been destroyed.
*
* Example usage:
* ```cpp
* conn->append_message("HTTP/1.1 200 OK\r\n\r\n", false); // Static string
* conn->append_message(dynamic_response, true); // Dynamic data
* conn->append_message(arena_allocated_data, false); // Arena data
* auto weak_conn = conn.get_weak_ref();
* async_processor.submit([weak_conn, request_data]() {
* if (auto conn = weak_conn.lock()) {
* Arena arena;
* auto response = process_request(request_data, arena);
* conn->append_message({&response, 1}, std::move(arena));
* }
* });
* ```
*/
void append_message(std::string_view s, bool copy_to_arena = true);
/**
* @brief Mark the connection to be closed after sending all queued messages.
*
* Sets a flag that instructs the server to close this connection gracefully
* after all currently queued messages have been successfully sent to the
* client. This enables proper connection cleanup for protocols like HTTP/1.0
* or when implementing connection limits.
*
* @note The connection will remain active until:
* 1. All queued messages are sent to the client
* 2. The server processes the close flag during the next I/O cycle
* 3. The connection is properly closed and cleaned up
*
* @warning Thread Safety: Only call from the thread that currently owns this
* connection.
*
* Typical usage:
* ```cpp
* conn->append_message("HTTP/1.1 200 OK\r\n\r\nBye!");
* conn->close_after_send(); // Close after sending response
* ```
*/
void close_after_send() { closeConnection_ = true; }
/**
* @brief Get access to the connection's arena allocator.
*
* Returns a reference to this connection's private Arena instance,
* which should be used for all temporary allocations during request
* processing. The arena provides extremely fast allocation (~1ns) and
* automatic cleanup when the connection is destroyed or reset.
*
* @return Reference to the connection's arena allocator
*
* @warning Thread Safety: Only access from the thread that currently owns
* this connection. The arena allocator is not thread-safe and concurrent
* access will result in undefined behavior.
*
* @note Memory Lifecycle: Arena memory is automatically freed when:
* - The connection is destroyed
* - reset() is called (keeps first block, frees others)
* - The connection is moved (arena ownership transfers)
*
* Best practices:
* ```cpp
* Arena& arena = conn->get_arena();
*
* // Allocate temporary parsing buffers
* char* buffer = arena.allocate<char>(1024);
*
* // Construct temporary objects
* auto* request = arena.construct<HttpRequest>(arena);
*
* // Use arena-backed STL containers
* std::vector<Token, ArenaStlAllocator<Token>> tokens{&arena};
* ```
*/
Arena &get_arena() { return arena_; }
WeakRef<MessageSender> get_weak_ref() const {
assert(self_ref_.lock());
return self_ref_.copy();
}
/**
* @brief Get the unique identifier for this connection.
@@ -175,54 +201,6 @@ struct Connection {
*/
int64_t get_id() const { return id_; }
/**
* @brief Get the number of bytes queued for transmission.
*
* Returns the total number of bytes in all messages currently
* queued for transmission to the client. This includes all data added via
* append_message() that has not yet been sent over the network.
*
* @return Total bytes queued for transmission
*
* @warning Thread Safety: Only call from the thread that currently owns this
* connection. Concurrent access to the message queue is not thread-safe.
*
* @note Performance: This method uses an O(1) counter for fast retrieval
* in release builds. In debug builds, validates counter accuracy.
*
* @note The count decreases as the server sends data via writeBytes() and
* removes completed messages from the queue.
*
* Use cases:
* ```cpp
* // Check if all data has been sent
* if (conn->outgoingBytesQueued() == 0) {
* conn->reset(); // Safe to reset arena
* }
*
* // Implement backpressure
* if (conn->outgoingBytesQueued() > MAX_BUFFER_SIZE) {
* // Stop adding more data until queue drains
* }
*
* // Logging/monitoring
* metrics.recordQueueDepth(conn->get_id(), conn->outgoingBytesQueued());
* ```
*/
int64_t outgoing_bytes_queued() const {
#ifndef NDEBUG
// Debug build: validate counter accuracy
int64_t computed_total = 0;
for (auto s : messages_) {
computed_total += s.size();
}
assert(
outgoing_bytes_queued_ == computed_total &&
"outgoing_bytes_queued_ counter is out of sync with actual queue size");
#endif
return outgoing_bytes_queued_;
}
/**
* @brief Protocol-specific data pointer for handler use.
*
@@ -245,7 +223,7 @@ struct Connection {
*
* Example usage:
* ```cpp
* class HttpHandler : public ConnectionHandler {
* class HttpHandler : ConnectionHandler {
* void on_connection_established(Connection& conn) override {
* // Allocate HTTP state in connection's arena or heap
* auto* state = conn.get_arena().construct<HttpConnectionState>();
@@ -259,8 +237,8 @@ struct Connection {
* }
*
* void on_data_arrived(std::string_view data,
* std::unique_ptr<Connection>& conn_ptr) override {
* auto* state = static_cast<HttpConnectionState*>(conn_ptr->user_data);
* Connection& conn) override {
* auto* state = static_cast<HttpConnectionState*>(conn.user_data);
* // Use state for protocol processing...
* }
* };
@@ -268,50 +246,13 @@ struct Connection {
*/
void *user_data = nullptr;
/**
* Reset the connection's arena allocator and message queue for reuse.
*
* This method efficiently reclaims arena memory by keeping the first block
* and freeing all others, then reinitializes the message queue.
*
* @warning Thread Safety: This method should ONLY be called by the thread
* that currently owns this connection. Calling reset() while the connection
* is being transferred between threads or accessed by another thread will
* result in undefined behavior.
*
* @note The assert(messages_.empty()) ensures all outgoing data has been
* sent before resetting. This prevents data loss and indicates the connection
* is in a clean state for reuse.
*
* Typical usage pattern:
* - HTTP handlers call this after completing a request/response cycle
*/
void reset() {
assert(messages_.empty());
outgoing_bytes_queued_ = 0;
arena_.reset();
messages_ =
std::deque<std::string_view, ArenaStlAllocator<std::string_view>>{
ArenaStlAllocator<std::string_view>{&arena_}};
}
/**
* @note Ownership Transfer: To release a connection back to the server for
* continued processing, use the static method:
* ```cpp
* Server::release_back_to_server(std::move(connection_ptr));
* ```
*
* This is the correct way to return connection ownership when:
* - A handler has taken ownership via unique_ptr.release()
* - Background processing of the connection is complete
* - The connection should resume normal server-managed I/O processing
*
* The method is thread-safe and handles the case where the server may have
* been destroyed while the connection was being processed elsewhere.
*/
private:
struct Message {
Arena arena; // Owns all the memory (movable)
std::span<std::string_view> data_parts; // Points to arena-allocated memory
// (mutable for partial writes)
};
// Server is a friend and can access all networking internals
friend struct Server;
@@ -333,30 +274,43 @@ private:
size_t epoll_index, ConnectionHandler *handler,
WeakRef<Server> server);
template <typename T, typename... Args>
friend Ref<T> make_ref(Args &&...args);
// Networking interface - only accessible by Server
int readBytes(char *buf, size_t buffer_size);
bool writeBytes();
enum WriteBytesResult {
Error = 1 << 0,
Progress = 1 << 1,
Close = 1 << 2,
};
uint32_t write_bytes();
// Direct access methods for Server
int getFd() const { return fd_; }
bool has_messages() const { return !messages_.empty(); }
bool should_close() const { return closeConnection_; }
size_t getEpollIndex() const { return epoll_index_; }
const int fd_;
void close();
// Immutable connection properties
const int64_t id_;
const size_t epoll_index_; // Index of the epoll instance this connection uses
struct sockaddr_storage addr_; // sockaddr_storage handles IPv4/IPv6
Arena arena_;
ConnectionHandler *handler_;
WeakRef<Server> server_; // Weak reference to server for safe cleanup
ConnectionHandler *const handler_;
WeakRef<Server> server_; // Weak reference to server for safe epoll_ctl calls
WeakRef<Connection> self_ref_; // WeakRef to self for get_weak_ref()
std::deque<std::string_view, ArenaStlAllocator<std::string_view>> messages_{
ArenaStlAllocator<std::string_view>{&arena_}};
// Only accessed from io thread
std::deque<Message> message_queue_;
// Counter tracking total bytes queued for transmission
int64_t outgoing_bytes_queued_{0};
mutable std::mutex mutex_;
ConnectionShutdown shutdown_requested_{
ConnectionShutdown::None}; // Protected by mutex_
std::deque<PendingResponse> pending_response_queue_; // Protected by mutex_
int fd_; // Protected by mutex_
// Whether or not to close the connection after completing writing the
// response
bool closeConnection_{false};
#if __has_feature(thread_sanitizer)
void tsan_acquire() { tsan_sync.load(std::memory_order_acquire); }
void tsan_release() { tsan_sync.store(0, std::memory_order_release); }
std::atomic<int> tsan_sync;
#else
void tsan_acquire() {}
void tsan_release() {}
#endif
};

View File

@@ -1,12 +1,24 @@
#pragma once
#include <memory>
#include <span>
#include <string_view>
// Forward declaration to avoid circular dependency
// Forward declarations to avoid circular dependency
struct Connection;
// Include Arena header since PendingResponse uses Arena by value
#include "arena.hpp"
/**
* Represents a response queued by pipeline threads for protocol processing.
* Contains JSON response data that can be wrapped by any protocol.
*/
struct PendingResponse {
void *protocol_context; // Arena-allocated protocol-specific context
std::string_view response_json; // JSON response body (arena-allocated)
Arena arena; // Arena containing response data and context
};
/**
* Abstract interface for handling connection data processing.
*
@@ -25,22 +37,21 @@ public:
* Process incoming data from a connection.
*
* @param data Incoming data buffer (may be partial message)
* @param conn_ptr Unique pointer to connection - handler can take ownership
* by releasing it
* @param conn Connection reference - server retains ownership
*
* Implementation should:
* - Parse incoming data using arena allocator when needed
* - Use conn_ptr->append_message() to queue response data to be sent
* - Create request-scoped Arena for parsing and response generation
* - Parse incoming data using the request arena
* - Use conn.append_message() to queue response data to be sent
* - Handle partial messages and streaming protocols appropriately
* - Can take ownership by calling conn_ptr.release() to pass to other threads
* - If ownership is taken, handler must call Server::release_back_to_server()
* when done
* @note `data` is *not* owned by the connection arena, and its lifetime ends
* after the call to on_data_arrived.
* @note May be called from an arbitrary server thread.
* - Use conn.get_weak_ref() for async processing if needed
*
* @note `data` lifetime ends after the call to on_data_arrived.
* @note Called from this connection's io thread.
* @note Handler can safely access connection concurrently via thread-safe
* methods.
*/
virtual void on_data_arrived(std::string_view /*data*/,
std::unique_ptr<Connection> &) {};
virtual void on_data_arrived(std::string_view /*data*/, Connection &) {};
/**
* Called when data has been successfully written to the connection.
@@ -50,29 +61,12 @@ public:
* - Implementing backpressure for continuous data streams
* - Progress monitoring for long-running transfers
*
* @param conn_ptr Connection that made write progress - handler can take
* ownership
* @note May be called from an arbitrary server thread.
* @param conn Connection that made write progress - server retains ownership
* @note Called from this connection's io thread.
* @note Called during writes, not necessarily when buffer becomes empty
* TODO Add bytes written argument?
*/
virtual void on_write_progress(std::unique_ptr<Connection> &) {}
/**
* Called when the connection's outgoing write buffer becomes empty.
*
* This indicates all queued messages have been successfully written
* to the socket. Useful for:
* - Resetting arena allocators safely
* - Implementing keep-alive connection reuse
* - Closing connections after final response
* - Relieving backpressure conditions
*
* @param conn_ptr Connection with empty write buffer - handler can take
* ownership
* @note May be called from an arbitrary server thread.
* @note Only called on transitions from non-empty → empty buffer
*/
virtual void on_write_buffer_drained(std::unique_ptr<Connection> &) {}
virtual void on_write_progress(Connection &) {}
/**
* Called when a new connection is established.
@@ -81,7 +75,7 @@ public:
*
* Use this for:
* - Connection-specific initialization.
* @note May be called from an arbitrary server thread.
* @note Called from this connection's io thread.
*/
virtual void on_connection_established(Connection &) {}
@@ -92,21 +86,34 @@ public:
*
* Use this for:
* - Cleanup of connection-specific resources.
* @note May be called from an arbitrary server thread.
* @note Called from this connection's io thread, or possibly a foreign thread
* that has locked the MessageSender associated with this connection.
*/
virtual void on_connection_closed(Connection &) {}
/**
* @brief Called after a batch of connections has been processed.
*
* This hook is called after on_data_arrived, on_write_progress, or
* on_write_buffer_drained has been called for each connection in the batch.
* The handler can take ownership of the connections by moving the unique_ptr
* out of the span. Any connections left in the span will remain owned by the
* server.
* This hook is called after on_data_arrived or on_write_progress has been
* called for each connection in the batch. All connections remain
* server-owned.
*
* @param batch A span of unique_ptrs to the connections in the batch.
* @param batch A span of connection references in the batch.
* @note Called from this connection's io thread.
*/
virtual void
on_batch_complete(std::span<std::unique_ptr<Connection>> /*batch*/) {}
virtual void on_batch_complete(std::span<Connection *const> /*batch*/) {}
/**
* Called before processing outgoing writes on a connection.
*
* This hook allows protocol handlers to process queued responses
* before actual socket writes occur. Used for response ordering,
* serialization, and other preprocessing.
*
* @param conn Connection about to write data
* @param pending_responses Responses queued by pipeline threads
* @note Called from this connection's io thread.
* @note Called when EPOLLOUT event occurs
*/
virtual void on_preprocess_writes(Connection &, std::span<PendingResponse>) {}
};

View File

@@ -1,6 +1,5 @@
#include "connection_registry.hpp"
#include "connection.hpp"
#include <atomic>
#include <cstdlib>
#include <cstring>
#include <unistd.h>
@@ -14,49 +13,50 @@ ConnectionRegistry::ConnectionRegistry() : connections_(nullptr), max_fds_(0) {
}
max_fds_ = rlim.rlim_cur;
// Calculate size rounded up to page boundary
size_t array_size = max_fds_ * sizeof(Connection *);
size_t page_size = getpagesize();
size_t aligned_size = (array_size + page_size - 1) & ~(page_size - 1);
// TODO re-enable "ondemand pages" behavior
// // Calculate size rounded up to page boundary
// size_t array_size = max_fds_ * sizeof(Connection *);
// size_t page_size = getpagesize();
// size_t aligned_size = (array_size + page_size - 1) & ~(page_size - 1);
// Allocate virtual address space using mmap
// MAP_ANONYMOUS provides zero-initialized pages on-demand (lazy allocation)
connections_ = static_cast<std::atomic<Connection *> *>(
mmap(nullptr, aligned_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
// // Allocate virtual address space using mmap
// // MAP_ANONYMOUS provides zero-initialized pages on-demand (lazy
// allocation) connections_ = static_cast<std::atomic<Connection *> *>(
// mmap(nullptr, aligned_size, PROT_READ | PROT_WRITE,
// MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
if (connections_ == MAP_FAILED) {
perror("mmap");
std::abort();
}
// if (connections_ == MAP_FAILED) {
// perror("mmap");
// std::abort();
// }
// Store aligned size for munmap
aligned_size_ = aligned_size;
// // Store aligned size for munmap
// aligned_size_ = aligned_size;
connections_ = new Ref<Connection>[max_fds_];
}
ConnectionRegistry::~ConnectionRegistry() {
if (connections_ != nullptr) {
for (int fd = 0; fd < static_cast<int>(max_fds_); ++fd) {
delete connections_[fd].load(std::memory_order_relaxed);
}
if (munmap(connections_, aligned_size_) == -1) {
perror("munmap");
}
}
delete[] connections_;
// if (connections_ != nullptr) {
// for (int fd = 0; fd < static_cast<int>(max_fds_); ++fd) {
// delete connections_[fd].load(std::memory_order_relaxed);
// }
// if (munmap(connections_, aligned_size_) == -1) {
// perror("munmap");
// }
// }
}
void ConnectionRegistry::store(int fd, std::unique_ptr<Connection> connection) {
void ConnectionRegistry::store(int fd, Ref<Connection> connection) {
if (fd < 0 || static_cast<size_t>(fd) >= max_fds_) {
std::abort();
}
// Release ownership from unique_ptr and store raw pointer
connections_[fd].store(connection.release(), std::memory_order_release);
connections_[fd] = std::move(connection);
}
std::unique_ptr<Connection> ConnectionRegistry::remove(int fd) {
Ref<Connection> ConnectionRegistry::remove(int fd) {
if (fd < 0 || static_cast<size_t>(fd) >= max_fds_) {
std::abort();
}
return std::unique_ptr<Connection>(
connections_[fd].exchange(nullptr, std::memory_order_acquire));
return std::move(connections_[fd]);
}

View File

@@ -1,10 +1,11 @@
#pragma once
#include <cstddef>
#include <memory>
#include <sys/mman.h>
#include <sys/resource.h>
#include "reference.hpp"
struct Connection;
/**
@@ -33,12 +34,12 @@ public:
/**
* Store a connection in the registry, indexed by its file descriptor.
* Takes ownership of the connection via unique_ptr.
* Takes a reference to the connection for storage.
*
* @param fd File descriptor (must be valid and < max_fds_)
* @param connection unique_ptr to the connection (ownership transferred)
* @param connection Ref<Connection> to store in the registry
*/
void store(int fd, std::unique_ptr<Connection> connection);
void store(int fd, Ref<Connection> connection);
/**
* Remove a connection from the registry and transfer ownership to caller.
@@ -47,7 +48,7 @@ public:
* @param fd File descriptor
* @return unique_ptr to the connection, or nullptr if not found
*/
std::unique_ptr<Connection> remove(int fd);
Ref<Connection> remove(int fd);
/**
* Get the maximum number of file descriptors supported.
@@ -63,10 +64,7 @@ public:
ConnectionRegistry &operator=(ConnectionRegistry &&) = delete;
private:
std::atomic<Connection *>
*connections_; ///< mmap'd array of raw connection pointers. It's
///< thread-safe without since epoll_ctl happens before
///< epoll_wait, but this makes tsan happy /shrug.
Ref<Connection> *connections_;
size_t max_fds_; ///< Maximum file descriptor limit
size_t aligned_size_; ///< Page-aligned size for munmap
};

File diff suppressed because it is too large Load Diff

View File

@@ -1,40 +1,55 @@
#pragma once
#include <atomic>
#include <memory>
#include <map>
#include <string_view>
#include <thread>
#include <unordered_set>
#include <llhttp.h>
#include "api_url_parser.hpp"
#include "arena.hpp"
#include "commit_pipeline.hpp"
#include "config.hpp"
#include "connection.hpp"
#include "connection_handler.hpp"
#include "perfetto_categories.hpp"
#include "pipeline_entry.hpp"
#include "server.hpp"
#include "thread_pipeline.hpp"
// Forward declarations
struct CommitRequest;
struct JsonCommitRequestParser;
struct RouteMatch;
/**
* HTTP-specific response context stored in pipeline entries.
* Arena-allocated and passed through pipeline for response correlation.
*/
struct HttpResponseContext {
int64_t sequence_id; // For response ordering in pipelining
int64_t http_request_id; // For X-Response-ID header
bool connection_close; // Whether to close connection after response
};
/**
* Response data ready to send (sequence_id -> response data).
* Absence from map indicates response not ready yet.
*/
struct ResponseData {
std::span<std::string_view> data;
Arena arena;
bool connection_close;
};
/**
* HTTP connection state stored in Connection::user_data.
* Manages llhttp parser state and request data.
*/
struct HttpConnectionState {
Arena &arena;
llhttp_t parser;
llhttp_settings_t settings;
struct HttpRequestState {
Arena arena{16 << 10}; // Request-scoped arena for parsing state
// Current request data (arena-allocated)
std::string_view method;
std::string_view url;
using ArenaString =
std::basic_string<char, std::char_traits<char>, ArenaStlAllocator<char>>;
ArenaString url;
// Parse state
bool headers_complete = false;
@@ -47,13 +62,12 @@ struct HttpConnectionState {
status_request_id; // Request ID extracted from /v1/status/{id} URL
// Header accumulation buffers (arena-allocated)
using ArenaString =
std::basic_string<char, std::char_traits<char>, ArenaStlAllocator<char>>;
ArenaString current_header_field_buf;
ArenaString current_header_value_buf;
bool header_field_complete = false;
int64_t http_request_id =
0; // X-Request-Id header value (for tracing/logging)
int64_t sequence_id = 0; // Assigned for response ordering in pipelining
// Streaming parser for POST requests
Arena::Ptr<JsonCommitRequestParser> commit_parser;
@@ -62,7 +76,30 @@ struct HttpConnectionState {
bool basic_validation_passed =
false; // Set to true if basic validation passes
explicit HttpConnectionState(Arena &arena);
HttpRequestState();
};
struct HttpConnectionState {
llhttp_t parser;
llhttp_settings_t settings;
HttpRequestState pending;
std::deque<HttpRequestState> queue;
int64_t get_next_sequence_id() { return next_sequence_id++; }
HttpConnectionState();
void send_ordered_response(Connection &conn, int64_t sequence_id,
std::span<std::string_view> http_response,
Arena arena, bool close_connection);
private:
// Response ordering for HTTP pipelining
std::map<int64_t, ResponseData>
ready_responses; // sequence_id -> response data
int64_t next_sequence_to_send = 0;
int64_t next_sequence_id = 0;
};
/**
@@ -71,74 +108,15 @@ struct HttpConnectionState {
*/
struct HttpHandler : ConnectionHandler {
explicit HttpHandler(const weaseldb::Config &config)
: config_(config), banned_request_ids(ArenaStlAllocator<std::string_view>(
&banned_request_arena)) {
// Stage 0: Sequence assignment thread
sequenceThread = std::thread{[this]() {
pthread_setname_np(pthread_self(), "txn-sequence");
for (;;) {
auto guard = commitPipeline.acquire<0, 0>();
if (process_sequence_batch(guard.batch)) {
return; // Shutdown signal received
}
}
}};
// Stage 1: Precondition resolution thread
resolveThread = std::thread{[this]() {
pthread_setname_np(pthread_self(), "txn-resolve");
for (;;) {
auto guard = commitPipeline.acquire<1, 0>(/*maxBatch*/ 1);
if (process_resolve_batch(guard.batch)) {
return; // Shutdown signal received
}
}
}};
// Stage 2: Transaction persistence thread
persistThread = std::thread{[this]() {
pthread_setname_np(pthread_self(), "txn-persist");
for (;;) {
auto guard = commitPipeline.acquire<2, 0>();
if (process_persist_batch(guard.batch)) {
return; // Shutdown signal received
}
}
}};
// Stage 3: Connection return to server thread
releaseThread = std::thread{[this]() {
pthread_setname_np(pthread_self(), "txn-release");
for (;;) {
auto guard = commitPipeline.acquire<3, 0>();
if (process_release_batch(guard.batch)) {
return; // Shutdown signal received
}
}
}};
}
~HttpHandler() {
// Send single shutdown signal that flows through all pipeline stages
{
auto guard = commitPipeline.push(1, true);
guard.batch[0] =
ShutdownEntry{}; // Single ShutdownEntry flows through all stages
}
// Join all pipeline threads
sequenceThread.join();
resolveThread.join();
persistThread.join();
releaseThread.join();
}
: config_(config), commit_pipeline_(config) {}
void on_connection_established(Connection &conn) override;
void on_connection_closed(Connection &conn) override;
void on_data_arrived(std::string_view data,
std::unique_ptr<Connection> &conn_ptr) override;
void on_write_buffer_drained(std::unique_ptr<Connection> &conn_ptr) override;
void on_batch_complete(
std::span<std::unique_ptr<Connection>> /*batch*/) override;
void on_data_arrived(std::string_view data, Connection &conn) override;
void
on_preprocess_writes(Connection &conn,
std::span<PendingResponse> pending_responses) override;
void on_batch_complete(std::span<Connection *const> batch) override;
// llhttp callbacks (public for HttpConnectionState access)
static int onUrl(llhttp_t *parser, const char *at, size_t length);
@@ -151,74 +129,37 @@ struct HttpHandler : ConnectionHandler {
static int onMessageComplete(llhttp_t *parser);
private:
static constexpr int lg_size = 16;
// Configuration reference
const weaseldb::Config &config_;
// Pipeline state (sequence thread only)
int64_t next_version = 1; // Next version to assign (sequence thread only)
// Pipeline state (persist thread writes, I/O threads read)
std::atomic<int64_t> committed_version{
0}; // Highest committed version (persist thread writes, I/O threads read)
// Arena for banned request IDs and related data structures (sequence thread
// only)
Arena banned_request_arena;
using BannedRequestIdSet =
std::unordered_set<std::string_view, std::hash<std::string_view>,
std::equal_to<std::string_view>,
ArenaStlAllocator<std::string_view>>;
BannedRequestIdSet banned_request_ids; // Request IDs that should not commit
// (string_views into arena)
// Main commit processing pipeline: sequence -> resolve -> persist -> release
StaticThreadPipeline<PipelineEntry, WaitStrategy::WaitIfUpstreamIdle, 1, 1, 1,
1>
commitPipeline{lg_size};
// Pipeline stage threads
std::thread sequenceThread;
std::thread resolveThread;
std::thread persistThread;
std::thread releaseThread;
// Pipeline stage processing methods (batch-based)
using BatchType =
StaticThreadPipeline<PipelineEntry, WaitStrategy::WaitIfUpstreamIdle, 1,
1, 1, 1>::Batch;
bool process_sequence_batch(BatchType &batch);
bool process_resolve_batch(BatchType &batch);
bool process_persist_batch(BatchType &batch);
bool process_release_batch(BatchType &batch);
// Commit processing pipeline
CommitPipeline commit_pipeline_;
// Route handlers
void handle_get_version(Connection &conn, const HttpConnectionState &state);
void handle_post_commit(Connection &conn, const HttpConnectionState &state);
void handle_get_subscribe(Connection &conn, const HttpConnectionState &state);
void handle_get_status(Connection &conn, HttpConnectionState &state,
void handle_get_version(Connection &conn, HttpRequestState &state);
void handle_post_commit(Connection &conn, HttpRequestState &state);
void handle_get_subscribe(Connection &conn, HttpRequestState &state);
void handle_get_status(Connection &conn, HttpRequestState &state,
const RouteMatch &route_match);
void handle_put_retention(Connection &conn, const HttpConnectionState &state,
void handle_put_retention(Connection &conn, HttpRequestState &state,
const RouteMatch &route_match);
void handle_get_retention(Connection &conn, const HttpConnectionState &state,
void handle_get_retention(Connection &conn, HttpRequestState &state,
const RouteMatch &route_match);
void handle_delete_retention(Connection &conn,
const HttpConnectionState &state,
void handle_delete_retention(Connection &conn, HttpRequestState &state,
const RouteMatch &route_match);
void handle_get_metrics(Connection &conn, const HttpConnectionState &state);
void handle_get_ok(Connection &conn, const HttpConnectionState &state);
void handle_not_found(Connection &conn, const HttpConnectionState &state);
void handle_get_metrics(Connection &conn, HttpRequestState &state);
void handle_get_ok(Connection &conn, HttpRequestState &state);
void handle_not_found(Connection &conn, HttpRequestState &state);
// HTTP utilities
static void send_response(Connection &conn, int status_code,
std::string_view content_type,
std::string_view body,
bool close_connection = false);
static void send_json_response(Connection &conn, int status_code,
std::string_view json,
bool close_connection = false);
static void send_error_response(Connection &conn, int status_code,
std::string_view message,
bool close_connection = false);
// Helper functions for formatting responses without sending
static std::span<std::string_view>
format_response(int status_code, std::string_view content_type,
std::string_view body, Arena &response_arena,
int64_t http_request_id, bool close_connection);
static std::span<std::string_view>
format_json_response(int status_code, std::string_view json,
Arena &response_arena, int64_t http_request_id,
bool close_connection);
};

View File

@@ -234,8 +234,7 @@ int main(int argc, char *argv[]) {
std::cout << "Max request size: " << config->server.max_request_size_bytes
<< " bytes" << std::endl;
std::cout << "I/O threads: " << config->server.io_threads << std::endl;
std::cout << "Epoll instances: " << config->server.epoll_instances
<< std::endl;
std::cout << "Epoll instances: " << config->server.io_threads << std::endl;
std::cout << "Event batch size: " << config->server.event_batch_size
<< std::endl;
std::cout << "Max connections: " << config->server.max_connections
@@ -247,6 +246,24 @@ int main(int argc, char *argv[]) {
std::cout << "Request ID retention: "
<< config->commit.request_id_retention_hours.count() << " hours"
<< std::endl;
// Print pipeline configuration
std::string wait_strategy_str;
switch (config->commit.pipeline_wait_strategy) {
case WaitStrategy::WaitIfStageEmpty:
wait_strategy_str = "WaitIfStageEmpty";
break;
case WaitStrategy::WaitIfUpstreamIdle:
wait_strategy_str = "WaitIfUpstreamIdle";
break;
case WaitStrategy::Never:
wait_strategy_str = "Never";
break;
}
std::cout << "Pipeline wait strategy: " << wait_strategy_str << std::endl;
std::cout << "Pipeline release threads: "
<< config->commit.pipeline_release_threads << std::endl;
std::cout << "Subscription buffer size: "
<< config->subscription.max_buffer_size_bytes << " bytes"
<< std::endl;
@@ -265,7 +282,6 @@ int main(int argc, char *argv[]) {
g_server = server.get();
// Setup signal handling
std::signal(SIGPIPE, SIG_IGN);
std::signal(SIGTERM, signal_handler);
std::signal(SIGINT, signal_handler);

View File

@@ -123,16 +123,6 @@ static void validate_or_abort(bool condition, const char *message,
}
}
// Helper to copy a string into arena memory
// Copies `str` into storage owned by `arena` and returns a view over that
// copy. The returned view stays valid for the arena's lifetime, so callers
// may retain it after the original string's backing memory is gone.
static std::string_view arena_copy_string(std::string_view str, Arena &arena) {
  if (str.empty()) {
    // Nothing to copy; avoid a zero-byte arena allocation.
    return std::string_view{};
  }
  char *copied = arena.allocate<char>(str.size());
  std::memcpy(copied, str.data(), str.size());
  // NOTE: the copy is not NUL-terminated -- the view carries the length.
  return std::string_view(copied, str.size());
}
// Arena-based labels key for second level of map
// Uses string_view containing labels in Prometheus text format
struct LabelsKey {
@@ -149,8 +139,8 @@ struct LabelsKey {
validate_or_abort(is_valid_label_value(value), "invalid label value",
value);
auto key_view = arena_copy_string(key, arena);
auto value_view = arena_copy_string(value, arena);
auto key_view = arena.copy_string(key);
auto value_view = arena.copy_string(value);
labels.push_back({key_view, value_view});
}
@@ -352,13 +342,21 @@ struct Gauge::State {
struct Histogram::State {
std::span<const double> thresholds; // Bucket boundaries (sorted,
// deduplicated, sizes never change)
std::span<uint64_t> counts; // Count per bucket
double sum; // Sum of observations
uint64_t observations; // Total observation count
// Histogram counter data
struct Counters {
std::span<uint64_t> bucket_counts; // Count per bucket
double sum = 0.0; // Sum of observations
uint64_t observations = 0; // Total observation count
};
Counters shared; // Protected by mutex, read by scrapes
Counters pending; // Lock-free accumulation when mutex busy
std::mutex
mutex; // Per-thread, per-histogram mutex for consistent reads/writes
State() : sum(0.0), observations(0) {}
State() {}
friend struct Metric;
};
@@ -454,7 +452,7 @@ struct Metric {
Arena arena;
ThreadInit() {
// Register this thread's arena for memory tracking
std::unique_lock<std::mutex> _{mutex};
std::unique_lock _{mutex};
get_thread_arenas()[std::this_thread::get_id()] = &arena;
}
~ThreadInit() {
@@ -462,7 +460,7 @@ struct Metric {
// THREAD SAFETY: All operations below are protected by the global mutex,
// including writes to global accumulated state, preventing races with
// render thread
std::unique_lock<std::mutex> _{mutex};
std::unique_lock _{mutex};
// NOTE: registration_version increment is REQUIRED here because:
// - Cached render plan has pre-resolved pointers to thread-local state
// - When threads disappear, these pointers become invalid
@@ -501,7 +499,21 @@ struct Metric {
if (thread_it != family->per_thread_state.end()) {
for (auto &[labels_key, instance] : thread_it->second.instances) {
// Acquire lock to get consistent snapshot
std::lock_guard<std::mutex> lock(instance->mutex);
std::lock_guard lock(instance->mutex);
// BUGFIX: Flush pending observations into shared before
// accumulating
if (instance->pending.observations > 0) {
// Add pending to shared
for (size_t i = 0; i < instance->pending.bucket_counts.size();
++i) {
instance->shared.bucket_counts[i] +=
instance->pending.bucket_counts[i];
}
instance->shared.sum += instance->pending.sum;
instance->shared.observations += instance->pending.observations;
// No need to reset pending since instance is being destroyed
}
// Global accumulator should have been created when we made the
// histogram
@@ -509,13 +521,14 @@ struct Metric {
assert(global_state);
// Accumulate bucket counts (mutex already held)
for (size_t i = 0; i < instance->counts.size(); ++i) {
global_state->counts[i] += instance->counts[i];
for (size_t i = 0; i < instance->shared.bucket_counts.size(); ++i) {
global_state->shared.bucket_counts[i] +=
instance->shared.bucket_counts[i];
}
// Accumulate sum and observations
global_state->sum += instance->sum;
global_state->observations += instance->observations;
global_state->shared.sum += instance->shared.sum;
global_state->shared.observations += instance->shared.observations;
}
family->per_thread_state.erase(thread_it);
}
@@ -581,7 +594,7 @@ struct Metric {
}
// Not found - copy to global arena and intern
auto interned_text = arena_copy_string(text, get_global_arena());
auto interned_text = get_global_arena().copy_string(text);
auto result = interned_set.emplace(interned_text);
return *result.first;
}
@@ -592,7 +605,7 @@ struct Metric {
// Force thread_local initialization
(void)thread_init;
std::unique_lock<std::mutex> _{mutex};
std::unique_lock _{mutex};
++Metric::registration_version;
const LabelsKey &key = intern_labels(labels);
@@ -633,7 +646,7 @@ struct Metric {
static Gauge create_gauge_instance(
Family<Gauge> *family,
std::span<const std::pair<std::string_view, std::string_view>> labels) {
std::unique_lock<std::mutex> _{mutex};
std::unique_lock _{mutex};
++Metric::registration_version;
const LabelsKey &key = intern_labels(labels);
@@ -659,7 +672,7 @@ struct Metric {
// Force thread_local initialization
(void)thread_init;
std::unique_lock<std::mutex> _{mutex};
std::unique_lock _{mutex};
++Metric::registration_version;
const LabelsKey &key = intern_labels(labels);
@@ -683,16 +696,23 @@ struct Metric {
size_t bucket_count = family->p->buckets.size();
double *thresholds_data =
get_thread_local_arena().allocate<double>(bucket_count);
uint64_t *counts_data =
get_thread_local_arena().allocate<uint64_t>(bucket_count);
// Copy thresholds and initialize counts
// Initialize thresholds
std::memcpy(thresholds_data, family->p->buckets.data(),
bucket_count * sizeof(double));
std::memset(counts_data, 0, bucket_count * sizeof(uint64_t));
ptr->thresholds = std::span<const double>(thresholds_data, bucket_count);
ptr->counts = std::span<uint64_t>(counts_data, bucket_count);
// Initialize shared counts
auto shared_counts_span =
get_thread_local_arena().allocate_span<uint64_t>(bucket_count);
std::fill(shared_counts_span.begin(), shared_counts_span.end(), 0);
ptr->shared.bucket_counts = shared_counts_span;
// Initialize pending counts
auto pending_counts_span =
get_thread_local_arena().allocate_span<uint64_t>(bucket_count);
std::fill(pending_counts_span.begin(), pending_counts_span.end(), 0);
ptr->pending.bucket_counts = pending_counts_span;
// Ensure global accumulator exists for this label set
auto &global_state = family->p->global_accumulated_values[key];
@@ -702,17 +722,16 @@ struct Metric {
// Allocate and copy thresholds, initialize counts
double *global_thresholds_data =
get_global_arena().allocate<double>(bucket_count);
uint64_t *global_counts_data =
get_global_arena().allocate<uint64_t>(bucket_count);
std::memcpy(global_thresholds_data, ptr->thresholds.data(),
bucket_count * sizeof(double));
std::memset(global_counts_data, 0, bucket_count * sizeof(uint64_t));
global_state->thresholds =
std::span<const double>(global_thresholds_data, bucket_count);
global_state->counts =
std::span<uint64_t>(global_counts_data, bucket_count);
auto global_shared_counts_span =
get_global_arena().allocate_span<uint64_t>(bucket_count);
std::fill(global_shared_counts_span.begin(),
global_shared_counts_span.end(), 0);
global_state->shared.bucket_counts = global_shared_counts_span;
}
}
Histogram result;
@@ -1137,12 +1156,12 @@ struct Metric {
uint64_t observations_snapshot;
{
std::lock_guard<std::mutex> lock(instance->mutex);
for (size_t i = 0; i < instance->counts.size(); ++i) {
counts_snapshot[i] = instance->counts[i];
std::lock_guard lock(instance->mutex);
for (size_t i = 0; i < instance->shared.bucket_counts.size(); ++i) {
counts_snapshot[i] = instance->shared.bucket_counts[i];
}
sum_snapshot = instance->sum;
observations_snapshot = instance->observations;
sum_snapshot = instance->shared.sum;
observations_snapshot = instance->shared.observations;
}
for (size_t i = 0; i < bucket_count; ++i) {
@@ -1155,11 +1174,12 @@ struct Metric {
// Add global accumulated values
if (instruction.aggregate_histogram.global_state) {
auto *global_state = instruction.aggregate_histogram.global_state;
for (size_t i = 0; i < global_state->counts.size(); ++i) {
total_counts[i] += global_state->counts[i];
for (size_t i = 0; i < global_state->shared.bucket_counts.size();
++i) {
total_counts[i] += global_state->shared.bucket_counts[i];
}
total_sum += global_state->sum;
total_observations += global_state->observations;
total_sum += global_state->shared.sum;
total_observations += global_state->shared.observations;
}
// Format explicit bucket counts
@@ -1421,16 +1441,36 @@ update_histogram_buckets_simd(std::span<const double> thresholds,
}
void Histogram::observe(double x) {
assert(p->thresholds.size() == p->counts.size());
assert(p->thresholds.size() == p->shared.bucket_counts.size());
std::lock_guard<std::mutex> lock(p->mutex);
// Try to get lock immediately
if (p->mutex.try_lock()) {
// Fast path: got lock, flush any pending first
if (p->pending.observations > 0) {
// Add pending to shared
for (size_t i = 0; i < p->pending.bucket_counts.size(); ++i) {
p->shared.bucket_counts[i] += p->pending.bucket_counts[i];
p->pending.bucket_counts[i] = 0;
}
p->shared.sum += p->pending.sum;
p->shared.observations += p->pending.observations;
p->pending.sum = 0.0;
p->pending.observations = 0;
}
// Update bucket counts using SIMD
update_histogram_buckets_simd(p->thresholds, p->counts, x, 0);
// Update shared directly
update_histogram_buckets_simd(p->thresholds, p->shared.bucket_counts, x, 0);
p->shared.sum += x;
p->shared.observations++;
// Update sum and observation count
p->sum += x;
p->observations++;
p->mutex.unlock();
} else {
// Slow path: accumulate in pending (lock-free)
update_histogram_buckets_simd(p->thresholds, p->pending.bucket_counts, x,
0);
p->pending.sum += x;
p->pending.observations++;
}
}
template <> Family<Counter>::Family() = default;
@@ -1458,15 +1498,15 @@ Histogram Family<Histogram>::create(
Family<Counter> create_counter(std::string_view name, std::string_view help) {
validate_or_abort(is_valid_metric_name(name), "invalid counter name", name);
std::unique_lock<std::mutex> _{Metric::mutex};
std::unique_lock _{Metric::mutex};
++Metric::registration_version;
auto &global_arena = Metric::get_global_arena();
auto name_view = arena_copy_string(name, global_arena);
auto name_view = global_arena.copy_string(name);
auto &familyPtr = Metric::get_counter_families()[name_view];
if (!familyPtr) {
familyPtr = global_arena.construct<Family<Counter>::State>(global_arena);
familyPtr->name = name_view;
familyPtr->help = arena_copy_string(help, global_arena);
familyPtr->help = global_arena.copy_string(help);
} else {
validate_or_abort(
familyPtr->help == help,
@@ -1480,16 +1520,16 @@ Family<Counter> create_counter(std::string_view name, std::string_view help) {
Family<Gauge> create_gauge(std::string_view name, std::string_view help) {
validate_or_abort(is_valid_metric_name(name), "invalid gauge name", name);
std::unique_lock<std::mutex> _{Metric::mutex};
std::unique_lock _{Metric::mutex};
++Metric::registration_version;
auto &global_arena = Metric::get_global_arena();
auto name_view = arena_copy_string(name, global_arena);
auto name_view = global_arena.copy_string(name);
auto &familyPtr = Metric::get_gauge_families()[name_view];
if (!familyPtr) {
// Family<T>::State instances use Arena::Ptr for automatic cleanup
familyPtr = global_arena.construct<Family<Gauge>::State>(global_arena);
familyPtr->name = name_view;
familyPtr->help = arena_copy_string(help, global_arena);
familyPtr->help = global_arena.copy_string(help);
} else {
validate_or_abort(
familyPtr->help == help,
@@ -1504,16 +1544,16 @@ Family<Histogram> create_histogram(std::string_view name, std::string_view help,
std::span<const double> buckets) {
validate_or_abort(is_valid_metric_name(name), "invalid histogram name", name);
std::unique_lock<std::mutex> _{Metric::mutex};
std::unique_lock _{Metric::mutex};
++Metric::registration_version;
auto &global_arena = Metric::get_global_arena();
auto name_view = arena_copy_string(name, global_arena);
auto name_view = global_arena.copy_string(name);
auto &family_ptr = Metric::get_histogram_families()[name_view];
if (!family_ptr) {
// Family<T>::State instances use Arena::Ptr for automatic cleanup
family_ptr = global_arena.construct<Family<Histogram>::State>(global_arena);
family_ptr->name = name_view;
family_ptr->help = arena_copy_string(help, global_arena);
family_ptr->help = global_arena.copy_string(help);
// DESIGN: Prometheus-compatible histogram buckets
// Convert to vector for sorting
@@ -1693,7 +1733,7 @@ std::span<std::string_view> render(Arena &arena) {
// Hold lock throughout all phases to prevent registry changes
// THREAD SAFETY: Global mutex protects cached_plan initialization and access,
// prevents races during static member initialization at program startup
std::unique_lock<std::mutex> _{Metric::mutex};
std::unique_lock _{Metric::mutex};
// Call all registered collectors to update their metrics
for (const auto &collector : Metric::get_collectors()) {
@@ -1723,7 +1763,7 @@ template <>
void Family<Counter>::register_callback(
std::span<const std::pair<std::string_view, std::string_view>> labels,
MetricCallback<Counter> callback) {
std::unique_lock<std::mutex> _{Metric::mutex};
std::unique_lock _{Metric::mutex};
++Metric::registration_version;
const LabelsKey &key = Metric::intern_labels(labels);
@@ -1748,7 +1788,7 @@ template <>
void Family<Gauge>::register_callback(
std::span<const std::pair<std::string_view, std::string_view>> labels,
MetricCallback<Gauge> callback) {
std::unique_lock<std::mutex> _{Metric::mutex};
std::unique_lock _{Metric::mutex};
++Metric::registration_version;
const LabelsKey &key = Metric::intern_labels(labels);
@@ -1804,7 +1844,7 @@ void reset_metrics_for_testing() {
}
void register_collector(Ref<Collector> collector) {
std::unique_lock<std::mutex> _{Metric::mutex};
std::unique_lock _{Metric::mutex};
++Metric::registration_version;
Metric::get_collectors().push_back(std::move(collector));
}

View File

@@ -45,7 +45,6 @@
#include <functional>
#include <initializer_list>
#include <memory>
#include <span>
#include <type_traits>
#include <vector>
@@ -74,7 +73,7 @@ template <typename T> using MetricCallback = std::function<double()>;
// 3. When rendered, the values of all Counter objects with the same labels
// are summed together into a single total.
struct Counter {
void inc(double = 1.0); // Increment counter (must be >= 0)
void inc(double = 1.0); // Increment counter (must be >= 0, never blocks)
private:
Counter();
@@ -95,9 +94,9 @@ private:
// are cumulative.
// 4. For independent gauges, create them with unique labels.
struct Gauge {
void inc(double = 1.0);
void dec(double = 1.0);
void set(double);
void inc(double = 1.0); // (never blocks)
void dec(double = 1.0); // (never blocks)
void set(double); // (never blocks)
private:
Gauge();
@@ -117,7 +116,8 @@ private:
// 3. When rendered, the observations from all Histogram objects with the
// same labels are combined into a single histogram.
struct Histogram {
void observe(double); // Record observation in appropriate bucket
void
observe(double); // Record observation in appropriate bucket (never blocks)
private:
Histogram();

View File

@@ -1,11 +1,15 @@
#pragma once
#define ENABLE_PERFETTO 1
#ifndef ENABLE_PERFETTO
#define ENABLE_PERFETTO 0
#endif
#if ENABLE_PERFETTO
#include <perfetto.h>
#else
#define PERFETTO_DEFINE_CATEGORIES(...)
#define PERFETTO_TRACK_EVENT_STATIC_STORAGE \
void perfetto_track_event_static_storage
#define TRACE_EVENT(...)
#endif

View File

@@ -1,22 +1,37 @@
#pragma once
#include "arena.hpp"
#include "connection.hpp"
#include <memory>
#include <variant>
// Forward declarations
struct CommitRequest;
/**
* Pipeline entry for commit requests that need full 4-stage processing.
* Contains connection with parsed CommitRequest.
*/
struct CommitEntry {
std::unique_ptr<Connection> connection;
int64_t assigned_version = 0; // Set by sequence stage
WeakRef<MessageSender> connection;
int64_t assigned_version = -1; // Set by sequence stage
bool resolve_success = false; // Set by resolve stage
bool persist_success = false; // Set by persist stage
// Protocol-agnostic context (arena-allocated, protocol-specific)
void *protocol_context = nullptr;
const CommitRequest *commit_request = nullptr; // Points to request_arena data
// Request arena contains parsed request data and response data
Arena request_arena;
// JSON response body (set by persist stage, arena-allocated)
std::string_view response_json;
CommitEntry() = default; // Default constructor for variant
explicit CommitEntry(std::unique_ptr<Connection> conn)
: connection(std::move(conn)) {}
explicit CommitEntry(WeakRef<MessageSender> conn, void *ctx,
const CommitRequest *req, Arena arena)
: connection(std::move(conn)), protocol_context(ctx), commit_request(req),
request_arena(std::move(arena)) {}
};
/**
@@ -24,12 +39,24 @@ struct CommitEntry {
* then transfer to status threadpool.
*/
struct StatusEntry {
std::unique_ptr<Connection> connection;
WeakRef<MessageSender> connection;
int64_t version_upper_bound = 0; // Set by sequence stage
// Protocol-agnostic context (arena-allocated, protocol-specific)
void *protocol_context = nullptr;
std::string_view status_request_id; // Points to request_arena data
// Request arena for request data
Arena request_arena;
// JSON response body (set by persist stage, arena-allocated)
std::string_view response_json;
StatusEntry() = default; // Default constructor for variant
explicit StatusEntry(std::unique_ptr<Connection> conn)
: connection(std::move(conn)) {}
explicit StatusEntry(WeakRef<MessageSender> conn, void *ctx,
std::string_view request_id, Arena arena)
: connection(std::move(conn)), protocol_context(ctx),
status_request_id(request_id), request_arena(std::move(arena)) {}
};
/**
@@ -38,11 +65,47 @@ struct StatusEntry {
* Resolve stage can perform configurable CPU work for benchmarking.
*/
struct HealthCheckEntry {
std::unique_ptr<Connection> connection;
WeakRef<MessageSender> connection;
// Protocol-agnostic context (arena-allocated, protocol-specific)
void *protocol_context = nullptr;
// Request arena for response data
Arena request_arena;
// JSON response body (set by persist stage, arena-allocated)
std::string_view response_json;
HealthCheckEntry() = default; // Default constructor for variant
explicit HealthCheckEntry(std::unique_ptr<Connection> conn)
: connection(std::move(conn)) {}
explicit HealthCheckEntry(WeakRef<MessageSender> conn, void *ctx, Arena arena)
: connection(std::move(conn)), protocol_context(ctx),
request_arena(std::move(arena)) {}
};
/**
* Pipeline entry for /v1/version requests.
* Needs to integrate with the pipeline for external consistency.
*/
struct GetVersionEntry {
  // Destination for the eventual response; held weakly so a closed
  // connection does not keep this entry alive. NOTE(review): presumably the
  // sending stage handles a failed lock() -- confirm against the stage code.
  WeakRef<MessageSender> connection;
  // Protocol-agnostic context (arena-allocated, protocol-specific)
  void *protocol_context = nullptr;
  // Request arena for response data
  Arena request_arena;
  // JSON response body (set by persist stage, arena-allocated)
  std::string_view response_json;
  // Proposed response version
  // NOTE(review): left uninitialized by the default constructor (unlike the
  // other members) -- confirm it is only read after the parameterized
  // constructor or a pipeline stage has assigned it.
  int64_t version;
  GetVersionEntry() = default; // Default constructor for variant
  // Takes ownership of the arena and the weak connection reference;
  // `ctx` and the arena-backed data must outlive pipeline processing.
  explicit GetVersionEntry(WeakRef<MessageSender> conn, void *ctx, Arena arena,
                           int64_t version)
      : connection(std::move(conn)), protocol_context(ctx),
        request_arena(std::move(arena)), version(version) {}
};
/**
@@ -57,5 +120,5 @@ struct ShutdownEntry {
* Pipeline entry variant type used by the commit processing pipeline.
* Each stage pattern-matches on the variant type to handle appropriately.
*/
using PipelineEntry =
std::variant<CommitEntry, StatusEntry, HealthCheckEntry, ShutdownEntry>;
using PipelineEntry = std::variant<CommitEntry, StatusEntry, HealthCheckEntry,
ShutdownEntry, GetVersionEntry>;

View File

@@ -8,7 +8,7 @@
* Gathers metrics like CPU usage, memory, and file descriptors by reading
* files from the /proc filesystem.
*/
struct ProcessCollector : public metric::Collector {
struct ProcessCollector : metric::Collector {
/**
* @brief Constructs the collector and initializes the process metrics.
*/

View File

@@ -22,8 +22,8 @@
* Basic usage:
* @code
* auto obj = make_ref<MyClass>(args...); // Create managed object
* auto copy = obj; // Copy (thread-safe)
* WeakRef<MyClass> weak = obj; // Create weak reference
* auto copy = obj.copy(); // Explicit copy (thread-safe)
* WeakRef<MyClass> weak = obj.as_weak(); // Create weak reference
* auto locked = weak.lock(); // Try to promote to strong
* @endcode
*
@@ -31,6 +31,9 @@
* safely copy, move, and destroy references to the same object.
*/
// Forward declaration
template <typename T> struct WeakRef;
namespace detail {
struct ControlBlock {
std::atomic<uint32_t> strong_count;
@@ -82,7 +85,7 @@ struct ControlBlock {
*
* Usage:
* - Use make_ref<T>() to create new objects
* - Copy/assign to share ownership
* - Use copy() method for explicit sharing of ownership
* - Use get(), operator*, operator-> to access the object
* - Use operator bool() to check if valid
* - Use reset() to release ownership
@@ -126,57 +129,28 @@ template <typename T> struct Ref {
~Ref() { release(); }
/**
* @brief Copy constructor - increments strong reference count
* @brief Copy constructor - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/
Ref(const Ref &other) noexcept
: ptr(other.ptr), control_block(other.control_block) {
if (control_block) {
control_block->increment_strong();
}
}
Ref(const Ref &other) = delete;
/**
* @brief Converting copy constructor for polymorphism (Derived -> Base)
* @brief Converting copy constructor - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/
template <typename U>
Ref(const Ref<U> &other) noexcept
requires std::is_convertible_v<U *, T *>
: ptr(other.ptr), control_block(other.control_block) {
if (control_block) {
control_block->increment_strong();
}
}
template <typename U> Ref(const Ref<U> &other) = delete;
/**
* @brief Copy assignment operator
* @brief Copy assignment operator - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/
Ref &operator=(const Ref &other) noexcept {
if (this != &other) {
release();
ptr = other.ptr;
control_block = other.control_block;
if (control_block) {
control_block->increment_strong();
}
}
return *this;
}
Ref &operator=(const Ref &other) = delete;
/**
* @brief Converting assignment operator for polymorphism (Derived -> Base)
* @brief Converting assignment operator - deleted to prevent accidental
* copies Use copy() method for explicit copying
*/
template <typename U>
Ref &operator=(const Ref<U> &other) noexcept
requires std::is_convertible_v<U *, T *>
{
release();
ptr = other.ptr;
control_block = other.control_block;
if (control_block) {
control_block->increment_strong();
}
return *this;
}
template <typename U> Ref &operator=(const Ref<U> &other) = delete;
/**
* @brief Move constructor - transfers ownership
@@ -228,6 +202,28 @@ template <typename T> struct Ref {
return *this;
}
/**
* @brief Explicitly create a copy with shared ownership
* @return New Ref that shares ownership of the same object
*/
[[nodiscard]] Ref copy() const noexcept {
  // Empty Refs stay empty; otherwise take another strong reference before
  // handing out a second owner of the same control block.
  if (control_block) {
    control_block->increment_strong();
  }
  return Ref(ptr, control_block);
}
/**
* @brief Create a WeakRef that observes this object
* @return New WeakRef that observes the same object
*/
[[nodiscard]] WeakRef<T> as_weak() const noexcept {
  // Only the weak count is incremented: the WeakRef observes the object
  // without extending its lifetime. Promote later via lock().
  if (control_block) {
    control_block->increment_weak();
  }
  return WeakRef<T>(ptr, control_block);
}
/**
* @brief Reset to empty state
*/
@@ -301,7 +297,8 @@ private:
* that might be destroyed by other threads.
*
* Usage:
* - Create from Ref<T> to observe without owning
* - Create from Ref<T> using as_weak() to observe without owning
* - Use copy() method for explicit copying
* - Use lock() to attempt promotion to Ref<T>
* - Returns empty Ref<T> if object was already destroyed
* - Use reset() to stop observing
@@ -332,7 +329,7 @@ template <typename T> struct WeakRef {
expected_strong, expected_strong + 1, std::memory_order_acquire,
std::memory_order_relaxed)) {
// Success - we incremented the strong count
return Ref<T>(get_object_ptr(), control_block);
return Ref<T>(ptr, control_block);
}
// CAS failed, expected_strong now contains the current value, retry
}
@@ -347,76 +344,40 @@ template <typename T> struct WeakRef {
~WeakRef() { release(); }
/**
* @brief Copy constructor from WeakRef
* @brief Copy constructor from WeakRef - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/
WeakRef(const WeakRef &other) noexcept : control_block(other.control_block) {
if (control_block) {
control_block->increment_weak();
}
}
WeakRef(const WeakRef &other) = delete;
/**
* @brief Copy constructor from Ref
* @brief Copy constructor from Ref - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/
WeakRef(const Ref<T> &ref) noexcept : control_block(ref.control_block) {
if (control_block) {
control_block->increment_weak();
}
}
WeakRef(const Ref<T> &ref) = delete;
/**
* @brief Converting copy constructor from WeakRef for polymorphism
* @brief Converting copy constructor from WeakRef - deleted to prevent
* accidental copies Use copy() method for explicit copying
*/
template <typename U>
WeakRef(const WeakRef<U> &other) noexcept
requires std::is_convertible_v<U *, T *>
: control_block(other.control_block) {
if (control_block) {
control_block->increment_weak();
}
}
template <typename U> WeakRef(const WeakRef<U> &other) = delete;
/**
* @brief Converting copy constructor from Ref for polymorphism
* @brief Converting copy constructor from Ref - deleted to prevent accidental
* copies Use copy() method for explicit copying
*/
template <typename U>
WeakRef(const Ref<U> &ref) noexcept
requires std::is_convertible_v<U *, T *>
: control_block(ref.control_block) {
if (control_block) {
control_block->increment_weak();
}
}
template <typename U> WeakRef(const Ref<U> &ref) = delete;
/**
* @brief Converting copy assignment from WeakRef for polymorphism
* @brief Converting copy assignment from WeakRef - deleted to prevent
* accidental copies Use copy() method for explicit copying
*/
template <typename U>
WeakRef &operator=(const WeakRef<U> &other) noexcept
requires std::is_convertible_v<U *, T *>
{
release();
control_block = other.control_block;
if (control_block) {
control_block->increment_weak();
}
return *this;
}
template <typename U> WeakRef &operator=(const WeakRef<U> &other) = delete;
/**
* @brief Converting copy assignment from Ref for polymorphism
* @brief Converting copy assignment from Ref - deleted to prevent accidental
* copies Use copy() method for explicit copying
*/
template <typename U>
WeakRef &operator=(const Ref<U> &ref) noexcept
requires std::is_convertible_v<U *, T *>
{
release();
control_block = ref.control_block;
if (control_block) {
control_block->increment_weak();
}
return *this;
}
template <typename U> WeakRef &operator=(const Ref<U> &ref) = delete;
/**
* @brief Converting move constructor from WeakRef for polymorphism
@@ -424,7 +385,8 @@ template <typename T> struct WeakRef {
template <typename U>
WeakRef(WeakRef<U> &&other) noexcept
requires std::is_convertible_v<U *, T *>
: control_block(other.control_block) {
: ptr(other.ptr), control_block(other.control_block) {
other.ptr = nullptr;
other.control_block = nullptr;
}
@@ -436,41 +398,31 @@ template <typename T> struct WeakRef {
requires std::is_convertible_v<U *, T *>
{
release();
ptr = other.ptr;
control_block = other.control_block;
other.ptr = nullptr;
other.control_block = nullptr;
return *this;
}
/**
* @brief Copy assignment from WeakRef
* @brief Copy assignment from WeakRef - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/
WeakRef &operator=(const WeakRef &other) noexcept {
if (this != &other) {
release();
control_block = other.control_block;
if (control_block) {
control_block->increment_weak();
}
}
return *this;
}
WeakRef &operator=(const WeakRef &other) = delete;
/**
* @brief Copy assignment from Ref
* @brief Copy assignment from Ref - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/
WeakRef &operator=(const Ref<T> &ref) noexcept {
release();
control_block = ref.control_block;
if (control_block) {
control_block->increment_weak();
}
return *this;
}
WeakRef &operator=(const Ref<T> &ref) = delete;
/**
* @brief Move constructor
*/
WeakRef(WeakRef &&other) noexcept : control_block(other.control_block) {
WeakRef(WeakRef &&other) noexcept
: ptr(other.ptr), control_block(other.control_block) {
other.ptr = nullptr;
other.control_block = nullptr;
}
@@ -480,42 +432,46 @@ template <typename T> struct WeakRef {
WeakRef &operator=(WeakRef &&other) noexcept {
  // Self-assignment guard: releasing first would otherwise drop our own
  // weak count before stealing it back.
  if (this != &other) {
    release();
    // Steal both the object pointer and the control block, leaving the
    // source empty so its destructor is a no-op.
    ptr = other.ptr;
    control_block = other.control_block;
    other.ptr = nullptr;
    other.control_block = nullptr;
  }
  return *this;
}
/**
* @brief Explicitly create a copy with shared weak reference
* @return New WeakRef that observes the same object
*/
[[nodiscard]] WeakRef copy() const noexcept {
  // Empty WeakRefs stay empty; otherwise add a weak reference so both
  // copies can independently observe (and release) the control block.
  if (control_block) {
    control_block->increment_weak();
  }
  return WeakRef(ptr, control_block);
}
/**
* @brief Reset to empty state
*/
void reset() noexcept {
release();
ptr = nullptr;
control_block = nullptr;
}
/**
* @brief Default constructor - creates empty WeakRef
*/
WeakRef() : control_block(nullptr) {}
WeakRef() : ptr(nullptr), control_block(nullptr) {}
private:
explicit WeakRef(detail::ControlBlock *cb) : control_block(cb) {}
explicit WeakRef(T *object_ptr, detail::ControlBlock *cb)
: ptr(object_ptr), control_block(cb) {}
T *ptr;
detail::ControlBlock *control_block;
// Helper to calculate object pointer from control block
T *get_object_ptr() const {
if (!control_block)
return nullptr;
constexpr size_t cb_size = sizeof(detail::ControlBlock);
constexpr size_t alignment = alignof(T);
constexpr size_t padded_cb_size =
(cb_size + alignment - 1) & ~(alignment - 1);
return reinterpret_cast<T *>(reinterpret_cast<char *>(control_block) +
padded_cb_size);
}
/**
* @brief Release current weak reference and handle cleanup
*/
@@ -550,6 +506,8 @@ private:
* @code
* auto obj = make_ref<MyClass>(arg1, arg2);
* auto empty_vec = make_ref<std::vector<int>>();
* auto obj_copy = obj.copy(); // Explicit copy
* WeakRef<MyClass> weak = obj.as_weak(); // Create weak reference
* @endcode
*
* Thread safety: Safe to call from multiple threads simultaneously.

View File

@@ -5,7 +5,6 @@
#include <cstdlib>
#include <cstring>
#include <fcntl.h>
#include <memory>
#include <netdb.h>
#include <netinet/tcp.h>
#include <pthread.h>
@@ -25,7 +24,7 @@ Ref<Server> Server::create(const weaseldb::Config &config,
ConnectionHandler &handler,
const std::vector<int> &listen_fds) {
auto result = make_ref<Server>(config, handler, listen_fds);
result->self_ = result;
result->self_ = result.as_weak();
return result;
}
@@ -139,51 +138,6 @@ void Server::shutdown() {
}
}
void Server::release_back_to_server(std::unique_ptr<Connection> connection) {
if (!connection) {
return; // Nothing to release
}
// Try to get the server from the connection's weak_ptr
if (auto server = connection->server_.lock()) {
// Server still exists - pass unique_ptr directly
server->receiveConnectionBack(std::move(connection));
}
// If server is gone, connection will be automatically cleaned up when
// unique_ptr destructs
}
void Server::receiveConnectionBack(std::unique_ptr<Connection> connection) {
if (!connection) {
return; // Nothing to process
}
// Re-add the connection to epoll for continued processing
struct epoll_event event{};
if (!connection->has_messages()) {
event.events = EPOLLIN | EPOLLONESHOT;
} else {
event.events = EPOLLOUT | EPOLLONESHOT;
}
int fd = connection->getFd();
event.data.fd = fd;
// Store connection in registry before adding to epoll
// This mirrors the pattern used in process_connection_batch
size_t epoll_index = connection->getEpollIndex();
int epollfd = epoll_fds_[epoll_index];
connection_registry_.store(fd, std::move(connection));
if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event) == -1) {
perror("epoll_ctl MOD in receiveConnectionBack");
// Remove from registry and clean up on failure
(void)connection_registry_.remove(fd);
}
}
int Server::create_local_connection() {
int sockets[2];
if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) != 0) {
@@ -210,22 +164,24 @@ int Server::create_local_connection() {
struct sockaddr_storage addr{};
addr.ss_family = AF_UNIX;
// Calculate epoll_index for connection distribution
// Use round-robin distribution for local connections across epoll instances
size_t epoll_index =
connection_distribution_counter_.fetch_add(1, std::memory_order_relaxed) %
epoll_fds_.size();
// Create Connection object
auto connection = std::unique_ptr<Connection>(new Connection(
auto connection = make_ref<Connection>(
addr, server_fd, connection_id_.fetch_add(1, std::memory_order_relaxed),
epoll_index, &handler_, self_));
epoll_index, &handler_, self_.copy());
connection->self_ref_ = connection.as_weak();
connection->tsan_release();
// Store in registry
connection_registry_.store(server_fd, std::move(connection));
// Add to appropriate epoll instance
struct epoll_event event{};
event.events = EPOLLIN | EPOLLONESHOT;
event.events = EPOLLIN;
event.data.fd = server_fd;
int epollfd = epoll_fds_[epoll_index];
@@ -263,10 +219,11 @@ void Server::setup_shutdown_pipe() {
}
void Server::create_epoll_instances() {
// Create multiple epoll instances to reduce contention
epoll_fds_.resize(config_.server.epoll_instances);
// Create one epoll instance per I/O thread (1:1 mapping) to eliminate
// contention
epoll_fds_.resize(config_.server.io_threads);
for (int i = 0; i < config_.server.epoll_instances; ++i) {
for (int i = 0; i < config_.server.io_threads; ++i) {
epoll_fds_[i] = epoll_create1(EPOLL_CLOEXEC);
if (epoll_fds_[i] == -1) {
perror("epoll_create1");
@@ -299,11 +256,6 @@ void Server::create_epoll_instances() {
}
}
int Server::get_epoll_for_thread(int thread_id) const {
// Round-robin assignment of threads to epoll instances
return epoll_fds_[thread_id % epoll_fds_.size()];
}
void Server::start_io_threads(std::vector<std::thread> &threads) {
int io_threads = config_.server.io_threads;
@@ -312,12 +264,11 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
pthread_setname_np(pthread_self(),
("io-" + std::to_string(thread_id)).c_str());
// Each thread uses its assigned epoll instance (round-robin)
int epollfd = get_epoll_for_thread(thread_id);
// Each thread uses its assigned epoll instance (1:1 mapping)
int epollfd = epoll_fds_[thread_id];
std::vector<epoll_event> events(config_.server.event_batch_size);
std::vector<std::unique_ptr<Connection>> batch(
config_.server.event_batch_size);
std::vector<Ref<Connection>> batch(config_.server.event_batch_size);
std::vector<int> batch_events(config_.server.event_batch_size);
std::vector<int>
ready_listen_fds; // Reused across iterations to avoid allocation
@@ -351,11 +302,12 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
// Handle existing connection events
int fd = events[i].data.fd;
std::unique_ptr<Connection> conn = connection_registry_.remove(fd);
Ref<Connection> conn = connection_registry_.remove(fd);
conn->tsan_acquire();
assert(conn);
if (events[i].events & (EPOLLERR | EPOLLHUP)) {
// unique_ptr will automatically delete on scope exit
close_connection(conn);
continue;
}
@@ -368,7 +320,7 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
// Process existing connections in batch
if (batch_count > 0) {
process_connection_batch(
epollfd, std::span(batch).subspan(0, batch_count),
std::span(batch).subspan(0, batch_count),
std::span(batch_events).subspan(0, batch_count));
}
@@ -408,9 +360,9 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
perror("setsockopt SO_KEEPALIVE");
}
// Add to epoll with no interests
// Add to epoll
struct epoll_event event{};
event.events = 0;
event.events = EPOLLIN;
event.data.fd = fd;
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event) == -1) {
perror("epoll_ctl ADD");
@@ -418,11 +370,13 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
}
// Transfer ownership from registry to batch processing
size_t epoll_index = thread_id % epoll_fds_.size();
batch[batch_count] = std::unique_ptr<Connection>(new Connection(
size_t epoll_index = thread_id;
batch[batch_count] = make_ref<Connection>(
addr, fd,
connection_id_.fetch_add(1, std::memory_order_relaxed),
epoll_index, &handler_, self_));
epoll_index, &handler_, self_.copy());
batch[batch_count]->self_ref_ = batch[batch_count].as_weak();
batch[batch_count]->tsan_release();
batch_events[batch_count] =
EPOLLIN; // New connections always start with read
batch_count++;
@@ -430,7 +384,7 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
// Process batch if full
if (batch_count == config_.server.event_batch_size) {
process_connection_batch(
epollfd, {batch.data(), (size_t)batch_count},
{batch.data(), (size_t)batch_count},
{batch_events.data(), (size_t)batch_count});
batch_count = 0;
}
@@ -440,7 +394,7 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
// Process remaining accepted connections
if (batch_count > 0) {
process_connection_batch(
epollfd, std::span(batch).subspan(0, batch_count),
std::span(batch).subspan(0, batch_count),
std::span(batch_events).subspan(0, batch_count));
batch_count = 0;
}
@@ -449,8 +403,7 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
}
}
void Server::process_connection_reads(std::unique_ptr<Connection> &conn,
int events) {
void Server::process_connection_reads(Ref<Connection> &conn, int events) {
assert(conn);
// Handle EPOLLIN - read data and process it
if (events & EPOLLIN) {
@@ -461,7 +414,7 @@ void Server::process_connection_reads(std::unique_ptr<Connection> &conn,
if (r < 0) {
// Error or EOF - connection should be closed
conn.reset();
close_connection(conn);
return;
}
@@ -470,59 +423,57 @@ void Server::process_connection_reads(std::unique_ptr<Connection> &conn,
return;
}
// Call handler with unique_ptr - handler can take ownership if needed
handler_.on_data_arrived(std::string_view{buf, size_t(r)}, conn);
// If handler took ownership (conn is now null), return true to indicate
// processing is done
if (!conn) {
return;
}
// Call handler with connection reference - server retains ownership
handler_.on_data_arrived(std::string_view{buf, size_t(r)}, *conn);
}
}
void Server::process_connection_writes(std::unique_ptr<Connection> &conn,
int /*events*/) {
void Server::process_connection_writes(Ref<Connection> &conn, int events) {
assert(conn);
// For simplicity, we always attempt to write when an event fires. We could be
// more precise and skip the write if we detect that we've already seen EAGAIN
// on this connection and we don't have EPOLLOUT.
if (conn->has_messages()) {
bool had_messages = conn->has_messages();
bool error = conn->writeBytes();
if (error) {
conn.reset(); // Connection should be closed
// Process pending responses first if this is an EPOLLOUT event
if (events & EPOLLOUT) {
std::unique_lock lock(conn->mutex_);
if (!conn->pending_response_queue_.empty()) {
std::vector<PendingResponse> pending_vec;
pending_vec.reserve(conn->pending_response_queue_.size());
for (auto &response : conn->pending_response_queue_) {
pending_vec.push_back(std::move(response));
}
conn->pending_response_queue_.clear();
lock.unlock();
handler_.on_preprocess_writes(*conn, std::span{pending_vec});
}
}
auto result = conn->write_bytes();
if (result & Connection::WriteBytesResult::Error) {
close_connection(conn);
return;
}
// Call handler with unique_ptr - handler can take ownership if needed
handler_.on_write_progress(conn);
// If handler took ownership (conn is now null), return true to indicate
// processing is done
if (!conn) {
return;
}
// Check if buffer became empty (transition from non-empty -> empty)
if (had_messages && !conn->has_messages()) {
handler_.on_write_buffer_drained(conn);
// If handler took ownership (conn is now null), return
if (!conn) {
return;
}
if (result & Connection::WriteBytesResult::Progress) {
// Call handler with connection reference - server retains ownership
handler_.on_write_progress(*conn);
}
// Check if we should close the connection according to application
if (!conn->has_messages() && conn->should_close()) {
conn.reset(); // Connection should be closed
if (result & Connection::WriteBytesResult::Close) {
close_connection(conn);
return;
}
}
void Server::close_connection(Ref<Connection> &conn) {
conn->close();
conn.reset();
}
void Server::process_connection_batch(
int epollfd, std::span<std::unique_ptr<Connection>> batch,
static thread_local std::vector<Connection *> batch_connections;
void Server::process_connection_batch(std::span<Ref<Connection>> batch,
std::span<const int> events) {
// First process writes for each connection
@@ -539,29 +490,20 @@ void Server::process_connection_batch(
}
}
// Call batch complete handler - handlers can take ownership here
handler_.on_batch_complete(batch);
// Call batch complete handler with connection pointers
batch_connections.clear();
for (auto &conn : batch) {
if (conn) {
batch_connections.push_back(conn.get());
}
}
handler_.on_batch_complete(batch_connections);
// Transfer all remaining connections back to epoll
for (auto &conn_ptr : batch) {
if (conn_ptr) {
int fd = conn_ptr->getFd();
struct epoll_event event{};
if (!conn_ptr->has_messages()) {
event.events = EPOLLIN | EPOLLONESHOT;
} else {
event.events = EPOLLOUT | EPOLLONESHOT;
}
event.data.fd = fd; // Use file descriptor for epoll
// Put connection back in registry since handler didn't take ownership.
// Must happen before epoll_ctl
connection_registry_.store(fd, std::move(conn_ptr));
if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event) == -1) {
perror("epoll_ctl MOD");
(void)connection_registry_.remove(fd);
}
// Return all connections to registry
for (auto &conn : batch) {
if (conn) {
const int fd = conn->fd_;
connection_registry_.store(fd, std::move(conn));
}
}
}

View File

@@ -1,7 +1,6 @@
#pragma once
#include <atomic>
#include <memory>
#include <span>
#include <thread>
#include <vector>
@@ -95,26 +94,14 @@ struct Server {
*/
int create_local_connection();
/**
* Release a connection back to its server for continued processing.
*
* This static method safely returns ownership of a connection back to its
* server. If the server has been destroyed, the connection will be safely
* cleaned up.
*
* This method is thread-safe and can be called from any thread.
*
* @param connection unique_ptr to the connection being released back
*/
static void release_back_to_server(std::unique_ptr<Connection> connection);
private:
friend struct Connection;
/**
* Private constructor - use create() factory method instead.
*
* @param config Server configuration (threads, ports, limits, etc.)
* @param handler Protocol handler for processing connection data
* @param handler Protocol handler for processing connection data. Must
* outlive the server.
* @param listen_fds Vector of file descriptors to accept connections on.
* Server takes ownership and will close them on
* destruction. Server will set these to non-blocking mode for safe epoll
@@ -122,12 +109,12 @@ private:
*/
explicit Server(const weaseldb::Config &config, ConnectionHandler &handler,
const std::vector<int> &listen_fds);
friend Ref<Server> make_ref<Server>(const weaseldb::Config &config,
ConnectionHandler &handler,
const std::vector<int> &listen_fds);
template <typename T, typename... Args>
friend Ref<T> make_ref(Args &&...args);
WeakRef<Server> self_;
const weaseldb::Config &config_;
weaseldb::Config config_;
ConnectionHandler &handler_;
// Connection registry
@@ -137,13 +124,14 @@ private:
std::atomic<int64_t> connection_id_{0};
std::atomic<int> active_connections_{0};
// Round-robin counter for connection distribution
// Round-robin counter for local connection distribution across epoll
// instances
std::atomic<size_t> connection_distribution_counter_{0};
// Shutdown coordination
int shutdown_pipe_[2] = {-1, -1};
// Multiple epoll file descriptors to reduce contention
// Multiple epoll file descriptors (1:1 with I/O threads) to reduce contention
std::vector<int> epoll_fds_;
std::vector<int>
listen_fds_; // FDs to accept connections on (Server owns these)
@@ -154,30 +142,16 @@ private:
void create_epoll_instances();
void start_io_threads(std::vector<std::thread> &threads);
// Helper to get epoll fd for a thread using round-robin
int get_epoll_for_thread(int thread_id) const;
// Helper for processing connection I/O
void process_connection_reads(std::unique_ptr<Connection> &conn_ptr,
int events);
void process_connection_writes(std::unique_ptr<Connection> &conn_ptr,
int events);
void process_connection_reads(Ref<Connection> &conn, int events);
void process_connection_writes(Ref<Connection> &conn, int events);
void close_connection(Ref<Connection> &conn);
// Helper for processing a batch of connections with their events
void process_connection_batch(int epollfd,
std::span<std::unique_ptr<Connection>> batch,
void process_connection_batch(std::span<Ref<Connection>> batch,
std::span<const int> events);
/**
* Called internally to return ownership to the server.
*
* This method is thread-safe and can be called from any thread.
* The connection will be re-added to the epoll for continued processing.
*
* @param connection Unique pointer to the connection being released back
*/
void receiveConnectionBack(std::unique_ptr<Connection> connection);
// Make non-copyable and non-movable
Server(const Server &) = delete;
Server &operator=(const Server &) = delete;

View File

@@ -1,6 +1,5 @@
#pragma once
#include <array>
#include <atomic>
#include <cassert>
#include <cstddef>
@@ -48,151 +47,174 @@ struct ThreadState {
bool last_stage;
};
// Compile-time topology configuration for static pipelines
// Runtime topology configuration for dynamic pipelines
//
// This template defines a pipeline topology at compile-time:
// - Stage and thread calculations done at compile-time
// - Type-safe indexing: Stage and thread indices validated at compile-time
// - Fixed-size arrays with known bounds
// - Code specialization for each topology
// This class defines a pipeline topology at runtime:
// - Stage and thread calculations done at runtime
// - Flexible configuration: topology can be set via constructor
// - Dynamic arrays with runtime bounds checking
// - Single implementation works for any topology
//
// Example: StaticPipelineTopology<1, 4, 2> creates:
// Example: PipelineTopology({1, 4, 2}) creates:
// - Stage 0: 1 thread (index 0)
// - Stage 1: 4 threads (indices 1-4)
// - Stage 2: 2 threads (indices 5-6)
// - Total: 7 threads across 3 stages
template <int... ThreadsPerStage> struct StaticPipelineTopology {
static_assert(sizeof...(ThreadsPerStage) > 0,
"Must specify at least one stage");
static_assert(((ThreadsPerStage > 0) && ...),
"All stages must have at least one thread");
struct PipelineTopology {
const std::vector<int> threads_per_stage;
const int num_stages;
const std::vector<int> stage_offsets;
const int total_threads;
static constexpr int num_stages = sizeof...(ThreadsPerStage);
static constexpr std::array<int, num_stages> threads_per_stage = {
ThreadsPerStage...};
static constexpr int total_threads = (ThreadsPerStage + ...);
explicit PipelineTopology(std::vector<int> threads_per_stage_)
: threads_per_stage(validate_and_move(std::move(threads_per_stage_))),
num_stages(static_cast<int>(threads_per_stage.size())),
stage_offsets(build_stage_offsets(threads_per_stage)),
total_threads(build_total_threads(threads_per_stage)) {}
// Compile-time stage offset calculation
template <int Stage> static constexpr int stage_offset() {
static_assert(Stage >= 0 && Stage < num_stages,
"Stage index out of bounds");
if constexpr (Stage == 0) {
return 0;
} else {
return stage_offset<Stage - 1>() + threads_per_stage[Stage - 1];
// Runtime stage offset calculation
int stage_offset(int stage) const {
if (stage < 0 || stage >= num_stages) {
std::abort(); // Stage index out of bounds
}
return stage_offsets[stage];
}
// Compile-time thread index calculation
template <int Stage, int Thread> static constexpr int thread_index() {
static_assert(Stage >= 0 && Stage < num_stages,
"Stage index out of bounds");
static_assert(Thread >= 0 && Thread < threads_per_stage[Stage],
"Thread index out of bounds");
return stage_offset<Stage>() + Thread;
// Runtime thread index calculation
int thread_index(int stage, int thread) const {
if (stage < 0 || stage >= num_stages) {
std::abort(); // Stage index out of bounds
}
if (thread < 0 || thread >= threads_per_stage[stage]) {
std::abort(); // Thread index out of bounds
}
return stage_offsets[stage] + thread;
}
// Compile-time previous stage thread count
template <int Stage> static constexpr int prev_stage_thread_count() {
static_assert(Stage >= 0 && Stage < num_stages,
"Stage index out of bounds");
if constexpr (Stage == 0) {
// Runtime previous stage thread count
int prev_stage_thread_count(int stage) const {
if (stage < 0 || stage >= num_stages) {
std::abort(); // Stage index out of bounds
}
if (stage == 0) {
return 1;
} else {
return threads_per_stage[Stage - 1];
return threads_per_stage[stage - 1];
}
}
private:
static std::vector<int> validate_and_move(std::vector<int> threads) {
if (threads.empty()) {
std::abort(); // Must specify at least one stage
}
for (int count : threads) {
if (count <= 0) {
std::abort(); // All stages must have at least one thread
}
}
return threads;
}
static std::vector<int>
build_stage_offsets(const std::vector<int> &threads_per_stage) {
std::vector<int> offsets(threads_per_stage.size());
int offset = 0;
for (size_t i = 0; i < threads_per_stage.size(); ++i) {
offsets[i] = offset;
offset += threads_per_stage[i];
}
return offsets;
}
static int build_total_threads(const std::vector<int> &threads_per_stage) {
int total = 0;
for (int count : threads_per_stage) {
total += count;
}
return total;
}
};
// Static pipeline algorithms - compile-time specialized versions
namespace StaticPipelineAlgorithms {
// Pipeline algorithms - runtime configurable versions
namespace PipelineAlgorithms {
template <WaitStrategy wait_strategy, typename Topology, int Stage,
int ThreadInStage>
uint32_t calculate_safe_len(
std::array<ThreadState, Topology::total_threads> &all_threads,
std::atomic<uint32_t> &pushes, bool may_block) {
constexpr int thread_idx =
Topology::template thread_index<Stage, ThreadInStage>();
inline uint32_t calculate_safe_len(WaitStrategy wait_strategy,
const PipelineTopology &topology, int stage,
int thread_in_stage,
std::vector<ThreadState> &all_threads,
std::atomic<uint32_t> &pushes,
bool may_block) {
int thread_idx = topology.thread_index(stage, thread_in_stage);
auto &thread = all_threads[thread_idx];
uint32_t safe_len = UINT32_MAX;
constexpr int prev_stage_threads =
Topology::template prev_stage_thread_count<Stage>();
int prev_stage_threads = topology.prev_stage_thread_count(stage);
// Compile-time loop over previous stage threads
[&]<std::size_t... Is>(std::index_sequence<Is...>) {
(
[&] {
auto &last_push = [&]() -> std::atomic<uint32_t> & {
if constexpr (Stage == 0) {
// Runtime loop over previous stage threads
for (int i = 0; i < prev_stage_threads; ++i) {
std::atomic<uint32_t> &last_push = [&]() -> std::atomic<uint32_t> & {
if (stage == 0) {
return pushes;
} else {
constexpr int prev_thread_idx =
Topology::template thread_index<Stage - 1, Is>();
int prev_thread_idx = topology.thread_index(stage - 1, i);
return all_threads[prev_thread_idx].pops;
}
}();
if (thread.last_push_read[Is] == thread.local_pops) {
thread.last_push_read[Is] =
last_push.load(std::memory_order_acquire);
if (thread.last_push_read[Is] == thread.local_pops) {
if (thread.last_push_read[i] == thread.local_pops) {
thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
if (thread.last_push_read[i] == thread.local_pops) {
if (!may_block) {
safe_len = 0;
return;
return safe_len;
}
if constexpr (wait_strategy == WaitStrategy::Never) {
if (wait_strategy == WaitStrategy::Never) {
// Empty - busy wait
} else if constexpr (wait_strategy ==
WaitStrategy::WaitIfUpstreamIdle) {
} else if (wait_strategy == WaitStrategy::WaitIfUpstreamIdle) {
// We're allowed to spin as long as we eventually go to 0% cpu
// usage on idle
uint32_t push;
for (int i = 0; i < 100000; ++i) {
bool should_wait = true;
for (int j = 0; j < 100000; ++j) {
push = pushes.load(std::memory_order_relaxed);
if (push != thread.local_pops) {
goto dont_wait;
should_wait = false;
break;
}
#if defined(__x86_64__) || defined(_M_X64)
_mm_pause();
#endif
}
if (should_wait) {
pushes.wait(push, std::memory_order_relaxed);
dont_wait:;
} else {
static_assert(wait_strategy == WaitStrategy::WaitIfStageEmpty);
last_push.wait(thread.last_push_read[Is],
std::memory_order_relaxed);
}
} else { // WaitStrategy::WaitIfStageEmpty
last_push.wait(thread.last_push_read[i], std::memory_order_relaxed);
}
thread.last_push_read[Is] =
last_push.load(std::memory_order_acquire);
thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
}
}
safe_len =
std::min(safe_len, thread.last_push_read[Is] - thread.local_pops);
}(),
...);
}(std::make_index_sequence<prev_stage_threads>{});
safe_len = std::min(safe_len, thread.last_push_read[i] - thread.local_pops);
}
return safe_len;
}
template <WaitStrategy wait_strategy, typename Topology, int Stage,
int ThreadInStage>
void update_thread_pops(
std::array<ThreadState, Topology::total_threads> &all_threads,
inline void update_thread_pops(WaitStrategy wait_strategy,
const PipelineTopology &topology, int stage,
int thread_in_stage,
std::vector<ThreadState> &all_threads,
uint32_t local_pops) {
constexpr int thread_idx =
Topology::template thread_index<Stage, ThreadInStage>();
int thread_idx = topology.thread_index(stage, thread_in_stage);
auto &thread_state = all_threads[thread_idx];
if constexpr (wait_strategy == WaitStrategy::WaitIfStageEmpty) {
if (wait_strategy == WaitStrategy::WaitIfStageEmpty) {
thread_state.pops.store(local_pops, std::memory_order_seq_cst);
thread_state.pops.notify_all();
} else if constexpr (Stage == Topology::num_stages - 1) { // last stage
} else if (stage == topology.num_stages - 1) { // last stage
thread_state.pops.store(local_pops, std::memory_order_seq_cst);
thread_state.pops.notify_all();
} else {
@@ -200,15 +222,13 @@ void update_thread_pops(
}
}
template <typename Topology>
int check_producer_capacity(
std::array<ThreadState, Topology::total_threads> &all_threads,
uint32_t slot, uint32_t size, uint32_t slot_count, bool block) {
constexpr int last_stage = Topology::num_stages - 1;
constexpr int last_stage_offset =
Topology::template stage_offset<last_stage>();
constexpr int last_stage_thread_count =
Topology::threads_per_stage[last_stage];
inline int check_producer_capacity(const PipelineTopology &topology,
std::vector<ThreadState> &all_threads,
uint32_t slot, uint32_t size,
uint32_t slot_count, bool block) {
int last_stage = topology.num_stages - 1;
int last_stage_offset = topology.stage_offset(last_stage);
int last_stage_thread_count = topology.threads_per_stage[last_stage];
for (int i = 0; i < last_stage_thread_count; ++i) {
auto &thread = all_threads[last_stage_offset + i];
@@ -223,10 +243,10 @@ int check_producer_capacity(
}
return 0; // Can proceed
}
} // namespace StaticPipelineAlgorithms
} // namespace PipelineAlgorithms
// Static multi-stage lock-free pipeline for inter-thread communication
// with compile-time topology specification.
// Multi-stage lock-free pipeline for inter-thread communication
// with runtime-configurable topology and wait strategy.
//
// Overview:
// - Items flow from producers through multiple processing stages (stage 0 ->
@@ -234,25 +254,17 @@ int check_producer_capacity(
// - Each stage can have multiple worker threads processing items in parallel
// - Uses a shared ring buffer with atomic counters for lock-free coordination
// - Supports batch processing for efficiency
// - Compile-time topology specification via template parameters
// - Runtime-configurable topology and wait strategy via constructor parameters
//
// Architecture:
// - Producers: External threads that add items to the pipeline via push()
// - Stages: Processing stages numbered 0, 1, 2, ... that consume items via
// acquire<Stage, Thread>()
// acquire(stage, thread)
// - Items flow: Producers -> Stage 0 -> Stage 1 -> ... -> Final Stage
//
// Differences from Dynamic Version:
// - Template parameters specify topology at compile-time (e.g., <Item,
// WaitStrategy::Never, 1, 4, 2>)
// - Stage and thread indices are template parameters, validated at compile-time
// - Fixed-size arrays replace dynamic vectors
// - Specialized algorithms for each stage/thread combination
// - Type-safe guards prevent runtime indexing errors
//
// Usage Pattern:
// using Pipeline = StaticThreadPipeline<Item, WaitStrategy::WaitIfStageEmpty,
// 1, 4, 2>; Pipeline pipeline(lgSlotCount);
// ThreadPipeline<Item> pipeline(WaitStrategy::WaitIfStageEmpty, {1, 4, 2},
// lgSlotCount);
//
// // Producer threads (add items for stage 0 to consume):
// auto guard = pipeline.push(batchSize, /*block=*/true);
@@ -262,12 +274,54 @@ int check_producer_capacity(
// // Guard destructor publishes batch to stage 0 consumers
//
// // Stage worker threads (process items and pass to next stage):
// auto guard = pipeline.acquire<Stage, Thread>(maxBatch, /*may_block=*/true);
// auto guard = pipeline.acquire(stage, thread, maxBatch, /*may_block=*/true);
// for (auto& item : guard.batch) {
// // Process item
// }
// // Guard destructor marks items as consumed and available to next stage
//
// Multi-Thread Stage Processing:
// When a stage has multiple threads (e.g., {1, 1, 1, 2} = 2 threads in stage
// 3):
//
// OVERLAPPING BATCHES - EACH THREAD SEES EVERY ENTRY:
// - Multiple threads in the same stage get OVERLAPPING batches from the ring
// buffer
// - Thread 0: calls acquire(3, 0) - gets batch from ring positions 100-110
// - Thread 1: calls acquire(3, 1) - gets batch from ring positions 100-110
// (SAME)
// - Both threads see the same entries and must coordinate processing
//
// PARTITIONING STRATEGIES:
// Choose your partitioning approach based on your use case:
//
// 1. Ring buffer position-based partitioning:
// for (auto it = batch.begin(); it != batch.end(); ++it) {
// if (it.index() % 2 != thread_index) continue; // Skip entries for other
// threads process(*it); // Process only entries assigned to this thread
// }
//
// 2. Entry content-based partitioning:
// for (auto& item : guard.batch) {
// if (hash(item.connection_id) % 2 != thread_index) continue;
// process(item); // Process based on entry properties
// }
//
// 3. Process all entries (when each thread does different work):
// for (auto& item : guard.batch) {
// process(item); // Both threads process all items, but differently
// }
//
// Common Partitioning Patterns:
// - Position-based: it.index() % num_threads == thread_index
// - Hash-based: hash(item.key) % num_threads == thread_index
// - Type-based: item.type == MY_THREAD_TYPE
// - Load balancing: assign work based on thread load
// - All entries: each thread processes all items but performs different
// operations
//
// Note: it.index() returns the position in the ring buffer (0 to buffer_size-1)
//
// Memory Model:
// - Ring buffer size must be power of 2 for efficient masking
// - Actual ring slots accessed via: index & (slotCount - 1)
@@ -278,27 +332,27 @@ int check_producer_capacity(
// ordering
// - Uses C++20 atomic wait/notify for efficient blocking when no work available
// - RAII guards ensure proper cleanup even with exceptions
template <class T, WaitStrategy wait_strategy, int... ThreadsPerStage>
struct StaticThreadPipeline {
using Topology = StaticPipelineTopology<ThreadsPerStage...>;
template <class T> struct ThreadPipeline {
// Constructor
// wait_strategy: blocking behavior when no work is available
// threads_per_stage: number of threads in each stage (e.g., {1, 4, 2})
// lgSlotCount: log2 of ring buffer size (e.g., 10 -> 1024 slots)
// Template parameters specify pipeline topology (e.g., <Item, Never, 1, 4,
// 2>) Note: Producer threads are external to the pipeline and not counted in
// ThreadsPerStage
explicit StaticThreadPipeline(int lgSlotCount)
: slot_count(1 << lgSlotCount), slot_count_mask(slot_count - 1),
ring(slot_count) {
// Note: Producer threads are external to the pipeline and not counted in
// threads_per_stage
explicit ThreadPipeline(WaitStrategy wait_strategy,
std::vector<int> threads_per_stage, int lgSlotCount)
: wait_strategy_(wait_strategy), topology_(std::move(threads_per_stage)),
slot_count(1 << lgSlotCount), slot_count_mask(slot_count - 1),
ring(slot_count), all_threads(topology_.total_threads) {
// Otherwise we can't tell the difference between full and empty.
assert(!(slot_count_mask & 0x80000000));
initialize_all_threads();
}
StaticThreadPipeline(StaticThreadPipeline const &) = delete;
StaticThreadPipeline &operator=(StaticThreadPipeline const &) = delete;
StaticThreadPipeline(StaticThreadPipeline &&) = delete;
StaticThreadPipeline &operator=(StaticThreadPipeline &&) = delete;
ThreadPipeline(ThreadPipeline const &) = delete;
ThreadPipeline &operator=(ThreadPipeline const &) = delete;
ThreadPipeline(ThreadPipeline &&) = delete;
ThreadPipeline &operator=(ThreadPipeline &&) = delete;
struct Batch {
Batch() : ring(), begin_(), end_() {}
@@ -401,7 +455,7 @@ struct StaticThreadPipeline {
}
private:
friend struct StaticThreadPipeline;
friend struct ThreadPipeline;
Batch(std::vector<T> *const ring, uint32_t begin_, uint32_t end_)
: ring(ring), begin_(begin_), end_(end_) {}
std::vector<T> *const ring;
@@ -409,29 +463,29 @@ struct StaticThreadPipeline {
uint32_t end_;
};
// Static thread storage - fixed size array
std::array<ThreadState, Topology::total_threads> all_threads;
private:
WaitStrategy wait_strategy_;
PipelineTopology topology_;
alignas(128) std::atomic<uint32_t> slots{0};
alignas(128) std::atomic<uint32_t> pushes{0};
const uint32_t slot_count;
const uint32_t slot_count_mask;
std::vector<T> ring;
std::vector<ThreadState> all_threads;
void initialize_all_threads() {
[&]<std::size_t... StageIndices>(std::index_sequence<StageIndices...>) {
(init_stage_threads<StageIndices>(), ...);
}(std::make_index_sequence<Topology::num_stages>{});
for (int stage = 0; stage < topology_.num_stages; ++stage) {
init_stage_threads(stage);
}
}
template <int Stage> void init_stage_threads() {
constexpr int stage_offset = Topology::template stage_offset<Stage>();
constexpr int stage_thread_count = Topology::threads_per_stage[Stage];
constexpr int prev_stage_threads =
Topology::template prev_stage_thread_count<Stage>();
constexpr bool is_last_stage = (Stage == Topology::num_stages - 1);
void init_stage_threads(int stage) {
int stage_offset = topology_.stage_offset(stage);
int stage_thread_count = topology_.threads_per_stage[stage];
int prev_stage_threads = topology_.prev_stage_thread_count(stage);
bool is_last_stage = (stage == topology_.num_stages - 1);
for (int thread = 0; thread < stage_thread_count; ++thread) {
auto &thread_state = all_threads[stage_offset + thread];
@@ -440,14 +494,15 @@ private:
}
}
template <int Stage, int Thread>
Batch acquire_helper(uint32_t maxBatch, bool mayBlock) {
constexpr int thread_idx = Topology::template thread_index<Stage, Thread>();
Batch acquire_helper(int stage, int thread, uint32_t maxBatch,
bool may_block) {
int thread_idx = topology_.thread_index(stage, thread);
auto &thread_state = all_threads[thread_idx];
uint32_t begin = thread_state.local_pops & slot_count_mask;
uint32_t len = StaticPipelineAlgorithms::calculate_safe_len<
wait_strategy, Topology, Stage, Thread>(all_threads, pushes, mayBlock);
uint32_t len = PipelineAlgorithms::calculate_safe_len(
wait_strategy_, topology_, stage, thread, all_threads, pushes,
may_block);
if (maxBatch != 0) {
len = std::min(len, maxBatch);
@@ -462,13 +517,13 @@ private:
}
public:
template <int Stage, int Thread> struct StageGuard {
struct StageGuard {
Batch batch;
~StageGuard() {
if (!batch.empty()) {
StaticPipelineAlgorithms::update_thread_pops<wait_strategy, Topology,
Stage, Thread>(
PipelineAlgorithms::update_thread_pops(
pipeline->wait_strategy_, pipeline->topology_, stage, thread,
pipeline->all_threads, local_pops);
}
}
@@ -476,22 +531,28 @@ public:
StageGuard(StageGuard const &) = delete;
StageGuard &operator=(StageGuard const &) = delete;
StageGuard(StageGuard &&other) noexcept
: batch(other.batch), local_pops(other.local_pops),
: batch(other.batch), local_pops(other.local_pops), stage(other.stage),
thread(other.thread),
pipeline(std::exchange(other.pipeline, nullptr)) {}
StageGuard &operator=(StageGuard &&other) noexcept {
batch = other.batch;
local_pops = other.local_pops;
stage = other.stage;
thread = other.thread;
pipeline = std::exchange(other.pipeline, nullptr);
return *this;
}
private:
friend struct StaticThreadPipeline;
friend struct ThreadPipeline;
uint32_t local_pops;
StaticThreadPipeline *pipeline;
int stage;
int thread;
ThreadPipeline *pipeline;
StageGuard(Batch batch, uint32_t local_pops, StaticThreadPipeline *pipeline)
: batch(batch), local_pops(local_pops),
StageGuard(Batch batch, uint32_t local_pops, int stage, int thread,
ThreadPipeline *pipeline)
: batch(batch), local_pops(local_pops), stage(stage), thread(thread),
pipeline(batch.empty() ? nullptr : pipeline) {}
};
@@ -514,37 +575,30 @@ public:
}
private:
friend struct StaticThreadPipeline;
friend struct ThreadPipeline;
ProducerGuard() : batch(), tp() {}
ProducerGuard(Batch batch, StaticThreadPipeline *tp, uint32_t old_slot,
ProducerGuard(Batch batch, ThreadPipeline *tp, uint32_t old_slot,
uint32_t new_slot)
: batch(batch), tp(tp), old_slot(old_slot), new_slot(new_slot) {}
StaticThreadPipeline *const tp;
ThreadPipeline *const tp;
uint32_t old_slot;
uint32_t new_slot;
};
// Acquire a batch of items for processing by a consumer thread.
// Stage: which processing stage (0 = first consumer stage after producers) -
// compile-time parameter Thread: thread ID within the stage (0 to
// ThreadsPerStage[Stage]-1) - compile-time parameter maxBatch: maximum items
// to acquire (0 = no limit) may_block: whether to block waiting for items
// (false = return empty batch if none available) Returns: StageGuard<Stage,
// Thread> with batch of items to process and compile-time type safety
template <int Stage, int Thread>
[[nodiscard]] StageGuard<Stage, Thread> acquire(int maxBatch = 0,
// stage: which processing stage (0 = first consumer stage after producers)
// thread: thread ID within the stage (0 to threads_per_stage[stage]-1)
// maxBatch: maximum items to acquire (0 = no limit)
// may_block: whether to block waiting for items (false = return empty batch
// if none available) Returns: StageGuard with batch of items to process
[[nodiscard]] StageGuard acquire(int stage, int thread, int maxBatch = 0,
bool may_block = true) {
static_assert(Stage >= 0 && Stage < Topology::num_stages,
"Stage index out of bounds");
static_assert(Thread >= 0 && Thread < Topology::threads_per_stage[Stage],
"Thread index out of bounds");
auto batch = acquire_helper(stage, thread, maxBatch, may_block);
auto batch = acquire_helper<Stage, Thread>(maxBatch, may_block);
constexpr int thread_idx = Topology::template thread_index<Stage, Thread>();
int thread_idx = topology_.thread_index(stage, thread);
uint32_t local_pops = all_threads[thread_idx].local_pops;
return StageGuard<Stage, Thread>{std::move(batch), local_pops, this};
return StageGuard{std::move(batch), local_pops, stage, thread, this};
}
// Reserve slots in the ring buffer for a producer thread to fill with items.
@@ -577,9 +631,8 @@ public:
slot = slots.load(std::memory_order_relaxed);
begin = slot & slot_count_mask;
int capacity_result =
StaticPipelineAlgorithms::check_producer_capacity<Topology>(
all_threads, slot, size, slot_count, block);
int capacity_result = PipelineAlgorithms::check_producer_capacity(
topology_, all_threads, slot, size, slot_count, block);
if (capacity_result == 1) {
continue;
}

197
style.md
View File

@@ -5,28 +5,30 @@ This document describes the C++ coding style used in the WeaselDB project. These
## Table of Contents
1. [General Principles](#general-principles)
2. [Naming Conventions](#naming-conventions)
3. [File Organization](#file-organization)
4. [Code Structure](#code-structure)
5. [Memory Management](#memory-management)
6. [Error Handling](#error-handling)
7. [Documentation](#documentation)
8. [Testing](#testing)
1. [Naming Conventions](#naming-conventions)
1. [File Organization](#file-organization)
1. [Code Structure](#code-structure)
1. [Memory Management](#memory-management)
1. [Error Handling](#error-handling)
1. [Documentation](#documentation)
1. [Testing](#testing)
---
______________________________________________________________________
## General Principles
### Language Standard
- **C++20** is the target standard
- Use modern C++ features: RAII, move semantics, constexpr, concepts where appropriate
- Prefer standard library containers and algorithms over custom implementations
### C Library Functions and Headers
- **Always use std:: prefixed versions** of C library functions for consistency and clarity
- **Use C++ style headers** (`<cstring>`, `<cstdlib>`, etc.) instead of C style headers (`<string.h>`, `<stdlib.h>`, etc.)
- This applies to all standard libc functions: `std::abort()`, `std::fprintf()`, `std::free()`, `std::memcpy()`, `std::strlen()`, `std::strncpy()`, `std::memset()`, `std::signal()`, etc.
- **Exception:** Functions with no std:: equivalent (e.g., `perror()`, `gai_strerror()`) and system-specific headers (e.g., `<unistd.h>`, `<fcntl.h>`)
```cpp
// Preferred - C++ style
#include <cstring>
@@ -56,23 +58,25 @@ signal(SIGTERM, handler);
```
### Data Types
- **Almost always signed** - prefer `int`, `int64_t`, `ssize_t` over unsigned types except for:
- Bit manipulation operations
- Interfacing with APIs that require unsigned types
- Where defined unsigned overflow behavior (wraparound) is intentional and desired
- **Almost always auto** - let the compiler deduce types except when:
- The type is not obvious from context (prefer explicit for clarity)
- The type is not obvious from context and the exact type is important (prefer explicit for clarity)
- Specific type requirements matter (numeric conversions, template parameters)
- Interface contracts need explicit types (public APIs, function signatures)
- **Prefer uninitialized memory to default initialization** when using before initializing would be an error
- Valgrind will catch uninitialized memory usage bugs
- Avoid hiding logic errors with unnecessary zero-initialization
- Avoid hiding logic errors that Valgrind would have caught with unnecessary zero-initialization
- Default initialization can mask bugs and hurt performance
- **Floating point is for metrics only** - avoid `float`/`double` in core data structures and algorithms
- Use for performance measurements, statistics, and monitoring data
- Never use for counts, sizes, or business logic
- Avoid branching on the values of floats
### Type Casting
- **Never use C-style casts** - they're unsafe and can hide bugs by performing dangerous conversions
- **Use C++ cast operators** for explicit type conversions with clear intent and safety checks
- **Avoid `reinterpret_cast`** - almost always indicates poor design; redesign APIs instead
@@ -94,18 +98,21 @@ auto addr = reinterpret_cast<uintptr_t>(ptr); // Pointer to integer conv
```
### Performance Focus
- **Performance-first design** - optimize for the hot path
- **Simple is fast** - find exactly what's necessary, strip away everything else
- **Complexity must be justified with benchmarks** - measure performance impact before adding complexity
- **Strive for 0% CPU usage when idle** - avoid polling, busy waiting, or unnecessary background activity
- Use **inline functions** for performance-critical code (e.g., `allocate_raw`)
- **String views** with `std::string_view` to minimize unnecessary copying
- **Arena allocation** for efficient memory management (~1ns vs ~20-270ns for malloc)
- **Arena allocation** for efficient memory management, and to group related lifetimes together for simplicity
### String Formatting
- **Always use `format.hpp` functions** - formats directly into arena-allocated memory
- **Use `static_format()` for performance-sensitive code** - faster but less flexible than `format()`
- **Use `format()` function with arena allocator** for printf-style formatting
```cpp
// Most performance-sensitive - compile-time optimized concatenation
std::string_view response = static_format(arena,
@@ -123,7 +130,10 @@ std::string_view response = format(arena,
static_cast<int>(body.size()), body.data());
```
- Offer APIs that let you avoid concatenating strings if possible - e.g. if the bytes are going to get written to a file descriptor you can skip concatenating and use scatter/gather writev-type calls.
### Complexity Control
- **Encapsulation is the main tool for controlling complexity**
- **Header files define the interface** - they are the contract with users of your code
- **Headers should be complete** - include everything needed to use the interface effectively:
@@ -132,15 +142,17 @@ std::string_view response = format(arena,
- Thread safety guarantees
- Performance characteristics
- Ownership and lifetime semantics
- **Do not rely on undocumented interface properties** - if it's not in the header, don't depend on it
- **Do not rely on undocumented properties of an interface** - if it's not in the header, don't depend on it
---
______________________________________________________________________
## Naming Conventions
### Variables and Functions
- **snake_case** for all variables, functions, and member functions
- **Legacy camelCase exists** - the codebase currently contains mixed naming due to historical development. New code should use snake_case. Existing camelCase should be converted to snake_case during natural refactoring (not mass renaming).
```cpp
int64_t used_bytes() const;
void add_block(int64_t size);
@@ -148,27 +160,31 @@ int32_t initial_block_size_;
```
### Classes and Structs
- **PascalCase** for class/struct names
- **Always use struct keyword** - eliminates debates about complexity and maintains consistency
- **Public members first, private after** - puts the interface users care about at the top, implementation details below
- **Full encapsulation still applies** - use `private:` sections to hide implementation details and maintain deep, capable structs
- The struct keyword doesn't mean shallow design - it means interface-first organization for human readers
- Omit the `public` keyword when inheriting from a struct. It's public by default. E.g. `struct A : B {};` instead of `struct A : public B {};`
```cpp
struct Arena {
struct MyClass {
// Public interface first
explicit Arena(int64_t initial_size = 1024);
void* allocate_raw(int64_t size);
void do_thing();
private:
// Private members after
int32_t initial_block_size_;
Block* current_block_;
int thing_count_;
};
```
### Enums
- **PascalCase** for enum class names
- **PascalCase** for enum values (not SCREAMING_SNAKE_CASE)
- C-style enums are acceptable where implicit int conversion is desirable, like for bitflags
```cpp
enum class Type {
PointRead,
@@ -183,14 +199,18 @@ enum class ParseState {
```
### Constants and Macros
- **snake_case** for constants
- Avoid macros when possible; prefer `constexpr` variables
```cpp
static const WeaselJsonCallbacks json_callbacks;
```
### Member Variables
- **Trailing underscore** for private member variables
```cpp
private:
int32_t initial_block_size_;
@@ -198,24 +218,28 @@ private:
```
### Template Parameters
- **PascalCase** for template type parameters
```cpp
template <typename T, typename... Args>
template <typename T> struct rebind { using type = T*; };
```
---
______________________________________________________________________
## File Organization
### Include Organization
- Use **`#pragma once`** instead of include guards
- **Never `using namespace std`** - always use fully qualified names for clarity and safety
- **Include order** (applies to both headers and source files):
1. Corresponding header file (for .cpp files only)
2. Standard library headers (alphabetical)
3. Third-party library headers
4. Project headers
1. Standard library headers (alphabetical)
1. Third-party library headers
1. Project headers
```cpp
#pragma once
@@ -239,14 +263,16 @@ std::vector<int> data;
std::unique_ptr<Parser> parser;
```
---
______________________________________________________________________
## Code Structure
### Class Design
- **Move-only semantics** for resource-owning types
- **Explicit constructors** to prevent implicit conversions
- **Delete copy operations** when inappropriate
- **Delete copy operations** when copying is inappropriate or should be discouraged
```cpp
struct Arena {
explicit Arena(int64_t initial_size = 1024);
@@ -266,12 +292,14 @@ private:
```
### Function Design
- **Const correctness** - mark methods const when appropriate
- **Parameter passing:**
- Pass by value for types ≤ 16 bytes (int, pointers, string_view, small structs)
- Pass by const reference for types > 16 bytes (containers, large objects)
- **Return by value** for small types (≤ 16 bytes), **string_view** to avoid copying strings
- **noexcept specification** for move operations and non-throwing functions
```cpp
std::span<const Operation> operations() const { return operations_; }
void process_data(std::string_view request_data); // ≤ 16 bytes, pass by value
@@ -280,27 +308,30 @@ Arena(Arena &&source) noexcept;
```
### Template Usage
- **Template constraints** using static_assert for better error messages
- **SFINAE** or concepts for template specialization
### Factory Patterns & Ownership
- **Static factory methods** for complex construction requiring shared ownership
- **Static factory methods** for complex construction requirements like enforcing shared ownership
- **Friend-based factories** for access control when constructor should be private
- **Ownership guidelines:**
- **unique_ptr** for exclusive ownership (most common case)
- **shared_ptr** only when multiple owners need concurrent access to same object
- **Ref** only when object logically has multiple owners (`Ref` is our custom std::shared_ptr variant)
- **Factory methods return appropriate smart pointer type** based on ownership needs
```cpp
// Shared ownership - multiple components need concurrent access
auto server = Server::create(config, handler); // Returns shared_ptr
auto server = Server::create(config, handler); // Returns Ref<Server>
// Exclusive ownership - single owner, transfer via move
auto connection = Connection::createForServer(addr, fd, connection_id, handler, server_ref);
// Friend-based factory for access control
struct Connection {
void append_message(std::string_view message_data);
WeakRef<MessageSender> get_weak_ref() const;
private:
Connection(struct sockaddr_storage client_addr, int file_descriptor,
int64_t connection_id, ConnectionHandler* request_handler,
@@ -310,8 +341,10 @@ private:
```
### Control Flow
- **Early returns** to reduce nesting
- **Range-based for loops** when possible
```cpp
if (size == 0) {
return nullptr;
@@ -323,9 +356,11 @@ for (auto &precondition : preconditions_) {
```
### Atomic Operations
- **Never use assignment operators** with `std::atomic` - always use explicit `store()` and `load()`
- **Always specify memory ordering** explicitly for atomic operations
- **Use the least restrictive correct memory ordering** - choose the weakest ordering that maintains correctness
```cpp
// Preferred - explicit store/load with precise memory ordering
std::atomic<uint64_t> counter;
@@ -343,14 +378,16 @@ counter = 42; // Implicit - memory ordering not explicit
auto value = counter; // Implicit - memory ordering not explicit
```
---
______________________________________________________________________
## Memory Management
### Ownership & Allocation
- **Arena allocators** for request-scoped memory with **STL allocator adapters** (see Performance Focus section for characteristics)
- **Arena** for request-scoped memory with **STL allocator adapters**
- **String views** pointing to arena-allocated memory to avoid unnecessary copying
- **STL containers with arena allocators require default construction after arena reset** - `clear()` is not sufficient
```cpp
// STL containers with arena allocators - correct reset pattern
std::vector<Operation, ArenaStlAllocator<Operation>> operations(arena);
@@ -359,10 +396,33 @@ operations = {}; // Default construct - clear() won't work correctly
arena.reset(); // Reset arena memory
```
### Arena String Copying
- **Always use `Arena::copy_string()`** for copying string data into arena memory
- **Avoid manual allocation and memcpy** for string copying
- **Use `Arena::allocate_span<T>()`** for array allocations instead of manual span construction
```cpp
// Preferred - unified arena methods
std::string_view copy = arena.copy_string(original_string);
auto buffer = arena.allocate_span<char>(1024);
auto strings = arena.allocate_span<std::string_view>(count);
// Avoid - manual allocation and copying
char *copied = arena.allocate<char>(str.size());
std::memcpy(copied, str.data(), str.size());
std::string_view copy(copied, str.size());
// Avoid - manual span construction
auto span = std::span{arena.allocate<std::string_view>(count), count};
```
### Resource Management
- **RAII** everywhere - constructors acquire, destructors release
- **Move semantics** for efficient resource transfer
- **Explicit cleanup** methods where appropriate
```cpp
~Arena() {
while (current_block_) {
@@ -373,20 +433,22 @@ arena.reset(); // Reset arena memory
}
```
---
______________________________________________________________________
## Error Handling
### Error Classification & Response
- **Expected errors** (invalid input, timeouts): Return error codes for programmatic handling
- **System failures** (malloc fail, socket fail): Abort immediately with error message
- **Programming errors** (precondition violations, assertions): Abort immediately
### Error Contract Design
- **Error codes are the API contract** - use enums for programmatic decisions
- **Error messages are human-readable only** - never parse message strings
- **Consistent error boundaries** - each component defines what it can/cannot recover from
- **Interface precondition violations are undefined behavior** - acceptable to skip checks for performance in hot paths
- **Interface precondition violations are undefined behavior** - it's acceptable to skip checks for performance in hot paths
- **Error code types must be nodiscard** - mark error code enums with `[[nodiscard]]` to prevent silent failures
```cpp
@@ -400,11 +462,12 @@ if (!memory) {
}
// ... use memory, eventually std::free(memory)
// Programming error - precondition violation (may be omitted for performance)
// Programming error - precondition violation (gets compiled out in release builds)
assert(ptr != nullptr && "Precondition violated: pointer must be non-null");
```
### Assertions
- **Programming error detection** using standard `assert()` macro
- **Assertion behavior follows C++ standards:**
- **Debug builds**: Assertions active (undefined `NDEBUG`)
@@ -413,6 +476,7 @@ assert(ptr != nullptr && "Precondition violated: pointer must be non-null");
- **Static assertions** for compile-time validation (always active)
**Usage guidelines:**
- Use for programming errors: null checks, precondition validation, invariants
- Don't use for expected runtime errors: use return codes instead
@@ -468,26 +532,28 @@ if (result == -1 && errno != EINTR) {
Most system calls are not interruptible in practice. For these, it is not necessary to add a retry loop. This includes:
* `fcntl` (with `F_GETFL`, `F_SETFL`, `F_GETFD`, `F_SETFD` - note: `F_SETLKW` and `F_OFD_SETLKW` CAN return EINTR)
* `epoll_ctl`
* `socketpair`
* `pipe`
* `setsockopt`
* `epoll_create1`
* `close` (special case: guaranteed closed even on EINTR on Linux)
- `fcntl` (with `F_GETFL`, `F_SETFL`, `F_GETFD`, `F_SETFD` - note: `F_SETLKW` and `F_OFD_SETLKW` CAN return EINTR)
- `epoll_ctl`
- `socketpair`
- `pipe`
- `setsockopt`
- `epoll_create1`
- `close` (special case: guaranteed closed even on EINTR on Linux)
When in doubt, consult the `man` page for the specific system call to see if it can return `EINTR`.
---
______________________________________________________________________
## Documentation
### Doxygen Style
- **`/** ... */` comment blocks** for struct and public method documentation
- **@brief** for short descriptions
- **@param** and **@return** for function parameters
- **@note** for important implementation notes
- **@warning** for critical usage warnings
```cpp
/**
* @brief Type-safe version of realloc_raw for arrays of type T.
@@ -502,9 +568,11 @@ T *realloc(T *existing_ptr, int32_t current_size, int32_t requested_size);
```
### Code Comments
- **Explain why, not what** - code should be self-documenting
- **Explain why, not what** - *what* the code does should be clear without any comments
- **Performance notes** for optimization decisions
- **Thread safety** and ownership semantics
```cpp
// Uses O(1) accumulated counters for fast retrieval
int64_t total_allocated() const;
@@ -514,20 +582,23 @@ Connection(struct sockaddr_storage addr, int fd, int64_t id,
ConnectionHandler *handler, std::weak_ptr<Server> server);
```
---
______________________________________________________________________
## Testing
### Test Framework
- **doctest** for unit testing
- **TEST_CASE** and **SUBCASE** for test organization
- **CHECK** for assertions (non-terminating)
- **REQUIRE** for critical assertions (terminating)
### Test Structure
- **Descriptive test names** explaining the scenario
- **SUBCASE** for related test variations
- **SUBCASE** for related test variations that share setup/teardown code
- **Fresh instances** for each test to avoid state contamination
```cpp
TEST_CASE("Arena basic allocation") {
Arena arena;
@@ -546,34 +617,27 @@ TEST_CASE("Arena basic allocation") {
```
### Test Design Principles
- **Test the contract, not the implementation** - validate what the API promises to deliver, not implementation details
- **Both integration and unit tests** - test components in isolation and working together
- **Prefer fakes to mocks** - use real implementations for internal components, fake external dependencies
- **Always enable assertions in tests** - use `-UNDEBUG` pattern to ensure assertions are checked (see Build Integration section)
TODO make a new example here using APIs that exist
```cpp
// Good: Testing through public API
TEST_CASE("Server accepts connections") {
auto config = Config::defaultConfig();
auto handler = std::make_unique<TestHandler>();
auto server = Server::create(config, std::move(handler));
// Test observable behavior - server can accept connections
auto result = connectToServer(server->getPort());
CHECK(result.connected);
}
// Avoid: Testing internal implementation details
// TEST_CASE("Server creates epoll instance") { /* implementation detail */ }
```
### What NOT to Test
**Avoid testing language features and plumbing:**
**Avoid testing language features:**
- Don't test that virtual functions dispatch correctly
- Don't test that standard library types work (unique_ptr, containers, etc.)
- Don't test basic constructor/destructor calls
**Test business logic instead:**
- When does your code call hooks/callbacks and why?
- What state transitions trigger behavior changes?
- How does your code handle error conditions?
@@ -582,6 +646,7 @@ TEST_CASE("Server accepts connections") {
**Ask: "Am I testing the C++ compiler or my application logic?"**
### Test Synchronization (Authoritative Rules)
- **ABSOLUTELY NEVER use timeouts** (`sleep_for`, `wait_for`, etc.)
- **Deterministic synchronization only:**
- Blocking I/O (naturally waits for completion)
@@ -592,6 +657,7 @@ TEST_CASE("Server accepts connections") {
#### Threading Checklist for Tests/Benchmarks
**Common threading principles (all concurrent code):**
- **Count total threads** - Include main/benchmark thread in count
- **Always assume concurrent execution needed** - Tests/benchmarks require real concurrency
- **Add synchronization primitive** - `std::latch start_latch{N}` (most common), `std::barrier`, or similar where N = total concurrent threads
@@ -599,18 +665,21 @@ TEST_CASE("Server accepts connections") {
- **Main thread synchronizes before measurement/execution** - ensures all threads start simultaneously
**Test-specific:**
- **Perform many operations per thread creation** - amortize thread creation cost and increase chances of hitting race conditions
- **Pattern: Create test that spawns threads and runs many operations, then run that test many times** - amortizes thread creation cost while providing fresh test instances
- **Run 100-10000 operations per test, and 100-10000 test iterations** - maximizes chances of hitting race conditions
- **Always run with ThreadSanitizer** - compile with `-fsanitize=thread`
**Benchmark-specific:**
- **NEVER create threads inside the benchmark measurement** - creates thread creation/destruction overhead, not contention
- **Create background threads OUTSIDE the benchmark** that run continuously during measurement
- **Use `std::atomic<bool> keep_running` to cleanly shut down background threads after benchmark**
- **Measure only the foreground operation under real contention from background threads**
**Red flags to catch immediately:**
- ❌ Creating threads in a loop without `std::latch`
- ❌ Background threads starting work immediately
- ❌ Benchmark measuring before all threads synchronized
@@ -636,11 +705,12 @@ for (int i = 0; i < 4; ++i) {
}
```
---
______________________________________________________________________
## Build Integration
### Build Configuration
```bash
# Debug: assertions on, optimizations off
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
@@ -650,6 +720,7 @@ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
```
**Test Target Pattern:**
- Production targets follow build type (assertions off in Release)
- Test targets use `-UNDEBUG` to force assertions on in all builds
- Ensures consistent test validation regardless of build type
@@ -657,8 +728,9 @@ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
```cmake
# Test target with assertions always enabled
add_executable(test_example tests/test_example.cpp src/example.cpp)
target_link_libraries(test_example doctest::doctest)
target_link_libraries(test_example doctest_impl)
target_compile_options(test_example PRIVATE -UNDEBUG) # Always enable assertions
add_test(NAME test_example COMMAND test_example)
# Production target follows build type
add_executable(example src/example.cpp src/main.cpp)
@@ -666,4 +738,5 @@ add_executable(example src/example.cpp src/main.cpp)
```
### Code Generation
- Generated files go in build directory, not source

View File

@@ -10,9 +10,8 @@ interfaces = [
max_request_size_bytes = 1048576 # 1MB
# Number of I/O threads for handling connections and network events
io_threads = 8
epoll_instances = 8
# Event batch size for epoll processing
event_batch_size = 64
event_batch_size = 128
[commit]
# Minimum length for request_id to ensure sufficient entropy

View File

@@ -27,3 +27,6 @@ request_id_retention_versions = 100000000
max_buffer_size_bytes = 10485760 # 10MB
# Interval for sending keepalive comments to prevent idle timeouts (seconds)
keepalive_interval_seconds = 30
[benchmark]
ok_resolve_iterations = 4000

View File

@@ -1,60 +1,268 @@
#include "arena.hpp"
#include "config.hpp"
#include "connection.hpp"
#include "http_handler.hpp"
#include "perfetto_categories.hpp"
#include <atomic>
#include "server.hpp"
#include <chrono>
#include <doctest/doctest.h>
#include <fcntl.h>
#include <poll.h>
#include <string>
#include <thread>
#include <unistd.h>
// Perfetto static storage for tests
PERFETTO_TRACK_EVENT_STATIC_STORAGE();
// Test to demonstrate HTTP pipelining response ordering issue
//
// HTTP/1.1 pipelining allows multiple requests to be sent on a single
// connection without waiting for responses, but responses MUST be sent in the
// same order as requests were received (RFC 7230 Section 6.3.2, formerly
// RFC 2616 Section 8.1.2.2).
//
// This test sends two pipelined requests:
// 1. GET /ok - Slow response (goes through 4-stage pipeline processing)
// 2. GET /metrics - Fast response (handled directly, just collects metrics)
//
// Even though /ok takes longer to process due to pipeline overhead, the /ok
// response should be sent first since it was requested first. Currently this
// test FAILS because the faster /metrics response completes before /ok and
// gets sent out of order.
// NOTE(review): the lines in this region appear to interleave two separate
// definitions (this TEST_CASE and a TestConnectionData helper struct) —
// likely an artifact of how this listing was assembled. Confirm against the
// real source file before changing anything here.
TEST_CASE("HTTP pipelined responses out of order") {
weaseldb::Config config;
HttpHandler handler(config);
auto server = Server::create(config, handler, {});
int fd = server->create_local_connection();
// Global variable needed by Connection
std::atomic<int> activeConnections{0};
auto runThread = std::thread{[&]() { server->run(); }};
// Simple test helper since Connection has complex constructor requirements
// NOTE(review): the helper-struct members below are interleaved with test
// statements in this listing.
struct TestConnectionData {
Arena arena;
std::string message_buffer;
void *user_data = nullptr;
// Send two pipelined requests in a single write() call
// Request order: /ok first, then /metrics
// Expected response order: /ok response first, then /metrics response
// Actual result: /metrics response first (fast), then /ok response (slow)
std::string pipelined_requests = "GET /ok HTTP/1.1\r\n"
"Host: localhost\r\n"
"Connection: keep-alive\r\n"
"\r\n"
"GET /metrics HTTP/1.1\r\n"
"Host: localhost\r\n"
"Connection: keep-alive\r\n"
"\r\n";
// Accumulates received bytes into the helper's buffer.
void append_message(std::string_view data) { message_buffer += data; }
int w = write(fd, pipelined_requests.c_str(), pipelined_requests.size());
REQUIRE(w == static_cast<int>(pipelined_requests.size()));
Arena &get_arena() { return arena; }
const std::string &getResponse() const { return message_buffer; }
void clearResponse() { message_buffer.clear(); }
void reset() {
arena.reset();
message_buffer.clear();
// Set socket to non-blocking
int flags = fcntl(fd, F_GETFL, 0);
fcntl(fd, F_SETFL, flags | O_NONBLOCK);
// Read all responses with non-blocking I/O and poll
char buf[8192];
int total_read = 0;
bool found_ok = false;
bool found_http_response = false;
// presumably matches the /ok response's 2-byte body — TODO confirm
std::string ok_response_header = "Content-Length: 2";
while (true) {
// Use poll to wait for data availability
struct pollfd pfd = {fd, POLLIN, 0};
int poll_result = poll(&pfd, 1, -1); // Block indefinitely
if (poll_result > 0 && (pfd.revents & POLLIN)) {
int r = read(fd, buf + total_read, sizeof(buf) - total_read - 1);
if (r > 0) {
printf("%.*s", r, buf + total_read);
total_read += r;
// Check if we have what we need after each read
buf[total_read] = '\0';
std::string current_data(buf, total_read);
found_http_response =
current_data.find("HTTP/1.1") != std::string::npos;
found_ok = current_data.find(ok_response_header) != std::string::npos;
// If we have both HTTP response and ok_response_header, we can proceed
// with the test
if (found_http_response && found_ok) {
break;
}
};
// Test double that records whether each ConnectionHandler write hook fired.
struct MockConnectionHandler : public ConnectionHandler {
  bool write_progress_called{false};
  bool write_buffer_drained_called{false};
  // Each override simply flips its flag so tests can observe the call.
  void on_write_progress(std::unique_ptr<Connection> &) override {
    write_progress_called = true;
  }
  void on_write_buffer_drained(std::unique_ptr<Connection> &) override {
    write_buffer_drained_called = true;
  }
};
// NOTE(review): this region interleaves the "ConnectionHandler hooks"
// TEST_CASE with the tail of the pipelined-responses test's read loop —
// likely a listing artifact; verify against the real file before editing.
TEST_CASE("ConnectionHandler hooks") {
SUBCASE("on_write_buffer_drained hook exists") {
MockConnectionHandler handler;
// Verify hooks are available and can be overridden
CHECK_FALSE(handler.write_progress_called);
CHECK_FALSE(handler.write_buffer_drained_called);
// Would normally be called by Server during write operations
std::unique_ptr<Connection> null_conn;
handler.on_write_progress(null_conn);
handler.on_write_buffer_drained(null_conn);
CHECK(handler.write_progress_called);
CHECK(handler.write_buffer_drained_called);
// NOTE(review): the else-branches below belong to the read loop of the
// pipelined-responses test, not to this TEST_CASE.
} else if (r == 0) {
REQUIRE(false);
break; // EOF
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
REQUIRE(false);
}
}
}
buf[total_read] = '\0';
std::string response_data(buf, total_read);
// Ensure we found both HTTP response and ok_response_header
REQUIRE(found_http_response);
REQUIRE(found_ok);
// Find first occurrence of ok_response_header in response body
std::size_t ok_pos = response_data.find(ok_response_header);
REQUIRE(ok_pos != std::string::npos);
// Count HTTP response status lines before the /ok response body
// This tests response ordering: should be exactly 1 (the /ok response itself)
std::string before_ok = response_data.substr(0, ok_pos);
int http_response_count = 0;
std::size_t pos = 0;
while ((pos = before_ok.find("HTTP/1.1", pos)) != std::string::npos) {
http_response_count++;
pos += 8;
}
// Assert there's exactly one HTTP response line before the /ok response body.
// If http_response_count == 2, the /metrics response came first (wrong
// order); if http_response_count == 1, the /ok response came first
// (correct order).
CHECK(http_response_count == 1);
close(fd);
server->shutdown();
runThread.join();
}
// Regression test: a pipelined POST whose JSON body must be parsed, followed
// immediately by a GET. The GET begins parsing while the POST response is
// being written, exercising the per-request state-reset race.
//
// Fix: the original accumulated `responses_found` across loop iterations
// while re-scanning the ENTIRE buffer each time, so occurrences already
// counted were counted again on every subsequent read (e.g. one real status
// line seen across two reads counted as 2). The count is now recomputed from
// scratch each iteration.
TEST_CASE("HTTP pipelined POST requests race condition") {
  weaseldb::Config config;
  HttpHandler handler(config);
  auto server = Server::create(config, handler, {});
  int fd = server->create_local_connection();
  auto runThread = std::thread{[&]() { server->run(); }};
  // Create a POST request with JSON body that requires parsing
  std::string json_body = R"({
"request_id": "test-123",
"leader_id": "leader-1",
"read_version": 1,
"preconditions": [],
"operations": [{"write": {"key": "dGVzdA==", "value": "dmFsdWU="}}]
})";
  std::string first_post = "POST /v1/commit HTTP/1.1\r\n"
                           "Host: localhost\r\n"
                           "Content-Type: application/json\r\n"
                           "Content-Length: " +
                           std::to_string(json_body.size()) +
                           "\r\n"
                           "Connection: keep-alive\r\n"
                           "\r\n" +
                           json_body;
  std::string second_get = "GET /v1/version HTTP/1.1\r\n"
                           "Host: localhost\r\n"
                           "Connection: close\r\n"
                           "\r\n";
  // Send POST followed immediately by GET so the GET is queued while the
  // POST response is still in flight.
  int w1 = write(fd, first_post.c_str(), first_post.size());
  REQUIRE(w1 == static_cast<int>(first_post.size()));
  int w2 = write(fd, second_get.c_str(), second_get.size());
  REQUIRE(w2 == static_cast<int>(second_get.size()));
  // Read responses using blocking I/O (deterministic synchronization)
  char buf[4096];
  int total_read = 0;
  int responses_found = 0;
  while (total_read < 4000) {
    int r = read(fd, buf + total_read, sizeof(buf) - total_read - 1);
    if (r <= 0)
      break;
    total_read += r;
    buf[total_read] = '\0';
    std::string response(buf, total_read);
    // Recount from zero: the whole accumulated buffer is scanned each pass.
    responses_found = 0;
    std::size_t pos = 0;
    while ((pos = response.find("HTTP/1.1", pos)) != std::string::npos) {
      responses_found++;
      pos += 8;
    }
    if (responses_found >= 2)
      break;
  }
  // Should get responses to both requests.
  // The race condition might cause parsing errors or connection issues.
  CHECK(responses_found >= 1); // At minimum should handle first request
  close(fd);
  server->shutdown();
  runThread.join();
}
// Regression test for URL accumulation: the request line arrives across
// multiple writes, so llhttp reports the URL in separate callbacks. The
// original bug kept a string_view into llhttp's internal buffer, which is
// reused between writes.
TEST_CASE("HTTP URL split across multiple writes") {
  weaseldb::Config config;
  HttpHandler handler(config);
  auto server = Server::create(config, handler, {});
  int fd = server->create_local_connection();
  auto runThread = std::thread{[&]() { server->run(); }};
  // "GET /metrics HTTP/1.1\r\n" is deliberately split mid-URL.
  std::string part1 = "GET /met";
  std::string part2 = "rics HTTP/1.1\r\n";
  std::string headers = "Host: localhost\r\n"
                        "Connection: close\r\n"
                        "\r\n";
  int sent1 = write(fd, part1.c_str(), part1.size());
  REQUIRE(sent1 == static_cast<int>(part1.size()));
  // Brief pause to encourage the server to parse the fragments separately.
  std::this_thread::sleep_for(std::chrono::milliseconds(1));
  int sent2 = write(fd, part2.c_str(), part2.size());
  REQUIRE(sent2 == static_cast<int>(part2.size()));
  int sent3 = write(fd, headers.c_str(), headers.size());
  REQUIRE(sent3 == static_cast<int>(headers.size()));
  // Accumulate the reply; succeed on a 200 metrics response, fail fast on 404.
  char reply[4096];
  int received = 0;
  bool found_metrics_response = false;
  while (received < 4000) {
    int n = read(fd, reply + received, sizeof(reply) - received - 1);
    if (n <= 0)
      break;
    received += n;
    reply[received] = '\0';
    std::string response(reply, received);
    // A 200 with the Prometheus text content type means the split URL was
    // reconstructed correctly.
    if (response.find("HTTP/1.1 200 OK") != std::string::npos &&
        response.find("text/plain; version=0.0.4") != std::string::npos) {
      found_metrics_response = true;
      break;
    }
    // A 404 means URL accumulation dropped part of the path.
    if (response.find("HTTP/1.1 404") != std::string::npos) {
      FAIL("Got 404 - URL accumulation failed, split URL was not properly "
           "reconstructed");
    }
  }
  REQUIRE(found_metrics_response);
  close(fd);
  server->shutdown();
  runThread.join();
}

View File

@@ -4,9 +4,7 @@
#include "metric.hpp"
#include <atomic>
#include <chrono>
#include <cmath>
#include <fstream>
#include <latch>
#include <sstream>
#include <thread>
@@ -587,32 +585,6 @@ TEST_CASE("thread counter cleanup bug") {
}
}
// Documents validation behavior that would abort the process; the offending
// calls are left commented out so the suite remains runnable.
TEST_CASE("error conditions") {
  SUBCASE("counter negative increment") {
    auto family = metric::create_counter("error_counter", "Error test");
    auto instance = family.create({});
    // Decrementing a counter is invalid: debug builds abort on validation;
    // in release builds the behavior is undefined.
    // instance.inc(-1.0); // Would abort
  }
  SUBCASE("invalid metric names") {
    // Names failing validation would abort:
    // auto bad_counter = metric::create_counter("123invalid", "help"); // Would
    // abort auto bad_gauge = metric::create_gauge("invalid-name", "help"); //
    // Would abort
  }
  SUBCASE("invalid label keys") {
    auto family = metric::create_counter("valid_name", "help");
    // Label keys are validated the same way; this would abort:
    // auto counter = family.create({{"123invalid", "value"}}); // Would
    // abort
  }
}
TEST_CASE("memory management") {
SUBCASE("arena allocation in render") {
Arena arena;
@@ -655,6 +627,113 @@ TEST_CASE("memory management") {
}
}
// Regression test: pending histogram observations must survive thread death.
// Repeats many times with a concurrent render() loop to widen the race window.
TEST_CASE("histogram pending buffer thread cleanup bug") {
for (int iterations = 0; iterations < 1000; ++iterations) {
// This test demonstrates the bug where pending histogram observations
// are lost when a thread dies because ThreadInit destructor doesn't
// flush pending data into shared before accumulating into global state.
metric::reset_metrics_for_testing();
auto hist_family = metric::create_histogram(
"pending_bug_test", "Test histogram for pending buffer bug",
{1.0}); // Single bucket for simplicity
std::atomic<bool> keep_rendering{true};
constexpr int num_threads = 100;
std::latch ready{2};
// Background thread that calls render in a tight loop to hold global mutex
std::thread render_thread([&]() {
ready.arrive_and_wait();
Arena arena;
while (keep_rendering.load(std::memory_order_relaxed)) {
metric::render(arena);
arena.reset();
}
});
// Don't spawn threads until render thread is running
ready.arrive_and_wait();
// Spawn threads that observe once and exit
std::vector<std::thread> observer_threads;
for (int i = 0; i < num_threads; ++i) {
observer_threads.emplace_back([&hist_family]() {
auto hist = hist_family.create({{"test", "observer"}});
hist.observe(0.5); // Goes into first bucket (le="1.0")
// Thread dies here - pending observations should be lost due to bug
});
}
// Join all observer threads
for (auto &t : observer_threads) {
t.join();
}
// Stop render thread
keep_rendering.store(false, std::memory_order_relaxed);
render_thread.join();
// Check if the worker's observations were preserved
Arena arena;
auto output = metric::render(arena);
// First, let's debug what we actually got
std::ostringstream debug_output;
for (const auto &line : output) {
debug_output << line;
}
std::string full_output = debug_output.str();
// Parse the output to find the worker's bucket count for le="1.0"
// (the histogram's only bucket; the variable name's "2" is historical)
uint64_t worker_bucket_2_count = 0;
bool found_worker_metric = false;
// The render output alternates between metric name and value in separate
// string_views
for (size_t i = 0; i < output.size(); ++i) {
const auto &line = output[i];
// Look for: pending_bug_test_bucket{test="observer",le="1.0"}
if (line.find("pending_bug_test_bucket{test=\"observer\",le=\"1.0\"}") !=
std::string_view::npos) {
found_worker_metric = true;
// The value should be in the next element
if (i + 1 < output.size()) {
auto value_str = output[i + 1];
// Remove trailing newline if present
while (!value_str.empty() &&
(value_str.back() == '\n' || value_str.back() == '\r')) {
value_str.remove_suffix(1);
}
try {
worker_bucket_2_count = std::stoull(std::string(value_str));
} catch (const std::exception &e) {
MESSAGE("Failed to parse value: '"
<< value_str << "' from metric line: '" << line << "'");
MESSAGE("Full output:\n" << full_output);
throw;
}
}
break;
}
}
REQUIRE(found_worker_metric); // The metric should exist
// BUG: This will fail because pending observations are lost on thread death
// Expected: num_threads observations (each thread made 1 observation)
// Actual: less than num_threads (observations stuck in pending are lost
// when threads die)
CHECK_MESSAGE(
worker_bucket_2_count == num_threads,
"Expected "
<< num_threads << " observations but got " << worker_bucket_2_count
<< ". This indicates the pending buffer bug where observations "
<< "stuck in pending are lost when thread dies.");
}
}
TEST_CASE("render output deterministic order golden test") {
// Clean slate - reset all metrics before this test
metric::reset_metrics_for_testing();

View File

@@ -28,19 +28,39 @@ struct Base {
virtual int get_value() const { return base_value; }
};
struct Derived : public Base {
struct Derived : Base {
int derived_value;
explicit Derived(int base_v, int derived_v)
: Base(base_v), derived_value(derived_v) {}
int get_value() const override { return base_value + derived_value; }
};
struct AnotherDerived : public Base {
struct AnotherDerived : Base {
int another_value;
explicit AnotherDerived(int base_v, int another_v)
: Base(base_v), another_value(another_v) {}
int get_value() const override { return base_value * another_value; }
};
// Classes to test polymorphic pointer address changes
// First polymorphic base; holds its own datum and exposes it virtually.
struct Interface1 {
  int interface1_data{1};
  virtual ~Interface1() = default;
  virtual auto get_interface1() const -> int { return interface1_data; }
};
// Second polymorphic base; pairs with Interface1 to force a non-zero base
// offset under multiple inheritance.
struct Interface2 {
  int interface2_data{2};
  virtual ~Interface2() = default;
  virtual auto get_interface2() const -> int { return interface2_data; }
};
// Multiple inheritance - this will cause pointer address changes: casting to
// the second base yields an address offset from the object's start.
struct MultipleInheritance : Interface1, Interface2 {
  int own_data;
  explicit MultipleInheritance(int data) : own_data(data) {}
  auto get_own_data() const -> int { return own_data; }
};
} // anonymous namespace
TEST_CASE("Ref basic functionality") {
@@ -52,9 +72,9 @@ TEST_CASE("Ref basic functionality") {
CHECK((*ref).value == 42);
}
SUBCASE("copy construction increments reference count") {
SUBCASE("explicit copy increments reference count") {
auto ref1 = make_ref<TestObject>(123);
auto ref2 = ref1;
auto ref2 = ref1.copy();
CHECK(ref1);
CHECK(ref2);
@@ -63,11 +83,11 @@ TEST_CASE("Ref basic functionality") {
CHECK(ref2->value == 123);
}
SUBCASE("copy assignment works correctly") {
SUBCASE("explicit copy assignment works correctly") {
auto ref1 = make_ref<TestObject>(100);
auto ref2 = make_ref<TestObject>(200);
ref2 = ref1;
ref2 = ref1.copy();
CHECK(ref1.get() == ref2.get());
CHECK(ref1->value == 100);
CHECK(ref2->value == 100);
@@ -109,7 +129,7 @@ TEST_CASE("Ref basic functionality") {
TEST_CASE("WeakRef basic functionality") {
SUBCASE("construction from Ref") {
auto ref = make_ref<TestObject>(333);
WeakRef<TestObject> weak_ref = ref;
WeakRef<TestObject> weak_ref = ref.as_weak();
auto locked = weak_ref.lock();
CHECK(locked);
@@ -121,7 +141,7 @@ TEST_CASE("WeakRef basic functionality") {
WeakRef<TestObject> weak_ref;
{
auto ref = make_ref<TestObject>(444);
weak_ref = ref;
weak_ref = ref.as_weak();
}
// ref goes out of scope, object should be destroyed
@@ -131,8 +151,8 @@ TEST_CASE("WeakRef basic functionality") {
SUBCASE("copy and move semantics") {
auto ref = make_ref<TestObject>(666);
WeakRef<TestObject> weak1 = ref;
WeakRef<TestObject> weak2 = weak1; // copy
WeakRef<TestObject> weak1 = ref.as_weak();
WeakRef<TestObject> weak2 = weak1.copy(); // explicit copy
WeakRef<TestObject> weak3 = std::move(weak1); // move
auto locked2 = weak2.lock();
@@ -160,7 +180,7 @@ TEST_CASE("Ref thread safety") {
start_latch.arrive_and_wait();
for (int j = 0; j < copies_per_thread; ++j) {
auto copy = ref;
auto copy = ref.copy();
CHECK(copy);
CHECK(copy->value == 777);
}
@@ -191,7 +211,7 @@ TEST_CASE("Control block cleanup race condition test") {
WeakRef<TestObject> ptr2;
auto setup = [&]() {
ptr1 = make_ref<TestObject>(0);
ptr2 = ptr1;
ptr2 = ptr1.as_weak();
};
// Barrier for synchronization - 2 participants (main thread + worker thread)
@@ -243,7 +263,7 @@ TEST_CASE("WeakRef prevents circular references") {
// Create object and weak reference
{
auto ref = make_ref<TestObject>(123);
weak_ref = ref;
weak_ref = ref.as_weak();
// Should be able to lock while object exists
auto locked = weak_ref.lock();
@@ -262,8 +282,8 @@ TEST_CASE("WeakRef prevents circular references") {
auto child = make_ref<Node>(2);
// Create potential cycle
parent->next = child; // Strong reference: parent → child
child->parent = parent; // WeakRef: child ⇝ parent (breaks cycle)
parent->next = child.copy(); // Strong reference: parent → child
child->parent = parent.as_weak(); // WeakRef: child ⇝ parent (breaks cycle)
CHECK(parent->data == 1);
CHECK(child->data == 2);
@@ -286,7 +306,7 @@ TEST_CASE("Polymorphic Ref conversions") {
CHECK(derived_ref->get_value() == 30); // 10 + 20
// Convert Ref<Derived> to Ref<Base>
Ref<Base> base_ref = derived_ref;
Ref<Base> base_ref = derived_ref.copy();
CHECK(base_ref);
CHECK(base_ref->get_value() == 30); // Virtual dispatch works
CHECK(base_ref->base_value == 10);
@@ -303,7 +323,7 @@ TEST_CASE("Polymorphic Ref conversions") {
CHECK(base_ref->get_value() == 100);
// Assign derived to base
base_ref = derived_ref;
base_ref = derived_ref.copy();
CHECK(base_ref->get_value() == 20); // 5 + 15
CHECK(base_ref.get() == derived_ref.get());
}
@@ -338,7 +358,7 @@ TEST_CASE("Polymorphic Ref conversions") {
CHECK(another_derived->get_value() == 24); // 6 * 4
// Convert to base
Ref<Base> base_ref = another_derived;
Ref<Base> base_ref = another_derived.copy();
CHECK(base_ref->get_value() == 24); // Virtual dispatch
CHECK(base_ref.get() == another_derived.get());
}
@@ -349,10 +369,10 @@ TEST_CASE("Polymorphic WeakRef conversions") {
auto derived_ref = make_ref<Derived>(3, 7);
// Create WeakRef<Derived>
WeakRef<Derived> weak_derived = derived_ref;
WeakRef<Derived> weak_derived = derived_ref.as_weak();
// Convert to WeakRef<Base>
WeakRef<Base> weak_base = weak_derived;
WeakRef<Base> weak_base = weak_derived.copy();
// Both should lock to same object
auto locked_derived = weak_derived.lock();
@@ -368,11 +388,11 @@ TEST_CASE("Polymorphic WeakRef conversions") {
auto derived_ref = make_ref<Derived>(4, 6);
auto base_ref = make_ref<Base>(999);
WeakRef<Derived> weak_derived = derived_ref;
WeakRef<Base> weak_base = base_ref;
WeakRef<Derived> weak_derived = derived_ref.as_weak();
WeakRef<Base> weak_base = base_ref.as_weak();
// Assign derived weak ref to base weak ref
weak_base = weak_derived;
weak_base = weak_derived.copy();
auto locked = weak_base.lock();
CHECK(locked);
@@ -384,7 +404,7 @@ TEST_CASE("Polymorphic WeakRef conversions") {
auto derived_ref = make_ref<Derived>(2, 8);
// Create WeakRef<Base> directly from Ref<Derived>
WeakRef<Base> weak_base = derived_ref;
WeakRef<Base> weak_base = derived_ref.as_weak();
auto locked = weak_base.lock();
CHECK(locked);
@@ -394,7 +414,7 @@ TEST_CASE("Polymorphic WeakRef conversions") {
SUBCASE("WeakRef move operations") {
auto derived_ref = make_ref<Derived>(1, 9);
WeakRef<Derived> weak_derived = derived_ref;
WeakRef<Derived> weak_derived = derived_ref.as_weak();
// Move construct
WeakRef<Base> weak_base = std::move(weak_derived);
@@ -414,7 +434,7 @@ TEST_CASE("Polymorphic edge cases") {
CHECK(!empty_derived);
// Convert empty derived to base
Ref<Base> empty_base = empty_derived;
Ref<Base> empty_base = empty_derived.copy();
CHECK(!empty_base);
// Move empty derived to base
@@ -427,7 +447,7 @@ TEST_CASE("Polymorphic edge cases") {
CHECK(!empty_weak_derived.lock());
// Convert empty weak derived to weak base
WeakRef<Base> empty_weak_base = empty_weak_derived;
WeakRef<Base> empty_weak_base = empty_weak_derived.copy();
CHECK(!empty_weak_base.lock());
}
@@ -435,7 +455,7 @@ TEST_CASE("Polymorphic edge cases") {
auto derived_ref = make_ref<Derived>(5, 5);
// Ref<Derived> → WeakRef<Base>
WeakRef<Base> weak_base_from_ref = derived_ref;
WeakRef<Base> weak_base_from_ref = derived_ref.as_weak();
// WeakRef<Base> → Ref<Base> via lock
auto base_ref_from_weak = weak_base_from_ref.lock();
@@ -444,6 +464,36 @@ TEST_CASE("Polymorphic edge cases") {
CHECK(base_ref_from_weak->get_value() == 10); // 5 + 5
CHECK(base_ref_from_weak.get() == derived_ref.get());
}
SUBCASE("multiple inheritance pointer address bug test") {
auto multi_ref = make_ref<MultipleInheritance>(42);
// Get pointers to different base classes - these will have different
// addresses
Interface1 *interface1_ptr = multi_ref.get();
Interface2 *interface2_ptr = multi_ref.get();
// Verify that pointers are indeed different (demonstrating the issue)
CHECK(static_cast<void *>(interface1_ptr) !=
static_cast<void *>(interface2_ptr));
// Create WeakRef to Interface2 (which has a different pointer address)
WeakRef<Interface2> weak_interface2 = multi_ref.as_weak();
// Lock should return the correct Interface2 pointer, not miscalculated one
auto locked_interface2 = weak_interface2.lock();
CHECK(locked_interface2);
CHECK(locked_interface2.get() ==
interface2_ptr); // This might fail due to the bug!
CHECK(locked_interface2->get_interface2() == 2);
// Also test Interface1
WeakRef<Interface1> weak_interface1 = multi_ref.as_weak();
auto locked_interface1 = weak_interface1.lock();
CHECK(locked_interface1);
CHECK(locked_interface1.get() == interface1_ptr); // This might also fail!
CHECK(locked_interface1->get_interface1() == 1);
}
}
// Should be run with asan or valgrind
@@ -457,5 +507,5 @@ TEST_CASE("Self-referencing WeakRef pattern") {
WeakRef<SelfReferencing> self_;
};
auto x = make_ref<SelfReferencing>();
x->self_ = x;
x->self_ = x.as_weak();
}

173
tests/test_server.cpp Normal file
View File

@@ -0,0 +1,173 @@
#include "config.hpp"
#include "connection.hpp"
#include "connection_handler.hpp"
#include "server.hpp"
#include <doctest/doctest.h>
#include <latch>
#include <string_view>
#include <thread>
// Records the first payload it sees and wakes the test via a latch; the test
// then echoes the captured bytes back through the weakly-held connection.
struct EchoHandler : ConnectionHandler {
  Arena arena;
  std::span<std::string_view> reply;
  WeakRef<MessageSender> wconn;
  std::latch done{1};
  void on_data_arrived(std::string_view data, Connection &conn) override {
    // Hold the connection weakly; the server keeps ownership.
    wconn = conn.get_weak_ref();
    CHECK(wconn.lock());
    // Copy the payload into our arena so it outlives llhttp's buffer.
    reply = arena.allocate_span<std::string_view>(1);
    reply[0] = arena.copy_string(data);
    // Signal last so all fields above are populated before the test proceeds.
    done.count_down();
  }
};
// End-to-end echo: write 5 bytes, have the handler capture them, append them
// back onto the connection, and read them from the client socket.
//
// Fix: the original issued a single read() and REQUIREd exactly 5 bytes, but
// a stream socket read may legally return fewer bytes than requested; loop
// until all 5 echoed bytes have arrived.
TEST_CASE("Echo test") {
  EchoHandler handler;
  weaseldb::Config config;
  auto server = Server::create(config, handler, {});
  int fd = server->create_local_connection();
  auto runThread = std::thread{[&]() { server->run(); }};
  int w = write(fd, "hello", 5);
  REQUIRE(w == 5);
  handler.done.wait();
  if (auto conn = handler.wconn.lock()) {
    // Cast to Connection* to access append_bytes (not available on
    // MessageSender)
    auto *conn_ptr = static_cast<Connection *>(conn.get());
    conn_ptr->append_bytes(std::exchange(handler.reply, {}),
                           std::move(handler.arena), ConnectionShutdown::None);
  } else {
    REQUIRE(false);
  }
  // Read until all 5 echoed bytes arrive; a single read() may return short.
  char buf[6] = {};
  int total = 0;
  while (total < 5) {
    int r = read(fd, buf + total, 5 - total);
    REQUIRE(r > 0);
    total += r;
  }
  CHECK(std::string(buf) == "hello");
  close(fd);
  server->shutdown();
  runThread.join();
}
// Captures the first chunk of data and signals lifecycle events via latches
// so the shutdown tests can synchronize with the server deterministically.
struct ShutdownTestHandler : ConnectionHandler {
  Arena arena;
  std::span<std::string_view> reply;
  WeakRef<MessageSender> wconn;
  std::latch received_data{1};
  std::latch connection_closed_latch{1};
  ConnectionShutdown shutdown_mode = ConnectionShutdown::None;
  std::atomic<bool> connection_closed{false};
  void on_data_arrived(std::string_view data, Connection &conn) override {
    // Remember the connection weakly and stash the payload in our arena.
    wconn = conn.get_weak_ref();
    reply = arena.allocate_span<std::string_view>(1);
    reply[0] = arena.copy_string(data);
    // Count down last so every field above is visible to the waiting test.
    received_data.count_down();
  }
  void on_connection_closed(Connection &) override {
    connection_closed = true;
    connection_closed_latch.count_down();
  }
};
// Verifies ConnectionShutdown::WriteOnly: the server shuts down only its
// write side (client sees EOF) while the connection object stays alive.
//
// Fix: the original issued a single read() and REQUIREd exactly 4 bytes; a
// stream read may legally return fewer bytes. Loop until the full response
// has arrived.
TEST_CASE("Connection shutdown write-only mode") {
  ShutdownTestHandler handler;
  handler.shutdown_mode = ConnectionShutdown::WriteOnly;
  weaseldb::Config config;
  auto server = Server::create(config, handler, {});
  int fd = server->create_local_connection();
  auto runThread = std::thread{[&]() { server->run(); }};
  // Send data to trigger handler
  int w = write(fd, "test", 4);
  REQUIRE(w == 4);
  handler.received_data.wait();
  // Send response with write shutdown
  if (auto conn = handler.wconn.lock()) {
    auto *conn_ptr = static_cast<Connection *>(conn.get());
    conn_ptr->append_bytes(std::exchange(handler.reply, {}),
                           std::move(handler.arena),
                           ConnectionShutdown::WriteOnly);
  } else {
    REQUIRE(false);
  }
  // Read the full 4-byte response; loop because read() may return short.
  char buf[5] = {};
  int total = 0;
  while (total < 4) {
    int r = read(fd, buf + total, 4 - total);
    REQUIRE(r > 0);
    total += r;
  }
  CHECK(std::string(buf) == "test");
  // After write shutdown, we should get EOF when trying to read more
  char extra_buf[1];
  int eof_result = read(fd, extra_buf, 1);
  CHECK(eof_result == 0); // EOF indicates successful write shutdown
  // Connection should still be alive (not closed) after write shutdown
  // We can verify this by checking that we can still write to the socket
  int write_result = write(fd, "x", 1);
  CHECK(write_result == 1); // Should succeed - connection still alive
  CHECK(handler.connection_closed.load() ==
        false); // Connection should still be alive
  close(fd);
  server->shutdown();
  runThread.join();
}
// Verifies ConnectionShutdown::Full: after the response is flushed the server
// closes the connection entirely and on_connection_closed fires.
//
// Fix: the original issued a single read() and REQUIREd exactly 4 bytes; a
// stream read may legally return fewer bytes. Loop until the full response
// has arrived before checking for EOF.
TEST_CASE("Connection shutdown full mode") {
  ShutdownTestHandler handler;
  handler.shutdown_mode = ConnectionShutdown::Full;
  weaseldb::Config config;
  auto server = Server::create(config, handler, {});
  int fd = server->create_local_connection();
  auto runThread = std::thread{[&]() { server->run(); }};
  // Send data to trigger handler
  int w = write(fd, "test", 4);
  REQUIRE(w == 4);
  handler.received_data.wait();
  // Send response with full shutdown
  if (auto conn = handler.wconn.lock()) {
    auto *conn_ptr = static_cast<Connection *>(conn.get());
    conn_ptr->append_bytes(std::exchange(handler.reply, {}),
                           std::move(handler.arena), ConnectionShutdown::Full);
  } else {
    REQUIRE(false);
  }
  // Read the full 4-byte response; loop because read() may return short.
  char buf[5] = {};
  int total = 0;
  while (total < 4) {
    int r = read(fd, buf + total, 4 - total);
    REQUIRE(r > 0);
    total += r;
  }
  CHECK(std::string(buf) == "test");
  // Connection should be closed by server (full shutdown)
  char extra_buf[1];
  int close_result = read(fd, extra_buf, 1);
  CHECK(close_result == 0); // EOF indicates connection was closed
  // Wait for connection closed callback to be called
  handler.connection_closed_latch.wait();
  CHECK(handler.connection_closed.load() == true);
  close(fd);
  server->shutdown();
  runThread.join();
}

View File

@@ -1,110 +0,0 @@
#include "../src/thread_pipeline.hpp"
#include "config.hpp"
#include "connection.hpp"
#include "perfetto_categories.hpp"
#include "server.hpp"
#include <cstring>
#include <doctest/doctest.h>
#include <thread>
// Perfetto static storage for tests
PERFETTO_TRACK_EVENT_STATIC_STORAGE();
// One unit of work flowing through the test pipeline.
//
// Fix: `done` was left uninitialized; the consumer thread reads it before any
// producer may have assigned the slot, which is a read of an indeterminate
// value. Default-initialize it to false.
struct Message {
  std::unique_ptr<Connection> conn; // connection ownership, moved in and out
  std::string data;                 // bytes to echo back to the client
  bool done = false;                // sentinel: true tells the consumer to exit
};
// Handler that transfers ownership of each incoming connection into the
// pipeline so a separate thread can echo the data and release the connection
// back to the server.
struct EchoHandler : public ConnectionHandler {
private:
StaticThreadPipeline<Message, WaitStrategy::WaitIfStageEmpty, 1> &pipeline;
public:
explicit EchoHandler(
StaticThreadPipeline<Message, WaitStrategy::WaitIfStageEmpty, 1>
&pipeline)
: pipeline(pipeline) {}
// Called by the server with incoming bytes; moves the connection (ownership
// transfer) and a copy of the data into the next pipeline slot.
void on_data_arrived(std::string_view data,
std::unique_ptr<Connection> &conn_ptr) override {
assert(conn_ptr);
auto guard = pipeline.push(1, true);
for (auto &message : guard.batch) {
message.conn = std::move(conn_ptr);
message.data = data;
message.done = false;
}
}
};
// Verifies that a connection handed off through the pipeline can be released
// back to the server and that the echoed bytes reach the client intact.
TEST_CASE(
"Server correctly handles connection ownership transfer via pipeline") {
weaseldb::Config config;
config.server.io_threads = 1;
config.server.epoll_instances = 1;
// Single-stage pipeline: the I/O handler pushes, echoThread consumes.
StaticThreadPipeline<Message, WaitStrategy::WaitIfStageEmpty, 1> pipeline{10};
EchoHandler handler{pipeline};
auto echoThread = std::thread{[&]() {
for (;;) {
auto guard = pipeline.acquire<0, 0>();
for (auto &message : guard.batch) {
bool done = message.done;
if (done) {
return;
}
assert(message.conn);
message.conn->append_message(message.data);
// Ownership returns to the server, which performs the actual send.
Server::release_back_to_server(std::move(message.conn));
}
}
}};
// Create server with NO listen sockets (empty vector)
auto server = Server::create(config, handler, {});
std::thread server_thread([&server]() { server->run(); });
// Create local connection
int client_fd = server->create_local_connection();
REQUIRE(client_fd > 0);
// Write some test data
const char *test_message = "Hello, World!";
ssize_t bytes_written;
do {
bytes_written = write(client_fd, test_message, std::strlen(test_message));
} while (bytes_written == -1 && errno == EINTR);
REQUIRE(bytes_written == std::strlen(test_message));
// Read the echoed response
char buffer[1024] = {0};
ssize_t bytes_read;
do {
bytes_read = read(client_fd, buffer, sizeof(buffer) - 1);
} while (bytes_read == -1 && errno == EINTR);
if (bytes_read == -1) {
perror("read failed");
}
REQUIRE(bytes_read == std::strlen(test_message));
// Verify we got back exactly what we sent
CHECK(std::string(buffer, bytes_read) == std::string(test_message));
// Cleanup
int e = close(client_fd);
if (e == -1 && errno != EINTR) {
perror("close client_fd");
std::abort();
}
server->shutdown();
server_thread.join();
// Push a sentinel "done" message so echoThread exits its loop.
{
auto guard = pipeline.push(1, true);
for (auto &message : guard.batch) {
message.done = true;
}
}
echoThread.join();
}

View File

@@ -7,12 +7,14 @@ WeaselDB's /ok health check endpoint achieves 1M requests/second with 740ns of c
## Performance Metrics
### Throughput
- **1.0M requests/second** /ok health check endpoint (4-stage commit pipeline)
- 8 I/O threads with 8 epoll instances
- Load tester used 12 network threads
- **0% CPU usage when idle** (optimized futex wake implementation)
### Threading Architecture
- **Four-stage commit pipeline**: Sequence → Resolve → Persist → Release
- Lock-free coordination using atomic ring buffer
- **Optimized futex wake**: Only wake on final pipeline stage
@@ -21,6 +23,7 @@ WeaselDB's /ok health check endpoint achieves 1M requests/second with 740ns of c
### Performance Characteristics
**Health Check Pipeline (/ok endpoint)**:
- **Throughput**: 1.0M requests/second
- **Configurable CPU work**: 740ns (4000 iterations, validated with nanobench)
- **Theoretical maximum CPU time**: 1000ns (1,000,000,000ns ÷ 1,000,000 req/s)
@@ -31,23 +34,27 @@ WeaselDB's /ok health check endpoint achieves 1M requests/second with 740ns of c
### Key Optimizations
**Futex Wake Reduction**:
- **Previous approach**: Futex wake at every pipeline stage (10% CPU overhead)
- **Optimized approach**: Futex wake only at final stage to wake producers. Stages now do their futex wait on the beginning of the pipeline instead of the previous stage.
- **Result**: 23% increase in serial CPU budget (396ns → 488ns)
- **Benefits**: Higher throughput per CPU cycle + idle efficiency
**CPU-Friendly Spin Loop**:
- **Added**: `_mm_pause()` intrinsics in polling loop to reduce power consumption and improve hyperthreading efficiency
- **Maintained**: 100,000 spin iterations necessary to prevent thread descheduling
- **Result**: Same throughput with more efficient spinning
**Resolve Batch Size Optimization**:
- **Changed**: Resolve max batch size from unlimited to 1
- **Mechanism**: Single-item processing checks for work more frequently, keeping the thread in fast coordination paths instead of expensive spin/wait cycles
### Request Flow
**Health Check Pipeline** (/ok endpoint):
```
I/O Threads (8) → HttpHandler::on_batch_complete() → Commit Pipeline
↑ ↓
@@ -59,10 +66,10 @@ I/O Threads (8) → HttpHandler::on_batch_complete() → Commit Pipeline
| Stage 2: Persist (generate response)
| (send "OK" response)
| ↓
| Stage 3: Release (connection return)
| Stage 3: Release (wake I/O threads)
| (optimized futex wake)
| ↓
└─────────────────────── Server::release_back_to_server()
└─────────────────────── I/O threads send response to client
```
## Test Configuration

16
todo.md
View File

@@ -3,16 +3,19 @@
## 📋 Planned Tasks
### Core Database Features
- [ ] Design commit pipeline architecture with three-stage processing
- [ ] Stage 1: Version assignment and precondition validation thread
- [ ] Stage 2: Transaction persistence and subscriber streaming thread
- [ ] Stage 3: Connection return to server thread
- [ ] Design commit pipeline architecture with four-stage processing
- [ ] Stage 0: Sequence assignment and request validation
- [ ] Stage 1: Precondition resolution and conflict detection
- [ ] Stage 2: Transaction persistence and subscriber streaming
- [ ] Stage 3: Response generation and connection cleanup
- [ ] Use ThreadPipeline for inter-stage communication
- [ ] Design persistence interface for pluggable storage backends (S3, local disk)
- [ ] Integrate https://git.weaselab.dev/weaselab/conflict-set for optimistic concurrency control
- [ ] Design and architect the subscription component for change streams
### API Endpoints Implementation
- [ ] Implement `GET /v1/version` endpoint to return latest committed version and leader
- [ ] Implement `POST /v1/commit` endpoint for transaction submission with precondition validation
- [ ] Implement `GET /v1/status` endpoint for commit request status lookup by request_id
@@ -23,6 +26,7 @@
- [ ] Implement `DELETE /v1/retention/<policy_id>` endpoint for retention policy removal
### Infrastructure & Tooling
- [x] Implement thread-safe Prometheus metrics library and serve `GET /metrics` endpoint
- [ ] Implement gperf-based HTTP routing for efficient request dispatching
- [ ] Replace nlohmann/json with simdjson DOM API in parser comparison benchmarks
@@ -54,6 +58,7 @@
- [ ] Implement `DeleteObjects` for batch object deletion
### Client Libraries
- [ ] Implement high-level Python client library for WeaselDB REST API
- [ ] Wrap `/v1/version`, `/v1/commit`, `/v1/status` endpoints
- [ ] Handle `/v1/subscribe` SSE streaming with reconnection logic
@@ -64,6 +69,7 @@
- [ ] Provide CLI tooling for database administration
### Testing & Validation
- [ ] Build out-of-process API test suite using client library over real TCP
- [ ] Test all `/v1/version`, `/v1/commit`, `/v1/status` endpoints
- [ ] Test `/v1/subscribe` Server-Sent Events streaming
@@ -79,6 +85,6 @@
- [x] Built streaming JSON parser for commit requests with high-performance parsing
- [x] Implemented HTTP server with multi-threaded networking using multiple epoll instances
- [x] Created threading model with pipeline for serial request processing for optimistic concurrency control
- [x] Designed connection ownership transfer system to enable the serial processing model
- [x] Implemented server-owned connection model with WeakRef pattern for safe concurrent access
- [x] Implemented arena-per-connection memory model for clean memory lifetime management
- [x] Built TOML configuration system for server settings

View File

@@ -297,10 +297,10 @@ struct Connection {
}
}
bool writeBytes() {
bool write_bytes() {
for (;;) {
assert(!request.empty());
int w = write(fd, request.data(), request.size());
int w = send(fd, request.data(), request.size(), MSG_NOSIGNAL);
if (w == -1) {
if (errno == EINTR) {
continue;
@@ -610,7 +610,6 @@ int main(int argc, char *argv[]) {
}
printf("\n");
signal(SIGPIPE, SIG_IGN);
signal(SIGTERM, signal_handler);
signal(SIGINT, signal_handler);
@@ -673,7 +672,7 @@ int main(int argc, char *argv[]) {
continue; // Let unique_ptr destructor clean up
}
if (events[i].events & EPOLLOUT) {
bool finished = conn->writeBytes();
bool finished = conn->write_bytes();
if (conn->error) {
continue;
}
@@ -749,14 +748,14 @@ int main(int argc, char *argv[]) {
// Try to write once in the connect thread before handing off to network
// threads
assert(conn->has_messages());
bool writeFinished = conn->writeBytes();
bool write_finished = conn->write_bytes();
if (conn->error) {
continue; // Connection failed, destructor will clean up
}
// Determine the appropriate epoll events based on write result
struct epoll_event event{};
if (writeFinished) {
if (write_finished) {
// All data was written, wait for response
int shutdown_result = shutdown(conn->fd, SHUT_WR);
if (shutdown_result == -1) {