Compare commits

...

55 Commits

Author SHA1 Message Date
f458c6b249 Make pipeline policy/topology configurable 2025-11-06 15:55:27 -05:00
9f8562e30f Fix histogram thread death bug 2025-09-18 12:39:47 -04:00
cb66d65479 Add a bit more precision to docs, plus philosophy 2025-09-16 00:57:20 -04:00
4d015fa3dc Make recording metrics never block 2025-09-15 23:34:30 -04:00
0659319906 Prepare for try_lock optimization
So histogram observations never block
2025-09-15 23:04:54 -04:00
6f421629aa Add comments to avoid blocking in sequence and resolve stages 2025-09-15 22:51:41 -04:00
ba59a992dd Document per-connection locking strategy 2025-09-15 22:33:52 -04:00
0d76c73077 Only shut down write side for http 2025-09-15 21:42:36 -04:00
4ecbc07367 Fix flaky connection shutdown test 2025-09-15 21:39:48 -04:00
5e625197aa Fix url accumulation bug 2025-09-15 21:36:49 -04:00
5dda7353fa Add test for url accumulation bug 2025-09-15 21:36:49 -04:00
345d8e21b2 Use WaitIfUpstreamIdle 2025-09-15 20:48:12 -04:00
917066d8c0 Move thread local state into stack 2025-09-15 20:33:45 -04:00
5a88047b9f Two release threads 2025-09-15 20:09:15 -04:00
1acdc1e753 Put Arena destructor and move constructors in header 2025-09-15 17:01:38 -04:00
ae0c014298 Don't clear write interest if pending_response_queue_ non empty 2025-09-15 16:35:37 -04:00
e2115152c8 Add tests for shutdown vs close 2025-09-15 15:48:10 -04:00
55f6ebc02b Implement shutting down the write-side only 2025-09-15 15:39:28 -04:00
6b52c4289c Prevent queueing of messages on connection after it will be closed 2025-09-15 15:25:40 -04:00
7ee5ca2a9b Remove dead code, use proper send_ordered_response
And prepare to try to close a connection gracefully
2025-09-15 15:08:19 -04:00
9120c05847 Use spend_cpu_cycles instead of volatile loop 2025-09-15 14:07:38 -04:00
528a467518 Make test names match binary names 2025-09-15 13:06:15 -04:00
a67d7a8531 Consolidate into one send_ordered_response 2025-09-15 13:01:56 -04:00
6717b70772 Remove HttpConnectionState::response_queue_mutex 2025-09-15 12:32:19 -04:00
34accb9d80 Move GetVersion to commit pipeline 2025-09-15 12:30:02 -04:00
f3c3f77a24 Extract commit pipeline to its own module 2025-09-15 11:51:01 -04:00
afd240dba7 Remove vestigial "round-robin" code 2025-09-15 11:22:14 -04:00
1cb7a4c301 Remove has_pending_responses_ 2025-09-15 11:10:47 -04:00
1b220d0d1c WIP 2025-09-15 10:28:17 -04:00
ec2ad27e33 Add explanatory comments 2025-09-15 00:07:44 -04:00
eb98e51867 We expect to get valid fds to close in ~Connection in ~Server 2025-09-15 00:07:09 -04:00
022a79bf5b Separate HttpRequestState and HttpConnectionState
Now HttpConnectionState has a queue of HttpRequestState
2025-09-14 23:49:32 -04:00
fac6b8de88 Add test that shows parsing issue
It's meant to show the pipelining issue. I guess we'll solve the
newly-discovered parsing issue first.
2025-09-14 22:24:02 -04:00
1f61f91bf5 Reset connection state after finishing with it in http_handler 2025-09-14 21:16:41 -04:00
632113f792 Add test for pipeline request parsing bug 2025-09-14 20:53:19 -04:00
f62770c4ab Add copying utility methods to Arena 2025-09-14 20:38:54 -04:00
147edf5c93 More cleanup 2025-09-14 20:27:14 -04:00
f39149d516 Update documentation with new networking model 2025-09-14 19:03:56 -04:00
0389fd2c9f Consistently use state->arena for http handling 2025-09-14 17:16:05 -04:00
7ef54a2d08 Call epoll_ctl in release stage 2025-09-14 16:28:12 -04:00
16c7ee0408 Separate Connection and Request lifetimes 2025-09-14 15:04:37 -04:00
cf0c1b7cc2 Add echo test for server 2025-09-14 12:56:22 -04:00
bd06798fd3 Remove test_http_handler and test_server_connection_return 2025-09-14 11:38:43 -04:00
e96a493835 Remove release_back_to_server 2025-09-14 09:03:05 -04:00
e887906da8 Remove some unused/indirectly used headers 2025-09-13 17:25:46 -04:00
de6f38694f std::unique_ptr<Connection> -> Ref<Connection> 2025-09-13 17:25:46 -04:00
1fa3381e4b Use send/sendmsg and don't ignore SIGPIPE 2025-09-13 17:25:20 -04:00
cd2e15677a Remove epoll instances config 2025-09-12 18:05:07 -04:00
2b8f095d27 Fix minor issues 2025-09-12 12:13:50 -04:00
543447971f Fix polymorphic WeakRef bug 2025-09-12 12:08:46 -04:00
f89868058a Require explicit copies for Ref/WeakRef 2025-09-12 11:59:56 -04:00
674ff581e7 Update comments/docs to match code 2025-09-12 11:40:38 -04:00
be5a0c6d8e Update some inaccuracies in markdown files 2025-09-12 11:31:22 -04:00
bf90b8856a Add mdformat pre-commit hook 2025-09-12 11:24:16 -04:00
9d48caca76 add end-of-file-fixer 2025-09-12 11:21:00 -04:00
56 changed files with 3463 additions and 2427 deletions

View File

@@ -3,11 +3,13 @@ repos:
rev: 3e8a8703264a2f4a69428a0aa4dcb512790b2c8c # frozen: v6.0.0 rev: 3e8a8703264a2f4a69428a0aa4dcb512790b2c8c # frozen: v6.0.0
hooks: hooks:
- id: trailing-whitespace - id: trailing-whitespace
- id: end-of-file-fixer
exclude: ".*third_party/.*"
- id: check-added-large-files - id: check-added-large-files
- id: check-merge-conflict - id: check-merge-conflict
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: 182152eb8c5ce1cf5299b956b04392c86bd8a126 # frozen: v20.1.8 rev: 86fdcc9bd34d6afbbd29358b97436c8ffe3aa3b2 # frozen: v21.1.0
hooks: hooks:
- id: clang-format - id: clang-format
exclude: ".*third_party/.*" exclude: ".*third_party/.*"
@@ -23,6 +25,11 @@ repos:
- id: black - id: black
language_version: python3 language_version: python3
- repo: https://github.com/executablebooks/mdformat
rev: ff29be1a1ba8029d9375882aa2c812b62112a593 # frozen: 0.7.22
hooks:
- id: mdformat
- repo: local - repo: local
hooks: hooks:
- id: snake-case-enforcement - id: snake-case-enforcement

View File

@@ -139,6 +139,7 @@ target_link_libraries(nanobench_impl PUBLIC nanobench)
# Define all source files in one place # Define all source files in one place
set(WEASELDB_SOURCES set(WEASELDB_SOURCES
src/arena.cpp src/arena.cpp
src/commit_pipeline.cpp
src/cpu_work.cpp src/cpu_work.cpp
src/format.cpp src/format.cpp
src/metric.cpp src/metric.cpp
@@ -188,6 +189,11 @@ add_executable(test_arena tests/test_arena.cpp)
target_link_libraries(test_arena doctest_impl weaseldb_sources_debug) target_link_libraries(test_arena doctest_impl weaseldb_sources_debug)
target_compile_options(test_arena PRIVATE -UNDEBUG) target_compile_options(test_arena PRIVATE -UNDEBUG)
add_executable(test_server tests/test_server.cpp)
target_link_libraries(test_server doctest_impl weaseldb_sources_debug)
target_compile_options(test_server PRIVATE -UNDEBUG)
add_test(NAME test_server COMMAND test_server)
add_executable( add_executable(
test_commit_request test_commit_request
tests/test_commit_request.cpp tests/nlohmann_reference_parser.cpp tests/test_commit_request.cpp tests/nlohmann_reference_parser.cpp
@@ -197,23 +203,19 @@ target_link_libraries(test_commit_request doctest_impl weaseldb_sources_debug
target_include_directories(test_commit_request PRIVATE tests) target_include_directories(test_commit_request PRIVATE tests)
target_compile_options(test_commit_request PRIVATE -UNDEBUG) target_compile_options(test_commit_request PRIVATE -UNDEBUG)
add_executable(test_http_handler tests/test_http_handler.cpp)
target_link_libraries(test_http_handler doctest_impl weaseldb_sources_debug)
target_compile_options(test_http_handler PRIVATE -UNDEBUG)
add_executable(test_server_connection_return
tests/test_server_connection_return.cpp)
target_link_libraries(test_server_connection_return doctest_impl
weaseldb_sources_debug)
target_compile_options(test_server_connection_return PRIVATE -UNDEBUG)
# Metrics system test # Metrics system test
add_executable(test_metric tests/test_metric.cpp) add_executable(test_metric tests/test_metric.cpp)
target_link_libraries(test_metric doctest_impl weaseldb_sources_debug) target_link_libraries(test_metric doctest_impl weaseldb_sources_debug)
target_compile_options(test_metric PRIVATE -UNDEBUG) target_compile_options(test_metric PRIVATE -UNDEBUG)
# HTTP handler test
add_executable(test_http_handler tests/test_http_handler.cpp)
target_link_libraries(test_http_handler doctest_impl weaseldb_sources_debug)
target_compile_options(test_http_handler PRIVATE -UNDEBUG)
add_test(NAME test_http_handler COMMAND test_http_handler)
# Register with CTest # Register with CTest
add_test(NAME metric_tests COMMAND test_metric) add_test(NAME test_metric COMMAND test_metric)
add_executable(bench_arena benchmarks/bench_arena.cpp) add_executable(bench_arena benchmarks/bench_arena.cpp)
target_link_libraries(bench_arena nanobench_impl weaseldb_sources) target_link_libraries(bench_arena nanobench_impl weaseldb_sources)
@@ -231,7 +233,8 @@ target_link_libraries(bench_parser_comparison nanobench_impl weaseldb_sources
target_include_directories(bench_parser_comparison target_include_directories(bench_parser_comparison
PRIVATE ${rapidjson_SOURCE_DIR}/include) PRIVATE ${rapidjson_SOURCE_DIR}/include)
add_executable(bench_thread_pipeline benchmarks/bench_thread_pipeline.cpp) add_executable(bench_thread_pipeline benchmarks/bench_thread_pipeline.cpp
src/cpu_work.cpp)
target_link_libraries(bench_thread_pipeline nanobench_impl Threads::Threads) target_link_libraries(bench_thread_pipeline nanobench_impl Threads::Threads)
target_include_directories(bench_thread_pipeline PRIVATE src) target_include_directories(bench_thread_pipeline PRIVATE src)
@@ -253,11 +256,8 @@ target_link_libraries(debug_arena weaseldb_sources)
add_executable(load_tester tools/load_tester.cpp) add_executable(load_tester tools/load_tester.cpp)
target_link_libraries(load_tester Threads::Threads llhttp_static perfetto) target_link_libraries(load_tester Threads::Threads llhttp_static perfetto)
add_test(NAME arena_tests COMMAND test_arena) add_test(NAME test_arena COMMAND test_arena)
add_test(NAME commit_request_tests COMMAND test_commit_request) add_test(NAME test_commit_request COMMAND test_commit_request)
add_test(NAME http_handler_tests COMMAND test_http_handler)
add_test(NAME server_connection_return_tests
COMMAND test_server_connection_return)
add_test(NAME arena_benchmarks COMMAND bench_arena) add_test(NAME arena_benchmarks COMMAND bench_arena)
add_test(NAME commit_request_benchmarks COMMAND bench_commit_request) add_test(NAME commit_request_benchmarks COMMAND bench_commit_request)
add_test(NAME parser_comparison_benchmarks COMMAND bench_parser_comparison) add_test(NAME parser_comparison_benchmarks COMMAND bench_parser_comparison)
@@ -267,14 +267,14 @@ add_test(NAME format_comparison_benchmarks COMMAND bench_format_comparison)
add_executable(test_api_url_parser tests/test_api_url_parser.cpp) add_executable(test_api_url_parser tests/test_api_url_parser.cpp)
target_link_libraries(test_api_url_parser doctest_impl weaseldb_sources_debug) target_link_libraries(test_api_url_parser doctest_impl weaseldb_sources_debug)
target_compile_options(test_api_url_parser PRIVATE -UNDEBUG) target_compile_options(test_api_url_parser PRIVATE -UNDEBUG)
add_test(NAME api_url_parser_tests COMMAND test_api_url_parser) add_test(NAME test_api_url_parser COMMAND test_api_url_parser)
# Reference counting tests and benchmarks # Reference counting tests and benchmarks
add_executable(test_reference tests/test_reference.cpp) add_executable(test_reference tests/test_reference.cpp)
target_link_libraries(test_reference doctest_impl) target_link_libraries(test_reference doctest_impl)
target_include_directories(test_reference PRIVATE src) target_include_directories(test_reference PRIVATE src)
target_compile_options(test_reference PRIVATE -UNDEBUG) target_compile_options(test_reference PRIVATE -UNDEBUG)
add_test(NAME reference_tests COMMAND test_reference) add_test(NAME test_reference COMMAND test_reference)
add_executable(bench_reference benchmarks/bench_reference.cpp) add_executable(bench_reference benchmarks/bench_reference.cpp)
target_link_libraries(bench_reference doctest_impl nanobench_impl target_link_libraries(bench_reference doctest_impl nanobench_impl

64
api.md
View File

@@ -2,7 +2,7 @@
> **Note:** This is a design for the API of the write-side of a database system where writing and reading are decoupled. The read-side of the system is expected to use the `/v1/subscribe` endpoint to maintain a queryable representation of the key-value data. In other words, reading from this "database" is left as an exercise for the reader. Authentication and authorization are out of scope for this design. > **Note:** This is a design for the API of the write-side of a database system where writing and reading are decoupled. The read-side of the system is expected to use the `/v1/subscribe` endpoint to maintain a queryable representation of the key-value data. In other words, reading from this "database" is left as an exercise for the reader. Authentication and authorization are out of scope for this design.
----- ______________________________________________________________________
## `GET /v1/version` ## `GET /v1/version`
@@ -20,16 +20,16 @@ Retrieves the latest known committed version and the current leader.
} }
``` ```
----- ______________________________________________________________________
## `POST /v1/commit` ## `POST /v1/commit`
Submits a transaction to be committed. The transaction consists of read preconditions, writes, and deletes. Submits a transaction to be committed. The transaction consists of read preconditions, writes, and deletes.
* Clients may receive a **`413 Content Too Large`** response if the request exceeds a configurable limit. - Clients may receive a **`413 Content Too Large`** response if the request exceeds a configurable limit.
* A malformed request will result in a **`400 Bad Request`** response. - A malformed request will result in a **`400 Bad Request`** response.
* Keys are sorted by a lexicographical comparison of their raw byte values. - Keys are sorted by a lexicographical comparison of their raw byte values.
* All binary data for keys and values must be encoded using the standard base64 scheme defined in [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-4), with padding included. - All binary data for keys and values must be encoded using the standard base64 scheme defined in [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-4), with padding included.
### Request ### Request
@@ -91,24 +91,26 @@ Submits a transaction to be committed. The transaction consists of read precondi
// If not committed, a more recent version that the client can use to retry. // If not committed, a more recent version that the client can use to retry.
"version": 123456, "version": 123456,
// The unique ID of the leader at this version. // The unique ID of the leader at this version.
"leader_id": "abcdefg" "leader_id": "abcdefg",
// Echo back the request_id if it was provided in the original request
"request_id": "abcdefg"
} }
``` ```
### Detailed Notes for `/v1/commit` ### Detailed Notes for `/v1/commit`
1. **`request_id`**: Optional field that can be used with `/v1/status` to determine the outcome if no reply is received. If omitted, a UUID will be automatically generated by the server, and clients will not be able to determine commit status if there's no response. When provided, the request_id must meet the minimum length requirement (configurable, default 20 characters) to ensure sufficient entropy for collision avoidance. This ID must not be reused in a commit request. For idempotency, if a response is not received, the client must use `/v1/status` to determine the request's outcome. The original `request_id` should not be reused for a new commit attempt; instead, a retry should be sent with a new `request_id`. The alternative design would require the leader to store every request ID in memory. 1. **`request_id`**: Optional field that can be used with `/v1/status` to determine the outcome if no reply is received. If omitted, a UUID will be automatically generated by the server, and clients will not be able to determine commit status if there's no response. When provided, the request_id must meet the minimum length requirement (configurable, default 20 characters) to ensure sufficient entropy for collision avoidance. This ID must not be reused in a commit request. For idempotency, if a response is not received, the client must use `/v1/status` to determine the request's outcome. The original `request_id` should not be reused for a new commit attempt; instead, a retry should be sent with a new `request_id`. The alternative design would require the leader to store every request ID in memory.
2. **`preconditions` (Guarantees and Usage)**: The condition is satisfied if the server verifies that the range has not changed since the specified version. Clients can achieve serializable isolation by including all reads that influenced their writes. By default, clients should assume that any read they perform influences their writes. Omitting reads is an expert-level optimization and should generally be avoided. 1. **`preconditions` (Guarantees and Usage)**: The condition is satisfied if the server verifies that the range has not changed since the specified version. Clients can achieve serializable isolation by including all reads that influenced their writes. By default, clients should assume that any read they perform influences their writes. Omitting reads is an expert-level optimization and should generally be avoided.
3. **`preconditions` (False Positives & Leader Changes)**: Precondition checks are conservative and best-effort; it's possible to reject a transaction where the range hasn't actually changed. In all such cases, clients should retry with a more recent read version. Two examples of false positives are: 1. **`preconditions` (False Positives & Leader Changes)**: Precondition checks are conservative and best-effort; it's possible to reject a transaction where the range hasn't actually changed. In all such cases, clients should retry with a more recent read version. Two examples of false positives are:
* **Implementation Detail:** The leader may use partitioned conflict history for performance. A conflict in one partition (even from a transaction that later aborts) can cause a rejection. - **Implementation Detail:** The leader may use partitioned conflict history for performance. A conflict in one partition (even from a transaction that later aborts) can cause a rejection.
* **Leader Changes:** A version is only valid within the term of the leader that issued it. Since conflict history is stored in memory, a leadership change invalidates all previously issued read versions. Any transaction using such a version will be rejected. - **Leader Changes:** A version is only valid within the term of the leader that issued it. Since conflict history is stored in memory, a leadership change invalidates all previously issued read versions. Any transaction using such a version will be rejected.
The versions in the precondition checks need not be the same. The versions in the precondition checks need not be the same.
----- ______________________________________________________________________
## `GET /v1/status` ## `GET /v1/status`
@@ -125,7 +127,7 @@ Gets the status of a previous commit request by its `request_id`.
| `request_id` | string | Yes | The `request_id` from the original `/v1/commit` request. | | `request_id` | string | Yes | The `request_id` from the original `/v1/commit` request. |
| `min_version` | integer | Yes | An optimization that constrains the log scan. This value should be the latest version the client knew to be committed *before* sending the original request. | | `min_version` | integer | Yes | An optimization that constrains the log scan. This value should be the latest version the client knew to be committed *before* sending the original request. |
> **Warning\!** If the provided `min_version` is later than the transaction's actual commit version, the server might not find the record in the scanned portion of the log. This can result in an `id_not_found` status, even if the transaction actually committed. > **Warning!** If the provided `min_version` is later than the transaction's actual commit version, the server might not find the record in the scanned portion of the log. This can result in an `id_not_found` status, even if the transaction actually committed.
### Response ### Response
@@ -144,7 +146,7 @@ A response from this endpoint guarantees the original request is no longer in fl
> **Note on `log_truncated` status:** This indicates the `request_id` log has been truncated after `min_version`, making it impossible to determine the original request's outcome. There is no way to avoid this without storing an arbitrarily large number of request IDs. Clients must treat this as an indeterminate outcome. Retrying the transaction is unsafe unless the client has an external method to verify the original transaction's status. This error should be propagated to the caller. `request_id`s are retained for a configurable minimum time and number of versions so this should be extremely rare. > **Note on `log_truncated` status:** This indicates the `request_id` log has been truncated after `min_version`, making it impossible to determine the original request's outcome. There is no way to avoid this without storing an arbitrarily large number of request IDs. Clients must treat this as an indeterminate outcome. Retrying the transaction is unsafe unless the client has an external method to verify the original transaction's status. This error should be propagated to the caller. `request_id`s are retained for a configurable minimum time and number of versions so this should be extremely rare.
----- ______________________________________________________________________
## `GET /v1/subscribe` ## `GET /v1/subscribe`
@@ -156,10 +158,10 @@ Clients should rely on the `version` field within the `transaction` and `checkpo
### Query Parameters ### Query Parameters
| Parameter | Type | Required | Description | | Parameter | Type | Required | Description |
| :-------- | :------ | :------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | :-------- | :------ | :------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `after` | integer | No | The version after which to start streaming transactions. Defaults to streaming from the latest committed version. On reconnect, clients should set this to the last version they successfully processed. | | `after` | integer | No | The version after which to start streaming transactions. Defaults to streaming from the latest committed version. On reconnect, clients should set this to the last version they successfully processed. |
| `durable` | boolean | No | If `true` (the default), the stream sends `transaction` events only after they are durably committed. This increases latency but simplifies client logic. When `durable=true`, `checkpoint` events are not sent. | | `durable` | boolean | No | If `true` (the default), the stream sends `transaction` events only after they are durably committed. This increases latency but simplifies client logic. When `durable=true`, `checkpoint` events are not sent. |
### Server-Sent Events Stream ### Server-Sent Events Stream
@@ -169,7 +171,7 @@ The response is a stream of events compliant with the SSE protocol.
``` ```
event: transaction event: transaction
data: {"request_id":"abcdefg","version":123456,"timestamp":"2025-08-07T20:27:42.555Z","leader_id":"abcdefg","operations":[...]} data: {"request_id":"abcdefg","version":123456,"prev_version":123455,"timestamp":"2025-08-07T20:27:42.555Z","leader_id":"abcdefg","operations":[...]}
``` ```
@@ -190,11 +192,13 @@ data: {"committed_version":123456,"leader_id":"abcdefg"}
### Detailed Notes for `/v1/subscribe` ### Detailed Notes for `/v1/subscribe`
1. **Data Guarantees**: When `durable=false`, this endpoint streams *accepted*, but not necessarily *durable/committed*, transactions. *Accepted* transactions will eventually commit unless the current leader changes. 1. **Data Guarantees**: When `durable=false`, this endpoint streams *accepted*, but not necessarily *durable/committed*, transactions. *Accepted* transactions will eventually commit unless the current leader changes.
2. **Leader Changes & Reconnection**: When `durable=false`, if the leader changes, clients **must** discard all of that leader's `transaction` events received after their last-seen `checkpoint` event. They must then manually reconnect (as the server connection will likely be terminated) and restart the subscription by setting the `after` query parameter to the version specified in that last-known checkpoint. Clients should implement a randomized exponential backoff strategy (backoff with jitter) when reconnecting. 1. **Leader Changes & Reconnection**: When `durable=false`, if the leader changes, clients **must** discard all of that leader's `transaction` events received after their last-seen `checkpoint` event. They must then manually reconnect (as the server connection will likely be terminated) and restart the subscription by setting the `after` query parameter to the version specified in that last-known checkpoint. Clients should implement a randomized exponential backoff strategy (backoff with jitter) when reconnecting.
3. **Connection Handling & Errors**: The server may periodically send `keepalive` comments to prevent idle timeouts on network proxies. The server will buffer unconsumed data up to a configurable limit; if the client falls too far behind, the connection will be closed. If the `after` version has been truncated from the log, this endpoint will return a standard `410 Gone` HTTP error instead of an event stream. 1. **Gap Detection**: Each `transaction` event includes a `prev_version` field linking to the previous transaction version, forming a linked list. Clients can detect gaps in the transaction stream by checking that each transaction's `prev_version` matches the previous transaction's `version`. This ensures gapless transitions between historical data from S3 and live events from the server.
1. **Connection Handling & Errors**: The server may periodically send `keepalive` comments to prevent idle timeouts on network proxies. The server will buffer unconsumed data up to a configurable limit; if the client falls too far behind, the connection will be closed. If the `after` version has been truncated from the log, this endpoint will return a standard `410 Gone` HTTP error instead of an event stream.
## `PUT /v1/retention/<policy_id>` ## `PUT /v1/retention/<policy_id>`
@@ -211,10 +215,10 @@ Creates or updates a retention policy.
### Response ### Response
* `201 Created` if the policy was created. - `201 Created` if the policy was created.
* `200 OK` if the policy was updated. - `200 OK` if the policy was updated.
----- ______________________________________________________________________
## `GET /v1/retention/<policy_id>` ## `GET /v1/retention/<policy_id>`
@@ -228,7 +232,7 @@ Retrieves a retention policy by ID.
} }
``` ```
----- ______________________________________________________________________
## `GET /v1/retention/` ## `GET /v1/retention/`
@@ -245,7 +249,7 @@ Retrieves all retention policies.
] ]
``` ```
----- ______________________________________________________________________
## `DELETE /v1/retention/<policy_id>` ## `DELETE /v1/retention/<policy_id>`
@@ -255,7 +259,7 @@ Removes a retention policy, which may allow the log to be truncated.
`204 No Content` `204 No Content`
----- ______________________________________________________________________
## `GET /ok` ## `GET /ok`
@@ -265,7 +269,7 @@ Simple health check endpoint.
Returns `200 OK` with minimal content for basic health monitoring. Returns `200 OK` with minimal content for basic health monitoring.
----- ______________________________________________________________________
## `GET /metrics` ## `GET /metrics`

View File

@@ -76,13 +76,6 @@ private:
Precondition current_precondition; Precondition current_precondition;
Operation current_operation; Operation current_operation;
// Helper to store string in arena and return string_view
std::string_view store_string(const char *str, size_t length) {
char *stored = arena.allocate<char>(length);
std::memcpy(stored, str, length);
return std::string_view(stored, length);
}
public: public:
explicit CommitRequestArenaHandler() explicit CommitRequestArenaHandler()
: preconditions(ArenaStlAllocator<Precondition>(&arena)), : preconditions(ArenaStlAllocator<Precondition>(&arena)),
@@ -109,7 +102,7 @@ public:
bool RawNumber(const char *, rapidjson::SizeType, bool) { abort(); } bool RawNumber(const char *, rapidjson::SizeType, bool) { abort(); }
bool String(const char *str, rapidjson::SizeType length, bool) { bool String(const char *str, rapidjson::SizeType length, bool) {
std::string_view value = store_string(str, length); std::string_view value = arena.copy_string({str, length});
if (state == State::Root) { if (state == State::Root) {
if (current_key == "request_id") { if (current_key == "request_id") {

View File

@@ -27,6 +27,18 @@ template <typename T> struct PointerTraits<std::shared_ptr<T>> {
return std::make_shared<T>(std::forward<Args>(args)...); return std::make_shared<T>(std::forward<Args>(args)...);
} }
static pointer_type copy(const pointer_type &ptr) {
return ptr; // std::shared_ptr copies implicitly
}
static weak_type as_weak(const pointer_type &ptr) {
return ptr; // std::weak_ptr converts implicitly from std::shared_ptr
}
static weak_type copy_weak(const weak_type &weak) {
return weak; // std::weak_ptr copies implicitly
}
static const char *name() { return "std::shared_ptr"; } static const char *name() { return "std::shared_ptr"; }
static const char *weak_name() { return "std::weak_ptr"; } static const char *weak_name() { return "std::weak_ptr"; }
}; };
@@ -39,6 +51,18 @@ template <typename T> struct PointerTraits<Ref<T>> {
return make_ref<T>(std::forward<Args>(args)...); return make_ref<T>(std::forward<Args>(args)...);
} }
static pointer_type copy(const pointer_type &ptr) {
return ptr.copy(); // Ref requires explicit copy
}
static weak_type as_weak(const pointer_type &ptr) {
return ptr.as_weak(); // Ref requires explicit as_weak
}
static weak_type copy_weak(const weak_type &weak) {
return weak.copy(); // WeakRef requires explicit copy
}
static const char *name() { return "Ref"; } static const char *name() { return "Ref"; }
static const char *weak_name() { return "WeakRef"; } static const char *weak_name() { return "WeakRef"; }
}; };
@@ -67,7 +91,7 @@ void benchmark_copy(ankerl::nanobench::Bench &bench) {
auto original = Traits::make(TestObject{123}); auto original = Traits::make(TestObject{123});
bench.run(std::string(Traits::name()) + " copy", [&] { bench.run(std::string(Traits::name()) + " copy", [&] {
auto copy = original; auto copy = Traits::copy(original);
ankerl::nanobench::doNotOptimizeAway(copy); ankerl::nanobench::doNotOptimizeAway(copy);
}); });
} }
@@ -91,9 +115,9 @@ void benchmark_weak_copy(ankerl::nanobench::Bench &bench) {
force_multithreaded(); force_multithreaded();
auto strong_ptr = Traits::make(TestObject{123}); auto strong_ptr = Traits::make(TestObject{123});
typename Traits::weak_type weak_original = strong_ptr; typename Traits::weak_type weak_original = Traits::as_weak(strong_ptr);
bench.run(std::string(Traits::weak_name()) + " copy", [&] { bench.run(std::string(Traits::weak_name()) + " copy", [&] {
auto weak_copy = weak_original; auto weak_copy = Traits::copy_weak(weak_original);
ankerl::nanobench::doNotOptimizeAway(weak_copy); ankerl::nanobench::doNotOptimizeAway(weak_copy);
}); });
} }
@@ -103,7 +127,7 @@ void benchmark_weak_move(ankerl::nanobench::Bench &bench) {
using Traits = PointerTraits<PtrType>; using Traits = PointerTraits<PtrType>;
auto strong_ptr = Traits::make(TestObject{123}); auto strong_ptr = Traits::make(TestObject{123});
typename Traits::weak_type weak_original = strong_ptr; typename Traits::weak_type weak_original = Traits::as_weak(strong_ptr);
bench.run(std::string(Traits::weak_name()) + " move", [&] { bench.run(std::string(Traits::weak_name()) + " move", [&] {
auto weak_moved = std::move(weak_original); auto weak_moved = std::move(weak_original);
ankerl::nanobench::doNotOptimizeAway(weak_moved); ankerl::nanobench::doNotOptimizeAway(weak_moved);
@@ -126,7 +150,7 @@ void benchmark_weak_lock_success(ankerl::nanobench::Bench &bench) {
using Traits = PointerTraits<PtrType>; using Traits = PointerTraits<PtrType>;
auto strong_ptr = Traits::make(TestObject{789}); auto strong_ptr = Traits::make(TestObject{789});
typename Traits::weak_type weak_ptr = strong_ptr; typename Traits::weak_type weak_ptr = Traits::as_weak(strong_ptr);
bench.run(std::string(Traits::weak_name()) + " lock success", [&] { bench.run(std::string(Traits::weak_name()) + " lock success", [&] {
auto locked = weak_ptr.lock(); auto locked = weak_ptr.lock();
ankerl::nanobench::doNotOptimizeAway(locked); ankerl::nanobench::doNotOptimizeAway(locked);
@@ -140,7 +164,7 @@ void benchmark_weak_lock_failure(ankerl::nanobench::Bench &bench) {
typename Traits::weak_type weak_ptr; typename Traits::weak_type weak_ptr;
{ {
auto strong_ptr = Traits::make(TestObject{999}); auto strong_ptr = Traits::make(TestObject{999});
weak_ptr = strong_ptr; weak_ptr = Traits::as_weak(strong_ptr);
} }
bench.run(std::string(Traits::weak_name()) + " lock failure", [&] { bench.run(std::string(Traits::weak_name()) + " lock failure", [&] {
auto locked = weak_ptr.lock(); auto locked = weak_ptr.lock();
@@ -163,7 +187,7 @@ void benchmark_multithreaded_copy(ankerl::nanobench::Bench &bench,
for (int i = 0; i < num_threads - 1; ++i) { for (int i = 0; i < num_threads - 1; ++i) {
background_threads.emplace_back([&]() { background_threads.emplace_back([&]() {
while (keep_running.load(std::memory_order_relaxed)) { while (keep_running.load(std::memory_order_relaxed)) {
auto copy = ptr; auto copy = Traits::copy(ptr);
ankerl::nanobench::doNotOptimizeAway(copy); ankerl::nanobench::doNotOptimizeAway(copy);
} }
}); });
@@ -171,7 +195,7 @@ void benchmark_multithreaded_copy(ankerl::nanobench::Bench &bench,
// Benchmark the foreground thread under contention // Benchmark the foreground thread under contention
bench.run(std::string(Traits::name()) + " copy under contention", [&] { bench.run(std::string(Traits::name()) + " copy under contention", [&] {
auto copy = ptr; auto copy = Traits::copy(ptr);
ankerl::nanobench::doNotOptimizeAway(copy); ankerl::nanobench::doNotOptimizeAway(copy);
}); });
@@ -189,7 +213,7 @@ void benchmark_multithreaded_weak_lock(ankerl::nanobench::Bench &bench,
// Create the shared object and weak reference outside the benchmark // Create the shared object and weak reference outside the benchmark
auto strong_ptr = Traits::make(TestObject{789}); auto strong_ptr = Traits::make(TestObject{789});
typename Traits::weak_type weak_ptr = strong_ptr; typename Traits::weak_type weak_ptr = Traits::as_weak(strong_ptr);
// Create background threads that will create contention // Create background threads that will create contention
std::atomic<bool> keep_running{true}; std::atomic<bool> keep_running{true};
@@ -224,7 +248,7 @@ void benchmark_weak_copy_with_strong_contention(ankerl::nanobench::Bench &bench,
// Create the shared object and weak reference outside the benchmark // Create the shared object and weak reference outside the benchmark
auto strong_ptr = Traits::make(TestObject{456}); auto strong_ptr = Traits::make(TestObject{456});
typename Traits::weak_type weak_ptr = strong_ptr; typename Traits::weak_type weak_ptr = Traits::as_weak(strong_ptr);
// Create background threads copying the strong pointer // Create background threads copying the strong pointer
std::atomic<bool> keep_running{true}; std::atomic<bool> keep_running{true};
@@ -233,7 +257,7 @@ void benchmark_weak_copy_with_strong_contention(ankerl::nanobench::Bench &bench,
for (int i = 0; i < num_threads - 1; ++i) { for (int i = 0; i < num_threads - 1; ++i) {
background_threads.emplace_back([&]() { background_threads.emplace_back([&]() {
while (keep_running.load(std::memory_order_relaxed)) { while (keep_running.load(std::memory_order_relaxed)) {
auto copy = strong_ptr; auto copy = Traits::copy(strong_ptr);
ankerl::nanobench::doNotOptimizeAway(copy); ankerl::nanobench::doNotOptimizeAway(copy);
} }
}); });
@@ -242,7 +266,7 @@ void benchmark_weak_copy_with_strong_contention(ankerl::nanobench::Bench &bench,
// Benchmark weak reference copying under strong reference contention // Benchmark weak reference copying under strong reference contention
bench.run(std::string(Traits::weak_name()) + " copy with strong contention", bench.run(std::string(Traits::weak_name()) + " copy with strong contention",
[&] { [&] {
auto weak_copy = weak_ptr; auto weak_copy = Traits::copy_weak(weak_ptr);
ankerl::nanobench::doNotOptimizeAway(weak_copy); ankerl::nanobench::doNotOptimizeAway(weak_copy);
}); });
@@ -260,7 +284,7 @@ void benchmark_strong_copy_with_weak_contention(ankerl::nanobench::Bench &bench,
// Create the shared object and weak reference outside the benchmark // Create the shared object and weak reference outside the benchmark
auto strong_ptr = Traits::make(TestObject{789}); auto strong_ptr = Traits::make(TestObject{789});
typename Traits::weak_type weak_ptr = strong_ptr; typename Traits::weak_type weak_ptr = Traits::as_weak(strong_ptr);
// Create background threads copying the weak pointer // Create background threads copying the weak pointer
std::atomic<bool> keep_running{true}; std::atomic<bool> keep_running{true};
@@ -269,7 +293,7 @@ void benchmark_strong_copy_with_weak_contention(ankerl::nanobench::Bench &bench,
for (int i = 0; i < num_threads - 1; ++i) { for (int i = 0; i < num_threads - 1; ++i) {
background_threads.emplace_back([&]() { background_threads.emplace_back([&]() {
while (keep_running.load(std::memory_order_relaxed)) { while (keep_running.load(std::memory_order_relaxed)) {
auto weak_copy = weak_ptr; auto weak_copy = Traits::copy_weak(weak_ptr);
ankerl::nanobench::doNotOptimizeAway(weak_copy); ankerl::nanobench::doNotOptimizeAway(weak_copy);
} }
}); });
@@ -277,7 +301,7 @@ void benchmark_strong_copy_with_weak_contention(ankerl::nanobench::Bench &bench,
// Benchmark strong reference copying under weak reference contention // Benchmark strong reference copying under weak reference contention
bench.run(std::string(Traits::name()) + " copy with weak contention", [&] { bench.run(std::string(Traits::name()) + " copy with weak contention", [&] {
auto strong_copy = strong_ptr; auto strong_copy = Traits::copy(strong_ptr);
ankerl::nanobench::doNotOptimizeAway(strong_copy); ankerl::nanobench::doNotOptimizeAway(strong_copy);
}); });

View File

@@ -1,3 +1,4 @@
#include "cpu_work.hpp"
#include "thread_pipeline.hpp" #include "thread_pipeline.hpp"
#include <latch> #include <latch>
@@ -19,24 +20,22 @@ int main() {
.warmup(100); .warmup(100);
bench.run("Zero stage pipeline", [&] { bench.run("Zero stage pipeline", [&] {
for (int i = 0; i < NUM_ITEMS; ++i) { for (int i = 0; i < NUM_ITEMS; ++i) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) { spend_cpu_cycles(BUSY_ITERS);
}
} }
}); });
StaticThreadPipeline<std::latch *, WaitStrategy::WaitIfStageEmpty, 1> ThreadPipeline<std::latch *> pipeline(WaitStrategy::WaitIfStageEmpty, {1},
pipeline(LOG_PIPELINE_SIZE); LOG_PIPELINE_SIZE);
std::latch done{0}; std::latch done{0};
// Stage 0 consumer thread // Stage 0 consumer thread
std::thread stage0_thread([&pipeline, &done]() { std::thread stage0_thread([&pipeline, &done]() {
for (;;) { for (;;) {
auto guard = pipeline.acquire<0, 0>(); auto guard = pipeline.acquire(0, 0);
for (auto &item : guard.batch) { for (auto &item : guard.batch) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) { spend_cpu_cycles(BUSY_ITERS);
}
if (item == &done) { if (item == &done) {
return; return;
} }
@@ -90,19 +89,18 @@ int main() {
.warmup(100); .warmup(100);
for (int batch_size : {1, 4, 16, 64, 256}) { for (int batch_size : {1, 4, 16, 64, 256}) {
StaticThreadPipeline<std::latch *, WaitStrategy::WaitIfStageEmpty, 1> ThreadPipeline<std::latch *> pipeline(WaitStrategy::WaitIfStageEmpty, {1},
pipeline(LOG_PIPELINE_SIZE); LOG_PIPELINE_SIZE);
std::latch done{0}; std::latch done{0};
// Stage 0 consumer thread // Stage 0 consumer thread
std::thread stage0_thread([&pipeline, &done]() { std::thread stage0_thread([&pipeline, &done]() {
for (;;) { for (;;) {
auto guard = pipeline.acquire<0, 0>(); auto guard = pipeline.acquire(0, 0);
for (auto &item : guard.batch) { for (auto &item : guard.batch) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) { spend_cpu_cycles(BUSY_ITERS);
}
if (item == &done) { if (item == &done) {
return; return;
} }
@@ -144,76 +142,73 @@ int main() {
} }
// Helper function for wait strategy benchmarks // Helper function for wait strategy benchmarks
auto benchmark_wait_strategy = auto benchmark_wait_strategy = [](WaitStrategy strategy,
[]<WaitStrategy strategy>(const std::string &name, const std::string &name,
ankerl::nanobench::Bench &bench) { ankerl::nanobench::Bench &bench) {
constexpr int LOG_PIPELINE_SIZE = constexpr int LOG_PIPELINE_SIZE =
8; // Smaller buffer to increase contention 8; // Smaller buffer to increase contention
constexpr int NUM_ITEMS = 50'000; constexpr int NUM_ITEMS = 50'000;
constexpr int BATCH_SIZE = 4; // Small batches to increase coordination constexpr int BATCH_SIZE = 4; // Small batches to increase coordination
constexpr int BUSY_ITERS = constexpr int BUSY_ITERS =
10; // Light work to emphasize coordination overhead 10; // Light work to emphasize coordination overhead
StaticThreadPipeline<std::latch *, strategy, 1, 1> pipeline( ThreadPipeline<std::latch *> pipeline(strategy, {1, 1}, LOG_PIPELINE_SIZE);
LOG_PIPELINE_SIZE);
std::latch done{0}; std::latch done{0};
// Stage 0 worker // Stage 0 worker
std::thread stage0_thread([&pipeline, &done]() { std::thread stage0_thread([&pipeline, &done]() {
for (;;) { for (;;) {
auto guard = pipeline.template acquire<0, 0>(); auto guard = pipeline.acquire(0, 0);
for (auto &item : guard.batch) { for (auto &item : guard.batch) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) { spend_cpu_cycles(BUSY_ITERS);
} if (item == &done)
if (item == &done) return;
return;
}
}
});
// Stage 1 worker (final stage - always calls futex wake)
std::thread stage1_thread([&pipeline, &done]() {
for (;;) {
auto guard = pipeline.template acquire<1, 0>();
for (auto &item : guard.batch) {
for (volatile int i = 0; i < BUSY_ITERS; i = i + 1) {
}
if (item == &done)
return;
if (item)
item->count_down();
}
}
});
bench.run(name, [&] {
int items_pushed = 0;
while (items_pushed < NUM_ITEMS - 1) {
auto guard = pipeline.push(
std::min(NUM_ITEMS - 1 - items_pushed, BATCH_SIZE), true);
auto it = guard.batch.begin();
items_pushed += guard.batch.size();
for (size_t i = 0; i < guard.batch.size(); ++i, ++it) {
*it = nullptr;
}
}
std::latch finish{1};
{
auto guard = pipeline.push(1, true);
guard.batch[0] = &finish;
}
finish.wait();
});
// Shutdown
{
auto guard = pipeline.push(1, true);
guard.batch[0] = &done;
} }
stage0_thread.join(); }
stage1_thread.join(); });
};
// Stage 1 worker (final stage - always calls futex wake)
std::thread stage1_thread([&pipeline, &done]() {
for (;;) {
auto guard = pipeline.acquire(1, 0);
for (auto &item : guard.batch) {
spend_cpu_cycles(BUSY_ITERS);
if (item == &done)
return;
if (item)
item->count_down();
}
}
});
bench.run(name, [&] {
int items_pushed = 0;
while (items_pushed < NUM_ITEMS - 1) {
auto guard = pipeline.push(
std::min(NUM_ITEMS - 1 - items_pushed, BATCH_SIZE), true);
auto it = guard.batch.begin();
items_pushed += guard.batch.size();
for (size_t i = 0; i < guard.batch.size(); ++i, ++it) {
*it = nullptr;
}
}
std::latch finish{1};
{
auto guard = pipeline.push(1, true);
guard.batch[0] = &finish;
}
finish.wait();
});
// Shutdown
{
auto guard = pipeline.push(1, true);
guard.batch[0] = &done;
}
stage0_thread.join();
stage1_thread.join();
};
// Wait strategy comparison benchmark - multiple stages to trigger futex wakes // Wait strategy comparison benchmark - multiple stages to trigger futex wakes
{ {
@@ -224,12 +219,11 @@ int main() {
.relative(true) .relative(true)
.warmup(50); .warmup(50);
benchmark_wait_strategy.template operator()<WaitStrategy::WaitIfStageEmpty>( benchmark_wait_strategy(WaitStrategy::WaitIfStageEmpty, "WaitIfStageEmpty",
"WaitIfStageEmpty", bench); bench);
benchmark_wait_strategy.template benchmark_wait_strategy(WaitStrategy::WaitIfUpstreamIdle,
operator()<WaitStrategy::WaitIfUpstreamIdle>("WaitIfUpstreamIdle", bench); "WaitIfUpstreamIdle", bench);
benchmark_wait_strategy.template operator()<WaitStrategy::Never>("Never", benchmark_wait_strategy(WaitStrategy::Never, "Never", bench);
bench);
} }
// TODO: Add more benchmarks for: // TODO: Add more benchmarks for:

View File

@@ -15,10 +15,10 @@ HTTP I/O Threads → [Sequence] → [Resolve] → [Persist] → [Release] → HT
### Pipeline Flow ### Pipeline Flow
1. **HTTP I/O Threads**: Parse and validate incoming commit requests 1. **HTTP I/O Threads**: Parse and validate incoming commit requests
2. **Sequence Stage**: Assign sequential version numbers to commits 1. **Sequence Stage**: Assign sequential version numbers to commits
3. **Resolve Stage**: Validate preconditions and check for conflicts 1. **Resolve Stage**: Validate preconditions and check for conflicts
4. **Persist Stage**: Write commits to durable storage and notify subscribers 1. **Persist Stage**: Write commits to durable storage and notify subscribers
5. **Release Stage**: Return connections to HTTP I/O threads for response handling 1. **Release Stage**: Return connections to HTTP I/O threads for response handling
## Stage Details ## Stage Details
@@ -29,21 +29,25 @@ HTTP I/O Threads → [Sequence] → [Resolve] → [Persist] → [Release] → HT
**Serialization**: **Required** - Must be single-threaded **Serialization**: **Required** - Must be single-threaded
**Responsibilities**: **Responsibilities**:
- **For CommitEntry**: Check request_id against banned list, assign sequential version number if not banned, forward to resolve stage - **For CommitEntry**: Check request_id against banned list, assign sequential version number if not banned, forward to resolve stage
- **For StatusEntry**: Add request_id to banned list, note current highest assigned version as upper bound, transfer connection to status threadpool - **For StatusEntry**: Add request_id to banned list, note current highest assigned version as upper bound for version range scanning
- Record version assignments for transaction tracking - Record version assignments for transaction tracking
**Why Serialization is Required**: **Why Serialization is Required**:
- Version numbers must be strictly sequential without gaps - Version numbers must be strictly sequential without gaps
- Banned list updates must be atomic with version assignment - Banned list updates must be atomic with version assignment
- Status requests must get accurate upper bound on potential commit versions - Status requests must get accurate upper bound on potential commit versions
**Request ID Banned List**: **Request ID Banned List**:
- Purpose: Make transactions no longer in-flight and establish version upper bounds for status queries - Purpose: Make transactions no longer in-flight and establish version upper bounds for status queries
- Lifecycle: Grows indefinitely until process restart (leader change) - Lifecycle: Grows indefinitely until process restart (leader change)
- Removal: Only on process restart/leader change, which invalidates all old request IDs - Removal: Only on process restart/leader change, which invalidates all old request IDs
**Current Implementation**: **Current Implementation**:
```cpp ```cpp
bool HttpHandler::process_sequence_batch(BatchType &batch) { bool HttpHandler::process_sequence_batch(BatchType &batch) {
for (auto &entry : batch) { for (auto &entry : batch) {
@@ -64,20 +68,24 @@ bool HttpHandler::process_sequence_batch(BatchType &batch) {
**Serialization**: **Required** - Must be single-threaded **Serialization**: **Required** - Must be single-threaded
**Responsibilities**: **Responsibilities**:
- **For CommitEntry**: Check preconditions against in-memory recent writes set, add writes to recent writes set if accepted - **For CommitEntry**: Check preconditions against in-memory recent writes set, add writes to recent writes set if accepted
- **For StatusEntry**: N/A (transferred to status threadpool after sequence stage) - **For StatusEntry**: N/A (transferred to status threadpool after sequence stage)
- Mark failed commits with failure information (including which preconditions failed) - Mark failed commits with failure information (including which preconditions failed)
**Why Serialization is Required**: **Why Serialization is Required**:
- Must maintain consistent view of in-memory recent writes set - Must maintain consistent view of in-memory recent writes set
- Conflict detection requires atomic evaluation of all preconditions against recent writes - Conflict detection requires atomic evaluation of all preconditions against recent writes
- Recent writes set updates must be synchronized - Recent writes set updates must be synchronized
**Transaction State Transitions**: **Transaction State Transitions**:
- **Assigned Version** (from sequence) → **Semi-committed** (resolve accepts) → **Committed** (persist completes) - **Assigned Version** (from sequence) → **Semi-committed** (resolve accepts) → **Committed** (persist completes)
- Failed transactions continue through pipeline with failure information for client response - Failed transactions continue through pipeline with failure information for client response
**Current Implementation**: **Current Implementation**:
```cpp ```cpp
bool HttpHandler::process_resolve_batch(BatchType &batch) { bool HttpHandler::process_resolve_batch(BatchType &batch) {
// TODO: Implement precondition resolution logic: // TODO: Implement precondition resolution logic:
@@ -95,55 +103,62 @@ bool HttpHandler::process_resolve_batch(BatchType &batch) {
**Serialization**: **Required** - Must mark batches durable in order **Serialization**: **Required** - Must mark batches durable in order
**Responsibilities**: **Responsibilities**:
- **For CommitEntry**: Apply operations to persistent storage, update committed version high water mark
- **For StatusEntry**: N/A (transferred to status threadpool after sequence stage) - **For CommitEntry**: Apply operations to persistent storage, update committed version high water mark, generate success response JSON
- **For StatusEntry**: N/A (empty husk, connection transferred to status threadpool after sequence stage)
- Generate durability events for `/v1/subscribe` when committed version advances - Generate durability events for `/v1/subscribe` when committed version advances
- Batch multiple commits for efficient persistence operations - Batch multiple commits for efficient persistence operations
**Why Serialization is Required**: **Why Serialization is Required**:
- Batches must be marked durable in sequential version order - Batches must be marked durable in sequential version order
- High water mark updates must reflect strict ordering of committed versions - High water mark updates must reflect strict ordering of committed versions
- Ensures consistency guarantees across all endpoints - Ensures consistency guarantees across all endpoints
**Committed Version High Water Mark**: **Committed Version High Water Mark**:
- Global atomic value tracking highest durably committed version - Global atomic value tracking highest durably committed version
- Updated after each batch commits: set to highest version in the batch - Updated after each batch commits: set to highest version in the batch
- Read by `/v1/version` endpoint using atomic seq_cst reads - Read by `/v1/version` endpoint using atomic seq_cst reads
- Enables `/v1/subscribe` durability events when high water mark advances - Enables `/v1/subscribe` durability events when high water mark advances
**Batching Strategy**: **Batching Strategy**:
- Multiple semi-committed transactions can be persisted in a single batch - Multiple semi-committed transactions can be persisted in a single batch
- High water mark updated once per batch to highest version in that batch - High water mark updated once per batch to highest version in that batch
- See `persistence.md` for detailed persistence design - See `persistence.md` for detailed persistence design
**Current Implementation**: **Current Implementation**:
```cpp ```cpp
bool HttpHandler::process_persist_batch(BatchType &batch) { bool HttpHandler::process_persist_batch(BatchType &batch) {
// TODO: Implement actual persistence logic: // For CommitEntry: Apply operations to persistent storage, update high water mark, generate response JSON
// 1. For CommitEntry: Apply operations to persistent storage // For StatusEntry: N/A (empty husk, connection transferred to status threadpool)
// 2. Update committed version high water mark to highest version in batch // Generate durability events for /v1/subscribe when committed version advances
// 3. Generate durability events for /v1/subscribe // Semi-committed transactions are retried until durable or leader fails
// 4. For StatusEntry: N/A (already transferred to status threadpool)
} }
``` ```
### Stage 3: Connection Release ### Stage 3: Connection Release
**Thread**: `txn-release` **Threads**: Multiple `txn-release` threads (configurable)
**Purpose**: Return connections to HTTP server for client response **Purpose**: Return connections to HTTP server for client response
**Serialization**: Not required - Independent connection handling **Serialization**: Not required - Independent connection handling
**Responsibilities**: **Responsibilities**:
- Return processed connections to HTTP server for all request types - Return processed connections to HTTP server for all request types
- Connection carries response data (success/failure) and status information - Connection carries response data (success/failure) and status information
- Trigger response transmission to clients - Trigger response transmission to clients
**Response Handling**: **Response Handling**:
- **CommitRequests**: Response generated by persist stage (success with version, or failure with conflicting preconditions)
- **StatusRequests**: Response generated by separate status lookup logic (not part of pipeline) - **CommitRequests**: Response JSON generated by persist stage (success with version, or failure with conflicting preconditions from resolve stage)
- **StatusRequests**: Response generated by separate status threadpool (connection transferred after sequence stage)
- Failed transactions carry failure information through entire pipeline for proper client response - Failed transactions carry failure information through entire pipeline for proper client response
**Implementation**: **Implementation**:
```cpp ```cpp
bool HttpHandler::process_release_batch(BatchType &batch) { bool HttpHandler::process_release_batch(BatchType &batch) {
// Stage 3: Connection release // Stage 3: Connection release
@@ -151,8 +166,9 @@ bool HttpHandler::process_release_batch(BatchType &batch) {
if (!conn) { if (!conn) {
return true; // Shutdown signal return true; // Shutdown signal
} }
// Return connection to server for further processing or cleanup // Connection is server-owned - respond to client and connection
Server::release_back_to_server(std::move(conn)); // remains managed by server's connection registry
// TODO: Implement response sending with new server-owned connection model
} }
return false; // Continue processing return false; // Continue processing
} }
@@ -164,12 +180,12 @@ bool HttpHandler::process_release_batch(BatchType &batch) {
```cpp ```cpp
// 4-stage pipeline: sequence -> resolve -> persist -> release // 4-stage pipeline: sequence -> resolve -> persist -> release
// TODO: Update pipeline type from std::unique_ptr<Connection> to PipelineEntry variant // Pipeline with PipelineEntry variant instead of connection ownership transfer
StaticThreadPipeline<PipelineEntry, // Was: std::unique_ptr<Connection> StaticThreadPipeline<PipelineEntry, // Was: std::unique_ptr<Connection>
WaitStrategy::WaitIfUpstreamIdle, 1, 1, 1, 1> WaitStrategy::WaitIfUpstreamIdle, 1, 1, 1, 1>
commitPipeline{lg_size}; commitPipeline{lg_size};
// Pipeline entry type (to be implemented) // Pipeline entry type for server-owned connection model
using PipelineEntry = std::variant<CommitEntry, StatusEntry, ShutdownEntry>; using PipelineEntry = std::variant<CommitEntry, StatusEntry, ShutdownEntry>;
``` ```
@@ -212,7 +228,7 @@ for (auto &conn : guard.batch) {
Commit requests enter the pipeline via `HttpHandler::on_batch_complete()`: Commit requests enter the pipeline via `HttpHandler::on_batch_complete()`:
```cpp ```cpp
void HttpHandler::on_batch_complete(std::span<std::unique_ptr<Connection>> batch) { void HttpHandler::on_batch_complete(std::span<Connection*> batch) {
// Collect commit requests that passed basic validation for 4-stage pipeline processing // Collect commit requests that passed basic validation for 4-stage pipeline processing
int commit_count = 0; int commit_count = 0;
for (auto &conn : batch) { for (auto &conn : batch) {
@@ -237,7 +253,10 @@ void HttpHandler::on_batch_complete(std::span<std::unique_ptr<Connection>> batch
### Backpressure Handling ### Backpressure Handling
The pipeline implements natural backpressure: The pipeline implements natural backpressure:
- Each stage blocks if downstream stages are full
- Fixed-size pipeline buffer causes I/O threads to block when pipeline is full
- This prevents unbounded memory growth under high load
- I/O threads blocking may impact accept() rate, but provides system-wide flow control
- `WaitIfUpstreamIdle` strategy balances latency vs throughput - `WaitIfUpstreamIdle` strategy balances latency vs throughput
- Ring buffer size (`lg_size = 16`) controls maximum queued batches - Ring buffer size (`lg_size = 16`) controls maximum queued batches
@@ -288,7 +307,7 @@ std::visit([&](auto&& entry) {
- Failed CommitEntries are passed through the pipeline with error information - Failed CommitEntries are passed through the pipeline with error information
- Downstream stages skip processing for error connections but forward them - Downstream stages skip processing for error connections but forward them
- Error responses are sent when connection reaches release stage - Error responses are sent when connection reaches release stage
- Connection ownership is always transferred to ensure cleanup - Server-owned connections ensure proper cleanup and response handling
### Pipeline Integrity ### Pipeline Integrity
@@ -310,7 +329,7 @@ std::visit([&](auto&& entry) {
- **Single-Pass Processing**: Each connection flows through all stages once - **Single-Pass Processing**: Each connection flows through all stages once
- **Streaming Design**: Stages process concurrently - **Streaming Design**: Stages process concurrently
- **Minimal Copying**: Connection ownership transfer, not data copying - **Minimal Copying**: Request processing with server-owned connections
- **Direct Response**: Release stage triggers immediate response transmission - **Direct Response**: Release stage triggers immediate response transmission
### Scalability Characteristics ### Scalability Characteristics
@@ -328,7 +347,7 @@ private:
static constexpr int lg_size = 16; // Ring buffer size = 2^16 entries static constexpr int lg_size = 16; // Ring buffer size = 2^16 entries
// 4-stage pipeline configuration // 4-stage pipeline configuration
StaticThreadPipeline<std::unique_ptr<Connection>, StaticThreadPipeline<PipelineEntry,
WaitStrategy::WaitIfUpstreamIdle, 1, 1, 1, 1> WaitStrategy::WaitIfUpstreamIdle, 1, 1, 1, 1>
commitPipeline{lg_size}; commitPipeline{lg_size};
``` ```
@@ -344,8 +363,9 @@ private:
The pipeline processes different types of entries using a variant/union type system instead of `std::unique_ptr<Connection>`: The pipeline processes different types of entries using a variant/union type system instead of `std::unique_ptr<Connection>`:
### Pipeline Entry Variants ### Pipeline Entry Variants
- **CommitEntry**: Contains `std::unique_ptr<Connection>` with CommitRequest and connection state
- **StatusEntry**: Contains `std::unique_ptr<Connection>` with StatusRequest (transferred to status threadpool after sequence) - **CommitEntry**: Contains connection reference/ID with CommitRequest and connection state
- **StatusEntry**: Contains connection reference/ID with StatusRequest (transferred to status threadpool after sequence)
- **ShutdownEntry**: Signals pipeline shutdown to all stages - **ShutdownEntry**: Signals pipeline shutdown to all stages
- **Future types**: Pipeline design supports additional entry types - **Future types**: Pipeline design supports additional entry types
@@ -354,9 +374,9 @@ The pipeline processes different types of entries using a variant/union type sys
| Stage | CommitEntry | StatusEntry | ShutdownEntry | Serialization | | Stage | CommitEntry | StatusEntry | ShutdownEntry | Serialization |
|-------|-------------|-------------|---------------|---------------| |-------|-------------|-------------|---------------|---------------|
| **Sequence** | Check banned list, assign version | Add to banned list, transfer to status threadpool | Return true (shutdown) | **Required** | | **Sequence** | Check banned list, assign version | Add to banned list, transfer to status threadpool | Return true (shutdown) | **Required** |
| **Resolve** | Check preconditions, update recent writes | N/A (transferred) | Return true (shutdown) | **Required** | | **Resolve** | Check preconditions, update recent writes | N/A (empty husk) | Return true (shutdown) | **Required** |
| **Persist** | Apply operations, update high water mark | N/A (transferred) | Return true (shutdown) | **Required** | | **Persist** | Apply operations, update high water mark | N/A (empty husk) | Return true (shutdown) | **Required** |
| **Release** | Return connection to HTTP threads | N/A (transferred) | Return true (shutdown) | Not required | | **Release** | Return connection to HTTP threads | N/A (empty husk) | Return true (shutdown) | Not required (multiple threads) |
## API Endpoint Integration ## API Endpoint Integration
@@ -367,6 +387,7 @@ The pipeline processes different types of entries using a variant/union type sys
#### Request Processing Flow #### Request Processing Flow
1. **HTTP I/O Thread Processing** (`src/http_handler.cpp:210-273`): 1. **HTTP I/O Thread Processing** (`src/http_handler.cpp:210-273`):
```cpp ```cpp
void HttpHandler::handlePostCommit(Connection &conn, HttpConnectionState &state) { void HttpHandler::handlePostCommit(Connection &conn, HttpConnectionState &state) {
// Parse and validate anything that doesn't need serialization: // Parse and validate anything that doesn't need serialization:
@@ -379,47 +400,57 @@ The pipeline processes different types of entries using a variant/union type sys
} }
``` ```
2. **Pipeline Entry**: Successfully parsed connections enter pipeline as CommitEntry (containing the connection with CommitRequest) 1. **Pipeline Entry**: Successfully parsed connections enter pipeline as CommitEntry (containing the connection with CommitRequest)
1. **Pipeline Processing**:
3. **Pipeline Processing**:
- **Sequence**: Check banned list → assign version (or reject) - **Sequence**: Check banned list → assign version (or reject)
- **Resolve**: Check preconditions against in-memory recent writes → mark semi-committed (or failed with conflict details) - **Resolve**: Check preconditions against in-memory recent writes → mark semi-committed (or failed with conflict details)
- **Persist**: Apply operations → mark committed, update high water mark - **Persist**: Apply operations → mark committed, update high water mark
- **Release**: Return connection with response data - **Release**: Return connection with response data
4. **Response Generation**: Based on pipeline results 1. **Response Generation**: Based on pipeline results
- **Success**: `{"status": "committed", "version": N, "leader_id": "...", "request_id": "..."}` - **Success**: `{"status": "committed", "version": N, "leader_id": "...", "request_id": "..."}`
- **Failure**: `{"status": "not_committed", "conflicts": [...], "version": N, "leader_id": "..."}` - **Failure**: `{"status": "not_committed", "conflicts": [...], "version": N, "leader_id": "..."}`
### `/v1/status` - Commit Status Lookup ### `/v1/status` - Commit Status Lookup
**Pipeline Interaction**: StatusEntry through sequence stage, then transfer to status threadpool **Pipeline Interaction**: StatusEntry through sequence stage only; the actual status lookup runs in a separate status threadpool outside the pipeline
#### Request Processing Flow #### Request Processing Flow
1. **HTTP I/O Thread Processing**: 1. **HTTP I/O Thread Processing**:
```cpp ```cpp
void HttpHandler::handleGetStatus(Connection &conn, const HttpConnectionState &state) { void HttpHandler::handleGetStatus(Connection &conn, const HttpConnectionState &state) {
// TODO: Extract request_id from URL and min_version from query params // Extract request_id from URL and min_version from query params
// Current: Returns placeholder static response // Create StatusEntry for pipeline processing
} }
``` ```
2. **Two-Phase Processing**: 1. **Pipeline Processing**:
- **Phase 1 - Sequence Stage**: StatusEntry enters pipeline to add request_id to banned list and get version upper bound
- **Phase 2 - Status Threadpool**: Connection transferred from sequence stage to dedicated status threadpool for actual status lookup logic
3. **Status Lookup Logic**: Performed in status threadpool - scan transaction log to determine actual commit status of the now-banned request_id - **Sequence Stage**: StatusEntry adds request_id to banned list, establishes version scanning range, transfers connection to status threadpool
- **Subsequent Stages**: Empty StatusEntry husk flows through resolve/persist/release as no-op
1. **Status Lookup Logic**:
- Version range determined in sequence stage (min_version parameter to version upper bound)
- Actual S3 scanning performed by separate status threadpool outside the pipeline
- Return "committed" with version if found, "not_found" if not found in scanned range
### `/v1/subscribe` - Real-time Transaction Stream ### `/v1/subscribe` - Real-time Transaction Stream
**Pipeline Integration**: Consumes events from resolve and persist stages **Pipeline Integration**: Consumes events from resolve and persist stages
#### Event Sources #### Event Sources
- **Resolve Stage**: Semi-committed transactions (accepted preconditions) for low-latency streaming - **Resolve Stage**: Semi-committed transactions (accepted preconditions) for low-latency streaming
- **Persist Stage**: Durability events when committed version high water mark advances - **Persist Stage**: Durability events when committed version high water mark advances
#### Current Implementation #### Current Implementation
```cpp ```cpp
void HttpHandler::handleGetSubscribe(Connection &conn, const HttpConnectionState &state) { void HttpHandler::handleGetSubscribe(Connection &conn, const HttpConnectionState &state) {
// TODO: Parse query parameters (after, durable) // TODO: Parse query parameters (after, durable)
@@ -447,11 +478,12 @@ void HttpHandler::handleGetSubscribe(Connection &conn, const HttpConnectionState
The pipeline integrates with the HTTP handler at two points: The pipeline integrates with the HTTP handler at two points:
1. **Entry**: `on_batch_complete()` feeds connections into sequence stage 1. **Entry**: `on_batch_complete()` feeds connections into sequence stage
2. **Exit**: Release stage calls `Server::release_back_to_server()` 1. **Exit**: Release stage responds to clients with server-owned connections
### Persistence Layer Integration ### Persistence Layer Integration
The persist stage interfaces with: The persist stage interfaces with:
- **S3 Backend**: Batch writes for durability (see `persistence.md`) - **S3 Backend**: Batch writes for durability (see `persistence.md`)
- **Subscriber System**: Real-time change stream notifications - **Subscriber System**: Real-time change stream notifications
- **Metrics System**: Transaction throughput and latency tracking - **Metrics System**: Transaction throughput and latency tracking
@@ -467,17 +499,17 @@ The persist stage interfaces with:
### Potential Enhancements ### Potential Enhancements
1. **Dynamic Thread Counts**: Make resolve and release thread counts configurable 1. **Dynamic Thread Counts**: Make resolve and release thread counts configurable
2. **NUMA Optimization**: Pin pipeline threads to specific CPU cores 1. **NUMA Optimization**: Pin pipeline threads to specific CPU cores
3. **Batch Size Tuning**: Dynamic batch size based on load 1. **Batch Size Tuning**: Dynamic batch size based on load
4. **Stage Bypassing**: Skip resolve stage for transactions without preconditions 1. **Stage Bypassing**: Skip resolve stage for transactions without preconditions
5. **Persistence Batching**: Aggregate multiple commits into larger S3 writes 1. **Persistence Batching**: Aggregate multiple commits into larger S3 writes
### Monitoring and Observability ### Monitoring and Observability
1. **Stage Metrics**: Throughput, latency, and queue depth per stage 1. **Stage Metrics**: Throughput, latency, and queue depth per stage
2. **Error Tracking**: Error rates and types by stage 1. **Error Tracking**: Error rates and types by stage
3. **Resource Utilization**: CPU and memory usage per pipeline thread 1. **Resource Utilization**: CPU and memory usage per pipeline thread
4. **Flow Control Events**: Backpressure and stall detection 1. **Flow Control Events**: Backpressure and stall detection
## Implementation Status ## Implementation Status

View File

@@ -28,13 +28,15 @@ Controls server networking, threading, and request handling behavior.
### Commit Configuration (`[commit]`) ### Commit Configuration (`[commit]`)
Controls behavior of the `/v1/commit` endpoint and request ID management. Controls behavior of the `/v1/commit` endpoint, request ID management, and commit pipeline threading.
| Parameter | Type | Default | Description | | Parameter | Type | Default | Description |
|-----------|------|---------|-------------| |-----------|------|---------|-------------|
| `min_request_id_length` | integer | `20` | Minimum length required for client-provided `request_id` fields to ensure sufficient entropy for collision avoidance | | `min_request_id_length` | integer | `20` | Minimum length required for client-provided `request_id` fields to ensure sufficient entropy for collision avoidance |
| `request_id_retention_hours` | integer | `24` | How long to retain request IDs in memory for `/v1/status` queries. Longer retention reduces the chance of `log_truncated` responses | | `request_id_retention_hours` | integer | `24` | How long to retain request IDs in memory for `/v1/status` queries. Longer retention reduces the chance of `log_truncated` responses |
| `request_id_retention_versions` | integer | `100000000` | Minimum number of versions to retain request IDs for, regardless of time. Provides additional protection against `log_truncated` responses | | `request_id_retention_versions` | integer | `100000000` | Minimum number of versions to retain request IDs for, regardless of time. Provides additional protection against `log_truncated` responses |
| `pipeline_wait_strategy` | string | `"WaitIfUpstreamIdle"` | Wait strategy for the commit pipeline. `"WaitIfStageEmpty"` = block when individual stages are empty (safe for shared CPUs), `"WaitIfUpstreamIdle"` = block only when all upstream stages are idle (requires dedicated cores, highest throughput), `"Never"` = never block, busy-wait continuously (requires dedicated cores, lowest latency) |
| `pipeline_release_threads` | integer | `1` | Number of threads in the release stage (final stage of commit pipeline). Higher values increase parallelism for connection release and response transmission |
### Subscription Configuration (`[subscription]`) ### Subscription Configuration (`[subscription]`)
@@ -77,6 +79,8 @@ read_buffer_size = 32768 # 32KB
min_request_id_length = 32 min_request_id_length = 32
request_id_retention_hours = 48 request_id_retention_hours = 48
request_id_retention_versions = 50000 request_id_retention_versions = 50000
pipeline_wait_strategy = "WaitIfUpstreamIdle" # Options: "WaitIfStageEmpty", "WaitIfUpstreamIdle", "Never"
pipeline_release_threads = 4 # Default: 1, increase for higher throughput
[subscription] [subscription]
max_buffer_size_bytes = 52428800 # 50MB max_buffer_size_bytes = 52428800 # 50MB
@@ -101,18 +105,27 @@ WeaselDB uses the `toml11` library for configuration parsing with robust error h
These configuration parameters directly affect server and API behavior: These configuration parameters directly affect server and API behavior:
**Server Performance:** **Server Performance:**
- **`io_threads`**: Controls parallelism for both accepting new connections and I/O processing. Should typically match CPU core count for optimal performance - **`io_threads`**: Controls parallelism for both accepting new connections and I/O processing. Should typically match CPU core count for optimal performance
- **`event_batch_size`**: Larger batches reduce syscall overhead but may increase latency under light load - **`event_batch_size`**: Larger batches reduce syscall overhead but may increase latency under light load
- **`max_connections`**: Prevents resource exhaustion by limiting concurrent connections - **`max_connections`**: Prevents resource exhaustion by limiting concurrent connections
**Request Handling:** **Request Handling:**
- **`max_request_size_bytes`**: Determines when `/v1/commit` returns `413 Content Too Large` - **`max_request_size_bytes`**: Determines when `/v1/commit` returns `413 Content Too Large`
- **`min_request_id_length`**: Validates `request_id` fields in `/v1/commit` requests for sufficient entropy - **`min_request_id_length`**: Validates `request_id` fields in `/v1/commit` requests for sufficient entropy
**Request ID Management:** **Request ID Management:**
- **`request_id_retention_*`**: Affects availability of data for `/v1/status` queries and likelihood of `log_truncated` responses - **`request_id_retention_*`**: Affects availability of data for `/v1/status` queries and likelihood of `log_truncated` responses
**Commit Pipeline Performance:**
- **`pipeline_wait_strategy`**: Controls CPU usage vs latency tradeoff in commit processing. `WaitIfStageEmpty` is safest for shared CPUs, `WaitIfUpstreamIdle` provides highest throughput with dedicated cores, `Never` provides lowest latency but uses 100% CPU
- **`pipeline_release_threads`**: Determines parallelism in the final stage of commit processing. More threads can improve throughput when processing many concurrent requests
**Subscription Streaming:** **Subscription Streaming:**
- **`max_buffer_size_bytes`**: Controls when `/v1/subscribe` connections are terminated due to slow consumption - **`max_buffer_size_bytes`**: Controls when `/v1/subscribe` connections are terminated due to slow consumption
- **`keepalive_interval_seconds`**: Frequency of keepalive comments in `/v1/subscribe` streams - **`keepalive_interval_seconds`**: Frequency of keepalive comments in `/v1/subscribe` streams
@@ -121,6 +134,7 @@ These configuration parameters directly affect server and API behavior:
The configuration system includes comprehensive validation with specific bounds checking: The configuration system includes comprehensive validation with specific bounds checking:
### Server Configuration Limits ### Server Configuration Limits
- **`port`**: Must be between 1 and 65535 - **`port`**: Must be between 1 and 65535
- **`max_request_size_bytes`**: Must be > 0 and ≤ 100MB - **`max_request_size_bytes`**: Must be > 0 and ≤ 100MB
- **`io_threads`**: Must be between 1 and 1000 - **`io_threads`**: Must be between 1 and 1000
@@ -128,26 +142,33 @@ The configuration system includes comprehensive validation with specific bounds
- **`max_connections`**: Must be between 0 and 100000 (0 = unlimited) - **`max_connections`**: Must be between 0 and 100000 (0 = unlimited)
### Commit Configuration Limits ### Commit Configuration Limits
- **`min_request_id_length`**: Must be between 8 and 256 characters - **`min_request_id_length`**: Must be between 8 and 256 characters
- **`request_id_retention_hours`**: Must be between 1 and 8760 hours (1 year) - **`request_id_retention_hours`**: Must be between 1 and 8760 hours (1 year)
- **`request_id_retention_versions`**: Must be > 0 - **`request_id_retention_versions`**: Must be > 0
- **`pipeline_wait_strategy`**: Must be one of: `"WaitIfStageEmpty"`, `"WaitIfUpstreamIdle"`, or `"Never"`
- **`pipeline_release_threads`**: Must be between 1 and 64
### Subscription Configuration Limits ### Subscription Configuration Limits
- **`max_buffer_size_bytes`**: Must be > 0 and ≤ 1GB - **`max_buffer_size_bytes`**: Must be > 0 and ≤ 1GB
- **`keepalive_interval_seconds`**: Must be between 1 and 3600 seconds (1 hour) - **`keepalive_interval_seconds`**: Must be between 1 and 3600 seconds (1 hour)
### Cross-Validation ### Cross-Validation
- Warns if `max_request_size_bytes` > `max_buffer_size_bytes` (potential buffering issues) - Warns if `max_request_size_bytes` > `max_buffer_size_bytes` (potential buffering issues)
## Configuration Management ## Configuration Management
### Code Integration ### Code Integration
- **Configuration Structure**: Defined in `src/config.hpp` with structured types - **Configuration Structure**: Defined in `src/config.hpp` with structured types
- **Parser Implementation**: Located in `src/config.cpp` using template-based parsing - **Parser Implementation**: Located in `src/config.cpp` using template-based parsing
- **Default Values**: Embedded as struct defaults for compile-time initialization - **Default Values**: Embedded as struct defaults for compile-time initialization
- **Runtime Usage**: Configuration passed to server components during initialization - **Runtime Usage**: Configuration passed to server components during initialization
### Development Guidelines ### Development Guidelines
- **New Parameters**: Add to appropriate struct in `src/config.hpp` - **New Parameters**: Add to appropriate struct in `src/config.hpp`
- **Validation**: Include bounds checking in `ConfigParser::validate_config()` - **Validation**: Include bounds checking in `ConfigParser::validate_config()`
- **Documentation**: Update this file when adding new configuration options - **Documentation**: Update this file when adding new configuration options

267 lines changed in design.md (View File)

@@ -3,15 +3,15 @@
## Table of Contents ## Table of Contents
1. [Project Overview](#project-overview) 1. [Project Overview](#project-overview)
2. [Quick Start](#quick-start) 1. [Quick Start](#quick-start)
3. [Architecture](#architecture) 1. [Architecture](#architecture)
4. [Development Guidelines](#development-guidelines) 1. [Development Guidelines](#development-guidelines)
5. [Common Patterns](#common-patterns) 1. [Common Patterns](#common-patterns)
6. [Reference](#reference) 1. [Reference](#reference)
**IMPORTANT:** Read [style.md](style.md) first - contains mandatory C++ coding standards, threading rules, and testing guidelines that must be followed for all code changes. **IMPORTANT:** Read [style.md](style.md) first - contains mandatory C++ coding standards, threading rules, and testing guidelines that must be followed for all code changes.
--- ______________________________________________________________________
## Project Overview ## Project Overview
@@ -22,11 +22,21 @@ WeaselDB is a high-performance write-side database component designed for system
- **Ultra-fast arena allocation** (~1ns vs ~20-270ns for malloc) - **Ultra-fast arena allocation** (~1ns vs ~20-270ns for malloc)
- **High-performance JSON parsing** with streaming support and SIMD optimization - **High-performance JSON parsing** with streaming support and SIMD optimization
- **Multi-threaded networking** using multiple epoll instances with unified I/O thread pool - **Multi-threaded networking** using multiple epoll instances with unified I/O thread pool
- **Multi-stage commit pipeline** with serial processing for consistency and parallel I/O for performance
- **Non-blocking metrics system** with try-lock optimization preventing pipeline stalls
- **Configurable epoll instances** to eliminate kernel-level contention - **Configurable epoll instances** to eliminate kernel-level contention
- **Optimized memory management** with arena allocation and efficient copying - **Optimized memory management** with arena allocation and efficient copying
- **Factory pattern safety** ensuring correct object lifecycle management - **Factory pattern safety** ensuring correct object lifecycle management
--- ### Design Philosophy
**"Two machines once you've mastered one"** - Optimize aggressively for single-machine performance before distributing. Most systems prematurely scale horizontally and never fully utilize their hardware. How are you supposed to horizontally scale strict serializability anyway?
**Boring formats, fast implementations** - Use standard data formats (JSON, HTTP, base64) with heavily optimized parsing. Universal compatibility without sacrificing performance.
**Read/write separation** - Fan out reads from the single write stream (persist stage to many subscribers), with true horizontal scaling via S3 for historical data. Keep writes simple and fast.
______________________________________________________________________
## Quick Start ## Quick Start
@@ -43,39 +53,49 @@ ninja
### Testing & Development ### Testing & Development
**Run all tests:** **Run all tests:**
```bash ```bash
ninja test # or ctest ninja test # or ctest
``` ```
**Individual targets:** **Individual targets:**
- `./test_arena` - Arena allocator unit tests - `./test_arena` - Arena allocator unit tests
- `./test_commit_request` - JSON parsing and validation tests - `./test_commit_request` - JSON parsing and validation tests
- `./test_http_handler` - HTTP protocol handling tests - `./test_http_handler` - HTTP protocol handling tests
- `./test_metric` - Metrics system tests - `./test_metric` - Metrics system tests
- `./test_api_url_parser` - API URL parsing tests - `./test_api_url_parser` - API URL parsing tests
- `./test_reference` - Reference counting system tests
- `./test_server_connection_return` - Connection lifecycle tests - `./test_server_connection_return` - Connection lifecycle tests
**Benchmarking:** **Benchmarking:**
- `./bench_arena` - Memory allocation performance - `./bench_arena` - Memory allocation performance
- `./bench_commit_request` - JSON parsing performance - `./bench_commit_request` - JSON parsing performance
- `./bench_parser_comparison` - Compare vs nlohmann::json and RapidJSON - `./bench_cpu_work` - CPU work benchmarking utility
- `./bench_metric` - Metrics system performance
- `./bench_thread_pipeline` - Lock-free pipeline performance
- `./bench_format_comparison` - String formatting performance - `./bench_format_comparison` - String formatting performance
- `./bench_metric` - Metrics system performance
- `./bench_parser_comparison` - Compare vs nlohmann::json and RapidJSON
- `./bench_reference` - Reference counting performance
- `./bench_thread_pipeline` - Lock-free pipeline performance
**Debug tools:** **Debug tools:**
- `./debug_arena` - Analyze arena allocator behavior - `./debug_arena` - Analyze arena allocator behavior
**Load Testing:** **Load Testing:**
- `./load_tester` - A tool to generate load against the server for performance and stability analysis. - `./load_tester` - A tool to generate load against the server for performance and stability analysis.
### Dependencies ### Dependencies
**System requirements:** **System requirements:**
- **weaseljson** - Must be installed system-wide (high-performance JSON parser) - **weaseljson** - Must be installed system-wide (high-performance JSON parser)
- **gperf** - System requirement for perfect hash generation - **gperf** - System requirement for perfect hash generation
**Auto-fetched:** **Auto-fetched:**
- **simdutf** - SIMD base64 encoding/decoding - **simdutf** - SIMD base64 encoding/decoding
- **toml11** - TOML configuration parsing - **toml11** - TOML configuration parsing
- **doctest** - Testing framework - **doctest** - Testing framework
@@ -84,7 +104,7 @@ ninja test # or ctest
- **RapidJSON** - High-performance JSON library (used in benchmarks) - **RapidJSON** - High-performance JSON library (used in benchmarks)
- **llhttp** - Fast HTTP parser - **llhttp** - Fast HTTP parser
--- ______________________________________________________________________
## Architecture ## Architecture
@@ -106,17 +126,19 @@ Ultra-fast memory allocator optimized for request/response patterns:
#### **Networking Layer** #### **Networking Layer**
**Server** (`src/server.{hpp,cpp}`): **Server** (`src/server.{hpp,cpp}`):
- **High-performance multi-threaded networking** using multiple epoll instances with unified I/O thread pool - **High-performance multi-threaded networking** using multiple epoll instances with unified I/O thread pool
- **Configurable epoll instances** to eliminate kernel-level epoll_ctl contention (default: 2, max: io_threads) - **Configurable epoll instances** to eliminate kernel-level epoll_ctl contention (default: 2, max: io_threads)
- **Round-robin thread-to-epoll assignment** distributes I/O threads across epoll instances - **Round-robin thread-to-epoll assignment** distributes I/O threads across epoll instances
- **Connection distribution** keeps accepted connections on same epoll, returns via round-robin - **Connection distribution** keeps accepted connections on same epoll, returns via round-robin
- **Factory pattern construction** via `Server::create()` ensures proper shared_ptr semantics - **Factory pattern construction** via `Server::create()` ensures you can only get a `Ref<Server>`
- **Safe shutdown mechanism** with async-signal-safe shutdown() method - **Safe shutdown mechanism** with async-signal-safe shutdown() method
- **Connection ownership management** with automatic cleanup on server destruction - **Connection ownership management** with automatic cleanup on server destruction
- **Pluggable protocol handlers** via ConnectionHandler interface - **Pluggable protocol handlers** via ConnectionHandler interface
- **EPOLL_EXCLUSIVE** on listen socket across all epoll instances prevents thundering herd - **EPOLL_EXCLUSIVE** on listen socket across all epoll instances prevents thundering herd
**Connection** (`src/connection.{hpp,cpp}`): **Connection** (`src/connection.{hpp,cpp}`):
- **Efficient per-connection state management** with arena-based memory allocation - **Efficient per-connection state management** with arena-based memory allocation
- **Safe ownership transfer** between server threads and protocol handlers - **Safe ownership transfer** between server threads and protocol handlers
- **Automatic cleanup** on connection closure or server shutdown - **Automatic cleanup** on connection closure or server shutdown
@@ -124,6 +146,7 @@ Ultra-fast memory allocator optimized for request/response patterns:
- **Protocol-specific data:** `user_data` `void*` for custom handler data - **Protocol-specific data:** `user_data` `void*` for custom handler data
**ConnectionHandler Interface** (`src/connection_handler.hpp`): **ConnectionHandler Interface** (`src/connection_handler.hpp`):
- **Abstract protocol interface** decoupling networking from application logic - **Abstract protocol interface** decoupling networking from application logic
- **Ownership transfer support** allowing handlers to take connections for async processing - **Ownership transfer support** allowing handlers to take connections for async processing
- **Streaming data processing** with partial message handling - **Streaming data processing** with partial message handling
@@ -141,6 +164,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
#### **Parsing Layer** #### **Parsing Layer**
**JSON Commit Request Parser** (`src/json_commit_request_parser.{hpp,cpp}`): **JSON Commit Request Parser** (`src/json_commit_request_parser.{hpp,cpp}`):
- **High-performance JSON parser** using `weaseljson` library - **High-performance JSON parser** using `weaseljson` library
- **Streaming parser support** for incremental parsing of network data - **Streaming parser support** for incremental parsing of network data
- **gperf-optimized token recognition** for fast JSON key parsing - **gperf-optimized token recognition** for fast JSON key parsing
@@ -150,6 +174,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
- **Zero hash collisions** for known JSON tokens eliminates branching - **Zero hash collisions** for known JSON tokens eliminates branching
**Parser Interface** (`src/commit_request_parser.hpp`): **Parser Interface** (`src/commit_request_parser.hpp`):
- **Abstract base class** for commit request parsers - **Abstract base class** for commit request parsers
- **Format-agnostic parsing interface** supporting multiple serialization formats - **Format-agnostic parsing interface** supporting multiple serialization formats
- **Streaming and one-shot parsing modes** - **Streaming and one-shot parsing modes**
@@ -158,6 +183,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
#### **Data Model** #### **Data Model**
**Commit Request Data Model** (`src/commit_request.hpp`): **Commit Request Data Model** (`src/commit_request.hpp`):
- **Format-agnostic data structure** for representing transactional commits - **Format-agnostic data structure** for representing transactional commits
- **Arena-backed string storage** with efficient memory management - **Arena-backed string storage** with efficient memory management
- **Move-only semantics** for optimal performance - **Move-only semantics** for optimal performance
@@ -167,18 +193,21 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
#### **Metrics System** (`src/metric.{hpp,cpp}`) #### **Metrics System** (`src/metric.{hpp,cpp}`)
**High-Performance Metrics Implementation:** **High-Performance Metrics Implementation:**
- **Thread-local counters/histograms** with single writer for performance - **Thread-local counters/histograms** with single writer for performance
- **Global gauges** with lock-free atomic CAS operations for multi-writer scenarios - **Global gauges** with lock-free atomic CAS operations for multi-writer scenarios
- **SIMD-optimized histogram bucket updates** using AVX instructions for high throughput - **SIMD-optimized histogram bucket updates** using AVX instructions for high throughput
- **Arena allocator integration** for efficient memory management during rendering - **Arena allocator integration** for efficient memory management during rendering
**Threading Model:** **Threading Model:**
- **Counters**: Per-thread storage, single writer, atomic write in `Counter::inc()`, atomic read in render thread - **Counters**: Per-thread storage, single writer, atomic write in `Counter::inc()`, atomic read in render thread
- **Histograms**: Per-thread storage, single writer, per-histogram mutex serializes all access (observe and render) - **Histograms**: Per-thread storage, single writer, per-histogram mutex serializes all access (observe and render)
- **Gauges**: Lock-free atomic operations using `std::bit_cast` for double precision - **Gauges**: Lock-free atomic operations using `std::bit_cast` for double precision
- **Thread cleanup**: Automatic accumulation of thread-local state into global state on destruction - **Thread cleanup**: Automatic accumulation of thread-local state into global state on destruction
**Prometheus Compatibility:** **Prometheus Compatibility:**
- **Standard metric types** with proper label handling and validation - **Standard metric types** with proper label handling and validation
- **Bucket generation helpers** for linear/exponential histogram distributions - **Bucket generation helpers** for linear/exponential histogram distributions
- **Callback-based metrics** for dynamic values - **Callback-based metrics** for dynamic values
@@ -187,6 +216,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
#### **Configuration & Optimization** #### **Configuration & Optimization**
**Configuration System** (`src/config.{hpp,cpp}`): **Configuration System** (`src/config.{hpp,cpp}`):
- **TOML-based configuration** using `toml11` library - **TOML-based configuration** using `toml11` library
- **Structured configuration** with server, commit, and subscription sections - **Structured configuration** with server, commit, and subscription sections
- **Default fallback values** for all configuration options - **Default fallback values** for all configuration options
@@ -194,6 +224,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
- See `config.md` for complete configuration documentation - See `config.md` for complete configuration documentation
**JSON Token Optimization** (`src/json_tokens.gperf`, `src/json_token_enum.hpp`): **JSON Token Optimization** (`src/json_tokens.gperf`, `src/json_token_enum.hpp`):
- **Perfect hash table** generated by gperf for O(1) JSON key lookup - **Perfect hash table** generated by gperf for O(1) JSON key lookup
- **Compile-time token enumeration** for type-safe key identification - **Compile-time token enumeration** for type-safe key identification
- **Minimal perfect hash** reduces memory overhead and improves cache locality - **Minimal perfect hash** reduces memory overhead and improves cache locality
@@ -202,6 +233,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
### Transaction Data Model ### Transaction Data Model
#### CommitRequest Structure #### CommitRequest Structure
``` ```
CommitRequest { CommitRequest {
- request_id: Optional unique identifier - request_id: Optional unique identifier
@@ -220,48 +252,54 @@ CommitRequest {
### Memory Management Model ### Memory Management Model
#### Connection Ownership Lifecycle #### Connection Ownership Lifecycle
1. **Creation**: Accept threads create connections, transfer to epoll as raw pointers
2. **Processing**: Network threads claim ownership by wrapping in unique_ptr 1. **Creation**: Server creates connections and stores them in registry
3. **Handler Transfer**: Handlers can take ownership for async processing via unique_ptr.release() 1. **Processing**: I/O threads access connections via registry lookup
4. **Return Path**: Handlers use Server::release_back_to_server() to return connections 1. **Handler Access**: Handlers receive Connection& references, server retains ownership
5. **Safety**: All transfers use weak_ptr to server for safe cleanup 1. **Async Processing**: Handlers use WeakRef<Connection> for safe async access
6. **Cleanup**: RAII ensures proper resource cleanup in all scenarios 1. **Safety**: Connection mutex synchronizes concurrent access between threads
1. **Cleanup**: RAII ensures proper resource cleanup when connections are destroyed
#### Arena Memory Lifecycle #### Arena Memory Lifecycle
1. **Request Processing**: Handler uses `conn->get_arena()` to allocate memory for parsing request data
2. **Response Generation**: Handler uses arena for temporary response construction (headers, JSON, etc.) 1. **Request Processing**: Handler creates request-scoped arena for parsing request data
3. **Response Queuing**: Handler calls `conn->append_message()` which copies data to arena-backed message queue 1. **Response Generation**: Handler uses same arena for response construction (headers, JSON, etc.)
4. **Response Writing**: Server writes all queued messages to socket via `writeBytes()` 1. **Response Queuing**: Handler calls `conn->append_message()` passing span + arena ownership
1. **Response Writing**: I/O thread writes messages to socket, arena freed after completion
> **Note**: Call `conn->reset()` periodically to reclaim arena memory. Best practice is after all outgoing bytes have been written. > **Note**: Call `conn->reset()` periodically to reclaim arena memory. Best practice is after all outgoing bytes have been written.
#### Threading Model and EPOLLONESHOT #### Threading Model and Server-Owned Connections
**EPOLLONESHOT Design Rationale:** **Server-Owned Connection Design:**
WeaselDB uses `EPOLLONESHOT` for all connection file descriptors to enable safe multi-threaded ownership transfer without complex synchronization: WeaselDB uses a server-owned connection model where the server retains ownership of all connections while providing safe concurrent access to handlers:
**Key Benefits:** **Key Benefits:**
1. **Automatic fd disarming** - When epoll triggers an event, the fd is automatically removed from epoll monitoring
2. **Race-free ownership transfer** - Handlers can safely take connection ownership and move to other threads 1. **Simplified ownership** - Server always owns connections, eliminating complex ownership transfers
3. **Zero-coordination async processing** - No manual synchronization needed between network threads and handler threads 1. **Safe concurrent access** - Connection mutexes synchronize access between I/O threads and handlers
1. **WeakRef pattern** - Handlers use WeakRef<Connection> for safe async processing without ownership
**Threading Flow:** **Threading Flow:**
1. **Event Trigger**: Network thread gets epoll event → connection auto-disarmed via ONESHOT
2. **Safe Transfer**: Handler can take ownership (`std::move(conn_ptr)`) with no epoll interference
3. **Async Processing**: Connection processed on handler thread while epoll cannot trigger spurious events
4. **Return & Re-arm**: `Server::receiveConnectionBack()` re-arms fd with `epoll_ctl(EPOLL_CTL_MOD)`
**Performance Trade-off:** 1. **Event Trigger**: Network thread gets epoll event and processes data
- **Cost**: One `epoll_ctl(MOD)` syscall per connection return (~100-200ns) 1. **Handler Invocation**: Handler receives Connection& reference - server retains ownership
- **Benefit**: Eliminates complex thread synchronization and prevents race conditions 1. **Async Processing**: Handler obtains WeakRef<Connection> for safe background processing
- **Alternative cost**: Manual `EPOLL_CTL_DEL`/`ADD` + locking would be significantly higher 1. **Connection Cleanup**: Server manages connection lifecycle including file descriptor operations
**Without EPOLLONESHOT risks:** **Performance Benefits:**
- Multiple threads processing same fd simultaneously
- Use-after-move when network thread accesses transferred connection
- Complex synchronization between epoll events and ownership transfers
This design enables the async handler pattern where connections can be safely moved between threads for background processing while maintaining high performance and thread safety. - **Reduced syscalls**: Eliminates epoll_ctl(MOD) calls needed for ownership transfer
- **Simplified synchronization**: Connection mutexes provide clear concurrent access patterns
- **Memory efficiency**: No unique_ptr overhead for ownership management
**Safe Async Processing:**
- WeakRef<Connection> prevents use-after-free in background threads
- Connection mutex ensures thread-safe access to connection state
- Server handles all file descriptor management automatically
This design provides high performance concurrent processing while maintaining thread safety through clear ownership boundaries and synchronization primitives.
### API Endpoints ### API Endpoints
@@ -270,14 +308,14 @@ The system implements a RESTful API. See [api.md](api.md) for comprehensive API
### Design Principles ### Design Principles
1. **Performance-first** - Every component optimized for high throughput 1. **Performance-first** - Every component optimized for high throughput
2. **Scalable concurrency** - Multiple epoll instances eliminate kernel contention 1. **Scalable concurrency** - Multiple epoll instances eliminate kernel contention
3. **Memory efficiency** - Arena allocation eliminates fragmentation 1. **Memory efficiency** - Arena allocation eliminates fragmentation
4. **Efficient copying** - Minimize unnecessary copies while accepting required ones 1. **Efficient copying** - Minimize unnecessary copies while accepting required ones
5. **Streaming-ready** - Support incremental processing 1. **Streaming-ready** - Support incremental processing
6. **Type safety** - Compile-time validation where possible 1. **Type safety** - Compile-time validation where possible
7. **Resource management** - RAII and move semantics throughout 1. **Resource management** - RAII and move semantics throughout
--- ______________________________________________________________________
## Development Guidelines ## Development Guidelines
@@ -289,13 +327,13 @@ See [style.md](style.md) for comprehensive C++ coding standards and conventions.
- **Server Creation**: Always use `Server::create()` factory method - direct construction is impossible - **Server Creation**: Always use `Server::create()` factory method - direct construction is impossible
- **Connection Creation**: Only the Server can create connections - no public constructor or factory method - **Connection Creation**: Only the Server can create connections - no public constructor or factory method
- **Connection Ownership**: Use unique_ptr semantics for safe ownership transfer between components - **Connection Ownership**: Server retains ownership, handlers use Connection& references
- **Arena Allocator Pattern**: Always use `Arena` for temporary allocations within request processing - **Arena Allocator Pattern**: Always use `Arena` for temporary allocations within request processing
- **String View Usage**: Prefer `std::string_view` over `std::string` when pointing to arena-allocated memory - **String View Usage**: Prefer `std::string_view` over `std::string` when pointing to arena-allocated memory
- **Ownership Transfer**: Use `Server::release_back_to_server()` for returning connections to server from handlers - **Async Processing**: Use `conn.get_weak_ref()` for safe background processing without ownership
- **JSON Token Lookup**: Use the gperf-generated perfect hash table in `json_tokens.hpp` for O(1) key recognition - **JSON Token Lookup**: Use the gperf-generated perfect hash table in `json_tokens.hpp` for O(1) key recognition
- **Base64 Handling**: Always use simdutf for base64 encoding/decoding for performance - **Base64 Handling**: Always use simdutf for base64 encoding/decoding for performance
- **Thread Safety**: Connection ownership transfers are designed to be thread-safe with proper RAII cleanup - **Thread Safety**: Connection mutexes provide safe concurrent access between threads
### Project Structure ### Project Structure
@@ -308,20 +346,22 @@ See [style.md](style.md) for comprehensive C++ coding standards and conventions.
### Extension Points ### Extension Points
#### Adding New Protocol Handlers #### Adding New Protocol Handlers
1. Inherit from `ConnectionHandler` in `src/connection_handler.hpp` 1. Inherit from `ConnectionHandler` in `src/connection_handler.hpp`
2. Implement `on_data_arrived()` with proper ownership semantics 1. Implement `on_data_arrived()` using Connection& reference parameter
3. Use connection's arena allocator for temporary allocations: `conn->get_arena()` 1. Use connection's arena allocator for temporary allocations: `conn.get_arena()`
4. Handle partial messages and streaming protocols appropriately 1. Handle partial messages and streaming protocols appropriately
5. Use `Server::release_back_to_server()` if taking ownership for async processing 1. Use `conn.get_weak_ref()` for safe async processing without ownership transfer
6. Add corresponding test cases and integration tests 1. Add corresponding test cases and integration tests
7. Consider performance implications of ownership transfers 1. Consider performance implications of concurrent access patterns
#### Adding New Parsers #### Adding New Parsers
1. Inherit from `CommitRequestParser` in `src/commit_request_parser.hpp` 1. Inherit from `CommitRequestParser` in `src/commit_request_parser.hpp`
2. Implement both streaming and one-shot parsing modes 1. Implement both streaming and one-shot parsing modes
3. Use arena allocation for all temporary string storage 1. Use arena allocation for all temporary string storage
4. Add corresponding test cases in `tests/` 1. Add corresponding test cases in `tests/`
5. Add benchmark comparisons in `benchmarks/` 1. Add benchmark comparisons in `benchmarks/`
### Performance Guidelines ### Performance Guidelines
@@ -329,6 +369,7 @@ See [style.md](style.md) for comprehensive C++ coding standards and conventions.
- **CPU**: Perfect hashing and SIMD operations are critical paths - avoid alternatives - **CPU**: Perfect hashing and SIMD operations are critical paths - avoid alternatives
- **I/O**: Streaming parser design supports incremental network data processing - **I/O**: Streaming parser design supports incremental network data processing
- **Cache**: String views avoid copying, keeping data cache-friendly - **Cache**: String views avoid copying, keeping data cache-friendly
- **Pipeline**: Serial stages must never block - only parallel release stage can take locks
### Configuration & Testing ### Configuration & Testing
@@ -337,13 +378,14 @@ See [style.md](style.md) for comprehensive C++ coding standards and conventions.
- **Build System**: CMake generates gperf hash tables at build time - **Build System**: CMake generates gperf hash tables at build time
- **Testing Guidelines**: See [style.md](style.md) for comprehensive testing standards including synchronization rules - **Testing Guidelines**: See [style.md](style.md) for comprehensive testing standards including synchronization rules
--- ______________________________________________________________________
## Common Patterns ## Common Patterns
### Factory Method Patterns ### Factory Method Patterns
#### Server Creation #### Server Creation
```cpp ```cpp
// Server must be created via factory method // Server must be created via factory method
auto server = Server::create(config, handler); auto server = Server::create(config, handler);
@@ -354,59 +396,51 @@ auto server = Server::create(config, handler);
``` ```
#### Connection Creation (Server-Only) #### Connection Creation (Server-Only)
```cpp
// Only Server can create connections (using private friend method)
class Server {
private:
auto conn = Connection::createForServer(addr, fd, id, handler, weak_from_this());
};
// No public way to create connections - all these fail: Only Server can create connections (using private constructor via friend access)
// auto conn = Connection::create(...); // ERROR: no such method
// Connection conn(addr, fd, id, handler, server); // ERROR: private constructor
// auto conn = std::make_unique<Connection>(...); // ERROR: private constructor
```
### ConnectionHandler Implementation Patterns ### ConnectionHandler Implementation Patterns
#### Simple Synchronous Handler #### Simple Synchronous Handler
```cpp ```cpp
class HttpHandler : public ConnectionHandler { class HttpHandler : ConnectionHandler {
public: public:
void on_data_arrived(std::string_view data, std::unique_ptr<Connection>& conn_ptr) override { void on_data_arrived(std::string_view data, Connection& conn) override {
// Parse HTTP request using connection's arena // Parse HTTP request using connection's arena
Arena& arena = conn_ptr->get_arena(); Arena& arena = conn.get_arena();
// Generate response // Generate response
conn_ptr->append_message("HTTP/1.1 200 OK\r\n\r\nHello World"); conn.append_message("HTTP/1.1 200 OK\r\n\r\nHello World");
// Server retains ownership // Server retains ownership
} }
}; };
``` ```
#### Async Handler with Ownership Transfer #### Async Handler with WeakRef
```cpp ```cpp
class AsyncHandler : public ConnectionHandler { class AsyncHandler : ConnectionHandler {
public: public:
void on_data_arrived(std::string_view data, std::unique_ptr<Connection>& conn_ptr) override { void on_data_arrived(std::string_view data, Connection& conn) override {
// Take ownership for async processing // Get weak reference for async processing
auto connection = std::move(conn_ptr); // conn_ptr is now null auto weak_conn = conn.get_weak_ref();
work_queue.push([connection = std::move(connection)](std::string_view data) mutable { work_queue.push([weak_conn, data = std::string(data)]() {
// Process asynchronously // Process asynchronously - connection may be closed
connection->append_message("Async response"); if (auto conn_ref = weak_conn.lock()) {
conn_ref->append_message("Async response");
// Return ownership to server when done }
Server::release_back_to_server(std::move(connection));
}); });
} }
}; };
``` ```
#### Batching Handler with User Data #### Batching Handler with User Data
```cpp ```cpp
class BatchingHandler : public ConnectionHandler { class BatchingHandler : ConnectionHandler {
public: public:
void on_connection_established(Connection &conn) override { void on_connection_established(Connection &conn) override {
// Allocate some protocol-specific data and attach it to the connection // Allocate some protocol-specific data and attach it to the connection
@@ -418,21 +452,20 @@ public:
delete static_cast<MyProtocolData*>(conn.user_data); delete static_cast<MyProtocolData*>(conn.user_data);
} }
void on_data_arrived(std::string_view data, void on_data_arrived(std::string_view data, Connection& conn) override {
std::unique_ptr<Connection> &conn_ptr) override {
// Process data and maybe store some results in the user_data // Process data and maybe store some results in the user_data
auto* proto_data = static_cast<MyProtocolData*>(conn_ptr->user_data); auto* proto_data = static_cast<MyProtocolData*>(conn.user_data);
proto_data->process(data); proto_data->process(data);
} }
void on_batch_complete(std::span<std::unique_ptr<Connection>> batch) override { void on_batch_complete(std::span<Connection *const> batch) override {
// Process a batch of connections // Process a batch of connections
for (auto& conn_ptr : batch) { for (auto* conn : batch) {
if (conn_ptr) { if (conn) {
auto* proto_data = static_cast<MyProtocolData*>(conn_ptr->user_data); auto* proto_data = static_cast<MyProtocolData*>(conn->user_data);
if (proto_data->is_ready()) { if (proto_data->is_ready()) {
// This connection is ready for the next stage, move it to the pipeline // This connection is ready for the next stage, get weak ref for pipeline
pipeline_.push(std::move(conn_ptr)); pipeline_.push(conn->get_weak_ref());
} }
} }
} }
@@ -444,20 +477,21 @@ private:
``` ```
#### Streaming "yes" Handler #### Streaming "yes" Handler
```cpp ```cpp
class YesHandler : public ConnectionHandler { class YesHandler : ConnectionHandler {
public: public:
void on_connection_established(Connection &conn) override { void on_connection_established(Connection &conn) override {
// Write an initial "y\n" // Write an initial "y\n"
conn.append_message("y\n"); conn.append_message("y\n");
} }
void on_write_progress(std::unique_ptr<Connection> &conn) override { void on_write_progress(Connection &conn) override {
if (conn->outgoingBytesQueued() == 0) { if (conn.outgoing_bytes_queued() == 0) {
// Don't use an unbounded amount of memory // Don't use an unbounded amount of memory
conn->reset(); conn.reset();
// Write "y\n" repeatedly // Write "y\n" repeatedly
conn->append_message("y\n"); conn.append_message("y\n");
} }
} }
}; };
@@ -466,6 +500,7 @@ public:
### Memory Management Patterns ### Memory Management Patterns
#### Arena-Based String Handling #### Arena-Based String Handling
```cpp ```cpp
// Preferred: String view with arena allocation to minimize copying // Preferred: String view with arena allocation to minimize copying
std::string_view process_json_key(const char* data, Arena& arena); std::string_view process_json_key(const char* data, Arena& arena);
@@ -474,24 +509,26 @@ std::string_view process_json_key(const char* data, Arena& arena);
std::string process_json_key(const char* data); std::string process_json_key(const char* data);
``` ```
#### Safe Connection Ownership Transfer #### Safe Async Connection Processing
```cpp ```cpp
// In handler - take ownership for background processing // In handler - get weak reference for background processing
Connection* raw_conn = conn_ptr.release(); auto weak_conn = conn.get_weak_ref();
// Process on worker thread // Process on worker thread
background_processor.submit([raw_conn]() { background_processor.submit([weak_conn]() {
// Do work... // Do work...
raw_conn->append_message("Background result"); if (auto conn_ref = weak_conn.lock()) {
conn_ref->append_message("Background result");
// Return to server safely (handles server destruction) }
Server::release_back_to_server(std::unique_ptr<Connection>(raw_conn)); // Connection automatically cleaned up by server
}); });
``` ```
### Data Construction Patterns ### Data Construction Patterns
#### Builder Pattern Usage #### Builder Pattern Usage
```cpp ```cpp
CommitRequest request = CommitRequestBuilder(arena) CommitRequest request = CommitRequestBuilder(arena)
.request_id("example-id") .request_id("example-id")
@@ -501,41 +538,47 @@ CommitRequest request = CommitRequestBuilder(arena)
``` ```
#### Error Handling Pattern #### Error Handling Pattern
```cpp ```cpp
enum class ParseResult { Success, InvalidJson, MissingField }; enum class ParseResult { Success, InvalidJson, MissingField };
ParseResult parse_commit_request(const char* json, CommitRequest& out); ParseResult parse_commit_request(const char* json, CommitRequest& out);
``` ```
--- ______________________________________________________________________
## Reference ## Reference
### Build Targets ### Build Targets
**Test Executables:** **Test Executables:**
- `test_arena` - Arena allocator functionality tests - `test_arena` - Arena allocator functionality tests
- `test_commit_request` - JSON parsing and validation tests - `test_commit_request` - JSON parsing and validation tests
- `test_metric` - Metrics system functionality tests - `test_metric` - Metrics system functionality tests
- Main server executable (compiled from `src/main.cpp`) - Main server executable (compiled from `src/main.cpp`)
**Benchmark Executables:** **Benchmark Executables:**
- `bench_arena` - Arena allocator performance benchmarks - `bench_arena` - Arena allocator performance benchmarks
- `bench_commit_request` - JSON parsing performance benchmarks - `bench_commit_request` - JSON parsing performance benchmarks
- `bench_parser_comparison` - Comparison benchmarks vs nlohmann::json and RapidJSON - `bench_parser_comparison` - Comparison benchmarks vs nlohmann::json and RapidJSON
- `bench_metric` - Metrics system performance benchmarks - `bench_metric` - Metrics system performance benchmarks
**Debug Tools:** **Debug Tools:**
- `debug_arena` - Debug tool for arena allocator analysis - `debug_arena` - Debug tool for arena allocator analysis
### Performance Characteristics ### Performance Characteristics
**Memory Allocation:** **Memory Allocation:**
- **~1ns allocation time** vs standard allocators - **~1ns allocation time** vs standard allocators
- **Bulk deallocation** eliminates individual free() calls - **Bulk deallocation** eliminates individual free() calls
- **Optimized geometric growth** uses current block size for doubling strategy - **Optimized geometric growth** uses current block size for doubling strategy
- **Alignment-aware** allocation prevents performance penalties - **Alignment-aware** allocation prevents performance penalties
**JSON Parsing:** **JSON Parsing:**
- **Streaming parser** handles large payloads efficiently - **Streaming parser** handles large payloads efficiently
- **Incremental processing** suitable for network protocols - **Incremental processing** suitable for network protocols
- **Arena storage** eliminates string allocation overhead - **Arena storage** eliminates string allocation overhead

View File

@@ -16,7 +16,7 @@ The persistence thread receives commit batches from the main processing pipeline
The persistence thread collects commits into batches using two trigger conditions: The persistence thread collects commits into batches using two trigger conditions:
1. **Time Trigger**: `batch_timeout_ms` elapsed since batch collection started 1. **Time Trigger**: `batch_timeout_ms` elapsed since batch collection started
2. **Size Trigger**: `batch_size_threshold` commits collected (can be exceeded by final commit) 1. **Size Trigger**: `batch_size_threshold` commits collected (can be exceeded by final commit)
**Flow Control**: When `max_in_flight_requests` reached, block until responses received. Batches in retry backoff count toward the in-flight limit, creating natural backpressure during failures. **Flow Control**: When `max_in_flight_requests` reached, block until responses received. Batches in retry backoff count toward the in-flight limit, creating natural backpressure during failures.
@@ -25,10 +25,12 @@ The persistence thread collects commits into batches using two trigger condition
### 1. Batch Collection ### 1. Batch Collection
**No In-Flight Requests** (no I/O to pump): **No In-Flight Requests** (no I/O to pump):
- Use blocking acquire to get first commit batch (can afford to wait) - Use blocking acquire to get first commit batch (can afford to wait)
- Process immediately (no batching delay) - Process immediately (no batching delay)
**With In-Flight Requests** (I/O to pump in event loop): **With In-Flight Requests** (I/O to pump in event loop):
- Check flow control: if at `max_in_flight_requests`, block for responses - Check flow control: if at `max_in_flight_requests`, block for responses
- Collect commits using non-blocking acquire until trigger condition: - Collect commits using non-blocking acquire until trigger condition:
- Check for available commits (non-blocking) - Check for available commits (non-blocking)
@@ -97,9 +99,10 @@ The persistence thread collects commits into batches using two trigger condition
## Configuration Validation ## Configuration Validation
**Required Constraints**: **Required Constraints**:
- `batch_size_threshold` > 0 (must process at least one commit per batch) - `batch_size_threshold` > 0 (must process at least one commit per batch)
- `max_in_flight_requests` > 0 (must allow at least one concurrent request) - `max_in_flight_requests` > 0 (must allow at least one concurrent request)
- `max_in_flight_requests` <= 1000 (required for single-call recovery guarantee) - `max_in_flight_requests` \<= 1000 (required for single-call recovery guarantee)
- `batch_timeout_ms` > 0 (timeout must be positive) - `batch_timeout_ms` > 0 (timeout must be positive)
- `max_retry_attempts` >= 0 (zero disables retries) - `max_retry_attempts` >= 0 (zero disables retries)
- `retry_base_delay_ms` > 0 (delay must be positive if retries enabled) - `retry_base_delay_ms` > 0 (delay must be positive if retries enabled)
@@ -123,16 +126,19 @@ WeaselDB's batched persistence design enables efficient recovery while maintaini
WeaselDB uses a **sequential batch numbering** scheme with **S3 atomic operations** to provide efficient crash recovery and split-brain prevention without external coordination services. WeaselDB uses a **sequential batch numbering** scheme with **S3 atomic operations** to provide efficient crash recovery and split-brain prevention without external coordination services.
**Batch Numbering Scheme**: **Batch Numbering Scheme**:
- Batch numbers start at `2^64 - 1` and count downward: `18446744073709551615, 18446744073709551614, 18446744073709551613, ...` - Batch numbers start at `2^64 - 1` and count downward: `18446744073709551615, 18446744073709551614, 18446744073709551613, ...`
- Each batch is stored as S3 object `batches/{batch_number:020d}` with zero-padding - Each batch is stored as S3 object `batches/{batch_number:020d}` with zero-padding
- S3 lexicographic ordering on zero-padded numbers returns batches in ascending numerical order (latest batches first) - S3 lexicographic ordering on zero-padded numbers returns batches in ascending numerical order (latest batches first)
**Terminology**: Since batch numbers decrease over time, we use numerical ordering: **Terminology**: Since batch numbers decrease over time, we use numerical ordering:
- "Older" batches = higher numbers (written first in time) - "Older" batches = higher numbers (written first in time)
- "Newer" batches = lower numbers (written more recently) - "Newer" batches = lower numbers (written more recently)
- "Most recent" batches = lowest numbers (most recently written) - "Most recent" batches = lowest numbers (most recently written)
**Example**: If batches 100, 99, 98, 97 are written, S3 LIST returns them as: **Example**: If batches 100, 99, 98, 97 are written, S3 LIST returns them as:
``` ```
batches/00000000000000000097 (newest, lowest batch number) batches/00000000000000000097 (newest, lowest batch number)
batches/00000000000000000098 batches/00000000000000000098
@@ -142,6 +148,7 @@ batches/00000000000000000100 (oldest, highest batch number)
``` ```
**Leadership and Split-Brain Prevention**: **Leadership and Split-Brain Prevention**:
- New persistence thread instances scan S3 to find the highest (oldest) available batch number - New persistence thread instances scan S3 to find the highest (oldest) available batch number
- Each batch write uses `If-None-Match="*"` to atomically claim the sequential batch number - Each batch write uses `If-None-Match="*"` to atomically claim the sequential batch number
- Only one instance can successfully claim each batch number, preventing split-brain scenarios - Only one instance can successfully claim each batch number, preventing split-brain scenarios
@@ -150,28 +157,32 @@ batches/00000000000000000100 (oldest, highest batch number)
**Recovery Scenarios**: **Recovery Scenarios**:
**Clean Shutdown**: **Clean Shutdown**:
- All in-flight batches are drained to completion before termination - All in-flight batches are drained to completion before termination
- Durability watermark accurately reflects all durable state - Durability watermark accurately reflects all durable state
- No recovery required on restart - No recovery required on restart
**Crash Recovery**: **Crash Recovery**:
1. **S3 Scan with Bounded Cost**: List S3 objects with prefix `batches/` and limit of 1000 objects 1. **S3 Scan with Bounded Cost**: List S3 objects with prefix `batches/` and limit of 1000 objects
2. **Gap Detection**: Check for missing sequential batch numbers. WeaselDB never puts more than 1000 batches in flight concurrently, so a limit of 1000 is sufficient. 1. **Gap Detection**: Check for missing sequential batch numbers. WeaselDB never puts more than 1000 batches in flight concurrently, so a limit of 1000 is sufficient.
3. **Watermark Reconstruction**: Set durability watermark to the latest consecutive batch (scanning from highest numbers downward, until a gap) 1. **Watermark Reconstruction**: Set durability watermark to the latest consecutive batch (scanning from highest numbers downward, until a gap)
4. **Leadership Transition**: Begin writing batches starting from next available batch number. Skip past any batch numbers already claimed in the durability watermark scan. 1. **Leadership Transition**: Begin writing batches starting from next available batch number. Skip past any batch numbers already claimed in the durability watermark scan.
**Bounded Recovery Guarantee**: Since at most 1000 batches can be in-flight during a crash, any gap in the sequential numbering (indicating the durability watermark) must appear within the first 1000 S3 objects. This is because: **Bounded Recovery Guarantee**: Since at most 1000 batches can be in-flight during a crash, any gap in the sequential numbering (indicating the durability watermark) must appear within the first 1000 S3 objects. This is because:
1. At most 1000 batches can be incomplete when crash occurs 1. At most 1000 batches can be incomplete when crash occurs
2. S3 LIST returns objects in ascending numerical order (most recent batches first due to countdown numbering) 1. S3 LIST returns objects in ascending numerical order (most recent batches first due to countdown numbering)
3. The first gap found represents the boundary between durable and potentially incomplete batches 1. The first gap found represents the boundary between durable and potentially incomplete batches
4. S3 LIST operations have a maximum limit of 1000 objects per request 1. S3 LIST operations have a maximum limit of 1000 objects per request
5. Therefore, scanning 1000 objects (the maximum S3 allows in one request) is sufficient to find this boundary 1. Therefore, scanning 1000 objects (the maximum S3 allows in one request) is sufficient to find this boundary
This ensures **O(1) recovery time** regardless of database size, with at most **one S3 LIST operation** required. This ensures **O(1) recovery time** regardless of database size, with at most **one S3 LIST operation** required.
**Recovery Protocol Detail**: Even with exactly 1000 batches in-flight, recovery works correctly: **Recovery Protocol Detail**: Even with exactly 1000 batches in-flight, recovery works correctly:
**Example Scenario**: Batches 2000 down to 1001 (1000 batches) are in-flight when crash occurs **Example Scenario**: Batches 2000 down to 1001 (1000 batches) are in-flight when crash occurs
- Previous successful run had written through batch 2001 - Previous successful run had written through batch 2001
- Worst case: batch 2000 (oldest in-flight) fails, batches 1999 down to 1001 (newer) all succeed - Worst case: batch 2000 (oldest in-flight) fails, batches 1999 down to 1001 (newer) all succeed
- S3 LIST(limit=1000) returns: 1001, 1002, ..., 1998, 1999, 2001 (ascending numerical order) - S3 LIST(limit=1000) returns: 1001, 1002, ..., 1998, 1999, 2001 (ascending numerical order)

View File

@@ -1,39 +1,10 @@
#include "arena.hpp" #include "arena.hpp"
#include <cassert> #include <cassert>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
#include <vector> #include <vector>
// Destructor: free every block by walking the intrusive prev-linked list
// from the most recently allocated block back to the first.
Arena::~Arena() {
  while (current_block_) {
    Block *prev = current_block_->prev;
    std::free(current_block_);
    current_block_ = prev;
  }
}
// Move constructor: steal the entire block list. The moved-from Arena is
// left with current_block_ == nullptr, i.e. a valid empty arena that is
// safe to use or destroy.
Arena::Arena(Arena &&other) noexcept
    : initial_block_size_(other.initial_block_size_),
      current_block_(other.current_block_) {
  other.current_block_ = nullptr;
}
// Move assignment: free our own blocks first, then take ownership of the
// other arena's list. Self-assignment is a no-op; the moved-from Arena is
// left empty (current_block_ == nullptr) and remains valid.
Arena &Arena::operator=(Arena &&other) noexcept {
  if (this != &other) {
    // Release all blocks currently owned by this arena.
    while (current_block_) {
      Block *prev = current_block_->prev;
      std::free(current_block_);
      current_block_ = prev;
    }
    initial_block_size_ = other.initial_block_size_;
    current_block_ = other.current_block_;
    other.current_block_ = nullptr;
  }
  return *this;
}
void Arena::reset() { void Arena::reset() {
if (!current_block_) { if (!current_block_) {
return; return;

View File

@@ -59,27 +59,24 @@
* *
* ### Safe Usage Patterns in WeaselDB: * ### Safe Usage Patterns in WeaselDB:
* - **Per-Connection Instances**: Each Connection owns its own Arena * - **Per-Connection Instances**: Each Connection owns its own Arena
* instance, accessed only by the thread that currently owns the connection * instance, accessed by its io thread
* - **Single Owner Principle**: Connection ownership transfers atomically * - **Server Ownership**: Server retains connection ownership, handlers access
* between threads using unique_ptr, ensuring only one thread accesses the arena * arenas through Connection& references with proper mutex protection
* at a time
* *
* ### Thread Ownership Model: * ### Thread Ownership Model:
* 1. **Network Thread**: Claims connection ownership, accesses arena for I/O * 1. **I/O Thread**: Server owns connections, processes socket I/O events
* buffers * 2. **Handler Thread**: Receives Connection& reference, creates request-scoped
* 2. **Handler Thread**: Can take ownership via unique_ptr.release(), uses * arenas for parsing and response generation
* arena for request parsing and response generation * 3. **Pipeline Thread**: Can use WeakRef<Connection> for async processing,
* 3. **Background Thread**: Can receive ownership for async processing, uses * creates own arenas for temporary data structures
* arena for temporary data structures * 4. **Arena Lifecycle**: Request-scoped arenas moved to message queue, freed
* 4. **Return Path**: Connection (and its arena) safely returned via * after I/O completion without holding connection mutex
* Server::release_back_to_server()
* *
* ### Why This Design is Thread-Safe: * ### Why This Design is Thread-Safe:
* - **Exclusive Access**: Only the current owner thread should access the arena * - **Request-Scoped**: Each request gets its own Arena instance for isolation
* - **Transfer Points**: Ownership transfers happen at well-defined * - **Move Semantics**: Arenas transferred via move, avoiding shared access
* synchronization points with proper memory barriers. * - **Deferred Cleanup**: Arena destruction deferred to avoid malloc contention
* - **No Shared State**: Each arena is completely isolated - no shared data * while holding connection mutex
* between different arena instances
* *
* @warning Do not share Arena instances between threads. Use separate * @warning Do not share Arena instances between threads. Use separate
* instances per thread or per logical unit of work. * instances per thread or per logical unit of work.
@@ -157,7 +154,13 @@ public:
* Traverses the intrusive linked list backwards from current_block_, * Traverses the intrusive linked list backwards from current_block_,
* freeing each block. This ensures no memory leaks. * freeing each block. This ensures no memory leaks.
*/ */
~Arena(); ~Arena() {
while (current_block_) {
Block *prev = current_block_->prev;
std::free(current_block_);
current_block_ = prev;
}
}
/// Copy construction is not allowed (would be expensive and error-prone) /// Copy construction is not allowed (would be expensive and error-prone)
Arena(const Arena &) = delete; Arena(const Arena &) = delete;
@@ -166,9 +169,21 @@ public:
/** /**
* @brief Move constructor - transfers ownership of all blocks. * @brief Move constructor - transfers ownership of all blocks.
* @param other The Arena to move from (will be left empty) *
* @param other The Arena to move from (will be left in a valid, empty state)
*
* @note Post-move state: The moved-from Arena is left in a valid state
* equivalent to a newly constructed Arena. All operations remain safe:
* - allocate_raw(), allocate(), construct() work normally
* - reset() is safe and well-defined (no-op on empty arena)
* - used_bytes(), total_bytes() return 0
* - Destructor is safe to call
*/ */
Arena(Arena &&other) noexcept; Arena(Arena &&other) noexcept
: initial_block_size_(other.initial_block_size_),
current_block_(other.current_block_) {
other.current_block_ = nullptr;
}
/** /**
* @brief Move assignment operator - transfers ownership of all blocks. * @brief Move assignment operator - transfers ownership of all blocks.
@@ -176,10 +191,31 @@ public:
* Frees any existing blocks in this allocator before taking ownership * Frees any existing blocks in this allocator before taking ownership
* of blocks from the other allocator. * of blocks from the other allocator.
* *
* @param other The Arena to move from (will be left empty) * @param other The Arena to move from (will be left in a valid, empty state)
* @return Reference to this allocator * @return Reference to this allocator
*
* @note Post-move state: The moved-from Arena is left in a valid state
* equivalent to a newly constructed Arena. All operations remain safe:
* - allocate_raw(), allocate(), construct() work normally
* - reset() is safe and well-defined (no-op on empty arena)
* - used_bytes(), total_bytes() return 0
* - Destructor is safe to call
*/ */
Arena &operator=(Arena &&other) noexcept; Arena &operator=(Arena &&other) noexcept {
if (this != &other) {
while (current_block_) {
Block *prev = current_block_->prev;
std::free(current_block_);
current_block_ = prev;
}
initial_block_size_ = other.initial_block_size_;
current_block_ = other.current_block_;
other.current_block_ = nullptr;
}
return *this;
}
/** /**
* @brief Allocate raw memory with the specified size and alignment. * @brief Allocate raw memory with the specified size and alignment.
@@ -429,6 +465,72 @@ public:
return static_cast<T *>(ptr); return static_cast<T *>(ptr);
} }
/**
 * @brief Allocate an uninitialized array of T and return it as std::span<T>.
 *
 * Bounds-safe alternative to raw pointer allocation: the returned span
 * carries its size, enabling debug-build bounds checking at call sites.
 *
 * @tparam T The element type (must be trivially destructible)
 * @param count Number of elements to allocate
 * @return std::span<T> covering the freshly allocated array
 *
 * ## Safety:
 * The span remains valid for the arena's lifetime, up to the next reset().
 *
 * ## Usage:
 * ```cpp
 * auto buffer = arena.allocate_span<char>(1024);
 * auto strings = arena.allocate_span<std::string_view>(10);
 * ```
 *
 * ## Note:
 * A count of 0 yields an empty span (nullptr, 0). Memory is allocated but
 * no objects are constructed.
 */
template <typename T> std::span<T> allocate_span(uint32_t count) {
  if (count == 0) {
    return {};
  }
  T *data = allocate<T>(count);
  return std::span<T>{data, count};
}
/**
 * @brief Copy string bytes into arena memory and return a string_view.
 *
 * Safely duplicates string data into arena-owned storage so the bytes stay
 * alive for the arena's lifetime, independent of the caller's buffer.
 *
 * @param str The string to duplicate into the arena
 * @return std::string_view over the arena-owned copy
 *
 * ## Safety:
 * The view remains valid for the arena's lifetime, up to the next reset().
 * No null terminator is appended; exactly str.size() bytes are allocated.
 *
 * ## Usage:
 * ```cpp
 * Arena arena;
 * std::string_view copy = arena.copy_string("Hello World");
 * std::string_view copy2 = arena.copy_string(some_string_view);
 * ```
 *
 * ## Note:
 * An empty input yields an empty string_view without allocating.
 */
std::string_view copy_string(std::string_view str) {
  if (str.empty()) {
    return {};
  }
  char *dst = allocate<char>(str.size());
  std::memcpy(dst, str.data(), str.size());
  return std::string_view{dst, str.size()};
}
/** /**
* @brief Reset the allocator to reuse the first block, freeing all others. * @brief Reset the allocator to reuse the first block, freeing all others.
* *

389
src/commit_pipeline.cpp Normal file
View File

@@ -0,0 +1,389 @@
#include "commit_pipeline.hpp"
#include <cstring>
#include <pthread.h>
#include <unordered_set>
#include "commit_request.hpp"
#include "cpu_work.hpp"
#include "format.hpp"
#include "metric.hpp"
#include "pipeline_entry.hpp"
// Metric for banned request IDs memory usage.
// File-local (anonymous namespace => internal linkage): only the sequence
// stage in this translation unit updates it, so it should not be visible to
// other TUs.
namespace {
auto banned_request_ids_memory_gauge =
    metric::create_gauge("weaseldb_banned_request_ids_memory_bytes",
                         "Memory used by banned request IDs arena")
        .create({});
} // namespace
// Construct the 4-stage pipeline and spawn all stage threads.
// Topology: 1 sequence thread, 1 resolve thread, 1 persist thread, and a
// configurable number of release threads, all sharing one ring buffer.
CommitPipeline::CommitPipeline(const weaseldb::Config &config)
    : config_(config),
      pipeline_(config.commit.pipeline_wait_strategy,
                {1, 1, 1, config.commit.pipeline_release_threads}, lg_size) {
  // Stage 0: serial version/sequence assignment.
  sequence_thread_ = std::thread{[this] {
    pthread_setname_np(pthread_self(), "txn-sequence");
    run_sequence_stage();
  }};
  // Stage 1: precondition resolution.
  resolve_thread_ = std::thread{[this] {
    pthread_setname_np(pthread_self(), "txn-resolve");
    run_resolve_stage();
  }};
  // Stage 2: transaction persistence.
  persist_thread_ = std::thread{[this] {
    pthread_setname_np(pthread_self(), "txn-persist");
    run_persist_stage();
  }};
  // Stage 3: release threads that hand responses back to connections.
  const int n_release = config.commit.pipeline_release_threads;
  release_threads_.reserve(n_release);
  for (int idx = 0; idx < n_release; ++idx) {
    release_threads_.emplace_back([this, idx] {
      char label[16]; // pthread thread names are limited to 16 bytes
      std::snprintf(label, sizeof(label), "txn-release-%d", idx);
      pthread_setname_np(pthread_self(), label);
      run_release_stage(idx);
    });
  }
}
// Shut the pipeline down: publish one ShutdownEntry per release thread in a
// single batch (so the signals travel through every stage together), then
// join all stage threads.
CommitPipeline::~CommitPipeline() {
  {
    const int n_shutdowns = static_cast<int>(release_threads_.size());
    auto guard = pipeline_.push(n_shutdowns, /*block=*/true);
    for (int slot = 0; slot < n_shutdowns; ++slot) {
      guard.batch[slot] = ShutdownEntry{};
    }
    // guard destructor publishes the batch
  }
  // Wait for every stage thread to observe shutdown and exit.
  sequence_thread_.join();
  resolve_thread_.join();
  persist_thread_.join();
  for (auto &release_thread : release_threads_) {
    release_thread.join();
  }
}
// Submit a batch of entries to stage 0. Thread-safe for concurrent
// producers; may block while the ring buffer is at capacity (backpressure).
void CommitPipeline::submit_batch(std::span<PipelineEntry> entries) {
  if (entries.empty()) {
    return;
  }
  // Reserve exactly entries.size() contiguous slots in the pipeline.
  auto guard = pipeline_.push(entries.size(), /*block=*/true);
  // Transfer each entry into its slot.
  auto dst = guard.batch.begin();
  for (auto &entry : entries) {
    *dst++ = std::move(entry);
  }
  // The batch becomes visible to stage 0 when guard goes out of scope.
}
// AVOID BLOCKING IN THIS STAGE!
// Stage 0 is strictly serial: any blocking here stalls every downstream
// stage of the pipeline.
void CommitPipeline::run_sequence_stage() {
  // Versions are handed out sequentially starting at 1.
  int64_t next_version = 1;
  // Request ID deduplication (sequence stage only).
  // The arena and the set are owned exclusively by this thread, so no
  // synchronization is needed. The set's string_view keys point at bytes
  // owned by banned_request_arena.
  Arena banned_request_arena;
  using BannedRequestIdSet =
      std::unordered_set<std::string_view, std::hash<std::string_view>,
                         std::equal_to<std::string_view>,
                         ArenaStlAllocator<std::string_view>>;
  BannedRequestIdSet banned_request_ids{
      ArenaStlAllocator<std::string_view>(&banned_request_arena)};
  // The destructor pushes one ShutdownEntry per release thread; this stage
  // must observe all of them before exiting.
  int expected_shutdowns = config_.commit.pipeline_release_threads;
  for (int shutdowns_received = 0; shutdowns_received < expected_shutdowns;) {
    auto guard = pipeline_.acquire(0, 0);
    auto &batch = guard.batch;
    // Stage 0: Sequence assignment
    // This stage performs ONLY work that requires serial processing:
    // - Version/sequence number assignment (must be sequential)
    // - Request ID banned list management
    for (auto &entry : batch) {
      // Pattern match on pipeline entry variant
      std::visit(
          [&](auto &&e) {
            using T = std::decay_t<decltype(e)>;
            if constexpr (std::is_same_v<T, ShutdownEntry>) {
              ++shutdowns_received;
            } else if constexpr (std::is_same_v<T, CommitEntry>) {
              // Process commit entry: check banned list, assign version
              auto &commit_entry = e;
              assert(commit_entry.commit_request);
              // Check if request_id is banned (for status queries)
              // Only check CommitRequest request_id, not HTTP header
              if (commit_entry.commit_request &&
                  commit_entry.commit_request->request_id().has_value()) {
                auto commit_request_id =
                    commit_entry.commit_request->request_id().value();
                if (banned_request_ids.contains(commit_request_id)) {
                  // Request ID is banned: record the error response and
                  // skip version assignment for this entry.
                  commit_entry.response_json =
                      R"({"status": "not_committed", "error": "request_id_banned"})";
                  return;
                }
              }
              // Assign sequential version number
              commit_entry.assigned_version = next_version++;
            } else if constexpr (std::is_same_v<T, StatusEntry>) {
              // Process status entry: add request_id to banned list, get
              // version upper bound
              auto &status_entry = e;
              // Copy the request_id into the arena so the string_view key
              // outlives the request that carried it.
              std::string_view request_id_view =
                  banned_request_arena.copy_string(
                      status_entry.status_request_id);
              banned_request_ids.insert(request_id_view);
              // Update memory usage metric
              banned_request_ids_memory_gauge.set(
                  banned_request_arena.total_allocated());
              // Set version upper bound to current highest assigned version
              status_entry.version_upper_bound = next_version - 1;
            } else if constexpr (std::is_same_v<T, HealthCheckEntry>) {
              // Process health check entry: noop in sequence stage
            }
          },
          entry);
    }
  }
}
// AVOID BLOCKING IN THIS STAGE!
// Stage 1: precondition resolution. Runs serialized so every transaction
// sees a consistent view of database state when validating preconditions.
void CommitPipeline::run_resolve_stage() {
  // One ShutdownEntry per release thread flows through this stage.
  int expected_shutdowns = config_.commit.pipeline_release_threads;
  int shutdowns_received = 0;
  while (shutdowns_received < expected_shutdowns) {
    auto guard = pipeline_.acquire(1, 0, /*maxBatch*/ 1);
    for (auto &entry : guard.batch) {
      std::visit(
          [&](auto &&e) {
            using T = std::decay_t<decltype(e)>;
            if constexpr (std::is_same_v<T, ShutdownEntry>) {
              ++shutdowns_received;
            } else if constexpr (std::is_same_v<T, CommitEntry>) {
              // Simplified resolution: every commit passes precondition
              // checks.
              e.resolve_success = true;
            } else if constexpr (std::is_same_v<T, StatusEntry>) {
              // Status entries finished their work in the sequence stage;
              // nothing to do here.
            } else if constexpr (std::is_same_v<T, HealthCheckEntry>) {
              // Burn a configurable amount of CPU so benchmarks can model
              // resolve-stage cost.
              spend_cpu_cycles(config_.benchmark.ok_resolve_iterations);
            }
          },
          entry);
    }
  }
}
// Stage 2: transaction persistence and response generation.
// Simplified implementation: everything becomes durable immediately. A real
// implementation would batch S3 writes, notify subscribers, etc.
void CommitPipeline::run_persist_stage() {
  // One ShutdownEntry per release thread flows through this stage.
  int expected_shutdowns = config_.commit.pipeline_release_threads;
  for (int shutdowns_received = 0; shutdowns_received < expected_shutdowns;) {
    auto guard = pipeline_.acquire(2, 0);
    auto &batch = guard.batch;
    for (auto &entry : batch) {
      // Pattern match on pipeline entry variant
      std::visit(
          [&](auto &&e) {
            using T = std::decay_t<decltype(e)>;
            if constexpr (std::is_same_v<T, ShutdownEntry>) {
              ++shutdowns_received;
            } else if constexpr (std::is_same_v<T, CommitEntry>) {
              auto &commit_entry = e;
              // Skip entries whose request is gone or whose preconditions
              // failed in the resolve stage.
              if (!commit_entry.commit_request ||
                  !commit_entry.resolve_success) {
                return;
              }
              // Entries rejected in the sequence stage (banned request_id)
              // already carry an error response and were never assigned a
              // version; do not overwrite that response with a success one.
              if (!commit_entry.response_json.empty()) {
                return;
              }
              // Mark as persisted and advance the committed-version high
              // water mark (this thread is the only writer).
              commit_entry.persist_success = true;
              committed_version_.store(commit_entry.assigned_version,
                                       std::memory_order_seq_cst);
              const CommitRequest &commit_request =
                  *commit_entry.commit_request;
              // format() allocates out of the entry's own arena, so the
              // resulting string_view stays valid through the release
              // stage - no extra copy is needed.
              if (commit_request.request_id().has_value()) {
                commit_entry.response_json = format(
                    commit_entry.request_arena,
                    R"({"request_id":"%.*s","status":"committed","version":%ld,"leader_id":"leader123"})",
                    static_cast<int>(
                        commit_request.request_id().value().size()),
                    commit_request.request_id().value().data(),
                    commit_entry.assigned_version);
              } else {
                commit_entry.response_json = format(
                    commit_entry.request_arena,
                    R"({"status":"committed","version":%ld,"leader_id":"leader123"})",
                    commit_entry.assigned_version);
              }
            } else if constexpr (std::is_same_v<T, StatusEntry>) {
              // Status lookups report not_committed (static string, no
              // arena allocation needed).
              e.response_json = R"({"status": "not_committed"})";
            } else if constexpr (std::is_same_v<T, HealthCheckEntry>) {
              // Plain-text "OK" body; protocol framing happens later.
              e.response_json = "OK";
            } else if constexpr (std::is_same_v<T, GetVersionEntry>) {
              // TODO validate we're still the leader at some version > the
              // proposed version for external consistency.
              // TODO include leader in response
              e.response_json =
                  format(e.request_arena, R"({"version":%ld,"leader":""})",
                         e.version);
            }
          },
          entry);
    }
  }
}
// Stage 3: hand responses back to their connections.
// Exactly one ShutdownEntry reaches each release thread: the destructor
// pushes pipeline_release_threads adjacent ShutdownEntries in one batch, and
// the index-modulo partition below routes one to each thread.
void CommitPipeline::run_release_stage(int thread_index) {
  for (int shutdowns_received = 0; shutdowns_received < 1;) {
    auto guard = pipeline_.acquire(3, thread_index);
    auto &batch = guard.batch;
    for (auto it = batch.begin(); it != batch.end(); ++it) {
      // Partition work across release threads by slot index: thread k
      // handles entries whose index % pipeline_release_threads == k.
      if (static_cast<int>(it.index() %
                           config_.commit.pipeline_release_threads) !=
          thread_index) {
        continue;
      }
      std::visit(
          [&](auto &&e) {
            using T = std::decay_t<decltype(e)>;
            if constexpr (std::is_same_v<T, ShutdownEntry>) {
              ++shutdowns_received;
            } else if constexpr (std::is_same_v<T, CommitEntry> ||
                                 std::is_same_v<T, StatusEntry> ||
                                 std::is_same_v<T, HealthCheckEntry> ||
                                 std::is_same_v<T, GetVersionEntry>) {
              // All response-carrying entry types share the same release
              // protocol (connection, protocol_context, response_json,
              // request_arena), so handle them uniformly.
              auto conn_ref = e.connection.lock();
              if (!conn_ref) {
                // Connection died while the request was in flight; drop
                // the response silently.
                return;
              }
              // Protocol-agnostic hand-off; HTTP formatting happens later
              // in on_preprocess_writes(). The request arena moves with the
              // response so its memory outlives transmission.
              conn_ref->send_response(e.protocol_context, e.response_json,
                                      std::move(e.request_arena));
            }
          },
          *it);
    }
  }
}

120
src/commit_pipeline.hpp Normal file
View File

@@ -0,0 +1,120 @@
#pragma once
#include <atomic>
#include <span>
#include <thread>
#include "config.hpp"
#include "pipeline_entry.hpp"
#include "thread_pipeline.hpp"
/**
 * High-performance 4-stage commit processing pipeline.
 *
 * Provides protocol-agnostic transaction processing through a lock-free
 * multi-stage pipeline optimized for high throughput and low latency.
 *
 * Pipeline Stages:
 * 1. Sequence: Version assignment and request ID deduplication
 * 2. Resolve: Precondition validation and conflict detection
 * 3. Persist: Transaction durability and response generation
 * 4. Release: Connection return and response transmission
 *
 * Thread Safety:
 * - submit_batch() is thread-safe for concurrent producers
 * - Internal pipeline uses lock-free algorithms
 * - Stages 1-3 each run on one dedicated thread; the release stage runs
 *   on a configurable number of threads
 *
 * Usage:
 * ```cpp
 * CommitPipeline pipeline(config);
 *
 * // Build pipeline entries
 * std::vector<PipelineEntry> entries;
 * entries.emplace_back(CommitEntry(connection, context, request, arena));
 *
 * // Submit for processing
 * pipeline.submit_batch(entries);
 * ```
 */
struct CommitPipeline {
  /**
   * Create pipeline with 4 processing stages.
   *
   * Spawns one thread each for the sequence, resolve, and persist stages,
   * plus config.commit.pipeline_release_threads release threads.
   *
   * @param config Server configuration for pipeline tuning
   */
  explicit CommitPipeline(const weaseldb::Config &config);
  /**
   * Destructor ensures clean shutdown and thread join.
   * Sends shutdown signal through pipeline and waits for all stages to
   * complete.
   */
  ~CommitPipeline();
  /**
   * Submit batch of pipeline entries for processing.
   *
   * Thread-safe method for submitting work to the pipeline. Entries flow
   * through all 4 stages in order with proper synchronization.
   *
   * @param entries Span of pipeline entries to process
   *
   * Entry types:
   * - CommitEntry: Full transaction processing through all stages
   * - StatusEntry: Request status lookup with sequence stage processing
   * - HealthCheckEntry: Health check with configurable CPU work
   * - GetVersionEntry: Committed-version query answered in the persist
   *   stage
   * - ShutdownEntry: Coordinated pipeline shutdown signal
   *
   * @note Thread Safety: Safe for concurrent calls from multiple threads
   * @note Performance: Batching reduces pipeline contention - prefer larger
   * batches
   * @note Blocking: May block if pipeline is at capacity (backpressure)
   */
  void submit_batch(std::span<PipelineEntry> entries);
  /**
   * Get the highest committed version number.
   *
   * @return Current committed version (persist thread writes, other threads
   * read)
   * @note Thread Safety: Safe to read from any thread
   */
  int64_t get_committed_version() const {
    return committed_version_.load(std::memory_order_seq_cst);
  }

private:
  // Configuration reference; must outlive the pipeline.
  const weaseldb::Config &config_;
  // Pipeline state (persist thread writes, other threads read)
  std::atomic<int64_t> committed_version_{0}; // Highest committed version
  // Lock-free pipeline configuration
  static constexpr int lg_size = 16; // Ring buffer size (2^16 slots)
  // 4-stage pipeline: sequence -> resolve -> persist -> release
  ThreadPipeline<PipelineEntry> pipeline_;
  // Stage processing threads (one per stage; release may have several)
  std::thread sequence_thread_;
  std::thread resolve_thread_;
  std::thread persist_thread_;
  std::vector<std::thread> release_threads_;
  // Pipeline stage main loops
  void run_sequence_stage();
  void run_resolve_stage();
  void run_persist_stage();
  void run_release_stage(int thread_index);
  // Pipeline batch type alias
  using BatchType = ThreadPipeline<PipelineEntry>::Batch;
  // Make non-copyable and non-movable (threads capture `this`)
  CommitPipeline(const CommitPipeline &) = delete;
  CommitPipeline &operator=(const CommitPipeline &) = delete;
  CommitPipeline(CommitPipeline &&) = delete;
  CommitPipeline &operator=(CommitPipeline &&) = delete;
};

View File

@@ -244,14 +244,7 @@ public:
* @return String view pointing to arena-allocated memory * @return String view pointing to arena-allocated memory
*/ */
std::string_view copy_to_arena(std::string_view str) { std::string_view copy_to_arena(std::string_view str) {
if (str.empty()) { return arena_.copy_string(str);
return {};
}
char *arena_str = arena_.allocate<char>(str.size());
std::memcpy(arena_str, str.data(), str.size());
return std::string_view(arena_str, str.size());
} }
/** /**

View File

@@ -1,5 +1,4 @@
#include "config.hpp" #include "config.hpp"
#include <fstream>
#include <iostream> #include <iostream>
#include <toml.hpp> #include <toml.hpp>
@@ -109,22 +108,13 @@ void ConfigParser::parse_server_config(const auto &toml_data,
parse_field(srv, "max_request_size_bytes", config.max_request_size_bytes); parse_field(srv, "max_request_size_bytes", config.max_request_size_bytes);
parse_field(srv, "io_threads", config.io_threads); parse_field(srv, "io_threads", config.io_threads);
// Set epoll_instances default to io_threads if not explicitly configured // epoll_instances removed - now 1:1 with io_threads
bool epoll_instances_specified = srv.contains("epoll_instances");
if (!epoll_instances_specified) {
config.epoll_instances = config.io_threads;
} else {
parse_field(srv, "epoll_instances", config.epoll_instances);
}
parse_field(srv, "event_batch_size", config.event_batch_size); parse_field(srv, "event_batch_size", config.event_batch_size);
parse_field(srv, "max_connections", config.max_connections); parse_field(srv, "max_connections", config.max_connections);
parse_field(srv, "read_buffer_size", config.read_buffer_size); parse_field(srv, "read_buffer_size", config.read_buffer_size);
// Clamp epoll_instances to not exceed io_threads // epoll_instances validation removed - now always equals io_threads
if (config.epoll_instances > config.io_threads) {
config.epoll_instances = config.io_threads;
}
}); });
} }
@@ -136,6 +126,25 @@ void ConfigParser::parse_commit_config(const auto &toml_data,
config.request_id_retention_hours); config.request_id_retention_hours);
parse_field(commit, "request_id_retention_versions", parse_field(commit, "request_id_retention_versions",
config.request_id_retention_versions); config.request_id_retention_versions);
// Parse wait strategy
if (commit.contains("pipeline_wait_strategy")) {
std::string strategy_str =
toml::get<std::string>(commit.at("pipeline_wait_strategy"));
if (strategy_str == "WaitIfStageEmpty") {
config.pipeline_wait_strategy = WaitStrategy::WaitIfStageEmpty;
} else if (strategy_str == "WaitIfUpstreamIdle") {
config.pipeline_wait_strategy = WaitStrategy::WaitIfUpstreamIdle;
} else if (strategy_str == "Never") {
config.pipeline_wait_strategy = WaitStrategy::Never;
} else {
std::cerr << "Warning: Unknown pipeline_wait_strategy '" << strategy_str
<< "', using default (WaitIfUpstreamIdle)" << std::endl;
}
}
parse_field(commit, "pipeline_release_threads",
config.pipeline_release_threads);
}); });
} }
@@ -213,15 +222,7 @@ bool ConfigParser::validate_config(const Config &config) {
valid = false; valid = false;
} }
if (config.server.epoll_instances < 1 || // epoll_instances validation removed - now always 1:1 with io_threads
config.server.epoll_instances > config.server.io_threads) {
std::cerr
<< "Configuration error: server.epoll_instances must be between 1 "
"and io_threads ("
<< config.server.io_threads << "), got "
<< config.server.epoll_instances << std::endl;
valid = false;
}
if (config.server.event_batch_size < 1 || if (config.server.event_batch_size < 1 ||
config.server.event_batch_size > 10000) { config.server.event_batch_size > 10000) {
@@ -271,6 +272,14 @@ bool ConfigParser::validate_config(const Config &config) {
valid = false; valid = false;
} }
if (config.commit.pipeline_release_threads < 1 ||
config.commit.pipeline_release_threads > 64) {
std::cerr << "Configuration error: commit.pipeline_release_threads must be "
"between 1 and 64, got "
<< config.commit.pipeline_release_threads << std::endl;
valid = false;
}
// Validate subscription configuration // Validate subscription configuration
if (config.subscription.max_buffer_size_bytes == 0) { if (config.subscription.max_buffer_size_bytes == 0) {
std::cerr << "Configuration error: subscription.max_buffer_size_bytes must " std::cerr << "Configuration error: subscription.max_buffer_size_bytes must "

View File

@@ -5,6 +5,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "thread_pipeline.hpp"
namespace weaseldb { namespace weaseldb {
/** /**
@@ -40,10 +42,8 @@ struct ServerConfig {
/// Maximum size in bytes for incoming HTTP requests (default: 1MB) /// Maximum size in bytes for incoming HTTP requests (default: 1MB)
int64_t max_request_size_bytes = 1024 * 1024; int64_t max_request_size_bytes = 1024 * 1024;
/// Number of I/O threads for handling connections and network events /// Number of I/O threads for handling connections and network events
/// Each I/O thread gets its own dedicated epoll instance
int io_threads = 1; int io_threads = 1;
/// Number of epoll instances to reduce epoll_ctl contention (default:
/// io_threads, max: io_threads)
int epoll_instances = 1;
/// Event batch size for epoll processing /// Event batch size for epoll processing
int event_batch_size = 32; int event_batch_size = 32;
/// Maximum number of concurrent connections (0 = unlimited) /// Maximum number of concurrent connections (0 = unlimited)
@@ -62,6 +62,16 @@ struct CommitConfig {
std::chrono::hours request_id_retention_hours{24}; std::chrono::hours request_id_retention_hours{24};
/// Minimum number of commit versions to retain request IDs for /// Minimum number of commit versions to retain request IDs for
int64_t request_id_retention_versions = 100000000; int64_t request_id_retention_versions = 100000000;
/// Wait strategy for the commit pipeline
/// - WaitIfStageEmpty: Block when individual stages are empty (default, safe
/// for shared CPUs)
/// - WaitIfUpstreamIdle: Block only when all upstream stages are idle
/// (requires dedicated cores)
/// - Never: Never block, busy-wait continuously (requires dedicated cores)
WaitStrategy pipeline_wait_strategy = WaitStrategy::WaitIfUpstreamIdle;
/// Number of threads in the release stage (final stage of commit pipeline)
/// Default: 1 thread for simplicity (can increase for higher throughput)
int pipeline_release_threads = 1;
}; };
/** /**

View File

@@ -4,9 +4,10 @@
#include <climits> #include <climits>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <sys/epoll.h>
#include "metric.hpp" #include "metric.hpp"
#include "server.hpp" // Need this for release_back_to_server implementation #include "server.hpp" // Need this for server reference
namespace { namespace {
// Thread-local metric instances // Thread-local metric instances
@@ -35,15 +36,16 @@ thread_local auto write_eagain_failures =
// Static thread-local storage for iovec buffer // Static thread-local storage for iovec buffer
static thread_local std::vector<struct iovec> g_iovec_buffer{IOV_MAX}; static thread_local std::vector<struct iovec> g_iovec_buffer{IOV_MAX};
// Thread-local storage for arenas to be freed after unlocking
static thread_local std::vector<Arena> g_arenas_to_free;
Connection::Connection(struct sockaddr_storage addr, int fd, int64_t id, Connection::Connection(struct sockaddr_storage addr, int fd, int64_t id,
size_t epoll_index, ConnectionHandler *handler, size_t epoll_index, ConnectionHandler *handler,
WeakRef<Server> server) WeakRef<Server> server)
: fd_(fd), id_(id), epoll_index_(epoll_index), addr_(addr), arena_(), : id_(id), epoll_index_(epoll_index), addr_(addr), handler_(handler),
handler_(handler), server_(std::move(server)) { server_(std::move(server)), fd_(fd) {
auto server_ref = server_.lock(); auto server_ref = server_.lock();
// This should only be called from a member of Server itself, so I should // Should only be called from the io thread
// hope it's alive.
assert(server_ref); assert(server_ref);
server_ref->active_connections_.fetch_add(1, std::memory_order_relaxed); server_ref->active_connections_.fetch_add(1, std::memory_order_relaxed);
@@ -56,34 +58,98 @@ Connection::Connection(struct sockaddr_storage addr, int fd, int64_t id,
} }
Connection::~Connection() { Connection::~Connection() {
if (handler_) { handler_->on_connection_closed(*this);
handler_->on_connection_closed(*this); if (fd_ >= 0) {
} int e = ::close(fd_);
// Server may legitimately be gone now if (e == -1 && errno != EINTR) {
if (auto server_ptr = server_.lock()) { perror("close");
server_ptr->active_connections_.fetch_sub(1, std::memory_order_relaxed); std::abort();
}
// EINTR ignored - fd is guaranteed closed on Linux
} }
}
// Decrement active connections gauge void Connection::close() {
connections_active.dec(); std::lock_guard lock{mutex_};
auto server_ptr = server_.lock();
int e = close(fd_); // Should only be called from the io thread
assert(server_ptr);
server_ptr->active_connections_.fetch_sub(1, std::memory_order_relaxed);
assert(fd_ >= 0);
int e = ::close(fd_);
if (e == -1 && errno != EINTR) { if (e == -1 && errno != EINTR) {
perror("close"); perror("close");
std::abort(); std::abort();
} }
// EINTR ignored - fd is guaranteed closed on Linux // EINTR ignored - fd is guaranteed closed on Linux
fd_ = -1;
// Decrement active connections gauge
connections_active.dec();
} }
void Connection::append_message(std::string_view s, bool copy_to_arena) { // Called from I/O thread only
if (copy_to_arena) { void Connection::append_bytes(std::span<std::string_view> data_parts,
char *arena_str = arena_.allocate<char>(s.size()); Arena arena, ConnectionShutdown shutdown_mode) {
std::memcpy(arena_str, s.data(), s.size()); // Prevent queueing messages after shutdown has been requested
messages_.emplace_back(arena_str, s.size()); if (shutdown_requested_ != ConnectionShutdown::None) {
} else { return;
messages_.push_back(s); }
// Check if queue was empty to determine if we need to enable EPOLLOUT
bool was_empty = message_queue_.empty();
// Set shutdown mode if requested
if (shutdown_mode != ConnectionShutdown::None) {
shutdown_requested_ = shutdown_mode;
}
// Add message to queue
// TODO this allocates while holding the connection lock
message_queue_.emplace_back(Message{std::move(arena), data_parts});
// If queue was empty, we need to add EPOLLOUT interest.
if (was_empty) {
auto server = server_.lock();
if (fd_ >= 0 && server) {
// Add EPOLLOUT interest - pipeline thread manages epoll
struct epoll_event event;
event.data.fd = fd_;
event.events = EPOLLIN | EPOLLOUT;
tsan_release();
// I think we have to call epoll_ctl while holding mutex_. Otherwise a
// call that clears the write interest could get reordered with one that
// sets it and we would hang.
epoll_ctl(server->epoll_fds_[epoll_index_], EPOLL_CTL_MOD, fd_, &event);
}
}
}
// May be called from a foreign thread!
void Connection::send_response(void *protocol_context,
std::string_view response_json, Arena arena) {
std::unique_lock lock(mutex_);
// Prevent queueing responses after shutdown has been requested
if (shutdown_requested_ != ConnectionShutdown::None) {
return;
}
// Store response in queue for protocol handler processing
pending_response_queue_.emplace_back(
PendingResponse{protocol_context, response_json, std::move(arena)});
// Trigger epoll interest if this is the first pending response
if (pending_response_queue_.size() == 1) {
auto server = server_.lock();
if (fd_ >= 0 && server) {
// Add EPOLLOUT interest to trigger on_preprocess_writes
struct epoll_event event;
event.data.fd = fd_;
event.events = EPOLLIN | EPOLLOUT;
tsan_release();
epoll_ctl(server->epoll_fds_[epoll_index_], EPOLL_CTL_MOD, fd_, &event);
}
} }
outgoing_bytes_queued_ += s.size();
} }
int Connection::readBytes(char *buf, size_t buffer_size) { int Connection::readBytes(char *buf, size_t buffer_size) {
@@ -105,36 +171,61 @@ int Connection::readBytes(char *buf, size_t buffer_size) {
} }
// Increment bytes read metric // Increment bytes read metric
if (r > 0) { assert(r > 0);
bytes_read.inc(r); bytes_read.inc(r);
}
return r; return r;
} }
} }
bool Connection::writeBytes() { uint32_t Connection::write_bytes() {
ssize_t total_bytes_written = 0; ssize_t total_bytes_written = 0;
while (!messages_.empty()) {
// Build iovec array up to IOV_MAX limit using thread-local vector uint32_t result = 0;
assert(g_iovec_buffer.size() == IOV_MAX);
struct iovec *iov = g_iovec_buffer.data(); while (true) {
// Build iovec array while holding mutex using thread-local buffer
int iov_count = 0; int iov_count = 0;
{
std::lock_guard lock(mutex_);
for (auto it = messages_.begin(); if (message_queue_.empty()) {
it != messages_.end() && iov_count < IOV_MAX; ++it) { break;
const auto &msg = *it; }
iov[iov_count] = {
const_cast<void *>(static_cast<const void *>(msg.data())),
msg.size()};
iov_count++;
}
assert(iov_count > 0); // Build iovec array up to IOV_MAX limit using thread-local vector
assert(g_iovec_buffer.size() == IOV_MAX);
struct iovec *iov = g_iovec_buffer.data();
for (auto &message : message_queue_) {
if (iov_count >= IOV_MAX)
break;
for (const auto &part : message.data_parts) {
if (iov_count >= IOV_MAX)
break;
if (part.empty())
continue;
iov[iov_count] = {
const_cast<void *>(static_cast<const void *>(part.data())),
part.size()};
iov_count++;
}
}
if (iov_count == 0)
break;
} // Release mutex during I/O
// Perform I/O without holding mutex
ssize_t w; ssize_t w;
for (;;) { for (;;) {
w = writev(fd_, iov, iov_count); struct msghdr msg = {};
msg.msg_iov = g_iovec_buffer.data();
msg.msg_iovlen = iov_count;
w = sendmsg(fd_, &msg, MSG_NOSIGNAL);
if (w == -1) { if (w == -1) {
if (errno == EINTR) { if (errno == EINTR) {
continue; // Standard practice: retry on signal interruption continue; // Standard practice: retry on signal interruption
@@ -142,45 +233,87 @@ bool Connection::writeBytes() {
if (errno == EAGAIN) { if (errno == EAGAIN) {
// Increment EAGAIN failure metric // Increment EAGAIN failure metric
write_eagain_failures.inc(); write_eagain_failures.inc();
// Increment bytes written metric before returning bytes_written.inc(total_bytes_written);
if (total_bytes_written > 0) { return result;
bytes_written.inc(total_bytes_written);
}
return false;
} }
perror("writev"); perror("sendmsg");
return true; result |= Error;
return result;
} }
break; break;
} }
result |= Progress;
assert(w > 0); assert(w > 0);
total_bytes_written += w; total_bytes_written += w;
// Handle partial writes by updating string_view data/size // Handle partial writes by updating message data_parts
size_t bytes_written = static_cast<size_t>(w); {
outgoing_bytes_queued_ -= bytes_written; std::lock_guard lock(mutex_);
while (bytes_written > 0 && !messages_.empty()) { size_t bytes_remaining = static_cast<size_t>(w);
auto &front = messages_.front();
if (bytes_written >= front.size()) { while (bytes_remaining > 0 && !message_queue_.empty()) {
// This message is completely written auto &front_message = message_queue_.front();
bytes_written -= front.size();
messages_.pop_front(); for (auto &part : front_message.data_parts) {
} else { if (part.empty())
// Partial write of this message - update string_view continue;
front = std::string_view(front.data() + bytes_written,
front.size() - bytes_written); if (bytes_remaining >= part.size()) {
bytes_written = 0; // This part is completely written
bytes_remaining -= part.size();
part = std::string_view(); // Mark as consumed
} else {
// Partial write of this part
part = std::string_view(part.data() + bytes_remaining,
part.size() - bytes_remaining);
bytes_remaining = 0;
break;
}
}
// Move arena to thread-local vector for deferred cleanup
g_arenas_to_free.emplace_back(std::move(front_message.arena));
message_queue_.pop_front();
if (result & Close) {
break;
}
} }
} }
} }
assert(messages_.empty());
// Increment bytes written metric // Check if queue is empty and remove EPOLLOUT interest
if (total_bytes_written > 0) { {
bytes_written.inc(total_bytes_written); std::lock_guard lock(mutex_);
if (message_queue_.empty() && pending_response_queue_.empty()) {
auto server = server_.lock();
if (server) {
struct epoll_event event;
event.data.fd = fd_;
event.events = EPOLLIN; // Remove EPOLLOUT
tsan_release();
// I think we have to call epoll_ctl while holding mutex_. Otherwise a
// call that clears the write interest could get reordered with one that
// sets it and we would hang.
epoll_ctl(server->epoll_fds_[epoll_index_], EPOLL_CTL_MOD, fd_, &event);
}
// Handle shutdown modes after all messages are sent
if (shutdown_requested_ == ConnectionShutdown::WriteOnly) {
// Shutdown write side but keep connection alive for reading
shutdown(fd_, SHUT_WR);
} else if (shutdown_requested_ == ConnectionShutdown::Full) {
result |= Close;
}
}
} }
return false; // Increment bytes written metric
bytes_written.inc(total_bytes_written);
// Clean up arenas after all mutex operations are complete
// This avoids holding the connection mutex while calling free()
g_arenas_to_free.clear();
return result;
} }

View File

@@ -3,6 +3,8 @@
#include <cassert> #include <cassert>
#include <cstring> #include <cstring>
#include <deque> #include <deque>
#include <mutex>
#include <span>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/uio.h> #include <sys/uio.h>
#include <unistd.h> #include <unistd.h>
@@ -15,34 +17,76 @@
#define __has_feature(x) 0 #define __has_feature(x) 0
#endif #endif
/**
* Represents a single client connection with efficient memory management.
*
* Connection ownership model:
* - Created by I/O thread, processed immediately, then transferred to epoll via
* raw pointer
* - I/O threads claim ownership by wrapping raw pointer in unique_ptr
* - I/O thread optionally passes ownership to a thread pipeline
* - Owner eventually transfers back to epoll by releasing unique_ptr to raw
* pointer
* - RAII cleanup happens if I/O thread doesn't transfer back
*
* Arena allocator thread safety:
* Each Connection contains its own Arena instance that is accessed
* exclusively by the thread that currently owns the connection. This ensures
* thread safety without requiring locks:
* - Arena is used by the owning thread for I/O buffers, request parsing, and
* response generation
* - Arena memory is automatically freed when the connection is destroyed
* - reset() should only be called by the current owner thread
*
* Only the handler interface methods are public - all networking details are
* private.
*/
// Forward declaration // Forward declaration
struct Server; struct Server;
struct Connection { /**
* Shutdown modes for connection termination.
*/
enum class ConnectionShutdown {
None, // Normal operation - no shutdown requested
WriteOnly, // shutdown(SHUT_WR) after sending queued data
Full // close() after sending queued data
};
/**
* Base interface for sending messages to a connection.
* This restricted interface is safe for use by pipeline threads,
* containing only the append_message method needed for responses.
* Pipeline threads should use WeakRef<MessageSender> to safely
* send responses without accessing other connection functionality
* that should only be used by the I/O thread.
*/
struct MessageSender {
/**
* @brief Send response with protocol-specific context for ordering.
*
* Thread-safe method for pipeline threads to send responses back to clients.
* Delegates to the connection's protocol handler for ordering logic.
* The protocol handler may queue the response or send it immediately.
*
* @param protocol_context Arena-allocated protocol-specific context
* @param data Response data parts (may be empty for deferred serialization)
* @param arena Arena containing response data and context
*
* Example usage:
* ```cpp
* auto* ctx = arena.allocate<HttpResponseContext>();
* ctx->sequence_id = 42;
* auto response_data = format_response(arena);
* conn.send_response(ctx, response_data, std::move(arena));
* ```
*/
virtual void send_response(void *protocol_context,
std::string_view response_json, Arena arena) = 0;
virtual ~MessageSender() = default;
};
/**
* Represents a single client connection - the full interface available to the
* io thread and connection handler.
*
* Connection ownership model:
* - Server owns all connections
* - Handlers receive Connection& references, and can keep a WeakRef to
* MessageSender for async responses.
* - Multiple pipeline threads can safely access the MessageSender concurrently
* - I/O thread has exclusive access to socket operations
*
* Threading model:
* - Single mutex protects state shared with pipeline threads
* - Pipeline threads call Connection methods (append_message, etc.)
* - I/O thread processes socket events and message queue
* - Pipeline threads register epoll write interest via append_message
* - Connection tracks closed state to prevent EBADF errors
*
* Arena allocator usage:
* - Request-scoped arenas created by handlers for each request
* - No connection-owned arena for parsing/response generation
* - Message queue stores spans + owning arenas until I/O completion
*/
struct Connection : MessageSender {
// No public constructor or factory method - only Server can create // No public constructor or factory method - only Server can create
// connections // connections
@@ -64,90 +108,72 @@ struct Connection {
// Handler interface - public methods that handlers can use // Handler interface - public methods that handlers can use
/** /**
* @brief Queue a message to be sent to the client. * @brief Queue an atomic message to be sent to the client.
* *
* Adds data to the connection's outgoing message queue. The data will be sent * Adds a complete message with all associated data to the connection's
* asynchronously by the server's I/O threads using efficient vectored * outgoing byte queue with guaranteed ordering.
* I/O.
* *
* @param s The data to send (string view parameter for efficiency) * I/O thread only method for protocol handlers to queue bytes for sending.
* @param copy_to_arena If true (default), copies data to the connection's * Bytes are queued in order and sent using efficient vectored I/O.
* arena for safe storage. If false, the caller must ensure the data remains
* valid until all queued messages are sent.
* *
* @warning Thread Safety: Only call from the thread that currently owns this * @param data_parts Span of string_views pointing to arena-allocated data
* connection. The arena allocator is not thread-safe. * @param arena Arena that owns all the memory referenced by data_parts
* @param shutdown_mode Shutdown mode to apply after sending all queued data
* *
* @note Performance: Use copy_to_arena=false for static strings or data with * @note Thread Safety: Must be called from I/O thread only.
* guaranteed lifetime, copy_to_arena=true for temporary/dynamic data. * @note Ordering: Bytes are sent in the order calls are made.
* @note The memory referenced by the data_parts span, must outlive @p arena.
* @note Shutdown Request: To request connection shutdown without sending
* data, pass empty data_parts span with desired shutdown_mode. This ensures
* all previously queued messages are sent before shutdown.
*
* Example usage (from ConnectionHandler::on_preprocess_writes):
* ```cpp
* Arena arena;
* auto parts = arena.allocate_span<std::string_view>(2);
* parts[0] = build_header(arena);
* parts[1] = build_body(arena);
* conn.append_bytes({parts, 2}, std::move(arena), ConnectionShutdown::None);
* ```
*/
void
append_bytes(std::span<std::string_view> data_parts, Arena arena,
ConnectionShutdown shutdown_mode = ConnectionShutdown::None);
void send_response(void *protocol_context, std::string_view response_json,
Arena arena) override;
/**
* @brief Get a WeakRef to this connection for async operations.
*
* Returns a WeakRef that can be safely used to access this connection
* from other threads, such as pipeline processing threads. The WeakRef
* allows safe access even if the connection might be destroyed by the
* time the async operation executes.
*
* @return WeakRef to this connection
*
* @note Thread Safety: This method is thread-safe.
*
* @note The WeakRef should be used with lock() to safely access the
* connection. If lock() returns null, the connection has been destroyed.
* *
* Example usage: * Example usage:
* ```cpp * ```cpp
* conn->append_message("HTTP/1.1 200 OK\r\n\r\n", false); // Static string * auto weak_conn = conn.get_weak_ref();
* conn->append_message(dynamic_response, true); // Dynamic data * async_processor.submit([weak_conn, request_data]() {
* conn->append_message(arena_allocated_data, false); // Arena data * if (auto conn = weak_conn.lock()) {
* Arena arena;
* auto response = process_request(request_data, arena);
* conn->append_message({&response, 1}, std::move(arena));
* }
* });
* ``` * ```
*/ */
void append_message(std::string_view s, bool copy_to_arena = true); WeakRef<MessageSender> get_weak_ref() const {
assert(self_ref_.lock());
/** return self_ref_.copy();
* @brief Mark the connection to be closed after sending all queued messages. }
*
* Sets a flag that instructs the server to close this connection gracefully
* after all currently queued messages have been successfully sent to the
* client. This enables proper connection cleanup for protocols like HTTP/1.0
* or when implementing connection limits.
*
* @note The connection will remain active until:
* 1. All queued messages are sent to the client
* 2. The server processes the close flag during the next I/O cycle
* 3. The connection is properly closed and cleaned up
*
* @warning Thread Safety: Only call from the thread that currently owns this
* connection.
*
* Typical usage:
* ```cpp
* conn->append_message("HTTP/1.1 200 OK\r\n\r\nBye!");
* conn->close_after_send(); // Close after sending response
* ```
*/
void close_after_send() { closeConnection_ = true; }
/**
* @brief Get access to the connection's arena allocator.
*
* Returns a reference to this connection's private Arena instance,
* which should be used for all temporary allocations during request
* processing. The arena provides extremely fast allocation (~1ns) and
* automatic cleanup when the connection is destroyed or reset.
*
* @return Reference to the connection's arena allocator
*
* @warning Thread Safety: Only access from the thread that currently owns
* this connection. The arena allocator is not thread-safe and concurrent
* access will result in undefined behavior.
*
* @note Memory Lifecycle: Arena memory is automatically freed when:
* - The connection is destroyed
* - reset() is called (keeps first block, frees others)
* - The connection is moved (arena ownership transfers)
*
* Best practices:
* ```cpp
* Arena& arena = conn->get_arena();
*
* // Allocate temporary parsing buffers
* char* buffer = arena.allocate<char>(1024);
*
* // Construct temporary objects
* auto* request = arena.construct<HttpRequest>(arena);
*
* // Use arena-backed STL containers
* std::vector<Token, ArenaStlAllocator<Token>> tokens{&arena};
* ```
*/
Arena &get_arena() { return arena_; }
/** /**
* @brief Get the unique identifier for this connection. * @brief Get the unique identifier for this connection.
@@ -175,54 +201,6 @@ struct Connection {
*/ */
int64_t get_id() const { return id_; } int64_t get_id() const { return id_; }
/**
* @brief Get the number of bytes queued for transmission.
*
* Returns the total number of bytes in all messages currently
* queued for transmission to the client. This includes all data added via
* append_message() that has not yet been sent over the network.
*
* @return Total bytes queued for transmission
*
* @warning Thread Safety: Only call from the thread that currently owns this
* connection. Concurrent access to the message queue is not thread-safe.
*
* @note Performance: This method uses an O(1) counter for fast retrieval
* in release builds. In debug builds, validates counter accuracy.
*
* @note The count decreases as the server sends data via writeBytes() and
* removes completed messages from the queue.
*
* Use cases:
* ```cpp
* // Check if all data has been sent
* if (conn->outgoingBytesQueued() == 0) {
* conn->reset(); // Safe to reset arena
* }
*
* // Implement backpressure
* if (conn->outgoingBytesQueued() > MAX_BUFFER_SIZE) {
* // Stop adding more data until queue drains
* }
*
* // Logging/monitoring
* metrics.recordQueueDepth(conn->get_id(), conn->outgoingBytesQueued());
* ```
*/
int64_t outgoing_bytes_queued() const {
#ifndef NDEBUG
// Debug build: validate counter accuracy
int64_t computed_total = 0;
for (auto s : messages_) {
computed_total += s.size();
}
assert(
outgoing_bytes_queued_ == computed_total &&
"outgoing_bytes_queued_ counter is out of sync with actual queue size");
#endif
return outgoing_bytes_queued_;
}
/** /**
* @brief Protocol-specific data pointer for handler use. * @brief Protocol-specific data pointer for handler use.
* *
@@ -245,7 +223,7 @@ struct Connection {
* *
* Example usage: * Example usage:
* ```cpp * ```cpp
* class HttpHandler : public ConnectionHandler { * class HttpHandler : ConnectionHandler {
* void on_connection_established(Connection& conn) override { * void on_connection_established(Connection& conn) override {
* // Allocate HTTP state in connection's arena or heap * // Allocate HTTP state in connection's arena or heap
* auto* state = conn.get_arena().construct<HttpConnectionState>(); * auto* state = conn.get_arena().construct<HttpConnectionState>();
@@ -259,8 +237,8 @@ struct Connection {
* } * }
* *
* void on_data_arrived(std::string_view data, * void on_data_arrived(std::string_view data,
* std::unique_ptr<Connection>& conn_ptr) override { * Connection& conn) override {
* auto* state = static_cast<HttpConnectionState*>(conn_ptr->user_data); * auto* state = static_cast<HttpConnectionState*>(conn.user_data);
* // Use state for protocol processing... * // Use state for protocol processing...
* } * }
* }; * };
@@ -268,50 +246,13 @@ struct Connection {
*/ */
void *user_data = nullptr; void *user_data = nullptr;
/**
* Reset the connection's arena allocator and message queue for reuse.
*
* This method efficiently reclaims arena memory by keeping the first block
* and freeing all others, then reinitializes the message queue.
*
* @warning Thread Safety: This method should ONLY be called by the thread
* that currently owns this connection. Calling reset() while the connection
* is being transferred between threads or accessed by another thread will
* result in undefined behavior.
*
* @note The assert(messages_.empty()) ensures all outgoing data has been
* sent before resetting. This prevents data loss and indicates the connection
* is in a clean state for reuse.
*
* Typical usage pattern:
* - HTTP handlers call this after completing a request/response cycle
*/
void reset() {
assert(messages_.empty());
outgoing_bytes_queued_ = 0;
arena_.reset();
messages_ =
std::deque<std::string_view, ArenaStlAllocator<std::string_view>>{
ArenaStlAllocator<std::string_view>{&arena_}};
}
/**
* @note Ownership Transfer: To release a connection back to the server for
* continued processing, use the static method:
* ```cpp
* Server::release_back_to_server(std::move(connection_ptr));
* ```
*
* This is the correct way to return connection ownership when:
* - A handler has taken ownership via unique_ptr.release()
* - Background processing of the connection is complete
* - The connection should resume normal server-managed I/O processing
*
* The method is thread-safe and handles the case where the server may have
* been destroyed while the connection was being processed elsewhere.
*/
private: private:
struct Message {
Arena arena; // Owns all the memory (movable)
std::span<std::string_view> data_parts; // Points to arena-allocated memory
// (mutable for partial writes)
};
// Server is a friend and can access all networking internals // Server is a friend and can access all networking internals
friend struct Server; friend struct Server;
@@ -333,30 +274,43 @@ private:
size_t epoll_index, ConnectionHandler *handler, size_t epoll_index, ConnectionHandler *handler,
WeakRef<Server> server); WeakRef<Server> server);
template <typename T, typename... Args>
friend Ref<T> make_ref(Args &&...args);
// Networking interface - only accessible by Server // Networking interface - only accessible by Server
int readBytes(char *buf, size_t buffer_size); int readBytes(char *buf, size_t buffer_size);
bool writeBytes(); enum WriteBytesResult {
Error = 1 << 0,
Progress = 1 << 1,
Close = 1 << 2,
};
uint32_t write_bytes();
// Direct access methods for Server void close();
int getFd() const { return fd_; }
bool has_messages() const { return !messages_.empty(); } // Immutable connection properties
bool should_close() const { return closeConnection_; }
size_t getEpollIndex() const { return epoll_index_; }
const int fd_;
const int64_t id_; const int64_t id_;
const size_t epoll_index_; // Index of the epoll instance this connection uses const size_t epoll_index_; // Index of the epoll instance this connection uses
struct sockaddr_storage addr_; // sockaddr_storage handles IPv4/IPv6 struct sockaddr_storage addr_; // sockaddr_storage handles IPv4/IPv6
Arena arena_; ConnectionHandler *const handler_;
ConnectionHandler *handler_; WeakRef<Server> server_; // Weak reference to server for safe epoll_ctl calls
WeakRef<Server> server_; // Weak reference to server for safe cleanup WeakRef<Connection> self_ref_; // WeakRef to self for get_weak_ref()
std::deque<std::string_view, ArenaStlAllocator<std::string_view>> messages_{ // Only accessed from io thread
ArenaStlAllocator<std::string_view>{&arena_}}; std::deque<Message> message_queue_;
// Counter tracking total bytes queued for transmission mutable std::mutex mutex_;
int64_t outgoing_bytes_queued_{0}; ConnectionShutdown shutdown_requested_{
ConnectionShutdown::None}; // Protected by mutex_
std::deque<PendingResponse> pending_response_queue_; // Protected by mutex_
int fd_; // Protected by mutex_
// Whether or not to close the connection after completing writing the #if __has_feature(thread_sanitizer)
// response void tsan_acquire() { tsan_sync.load(std::memory_order_acquire); }
bool closeConnection_{false}; void tsan_release() { tsan_sync.store(0, std::memory_order_release); }
std::atomic<int> tsan_sync;
#else
void tsan_acquire() {}
void tsan_release() {}
#endif
}; };

View File

@@ -1,12 +1,24 @@
#pragma once #pragma once
#include <memory>
#include <span> #include <span>
#include <string_view> #include <string_view>
// Forward declaration to avoid circular dependency // Forward declarations to avoid circular dependency
struct Connection; struct Connection;
// Include Arena header since PendingResponse uses Arena by value
#include "arena.hpp"
/**
* Represents a response queued by pipeline threads for protocol processing.
* Contains JSON response data that can be wrapped by any protocol.
*/
struct PendingResponse {
void *protocol_context; // Arena-allocated protocol-specific context
std::string_view response_json; // JSON response body (arena-allocated)
Arena arena; // Arena containing response data and context
};
/** /**
* Abstract interface for handling connection data processing. * Abstract interface for handling connection data processing.
* *
@@ -25,22 +37,21 @@ public:
* Process incoming data from a connection. * Process incoming data from a connection.
* *
* @param data Incoming data buffer (may be partial message) * @param data Incoming data buffer (may be partial message)
* @param conn_ptr Unique pointer to connection - handler can take ownership * @param conn Connection reference - server retains ownership
* by releasing it
* *
* Implementation should: * Implementation should:
* - Parse incoming data using arena allocator when needed * - Create request-scoped Arena for parsing and response generation
* - Use conn_ptr->append_message() to queue response data to be sent * - Parse incoming data using the request arena
* - Use conn.append_message() to queue response data to be sent
* - Handle partial messages and streaming protocols appropriately * - Handle partial messages and streaming protocols appropriately
* - Can take ownership by calling conn_ptr.release() to pass to other threads * - Use conn.get_weak_ref() for async processing if needed
* - If ownership is taken, handler must call Server::release_back_to_server() *
* when done * @note `data` lifetime ends after the call to on_data_arrived.
* @note `data` is *not* owned by the connection arena, and its lifetime ends * @note Called from this connection's io thread.
* after the call to on_data_arrived. * @note Handler can safely access connection concurrently via thread-safe
* @note May be called from an arbitrary server thread. * methods.
*/ */
virtual void on_data_arrived(std::string_view /*data*/, virtual void on_data_arrived(std::string_view /*data*/, Connection &) {};
std::unique_ptr<Connection> &) {};
/** /**
* Called when data has been successfully written to the connection. * Called when data has been successfully written to the connection.
@@ -50,29 +61,12 @@ public:
* - Implementing backpressure for continuous data streams * - Implementing backpressure for continuous data streams
* - Progress monitoring for long-running transfers * - Progress monitoring for long-running transfers
* *
* @param conn_ptr Connection that made write progress - handler can take * @param conn Connection that made write progress - server retains ownership
* ownership * @note Called from this connection's io thread.
* @note May be called from an arbitrary server thread.
* @note Called during writes, not necessarily when buffer becomes empty * @note Called during writes, not necessarily when buffer becomes empty
* TODO Add bytes written argument?
*/ */
virtual void on_write_progress(std::unique_ptr<Connection> &) {} virtual void on_write_progress(Connection &) {}
/**
* Called when the connection's outgoing write buffer becomes empty.
*
* This indicates all queued messages have been successfully written
* to the socket. Useful for:
* - Resetting arena allocators safely
* - Implementing keep-alive connection reuse
* - Closing connections after final response
* - Relieving backpressure conditions
*
* @param conn_ptr Connection with empty write buffer - handler can take
* ownership
* @note May be called from an arbitrary server thread.
* @note Only called on transitions from non-empty → empty buffer
*/
virtual void on_write_buffer_drained(std::unique_ptr<Connection> &) {}
/** /**
* Called when a new connection is established. * Called when a new connection is established.
@@ -81,7 +75,7 @@ public:
* *
* Use this for: * Use this for:
* - Connection-specific initialization. * - Connection-specific initialization.
* @note May be called from an arbitrary server thread. * @note Called from this connection's io thread.
*/ */
virtual void on_connection_established(Connection &) {} virtual void on_connection_established(Connection &) {}
@@ -92,21 +86,34 @@ public:
* *
* Use this for: * Use this for:
* - Cleanup of connection-specific resources. * - Cleanup of connection-specific resources.
* @note May be called from an arbitrary server thread. * @note Called from this connection's io thread, or possibly a foreign thread
* that has locked the MessageSender associated with this connection.
*/ */
virtual void on_connection_closed(Connection &) {} virtual void on_connection_closed(Connection &) {}
/** /**
* @brief Called after a batch of connections has been processed. * @brief Called after a batch of connections has been processed.
* *
* This hook is called after on_data_arrived, on_write_progress, or * This hook is called after on_data_arrived or on_write_progress has been
* on_write_buffer_drained has been called for each connection in the batch. * called for each connection in the batch. All connections remain
* The handler can take ownership of the connections by moving the unique_ptr * server-owned.
* out of the span. Any connections left in the span will remain owned by the
* server.
* *
* @param batch A span of unique_ptrs to the connections in the batch. * @param batch A span of connection references in the batch.
* @note Called from this connection's io thread.
*/ */
virtual void virtual void on_batch_complete(std::span<Connection *const> /*batch*/) {}
on_batch_complete(std::span<std::unique_ptr<Connection>> /*batch*/) {}
/**
* Called before processing outgoing writes on a connection.
*
* This hook allows protocol handlers to process queued responses
* before actual socket writes occur. Used for response ordering,
* serialization, and other preprocessing.
*
* @param conn Connection about to write data
* @param pending_responses Responses queued by pipeline threads
* @note Called from this connection's io thread.
* @note Called when EPOLLOUT event occurs
*/
virtual void on_preprocess_writes(Connection &, std::span<PendingResponse>) {}
}; };

View File

@@ -1,6 +1,5 @@
#include "connection_registry.hpp" #include "connection_registry.hpp"
#include "connection.hpp" #include "connection.hpp"
#include <atomic>
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <unistd.h> #include <unistd.h>
@@ -14,49 +13,50 @@ ConnectionRegistry::ConnectionRegistry() : connections_(nullptr), max_fds_(0) {
} }
max_fds_ = rlim.rlim_cur; max_fds_ = rlim.rlim_cur;
// Calculate size rounded up to page boundary // TODO re-enable "ondemand pages" behavior
size_t array_size = max_fds_ * sizeof(Connection *); // // Calculate size rounded up to page boundary
size_t page_size = getpagesize(); // size_t array_size = max_fds_ * sizeof(Connection *);
size_t aligned_size = (array_size + page_size - 1) & ~(page_size - 1); // size_t page_size = getpagesize();
// size_t aligned_size = (array_size + page_size - 1) & ~(page_size - 1);
// Allocate virtual address space using mmap // // Allocate virtual address space using mmap
// MAP_ANONYMOUS provides zero-initialized pages on-demand (lazy allocation) // // MAP_ANONYMOUS provides zero-initialized pages on-demand (lazy
connections_ = static_cast<std::atomic<Connection *> *>( // allocation) connections_ = static_cast<std::atomic<Connection *> *>(
mmap(nullptr, aligned_size, PROT_READ | PROT_WRITE, // mmap(nullptr, aligned_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); // MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
if (connections_ == MAP_FAILED) { // if (connections_ == MAP_FAILED) {
perror("mmap"); // perror("mmap");
std::abort(); // std::abort();
} // }
// Store aligned size for munmap // // Store aligned size for munmap
aligned_size_ = aligned_size; // aligned_size_ = aligned_size;
connections_ = new Ref<Connection>[max_fds_];
} }
ConnectionRegistry::~ConnectionRegistry() { ConnectionRegistry::~ConnectionRegistry() {
if (connections_ != nullptr) { delete[] connections_;
for (int fd = 0; fd < static_cast<int>(max_fds_); ++fd) { // if (connections_ != nullptr) {
delete connections_[fd].load(std::memory_order_relaxed); // for (int fd = 0; fd < static_cast<int>(max_fds_); ++fd) {
} // delete connections_[fd].load(std::memory_order_relaxed);
if (munmap(connections_, aligned_size_) == -1) { // }
perror("munmap"); // if (munmap(connections_, aligned_size_) == -1) {
} // perror("munmap");
} // }
// }
} }
void ConnectionRegistry::store(int fd, std::unique_ptr<Connection> connection) { void ConnectionRegistry::store(int fd, Ref<Connection> connection) {
if (fd < 0 || static_cast<size_t>(fd) >= max_fds_) { if (fd < 0 || static_cast<size_t>(fd) >= max_fds_) {
std::abort(); std::abort();
} }
// Release ownership from unique_ptr and store raw pointer connections_[fd] = std::move(connection);
connections_[fd].store(connection.release(), std::memory_order_release);
} }
std::unique_ptr<Connection> ConnectionRegistry::remove(int fd) { Ref<Connection> ConnectionRegistry::remove(int fd) {
if (fd < 0 || static_cast<size_t>(fd) >= max_fds_) { if (fd < 0 || static_cast<size_t>(fd) >= max_fds_) {
std::abort(); std::abort();
} }
return std::unique_ptr<Connection>( return std::move(connections_[fd]);
connections_[fd].exchange(nullptr, std::memory_order_acquire));
} }

View File

@@ -1,10 +1,11 @@
#pragma once #pragma once
#include <cstddef> #include <cstddef>
#include <memory>
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/resource.h> #include <sys/resource.h>
#include "reference.hpp"
struct Connection; struct Connection;
/** /**
@@ -33,12 +34,12 @@ public:
/** /**
* Store a connection in the registry, indexed by its file descriptor. * Store a connection in the registry, indexed by its file descriptor.
* Takes ownership of the connection via unique_ptr. * Takes a reference to the connection for storage.
* *
* @param fd File descriptor (must be valid and < max_fds_) * @param fd File descriptor (must be valid and < max_fds_)
* @param connection unique_ptr to the connection (ownership transferred) * @param connection Ref<Connection> to store in the registry
*/ */
void store(int fd, std::unique_ptr<Connection> connection); void store(int fd, Ref<Connection> connection);
/** /**
* Remove a connection from the registry and transfer ownership to caller. * Remove a connection from the registry and transfer ownership to caller.
@@ -47,7 +48,7 @@ public:
* @param fd File descriptor * @param fd File descriptor
* @return unique_ptr to the connection, or nullptr if not found * @return unique_ptr to the connection, or nullptr if not found
*/ */
std::unique_ptr<Connection> remove(int fd); Ref<Connection> remove(int fd);
/** /**
* Get the maximum number of file descriptors supported. * Get the maximum number of file descriptors supported.
@@ -63,10 +64,7 @@ public:
ConnectionRegistry &operator=(ConnectionRegistry &&) = delete; ConnectionRegistry &operator=(ConnectionRegistry &&) = delete;
private: private:
std::atomic<Connection *> Ref<Connection> *connections_;
*connections_; ///< mmap'd array of raw connection pointers. It's
///< thread-safe without since epoll_ctl happens before
///< epoll_wait, but this makes tsan happy /shrug.
size_t max_fds_; ///< Maximum file descriptor limit size_t max_fds_; ///< Maximum file descriptor limit
size_t aligned_size_; ///< Page-aligned size for munmap size_t aligned_size_; ///< Page-aligned size for munmap
}; };

File diff suppressed because it is too large Load Diff

View File

@@ -1,40 +1,55 @@
#pragma once #pragma once
#include <atomic> #include <map>
#include <memory>
#include <string_view> #include <string_view>
#include <thread>
#include <unordered_set>
#include <llhttp.h> #include <llhttp.h>
#include "api_url_parser.hpp" #include "api_url_parser.hpp"
#include "arena.hpp" #include "arena.hpp"
#include "commit_pipeline.hpp"
#include "config.hpp" #include "config.hpp"
#include "connection.hpp" #include "connection.hpp"
#include "connection_handler.hpp" #include "connection_handler.hpp"
#include "perfetto_categories.hpp"
#include "pipeline_entry.hpp"
#include "server.hpp"
#include "thread_pipeline.hpp"
// Forward declarations // Forward declarations
struct CommitRequest; struct CommitRequest;
struct JsonCommitRequestParser; struct JsonCommitRequestParser;
struct RouteMatch; struct RouteMatch;
/**
* HTTP-specific response context stored in pipeline entries.
* Arena-allocated and passed through pipeline for response correlation.
*/
struct HttpResponseContext {
int64_t sequence_id; // For response ordering in pipelining
int64_t http_request_id; // For X-Response-ID header
bool connection_close; // Whether to close connection after response
};
/**
* Response data ready to send (sequence_id -> response data).
* Absence from map indicates response not ready yet.
*/
struct ResponseData {
std::span<std::string_view> data;
Arena arena;
bool connection_close;
};
/** /**
* HTTP connection state stored in Connection::user_data. * HTTP connection state stored in Connection::user_data.
* Manages llhttp parser state and request data. * Manages llhttp parser state and request data.
*/ */
struct HttpConnectionState { struct HttpRequestState {
Arena &arena; Arena arena{16 << 10}; // Request-scoped arena for parsing state
llhttp_t parser;
llhttp_settings_t settings;
// Current request data (arena-allocated) // Current request data (arena-allocated)
std::string_view method; std::string_view method;
std::string_view url;
using ArenaString =
std::basic_string<char, std::char_traits<char>, ArenaStlAllocator<char>>;
ArenaString url;
// Parse state // Parse state
bool headers_complete = false; bool headers_complete = false;
@@ -47,13 +62,12 @@ struct HttpConnectionState {
status_request_id; // Request ID extracted from /v1/status/{id} URL status_request_id; // Request ID extracted from /v1/status/{id} URL
// Header accumulation buffers (arena-allocated) // Header accumulation buffers (arena-allocated)
using ArenaString =
std::basic_string<char, std::char_traits<char>, ArenaStlAllocator<char>>;
ArenaString current_header_field_buf; ArenaString current_header_field_buf;
ArenaString current_header_value_buf; ArenaString current_header_value_buf;
bool header_field_complete = false; bool header_field_complete = false;
int64_t http_request_id = int64_t http_request_id =
0; // X-Request-Id header value (for tracing/logging) 0; // X-Request-Id header value (for tracing/logging)
int64_t sequence_id = 0; // Assigned for response ordering in pipelining
// Streaming parser for POST requests // Streaming parser for POST requests
Arena::Ptr<JsonCommitRequestParser> commit_parser; Arena::Ptr<JsonCommitRequestParser> commit_parser;
@@ -62,7 +76,30 @@ struct HttpConnectionState {
bool basic_validation_passed = bool basic_validation_passed =
false; // Set to true if basic validation passes false; // Set to true if basic validation passes
explicit HttpConnectionState(Arena &arena); HttpRequestState();
};
struct HttpConnectionState {
llhttp_t parser;
llhttp_settings_t settings;
HttpRequestState pending;
std::deque<HttpRequestState> queue;
int64_t get_next_sequence_id() { return next_sequence_id++; }
HttpConnectionState();
void send_ordered_response(Connection &conn, int64_t sequence_id,
std::span<std::string_view> http_response,
Arena arena, bool close_connection);
private:
// Response ordering for HTTP pipelining
std::map<int64_t, ResponseData>
ready_responses; // sequence_id -> response data
int64_t next_sequence_to_send = 0;
int64_t next_sequence_id = 0;
}; };
/** /**
@@ -71,74 +108,15 @@ struct HttpConnectionState {
*/ */
struct HttpHandler : ConnectionHandler { struct HttpHandler : ConnectionHandler {
explicit HttpHandler(const weaseldb::Config &config) explicit HttpHandler(const weaseldb::Config &config)
: config_(config), banned_request_ids(ArenaStlAllocator<std::string_view>( : config_(config), commit_pipeline_(config) {}
&banned_request_arena)) {
// Stage 0: Sequence assignment thread
sequenceThread = std::thread{[this]() {
pthread_setname_np(pthread_self(), "txn-sequence");
for (;;) {
auto guard = commitPipeline.acquire<0, 0>();
if (process_sequence_batch(guard.batch)) {
return; // Shutdown signal received
}
}
}};
// Stage 1: Precondition resolution thread
resolveThread = std::thread{[this]() {
pthread_setname_np(pthread_self(), "txn-resolve");
for (;;) {
auto guard = commitPipeline.acquire<1, 0>(/*maxBatch*/ 1);
if (process_resolve_batch(guard.batch)) {
return; // Shutdown signal received
}
}
}};
// Stage 2: Transaction persistence thread
persistThread = std::thread{[this]() {
pthread_setname_np(pthread_self(), "txn-persist");
for (;;) {
auto guard = commitPipeline.acquire<2, 0>();
if (process_persist_batch(guard.batch)) {
return; // Shutdown signal received
}
}
}};
// Stage 3: Connection return to server thread
releaseThread = std::thread{[this]() {
pthread_setname_np(pthread_self(), "txn-release");
for (;;) {
auto guard = commitPipeline.acquire<3, 0>();
if (process_release_batch(guard.batch)) {
return; // Shutdown signal received
}
}
}};
}
~HttpHandler() {
// Send single shutdown signal that flows through all pipeline stages
{
auto guard = commitPipeline.push(1, true);
guard.batch[0] =
ShutdownEntry{}; // Single ShutdownEntry flows through all stages
}
// Join all pipeline threads
sequenceThread.join();
resolveThread.join();
persistThread.join();
releaseThread.join();
}
void on_connection_established(Connection &conn) override; void on_connection_established(Connection &conn) override;
void on_connection_closed(Connection &conn) override; void on_connection_closed(Connection &conn) override;
void on_data_arrived(std::string_view data, void on_data_arrived(std::string_view data, Connection &conn) override;
std::unique_ptr<Connection> &conn_ptr) override; void
void on_write_buffer_drained(std::unique_ptr<Connection> &conn_ptr) override; on_preprocess_writes(Connection &conn,
void on_batch_complete( std::span<PendingResponse> pending_responses) override;
std::span<std::unique_ptr<Connection>> /*batch*/) override; void on_batch_complete(std::span<Connection *const> batch) override;
// llhttp callbacks (public for HttpConnectionState access) // llhttp callbacks (public for HttpConnectionState access)
static int onUrl(llhttp_t *parser, const char *at, size_t length); static int onUrl(llhttp_t *parser, const char *at, size_t length);
@@ -151,74 +129,37 @@ struct HttpHandler : ConnectionHandler {
static int onMessageComplete(llhttp_t *parser); static int onMessageComplete(llhttp_t *parser);
private: private:
static constexpr int lg_size = 16;
// Configuration reference // Configuration reference
const weaseldb::Config &config_; const weaseldb::Config &config_;
// Pipeline state (sequence thread only) // Commit processing pipeline
int64_t next_version = 1; // Next version to assign (sequence thread only) CommitPipeline commit_pipeline_;
// Pipeline state (persist thread writes, I/O threads read)
std::atomic<int64_t> committed_version{
0}; // Highest committed version (persist thread writes, I/O threads read)
// Arena for banned request IDs and related data structures (sequence thread
// only)
Arena banned_request_arena;
using BannedRequestIdSet =
std::unordered_set<std::string_view, std::hash<std::string_view>,
std::equal_to<std::string_view>,
ArenaStlAllocator<std::string_view>>;
BannedRequestIdSet banned_request_ids; // Request IDs that should not commit
// (string_views into arena)
// Main commit processing pipeline: sequence -> resolve -> persist -> release
StaticThreadPipeline<PipelineEntry, WaitStrategy::WaitIfUpstreamIdle, 1, 1, 1,
1>
commitPipeline{lg_size};
// Pipeline stage threads
std::thread sequenceThread;
std::thread resolveThread;
std::thread persistThread;
std::thread releaseThread;
// Pipeline stage processing methods (batch-based)
using BatchType =
StaticThreadPipeline<PipelineEntry, WaitStrategy::WaitIfUpstreamIdle, 1,
1, 1, 1>::Batch;
bool process_sequence_batch(BatchType &batch);
bool process_resolve_batch(BatchType &batch);
bool process_persist_batch(BatchType &batch);
bool process_release_batch(BatchType &batch);
// Route handlers // Route handlers
void handle_get_version(Connection &conn, const HttpConnectionState &state); void handle_get_version(Connection &conn, HttpRequestState &state);
void handle_post_commit(Connection &conn, const HttpConnectionState &state); void handle_post_commit(Connection &conn, HttpRequestState &state);
void handle_get_subscribe(Connection &conn, const HttpConnectionState &state); void handle_get_subscribe(Connection &conn, HttpRequestState &state);
void handle_get_status(Connection &conn, HttpConnectionState &state, void handle_get_status(Connection &conn, HttpRequestState &state,
const RouteMatch &route_match); const RouteMatch &route_match);
void handle_put_retention(Connection &conn, const HttpConnectionState &state, void handle_put_retention(Connection &conn, HttpRequestState &state,
const RouteMatch &route_match); const RouteMatch &route_match);
void handle_get_retention(Connection &conn, const HttpConnectionState &state, void handle_get_retention(Connection &conn, HttpRequestState &state,
const RouteMatch &route_match); const RouteMatch &route_match);
void handle_delete_retention(Connection &conn, void handle_delete_retention(Connection &conn, HttpRequestState &state,
const HttpConnectionState &state,
const RouteMatch &route_match); const RouteMatch &route_match);
void handle_get_metrics(Connection &conn, const HttpConnectionState &state); void handle_get_metrics(Connection &conn, HttpRequestState &state);
void handle_get_ok(Connection &conn, const HttpConnectionState &state); void handle_get_ok(Connection &conn, HttpRequestState &state);
void handle_not_found(Connection &conn, const HttpConnectionState &state); void handle_not_found(Connection &conn, HttpRequestState &state);
// HTTP utilities // HTTP utilities
static void send_response(Connection &conn, int status_code,
std::string_view content_type, // Helper functions for formatting responses without sending
std::string_view body, static std::span<std::string_view>
bool close_connection = false); format_response(int status_code, std::string_view content_type,
static void send_json_response(Connection &conn, int status_code, std::string_view body, Arena &response_arena,
std::string_view json, int64_t http_request_id, bool close_connection);
bool close_connection = false); static std::span<std::string_view>
static void send_error_response(Connection &conn, int status_code, format_json_response(int status_code, std::string_view json,
std::string_view message, Arena &response_arena, int64_t http_request_id,
bool close_connection = false); bool close_connection);
}; };

View File

@@ -234,8 +234,7 @@ int main(int argc, char *argv[]) {
std::cout << "Max request size: " << config->server.max_request_size_bytes std::cout << "Max request size: " << config->server.max_request_size_bytes
<< " bytes" << std::endl; << " bytes" << std::endl;
std::cout << "I/O threads: " << config->server.io_threads << std::endl; std::cout << "I/O threads: " << config->server.io_threads << std::endl;
std::cout << "Epoll instances: " << config->server.epoll_instances std::cout << "Epoll instances: " << config->server.io_threads << std::endl;
<< std::endl;
std::cout << "Event batch size: " << config->server.event_batch_size std::cout << "Event batch size: " << config->server.event_batch_size
<< std::endl; << std::endl;
std::cout << "Max connections: " << config->server.max_connections std::cout << "Max connections: " << config->server.max_connections
@@ -247,6 +246,24 @@ int main(int argc, char *argv[]) {
std::cout << "Request ID retention: " std::cout << "Request ID retention: "
<< config->commit.request_id_retention_hours.count() << " hours" << config->commit.request_id_retention_hours.count() << " hours"
<< std::endl; << std::endl;
// Print pipeline configuration
std::string wait_strategy_str;
switch (config->commit.pipeline_wait_strategy) {
case WaitStrategy::WaitIfStageEmpty:
wait_strategy_str = "WaitIfStageEmpty";
break;
case WaitStrategy::WaitIfUpstreamIdle:
wait_strategy_str = "WaitIfUpstreamIdle";
break;
case WaitStrategy::Never:
wait_strategy_str = "Never";
break;
}
std::cout << "Pipeline wait strategy: " << wait_strategy_str << std::endl;
std::cout << "Pipeline release threads: "
<< config->commit.pipeline_release_threads << std::endl;
std::cout << "Subscription buffer size: " std::cout << "Subscription buffer size: "
<< config->subscription.max_buffer_size_bytes << " bytes" << config->subscription.max_buffer_size_bytes << " bytes"
<< std::endl; << std::endl;
@@ -265,7 +282,6 @@ int main(int argc, char *argv[]) {
g_server = server.get(); g_server = server.get();
// Setup signal handling // Setup signal handling
std::signal(SIGPIPE, SIG_IGN);
std::signal(SIGTERM, signal_handler); std::signal(SIGTERM, signal_handler);
std::signal(SIGINT, signal_handler); std::signal(SIGINT, signal_handler);

View File

@@ -123,16 +123,6 @@ static void validate_or_abort(bool condition, const char *message,
} }
} }
// Helper to copy a string into arena memory
static std::string_view arena_copy_string(std::string_view str, Arena &arena) {
if (str.empty()) {
return std::string_view{};
}
char *copied = arena.allocate<char>(str.size());
std::memcpy(copied, str.data(), str.size());
return std::string_view(copied, str.size());
}
// Arena-based labels key for second level of map // Arena-based labels key for second level of map
// Uses string_view containing labels in Prometheus text format // Uses string_view containing labels in Prometheus text format
struct LabelsKey { struct LabelsKey {
@@ -149,8 +139,8 @@ struct LabelsKey {
validate_or_abort(is_valid_label_value(value), "invalid label value", validate_or_abort(is_valid_label_value(value), "invalid label value",
value); value);
auto key_view = arena_copy_string(key, arena); auto key_view = arena.copy_string(key);
auto value_view = arena_copy_string(value, arena); auto value_view = arena.copy_string(value);
labels.push_back({key_view, value_view}); labels.push_back({key_view, value_view});
} }
@@ -352,13 +342,21 @@ struct Gauge::State {
struct Histogram::State { struct Histogram::State {
std::span<const double> thresholds; // Bucket boundaries (sorted, std::span<const double> thresholds; // Bucket boundaries (sorted,
// deduplicated, sizes never change) // deduplicated, sizes never change)
std::span<uint64_t> counts; // Count per bucket
double sum; // Sum of observations // Histogram counter data
uint64_t observations; // Total observation count struct Counters {
std::span<uint64_t> bucket_counts; // Count per bucket
double sum = 0.0; // Sum of observations
uint64_t observations = 0; // Total observation count
};
Counters shared; // Protected by mutex, read by scrapes
Counters pending; // Lock-free accumulation when mutex busy
std::mutex std::mutex
mutex; // Per-thread, per-histogram mutex for consistent reads/writes mutex; // Per-thread, per-histogram mutex for consistent reads/writes
State() : sum(0.0), observations(0) {} State() {}
friend struct Metric; friend struct Metric;
}; };
@@ -454,7 +452,7 @@ struct Metric {
Arena arena; Arena arena;
ThreadInit() { ThreadInit() {
// Register this thread's arena for memory tracking // Register this thread's arena for memory tracking
std::unique_lock<std::mutex> _{mutex}; std::unique_lock _{mutex};
get_thread_arenas()[std::this_thread::get_id()] = &arena; get_thread_arenas()[std::this_thread::get_id()] = &arena;
} }
~ThreadInit() { ~ThreadInit() {
@@ -462,7 +460,7 @@ struct Metric {
// THREAD SAFETY: All operations below are protected by the global mutex, // THREAD SAFETY: All operations below are protected by the global mutex,
// including writes to global accumulated state, preventing races with // including writes to global accumulated state, preventing races with
// render thread // render thread
std::unique_lock<std::mutex> _{mutex}; std::unique_lock _{mutex};
// NOTE: registration_version increment is REQUIRED here because: // NOTE: registration_version increment is REQUIRED here because:
// - Cached render plan has pre-resolved pointers to thread-local state // - Cached render plan has pre-resolved pointers to thread-local state
// - When threads disappear, these pointers become invalid // - When threads disappear, these pointers become invalid
@@ -501,7 +499,21 @@ struct Metric {
if (thread_it != family->per_thread_state.end()) { if (thread_it != family->per_thread_state.end()) {
for (auto &[labels_key, instance] : thread_it->second.instances) { for (auto &[labels_key, instance] : thread_it->second.instances) {
// Acquire lock to get consistent snapshot // Acquire lock to get consistent snapshot
std::lock_guard<std::mutex> lock(instance->mutex); std::lock_guard lock(instance->mutex);
// BUGFIX: Flush pending observations into shared before
// accumulating
if (instance->pending.observations > 0) {
// Add pending to shared
for (size_t i = 0; i < instance->pending.bucket_counts.size();
++i) {
instance->shared.bucket_counts[i] +=
instance->pending.bucket_counts[i];
}
instance->shared.sum += instance->pending.sum;
instance->shared.observations += instance->pending.observations;
// No need to reset pending since instance is being destroyed
}
// Global accumulator should have been created when we made the // Global accumulator should have been created when we made the
// histogram // histogram
@@ -509,13 +521,14 @@ struct Metric {
assert(global_state); assert(global_state);
// Accumulate bucket counts (mutex already held) // Accumulate bucket counts (mutex already held)
for (size_t i = 0; i < instance->counts.size(); ++i) { for (size_t i = 0; i < instance->shared.bucket_counts.size(); ++i) {
global_state->counts[i] += instance->counts[i]; global_state->shared.bucket_counts[i] +=
instance->shared.bucket_counts[i];
} }
// Accumulate sum and observations // Accumulate sum and observations
global_state->sum += instance->sum; global_state->shared.sum += instance->shared.sum;
global_state->observations += instance->observations; global_state->shared.observations += instance->shared.observations;
} }
family->per_thread_state.erase(thread_it); family->per_thread_state.erase(thread_it);
} }
@@ -581,7 +594,7 @@ struct Metric {
} }
// Not found - copy to global arena and intern // Not found - copy to global arena and intern
auto interned_text = arena_copy_string(text, get_global_arena()); auto interned_text = get_global_arena().copy_string(text);
auto result = interned_set.emplace(interned_text); auto result = interned_set.emplace(interned_text);
return *result.first; return *result.first;
} }
@@ -592,7 +605,7 @@ struct Metric {
// Force thread_local initialization // Force thread_local initialization
(void)thread_init; (void)thread_init;
std::unique_lock<std::mutex> _{mutex}; std::unique_lock _{mutex};
++Metric::registration_version; ++Metric::registration_version;
const LabelsKey &key = intern_labels(labels); const LabelsKey &key = intern_labels(labels);
@@ -633,7 +646,7 @@ struct Metric {
static Gauge create_gauge_instance( static Gauge create_gauge_instance(
Family<Gauge> *family, Family<Gauge> *family,
std::span<const std::pair<std::string_view, std::string_view>> labels) { std::span<const std::pair<std::string_view, std::string_view>> labels) {
std::unique_lock<std::mutex> _{mutex}; std::unique_lock _{mutex};
++Metric::registration_version; ++Metric::registration_version;
const LabelsKey &key = intern_labels(labels); const LabelsKey &key = intern_labels(labels);
@@ -659,7 +672,7 @@ struct Metric {
// Force thread_local initialization // Force thread_local initialization
(void)thread_init; (void)thread_init;
std::unique_lock<std::mutex> _{mutex}; std::unique_lock _{mutex};
++Metric::registration_version; ++Metric::registration_version;
const LabelsKey &key = intern_labels(labels); const LabelsKey &key = intern_labels(labels);
@@ -683,16 +696,23 @@ struct Metric {
size_t bucket_count = family->p->buckets.size(); size_t bucket_count = family->p->buckets.size();
double *thresholds_data = double *thresholds_data =
get_thread_local_arena().allocate<double>(bucket_count); get_thread_local_arena().allocate<double>(bucket_count);
uint64_t *counts_data =
get_thread_local_arena().allocate<uint64_t>(bucket_count);
// Copy thresholds and initialize counts // Initialize thresholds
std::memcpy(thresholds_data, family->p->buckets.data(), std::memcpy(thresholds_data, family->p->buckets.data(),
bucket_count * sizeof(double)); bucket_count * sizeof(double));
std::memset(counts_data, 0, bucket_count * sizeof(uint64_t));
ptr->thresholds = std::span<const double>(thresholds_data, bucket_count); ptr->thresholds = std::span<const double>(thresholds_data, bucket_count);
ptr->counts = std::span<uint64_t>(counts_data, bucket_count);
// Initialize shared counts
auto shared_counts_span =
get_thread_local_arena().allocate_span<uint64_t>(bucket_count);
std::fill(shared_counts_span.begin(), shared_counts_span.end(), 0);
ptr->shared.bucket_counts = shared_counts_span;
// Initialize pending counts
auto pending_counts_span =
get_thread_local_arena().allocate_span<uint64_t>(bucket_count);
std::fill(pending_counts_span.begin(), pending_counts_span.end(), 0);
ptr->pending.bucket_counts = pending_counts_span;
// Ensure global accumulator exists for this label set // Ensure global accumulator exists for this label set
auto &global_state = family->p->global_accumulated_values[key]; auto &global_state = family->p->global_accumulated_values[key];
@@ -702,17 +722,16 @@ struct Metric {
// Allocate and copy thresholds, initialize counts // Allocate and copy thresholds, initialize counts
double *global_thresholds_data = double *global_thresholds_data =
get_global_arena().allocate<double>(bucket_count); get_global_arena().allocate<double>(bucket_count);
uint64_t *global_counts_data =
get_global_arena().allocate<uint64_t>(bucket_count);
std::memcpy(global_thresholds_data, ptr->thresholds.data(), std::memcpy(global_thresholds_data, ptr->thresholds.data(),
bucket_count * sizeof(double)); bucket_count * sizeof(double));
std::memset(global_counts_data, 0, bucket_count * sizeof(uint64_t));
global_state->thresholds = global_state->thresholds =
std::span<const double>(global_thresholds_data, bucket_count); std::span<const double>(global_thresholds_data, bucket_count);
global_state->counts =
std::span<uint64_t>(global_counts_data, bucket_count); auto global_shared_counts_span =
get_global_arena().allocate_span<uint64_t>(bucket_count);
std::fill(global_shared_counts_span.begin(),
global_shared_counts_span.end(), 0);
global_state->shared.bucket_counts = global_shared_counts_span;
} }
} }
Histogram result; Histogram result;
@@ -1137,12 +1156,12 @@ struct Metric {
uint64_t observations_snapshot; uint64_t observations_snapshot;
{ {
std::lock_guard<std::mutex> lock(instance->mutex); std::lock_guard lock(instance->mutex);
for (size_t i = 0; i < instance->counts.size(); ++i) { for (size_t i = 0; i < instance->shared.bucket_counts.size(); ++i) {
counts_snapshot[i] = instance->counts[i]; counts_snapshot[i] = instance->shared.bucket_counts[i];
} }
sum_snapshot = instance->sum; sum_snapshot = instance->shared.sum;
observations_snapshot = instance->observations; observations_snapshot = instance->shared.observations;
} }
for (size_t i = 0; i < bucket_count; ++i) { for (size_t i = 0; i < bucket_count; ++i) {
@@ -1155,11 +1174,12 @@ struct Metric {
// Add global accumulated values // Add global accumulated values
if (instruction.aggregate_histogram.global_state) { if (instruction.aggregate_histogram.global_state) {
auto *global_state = instruction.aggregate_histogram.global_state; auto *global_state = instruction.aggregate_histogram.global_state;
for (size_t i = 0; i < global_state->counts.size(); ++i) { for (size_t i = 0; i < global_state->shared.bucket_counts.size();
total_counts[i] += global_state->counts[i]; ++i) {
total_counts[i] += global_state->shared.bucket_counts[i];
} }
total_sum += global_state->sum; total_sum += global_state->shared.sum;
total_observations += global_state->observations; total_observations += global_state->shared.observations;
} }
// Format explicit bucket counts // Format explicit bucket counts
@@ -1421,16 +1441,36 @@ update_histogram_buckets_simd(std::span<const double> thresholds,
} }
void Histogram::observe(double x) { void Histogram::observe(double x) {
assert(p->thresholds.size() == p->counts.size()); assert(p->thresholds.size() == p->shared.bucket_counts.size());
std::lock_guard<std::mutex> lock(p->mutex); // Try to get lock immediately
if (p->mutex.try_lock()) {
// Fast path: got lock, flush any pending first
if (p->pending.observations > 0) {
// Add pending to shared
for (size_t i = 0; i < p->pending.bucket_counts.size(); ++i) {
p->shared.bucket_counts[i] += p->pending.bucket_counts[i];
p->pending.bucket_counts[i] = 0;
}
p->shared.sum += p->pending.sum;
p->shared.observations += p->pending.observations;
p->pending.sum = 0.0;
p->pending.observations = 0;
}
// Update bucket counts using SIMD // Update shared directly
update_histogram_buckets_simd(p->thresholds, p->counts, x, 0); update_histogram_buckets_simd(p->thresholds, p->shared.bucket_counts, x, 0);
p->shared.sum += x;
p->shared.observations++;
// Update sum and observation count p->mutex.unlock();
p->sum += x; } else {
p->observations++; // Slow path: accumulate in pending (lock-free)
update_histogram_buckets_simd(p->thresholds, p->pending.bucket_counts, x,
0);
p->pending.sum += x;
p->pending.observations++;
}
} }
template <> Family<Counter>::Family() = default; template <> Family<Counter>::Family() = default;
@@ -1458,15 +1498,15 @@ Histogram Family<Histogram>::create(
Family<Counter> create_counter(std::string_view name, std::string_view help) { Family<Counter> create_counter(std::string_view name, std::string_view help) {
validate_or_abort(is_valid_metric_name(name), "invalid counter name", name); validate_or_abort(is_valid_metric_name(name), "invalid counter name", name);
std::unique_lock<std::mutex> _{Metric::mutex}; std::unique_lock _{Metric::mutex};
++Metric::registration_version; ++Metric::registration_version;
auto &global_arena = Metric::get_global_arena(); auto &global_arena = Metric::get_global_arena();
auto name_view = arena_copy_string(name, global_arena); auto name_view = global_arena.copy_string(name);
auto &familyPtr = Metric::get_counter_families()[name_view]; auto &familyPtr = Metric::get_counter_families()[name_view];
if (!familyPtr) { if (!familyPtr) {
familyPtr = global_arena.construct<Family<Counter>::State>(global_arena); familyPtr = global_arena.construct<Family<Counter>::State>(global_arena);
familyPtr->name = name_view; familyPtr->name = name_view;
familyPtr->help = arena_copy_string(help, global_arena); familyPtr->help = global_arena.copy_string(help);
} else { } else {
validate_or_abort( validate_or_abort(
familyPtr->help == help, familyPtr->help == help,
@@ -1480,16 +1520,16 @@ Family<Counter> create_counter(std::string_view name, std::string_view help) {
Family<Gauge> create_gauge(std::string_view name, std::string_view help) { Family<Gauge> create_gauge(std::string_view name, std::string_view help) {
validate_or_abort(is_valid_metric_name(name), "invalid gauge name", name); validate_or_abort(is_valid_metric_name(name), "invalid gauge name", name);
std::unique_lock<std::mutex> _{Metric::mutex}; std::unique_lock _{Metric::mutex};
++Metric::registration_version; ++Metric::registration_version;
auto &global_arena = Metric::get_global_arena(); auto &global_arena = Metric::get_global_arena();
auto name_view = arena_copy_string(name, global_arena); auto name_view = global_arena.copy_string(name);
auto &familyPtr = Metric::get_gauge_families()[name_view]; auto &familyPtr = Metric::get_gauge_families()[name_view];
if (!familyPtr) { if (!familyPtr) {
// Family<T>::State instances use Arena::Ptr for automatic cleanup // Family<T>::State instances use Arena::Ptr for automatic cleanup
familyPtr = global_arena.construct<Family<Gauge>::State>(global_arena); familyPtr = global_arena.construct<Family<Gauge>::State>(global_arena);
familyPtr->name = name_view; familyPtr->name = name_view;
familyPtr->help = arena_copy_string(help, global_arena); familyPtr->help = global_arena.copy_string(help);
} else { } else {
validate_or_abort( validate_or_abort(
familyPtr->help == help, familyPtr->help == help,
@@ -1504,16 +1544,16 @@ Family<Histogram> create_histogram(std::string_view name, std::string_view help,
std::span<const double> buckets) { std::span<const double> buckets) {
validate_or_abort(is_valid_metric_name(name), "invalid histogram name", name); validate_or_abort(is_valid_metric_name(name), "invalid histogram name", name);
std::unique_lock<std::mutex> _{Metric::mutex}; std::unique_lock _{Metric::mutex};
++Metric::registration_version; ++Metric::registration_version;
auto &global_arena = Metric::get_global_arena(); auto &global_arena = Metric::get_global_arena();
auto name_view = arena_copy_string(name, global_arena); auto name_view = global_arena.copy_string(name);
auto &family_ptr = Metric::get_histogram_families()[name_view]; auto &family_ptr = Metric::get_histogram_families()[name_view];
if (!family_ptr) { if (!family_ptr) {
// Family<T>::State instances use Arena::Ptr for automatic cleanup // Family<T>::State instances use Arena::Ptr for automatic cleanup
family_ptr = global_arena.construct<Family<Histogram>::State>(global_arena); family_ptr = global_arena.construct<Family<Histogram>::State>(global_arena);
family_ptr->name = name_view; family_ptr->name = name_view;
family_ptr->help = arena_copy_string(help, global_arena); family_ptr->help = global_arena.copy_string(help);
// DESIGN: Prometheus-compatible histogram buckets // DESIGN: Prometheus-compatible histogram buckets
// Convert to vector for sorting // Convert to vector for sorting
@@ -1693,7 +1733,7 @@ std::span<std::string_view> render(Arena &arena) {
// Hold lock throughout all phases to prevent registry changes // Hold lock throughout all phases to prevent registry changes
// THREAD SAFETY: Global mutex protects cached_plan initialization and access, // THREAD SAFETY: Global mutex protects cached_plan initialization and access,
// prevents races during static member initialization at program startup // prevents races during static member initialization at program startup
std::unique_lock<std::mutex> _{Metric::mutex}; std::unique_lock _{Metric::mutex};
// Call all registered collectors to update their metrics // Call all registered collectors to update their metrics
for (const auto &collector : Metric::get_collectors()) { for (const auto &collector : Metric::get_collectors()) {
@@ -1723,7 +1763,7 @@ template <>
void Family<Counter>::register_callback( void Family<Counter>::register_callback(
std::span<const std::pair<std::string_view, std::string_view>> labels, std::span<const std::pair<std::string_view, std::string_view>> labels,
MetricCallback<Counter> callback) { MetricCallback<Counter> callback) {
std::unique_lock<std::mutex> _{Metric::mutex}; std::unique_lock _{Metric::mutex};
++Metric::registration_version; ++Metric::registration_version;
const LabelsKey &key = Metric::intern_labels(labels); const LabelsKey &key = Metric::intern_labels(labels);
@@ -1748,7 +1788,7 @@ template <>
void Family<Gauge>::register_callback( void Family<Gauge>::register_callback(
std::span<const std::pair<std::string_view, std::string_view>> labels, std::span<const std::pair<std::string_view, std::string_view>> labels,
MetricCallback<Gauge> callback) { MetricCallback<Gauge> callback) {
std::unique_lock<std::mutex> _{Metric::mutex}; std::unique_lock _{Metric::mutex};
++Metric::registration_version; ++Metric::registration_version;
const LabelsKey &key = Metric::intern_labels(labels); const LabelsKey &key = Metric::intern_labels(labels);
@@ -1804,7 +1844,7 @@ void reset_metrics_for_testing() {
} }
void register_collector(Ref<Collector> collector) { void register_collector(Ref<Collector> collector) {
std::unique_lock<std::mutex> _{Metric::mutex}; std::unique_lock _{Metric::mutex};
++Metric::registration_version; ++Metric::registration_version;
Metric::get_collectors().push_back(std::move(collector)); Metric::get_collectors().push_back(std::move(collector));
} }

View File

@@ -45,7 +45,6 @@
#include <functional> #include <functional>
#include <initializer_list> #include <initializer_list>
#include <memory>
#include <span> #include <span>
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
@@ -74,7 +73,7 @@ template <typename T> using MetricCallback = std::function<double()>;
// 3. When rendered, the values of all Counter objects with the same labels // 3. When rendered, the values of all Counter objects with the same labels
// are summed together into a single total. // are summed together into a single total.
struct Counter { struct Counter {
void inc(double = 1.0); // Increment counter (must be >= 0) void inc(double = 1.0); // Increment counter (must be >= 0, never blocks)
private: private:
Counter(); Counter();
@@ -95,9 +94,9 @@ private:
// are cumulative. // are cumulative.
// 4. For independent gauges, create them with unique labels. // 4. For independent gauges, create them with unique labels.
struct Gauge { struct Gauge {
void inc(double = 1.0); void inc(double = 1.0); // (never blocks)
void dec(double = 1.0); void dec(double = 1.0); // (never blocks)
void set(double); void set(double); // (never blocks)
private: private:
Gauge(); Gauge();
@@ -117,7 +116,8 @@ private:
// 3. When rendered, the observations from all Histogram objects with the // 3. When rendered, the observations from all Histogram objects with the
// same labels are combined into a single histogram. // same labels are combined into a single histogram.
struct Histogram { struct Histogram {
void observe(double); // Record observation in appropriate bucket void
observe(double); // Record observation in appropriate bucket (never blocks)
private: private:
Histogram(); Histogram();

View File

@@ -1,11 +1,15 @@
#pragma once #pragma once
#define ENABLE_PERFETTO 1 #ifndef ENABLE_PERFETTO
#define ENABLE_PERFETTO 0
#endif
#if ENABLE_PERFETTO #if ENABLE_PERFETTO
#include <perfetto.h> #include <perfetto.h>
#else #else
#define PERFETTO_DEFINE_CATEGORIES(...) #define PERFETTO_DEFINE_CATEGORIES(...)
#define PERFETTO_TRACK_EVENT_STATIC_STORAGE \
void perfetto_track_event_static_storage
#define TRACE_EVENT(...) #define TRACE_EVENT(...)
#endif #endif

View File

@@ -1,22 +1,37 @@
#pragma once #pragma once
#include "arena.hpp"
#include "connection.hpp" #include "connection.hpp"
#include <memory>
#include <variant> #include <variant>
// Forward declarations
struct CommitRequest;
/** /**
* Pipeline entry for commit requests that need full 4-stage processing. * Pipeline entry for commit requests that need full 4-stage processing.
* Contains connection with parsed CommitRequest. * Contains connection with parsed CommitRequest.
*/ */
struct CommitEntry { struct CommitEntry {
std::unique_ptr<Connection> connection; WeakRef<MessageSender> connection;
int64_t assigned_version = 0; // Set by sequence stage int64_t assigned_version = -1; // Set by sequence stage
bool resolve_success = false; // Set by resolve stage bool resolve_success = false; // Set by resolve stage
bool persist_success = false; // Set by persist stage bool persist_success = false; // Set by persist stage
// Protocol-agnostic context (arena-allocated, protocol-specific)
void *protocol_context = nullptr;
const CommitRequest *commit_request = nullptr; // Points to request_arena data
// Request arena contains parsed request data and response data
Arena request_arena;
// JSON response body (set by persist stage, arena-allocated)
std::string_view response_json;
CommitEntry() = default; // Default constructor for variant CommitEntry() = default; // Default constructor for variant
explicit CommitEntry(std::unique_ptr<Connection> conn) explicit CommitEntry(WeakRef<MessageSender> conn, void *ctx,
: connection(std::move(conn)) {} const CommitRequest *req, Arena arena)
: connection(std::move(conn)), protocol_context(ctx), commit_request(req),
request_arena(std::move(arena)) {}
}; };
/** /**
@@ -24,12 +39,24 @@ struct CommitEntry {
* then transfer to status threadpool. * then transfer to status threadpool.
*/ */
struct StatusEntry { struct StatusEntry {
std::unique_ptr<Connection> connection; WeakRef<MessageSender> connection;
int64_t version_upper_bound = 0; // Set by sequence stage int64_t version_upper_bound = 0; // Set by sequence stage
// Protocol-agnostic context (arena-allocated, protocol-specific)
void *protocol_context = nullptr;
std::string_view status_request_id; // Points to request_arena data
// Request arena for request data
Arena request_arena;
// JSON response body (set by persist stage, arena-allocated)
std::string_view response_json;
StatusEntry() = default; // Default constructor for variant StatusEntry() = default; // Default constructor for variant
explicit StatusEntry(std::unique_ptr<Connection> conn) explicit StatusEntry(WeakRef<MessageSender> conn, void *ctx,
: connection(std::move(conn)) {} std::string_view request_id, Arena arena)
: connection(std::move(conn)), protocol_context(ctx),
status_request_id(request_id), request_arena(std::move(arena)) {}
}; };
/** /**
@@ -38,11 +65,47 @@ struct StatusEntry {
* Resolve stage can perform configurable CPU work for benchmarking. * Resolve stage can perform configurable CPU work for benchmarking.
*/ */
struct HealthCheckEntry { struct HealthCheckEntry {
std::unique_ptr<Connection> connection; WeakRef<MessageSender> connection;
// Protocol-agnostic context (arena-allocated, protocol-specific)
void *protocol_context = nullptr;
// Request arena for response data
Arena request_arena;
// JSON response body (set by persist stage, arena-allocated)
std::string_view response_json;
HealthCheckEntry() = default; // Default constructor for variant HealthCheckEntry() = default; // Default constructor for variant
explicit HealthCheckEntry(std::unique_ptr<Connection> conn) explicit HealthCheckEntry(WeakRef<MessageSender> conn, void *ctx, Arena arena)
: connection(std::move(conn)) {} : connection(std::move(conn)), protocol_context(ctx),
request_arena(std::move(arena)) {}
};
/**
* Pipeline entry for /v1/version requests.
* Needs to integrate with the pipeline because for external consistency.
*/
struct GetVersionEntry {
WeakRef<MessageSender> connection;
// Protocol-agnostic context (arena-allocated, protocol-specific)
void *protocol_context = nullptr;
// Request arena for response data
Arena request_arena;
// JSON response body (set by persist stage, arena-allocated)
std::string_view response_json;
// Proposed response version
int64_t version;
GetVersionEntry() = default; // Default constructor for variant
explicit GetVersionEntry(WeakRef<MessageSender> conn, void *ctx, Arena arena,
int64_t version)
: connection(std::move(conn)), protocol_context(ctx),
request_arena(std::move(arena)), version(version) {}
}; };
/** /**
@@ -57,5 +120,5 @@ struct ShutdownEntry {
* Pipeline entry variant type used by the commit processing pipeline. * Pipeline entry variant type used by the commit processing pipeline.
* Each stage pattern-matches on the variant type to handle appropriately. * Each stage pattern-matches on the variant type to handle appropriately.
*/ */
using PipelineEntry = using PipelineEntry = std::variant<CommitEntry, StatusEntry, HealthCheckEntry,
std::variant<CommitEntry, StatusEntry, HealthCheckEntry, ShutdownEntry>; ShutdownEntry, GetVersionEntry>;

View File

@@ -8,7 +8,7 @@
* Gathers metrics like CPU usage, memory, and file descriptors by reading * Gathers metrics like CPU usage, memory, and file descriptors by reading
* files from the /proc filesystem. * files from the /proc filesystem.
*/ */
struct ProcessCollector : public metric::Collector { struct ProcessCollector : metric::Collector {
/** /**
* @brief Constructs the collector and initializes the process metrics. * @brief Constructs the collector and initializes the process metrics.
*/ */

View File

@@ -22,8 +22,8 @@
* Basic usage: * Basic usage:
* @code * @code
* auto obj = make_ref<MyClass>(args...); // Create managed object * auto obj = make_ref<MyClass>(args...); // Create managed object
* auto copy = obj; // Copy (thread-safe) * auto copy = obj.copy(); // Explicit copy (thread-safe)
* WeakRef<MyClass> weak = obj; // Create weak reference * WeakRef<MyClass> weak = obj.as_weak(); // Create weak reference
* auto locked = weak.lock(); // Try to promote to strong * auto locked = weak.lock(); // Try to promote to strong
* @endcode * @endcode
* *
@@ -31,6 +31,9 @@
* safely copy, move, and destroy references to the same object. * safely copy, move, and destroy references to the same object.
*/ */
// Forward declaration
template <typename T> struct WeakRef;
namespace detail { namespace detail {
struct ControlBlock { struct ControlBlock {
std::atomic<uint32_t> strong_count; std::atomic<uint32_t> strong_count;
@@ -82,7 +85,7 @@ struct ControlBlock {
* *
* Usage: * Usage:
* - Use make_ref<T>() to create new objects * - Use make_ref<T>() to create new objects
* - Copy/assign to share ownership * - Use copy() method for explicit sharing of ownership
* - Use get(), operator*, operator-> to access the object * - Use get(), operator*, operator-> to access the object
* - Use operator bool() to check if valid * - Use operator bool() to check if valid
* - Use reset() to release ownership * - Use reset() to release ownership
@@ -126,57 +129,28 @@ template <typename T> struct Ref {
~Ref() { release(); } ~Ref() { release(); }
/** /**
* @brief Copy constructor - increments strong reference count * @brief Copy constructor - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/ */
Ref(const Ref &other) noexcept Ref(const Ref &other) = delete;
: ptr(other.ptr), control_block(other.control_block) {
if (control_block) {
control_block->increment_strong();
}
}
/** /**
* @brief Converting copy constructor for polymorphism (Derived -> Base) * @brief Converting copy constructor - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/ */
template <typename U> template <typename U> Ref(const Ref<U> &other) = delete;
Ref(const Ref<U> &other) noexcept
requires std::is_convertible_v<U *, T *>
: ptr(other.ptr), control_block(other.control_block) {
if (control_block) {
control_block->increment_strong();
}
}
/** /**
* @brief Copy assignment operator * @brief Copy assignment operator - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/ */
Ref &operator=(const Ref &other) noexcept { Ref &operator=(const Ref &other) = delete;
if (this != &other) {
release();
ptr = other.ptr;
control_block = other.control_block;
if (control_block) {
control_block->increment_strong();
}
}
return *this;
}
/** /**
* @brief Converting assignment operator for polymorphism (Derived -> Base) * @brief Converting assignment operator - deleted to prevent accidental
* copies Use copy() method for explicit copying
*/ */
template <typename U> template <typename U> Ref &operator=(const Ref<U> &other) = delete;
Ref &operator=(const Ref<U> &other) noexcept
requires std::is_convertible_v<U *, T *>
{
release();
ptr = other.ptr;
control_block = other.control_block;
if (control_block) {
control_block->increment_strong();
}
return *this;
}
/** /**
* @brief Move constructor - transfers ownership * @brief Move constructor - transfers ownership
@@ -228,6 +202,28 @@ template <typename T> struct Ref {
return *this; return *this;
} }
/**
* @brief Explicitly create a copy with shared ownership
* @return New Ref that shares ownership of the same object
*/
[[nodiscard]] Ref copy() const noexcept {
if (control_block) {
control_block->increment_strong();
}
return Ref(ptr, control_block);
}
/**
* @brief Create a WeakRef that observes this object
* @return New WeakRef that observes the same object
*/
[[nodiscard]] WeakRef<T> as_weak() const noexcept {
if (control_block) {
control_block->increment_weak();
}
return WeakRef<T>(ptr, control_block);
}
/** /**
* @brief Reset to empty state * @brief Reset to empty state
*/ */
@@ -301,7 +297,8 @@ private:
* that might be destroyed by other threads. * that might be destroyed by other threads.
* *
* Usage: * Usage:
* - Create from Ref<T> to observe without owning * - Create from Ref<T> using as_weak() to observe without owning
* - Use copy() method for explicit copying
* - Use lock() to attempt promotion to Ref<T> * - Use lock() to attempt promotion to Ref<T>
* - Returns empty Ref<T> if object was already destroyed * - Returns empty Ref<T> if object was already destroyed
* - Use reset() to stop observing * - Use reset() to stop observing
@@ -332,7 +329,7 @@ template <typename T> struct WeakRef {
expected_strong, expected_strong + 1, std::memory_order_acquire, expected_strong, expected_strong + 1, std::memory_order_acquire,
std::memory_order_relaxed)) { std::memory_order_relaxed)) {
// Success - we incremented the strong count // Success - we incremented the strong count
return Ref<T>(get_object_ptr(), control_block); return Ref<T>(ptr, control_block);
} }
// CAS failed, expected_strong now contains the current value, retry // CAS failed, expected_strong now contains the current value, retry
} }
@@ -347,76 +344,40 @@ template <typename T> struct WeakRef {
~WeakRef() { release(); } ~WeakRef() { release(); }
/** /**
* @brief Copy constructor from WeakRef * @brief Copy constructor from WeakRef - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/ */
WeakRef(const WeakRef &other) noexcept : control_block(other.control_block) { WeakRef(const WeakRef &other) = delete;
if (control_block) {
control_block->increment_weak();
}
}
/** /**
* @brief Copy constructor from Ref * @brief Copy constructor from Ref - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/ */
WeakRef(const Ref<T> &ref) noexcept : control_block(ref.control_block) { WeakRef(const Ref<T> &ref) = delete;
if (control_block) {
control_block->increment_weak();
}
}
/** /**
* @brief Converting copy constructor from WeakRef for polymorphism * @brief Converting copy constructor from WeakRef - deleted to prevent
* accidental copies Use copy() method for explicit copying
*/ */
template <typename U> template <typename U> WeakRef(const WeakRef<U> &other) = delete;
WeakRef(const WeakRef<U> &other) noexcept
requires std::is_convertible_v<U *, T *>
: control_block(other.control_block) {
if (control_block) {
control_block->increment_weak();
}
}
/** /**
* @brief Converting copy constructor from Ref for polymorphism * @brief Converting copy constructor from Ref - deleted to prevent accidental
* copies Use copy() method for explicit copying
*/ */
template <typename U> template <typename U> WeakRef(const Ref<U> &ref) = delete;
WeakRef(const Ref<U> &ref) noexcept
requires std::is_convertible_v<U *, T *>
: control_block(ref.control_block) {
if (control_block) {
control_block->increment_weak();
}
}
/** /**
* @brief Converting copy assignment from WeakRef for polymorphism * @brief Converting copy assignment from WeakRef - deleted to prevent
* accidental copies Use copy() method for explicit copying
*/ */
template <typename U> template <typename U> WeakRef &operator=(const WeakRef<U> &other) = delete;
WeakRef &operator=(const WeakRef<U> &other) noexcept
requires std::is_convertible_v<U *, T *>
{
release();
control_block = other.control_block;
if (control_block) {
control_block->increment_weak();
}
return *this;
}
/** /**
* @brief Converting copy assignment from Ref for polymorphism * @brief Converting copy assignment from Ref - deleted to prevent accidental
* copies Use copy() method for explicit copying
*/ */
template <typename U> template <typename U> WeakRef &operator=(const Ref<U> &ref) = delete;
WeakRef &operator=(const Ref<U> &ref) noexcept
requires std::is_convertible_v<U *, T *>
{
release();
control_block = ref.control_block;
if (control_block) {
control_block->increment_weak();
}
return *this;
}
/** /**
* @brief Converting move constructor from WeakRef for polymorphism * @brief Converting move constructor from WeakRef for polymorphism
@@ -424,7 +385,8 @@ template <typename T> struct WeakRef {
template <typename U> template <typename U>
WeakRef(WeakRef<U> &&other) noexcept WeakRef(WeakRef<U> &&other) noexcept
requires std::is_convertible_v<U *, T *> requires std::is_convertible_v<U *, T *>
: control_block(other.control_block) { : ptr(other.ptr), control_block(other.control_block) {
other.ptr = nullptr;
other.control_block = nullptr; other.control_block = nullptr;
} }
@@ -436,41 +398,31 @@ template <typename T> struct WeakRef {
requires std::is_convertible_v<U *, T *> requires std::is_convertible_v<U *, T *>
{ {
release(); release();
ptr = other.ptr;
control_block = other.control_block; control_block = other.control_block;
other.ptr = nullptr;
other.control_block = nullptr; other.control_block = nullptr;
return *this; return *this;
} }
/** /**
* @brief Copy assignment from WeakRef * @brief Copy assignment from WeakRef - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/ */
WeakRef &operator=(const WeakRef &other) noexcept { WeakRef &operator=(const WeakRef &other) = delete;
if (this != &other) {
release();
control_block = other.control_block;
if (control_block) {
control_block->increment_weak();
}
}
return *this;
}
/** /**
* @brief Copy assignment from Ref * @brief Copy assignment from Ref - deleted to prevent accidental copies
* Use copy() method for explicit copying
*/ */
WeakRef &operator=(const Ref<T> &ref) noexcept { WeakRef &operator=(const Ref<T> &ref) = delete;
release();
control_block = ref.control_block;
if (control_block) {
control_block->increment_weak();
}
return *this;
}
/** /**
* @brief Move constructor * @brief Move constructor
*/ */
WeakRef(WeakRef &&other) noexcept : control_block(other.control_block) { WeakRef(WeakRef &&other) noexcept
: ptr(other.ptr), control_block(other.control_block) {
other.ptr = nullptr;
other.control_block = nullptr; other.control_block = nullptr;
} }
@@ -480,42 +432,46 @@ template <typename T> struct WeakRef {
WeakRef &operator=(WeakRef &&other) noexcept { WeakRef &operator=(WeakRef &&other) noexcept {
if (this != &other) { if (this != &other) {
release(); release();
ptr = other.ptr;
control_block = other.control_block; control_block = other.control_block;
other.ptr = nullptr;
other.control_block = nullptr; other.control_block = nullptr;
} }
return *this; return *this;
} }
/**
* @brief Explicitly create a copy with shared weak reference
* @return New WeakRef that observes the same object
*/
[[nodiscard]] WeakRef copy() const noexcept {
if (control_block) {
control_block->increment_weak();
}
return WeakRef(ptr, control_block);
}
/** /**
* @brief Reset to empty state * @brief Reset to empty state
*/ */
void reset() noexcept { void reset() noexcept {
release(); release();
ptr = nullptr;
control_block = nullptr; control_block = nullptr;
} }
/** /**
* @brief Default constructor - creates empty WeakRef * @brief Default constructor - creates empty WeakRef
*/ */
WeakRef() : control_block(nullptr) {} WeakRef() : ptr(nullptr), control_block(nullptr) {}
private: private:
explicit WeakRef(detail::ControlBlock *cb) : control_block(cb) {} explicit WeakRef(T *object_ptr, detail::ControlBlock *cb)
: ptr(object_ptr), control_block(cb) {}
T *ptr;
detail::ControlBlock *control_block; detail::ControlBlock *control_block;
// Helper to calculate object pointer from control block
T *get_object_ptr() const {
if (!control_block)
return nullptr;
constexpr size_t cb_size = sizeof(detail::ControlBlock);
constexpr size_t alignment = alignof(T);
constexpr size_t padded_cb_size =
(cb_size + alignment - 1) & ~(alignment - 1);
return reinterpret_cast<T *>(reinterpret_cast<char *>(control_block) +
padded_cb_size);
}
/** /**
* @brief Release current weak reference and handle cleanup * @brief Release current weak reference and handle cleanup
*/ */
@@ -550,6 +506,8 @@ private:
* @code * @code
* auto obj = make_ref<MyClass>(arg1, arg2); * auto obj = make_ref<MyClass>(arg1, arg2);
* auto empty_vec = make_ref<std::vector<int>>(); * auto empty_vec = make_ref<std::vector<int>>();
* auto obj_copy = obj.copy(); // Explicit copy
* WeakRef<MyClass> weak = obj.as_weak(); // Create weak reference
* @endcode * @endcode
* *
* Thread safety: Safe to call from multiple threads simultaneously. * Thread safety: Safe to call from multiple threads simultaneously.

View File

@@ -5,7 +5,6 @@
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <fcntl.h> #include <fcntl.h>
#include <memory>
#include <netdb.h> #include <netdb.h>
#include <netinet/tcp.h> #include <netinet/tcp.h>
#include <pthread.h> #include <pthread.h>
@@ -25,7 +24,7 @@ Ref<Server> Server::create(const weaseldb::Config &config,
ConnectionHandler &handler, ConnectionHandler &handler,
const std::vector<int> &listen_fds) { const std::vector<int> &listen_fds) {
auto result = make_ref<Server>(config, handler, listen_fds); auto result = make_ref<Server>(config, handler, listen_fds);
result->self_ = result; result->self_ = result.as_weak();
return result; return result;
} }
@@ -139,51 +138,6 @@ void Server::shutdown() {
} }
} }
void Server::release_back_to_server(std::unique_ptr<Connection> connection) {
if (!connection) {
return; // Nothing to release
}
// Try to get the server from the connection's weak_ptr
if (auto server = connection->server_.lock()) {
// Server still exists - pass unique_ptr directly
server->receiveConnectionBack(std::move(connection));
}
// If server is gone, connection will be automatically cleaned up when
// unique_ptr destructs
}
void Server::receiveConnectionBack(std::unique_ptr<Connection> connection) {
if (!connection) {
return; // Nothing to process
}
// Re-add the connection to epoll for continued processing
struct epoll_event event{};
if (!connection->has_messages()) {
event.events = EPOLLIN | EPOLLONESHOT;
} else {
event.events = EPOLLOUT | EPOLLONESHOT;
}
int fd = connection->getFd();
event.data.fd = fd;
// Store connection in registry before adding to epoll
// This mirrors the pattern used in process_connection_batch
size_t epoll_index = connection->getEpollIndex();
int epollfd = epoll_fds_[epoll_index];
connection_registry_.store(fd, std::move(connection));
if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event) == -1) {
perror("epoll_ctl MOD in receiveConnectionBack");
// Remove from registry and clean up on failure
(void)connection_registry_.remove(fd);
}
}
int Server::create_local_connection() { int Server::create_local_connection() {
int sockets[2]; int sockets[2];
if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) != 0) { if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) != 0) {
@@ -210,22 +164,24 @@ int Server::create_local_connection() {
struct sockaddr_storage addr{}; struct sockaddr_storage addr{};
addr.ss_family = AF_UNIX; addr.ss_family = AF_UNIX;
// Calculate epoll_index for connection distribution // Use round-robin distribution for local connections across epoll instances
size_t epoll_index = size_t epoll_index =
connection_distribution_counter_.fetch_add(1, std::memory_order_relaxed) % connection_distribution_counter_.fetch_add(1, std::memory_order_relaxed) %
epoll_fds_.size(); epoll_fds_.size();
// Create Connection object // Create Connection object
auto connection = std::unique_ptr<Connection>(new Connection( auto connection = make_ref<Connection>(
addr, server_fd, connection_id_.fetch_add(1, std::memory_order_relaxed), addr, server_fd, connection_id_.fetch_add(1, std::memory_order_relaxed),
epoll_index, &handler_, self_)); epoll_index, &handler_, self_.copy());
connection->self_ref_ = connection.as_weak();
connection->tsan_release();
// Store in registry // Store in registry
connection_registry_.store(server_fd, std::move(connection)); connection_registry_.store(server_fd, std::move(connection));
// Add to appropriate epoll instance // Add to appropriate epoll instance
struct epoll_event event{}; struct epoll_event event{};
event.events = EPOLLIN | EPOLLONESHOT; event.events = EPOLLIN;
event.data.fd = server_fd; event.data.fd = server_fd;
int epollfd = epoll_fds_[epoll_index]; int epollfd = epoll_fds_[epoll_index];
@@ -263,10 +219,11 @@ void Server::setup_shutdown_pipe() {
} }
void Server::create_epoll_instances() { void Server::create_epoll_instances() {
// Create multiple epoll instances to reduce contention // Create one epoll instance per I/O thread (1:1 mapping) to eliminate
epoll_fds_.resize(config_.server.epoll_instances); // contention
epoll_fds_.resize(config_.server.io_threads);
for (int i = 0; i < config_.server.epoll_instances; ++i) { for (int i = 0; i < config_.server.io_threads; ++i) {
epoll_fds_[i] = epoll_create1(EPOLL_CLOEXEC); epoll_fds_[i] = epoll_create1(EPOLL_CLOEXEC);
if (epoll_fds_[i] == -1) { if (epoll_fds_[i] == -1) {
perror("epoll_create1"); perror("epoll_create1");
@@ -299,11 +256,6 @@ void Server::create_epoll_instances() {
} }
} }
int Server::get_epoll_for_thread(int thread_id) const {
// Round-robin assignment of threads to epoll instances
return epoll_fds_[thread_id % epoll_fds_.size()];
}
void Server::start_io_threads(std::vector<std::thread> &threads) { void Server::start_io_threads(std::vector<std::thread> &threads) {
int io_threads = config_.server.io_threads; int io_threads = config_.server.io_threads;
@@ -312,12 +264,11 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
pthread_setname_np(pthread_self(), pthread_setname_np(pthread_self(),
("io-" + std::to_string(thread_id)).c_str()); ("io-" + std::to_string(thread_id)).c_str());
// Each thread uses its assigned epoll instance (round-robin) // Each thread uses its assigned epoll instance (1:1 mapping)
int epollfd = get_epoll_for_thread(thread_id); int epollfd = epoll_fds_[thread_id];
std::vector<epoll_event> events(config_.server.event_batch_size); std::vector<epoll_event> events(config_.server.event_batch_size);
std::vector<std::unique_ptr<Connection>> batch( std::vector<Ref<Connection>> batch(config_.server.event_batch_size);
config_.server.event_batch_size);
std::vector<int> batch_events(config_.server.event_batch_size); std::vector<int> batch_events(config_.server.event_batch_size);
std::vector<int> std::vector<int>
ready_listen_fds; // Reused across iterations to avoid allocation ready_listen_fds; // Reused across iterations to avoid allocation
@@ -351,11 +302,12 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
// Handle existing connection events // Handle existing connection events
int fd = events[i].data.fd; int fd = events[i].data.fd;
std::unique_ptr<Connection> conn = connection_registry_.remove(fd); Ref<Connection> conn = connection_registry_.remove(fd);
conn->tsan_acquire();
assert(conn); assert(conn);
if (events[i].events & (EPOLLERR | EPOLLHUP)) { if (events[i].events & (EPOLLERR | EPOLLHUP)) {
// unique_ptr will automatically delete on scope exit close_connection(conn);
continue; continue;
} }
@@ -368,7 +320,7 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
// Process existing connections in batch // Process existing connections in batch
if (batch_count > 0) { if (batch_count > 0) {
process_connection_batch( process_connection_batch(
epollfd, std::span(batch).subspan(0, batch_count), std::span(batch).subspan(0, batch_count),
std::span(batch_events).subspan(0, batch_count)); std::span(batch_events).subspan(0, batch_count));
} }
@@ -408,9 +360,9 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
perror("setsockopt SO_KEEPALIVE"); perror("setsockopt SO_KEEPALIVE");
} }
// Add to epoll with no interests // Add to epoll
struct epoll_event event{}; struct epoll_event event{};
event.events = 0; event.events = EPOLLIN;
event.data.fd = fd; event.data.fd = fd;
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event) == -1) { if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event) == -1) {
perror("epoll_ctl ADD"); perror("epoll_ctl ADD");
@@ -418,11 +370,13 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
} }
// Transfer ownership from registry to batch processing // Transfer ownership from registry to batch processing
size_t epoll_index = thread_id % epoll_fds_.size(); size_t epoll_index = thread_id;
batch[batch_count] = std::unique_ptr<Connection>(new Connection( batch[batch_count] = make_ref<Connection>(
addr, fd, addr, fd,
connection_id_.fetch_add(1, std::memory_order_relaxed), connection_id_.fetch_add(1, std::memory_order_relaxed),
epoll_index, &handler_, self_)); epoll_index, &handler_, self_.copy());
batch[batch_count]->self_ref_ = batch[batch_count].as_weak();
batch[batch_count]->tsan_release();
batch_events[batch_count] = batch_events[batch_count] =
EPOLLIN; // New connections always start with read EPOLLIN; // New connections always start with read
batch_count++; batch_count++;
@@ -430,7 +384,7 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
// Process batch if full // Process batch if full
if (batch_count == config_.server.event_batch_size) { if (batch_count == config_.server.event_batch_size) {
process_connection_batch( process_connection_batch(
epollfd, {batch.data(), (size_t)batch_count}, {batch.data(), (size_t)batch_count},
{batch_events.data(), (size_t)batch_count}); {batch_events.data(), (size_t)batch_count});
batch_count = 0; batch_count = 0;
} }
@@ -440,7 +394,7 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
// Process remaining accepted connections // Process remaining accepted connections
if (batch_count > 0) { if (batch_count > 0) {
process_connection_batch( process_connection_batch(
epollfd, std::span(batch).subspan(0, batch_count), std::span(batch).subspan(0, batch_count),
std::span(batch_events).subspan(0, batch_count)); std::span(batch_events).subspan(0, batch_count));
batch_count = 0; batch_count = 0;
} }
@@ -449,8 +403,7 @@ void Server::start_io_threads(std::vector<std::thread> &threads) {
} }
} }
void Server::process_connection_reads(std::unique_ptr<Connection> &conn, void Server::process_connection_reads(Ref<Connection> &conn, int events) {
int events) {
assert(conn); assert(conn);
// Handle EPOLLIN - read data and process it // Handle EPOLLIN - read data and process it
if (events & EPOLLIN) { if (events & EPOLLIN) {
@@ -461,7 +414,7 @@ void Server::process_connection_reads(std::unique_ptr<Connection> &conn,
if (r < 0) { if (r < 0) {
// Error or EOF - connection should be closed // Error or EOF - connection should be closed
conn.reset(); close_connection(conn);
return; return;
} }
@@ -470,60 +423,58 @@ void Server::process_connection_reads(std::unique_ptr<Connection> &conn,
return; return;
} }
// Call handler with unique_ptr - handler can take ownership if needed // Call handler with connection reference - server retains ownership
handler_.on_data_arrived(std::string_view{buf, size_t(r)}, conn); handler_.on_data_arrived(std::string_view{buf, size_t(r)}, *conn);
// If handler took ownership (conn is now null), return true to indicate
// processing is done
if (!conn) {
return;
}
} }
} }
void Server::process_connection_writes(std::unique_ptr<Connection> &conn, void Server::process_connection_writes(Ref<Connection> &conn, int events) {
int /*events*/) {
assert(conn); assert(conn);
// For simplicity, we always attempt to write when an event fires. We could be
// more precise and skip the write if we detect that we've already seen EAGAIN
// on this connection and we don't have EPOLLOUT.
if (conn->has_messages()) {
bool had_messages = conn->has_messages();
bool error = conn->writeBytes();
if (error) { // Process pending responses first if this is an EPOLLOUT event
conn.reset(); // Connection should be closed if (events & EPOLLOUT) {
return; std::unique_lock lock(conn->mutex_);
} if (!conn->pending_response_queue_.empty()) {
std::vector<PendingResponse> pending_vec;
// Call handler with unique_ptr - handler can take ownership if needed pending_vec.reserve(conn->pending_response_queue_.size());
handler_.on_write_progress(conn); for (auto &response : conn->pending_response_queue_) {
// If handler took ownership (conn is now null), return true to indicate pending_vec.push_back(std::move(response));
// processing is done
if (!conn) {
return;
}
// Check if buffer became empty (transition from non-empty -> empty)
if (had_messages && !conn->has_messages()) {
handler_.on_write_buffer_drained(conn);
// If handler took ownership (conn is now null), return
if (!conn) {
return;
} }
} conn->pending_response_queue_.clear();
lock.unlock();
// Check if we should close the connection according to application handler_.on_preprocess_writes(*conn, std::span{pending_vec});
if (!conn->has_messages() && conn->should_close()) {
conn.reset(); // Connection should be closed
return;
} }
} }
auto result = conn->write_bytes();
if (result & Connection::WriteBytesResult::Error) {
close_connection(conn);
return;
}
if (result & Connection::WriteBytesResult::Progress) {
// Call handler with connection reference - server retains ownership
handler_.on_write_progress(*conn);
}
// Check if we should close the connection according to application
if (result & Connection::WriteBytesResult::Close) {
close_connection(conn);
return;
}
} }
void Server::process_connection_batch( void Server::close_connection(Ref<Connection> &conn) {
int epollfd, std::span<std::unique_ptr<Connection>> batch, conn->close();
std::span<const int> events) { conn.reset();
}
static thread_local std::vector<Connection *> batch_connections;
void Server::process_connection_batch(std::span<Ref<Connection>> batch,
std::span<const int> events) {
// First process writes for each connection // First process writes for each connection
for (int i = 0; i < static_cast<int>(batch.size()); ++i) { for (int i = 0; i < static_cast<int>(batch.size()); ++i) {
@@ -539,29 +490,20 @@ void Server::process_connection_batch(
} }
} }
// Call batch complete handler - handlers can take ownership here // Call batch complete handler with connection pointers
handler_.on_batch_complete(batch); batch_connections.clear();
for (auto &conn : batch) {
if (conn) {
batch_connections.push_back(conn.get());
}
}
handler_.on_batch_complete(batch_connections);
// Transfer all remaining connections back to epoll // Return all connections to registry
for (auto &conn_ptr : batch) { for (auto &conn : batch) {
if (conn_ptr) { if (conn) {
int fd = conn_ptr->getFd(); const int fd = conn->fd_;
connection_registry_.store(fd, std::move(conn));
struct epoll_event event{};
if (!conn_ptr->has_messages()) {
event.events = EPOLLIN | EPOLLONESHOT;
} else {
event.events = EPOLLOUT | EPOLLONESHOT;
}
event.data.fd = fd; // Use file descriptor for epoll
// Put connection back in registry since handler didn't take ownership.
// Must happen before epoll_ctl
connection_registry_.store(fd, std::move(conn_ptr));
if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event) == -1) {
perror("epoll_ctl MOD");
(void)connection_registry_.remove(fd);
}
} }
} }
} }

View File

@@ -1,7 +1,6 @@
#pragma once #pragma once
#include <atomic> #include <atomic>
#include <memory>
#include <span> #include <span>
#include <thread> #include <thread>
#include <vector> #include <vector>
@@ -95,39 +94,27 @@ struct Server {
*/ */
int create_local_connection(); int create_local_connection();
/**
* Release a connection back to its server for continued processing.
*
* This static method safely returns ownership of a connection back to its
* server. If the server has been destroyed, the connection will be safely
* cleaned up.
*
* This method is thread-safe and can be called from any thread.
*
* @param connection unique_ptr to the connection being released back
*/
static void release_back_to_server(std::unique_ptr<Connection> connection);
private: private:
friend struct Connection; friend struct Connection;
/** /**
* Private constructor - use create() factory method instead. * Private constructor - use create() factory method instead.
* *
* @param config Server configuration (threads, ports, limits, etc.) * @param config Server configuration (threads, ports, limits, etc.)
* @param handler Protocol handler for processing connection data * @param handler Protocol handler for processing connection data. Must
* outlive the server.
* @param listen_fds Vector of file descriptors to accept connections on. * @param listen_fds Vector of file descriptors to accept connections on.
* Server takes ownership and will close them on * Server takes ownership and will close them on
* destruction. Server will set these to non-blocking mode for safe epoll * destruction. Server will set these to non-blocking mode for safe epoll
* usage. * usage.
*/ */
explicit Server(const weaseldb::Config &config, ConnectionHandler &handler, explicit Server(const weaseldb::Config &config, ConnectionHandler &handler,
const std::vector<int> &listen_fds); const std::vector<int> &listen_fds);
friend Ref<Server> make_ref<Server>(const weaseldb::Config &config, template <typename T, typename... Args>
ConnectionHandler &handler, friend Ref<T> make_ref(Args &&...args);
const std::vector<int> &listen_fds);
WeakRef<Server> self_; WeakRef<Server> self_;
const weaseldb::Config &config_; weaseldb::Config config_;
ConnectionHandler &handler_; ConnectionHandler &handler_;
// Connection registry // Connection registry
@@ -137,13 +124,14 @@ private:
std::atomic<int64_t> connection_id_{0}; std::atomic<int64_t> connection_id_{0};
std::atomic<int> active_connections_{0}; std::atomic<int> active_connections_{0};
// Round-robin counter for connection distribution // Round-robin counter for local connection distribution across epoll
// instances
std::atomic<size_t> connection_distribution_counter_{0}; std::atomic<size_t> connection_distribution_counter_{0};
// Shutdown coordination // Shutdown coordination
int shutdown_pipe_[2] = {-1, -1}; int shutdown_pipe_[2] = {-1, -1};
// Multiple epoll file descriptors to reduce contention // Multiple epoll file descriptors (1:1 with I/O threads) to reduce contention
std::vector<int> epoll_fds_; std::vector<int> epoll_fds_;
std::vector<int> std::vector<int>
listen_fds_; // FDs to accept connections on (Server owns these) listen_fds_; // FDs to accept connections on (Server owns these)
@@ -154,30 +142,16 @@ private:
void create_epoll_instances(); void create_epoll_instances();
void start_io_threads(std::vector<std::thread> &threads); void start_io_threads(std::vector<std::thread> &threads);
// Helper to get epoll fd for a thread using round-robin
int get_epoll_for_thread(int thread_id) const;
// Helper for processing connection I/O // Helper for processing connection I/O
void process_connection_reads(std::unique_ptr<Connection> &conn_ptr, void process_connection_reads(Ref<Connection> &conn, int events);
int events); void process_connection_writes(Ref<Connection> &conn, int events);
void process_connection_writes(std::unique_ptr<Connection> &conn_ptr,
int events); void close_connection(Ref<Connection> &conn);
// Helper for processing a batch of connections with their events // Helper for processing a batch of connections with their events
void process_connection_batch(int epollfd, void process_connection_batch(std::span<Ref<Connection>> batch,
std::span<std::unique_ptr<Connection>> batch,
std::span<const int> events); std::span<const int> events);
/**
* Called internally to return ownership to the server.
*
* This method is thread-safe and can be called from any thread.
* The connection will be re-added to the epoll for continued processing.
*
* @param connection Unique pointer to the connection being released back
*/
void receiveConnectionBack(std::unique_ptr<Connection> connection);
// Make non-copyable and non-movable // Make non-copyable and non-movable
Server(const Server &) = delete; Server(const Server &) = delete;
Server &operator=(const Server &) = delete; Server &operator=(const Server &) = delete;

View File

@@ -1,6 +1,5 @@
#pragma once #pragma once
#include <array>
#include <atomic> #include <atomic>
#include <cassert> #include <cassert>
#include <cstddef> #include <cstddef>
@@ -48,151 +47,174 @@ struct ThreadState {
bool last_stage; bool last_stage;
}; };
// Compile-time topology configuration for static pipelines // Runtime topology configuration for dynamic pipelines
// //
// This template defines a pipeline topology at compile-time: // This class defines a pipeline topology at runtime:
// - Stage and thread calculations done at compile-time // - Stage and thread calculations done at runtime
// - Type-safe indexing: Stage and thread indices validated at compile-time // - Flexible configuration: topology can be set via constructor
// - Fixed-size arrays with known bounds // - Dynamic arrays with runtime bounds checking
// - Code specialization for each topology // - Single implementation works for any topology
// //
// Example: StaticPipelineTopology<1, 4, 2> creates: // Example: PipelineTopology({1, 4, 2}) creates:
// - Stage 0: 1 thread (index 0) // - Stage 0: 1 thread (index 0)
// - Stage 1: 4 threads (indices 1-4) // - Stage 1: 4 threads (indices 1-4)
// - Stage 2: 2 threads (indices 5-6) // - Stage 2: 2 threads (indices 5-6)
// - Total: 7 threads across 3 stages // - Total: 7 threads across 3 stages
template <int... ThreadsPerStage> struct StaticPipelineTopology { struct PipelineTopology {
static_assert(sizeof...(ThreadsPerStage) > 0, const std::vector<int> threads_per_stage;
"Must specify at least one stage"); const int num_stages;
static_assert(((ThreadsPerStage > 0) && ...), const std::vector<int> stage_offsets;
"All stages must have at least one thread"); const int total_threads;
static constexpr int num_stages = sizeof...(ThreadsPerStage); explicit PipelineTopology(std::vector<int> threads_per_stage_)
static constexpr std::array<int, num_stages> threads_per_stage = { : threads_per_stage(validate_and_move(std::move(threads_per_stage_))),
ThreadsPerStage...}; num_stages(static_cast<int>(threads_per_stage.size())),
static constexpr int total_threads = (ThreadsPerStage + ...); stage_offsets(build_stage_offsets(threads_per_stage)),
total_threads(build_total_threads(threads_per_stage)) {}
// Compile-time stage offset calculation // Runtime stage offset calculation
template <int Stage> static constexpr int stage_offset() { int stage_offset(int stage) const {
static_assert(Stage >= 0 && Stage < num_stages, if (stage < 0 || stage >= num_stages) {
"Stage index out of bounds"); std::abort(); // Stage index out of bounds
if constexpr (Stage == 0) {
return 0;
} else {
return stage_offset<Stage - 1>() + threads_per_stage[Stage - 1];
} }
return stage_offsets[stage];
} }
// Compile-time thread index calculation // Runtime thread index calculation
template <int Stage, int Thread> static constexpr int thread_index() { int thread_index(int stage, int thread) const {
static_assert(Stage >= 0 && Stage < num_stages, if (stage < 0 || stage >= num_stages) {
"Stage index out of bounds"); std::abort(); // Stage index out of bounds
static_assert(Thread >= 0 && Thread < threads_per_stage[Stage], }
"Thread index out of bounds"); if (thread < 0 || thread >= threads_per_stage[stage]) {
return stage_offset<Stage>() + Thread; std::abort(); // Thread index out of bounds
}
return stage_offsets[stage] + thread;
} }
// Compile-time previous stage thread count // Runtime previous stage thread count
template <int Stage> static constexpr int prev_stage_thread_count() { int prev_stage_thread_count(int stage) const {
static_assert(Stage >= 0 && Stage < num_stages, if (stage < 0 || stage >= num_stages) {
"Stage index out of bounds"); std::abort(); // Stage index out of bounds
if constexpr (Stage == 0) { }
if (stage == 0) {
return 1; return 1;
} else { } else {
return threads_per_stage[Stage - 1]; return threads_per_stage[stage - 1];
} }
} }
private:
static std::vector<int> validate_and_move(std::vector<int> threads) {
if (threads.empty()) {
std::abort(); // Must specify at least one stage
}
for (int count : threads) {
if (count <= 0) {
std::abort(); // All stages must have at least one thread
}
}
return threads;
}
static std::vector<int>
build_stage_offsets(const std::vector<int> &threads_per_stage) {
std::vector<int> offsets(threads_per_stage.size());
int offset = 0;
for (size_t i = 0; i < threads_per_stage.size(); ++i) {
offsets[i] = offset;
offset += threads_per_stage[i];
}
return offsets;
}
static int build_total_threads(const std::vector<int> &threads_per_stage) {
int total = 0;
for (int count : threads_per_stage) {
total += count;
}
return total;
}
}; };
// Static pipeline algorithms - compile-time specialized versions // Pipeline algorithms - runtime configurable versions
namespace StaticPipelineAlgorithms { namespace PipelineAlgorithms {
template <WaitStrategy wait_strategy, typename Topology, int Stage, inline uint32_t calculate_safe_len(WaitStrategy wait_strategy,
int ThreadInStage> const PipelineTopology &topology, int stage,
uint32_t calculate_safe_len( int thread_in_stage,
std::array<ThreadState, Topology::total_threads> &all_threads, std::vector<ThreadState> &all_threads,
std::atomic<uint32_t> &pushes, bool may_block) { std::atomic<uint32_t> &pushes,
constexpr int thread_idx = bool may_block) {
Topology::template thread_index<Stage, ThreadInStage>(); int thread_idx = topology.thread_index(stage, thread_in_stage);
auto &thread = all_threads[thread_idx]; auto &thread = all_threads[thread_idx];
uint32_t safe_len = UINT32_MAX; uint32_t safe_len = UINT32_MAX;
constexpr int prev_stage_threads = int prev_stage_threads = topology.prev_stage_thread_count(stage);
Topology::template prev_stage_thread_count<Stage>();
// Compile-time loop over previous stage threads // Runtime loop over previous stage threads
[&]<std::size_t... Is>(std::index_sequence<Is...>) { for (int i = 0; i < prev_stage_threads; ++i) {
( std::atomic<uint32_t> &last_push = [&]() -> std::atomic<uint32_t> & {
[&] { if (stage == 0) {
auto &last_push = [&]() -> std::atomic<uint32_t> & { return pushes;
if constexpr (Stage == 0) { } else {
return pushes; int prev_thread_idx = topology.thread_index(stage - 1, i);
} else { return all_threads[prev_thread_idx].pops;
constexpr int prev_thread_idx = }
Topology::template thread_index<Stage - 1, Is>(); }();
return all_threads[prev_thread_idx].pops;
if (thread.last_push_read[i] == thread.local_pops) {
thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
if (thread.last_push_read[i] == thread.local_pops) {
if (!may_block) {
safe_len = 0;
return safe_len;
}
if (wait_strategy == WaitStrategy::Never) {
// Empty - busy wait
} else if (wait_strategy == WaitStrategy::WaitIfUpstreamIdle) {
// We're allowed to spin as long as we eventually go to 0% cpu
// usage on idle
uint32_t push;
bool should_wait = true;
for (int j = 0; j < 100000; ++j) {
push = pushes.load(std::memory_order_relaxed);
if (push != thread.local_pops) {
should_wait = false;
break;
} }
}();
if (thread.last_push_read[Is] == thread.local_pops) {
thread.last_push_read[Is] =
last_push.load(std::memory_order_acquire);
if (thread.last_push_read[Is] == thread.local_pops) {
if (!may_block) {
safe_len = 0;
return;
}
if constexpr (wait_strategy == WaitStrategy::Never) {
// Empty - busy wait
} else if constexpr (wait_strategy ==
WaitStrategy::WaitIfUpstreamIdle) {
// We're allowed to spin as long as we eventually go to 0% cpu
// usage on idle
uint32_t push;
for (int i = 0; i < 100000; ++i) {
push = pushes.load(std::memory_order_relaxed);
if (push != thread.local_pops) {
goto dont_wait;
}
#if defined(__x86_64__) || defined(_M_X64) #if defined(__x86_64__) || defined(_M_X64)
_mm_pause(); _mm_pause();
#endif #endif
}
pushes.wait(push, std::memory_order_relaxed);
dont_wait:;
} else {
static_assert(wait_strategy == WaitStrategy::WaitIfStageEmpty);
last_push.wait(thread.last_push_read[Is],
std::memory_order_relaxed);
}
thread.last_push_read[Is] =
last_push.load(std::memory_order_acquire);
}
} }
safe_len = if (should_wait) {
std::min(safe_len, thread.last_push_read[Is] - thread.local_pops); pushes.wait(push, std::memory_order_relaxed);
}(), }
...); } else { // WaitStrategy::WaitIfStageEmpty
}(std::make_index_sequence<prev_stage_threads>{}); last_push.wait(thread.last_push_read[i], std::memory_order_relaxed);
}
thread.last_push_read[i] = last_push.load(std::memory_order_acquire);
}
}
safe_len = std::min(safe_len, thread.last_push_read[i] - thread.local_pops);
}
return safe_len; return safe_len;
} }
template <WaitStrategy wait_strategy, typename Topology, int Stage, inline void update_thread_pops(WaitStrategy wait_strategy,
int ThreadInStage> const PipelineTopology &topology, int stage,
void update_thread_pops( int thread_in_stage,
std::array<ThreadState, Topology::total_threads> &all_threads, std::vector<ThreadState> &all_threads,
uint32_t local_pops) { uint32_t local_pops) {
constexpr int thread_idx = int thread_idx = topology.thread_index(stage, thread_in_stage);
Topology::template thread_index<Stage, ThreadInStage>();
auto &thread_state = all_threads[thread_idx]; auto &thread_state = all_threads[thread_idx];
if constexpr (wait_strategy == WaitStrategy::WaitIfStageEmpty) { if (wait_strategy == WaitStrategy::WaitIfStageEmpty) {
thread_state.pops.store(local_pops, std::memory_order_seq_cst); thread_state.pops.store(local_pops, std::memory_order_seq_cst);
thread_state.pops.notify_all(); thread_state.pops.notify_all();
} else if constexpr (Stage == Topology::num_stages - 1) { // last stage } else if (stage == topology.num_stages - 1) { // last stage
thread_state.pops.store(local_pops, std::memory_order_seq_cst); thread_state.pops.store(local_pops, std::memory_order_seq_cst);
thread_state.pops.notify_all(); thread_state.pops.notify_all();
} else { } else {
@@ -200,15 +222,13 @@ void update_thread_pops(
} }
} }
template <typename Topology> inline int check_producer_capacity(const PipelineTopology &topology,
int check_producer_capacity( std::vector<ThreadState> &all_threads,
std::array<ThreadState, Topology::total_threads> &all_threads, uint32_t slot, uint32_t size,
uint32_t slot, uint32_t size, uint32_t slot_count, bool block) { uint32_t slot_count, bool block) {
constexpr int last_stage = Topology::num_stages - 1; int last_stage = topology.num_stages - 1;
constexpr int last_stage_offset = int last_stage_offset = topology.stage_offset(last_stage);
Topology::template stage_offset<last_stage>(); int last_stage_thread_count = topology.threads_per_stage[last_stage];
constexpr int last_stage_thread_count =
Topology::threads_per_stage[last_stage];
for (int i = 0; i < last_stage_thread_count; ++i) { for (int i = 0; i < last_stage_thread_count; ++i) {
auto &thread = all_threads[last_stage_offset + i]; auto &thread = all_threads[last_stage_offset + i];
@@ -223,10 +243,10 @@ int check_producer_capacity(
} }
return 0; // Can proceed return 0; // Can proceed
} }
} // namespace StaticPipelineAlgorithms } // namespace PipelineAlgorithms
// Static multi-stage lock-free pipeline for inter-thread communication // Multi-stage lock-free pipeline for inter-thread communication
// with compile-time topology specification. // with runtime-configurable topology and wait strategy.
// //
// Overview: // Overview:
// - Items flow from producers through multiple processing stages (stage 0 -> // - Items flow from producers through multiple processing stages (stage 0 ->
@@ -234,25 +254,17 @@ int check_producer_capacity(
// - Each stage can have multiple worker threads processing items in parallel // - Each stage can have multiple worker threads processing items in parallel
// - Uses a shared ring buffer with atomic counters for lock-free coordination // - Uses a shared ring buffer with atomic counters for lock-free coordination
// - Supports batch processing for efficiency // - Supports batch processing for efficiency
// - Compile-time topology specification via template parameters // - Runtime-configurable topology and wait strategy via constructor parameters
// //
// Architecture: // Architecture:
// - Producers: External threads that add items to the pipeline via push() // - Producers: External threads that add items to the pipeline via push()
// - Stages: Processing stages numbered 0, 1, 2, ... that consume items via // - Stages: Processing stages numbered 0, 1, 2, ... that consume items via
// acquire<Stage, Thread>() // acquire(stage, thread)
// - Items flow: Producers -> Stage 0 -> Stage 1 -> ... -> Final Stage // - Items flow: Producers -> Stage 0 -> Stage 1 -> ... -> Final Stage
// //
// Differences from Dynamic Version:
// - Template parameters specify topology at compile-time (e.g., <Item,
// WaitStrategy::Never, 1, 4, 2>)
// - Stage and thread indices are template parameters, validated at compile-time
// - Fixed-size arrays replace dynamic vectors
// - Specialized algorithms for each stage/thread combination
// - Type-safe guards prevent runtime indexing errors
//
// Usage Pattern: // Usage Pattern:
// using Pipeline = StaticThreadPipeline<Item, WaitStrategy::WaitIfStageEmpty, // ThreadPipeline<Item> pipeline(WaitStrategy::WaitIfStageEmpty, {1, 4, 2},
// 1, 4, 2>; Pipeline pipeline(lgSlotCount); // lgSlotCount);
// //
// // Producer threads (add items for stage 0 to consume): // // Producer threads (add items for stage 0 to consume):
// auto guard = pipeline.push(batchSize, /*block=*/true); // auto guard = pipeline.push(batchSize, /*block=*/true);
@@ -262,12 +274,54 @@ int check_producer_capacity(
// // Guard destructor publishes batch to stage 0 consumers // // Guard destructor publishes batch to stage 0 consumers
// //
// // Stage worker threads (process items and pass to next stage): // // Stage worker threads (process items and pass to next stage):
// auto guard = pipeline.acquire<Stage, Thread>(maxBatch, /*may_block=*/true); // auto guard = pipeline.acquire(stage, thread, maxBatch, /*may_block=*/true);
// for (auto& item : guard.batch) { // for (auto& item : guard.batch) {
// // Process item // // Process item
// } // }
// // Guard destructor marks items as consumed and available to next stage // // Guard destructor marks items as consumed and available to next stage
// //
// Multi-Thread Stage Processing:
// When a stage has multiple threads (e.g., {1, 1, 1, 2} = 2 threads in stage
// 3):
//
// OVERLAPPING BATCHES - EACH THREAD SEES EVERY ENTRY:
// - Multiple threads in the same stage get OVERLAPPING batches from the ring
// buffer
// - Thread 0: calls acquire(3, 0) - gets batch from ring positions 100-110
// - Thread 1: calls acquire(3, 1) - gets batch from ring positions 100-110
// (SAME)
// - Both threads see the same entries and must coordinate processing
//
// PARTITIONING STRATEGIES:
// Choose your partitioning approach based on your use case:
//
// 1. Ring buffer position-based partitioning:
// for (auto it = batch.begin(); it != batch.end(); ++it) {
// if (it.index() % 2 != thread_index) continue; // Skip entries for other
// threads process(*it); // Process only entries assigned to this thread
// }
//
// 2. Entry content-based partitioning:
// for (auto& item : guard.batch) {
// if (hash(item.connection_id) % 2 != thread_index) continue;
// process(item); // Process based on entry properties
// }
//
// 3. Process all entries (when each thread does different work):
// for (auto& item : guard.batch) {
// process(item); // Both threads process all items, but differently
// }
//
// Common Partitioning Patterns:
// - Position-based: it.index() % num_threads == thread_index
// - Hash-based: hash(item.key) % num_threads == thread_index
// - Type-based: item.type == MY_THREAD_TYPE
// - Load balancing: assign work based on thread load
// - All entries: each thread processes all items but performs different
// operations
//
// Note: it.index() returns the position in the ring buffer (0 to buffer_size-1)
//
// Memory Model: // Memory Model:
// - Ring buffer size must be power of 2 for efficient masking // - Ring buffer size must be power of 2 for efficient masking
// - Actual ring slots accessed via: index & (slotCount - 1) // - Actual ring slots accessed via: index & (slotCount - 1)
@@ -278,27 +332,27 @@ int check_producer_capacity(
// ordering
// - Uses C++20 atomic wait/notify for efficient blocking when no work available
// - RAII guards ensure proper cleanup even with exceptions
template <class T, WaitStrategy wait_strategy, int... ThreadsPerStage> template <class T> struct ThreadPipeline {
struct StaticThreadPipeline {
using Topology = StaticPipelineTopology<ThreadsPerStage...>;
// Constructor
// wait_strategy: blocking behavior when no work is available
// threads_per_stage: number of threads in each stage (e.g., {1, 4, 2})
// lgSlotCount: log2 of ring buffer size (e.g., 10 -> 1024 slots) // lgSlotCount: log2 of ring buffer size (e.g., 10 -> 1024 slots)
// Template parameters specify pipeline topology (e.g., <Item, Never, 1, 4, // Note: Producer threads are external to the pipeline and not counted in
// 2>) Note: Producer threads are external to the pipeline and not counted in // threads_per_stage
// ThreadsPerStage explicit ThreadPipeline(WaitStrategy wait_strategy,
explicit StaticThreadPipeline(int lgSlotCount) std::vector<int> threads_per_stage, int lgSlotCount)
: slot_count(1 << lgSlotCount), slot_count_mask(slot_count - 1), : wait_strategy_(wait_strategy), topology_(std::move(threads_per_stage)),
ring(slot_count) { slot_count(1 << lgSlotCount), slot_count_mask(slot_count - 1),
ring(slot_count), all_threads(topology_.total_threads) {
// Otherwise we can't tell the difference between full and empty.
assert(!(slot_count_mask & 0x80000000));
initialize_all_threads();
}
StaticThreadPipeline(StaticThreadPipeline const &) = delete; ThreadPipeline(ThreadPipeline const &) = delete;
StaticThreadPipeline &operator=(StaticThreadPipeline const &) = delete; ThreadPipeline &operator=(ThreadPipeline const &) = delete;
StaticThreadPipeline(StaticThreadPipeline &&) = delete; ThreadPipeline(ThreadPipeline &&) = delete;
StaticThreadPipeline &operator=(StaticThreadPipeline &&) = delete; ThreadPipeline &operator=(ThreadPipeline &&) = delete;
struct Batch {
Batch() : ring(), begin_(), end_() {}
@@ -401,7 +455,7 @@ struct StaticThreadPipeline {
}
private:
friend struct StaticThreadPipeline; friend struct ThreadPipeline;
Batch(std::vector<T> *const ring, uint32_t begin_, uint32_t end_)
: ring(ring), begin_(begin_), end_(end_) {}
std::vector<T> *const ring;
@@ -409,29 +463,29 @@ struct StaticThreadPipeline {
uint32_t end_;
};
// Static thread storage - fixed size array
std::array<ThreadState, Topology::total_threads> all_threads;
private:
WaitStrategy wait_strategy_;
PipelineTopology topology_;
alignas(128) std::atomic<uint32_t> slots{0};
alignas(128) std::atomic<uint32_t> pushes{0};
const uint32_t slot_count;
const uint32_t slot_count_mask;
std::vector<T> ring;
std::vector<ThreadState> all_threads;
void initialize_all_threads() { void initialize_all_threads() {
[&]<std::size_t... StageIndices>(std::index_sequence<StageIndices...>) { for (int stage = 0; stage < topology_.num_stages; ++stage) {
(init_stage_threads<StageIndices>(), ...); init_stage_threads(stage);
}(std::make_index_sequence<Topology::num_stages>{}); }
} }
template <int Stage> void init_stage_threads() { void init_stage_threads(int stage) {
constexpr int stage_offset = Topology::template stage_offset<Stage>(); int stage_offset = topology_.stage_offset(stage);
constexpr int stage_thread_count = Topology::threads_per_stage[Stage]; int stage_thread_count = topology_.threads_per_stage[stage];
constexpr int prev_stage_threads = int prev_stage_threads = topology_.prev_stage_thread_count(stage);
Topology::template prev_stage_thread_count<Stage>(); bool is_last_stage = (stage == topology_.num_stages - 1);
constexpr bool is_last_stage = (Stage == Topology::num_stages - 1);
for (int thread = 0; thread < stage_thread_count; ++thread) {
auto &thread_state = all_threads[stage_offset + thread];
@@ -440,14 +494,15 @@ private:
}
}
template <int Stage, int Thread> Batch acquire_helper(int stage, int thread, uint32_t maxBatch,
Batch acquire_helper(uint32_t maxBatch, bool mayBlock) { bool may_block) {
constexpr int thread_idx = Topology::template thread_index<Stage, Thread>(); int thread_idx = topology_.thread_index(stage, thread);
auto &thread_state = all_threads[thread_idx];
uint32_t begin = thread_state.local_pops & slot_count_mask;
uint32_t len = StaticPipelineAlgorithms::calculate_safe_len< uint32_t len = PipelineAlgorithms::calculate_safe_len(
wait_strategy, Topology, Stage, Thread>(all_threads, pushes, mayBlock); wait_strategy_, topology_, stage, thread, all_threads, pushes,
may_block);
if (maxBatch != 0) {
len = std::min(len, maxBatch);
@@ -462,13 +517,13 @@ private:
}
public:
template <int Stage, int Thread> struct StageGuard { struct StageGuard {
Batch batch;
~StageGuard() {
if (!batch.empty()) {
StaticPipelineAlgorithms::update_thread_pops<wait_strategy, Topology, PipelineAlgorithms::update_thread_pops(
Stage, Thread>( pipeline->wait_strategy_, pipeline->topology_, stage, thread,
pipeline->all_threads, local_pops); pipeline->all_threads, local_pops);
}
}
@@ -476,22 +531,28 @@ public:
StageGuard(StageGuard const &) = delete;
StageGuard &operator=(StageGuard const &) = delete;
StageGuard(StageGuard &&other) noexcept StageGuard(StageGuard &&other) noexcept
: batch(other.batch), local_pops(other.local_pops), : batch(other.batch), local_pops(other.local_pops), stage(other.stage),
thread(other.thread),
pipeline(std::exchange(other.pipeline, nullptr)) {} pipeline(std::exchange(other.pipeline, nullptr)) {}
StageGuard &operator=(StageGuard &&other) noexcept {
batch = other.batch;
local_pops = other.local_pops;
stage = other.stage;
thread = other.thread;
pipeline = std::exchange(other.pipeline, nullptr);
return *this;
}
private:
friend struct StaticThreadPipeline; friend struct ThreadPipeline;
uint32_t local_pops;
StaticThreadPipeline *pipeline; int stage;
int thread;
ThreadPipeline *pipeline;
StageGuard(Batch batch, uint32_t local_pops, StaticThreadPipeline *pipeline) StageGuard(Batch batch, uint32_t local_pops, int stage, int thread,
: batch(batch), local_pops(local_pops), ThreadPipeline *pipeline)
: batch(batch), local_pops(local_pops), stage(stage), thread(thread),
pipeline(batch.empty() ? nullptr : pipeline) {} pipeline(batch.empty() ? nullptr : pipeline) {}
};
@@ -514,37 +575,30 @@ public:
}
private:
friend struct StaticThreadPipeline; friend struct ThreadPipeline;
ProducerGuard() : batch(), tp() {}
ProducerGuard(Batch batch, StaticThreadPipeline *tp, uint32_t old_slot, ProducerGuard(Batch batch, ThreadPipeline *tp, uint32_t old_slot,
uint32_t new_slot)
: batch(batch), tp(tp), old_slot(old_slot), new_slot(new_slot) {}
StaticThreadPipeline *const tp; ThreadPipeline *const tp;
uint32_t old_slot;
uint32_t new_slot;
};
// Acquire a batch of items for processing by a consumer thread.
// Stage: which processing stage (0 = first consumer stage after producers) - // stage: which processing stage (0 = first consumer stage after producers)
// compile-time parameter Thread: thread ID within the stage (0 to // thread: thread ID within the stage (0 to threads_per_stage[stage]-1)
// ThreadsPerStage[Stage]-1) - compile-time parameter maxBatch: maximum items // maxBatch: maximum items to acquire (0 = no limit)
// to acquire (0 = no limit) may_block: whether to block waiting for items // may_block: whether to block waiting for items (false = return empty batch
// (false = return empty batch if none available) Returns: StageGuard<Stage, // if none available) Returns: StageGuard with batch of items to process
// Thread> with batch of items to process and compile-time type safety [[nodiscard]] StageGuard acquire(int stage, int thread, int maxBatch = 0,
template <int Stage, int Thread> bool may_block = true) {
[[nodiscard]] StageGuard<Stage, Thread> acquire(int maxBatch = 0, auto batch = acquire_helper(stage, thread, maxBatch, may_block);
bool may_block = true) {
static_assert(Stage >= 0 && Stage < Topology::num_stages,
"Stage index out of bounds");
static_assert(Thread >= 0 && Thread < Topology::threads_per_stage[Stage],
"Thread index out of bounds");
auto batch = acquire_helper<Stage, Thread>(maxBatch, may_block); int thread_idx = topology_.thread_index(stage, thread);
constexpr int thread_idx = Topology::template thread_index<Stage, Thread>();
uint32_t local_pops = all_threads[thread_idx].local_pops;
return StageGuard<Stage, Thread>{std::move(batch), local_pops, this}; return StageGuard{std::move(batch), local_pops, stage, thread, this};
}
// Reserve slots in the ring buffer for a producer thread to fill with items.
@@ -577,9 +631,8 @@ public:
slot = slots.load(std::memory_order_relaxed);
begin = slot & slot_count_mask;
int capacity_result = int capacity_result = PipelineAlgorithms::check_producer_capacity(
StaticPipelineAlgorithms::check_producer_capacity<Topology>( topology_, all_threads, slot, size, slot_count, block);
all_threads, slot, size, slot_count, block);
if (capacity_result == 1) {
continue;
}

197
style.md
View File

@@ -5,28 +5,30 @@ This document describes the C++ coding style used in the WeaselDB project. These
## Table of Contents
1. [General Principles](#general-principles)
2. [Naming Conventions](#naming-conventions) 1. [Naming Conventions](#naming-conventions)
3. [File Organization](#file-organization) 1. [File Organization](#file-organization)
4. [Code Structure](#code-structure) 1. [Code Structure](#code-structure)
5. [Memory Management](#memory-management) 1. [Memory Management](#memory-management)
6. [Error Handling](#error-handling) 1. [Error Handling](#error-handling)
7. [Documentation](#documentation) 1. [Documentation](#documentation)
8. [Testing](#testing) 1. [Testing](#testing)
--- ______________________________________________________________________
## General Principles
### Language Standard
- **C++20** is the target standard
- Use modern C++ features: RAII, move semantics, constexpr, concepts where appropriate
- Prefer standard library containers and algorithms over custom implementations
### C Library Functions and Headers
- **Always use std:: prefixed versions** of C library functions for consistency and clarity
- **Use C++ style headers** (`<cstring>`, `<cstdlib>`, etc.) instead of C style headers (`<string.h>`, `<stdlib.h>`, etc.)
- This applies to all standard libc functions: `std::abort()`, `std::fprintf()`, `std::free()`, `std::memcpy()`, `std::strlen()`, `std::strncpy()`, `std::memset()`, `std::signal()`, etc.
- **Exception:** Functions with no std:: equivalent (e.g., `perror()`, `gai_strerror()`) and system-specific headers (e.g., `<unistd.h>`, `<fcntl.h>`)
```cpp
// Preferred - C++ style
#include <cstring>
@@ -56,23 +58,25 @@ signal(SIGTERM, handler);
``` ```
### Data Types
- **Almost always signed** - prefer `int`, `int64_t`, `ssize_t` over unsigned types except for:
- Bit manipulation operations
- Interfacing with APIs that require unsigned types
- Where defined unsigned overflow behavior (wraparound) is intentional and desired
- **Almost always auto** - let the compiler deduce types except when:
- The type is not obvious from context (prefer explicit for clarity) - The type is not obvious from context and the exact type is important (prefer explicit for clarity)
- Specific type requirements matter (numeric conversions, template parameters) - Specific type requirements matter (numeric conversions, template parameters)
- Interface contracts need explicit types (public APIs, function signatures) - Interface contracts need explicit types (public APIs, function signatures)
- **Prefer uninitialized memory to default initialization** when using before initializing would be an error - **Prefer uninitialized memory to default initialization** when using before initializing would be an error
- Valgrind will catch uninitialized memory usage bugs - Valgrind will catch uninitialized memory usage bugs
- Avoid hiding logic errors with unnecessary zero-initialization - Avoid hiding logic errors that Valgrind would have caught with unnecessary zero-initialization
- Default initialization can mask bugs and hurt performance - Default initialization can mask bugs and hurt performance
- **Floating point is for metrics only** - avoid `float`/`double` in core data structures and algorithms - **Floating point is for metrics only** - avoid `float`/`double` in core data structures and algorithms
- Use for performance measurements, statistics, and monitoring data - Use for performance measurements, statistics, and monitoring data
- Never use for counts, sizes, or business logic - Avoid branching on the values of floats
### Type Casting ### Type Casting
- **Never use C-style casts** - they're unsafe and can hide bugs by performing dangerous conversions - **Never use C-style casts** - they're unsafe and can hide bugs by performing dangerous conversions
- **Use C++ cast operators** for explicit type conversions with clear intent and safety checks - **Use C++ cast operators** for explicit type conversions with clear intent and safety checks
- **Avoid `reinterpret_cast`** - almost always indicates poor design; redesign APIs instead - **Avoid `reinterpret_cast`** - almost always indicates poor design; redesign APIs instead
@@ -94,18 +98,21 @@ auto addr = reinterpret_cast<uintptr_t>(ptr); // Pointer to integer conv
``` ```
### Performance Focus ### Performance Focus
- **Performance-first design** - optimize for the hot path - **Performance-first design** - optimize for the hot path
- **Simple is fast** - find exactly what's necessary, strip away everything else - **Simple is fast** - find exactly what's necessary, strip away everything else
- **Complexity must be justified with benchmarks** - measure performance impact before adding complexity - **Complexity must be justified with benchmarks** - measure performance impact before adding complexity
- **Strive for 0% CPU usage when idle** - avoid polling, busy waiting, or unnecessary background activity - **Strive for 0% CPU usage when idle** - avoid polling, busy waiting, or unnecessary background activity
- Use **inline functions** for performance-critical code (e.g., `allocate_raw`) - Use **inline functions** for performance-critical code (e.g., `allocate_raw`)
- **String views** with `std::string_view` to minimize unnecessary copying - **String views** with `std::string_view` to minimize unnecessary copying
- **Arena allocation** for efficient memory management (~1ns vs ~20-270ns for malloc) - **Arena allocation** for efficient memory management, and to group related lifetimes together for simplicity
### String Formatting ### String Formatting
- **Always use `format.hpp` functions** - formats directly into arena-allocated memory - **Always use `format.hpp` functions** - formats directly into arena-allocated memory
- **Use `static_format()` for performance-sensitive code** - faster but less flexible than `format()` - **Use `static_format()` for performance-sensitive code** - faster but less flexible than `format()`
- **Use `format()` function with arena allocator** for printf-style formatting - **Use `format()` function with arena allocator** for printf-style formatting
```cpp ```cpp
// Most performance-sensitive - compile-time optimized concatenation // Most performance-sensitive - compile-time optimized concatenation
std::string_view response = static_format(arena, std::string_view response = static_format(arena,
@@ -123,7 +130,10 @@ std::string_view response = format(arena,
static_cast<int>(body.size()), body.data()); static_cast<int>(body.size()), body.data());
``` ```
- Offer APIs that let you avoid concatenating strings if possible - e.g. if the bytes are going to get written to a file descriptor you can skip concatenating and use scatter/gather writev-type calls.
### Complexity Control ### Complexity Control
- **Encapsulation is the main tool for controlling complexity** - **Encapsulation is the main tool for controlling complexity**
- **Header files define the interface** - they are the contract with users of your code - **Header files define the interface** - they are the contract with users of your code
- **Headers should be complete** - include everything needed to use the interface effectively: - **Headers should be complete** - include everything needed to use the interface effectively:
@@ -132,15 +142,17 @@ std::string_view response = format(arena,
- Thread safety guarantees - Thread safety guarantees
- Performance characteristics - Performance characteristics
- Ownership and lifetime semantics - Ownership and lifetime semantics
- **Do not rely on undocumented interface properties** - if it's not in the header, don't depend on it - **Do not rely on undocumented properties of an interface** - if it's not in the header, don't depend on it
--- ______________________________________________________________________
## Naming Conventions ## Naming Conventions
### Variables and Functions ### Variables and Functions
- **snake_case** for all variables, functions, and member functions - **snake_case** for all variables, functions, and member functions
- **Legacy camelCase exists** - the codebase currently contains mixed naming due to historical development. New code should use snake_case. Existing camelCase should be converted to snake_case during natural refactoring (not mass renaming). - **Legacy camelCase exists** - the codebase currently contains mixed naming due to historical development. New code should use snake_case. Existing camelCase should be converted to snake_case during natural refactoring (not mass renaming).
```cpp ```cpp
int64_t used_bytes() const; int64_t used_bytes() const;
void add_block(int64_t size); void add_block(int64_t size);
@@ -148,27 +160,31 @@ int32_t initial_block_size_;
``` ```
### Classes and Structs ### Classes and Structs
- **PascalCase** for class/struct names - **PascalCase** for class/struct names
- **Always use struct keyword** - eliminates debates about complexity and maintains consistency - **Always use struct keyword** - eliminates debates about complexity and maintains consistency
- **Public members first, private after** - puts the interface users care about at the top, implementation details below - **Public members first, private after** - puts the interface users care about at the top, implementation details below
- **Full encapsulation still applies** - use `private:` sections to hide implementation details and maintain deep, capable structs - **Full encapsulation still applies** - use `private:` sections to hide implementation details and maintain deep, capable structs
- The struct keyword doesn't mean shallow design - it means interface-first organization for human readers - The struct keyword doesn't mean shallow design - it means interface-first organization for human readers
- Omit the `public` keyword when inheriting from a struct. It's public by default. E.g. `struct A : B {};` instead of `struct A : public B {};`
```cpp ```cpp
struct Arena { struct MyClass {
// Public interface first // Public interface first
explicit Arena(int64_t initial_size = 1024); void do_thing();
void* allocate_raw(int64_t size);
private: private:
// Private members after // Private members after
int32_t initial_block_size_; int thing_count_;
Block* current_block_;
}; };
``` ```
### Enums ### Enums
- **PascalCase** for enum class names - **PascalCase** for enum class names
- **PascalCase** for enum values (not SCREAMING_SNAKE_CASE) - **PascalCase** for enum values (not SCREAMING_SNAKE_CASE)
- C-style enums are acceptable where implicit int conversion is desirable, like for bitflags
```cpp ```cpp
enum class Type { enum class Type {
PointRead, PointRead,
@@ -183,14 +199,18 @@ enum class ParseState {
``` ```
### Constants and Macros ### Constants and Macros
- **snake_case** for constants - **snake_case** for constants
- Avoid macros when possible; prefer `constexpr` variables - Avoid macros when possible; prefer `constexpr` variables
```cpp ```cpp
static const WeaselJsonCallbacks json_callbacks; static const WeaselJsonCallbacks json_callbacks;
``` ```
### Member Variables ### Member Variables
- **Trailing underscore** for private member variables - **Trailing underscore** for private member variables
```cpp ```cpp
private: private:
int32_t initial_block_size_; int32_t initial_block_size_;
@@ -198,24 +218,28 @@ private:
``` ```
### Template Parameters ### Template Parameters
- **PascalCase** for template type parameters - **PascalCase** for template type parameters
```cpp ```cpp
template <typename T, typename... Args> template <typename T, typename... Args>
template <typename T> struct rebind { using type = T*; }; template <typename T> struct rebind { using type = T*; };
``` ```
--- ______________________________________________________________________
## File Organization ## File Organization
### Include Organization ### Include Organization
- Use **`#pragma once`** instead of include guards - Use **`#pragma once`** instead of include guards
- **Never `using namespace std`** - always use fully qualified names for clarity and safety - **Never `using namespace std`** - always use fully qualified names for clarity and safety
- **Include order** (applies to both headers and source files): - **Include order** (applies to both headers and source files):
1. Corresponding header file (for .cpp files only) 1. Corresponding header file (for .cpp files only)
2. Standard library headers (alphabetical) 1. Standard library headers (alphabetical)
3. Third-party library headers 1. Third-party library headers
4. Project headers 1. Project headers
```cpp ```cpp
#pragma once #pragma once
@@ -239,14 +263,16 @@ std::vector<int> data;
std::unique_ptr<Parser> parser; std::unique_ptr<Parser> parser;
``` ```
--- ______________________________________________________________________
## Code Structure ## Code Structure
### Class Design ### Class Design
- **Move-only semantics** for resource-owning types - **Move-only semantics** for resource-owning types
- **Explicit constructors** to prevent implicit conversions - **Explicit constructors** to prevent implicit conversions
- **Delete copy operations** when inappropriate - **Delete copy operations** when copying is inappropriate or should be discouraged
```cpp ```cpp
struct Arena { struct Arena {
explicit Arena(int64_t initial_size = 1024); explicit Arena(int64_t initial_size = 1024);
@@ -266,12 +292,14 @@ private:
``` ```
### Function Design ### Function Design
- **Const correctness** - mark methods const when appropriate - **Const correctness** - mark methods const when appropriate
- **Parameter passing:** - **Parameter passing:**
- Pass by value for types ≤ 16 bytes (int, pointers, string_view, small structs) - Pass by value for types ≤ 16 bytes (int, pointers, string_view, small structs)
- Pass by const reference for types > 16 bytes (containers, large objects) - Pass by const reference for types > 16 bytes (containers, large objects)
- **Return by value** for small types (≤ 16 bytes), **string_view** to avoid copying strings - **Return by value** for small types (≤ 16 bytes), **string_view** to avoid copying strings
- **noexcept specification** for move operations and non-throwing functions - **noexcept specification** for move operations and non-throwing functions
```cpp ```cpp
std::span<const Operation> operations() const { return operations_; } std::span<const Operation> operations() const { return operations_; }
void process_data(std::string_view request_data); // ≤ 16 bytes, pass by value void process_data(std::string_view request_data); // ≤ 16 bytes, pass by value
@@ -280,27 +308,30 @@ Arena(Arena &&source) noexcept;
``` ```
### Template Usage ### Template Usage
- **Template constraints** using static_assert for better error messages - **Template constraints** using static_assert for better error messages
- **SFINAE** or concepts for template specialization - **SFINAE** or concepts for template specialization
### Factory Patterns & Ownership ### Factory Patterns & Ownership
- **Static factory methods** for complex construction requiring shared ownership
- **Static factory methods** for complex construction requirements like enforcing shared ownership
- **Friend-based factories** for access control when constructor should be private - **Friend-based factories** for access control when constructor should be private
- **Ownership guidelines:** - **Ownership guidelines:**
- **unique_ptr** for exclusive ownership (most common case) - **unique_ptr** for exclusive ownership (most common case)
- **shared_ptr** only when multiple owners need concurrent access to same object - **Ref** only when object logically has multiple owners (`Ref` is our custom std::shared_ptr variant)
- **Factory methods return appropriate smart pointer type** based on ownership needs - **Factory methods return appropriate smart pointer type** based on ownership needs
```cpp ```cpp
// Shared ownership - multiple components need concurrent access // Shared ownership - multiple components need concurrent access
auto server = Server::create(config, handler); // Returns shared_ptr auto server = Server::create(config, handler); // Returns Ref<Server>
// Exclusive ownership - single owner, transfer via move // Exclusive ownership - single owner, transfer via move
auto connection = Connection::createForServer(addr, fd, connection_id, handler, server_ref); auto connection = Connection::createForServer(addr, fd, connection_id, handler, server_ref);
// Friend-based factory for access control // Friend-based factory for access control
struct Connection { struct Connection {
void append_message(std::string_view message_data); WeakRef<MessageSender> get_weak_ref() const;
private: private:
Connection(struct sockaddr_storage client_addr, int file_descriptor, Connection(struct sockaddr_storage client_addr, int file_descriptor,
int64_t connection_id, ConnectionHandler* request_handler, int64_t connection_id, ConnectionHandler* request_handler,
@@ -310,8 +341,10 @@ private:
``` ```
### Control Flow ### Control Flow
- **Early returns** to reduce nesting - **Early returns** to reduce nesting
- **Range-based for loops** when possible - **Range-based for loops** when possible
```cpp ```cpp
if (size == 0) { if (size == 0) {
return nullptr; return nullptr;
@@ -323,9 +356,11 @@ for (auto &precondition : preconditions_) {
``` ```
### Atomic Operations ### Atomic Operations
- **Never use assignment operators** with `std::atomic` - always use explicit `store()` and `load()` - **Never use assignment operators** with `std::atomic` - always use explicit `store()` and `load()`
- **Always specify memory ordering** explicitly for atomic operations - **Always specify memory ordering** explicitly for atomic operations
- **Use the least restrictive correct memory ordering** - choose the weakest ordering that maintains correctness - **Use the least restrictive correct memory ordering** - choose the weakest ordering that maintains correctness
```cpp ```cpp
// Preferred - explicit store/load with precise memory ordering // Preferred - explicit store/load with precise memory ordering
std::atomic<uint64_t> counter; std::atomic<uint64_t> counter;
@@ -343,14 +378,16 @@ counter = 42; // Implicit - memory ordering not explicit
auto value = counter; // Implicit - memory ordering not explicit auto value = counter; // Implicit - memory ordering not explicit
``` ```
--- ______________________________________________________________________
## Memory Management ## Memory Management
### Ownership & Allocation ### Ownership & Allocation
- **Arena allocators** for request-scoped memory with **STL allocator adapters** (see Performance Focus section for characteristics)
- **Arena** for request-scoped memory with **STL allocator adapters**
- **String views** pointing to arena-allocated memory to avoid unnecessary copying - **String views** pointing to arena-allocated memory to avoid unnecessary copying
- **STL containers with arena allocators require default construction after arena reset** - `clear()` is not sufficient - **STL containers with arena allocators require default construction after arena reset** - `clear()` is not sufficient
```cpp ```cpp
// STL containers with arena allocators - correct reset pattern // STL containers with arena allocators - correct reset pattern
std::vector<Operation, ArenaStlAllocator<Operation>> operations(arena); std::vector<Operation, ArenaStlAllocator<Operation>> operations(arena);
@@ -359,10 +396,33 @@ operations = {}; // Default construct - clear() won't work correctly
arena.reset(); // Reset arena memory arena.reset(); // Reset arena memory
``` ```
### Arena String Copying
- **Always use `Arena::copy_string()`** for copying string data into arena memory
- **Avoid manual allocation and memcpy** for string copying
- **Use `Arena::allocate_span<T>()`** for array allocations instead of manual span construction
```cpp
// Preferred - unified arena methods
std::string_view copy = arena.copy_string(original_string);
auto buffer = arena.allocate_span<char>(1024);
auto strings = arena.allocate_span<std::string_view>(count);
// Avoid - manual allocation and copying
char *copied = arena.allocate<char>(str.size());
std::memcpy(copied, str.data(), str.size());
std::string_view copy(copied, str.size());
// Avoid - manual span construction
auto span = std::span{arena.allocate<std::string_view>(count), count};
```
### Resource Management ### Resource Management
- **RAII** everywhere - constructors acquire, destructors release - **RAII** everywhere - constructors acquire, destructors release
- **Move semantics** for efficient resource transfer - **Move semantics** for efficient resource transfer
- **Explicit cleanup** methods where appropriate - **Explicit cleanup** methods where appropriate
```cpp ```cpp
~Arena() { ~Arena() {
while (current_block_) { while (current_block_) {
@@ -373,20 +433,22 @@ arena.reset(); // Reset arena memory
} }
``` ```
--- ______________________________________________________________________
## Error Handling ## Error Handling
### Error Classification & Response ### Error Classification & Response
- **Expected errors** (invalid input, timeouts): Return error codes for programmatic handling - **Expected errors** (invalid input, timeouts): Return error codes for programmatic handling
- **System failures** (malloc fail, socket fail): Abort immediately with error message - **System failures** (malloc fail, socket fail): Abort immediately with error message
- **Programming errors** (precondition violations, assertions): Abort immediately - **Programming errors** (precondition violations, assertions): Abort immediately
### Error Contract Design ### Error Contract Design
- **Error codes are the API contract** - use enums for programmatic decisions - **Error codes are the API contract** - use enums for programmatic decisions
- **Error messages are human-readable only** - never parse message strings - **Error messages are human-readable only** - never parse message strings
- **Consistent error boundaries** - each component defines what it can/cannot recover from - **Consistent error boundaries** - each component defines what it can/cannot recover from
- **Interface precondition violations are undefined behavior** - acceptable to skip checks for performance in hot paths - **Interface precondition violations are undefined behavior** - it's acceptable to skip checks for performance in hot paths
- **Error code types must be nodiscard** - mark error code enums with `[[nodiscard]]` to prevent silent failures - **Error code types must be nodiscard** - mark error code enums with `[[nodiscard]]` to prevent silent failures
```cpp ```cpp
@@ -400,11 +462,12 @@ if (!memory) {
} }
// ... use memory, eventually std::free(memory) // ... use memory, eventually std::free(memory)
// Programming error - precondition violation (may be omitted for performance) // Programming error - precondition violation (gets compiled out in release builds)
assert(ptr != nullptr && "Precondition violated: pointer must be non-null"); assert(ptr != nullptr && "Precondition violated: pointer must be non-null");
``` ```
### Assertions ### Assertions
- **Programming error detection** using standard `assert()` macro - **Programming error detection** using standard `assert()` macro
- **Assertion behavior follows C++ standards:** - **Assertion behavior follows C++ standards:**
- **Debug builds**: Assertions active (undefined `NDEBUG`) - **Debug builds**: Assertions active (undefined `NDEBUG`)
@@ -413,6 +476,7 @@ assert(ptr != nullptr && "Precondition violated: pointer must be non-null");
- **Static assertions** for compile-time validation (always active) - **Static assertions** for compile-time validation (always active)
**Usage guidelines:** **Usage guidelines:**
- Use for programming errors: null checks, precondition validation, invariants - Use for programming errors: null checks, precondition validation, invariants
- Don't use for expected runtime errors: use return codes instead - Don't use for expected runtime errors: use return codes instead
@@ -468,26 +532,28 @@ if (result == -1 && errno != EINTR) {
Most system calls are not interruptible in practice. For these, it is not necessary to add a retry loop. This includes: Most system calls are not interruptible in practice. For these, it is not necessary to add a retry loop. This includes:
* `fcntl` (with `F_GETFL`, `F_SETFL`, `F_GETFD`, `F_SETFD` - note: `F_SETLKW` and `F_OFD_SETLKW` CAN return EINTR) - `fcntl` (with `F_GETFL`, `F_SETFL`, `F_GETFD`, `F_SETFD` - note: `F_SETLKW` and `F_OFD_SETLKW` CAN return EINTR)
* `epoll_ctl` - `epoll_ctl`
* `socketpair` - `socketpair`
* `pipe` - `pipe`
* `setsockopt` - `setsockopt`
* `epoll_create1` - `epoll_create1`
* `close` (special case: guaranteed closed even on EINTR on Linux) - `close` (special case: guaranteed closed even on EINTR on Linux)
When in doubt, consult the `man` page for the specific system call to see if it can return `EINTR`. When in doubt, consult the `man` page for the specific system call to see if it can return `EINTR`.
--- ______________________________________________________________________
## Documentation ## Documentation
### Doxygen Style ### Doxygen Style
- **/** for struct and public method documentation - **/** for struct and public method documentation
- **@brief** for short descriptions - **@brief** for short descriptions
- **@param** and **@return** for function parameters - **@param** and **@return** for function parameters
- **@note** for important implementation notes - **@note** for important implementation notes
- **@warning** for critical usage warnings - **@warning** for critical usage warnings
```cpp ```cpp
/** /**
* @brief Type-safe version of realloc_raw for arrays of type T. * @brief Type-safe version of realloc_raw for arrays of type T.
@@ -502,9 +568,11 @@ T *realloc(T *existing_ptr, int32_t current_size, int32_t requested_size);
``` ```
### Code Comments ### Code Comments
- **Explain why, not what** - code should be self-documenting
- **Explain why, not what** - *what* the code does should be clear without any comments
- **Performance notes** for optimization decisions - **Performance notes** for optimization decisions
- **Thread safety** and ownership semantics - **Thread safety** and ownership semantics
```cpp ```cpp
// Uses O(1) accumulated counters for fast retrieval // Uses O(1) accumulated counters for fast retrieval
int64_t total_allocated() const; int64_t total_allocated() const;
@@ -514,20 +582,23 @@ Connection(struct sockaddr_storage addr, int fd, int64_t id,
ConnectionHandler *handler, std::weak_ptr<Server> server); ConnectionHandler *handler, std::weak_ptr<Server> server);
``` ```
--- ______________________________________________________________________
## Testing ## Testing
### Test Framework ### Test Framework
- **doctest** for unit testing - **doctest** for unit testing
- **TEST_CASE** and **SUBCASE** for test organization - **TEST_CASE** and **SUBCASE** for test organization
- **CHECK** for assertions (non-terminating) - **CHECK** for assertions (non-terminating)
- **REQUIRE** for critical assertions (terminating) - **REQUIRE** for critical assertions (terminating)
### Test Structure ### Test Structure
- **Descriptive test names** explaining the scenario - **Descriptive test names** explaining the scenario
- **SUBCASE** for related test variations - **SUBCASE** for related test variations that share setup/teardown code
- **Fresh instances** for each test to avoid state contamination - **Fresh instances** for each test to avoid state contamination
```cpp ```cpp
TEST_CASE("Arena basic allocation") { TEST_CASE("Arena basic allocation") {
Arena arena; Arena arena;
@@ -546,34 +617,27 @@ TEST_CASE("Arena basic allocation") {
``` ```
### Test Design Principles ### Test Design Principles
- **Test the contract, not the implementation** - validate what the API promises to deliver, not implementation details - **Test the contract, not the implementation** - validate what the API promises to deliver, not implementation details
- **Both integration and unit tests** - test components in isolation and working together - **Both integration and unit tests** - test components in isolation and working together
- **Prefer fakes to mocks** - use real implementations for internal components, fake external dependencies - **Prefer fakes to mocks** - use real implementations for internal components, fake external dependencies
- **Always enable assertions in tests** - use `-UNDEBUG` pattern to ensure assertions are checked (see Build Integration section) - **Always enable assertions in tests** - use `-UNDEBUG` pattern to ensure assertions are checked (see Build Integration section)
TODO make a new example here using APIs that exist
```cpp ```cpp
// Good: Testing through public API
TEST_CASE("Server accepts connections") {
auto config = Config::defaultConfig();
auto handler = std::make_unique<TestHandler>();
auto server = Server::create(config, std::move(handler));
// Test observable behavior - server can accept connections
auto result = connectToServer(server->getPort());
CHECK(result.connected);
}
// Avoid: Testing internal implementation details
// TEST_CASE("Server creates epoll instance") { /* implementation detail */ }
``` ```
### What NOT to Test ### What NOT to Test
**Avoid testing language features and plumbing:** **Avoid testing language features:**
- Don't test that virtual functions dispatch correctly - Don't test that virtual functions dispatch correctly
- Don't test that standard library types work (unique_ptr, containers, etc.) - Don't test that standard library types work (unique_ptr, containers, etc.)
- Don't test basic constructor/destructor calls - Don't test basic constructor/destructor calls
**Test business logic instead:** **Test business logic instead:**
- When does your code call hooks/callbacks and why? - When does your code call hooks/callbacks and why?
- What state transitions trigger behavior changes? - What state transitions trigger behavior changes?
- How does your code handle error conditions? - How does your code handle error conditions?
@@ -582,6 +646,7 @@ TEST_CASE("Server accepts connections") {
**Ask: "Am I testing the C++ compiler or my application logic?"** **Ask: "Am I testing the C++ compiler or my application logic?"**
### Test Synchronization (Authoritative Rules) ### Test Synchronization (Authoritative Rules)
- **ABSOLUTELY NEVER use timeouts** (`sleep_for`, `wait_for`, etc.) - **ABSOLUTELY NEVER use timeouts** (`sleep_for`, `wait_for`, etc.)
- **Deterministic synchronization only:** - **Deterministic synchronization only:**
- Blocking I/O (naturally waits for completion) - Blocking I/O (naturally waits for completion)
@@ -592,6 +657,7 @@ TEST_CASE("Server accepts connections") {
#### Threading Checklist for Tests/Benchmarks #### Threading Checklist for Tests/Benchmarks
**Common threading principles (all concurrent code):** **Common threading principles (all concurrent code):**
- **Count total threads** - Include main/benchmark thread in count - **Count total threads** - Include main/benchmark thread in count
- **Always assume concurrent execution needed** - Tests/benchmarks require real concurrency - **Always assume concurrent execution needed** - Tests/benchmarks require real concurrency
- **Add synchronization primitive** - `std::latch start_latch{N}` (most common), `std::barrier`, or similar where N = total concurrent threads - **Add synchronization primitive** - `std::latch start_latch{N}` (most common), `std::barrier`, or similar where N = total concurrent threads
@@ -599,18 +665,21 @@ TEST_CASE("Server accepts connections") {
- **Main thread synchronizes before measurement/execution** - ensures all threads start simultaneously - **Main thread synchronizes before measurement/execution** - ensures all threads start simultaneously
**Test-specific:** **Test-specific:**
- **Perform many operations per thread creation** - amortize thread creation cost and increase chances of hitting race conditions - **Perform many operations per thread creation** - amortize thread creation cost and increase chances of hitting race conditions
- **Pattern: Create test that spawns threads and runs many operations, then run that test many times** - amortizes thread creation cost while providing fresh test instances - **Pattern: Create test that spawns threads and runs many operations, then run that test many times** - amortizes thread creation cost while providing fresh test instances
- **Run 100-10000 operations per test, and 100-10000 test iterations** - maximizes chances of hitting race conditions - **Run 100-10000 operations per test, and 100-10000 test iterations** - maximizes chances of hitting race conditions
- **Always run with ThreadSanitizer** - compile with `-fsanitize=thread` - **Always run with ThreadSanitizer** - compile with `-fsanitize=thread`
**Benchmark-specific:** **Benchmark-specific:**
- **NEVER create threads inside the benchmark measurement** - creates thread creation/destruction overhead, not contention - **NEVER create threads inside the benchmark measurement** - creates thread creation/destruction overhead, not contention
- **Create background threads OUTSIDE the benchmark** that run continuously during measurement - **Create background threads OUTSIDE the benchmark** that run continuously during measurement
- **Use `std::atomic<bool> keep_running` to cleanly shut down background threads after benchmark** - **Use `std::atomic<bool> keep_running` to cleanly shut down background threads after benchmark**
- **Measure only the foreground operation under real contention from background threads** - **Measure only the foreground operation under real contention from background threads**
**Red flags to catch immediately:** **Red flags to catch immediately:**
- ❌ Creating threads in a loop without `std::latch` - ❌ Creating threads in a loop without `std::latch`
- ❌ Background threads starting work immediately - ❌ Background threads starting work immediately
- ❌ Benchmark measuring before all threads synchronized - ❌ Benchmark measuring before all threads synchronized
@@ -636,11 +705,12 @@ for (int i = 0; i < 4; ++i) {
} }
``` ```
--- ______________________________________________________________________
## Build Integration ## Build Integration
### Build Configuration ### Build Configuration
```bash ```bash
# Debug: assertions on, optimizations off # Debug: assertions on, optimizations off
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
@@ -650,6 +720,7 @@ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
``` ```
**Test Target Pattern:** **Test Target Pattern:**
- Production targets follow build type (assertions off in Release) - Production targets follow build type (assertions off in Release)
- Test targets use `-UNDEBUG` to force assertions on in all builds - Test targets use `-UNDEBUG` to force assertions on in all builds
- Ensures consistent test validation regardless of build type - Ensures consistent test validation regardless of build type
@@ -657,8 +728,9 @@ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
```cmake ```cmake
# Test target with assertions always enabled # Test target with assertions always enabled
add_executable(test_example tests/test_example.cpp src/example.cpp) add_executable(test_example tests/test_example.cpp src/example.cpp)
target_link_libraries(test_example doctest::doctest) target_link_libraries(test_example doctest_impl)
target_compile_options(test_example PRIVATE -UNDEBUG) # Always enable assertions target_compile_options(test_example PRIVATE -UNDEBUG) # Always enable assertions
add_test(NAME test_example COMMAND test_example)
# Production target follows build type # Production target follows build type
add_executable(example src/example.cpp src/main.cpp) add_executable(example src/example.cpp src/main.cpp)
@@ -666,4 +738,5 @@ add_executable(example src/example.cpp src/main.cpp)
``` ```
### Code Generation ### Code Generation
- Generated files go in build directory, not source - Generated files go in build directory, not source

View File

@@ -10,9 +10,8 @@ interfaces = [
max_request_size_bytes = 1048576 # 1MB max_request_size_bytes = 1048576 # 1MB
# Number of I/O threads for handling connections and network events # Number of I/O threads for handling connections and network events
io_threads = 8 io_threads = 8
epoll_instances = 8
# Event batch size for epoll processing # Event batch size for epoll processing
event_batch_size = 64 event_batch_size = 128
[commit] [commit]
# Minimum length for request_id to ensure sufficient entropy # Minimum length for request_id to ensure sufficient entropy

View File

@@ -27,3 +27,6 @@ request_id_retention_versions = 100000000
max_buffer_size_bytes = 10485760 # 10MB max_buffer_size_bytes = 10485760 # 10MB
# Interval for sending keepalive comments to prevent idle timeouts (seconds) # Interval for sending keepalive comments to prevent idle timeouts (seconds)
keepalive_interval_seconds = 30 keepalive_interval_seconds = 30
[benchmark]
ok_resolve_iterations = 4000

View File

@@ -1,60 +1,268 @@
#include "arena.hpp" #include "config.hpp"
#include "connection.hpp"
#include "http_handler.hpp" #include "http_handler.hpp"
#include "perfetto_categories.hpp" #include "server.hpp"
#include <atomic>
#include <chrono>
#include <doctest/doctest.h> #include <doctest/doctest.h>
#include <fcntl.h>
#include <poll.h>
#include <string>
#include <thread>
#include <unistd.h>
// Perfetto static storage for tests // Test to demonstrate HTTP pipelining response ordering issue
PERFETTO_TRACK_EVENT_STATIC_STORAGE(); //
// HTTP/1.1 pipelining allows multiple requests to be sent on a single
// connection without waiting for responses, but responses MUST be sent in the
// same order as requests were received (RFC 2616 Section 8.1.2.2).
//
// This test sends two pipelined requests:
// 1. GET /ok - Slow response (goes through 4-stage pipeline processing)
// 2. GET /metrics - Fast response (handled directly, just collects metrics)
//
// Even though /ok takes longer to process due to pipeline overhead, the /ok
// response should be sent first since it was requested first. Currently this
// test FAILS because the faster /metrics response completes before /ok and
// gets sent out of order.
TEST_CASE("HTTP pipelined responses out of order") {
weaseldb::Config config;
HttpHandler handler(config);
auto server = Server::create(config, handler, {});
int fd = server->create_local_connection();
// Global variable needed by Connection auto runThread = std::thread{[&]() { server->run(); }};
std::atomic<int> activeConnections{0};
// Simple test helper since Connection has complex constructor requirements // Send two pipelined requests in a single write() call
struct TestConnectionData { // Request order: /ok first, then /metrics
Arena arena; // Expected response order: /ok response first, then /metrics response
std::string message_buffer; // Actual result: /metrics response first (fast), then /ok response (slow)
void *user_data = nullptr; std::string pipelined_requests = "GET /ok HTTP/1.1\r\n"
"Host: localhost\r\n"
"Connection: keep-alive\r\n"
"\r\n"
"GET /metrics HTTP/1.1\r\n"
"Host: localhost\r\n"
"Connection: keep-alive\r\n"
"\r\n";
void append_message(std::string_view data) { message_buffer += data; } int w = write(fd, pipelined_requests.c_str(), pipelined_requests.size());
REQUIRE(w == static_cast<int>(pipelined_requests.size()));
Arena &get_arena() { return arena; } // Set socket to non-blocking
const std::string &getResponse() const { return message_buffer; } int flags = fcntl(fd, F_GETFL, 0);
void clearResponse() { message_buffer.clear(); } fcntl(fd, F_SETFL, flags | O_NONBLOCK);
void reset() {
arena.reset();
message_buffer.clear();
}
};
// Test helper to verify the new hook functionality // Read all responses with non-blocking I/O and poll
struct MockConnectionHandler : public ConnectionHandler { char buf[8192];
bool write_progress_called = false; int total_read = 0;
bool write_buffer_drained_called = false;
void on_write_progress(std::unique_ptr<Connection> &) override { bool found_ok = false;
write_progress_called = true; bool found_http_response = false;
std::string ok_response_header = "Content-Length: 2";
while (true) {
// Use poll to wait for data availability
struct pollfd pfd = {fd, POLLIN, 0};
int poll_result = poll(&pfd, 1, -1); // Block indefinitely
if (poll_result > 0 && (pfd.revents & POLLIN)) {
int r = read(fd, buf + total_read, sizeof(buf) - total_read - 1);
if (r > 0) {
printf("%.*s", r, buf + total_read);
total_read += r;
// Check if we have what we need after each read
buf[total_read] = '\0';
std::string current_data(buf, total_read);
found_http_response =
current_data.find("HTTP/1.1") != std::string::npos;
found_ok = current_data.find(ok_response_header) != std::string::npos;
// If we have both HTTP response and ok_response_header, we can proceed
// with the test
if (found_http_response && found_ok) {
break;
}
} else if (r == 0) {
REQUIRE(false);
break; // EOF
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
REQUIRE(false);
}
}
} }
void on_write_buffer_drained(std::unique_ptr<Connection> &) override { buf[total_read] = '\0';
write_buffer_drained_called = true; std::string response_data(buf, total_read);
// Ensure we found both HTTP response and ok_response_header
REQUIRE(found_http_response);
REQUIRE(found_ok);
// Find first occurrence of ok_response_header in response body
std::size_t ok_pos = response_data.find(ok_response_header);
REQUIRE(ok_pos != std::string::npos);
// Count HTTP response status lines before the /ok response body
// This tests response ordering: should be exactly 1 (the /ok response itself)
std::string before_ok = response_data.substr(0, ok_pos);
int http_response_count = 0;
std::size_t pos = 0;
while ((pos = before_ok.find("HTTP/1.1", pos)) != std::string::npos) {
http_response_count++;
pos += 8;
} }
};
TEST_CASE("ConnectionHandler hooks") { // Assert there's exactly one HTTP response line before /ok response body
SUBCASE("on_write_buffer_drained hook exists") { // If http_response_count == 2, it means /metrics response came first (wrong
MockConnectionHandler handler; // order) If http_response_count == 1, it means /ok response came first
// (correct order)
CHECK(http_response_count == 1);
// Verify hooks are available and can be overridden close(fd);
CHECK_FALSE(handler.write_progress_called); server->shutdown();
CHECK_FALSE(handler.write_buffer_drained_called); runThread.join();
}
// Would normally be called by Server during write operations
std::unique_ptr<Connection> null_conn; TEST_CASE("HTTP pipelined POST requests race condition") {
handler.on_write_progress(null_conn); weaseldb::Config config;
handler.on_write_buffer_drained(null_conn); HttpHandler handler(config);
auto server = Server::create(config, handler, {});
CHECK(handler.write_progress_called); int fd = server->create_local_connection();
CHECK(handler.write_buffer_drained_called);
} auto runThread = std::thread{[&]() { server->run(); }};
// Create a POST request with JSON body that requires parsing
std::string json_body = R"({
"request_id": "test-123",
"leader_id": "leader-1",
"read_version": 1,
"preconditions": [],
"operations": [{"write": {"key": "dGVzdA==", "value": "dmFsdWU="}}]
})";
std::string first_post = "POST /v1/commit HTTP/1.1\r\n"
"Host: localhost\r\n"
"Content-Type: application/json\r\n"
"Content-Length: " +
std::to_string(json_body.size()) +
"\r\n"
"Connection: keep-alive\r\n"
"\r\n" +
json_body;
std::string second_get = "GET /v1/version HTTP/1.1\r\n"
"Host: localhost\r\n"
"Connection: close\r\n"
"\r\n";
// Send POST request followed immediately by GET request
// This creates a scenario where the GET request starts parsing
// while the POST response is being written (triggering the reset)
int w1 = write(fd, first_post.c_str(), first_post.size());
REQUIRE(w1 == static_cast<int>(first_post.size()));
int w2 = write(fd, second_get.c_str(), second_get.size());
REQUIRE(w2 == static_cast<int>(second_get.size()));
// Read responses using blocking I/O (deterministic synchronization)
char buf[4096];
int total_read = 0;
int responses_found = 0;
while (total_read < 4000) {
int r = read(fd, buf + total_read, sizeof(buf) - total_read - 1);
if (r <= 0)
break;
total_read += r;
buf[total_read] = '\0';
std::string response(buf, total_read);
std::size_t pos = 0;
while ((pos = response.find("HTTP/1.1", pos)) != std::string::npos) {
responses_found++;
pos += 8;
}
if (responses_found >= 2)
break;
}
// Should get responses to both requests
// Race condition might cause parsing errors or connection issues
CHECK(responses_found >= 1); // At minimum should handle first request
close(fd);
server->shutdown();
runThread.join();
}
TEST_CASE("HTTP URL split across multiple writes") {
weaseldb::Config config;
HttpHandler handler(config);
auto server = Server::create(config, handler, {});
int fd = server->create_local_connection();
auto runThread = std::thread{[&]() { server->run(); }};
// Test URL accumulation by splitting the URL across multiple writes
// This would have caught the original bug where URL string_view pointed
// to llhttp's internal buffer that gets reused between writes
// Split "GET /metrics HTTP/1.1\r\n" across multiple writes
std::string part1 = "GET /met";
std::string part2 = "rics HTTP/1.1\r\n";
std::string headers = "Host: localhost\r\n"
"Connection: close\r\n"
"\r\n";
// Write URL in two parts - this tests URL accumulation
int w1 = write(fd, part1.c_str(), part1.size());
REQUIRE(w1 == static_cast<int>(part1.size()));
// Attempt to trigger separate llhttp parsing calls
std::this_thread::sleep_for(std::chrono::milliseconds(1));
int w2 = write(fd, part2.c_str(), part2.size());
REQUIRE(w2 == static_cast<int>(part2.size()));
int w3 = write(fd, headers.c_str(), headers.size());
REQUIRE(w3 == static_cast<int>(headers.size()));
// Read response
char buf[4096];
int total_read = 0;
bool found_metrics_response = false;
while (total_read < 4000) {
int r = read(fd, buf + total_read, sizeof(buf) - total_read - 1);
if (r <= 0)
break;
total_read += r;
buf[total_read] = '\0';
std::string response(buf, total_read);
// Check for successful metrics response (not 404)
if (response.find("HTTP/1.1 200 OK") != std::string::npos &&
response.find("text/plain; version=0.0.4") != std::string::npos) {
found_metrics_response = true;
break;
}
// Check for 404 which would indicate URL accumulation failed
if (response.find("HTTP/1.1 404") != std::string::npos) {
FAIL("Got 404 - URL accumulation failed, split URL was not properly "
"reconstructed");
}
}
REQUIRE(found_metrics_response);
close(fd);
server->shutdown();
runThread.join();
} }

View File

@@ -4,9 +4,7 @@
#include "metric.hpp" #include "metric.hpp"
#include <atomic> #include <atomic>
#include <chrono>
#include <cmath> #include <cmath>
#include <fstream>
#include <latch> #include <latch>
#include <sstream> #include <sstream>
#include <thread> #include <thread>
@@ -587,32 +585,6 @@ TEST_CASE("thread counter cleanup bug") {
} }
} }
TEST_CASE("error conditions") {
SUBCASE("counter negative increment") {
auto counter_family = metric::create_counter("error_counter", "Error test");
auto counter = counter_family.create({});
// This should abort in debug builds due to validation
// In release builds, behavior is undefined
// counter.inc(-1.0); // Would abort
}
SUBCASE("invalid metric names") {
// These should abort due to validation
// auto bad_counter = metric::create_counter("123invalid", "help"); // Would
// abort auto bad_gauge = metric::create_gauge("invalid-name", "help"); //
// Would abort
}
SUBCASE("invalid label keys") {
auto counter_family = metric::create_counter("valid_name", "help");
// This should abort due to label validation
// auto counter = counter_family.create({{"123invalid", "value"}}); // Would
// abort
}
}
TEST_CASE("memory management") { TEST_CASE("memory management") {
SUBCASE("arena allocation in render") { SUBCASE("arena allocation in render") {
Arena arena; Arena arena;
@@ -655,6 +627,113 @@ TEST_CASE("memory management") {
} }
} }
TEST_CASE("histogram pending buffer thread cleanup bug") {
for (int iterations = 0; iterations < 1000; ++iterations) {
// This test demonstrates the bug where pending histogram observations
// are lost when a thread dies because ThreadInit destructor doesn't
// flush pending data into shared before accumulating into global state.
metric::reset_metrics_for_testing();
auto hist_family = metric::create_histogram(
"pending_bug_test", "Test histogram for pending buffer bug",
{1.0}); // Single bucket for simplicity
std::atomic<bool> keep_rendering{true};
constexpr int num_threads = 100;
std::latch ready{2};
// Background thread that calls render in a tight loop to hold global mutex
std::thread render_thread([&]() {
ready.arrive_and_wait();
Arena arena;
while (keep_rendering.load(std::memory_order_relaxed)) {
metric::render(arena);
arena.reset();
}
});
// Don't spawn threads until render thread is running
ready.arrive_and_wait();
// Spawn threads that observe once and exit
std::vector<std::thread> observer_threads;
for (int i = 0; i < num_threads; ++i) {
observer_threads.emplace_back([&hist_family]() {
auto hist = hist_family.create({{"test", "observer"}});
hist.observe(0.5); // Goes into first bucket (le="1.0")
// Thread dies here - pending observations should be lost due to bug
});
}
// Join all observer threads
for (auto &t : observer_threads) {
t.join();
}
// Stop render thread
keep_rendering.store(false, std::memory_order_relaxed);
render_thread.join();
// Check if the worker's observations were preserved
Arena arena;
auto output = metric::render(arena);
// First, let's debug what we actually got
std::ostringstream debug_output;
for (const auto &line : output) {
debug_output << line;
}
std::string full_output = debug_output.str();
// Parse the output to find the worker's bucket count for le="2.0"
uint64_t worker_bucket_2_count = 0;
bool found_worker_metric = false;
// The render output alternates between metric name and value in separate
// string_views
for (size_t i = 0; i < output.size(); ++i) {
const auto &line = output[i];
// Look for: pending_bug_test_bucket{test="observer",le="1.0"}
if (line.find("pending_bug_test_bucket{test=\"observer\",le=\"1.0\"}") !=
std::string_view::npos) {
found_worker_metric = true;
// The value should be in the next element
if (i + 1 < output.size()) {
auto value_str = output[i + 1];
// Remove trailing newline if present
while (!value_str.empty() &&
(value_str.back() == '\n' || value_str.back() == '\r')) {
value_str.remove_suffix(1);
}
try {
worker_bucket_2_count = std::stoull(std::string(value_str));
} catch (const std::exception &e) {
MESSAGE("Failed to parse value: '"
<< value_str << "' from metric line: '" << line << "'");
MESSAGE("Full output:\n" << full_output);
throw;
}
}
break;
}
}
REQUIRE(found_worker_metric); // The metric should exist
// BUG: This will fail because pending observations are lost on thread death
// Expected: num_threads observations (each thread made 1 observation)
// Actual: less than num_threads (observations stuck in pending are lost
// when threads die)
CHECK_MESSAGE(
worker_bucket_2_count == num_threads,
"Expected "
<< num_threads << " observations but got " << worker_bucket_2_count
<< ". This indicates the pending buffer bug where observations "
<< "stuck in pending are lost when thread dies.");
}
}
TEST_CASE("render output deterministic order golden test") { TEST_CASE("render output deterministic order golden test") {
// Clean slate - reset all metrics before this test // Clean slate - reset all metrics before this test
metric::reset_metrics_for_testing(); metric::reset_metrics_for_testing();

View File

@@ -28,19 +28,39 @@ struct Base {
virtual int get_value() const { return base_value; } virtual int get_value() const { return base_value; }
}; };
struct Derived : public Base { struct Derived : Base {
int derived_value; int derived_value;
explicit Derived(int base_v, int derived_v) explicit Derived(int base_v, int derived_v)
: Base(base_v), derived_value(derived_v) {} : Base(base_v), derived_value(derived_v) {}
int get_value() const override { return base_value + derived_value; } int get_value() const override { return base_value + derived_value; }
}; };
struct AnotherDerived : public Base { struct AnotherDerived : Base {
int another_value; int another_value;
explicit AnotherDerived(int base_v, int another_v) explicit AnotherDerived(int base_v, int another_v)
: Base(base_v), another_value(another_v) {} : Base(base_v), another_value(another_v) {}
int get_value() const override { return base_value * another_value; } int get_value() const override { return base_value * another_value; }
}; };
// Classes to test polymorphic pointer address changes
// First polymorphic base. Having its own vtable (virtual dtor + virtual
// method) means a derived object cast to Interface1* may differ from the
// address of the same object cast to another base.
struct Interface1 {
int interface1_data = 1;
virtual ~Interface1() = default;
virtual int get_interface1() const { return interface1_data; }
};
// Second polymorphic base. When a type inherits Interface1 then Interface2,
// Interface2 is a non-primary base, so casting the derived object to
// Interface2* adjusts the pointer — the scenario the WeakRef test exercises.
struct Interface2 {
int interface2_data = 2;
virtual ~Interface2() = default;
virtual int get_interface2() const { return interface2_data; }
};
// Multiple inheritance - this will cause pointer address changes
// Concrete type inheriting both interfaces; used to verify that Ref/WeakRef
// conversions apply the correct this-pointer adjustment for each base.
struct MultipleInheritance : Interface1, Interface2 {
int own_data;
explicit MultipleInheritance(int data) : own_data(data) {}
int get_own_data() const { return own_data; }
};
} // anonymous namespace } // anonymous namespace
TEST_CASE("Ref basic functionality") { TEST_CASE("Ref basic functionality") {
@@ -52,9 +72,9 @@ TEST_CASE("Ref basic functionality") {
CHECK((*ref).value == 42); CHECK((*ref).value == 42);
} }
SUBCASE("copy construction increments reference count") { SUBCASE("explicit copy increments reference count") {
auto ref1 = make_ref<TestObject>(123); auto ref1 = make_ref<TestObject>(123);
auto ref2 = ref1; auto ref2 = ref1.copy();
CHECK(ref1); CHECK(ref1);
CHECK(ref2); CHECK(ref2);
@@ -63,11 +83,11 @@ TEST_CASE("Ref basic functionality") {
CHECK(ref2->value == 123); CHECK(ref2->value == 123);
} }
SUBCASE("copy assignment works correctly") { SUBCASE("explicit copy assignment works correctly") {
auto ref1 = make_ref<TestObject>(100); auto ref1 = make_ref<TestObject>(100);
auto ref2 = make_ref<TestObject>(200); auto ref2 = make_ref<TestObject>(200);
ref2 = ref1; ref2 = ref1.copy();
CHECK(ref1.get() == ref2.get()); CHECK(ref1.get() == ref2.get());
CHECK(ref1->value == 100); CHECK(ref1->value == 100);
CHECK(ref2->value == 100); CHECK(ref2->value == 100);
@@ -109,7 +129,7 @@ TEST_CASE("Ref basic functionality") {
TEST_CASE("WeakRef basic functionality") { TEST_CASE("WeakRef basic functionality") {
SUBCASE("construction from Ref") { SUBCASE("construction from Ref") {
auto ref = make_ref<TestObject>(333); auto ref = make_ref<TestObject>(333);
WeakRef<TestObject> weak_ref = ref; WeakRef<TestObject> weak_ref = ref.as_weak();
auto locked = weak_ref.lock(); auto locked = weak_ref.lock();
CHECK(locked); CHECK(locked);
@@ -121,7 +141,7 @@ TEST_CASE("WeakRef basic functionality") {
WeakRef<TestObject> weak_ref; WeakRef<TestObject> weak_ref;
{ {
auto ref = make_ref<TestObject>(444); auto ref = make_ref<TestObject>(444);
weak_ref = ref; weak_ref = ref.as_weak();
} }
// ref goes out of scope, object should be destroyed // ref goes out of scope, object should be destroyed
@@ -131,8 +151,8 @@ TEST_CASE("WeakRef basic functionality") {
SUBCASE("copy and move semantics") { SUBCASE("copy and move semantics") {
auto ref = make_ref<TestObject>(666); auto ref = make_ref<TestObject>(666);
WeakRef<TestObject> weak1 = ref; WeakRef<TestObject> weak1 = ref.as_weak();
WeakRef<TestObject> weak2 = weak1; // copy WeakRef<TestObject> weak2 = weak1.copy(); // explicit copy
WeakRef<TestObject> weak3 = std::move(weak1); // move WeakRef<TestObject> weak3 = std::move(weak1); // move
auto locked2 = weak2.lock(); auto locked2 = weak2.lock();
@@ -160,7 +180,7 @@ TEST_CASE("Ref thread safety") {
start_latch.arrive_and_wait(); start_latch.arrive_and_wait();
for (int j = 0; j < copies_per_thread; ++j) { for (int j = 0; j < copies_per_thread; ++j) {
auto copy = ref; auto copy = ref.copy();
CHECK(copy); CHECK(copy);
CHECK(copy->value == 777); CHECK(copy->value == 777);
} }
@@ -191,7 +211,7 @@ TEST_CASE("Control block cleanup race condition test") {
WeakRef<TestObject> ptr2; WeakRef<TestObject> ptr2;
auto setup = [&]() { auto setup = [&]() {
ptr1 = make_ref<TestObject>(0); ptr1 = make_ref<TestObject>(0);
ptr2 = ptr1; ptr2 = ptr1.as_weak();
}; };
// Barrier for synchronization - 2 participants (main thread + worker thread) // Barrier for synchronization - 2 participants (main thread + worker thread)
@@ -243,7 +263,7 @@ TEST_CASE("WeakRef prevents circular references") {
// Create object and weak reference // Create object and weak reference
{ {
auto ref = make_ref<TestObject>(123); auto ref = make_ref<TestObject>(123);
weak_ref = ref; weak_ref = ref.as_weak();
// Should be able to lock while object exists // Should be able to lock while object exists
auto locked = weak_ref.lock(); auto locked = weak_ref.lock();
@@ -262,8 +282,8 @@ TEST_CASE("WeakRef prevents circular references") {
auto child = make_ref<Node>(2); auto child = make_ref<Node>(2);
// Create potential cycle // Create potential cycle
parent->next = child; // Strong reference: parent → child parent->next = child.copy(); // Strong reference: parent → child
child->parent = parent; // WeakRef: child ⇝ parent (breaks cycle) child->parent = parent.as_weak(); // WeakRef: child ⇝ parent (breaks cycle)
CHECK(parent->data == 1); CHECK(parent->data == 1);
CHECK(child->data == 2); CHECK(child->data == 2);
@@ -286,7 +306,7 @@ TEST_CASE("Polymorphic Ref conversions") {
CHECK(derived_ref->get_value() == 30); // 10 + 20 CHECK(derived_ref->get_value() == 30); // 10 + 20
// Convert Ref<Derived> to Ref<Base> // Convert Ref<Derived> to Ref<Base>
Ref<Base> base_ref = derived_ref; Ref<Base> base_ref = derived_ref.copy();
CHECK(base_ref); CHECK(base_ref);
CHECK(base_ref->get_value() == 30); // Virtual dispatch works CHECK(base_ref->get_value() == 30); // Virtual dispatch works
CHECK(base_ref->base_value == 10); CHECK(base_ref->base_value == 10);
@@ -303,7 +323,7 @@ TEST_CASE("Polymorphic Ref conversions") {
CHECK(base_ref->get_value() == 100); CHECK(base_ref->get_value() == 100);
// Assign derived to base // Assign derived to base
base_ref = derived_ref; base_ref = derived_ref.copy();
CHECK(base_ref->get_value() == 20); // 5 + 15 CHECK(base_ref->get_value() == 20); // 5 + 15
CHECK(base_ref.get() == derived_ref.get()); CHECK(base_ref.get() == derived_ref.get());
} }
@@ -338,7 +358,7 @@ TEST_CASE("Polymorphic Ref conversions") {
CHECK(another_derived->get_value() == 24); // 6 * 4 CHECK(another_derived->get_value() == 24); // 6 * 4
// Convert to base // Convert to base
Ref<Base> base_ref = another_derived; Ref<Base> base_ref = another_derived.copy();
CHECK(base_ref->get_value() == 24); // Virtual dispatch CHECK(base_ref->get_value() == 24); // Virtual dispatch
CHECK(base_ref.get() == another_derived.get()); CHECK(base_ref.get() == another_derived.get());
} }
@@ -349,10 +369,10 @@ TEST_CASE("Polymorphic WeakRef conversions") {
auto derived_ref = make_ref<Derived>(3, 7); auto derived_ref = make_ref<Derived>(3, 7);
// Create WeakRef<Derived> // Create WeakRef<Derived>
WeakRef<Derived> weak_derived = derived_ref; WeakRef<Derived> weak_derived = derived_ref.as_weak();
// Convert to WeakRef<Base> // Convert to WeakRef<Base>
WeakRef<Base> weak_base = weak_derived; WeakRef<Base> weak_base = weak_derived.copy();
// Both should lock to same object // Both should lock to same object
auto locked_derived = weak_derived.lock(); auto locked_derived = weak_derived.lock();
@@ -368,11 +388,11 @@ TEST_CASE("Polymorphic WeakRef conversions") {
auto derived_ref = make_ref<Derived>(4, 6); auto derived_ref = make_ref<Derived>(4, 6);
auto base_ref = make_ref<Base>(999); auto base_ref = make_ref<Base>(999);
WeakRef<Derived> weak_derived = derived_ref; WeakRef<Derived> weak_derived = derived_ref.as_weak();
WeakRef<Base> weak_base = base_ref; WeakRef<Base> weak_base = base_ref.as_weak();
// Assign derived weak ref to base weak ref // Assign derived weak ref to base weak ref
weak_base = weak_derived; weak_base = weak_derived.copy();
auto locked = weak_base.lock(); auto locked = weak_base.lock();
CHECK(locked); CHECK(locked);
@@ -384,7 +404,7 @@ TEST_CASE("Polymorphic WeakRef conversions") {
auto derived_ref = make_ref<Derived>(2, 8); auto derived_ref = make_ref<Derived>(2, 8);
// Create WeakRef<Base> directly from Ref<Derived> // Create WeakRef<Base> directly from Ref<Derived>
WeakRef<Base> weak_base = derived_ref; WeakRef<Base> weak_base = derived_ref.as_weak();
auto locked = weak_base.lock(); auto locked = weak_base.lock();
CHECK(locked); CHECK(locked);
@@ -394,7 +414,7 @@ TEST_CASE("Polymorphic WeakRef conversions") {
SUBCASE("WeakRef move operations") { SUBCASE("WeakRef move operations") {
auto derived_ref = make_ref<Derived>(1, 9); auto derived_ref = make_ref<Derived>(1, 9);
WeakRef<Derived> weak_derived = derived_ref; WeakRef<Derived> weak_derived = derived_ref.as_weak();
// Move construct // Move construct
WeakRef<Base> weak_base = std::move(weak_derived); WeakRef<Base> weak_base = std::move(weak_derived);
@@ -414,7 +434,7 @@ TEST_CASE("Polymorphic edge cases") {
CHECK(!empty_derived); CHECK(!empty_derived);
// Convert empty derived to base // Convert empty derived to base
Ref<Base> empty_base = empty_derived; Ref<Base> empty_base = empty_derived.copy();
CHECK(!empty_base); CHECK(!empty_base);
// Move empty derived to base // Move empty derived to base
@@ -427,7 +447,7 @@ TEST_CASE("Polymorphic edge cases") {
CHECK(!empty_weak_derived.lock()); CHECK(!empty_weak_derived.lock());
// Convert empty weak derived to weak base // Convert empty weak derived to weak base
WeakRef<Base> empty_weak_base = empty_weak_derived; WeakRef<Base> empty_weak_base = empty_weak_derived.copy();
CHECK(!empty_weak_base.lock()); CHECK(!empty_weak_base.lock());
} }
@@ -435,7 +455,7 @@ TEST_CASE("Polymorphic edge cases") {
auto derived_ref = make_ref<Derived>(5, 5); auto derived_ref = make_ref<Derived>(5, 5);
// Ref<Derived> → WeakRef<Base> // Ref<Derived> → WeakRef<Base>
WeakRef<Base> weak_base_from_ref = derived_ref; WeakRef<Base> weak_base_from_ref = derived_ref.as_weak();
// WeakRef<Base> → Ref<Base> via lock // WeakRef<Base> → Ref<Base> via lock
auto base_ref_from_weak = weak_base_from_ref.lock(); auto base_ref_from_weak = weak_base_from_ref.lock();
@@ -444,6 +464,36 @@ TEST_CASE("Polymorphic edge cases") {
CHECK(base_ref_from_weak->get_value() == 10); // 5 + 5 CHECK(base_ref_from_weak->get_value() == 10); // 5 + 5
CHECK(base_ref_from_weak.get() == derived_ref.get()); CHECK(base_ref_from_weak.get() == derived_ref.get());
} }
// Regression test: WeakRef::lock() must hand back the correctly adjusted
// base-class pointer (not the raw storage address) when the referenced type
// is a non-primary base under multiple inheritance.
SUBCASE("multiple inheritance pointer address bug test") {
auto multi_ref = make_ref<MultipleInheritance>(42);
// Get pointers to different base classes - these will have different
// addresses
Interface1 *interface1_ptr = multi_ref.get();
Interface2 *interface2_ptr = multi_ref.get();
// Verify that pointers are indeed different (demonstrating the issue)
CHECK(static_cast<void *>(interface1_ptr) !=
static_cast<void *>(interface2_ptr));
// Create WeakRef to Interface2 (which has a different pointer address)
WeakRef<Interface2> weak_interface2 = multi_ref.as_weak();
// Lock should return the correct Interface2 pointer, not miscalculated one
auto locked_interface2 = weak_interface2.lock();
CHECK(locked_interface2);
CHECK(locked_interface2.get() ==
interface2_ptr); // This might fail due to the bug!
CHECK(locked_interface2->get_interface2() == 2);
// Also test Interface1
WeakRef<Interface1> weak_interface1 = multi_ref.as_weak();
auto locked_interface1 = weak_interface1.lock();
CHECK(locked_interface1);
CHECK(locked_interface1.get() == interface1_ptr); // This might also fail!
CHECK(locked_interface1->get_interface1() == 1);
}
} }
// Should be run with asan or valgrind // Should be run with asan or valgrind
@@ -457,5 +507,5 @@ TEST_CASE("Self-referencing WeakRef pattern") {
WeakRef<SelfReferencing> self_; WeakRef<SelfReferencing> self_;
}; };
auto x = make_ref<SelfReferencing>(); auto x = make_ref<SelfReferencing>();
x->self_ = x; x->self_ = x.as_weak();
} }

173
tests/test_server.cpp Normal file
View File

@@ -0,0 +1,173 @@
#include "config.hpp"
#include "connection.hpp"
#include "connection_handler.hpp"
#include "server.hpp"
#include <doctest/doctest.h>
#include <latch>
#include <string_view>
#include <thread>
// Minimal handler for the echo round-trip test. It captures the first payload
// it sees plus a weak reference to the connection, then releases the waiting
// test thread via `done`.
struct EchoHandler : ConnectionHandler {
Arena arena; // owns the bytes referenced by `reply`
std::span<std::string_view> reply; // reply to send, allocated from `arena`
WeakRef<MessageSender> wconn; // connection captured in on_data_arrived
std::latch done{1}; // released once reply/wconn are populated
void on_data_arrived(std::string_view data, Connection &conn) override {
// Copy the payload into the handler-owned arena so it remains valid after
// this callback returns and the connection's own buffers are reused.
reply = arena.allocate_span<std::string_view>(1);
reply[0] = arena.copy_string(data);
wconn = conn.get_weak_ref();
CHECK(wconn.lock());
// Count down last: the test thread reads reply/wconn after done.wait().
done.count_down();
}
};
// End-to-end echo: write 5 bytes into a local connection, let the handler
// capture them, send them back through the captured connection, and verify
// the client sees the same bytes.
TEST_CASE("Echo test") {
  EchoHandler handler;
  weaseldb::Config config;
  auto server = Server::create(config, handler, {});
  int fd = server->create_local_connection();
  auto runThread = std::thread{[&]() { server->run(); }};

  int w = write(fd, "hello", 5);
  REQUIRE(w == 5);
  handler.done.wait();

  if (auto conn = handler.wconn.lock()) {
    // Cast to Connection* to access append_bytes (not available on
    // MessageSender)
    auto *conn_ptr = static_cast<Connection *>(conn.get());
    conn_ptr->append_bytes(std::exchange(handler.reply, {}),
                           std::move(handler.arena), ConnectionShutdown::None);
  } else {
    REQUIRE(false);
  }

  // A single read() may legally return fewer than 5 bytes on a socket, which
  // previously made this test flaky. Accumulate until the full echo arrives.
  char buf[6] = {}; // zero-filled so buf is NUL-terminated after 5 bytes
  size_t total = 0;
  while (total < 5) {
    ssize_t r = read(fd, buf + total, 5 - total);
    REQUIRE(r > 0); // 0 would mean unexpected EOF, -1 an error
    total += static_cast<size_t>(r);
  }
  CHECK(std::string(buf) == "hello");

  close(fd);
  server->shutdown();
  runThread.join();
}
// Handler used by the shutdown tests. Records the first payload and a weak
// reference to its connection, and signals two phases to the test thread:
// data received, then connection closed.
struct ShutdownTestHandler : ConnectionHandler {
Arena arena; // owns the bytes referenced by `reply`
std::span<std::string_view> reply; // reply to send, allocated from `arena`
WeakRef<MessageSender> wconn; // connection captured in on_data_arrived
std::latch received_data{1}; // released once data has arrived
std::latch connection_closed_latch{1}; // released from on_connection_closed
// Set by each test case; not read anywhere in this file — presumably kept
// for symmetry with the append_bytes shutdown argument. TODO confirm.
ConnectionShutdown shutdown_mode = ConnectionShutdown::None;
std::atomic<bool> connection_closed{false}; // read from the test thread
void on_data_arrived(std::string_view data, Connection &conn) override {
// Copy the payload into the arena so it stays valid after this call.
reply = arena.allocate_span<std::string_view>(1);
reply[0] = arena.copy_string(data);
wconn = conn.get_weak_ref();
// Count down last: the test thread reads reply/wconn after wait().
received_data.count_down();
}
void on_connection_closed(Connection &) override {
// Publish the flag before counting down so a waiter that wakes on the
// latch observes connection_closed == true.
connection_closed = true;
connection_closed_latch.count_down();
}
};
// Write-side-only shutdown: after the server sends the reply and shuts down
// its write direction, the client sees EOF on read but the connection stays
// alive (the client can still write and on_connection_closed has not fired).
TEST_CASE("Connection shutdown write-only mode") {
  ShutdownTestHandler handler;
  handler.shutdown_mode = ConnectionShutdown::WriteOnly;
  weaseldb::Config config;
  auto server = Server::create(config, handler, {});
  int fd = server->create_local_connection();
  auto runThread = std::thread{[&]() { server->run(); }};

  // Send data to trigger the handler.
  int w = write(fd, "test", 4);
  REQUIRE(w == 4);
  handler.received_data.wait();

  // Queue the response and ask the server to shut down only the write side.
  if (auto conn = handler.wconn.lock()) {
    auto *conn_ptr = static_cast<Connection *>(conn.get());
    conn_ptr->append_bytes(std::exchange(handler.reply, {}),
                           std::move(handler.arena),
                           ConnectionShutdown::WriteOnly);
  } else {
    REQUIRE(false);
  }

  // A single read() may return fewer than 4 bytes; accumulate until the whole
  // reply has arrived so a short read does not fail the test spuriously.
  char buf[5] = {}; // zero-filled so buf is NUL-terminated after 4 bytes
  size_t total = 0;
  while (total < 4) {
    ssize_t r = read(fd, buf + total, 4 - total);
    REQUIRE(r > 0);
    total += static_cast<size_t>(r);
  }
  CHECK(std::string(buf) == "test");

  // After write shutdown, we should get EOF when trying to read more.
  char extra_buf[1];
  int eof_result = read(fd, extra_buf, 1);
  CHECK(eof_result == 0); // EOF indicates successful write shutdown

  // Connection should still be alive (not closed) after write shutdown.
  // We can verify this by checking that we can still write to the socket.
  int write_result = write(fd, "x", 1);
  CHECK(write_result == 1); // Should succeed - connection still alive
  CHECK(handler.connection_closed.load() ==
        false); // Connection should still be alive

  close(fd);
  server->shutdown();
  runThread.join();
}
// Full shutdown: after the server sends the reply with ConnectionShutdown::
// Full, the client sees EOF and the on_connection_closed callback fires.
TEST_CASE("Connection shutdown full mode") {
  ShutdownTestHandler handler;
  handler.shutdown_mode = ConnectionShutdown::Full;
  weaseldb::Config config;
  auto server = Server::create(config, handler, {});
  int fd = server->create_local_connection();
  auto runThread = std::thread{[&]() { server->run(); }};

  // Send data to trigger the handler.
  int w = write(fd, "test", 4);
  REQUIRE(w == 4);
  handler.received_data.wait();

  // Queue the response and request a full shutdown of the connection.
  if (auto conn = handler.wconn.lock()) {
    auto *conn_ptr = static_cast<Connection *>(conn.get());
    conn_ptr->append_bytes(std::exchange(handler.reply, {}),
                           std::move(handler.arena), ConnectionShutdown::Full);
  } else {
    REQUIRE(false);
  }

  // A single read() may return fewer than 4 bytes; accumulate until the whole
  // reply has arrived so a short read does not fail the test spuriously.
  char buf[5] = {}; // zero-filled so buf is NUL-terminated after 4 bytes
  size_t total = 0;
  while (total < 4) {
    ssize_t r = read(fd, buf + total, 4 - total);
    REQUIRE(r > 0);
    total += static_cast<size_t>(r);
  }
  CHECK(std::string(buf) == "test");

  // Connection should be closed by server (full shutdown).
  char extra_buf[1];
  int close_result = read(fd, extra_buf, 1);
  CHECK(close_result == 0); // EOF indicates connection was closed

  // Wait for connection closed callback to be called.
  handler.connection_closed_latch.wait();
  CHECK(handler.connection_closed.load() == true);

  close(fd);
  server->shutdown();
  runThread.join();
}

View File

@@ -1,110 +0,0 @@
#include "../src/thread_pipeline.hpp"
#include "config.hpp"
#include "connection.hpp"
#include "perfetto_categories.hpp"
#include "server.hpp"
#include <cstring>
#include <doctest/doctest.h>
#include <thread>
// Perfetto static storage for tests
PERFETTO_TRACK_EVENT_STATIC_STORAGE();
// Unit of work passed through the pipeline: the connection being serviced,
// the bytes to echo back, and a sentinel flag used to stop the echo thread.
struct Message {
std::unique_ptr<Connection> conn; // owned while the message is in flight
std::string data; // payload to echo back to the client
bool done; // true => echo thread should exit its loop
};
// Handler that transfers ownership of each connection, together with the
// received payload, into the pipeline for the echo thread to process.
struct EchoHandler : public ConnectionHandler {
private:
StaticThreadPipeline<Message, WaitStrategy::WaitIfStageEmpty, 1> &pipeline;
public:
explicit EchoHandler(
StaticThreadPipeline<Message, WaitStrategy::WaitIfStageEmpty, 1>
&pipeline)
: pipeline(pipeline) {}
void on_data_arrived(std::string_view data,
std::unique_ptr<Connection> &conn_ptr) override {
assert(conn_ptr);
// push(1, true): reserve a single slot (second arg presumably means block
// until space is available — confirm against StaticThreadPipeline). With a
// one-slot batch the std::move of conn_ptr below executes at most once.
auto guard = pipeline.push(1, true);
for (auto &message : guard.batch) {
message.conn = std::move(conn_ptr);
message.data = data; // copies the view's bytes into message.data
message.done = false;
}
}
};
// Round-trip test for connection ownership transfer: the I/O thread hands the
// connection into the pipeline, a worker thread echoes the payload and
// releases the connection back to the server, and the client verifies the
// echo. A `done` sentinel message terminates the worker at the end.
TEST_CASE(
    "Server correctly handles connection ownership transfer via pipeline") {
  weaseldb::Config config;
  config.server.io_threads = 1;
  config.server.epoll_instances = 1;
  StaticThreadPipeline<Message, WaitStrategy::WaitIfStageEmpty, 1> pipeline{10};
  EchoHandler handler{pipeline};

  // Worker: drain the pipeline, echo each message, return the connection.
  auto echoThread = std::thread{[&]() {
    for (;;) {
      auto guard = pipeline.acquire<0, 0>();
      for (auto &message : guard.batch) {
        if (message.done) {
          return; // sentinel observed — exit the worker
        }
        assert(message.conn);
        message.conn->append_message(message.data);
        Server::release_back_to_server(std::move(message.conn));
      }
    }
  }};

  // Create server with NO listen sockets (empty vector)
  auto server = Server::create(config, handler, {});
  std::thread server_thread([&server]() { server->run(); });

  // Create local connection
  int client_fd = server->create_local_connection();
  REQUIRE(client_fd > 0);

  // Write some test data, retrying on EINTR.
  const char *test_message = "Hello, World!";
  const size_t message_len = std::strlen(test_message);
  ssize_t bytes_written;
  do {
    bytes_written = write(client_fd, test_message, message_len);
  } while (bytes_written == -1 && errno == EINTR);
  // Cast avoids a signed/unsigned comparison between ssize_t and size_t.
  REQUIRE(bytes_written == static_cast<ssize_t>(message_len));

  // Read the echoed response. A single read() may deliver only part of the
  // echo, so accumulate until the full message has arrived (the previous
  // single-read REQUIRE could fail spuriously on a short read).
  char buffer[1024] = {0};
  size_t total_read = 0;
  while (total_read < message_len) {
    ssize_t bytes_read =
        read(client_fd, buffer + total_read, sizeof(buffer) - 1 - total_read);
    if (bytes_read == -1 && errno == EINTR) {
      continue;
    }
    if (bytes_read == -1) {
      perror("read failed");
    }
    REQUIRE(bytes_read > 0); // 0 would be unexpected EOF, -1 an error
    total_read += static_cast<size_t>(bytes_read);
  }

  // Verify we got back exactly what we sent
  CHECK(std::string(buffer, total_read) == std::string(test_message));

  // Cleanup
  int e = close(client_fd);
  if (e == -1 && errno != EINTR) {
    perror("close client_fd");
    std::abort();
  }
  server->shutdown();
  server_thread.join();

  // Push the sentinel so the echo thread leaves its loop and can be joined.
  {
    auto guard = pipeline.push(1, true);
    for (auto &message : guard.batch) {
      message.done = true;
    }
  }
  echoThread.join();
}

View File

@@ -7,12 +7,14 @@ WeaselDB's /ok health check endpoint achieves 1M requests/second with 740ns of c
## Performance Metrics ## Performance Metrics
### Throughput ### Throughput
- **1.0M requests/second** /ok health check endpoint (4-stage commit pipeline) - **1.0M requests/second** /ok health check endpoint (4-stage commit pipeline)
- 8 I/O threads with 8 epoll instances - 8 I/O threads with 8 epoll instances
- Load tester used 12 network threads - Load tester used 12 network threads
- **0% CPU usage when idle** (optimized futex wake implementation) - **0% CPU usage when idle** (optimized futex wake implementation)
### Threading Architecture ### Threading Architecture
- **Four-stage commit pipeline**: Sequence → Resolve → Persist → Release - **Four-stage commit pipeline**: Sequence → Resolve → Persist → Release
- Lock-free coordination using atomic ring buffer - Lock-free coordination using atomic ring buffer
- **Optimized futex wake**: Only wake on final pipeline stage - **Optimized futex wake**: Only wake on final pipeline stage
@@ -21,6 +23,7 @@ WeaselDB's /ok health check endpoint achieves 1M requests/second with 740ns of c
### Performance Characteristics ### Performance Characteristics
**Health Check Pipeline (/ok endpoint)**: **Health Check Pipeline (/ok endpoint)**:
- **Throughput**: 1.0M requests/second - **Throughput**: 1.0M requests/second
- **Configurable CPU work**: 740ns (4000 iterations, validated with nanobench) - **Configurable CPU work**: 740ns (4000 iterations, validated with nanobench)
- **Theoretical maximum CPU time**: 1000ns (1,000,000,000ns ÷ 1,000,000 req/s) - **Theoretical maximum CPU time**: 1000ns (1,000,000,000ns ÷ 1,000,000 req/s)
@@ -31,23 +34,27 @@ WeaselDB's /ok health check endpoint achieves 1M requests/second with 740ns of c
### Key Optimizations ### Key Optimizations
**Futex Wake Reduction**: **Futex Wake Reduction**:
- **Previous approach**: Futex wake at every pipeline stage (10% CPU overhead) - **Previous approach**: Futex wake at every pipeline stage (10% CPU overhead)
- **Optimized approach**: Futex wake only at final stage to wake producers. Stages now do their futex wait on the beginning of the pipeline instead of the previous stage. - **Optimized approach**: Futex wake only at final stage to wake producers. Stages now do their futex wait on the beginning of the pipeline instead of the previous stage.
- **Result**: 23% increase in serial CPU budget (396ns → 488ns) - **Result**: 23% increase in serial CPU budget (396ns → 488ns)
- **Benefits**: Higher throughput per CPU cycle + idle efficiency - **Benefits**: Higher throughput per CPU cycle + idle efficiency
**CPU-Friendly Spin Loop**: **CPU-Friendly Spin Loop**:
- **Added**: `_mm_pause()` intrinsics in polling loop to reduce power consumption and improve hyperthreading efficiency - **Added**: `_mm_pause()` intrinsics in polling loop to reduce power consumption and improve hyperthreading efficiency
- **Maintained**: 100,000 spin iterations necessary to prevent thread descheduling - **Maintained**: 100,000 spin iterations necessary to prevent thread descheduling
- **Result**: Same throughput with more efficient spinning - **Result**: Same throughput with more efficient spinning
**Resolve Batch Size Optimization**: **Resolve Batch Size Optimization**:
- **Changed**: Resolve max batch size from unlimited to 1 - **Changed**: Resolve max batch size from unlimited to 1
- **Mechanism**: Single-item processing checks for work more frequently, keeping the thread in fast coordination paths instead of expensive spin/wait cycles - **Mechanism**: Single-item processing checks for work more frequently, keeping the thread in fast coordination paths instead of expensive spin/wait cycles
### Request Flow ### Request Flow
**Health Check Pipeline** (/ok endpoint): **Health Check Pipeline** (/ok endpoint):
``` ```
I/O Threads (8) → HttpHandler::on_batch_complete() → Commit Pipeline I/O Threads (8) → HttpHandler::on_batch_complete() → Commit Pipeline
↑ ↓ ↑ ↓
@@ -59,10 +66,10 @@ I/O Threads (8) → HttpHandler::on_batch_complete() → Commit Pipeline
| Stage 2: Persist (generate response) | Stage 2: Persist (generate response)
| (send "OK" response) | (send "OK" response)
| ↓ | ↓
| Stage 3: Release (connection return) | Stage 3: Release (wake I/O threads)
| (optimized futex wake) | (optimized futex wake)
| ↓ | ↓
└─────────────────────── Server::release_back_to_server() └─────────────────────── I/O threads send response to client
``` ```
## Test Configuration ## Test Configuration

16
todo.md
View File

@@ -3,16 +3,19 @@
## 📋 Planned Tasks ## 📋 Planned Tasks
### Core Database Features ### Core Database Features
- [ ] Design commit pipeline architecture with three-stage processing
- [ ] Stage 1: Version assignment and precondition validation thread - [ ] Design commit pipeline architecture with four-stage processing
- [ ] Stage 2: Transaction persistence and subscriber streaming thread - [ ] Stage 0: Sequence assignment and request validation
- [ ] Stage 3: Connection return to server thread - [ ] Stage 1: Precondition resolution and conflict detection
- [ ] Stage 2: Transaction persistence and subscriber streaming
- [ ] Stage 3: Response generation and connection cleanup
- [ ] Use ThreadPipeline for inter-stage communication - [ ] Use ThreadPipeline for inter-stage communication
- [ ] Design persistence interface for pluggable storage backends (S3, local disk) - [ ] Design persistence interface for pluggable storage backends (S3, local disk)
- [ ] Integrate https://git.weaselab.dev/weaselab/conflict-set for optimistic concurrency control - [ ] Integrate https://git.weaselab.dev/weaselab/conflict-set for optimistic concurrency control
- [ ] Design and architect the subscription component for change streams - [ ] Design and architect the subscription component for change streams
### API Endpoints Implementation ### API Endpoints Implementation
- [ ] Implement `GET /v1/version` endpoint to return latest committed version and leader - [ ] Implement `GET /v1/version` endpoint to return latest committed version and leader
- [ ] Implement `POST /v1/commit` endpoint for transaction submission with precondition validation - [ ] Implement `POST /v1/commit` endpoint for transaction submission with precondition validation
- [ ] Implement `GET /v1/status` endpoint for commit request status lookup by request_id - [ ] Implement `GET /v1/status` endpoint for commit request status lookup by request_id
@@ -23,6 +26,7 @@
- [ ] Implement `DELETE /v1/retention/<policy_id>` endpoint for retention policy removal - [ ] Implement `DELETE /v1/retention/<policy_id>` endpoint for retention policy removal
### Infrastructure & Tooling ### Infrastructure & Tooling
- [x] Implement thread-safe Prometheus metrics library and serve `GET /metrics` endpoint - [x] Implement thread-safe Prometheus metrics library and serve `GET /metrics` endpoint
- [ ] Implement gperf-based HTTP routing for efficient request dispatching - [ ] Implement gperf-based HTTP routing for efficient request dispatching
- [ ] Replace nlohmann/json with simdjson DOM API in parser comparison benchmarks - [ ] Replace nlohmann/json with simdjson DOM API in parser comparison benchmarks
@@ -54,6 +58,7 @@
- [ ] Implement `DeleteObjects` for batch object deletion - [ ] Implement `DeleteObjects` for batch object deletion
### Client Libraries ### Client Libraries
- [ ] Implement high-level Python client library for WeaselDB REST API - [ ] Implement high-level Python client library for WeaselDB REST API
- [ ] Wrap `/v1/version`, `/v1/commit`, `/v1/status` endpoints - [ ] Wrap `/v1/version`, `/v1/commit`, `/v1/status` endpoints
- [ ] Handle `/v1/subscribe` SSE streaming with reconnection logic - [ ] Handle `/v1/subscribe` SSE streaming with reconnection logic
@@ -64,6 +69,7 @@
- [ ] Provide CLI tooling for database administration - [ ] Provide CLI tooling for database administration
### Testing & Validation ### Testing & Validation
- [ ] Build out-of-process API test suite using client library over real TCP - [ ] Build out-of-process API test suite using client library over real TCP
- [ ] Test all `/v1/version`, `/v1/commit`, `/v1/status` endpoints - [ ] Test all `/v1/version`, `/v1/commit`, `/v1/status` endpoints
- [ ] Test `/v1/subscribe` Server-Sent Events streaming - [ ] Test `/v1/subscribe` Server-Sent Events streaming
@@ -79,6 +85,6 @@
- [x] Built streaming JSON parser for commit requests with high-performance parsing - [x] Built streaming JSON parser for commit requests with high-performance parsing
- [x] Implemented HTTP server with multi-threaded networking using multiple epoll instances - [x] Implemented HTTP server with multi-threaded networking using multiple epoll instances
- [x] Created threading model with pipeline for serial request processing for optimistic concurrency control - [x] Created threading model with pipeline for serial request processing for optimistic concurrency control
- [x] Designed connection ownership transfer system to enable the serial processing model - [x] Implemented server-owned connection model with WeakRef pattern for safe concurrent access
- [x] Implemented arena-per-connection memory model for clean memory lifetime management - [x] Implemented arena-per-connection memory model for clean memory lifetime management
- [x] Built TOML configuration system for server settings - [x] Built TOML configuration system for server settings

View File

@@ -297,10 +297,10 @@ struct Connection {
} }
} }
bool writeBytes() { bool write_bytes() {
for (;;) { for (;;) {
assert(!request.empty()); assert(!request.empty());
int w = write(fd, request.data(), request.size()); int w = send(fd, request.data(), request.size(), MSG_NOSIGNAL);
if (w == -1) { if (w == -1) {
if (errno == EINTR) { if (errno == EINTR) {
continue; continue;
@@ -610,7 +610,6 @@ int main(int argc, char *argv[]) {
} }
printf("\n"); printf("\n");
signal(SIGPIPE, SIG_IGN);
signal(SIGTERM, signal_handler); signal(SIGTERM, signal_handler);
signal(SIGINT, signal_handler); signal(SIGINT, signal_handler);
@@ -673,7 +672,7 @@ int main(int argc, char *argv[]) {
continue; // Let unique_ptr destructor clean up continue; // Let unique_ptr destructor clean up
} }
if (events[i].events & EPOLLOUT) { if (events[i].events & EPOLLOUT) {
bool finished = conn->writeBytes(); bool finished = conn->write_bytes();
if (conn->error) { if (conn->error) {
continue; continue;
} }
@@ -749,14 +748,14 @@ int main(int argc, char *argv[]) {
// Try to write once in the connect thread before handing off to network // Try to write once in the connect thread before handing off to network
// threads // threads
assert(conn->has_messages()); assert(conn->has_messages());
bool writeFinished = conn->writeBytes(); bool write_finished = conn->write_bytes();
if (conn->error) { if (conn->error) {
continue; // Connection failed, destructor will clean up continue; // Connection failed, destructor will clean up
} }
// Determine the appropriate epoll events based on write result // Determine the appropriate epoll events based on write result
struct epoll_event event{}; struct epoll_event event{};
if (writeFinished) { if (write_finished) {
// All data was written, wait for response // All data was written, wait for response
int shutdown_result = shutdown(conn->fd, SHUT_WR); int shutdown_result = shutdown(conn->fd, SHUT_WR);
if (shutdown_result == -1) { if (shutdown_result == -1) {