Compare commits

...

25 Commits

Author SHA1 Message Date
50e27cced8 Add more TODOs 2025-08-29 17:15:36 -04:00
d2762dc8da Add TODO 2025-08-29 17:05:51 -04:00
5592d065de Actually have contention in benchmark 2025-08-29 17:05:21 -04:00
91e799aae8 Use plain arrays and atomic read with intrinsics for render 2025-08-29 15:10:10 -04:00
4fc277393e Use std::latch sync in benchmarks too 2025-08-29 14:29:15 -04:00
a5776004de Update potential misunderstanding about thread safety 2025-08-29 14:08:41 -04:00
62b37c067c Metrics implementation, WIP 2025-08-29 13:43:03 -04:00
fac0d20ae1 Finish metrics design, I think 2025-08-29 11:51:40 -04:00
e3a2ddbbfb Validation + callback api 2025-08-29 11:31:06 -04:00
b6d4ae2862 Initialize atomics in metrics, update style guide on atomics 2025-08-29 10:52:26 -04:00
1133d1e365 Use std::bit_cast, document that gauge mutex is an implementation detail 2025-08-29 10:45:19 -04:00
de5adb54d2 Flesh out metrics architecture more 2025-08-29 10:40:19 -04:00
d0f2b6550a More scaffolding 2025-08-28 17:32:34 -04:00
ca5b299da8 Make MetricKey hashable 2025-08-28 17:10:56 -04:00
9c89eba6c8 Metrics system scaffold 2025-08-28 17:04:53 -04:00
ed6e6ea9fe Output trailing : for konsole integration workaround 2025-08-28 14:45:40 -04:00
c97920c473 format utility improvements 2025-08-28 14:40:01 -04:00
7808896226 Add format benchmarks 2025-08-28 14:20:27 -04:00
404b491880 Add documentation 2025-08-28 14:05:45 -04:00
bc0d5a7422 Add format utility 2025-08-28 14:01:43 -04:00
6fb57619c5 Remove inaccurate "zero-{copy,allocation}" claims 2025-08-28 13:40:05 -04:00
f46a98249f Change to loop_iterations 2025-08-28 13:34:52 -04:00
a73a463936 Fix Arena realloc bug 2025-08-28 13:27:53 -04:00
a32356e298 Add ArenaVector 2025-08-28 13:27:21 -04:00
3d61408976 Use precise memory orderings in load_tester 2025-08-27 18:13:28 -04:00
22 changed files with 3698 additions and 45 deletions

View File

@@ -112,6 +112,7 @@ set(SOURCES
src/json_commit_request_parser.cpp
src/http_handler.cpp
src/arena_allocator.cpp
src/format.cpp
${CMAKE_BINARY_DIR}/json_tokens.cpp)
add_executable(weaseldb ${SOURCES})
@@ -133,7 +134,7 @@ target_include_directories(test_data PUBLIC benchmarks)
target_link_libraries(test_data simdutf::simdutf)
add_executable(test_arena_allocator tests/test_arena_allocator.cpp
src/arena_allocator.cpp)
src/arena_allocator.cpp src/format.cpp)
target_link_libraries(test_arena_allocator doctest::doctest)
target_include_directories(test_arena_allocator PRIVATE src)
target_compile_options(test_arena_allocator PRIVATE -UNDEBUG)
@@ -185,6 +186,17 @@ target_compile_definitions(test_server_connection_return
PRIVATE DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN)
target_compile_options(test_server_connection_return PRIVATE -UNDEBUG)
# Metrics system test
add_executable(test_metric tests/test_metric.cpp src/metric.cpp
src/arena_allocator.cpp src/format.cpp)
target_link_libraries(test_metric doctest::doctest Threads::Threads
simdutf::simdutf weaseljson)
target_include_directories(test_metric PRIVATE src)
target_compile_options(test_metric PRIVATE -UNDEBUG)
# Register with CTest
add_test(NAME metric_tests COMMAND test_metric)
add_executable(bench_arena_allocator benchmarks/bench_arena_allocator.cpp
src/arena_allocator.cpp)
target_link_libraries(bench_arena_allocator nanobench)
@@ -216,6 +228,21 @@ add_executable(bench_thread_pipeline benchmarks/bench_thread_pipeline.cpp)
target_link_libraries(bench_thread_pipeline nanobench Threads::Threads)
target_include_directories(bench_thread_pipeline PRIVATE src)
add_executable(bench_format_comparison benchmarks/bench_format_comparison.cpp
src/arena_allocator.cpp src/format.cpp)
target_link_libraries(bench_format_comparison nanobench)
target_include_directories(bench_format_comparison PRIVATE src)
# Metrics system benchmark
add_executable(bench_metric benchmarks/bench_metric.cpp src/metric.cpp
src/arena_allocator.cpp src/format.cpp)
target_link_libraries(bench_metric nanobench Threads::Threads simdutf::simdutf
weaseljson)
target_include_directories(bench_metric PRIVATE src)
# Register benchmark with CTest
add_test(NAME metric_benchmarks COMMAND bench_metric)
# Debug tools
add_executable(
debug_arena tools/debug_arena.cpp src/json_commit_request_parser.cpp
@@ -237,3 +264,4 @@ add_test(NAME arena_allocator_benchmarks COMMAND bench_arena_allocator)
add_test(NAME commit_request_benchmarks COMMAND bench_commit_request)
add_test(NAME parser_comparison_benchmarks COMMAND bench_parser_comparison)
add_test(NAME thread_pipeline_benchmarks COMMAND bench_thread_pipeline)
add_test(NAME format_comparison_benchmarks COMMAND bench_format_comparison)

View File

@@ -0,0 +1,274 @@
#include "arena_allocator.hpp"
#include "format.hpp"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <nanobench.h>
#include <sstream>
#include <string>
#include <version>
#if __cpp_lib_format >= 201907L
#include <format>
#define HAS_STD_FORMAT 1
#else
#define HAS_STD_FORMAT 0
#endif
// Test data for consistent benchmarks
constexpr int TEST_INT = 42;
constexpr double TEST_DOUBLE = 3.14159;
const std::string TEST_STRING = "Hello World";
// Benchmark simple string concatenation: "Hello " + "World" + "!"
void benchmark_simple_concatenation() {
std::cout << "\n=== Simple String Concatenation: 'Hello World!' ===\n";
ankerl::nanobench::Bench bench;
bench.title("Simple Concatenation").unit("op").warmup(100).epochs(1000);
// Arena-based static_format
bench.run("static_format", [&] {
ArenaAllocator arena(64);
auto result = static_format(arena, "Hello ", "World", "!");
ankerl::nanobench::doNotOptimizeAway(result);
});
// Arena-based format
bench.run("format", [&] {
ArenaAllocator arena(64);
auto result = format(arena, "Hello %s!", "World");
ankerl::nanobench::doNotOptimizeAway(result);
});
// std::stringstream
bench.run("std::stringstream", [&] {
std::stringstream ss;
ss << "Hello " << "World" << "!";
auto result = ss.str();
ankerl::nanobench::doNotOptimizeAway(result);
});
#if HAS_STD_FORMAT
// std::format (C++20)
bench.run("std::format", [&] {
auto result = std::format("Hello {}!", "World");
ankerl::nanobench::doNotOptimizeAway(result);
});
#endif
// Raw snprintf with malloc
bench.run("snprintf + malloc", [&] {
char buffer[64];
int len = std::snprintf(buffer, sizeof(buffer), "Hello %s!", "World");
char *result = static_cast<char *>(std::malloc(len + 1));
std::memcpy(result, buffer, len + 1);
ankerl::nanobench::doNotOptimizeAway(result);
std::free(result);
});
}
// Benchmark mixed type formatting: "Count: 42, Rate: 3.14159"
void benchmark_mixed_types() {
std::cout << "\n=== Mixed Type Formatting: 'Count: 42, Rate: 3.14159' ===\n";
ankerl::nanobench::Bench bench;
bench.title("Mixed Types").unit("op").warmup(100).epochs(1000);
// Arena-based static_format
bench.run("static_format", [&] {
ArenaAllocator arena(128);
auto result =
static_format(arena, "Count: ", TEST_INT, ", Rate: ", TEST_DOUBLE);
ankerl::nanobench::doNotOptimizeAway(result);
});
// Arena-based format
bench.run("format", [&] {
ArenaAllocator arena(128);
auto result = format(arena, "Count: %d, Rate: %.5f", TEST_INT, TEST_DOUBLE);
ankerl::nanobench::doNotOptimizeAway(result);
});
// std::stringstream
bench.run("std::stringstream", [&] {
std::stringstream ss;
ss << "Count: " << TEST_INT << ", Rate: " << std::fixed
<< std::setprecision(5) << TEST_DOUBLE;
auto result = ss.str();
ankerl::nanobench::doNotOptimizeAway(result);
});
#if HAS_STD_FORMAT
// std::format (C++20)
bench.run("std::format", [&] {
auto result = std::format("Count: {}, Rate: {:.5f}", TEST_INT, TEST_DOUBLE);
ankerl::nanobench::doNotOptimizeAway(result);
});
#endif
// Raw snprintf with malloc
bench.run("snprintf + malloc", [&] {
char buffer[128];
int len = std::snprintf(buffer, sizeof(buffer), "Count: %d, Rate: %.5f",
TEST_INT, TEST_DOUBLE);
char *result = static_cast<char *>(std::malloc(len + 1));
std::memcpy(result, buffer, len + 1);
ankerl::nanobench::doNotOptimizeAway(result);
std::free(result);
});
}
// Benchmark complex formatting with precision and alignment
void benchmark_complex_formatting() {
std::cout << "\n=== Complex Formatting: '%-10s %5d %8.2f' ===\n";
ankerl::nanobench::Bench bench;
bench.title("Complex Formatting").unit("op").warmup(100).epochs(1000);
// Arena-based format (static_format doesn't support printf specifiers)
bench.run("format", [&] {
ArenaAllocator arena(128);
auto result = format(arena, "%-10s %5d %8.2f", TEST_STRING.c_str(),
TEST_INT, TEST_DOUBLE);
ankerl::nanobench::doNotOptimizeAway(result);
});
// std::stringstream
bench.run("std::stringstream", [&] {
std::stringstream ss;
ss << std::left << std::setw(10) << TEST_STRING << " " << std::right
<< std::setw(5) << TEST_INT << " " << std::setw(8) << std::fixed
<< std::setprecision(2) << TEST_DOUBLE;
auto result = ss.str();
ankerl::nanobench::doNotOptimizeAway(result);
});
#if HAS_STD_FORMAT
// std::format (C++20)
bench.run("std::format", [&] {
auto result = std::format("{:<10} {:>5} {:>8.2f}", TEST_STRING, TEST_INT,
TEST_DOUBLE);
ankerl::nanobench::doNotOptimizeAway(result);
});
#endif
// Raw snprintf with malloc
bench.run("snprintf + malloc", [&] {
char buffer[128];
int len = std::snprintf(buffer, sizeof(buffer), "%-10s %5d %8.2f",
TEST_STRING.c_str(), TEST_INT, TEST_DOUBLE);
char *result = static_cast<char *>(std::malloc(len + 1));
std::memcpy(result, buffer, len + 1);
ankerl::nanobench::doNotOptimizeAway(result);
std::free(result);
});
}
// Benchmark error message formatting (common use case)
void benchmark_error_messages() {
std::cout << "\n=== Error Message Formatting: 'Error 404: File not found "
"(line 123)' ===\n";
ankerl::nanobench::Bench bench;
bench.title("Error Messages").unit("op").warmup(100).epochs(1000);
constexpr int error_code = 404;
constexpr int line_number = 123;
const std::string error_msg = "File not found";
// Arena-based static_format (using string literals only)
bench.run("static_format", [&] {
ArenaAllocator arena(128);
auto result = static_format(arena, "Error ", error_code, ": ",
"File not found", " (line ", line_number, ")");
ankerl::nanobench::doNotOptimizeAway(result);
});
// Arena-based format
bench.run("format", [&] {
ArenaAllocator arena(128);
auto result = format(arena, "Error %d: %s (line %d)", error_code,
error_msg.c_str(), line_number);
ankerl::nanobench::doNotOptimizeAway(result);
});
// std::stringstream
bench.run("std::stringstream", [&] {
std::stringstream ss;
ss << "Error " << error_code << ": " << error_msg << " (line "
<< line_number << ")";
auto result = ss.str();
ankerl::nanobench::doNotOptimizeAway(result);
});
#if HAS_STD_FORMAT
// std::format (C++20)
bench.run("std::format", [&] {
auto result = std::format("Error {}: {} (line {})", error_code, error_msg,
line_number);
ankerl::nanobench::doNotOptimizeAway(result);
});
#endif
}
// Benchmark memory allocation overhead by testing arena reuse
void benchmark_memory_reuse() {
std::cout << "\n=== Memory Allocation Patterns ===\n";
ankerl::nanobench::Bench bench;
bench.title("Memory Patterns").unit("op").warmup(100).epochs(100);
// Arena with fresh allocation each time (realistic usage)
bench.run("fresh arena", [&] {
ArenaAllocator arena(128);
auto result = format(arena, "Test %d: %s %.2f", TEST_INT,
TEST_STRING.c_str(), TEST_DOUBLE);
ankerl::nanobench::doNotOptimizeAway(result);
});
// Pre-allocated arena (reuse scenario)
ArenaAllocator shared_arena(1024);
bench.run("reused arena", [&] {
auto result = format(shared_arena, "Test %d: %s %.2f", TEST_INT,
TEST_STRING.c_str(), TEST_DOUBLE);
ankerl::nanobench::doNotOptimizeAway(result);
});
// Fresh std::string allocations
bench.run("std::stringstream", [&] {
std::stringstream ss;
ss << "Test " << TEST_INT << ": " << TEST_STRING << " " << std::fixed
<< std::setprecision(2) << TEST_DOUBLE;
auto result = ss.str();
ankerl::nanobench::doNotOptimizeAway(result);
});
}
int main() {
std::cout << "Format Function Benchmark Comparison\n";
std::cout << "====================================\n";
#if HAS_STD_FORMAT
std::cout << "C++20 std::format: Available\n";
#else
std::cout << "C++20 std::format: Not available\n";
#endif
benchmark_simple_concatenation();
benchmark_mixed_types();
benchmark_complex_formatting();
benchmark_error_messages();
benchmark_memory_reuse();
std::cout << "\n=== Summary ===\n";
std::cout
<< "* static_format: Best for simple concatenation with known types\n";
std::cout
<< "* format: Best for printf-style formatting with arena allocation\n";
std::cout
<< "* std::stringstream: Flexible but slower due to heap allocation\n";
std::cout << "* std::format: Modern C++20 alternative (if available)\n";
std::cout << "* snprintf + malloc: Low-level but requires manual memory "
"management\n";
return 0;
}

benchmarks/bench_metric.cpp Normal file (256 lines)
View File

@@ -0,0 +1,256 @@
#include <nanobench.h>
#include "arena_allocator.hpp"
#include "metric.hpp"
#include <atomic>
#include <cmath>
#include <latch>
#include <memory>
#include <random>
#include <string>
#include <thread>
#include <vector>
// High-contention benchmark setup
struct ContentionEnvironment {
// Background threads for contention
std::vector<std::thread> background_threads;
std::atomic<bool> stop_flag{false};
// Synchronization latches - must be members to avoid use-after-return
std::unique_ptr<std::latch> contention_latch;
std::unique_ptr<std::latch> render_latch;
metric::Family<metric::Gauge> gauge_family =
metric::create_gauge("gauge", "");
metric::Family<metric::Counter> counter_family =
metric::create_counter("counter", "");
metric::Family<metric::Histogram> histogram_family = metric::create_histogram(
"histogram", "", metric::exponential_buckets(0.001, 5, 7));
ContentionEnvironment() = default;
void start_background_contention(int num_threads = 4) {
stop_flag.store(false);
contention_latch = std::make_unique<std::latch>(num_threads + 1);
for (int i = 0; i < num_threads; ++i) {
background_threads.emplace_back([this, i]() {
auto bg_counter = counter_family.create({});
auto bg_gauge = gauge_family.create({});
auto bg_histogram = histogram_family.create({});
std::mt19937 rng(i);
std::uniform_real_distribution<double> dist(0.0, 10.0);
contention_latch
->arrive_and_wait(); // All background threads start together
while (!stop_flag.load(std::memory_order_relaxed)) {
// Simulate mixed workload
bg_counter.inc(1.0);
bg_gauge.set(dist(rng));
bg_gauge.inc(1.0);
bg_histogram.observe(dist(rng));
}
});
}
contention_latch
->arrive_and_wait(); // Wait for all background threads to be ready
}
void start_render_thread() {
render_latch = std::make_unique<std::latch>(2);
background_threads.emplace_back([this]() {
ArenaAllocator arena;
render_latch->arrive_and_wait(); // Render thread signals it's ready
while (!stop_flag.load(std::memory_order_relaxed)) {
auto output = metric::render(arena);
static_cast<void>(output); // Suppress unused variable warning
arena.reset();
}
});
render_latch->arrive_and_wait(); // Wait for render thread to be ready
}
void stop_background_threads() {
stop_flag.store(true);
for (auto &t : background_threads) {
if (t.joinable()) {
t.join();
}
}
background_threads.clear();
}
~ContentionEnvironment() { stop_background_threads(); }
};
int main() {
ankerl::nanobench::Bench bench;
bench.title("WeaselDB Metrics Performance").unit("operation").warmup(1000);
metric::Family<metric::Gauge> gauge_family =
metric::create_gauge("gauge", "");
metric::Family<metric::Counter> counter_family =
metric::create_counter("counter", "");
metric::Family<metric::Histogram> histogram_family = metric::create_histogram(
"histogram", "", metric::exponential_buckets(0.001, 5, 7));
auto counter = counter_family.create({});
auto gauge = gauge_family.create({});
auto histogram = histogram_family.create({});
// Baseline performance without contention
{
bench.run("counter.inc() - no contention", [&]() {
counter.inc(1.0);
ankerl::nanobench::doNotOptimizeAway(counter);
});
bench.run("gauge.inc() - no contention", [&]() {
gauge.inc(1.0);
ankerl::nanobench::doNotOptimizeAway(gauge);
});
bench.run("gauge.set() - no contention", [&]() {
gauge.set(42.0);
ankerl::nanobench::doNotOptimizeAway(gauge);
});
bench.run("histogram.observe() - no contention", [&]() {
histogram.observe(0.5);
ankerl::nanobench::doNotOptimizeAway(histogram);
});
}
// High contention with background threads
{
ContentionEnvironment env;
// Start background threads creating contention
env.start_background_contention(8);
bench.run("counter.inc() - 8 background threads",
[&]() { counter.inc(1.0); });
bench.run("gauge.inc() - 8 background threads", [&]() { gauge.inc(1.0); });
bench.run("gauge.set() - 8 background threads", [&]() { gauge.set(42.0); });
bench.run("histogram.observe() - 8 background threads",
[&]() { histogram.observe(1.5); });
}
// Concurrent render contention
{
ContentionEnvironment env;
// Start background threads + render thread
env.start_background_contention(4);
env.start_render_thread();
bench.run("counter.inc() - with concurrent render",
[&]() { counter.inc(1.0); });
bench.run("gauge.inc() - with concurrent render",
[&]() { gauge.inc(1.0); });
bench.run("histogram.observe() - with concurrent render",
[&]() { histogram.observe(2.0); });
}
// Render performance scaling
{
// Test render performance as number of metrics increases
std::vector<metric::Counter> counters;
std::vector<metric::Gauge> gauges;
std::vector<metric::Histogram> histograms;
auto counter_family =
metric::create_counter("scale_counter", "Scale counter");
auto gauge_family = metric::create_gauge("scale_gauge", "Scale gauge");
auto histogram_family = metric::create_histogram(
"scale_histogram", "Scale histogram",
std::initializer_list<double>{0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0});
// Create varying numbers of metrics
for (int scale : {10, 100, 1000}) {
// Note: metrics accumulate across scales because families are global and
// registered metrics persist for the application lifetime
for (int i = 0; i < scale; ++i) {
counters.emplace_back(
counter_family.create({{"id", std::to_string(i)}}));
gauges.emplace_back(gauge_family.create({{"id", std::to_string(i)}}));
histograms.emplace_back(
histogram_family.create({{"id", std::to_string(i)}}));
// Set some values
counters.back().inc(static_cast<double>(i));
gauges.back().set(static_cast<double>(i * 2));
histograms.back().observe(static_cast<double>(i) * 0.1);
}
ArenaAllocator arena;
std::string bench_name =
"render() - " + std::to_string(scale) + " metrics each type";
bench.run(bench_name, [&]() {
auto output = metric::render(arena);
ankerl::nanobench::doNotOptimizeAway(output);
arena.reset();
});
}
}
// Callback metrics performance
{
auto counter_family =
metric::create_counter("callback_counter", "Callback counter");
auto gauge_family =
metric::create_gauge("callback_gauge", "Callback gauge");
std::atomic<double> counter_value{0};
std::atomic<double> gauge_value{100};
// Register callbacks
counter_family.register_callback(
{{"type", "callback"}}, [&counter_value]() {
return counter_value.load(std::memory_order_relaxed);
});
gauge_family.register_callback({{"type", "callback"}}, [&gauge_value]() {
return gauge_value.load(std::memory_order_relaxed);
});
// Background thread updating callback values
std::atomic<bool> stop_callback{false};
std::latch start_latch{2}; // Background thread + benchmark thread
std::thread callback_updater([&]() {
start_latch.arrive_and_wait(); // Wait for benchmark to start
while (!stop_callback.load()) {
counter_value.fetch_add(1);
gauge_value.store(gauge_value.load() + 1);
}
});
ArenaAllocator arena;
start_latch.arrive_and_wait(); // Wait for background thread to be ready
bench.run("render() - with callback metrics", [&]() {
auto output = metric::render(arena);
ankerl::nanobench::doNotOptimizeAway(output);
arena.reset();
});
stop_callback.store(true);
callback_updater.join();
}
return 0;
}

View File

@@ -789,7 +789,7 @@ int main() {
std::cout << "\nBenchmark completed. The WeaselDB parser is optimized for:\n";
std::cout << "- Arena-based memory allocation for reduced fragmentation\n";
std::cout << "- Streaming parsing for network protocols\n";
std::cout << "- Zero-copy string handling with string views\n";
std::cout << "- String views to minimize unnecessary copying\n";
std::cout << "- Base64 decoding integrated into parsing pipeline\n";
std::cout << "- Efficient reset and reuse for high-throughput scenarios\n";

View File

@@ -1,12 +1,12 @@
#include <nanobench.h>
#include "../src/loop_iterations.h"
#include "../src/loop_iterations.hpp"
int main() {
ankerl::nanobench::Bench bench;
bench.minEpochIterations(100000);
bench.run("volatile loop to " + std::to_string(loopIterations), [&] {
for (volatile int i = 0; i < loopIterations; i = i + 1)
bench.run("volatile loop to " + std::to_string(loop_iterations), [&] {
for (volatile int i = 0; i < loop_iterations; i = i + 1)
;
});

View File

@@ -23,7 +23,7 @@ WeaselDB is a high-performance write-side database component designed for system
- **High-performance JSON parsing** with streaming support and SIMD optimization
- **Multi-threaded networking** using multiple epoll instances with unified I/O thread pool
- **Configurable epoll instances** to eliminate kernel-level contention
- **Zero-copy design** throughout the pipeline
- **Optimized memory management** with arena allocation and efficient copying
- **Factory pattern safety** ensuring correct object lifecycle management
---
@@ -152,7 +152,7 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
- **Arena-backed string storage** with efficient memory management
- **Move-only semantics** for optimal performance
- **Builder pattern** for constructing commit requests
- **Zero-copy string views** pointing to arena-allocated memory
- **String views** pointing to arena-allocated memory to avoid unnecessary copying
#### **Configuration & Optimization**
@@ -242,7 +242,7 @@ The system implements a RESTful API. See [api.md](api.md) for comprehensive API
1. **Performance-first** - Every component optimized for high throughput
2. **Scalable concurrency** - Multiple epoll instances eliminate kernel contention
3. **Memory efficiency** - Arena allocation eliminates fragmentation
4. **Zero-copy** - Minimize data copying throughout pipeline
4. **Efficient copying** - Minimize unnecessary copies while accepting required ones
5. **Streaming-ready** - Support incremental processing
6. **Type safety** - Compile-time validation where possible
7. **Resource management** - RAII and move semantics throughout
@@ -437,7 +437,7 @@ public:
#### Arena-Based String Handling
```cpp
// Preferred: Zero-copy string view with arena allocation
// Preferred: String view with arena allocation to minimize copying
std::string_view process_json_key(const char* data, ArenaAllocator& arena);
// Avoid: Unnecessary string copies

View File

@@ -77,32 +77,29 @@ void *ArenaAllocator::realloc_raw(void *ptr, uint32_t old_size,
assert(current_block_ &&
"realloc called with non-null ptr but no current block exists");
// Assert that offset is large enough (should always be true for
// valid callers)
assert(current_block_->offset >= old_size &&
"offset must be >= old_size for valid last allocation");
if (current_block_->offset >= old_size) {
// Check if this was the last allocation by comparing with expected location
char *expected_last_alloc_start =
current_block_->data() + current_block_->offset - old_size;
// Check if this was the last allocation by comparing with expected location
char *expected_last_alloc_start =
current_block_->data() + current_block_->offset - old_size;
if (ptr == expected_last_alloc_start) {
// This is indeed the last allocation
if (new_size > old_size) {
// Growing - check if we have space
size_t additional_space_needed = new_size - old_size;
if (ptr == expected_last_alloc_start) {
// This is indeed the last allocation
if (new_size > old_size) {
// Growing - check if we have space
size_t additional_space_needed = new_size - old_size;
if (current_block_->offset + additional_space_needed <=
current_block_->size) {
// We can extend in place
current_block_->offset += additional_space_needed;
return ptr;
if (current_block_->offset + additional_space_needed <=
current_block_->size) {
// We can extend in place
current_block_->offset += additional_space_needed;
return ptr;
}
} else {
// Shrinking - just update the offset
size_t space_to_free = old_size - new_size;
current_block_->offset -= space_to_free;
return new_size == 0 ? nullptr : ptr;
}
} else {
// Shrinking - just update the offset
size_t space_to_free = old_size - new_size;
current_block_->offset -= space_to_free;
return new_size == 0 ? nullptr : ptr;
}
}

View File

@@ -277,10 +277,14 @@ public:
* @brief Type-safe version of realloc_raw for arrays of type T.
*
* @param ptr Pointer to the existing allocation (must be from this allocator)
* If nullptr, behaves like allocate<T>(new_size)
* @param old_size Size of the existing allocation in number of T objects
* Ignored if ptr is nullptr
* @param new_size Desired new size in number of T objects
* @return Pointer to the reallocated memory (may be the same as ptr or
* different)
* @note Follows standard realloc() semantics: realloc(nullptr, size) ==
* malloc(size)
* @note Prints error to stderr and calls std::abort() if memory allocation
* fails or size overflow occurs
*/
@@ -430,6 +434,40 @@ public:
return current_block_ ? current_block_->size - current_block_->offset : 0;
}
/**
* @brief Get all available space in the current block and claim it
* immediately.
*
* This method returns a pointer to all remaining space in the current block
* and immediately marks it as used in the arena. The caller should use
* realloc() to shrink the allocation to the actual amount needed.
*
* If no block exists or current block is full, creates a new block.
*
* @return Pointer to allocated space and the number of bytes allocated
* @note The caller must call realloc() to return unused space
* @note This is designed for speculative operations like printf formatting
* @note Postcondition: always returns at least 1 byte
*/
struct AllocatedSpace {
char *ptr;
size_t allocated_bytes;
};
AllocatedSpace allocate_remaining_space() {
if (!current_block_ || available_in_current_block() == 0) {
add_block(initial_block_size_);
}
char *allocated_ptr = current_block_->data() + current_block_->offset;
size_t available = available_in_current_block();
// Claim all remaining space
current_block_->offset = current_block_->size;
return {allocated_ptr, available};
}
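// Illustrative sketch of the claim-then-shrink pattern described above
// (hypothetical caller code, not part of this header): claim all remaining
// space, format speculatively, then shrink to the bytes actually used.
//
//   auto [buf, cap] = arena.allocate_remaining_space();
//   int n = std::snprintf(buf, cap, "value=%d", 42);
//   if (n >= 0 && static_cast<size_t>(n) < cap) {
//     // keep only the bytes used (plus the terminating NUL)
//     arena.realloc(buf, cap, static_cast<size_t>(n) + 1);
//   }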
/**
* @brief Get the total number of blocks in the allocator.
*
@@ -575,3 +613,44 @@ public:
template <typename U> friend class ArenaStlAllocator;
};
/// Simple arena-aware vector that doesn't have a destructor
/// Safe to return as span because both the vector and its data are
/// arena-allocated Uses arena's realloc() for efficient growth without copying
/// when possible
template <typename T> struct ArenaVector {
explicit ArenaVector(ArenaAllocator *arena)
: arena_(arena), data_(nullptr), size_(0), capacity_(0) {}
void push_back(const T &item) {
if (size_ >= capacity_) {
grow();
}
data_[size_++] = item;
}
T *data() { return data_; }
const T *data() const { return data_; }
size_t size() const { return size_; }
bool empty() const { return size_ == 0; }
T &operator[](size_t index) { return data_[index]; }
const T &operator[](size_t index) const { return data_[index]; }
// No destructor - arena cleanup handles memory
private:
void grow() {
size_t new_capacity = capacity_ == 0 ? 8 : capacity_ * 2;
// arena.realloc() handles nullptr like standard realloc() - acts like
// malloc() This avoids copying when growing in-place is possible
data_ = arena_->realloc(data_, capacity_, new_capacity);
capacity_ = new_capacity;
}
ArenaAllocator *arena_;
T *data_;
size_t size_;
size_t capacity_;
};
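// Minimal usage sketch for ArenaVector (hypothetical caller code, not part
// of this header; both the vector object and its storage live in the arena,
// which is why no destructor is needed):
//
//   ArenaAllocator arena(1024);
//   ArenaVector<int> values(&arena);
//   values.push_back(1);
//   values.push_back(2);
//   std::span<int> view(values.data(), values.size());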

View File

@@ -70,7 +70,7 @@ struct Connection {
* asynchronously by the server's I/O threads using efficient vectored
* I/O.
*
* @param s The data to send (string view for zero-copy efficiency)
* @param s The data to send (string view parameter for efficiency)
* @param copy_to_arena If true (default), copies data to the connection's
* arena for safe storage. If false, the caller must ensure the data remains
* valid until all queued messages are sent.

src/format.cpp Normal file (1010 lines)

File diff suppressed because it is too large.

src/format.hpp Normal file (276 lines)
View File

@@ -0,0 +1,276 @@
#pragma once
#include <concepts>
#include <cstdarg>
#include <cstdio>
#include <cstring>
#include <string_view>
#include <type_traits>
#include "arena_allocator.hpp"
/**
* @brief Runtime printf-style formatting with arena allocation optimization.
*
* This function provides familiar printf-style formatting with intelligent
* optimization for arena allocation. It attempts single-pass formatting by
* speculatively using available arena space, falling back to two-pass
* formatting only when necessary.
*
* The function uses an optimized allocation strategy:
* 1. **Single-pass attempt**: Try to format directly into available arena space
* 2. **Fallback to two-pass**: If formatting doesn't fit, measure required size
* and allocate exactly what's needed
*
* ## Supported Format Specifiers:
* All standard printf format specifiers are supported:
* - **Integers**: %d, %i, %u, %x, %X, %o, %ld, %lld, etc.
* - **Floating point**: %f, %e, %E, %g, %G, %.2f, etc.
* - **Strings**: %s, %.*s, etc.
* - **Characters**: %c
* - **Pointers**: %p
* - **Width/precision**: %10d, %-10s, %.2f, %*.*s, etc.
*
* ## Performance Characteristics:
* - **Optimistic single-pass**: Often avoids the cost of measuring format size
* - **Arena allocation**: Uses fast arena allocation (~1ns vs ~20-270ns for
* malloc)
* - **Memory efficient**: Returns unused space to arena via realloc()
* - **Fallback safety**: Two-pass approach handles any format that doesn't fit
*
* @param arena Arena allocator for memory management
* @param fmt Printf-style format string
* @param ... Variable arguments matching format specifiers
* @return std::string_view pointing to arena-allocated formatted string
* @note Aborts program on formatting errors (never returns invalid data)
* @note GCC format attribute enables compile-time format string validation
*
* ## Usage Examples:
* ```cpp
* ArenaAllocator arena(1024);
*
* // Basic formatting
* auto msg = format(arena, "Hello %s!", "World");
* // msg == "Hello World!"
*
* // Numeric formatting with precision
* auto value = format(arena, "Pi: %.3f", 3.14159);
* // value == "Pi: 3.142"
*
* // Mixed types with width/alignment
* auto table = format(arena, "%-10s %5d %8.2f", "Item", 42, 99.95);
* // table == "Item 42 99.95"
*
* // Error messages
* auto error = format(arena, "Error %d: %s (line %d)", 404, "Not found", 123);
* // error == "Error 404: Not found (line 123)"
* ```
*
* ## When to Use:
* - **Printf familiarity**: When you prefer printf-style format strings
* - **Runtime flexibility**: Format strings from variables, config, or user
* input
* - **Complex formatting**: Precision, width, alignment, padding
* - **Debugging**: Quick formatted output for logging/debugging
* - **Mixed precision**: Different numeric precision requirements
*
* ## When to Use static_format() Instead:
* - **Hot paths**: Performance-critical code where every nanosecond counts
* - **Simple concatenation**: Basic string + number + string combinations
* - **Compile-time optimization**: When all types/values known at compile time
* - **Template contexts**: Where compile-time buffer sizing is beneficial
*
* ## Optimization Details:
* The function uses `ArenaAllocator::allocate_remaining_space()` to claim all
* available arena space and attempt formatting. If successful, it shrinks the
* allocation to the actual size used. If formatting fails (doesn't fit), it
* falls back to the traditional two-pass approach: measure size, allocate
* exactly, then format.
*
* This strategy optimizes for the common case where available arena space is
* sufficient, while maintaining correctness for all cases.
*/
std::string_view format(ArenaAllocator &arena, const char *fmt, ...)
__attribute__((format(printf, 2, 3)));
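// A minimal sketch of the optimistic single-pass strategy described above
// (illustrative only; the real definition lives in src/format.cpp, whose
// diff is suppressed in this comparison, and the helper name is hypothetical):
//
//   std::string_view format_sketch(ArenaAllocator &arena, const char *fmt,
//                                  va_list args) {
//     auto [buf, cap] = arena.allocate_remaining_space();
//     va_list copy;
//     va_copy(copy, args);
//     int n = std::vsnprintf(buf, cap, fmt, copy);   // optimistic attempt
//     va_end(copy);
//     if (n < 0) std::abort();                       // formatting error
//     if (static_cast<size_t>(n) < cap) {            // it fit: shrink in place
//       arena.realloc(buf, cap, static_cast<size_t>(n) + 1);
//       return {buf, static_cast<size_t>(n)};
//     }
//     char *exact = arena.allocate<char>(n + 1);     // fallback: exact size
//     std::vsnprintf(exact, n + 1, fmt, args);
//     return {exact, static_cast<size_t>(n)};
//   }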
namespace detail {
template <int kLen> struct StringTerm {
explicit constexpr StringTerm(const char *s) : s(s) {}
static constexpr int kMaxLength = kLen;
void write(char *&buf) const {
std::memcpy(buf, s, kLen);
buf += kLen;
}
private:
const char *s;
};
template <int kLen>
constexpr StringTerm<kLen - 1> term(const char (&array)[kLen]) {
return StringTerm<kLen - 1>{array};
}
template <class IntType> constexpr int decimal_length(IntType x) {
static_assert(std::is_integral_v<IntType>,
"decimal_length requires integral type");
if constexpr (std::is_signed_v<IntType>) {
// Handle negative values by using unsigned equivalent
using Unsigned = std::make_unsigned_t<IntType>;
// Safe conversion: cast to unsigned first, then negate in unsigned
// arithmetic
auto abs_x = x < 0 ? -static_cast<Unsigned>(x) : static_cast<Unsigned>(x);
int result = 0;
do {
++result;
abs_x /= 10;
} while (abs_x);
return result;
} else {
int result = 0;
do {
++result;
x /= 10;
} while (x);
return result;
}
}
template <std::integral IntType> struct IntTerm {
static constexpr bool kSigned = std::is_signed_v<IntType>;
using Unsigned = std::make_unsigned_t<IntType>;
explicit constexpr IntTerm(IntType v) : v(v) {}
static constexpr int kMaxLength =
decimal_length(Unsigned(-1)) + (kSigned ? 1 : 0);
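// Worked example of the bound above: for int32_t, Unsigned(-1) is
// 4294967295 (10 decimal digits), so kMaxLength is 11 including a possible
// '-' sign; for uint64_t it is 20.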
void write(char *&buf) const {
char itoa_buf[kMaxLength];
auto x = static_cast<Unsigned>(v);
if constexpr (kSigned) {
if (v < 0) {
*buf++ = '-';
x = -static_cast<Unsigned>(v);
}
}
int i = kMaxLength;
do {
itoa_buf[--i] = static_cast<char>('0' + (x % 10));
x /= 10;
} while (x);
while (i < kMaxLength) {
*buf++ = itoa_buf[i++];
}
}
private:
IntType v;
};
template <std::integral IntType> constexpr IntTerm<IntType> term(IntType s) {
return IntTerm<IntType>{s};
}
struct DoubleTerm {
explicit constexpr DoubleTerm(double s) : s(s) {}
static constexpr int kMaxLength = 24;
void write(char *&buf) const;
private:
double s;
};
// Variable template for compile-time max length access
template <typename T>
inline constexpr int max_decimal_length_v = decltype(term(T{}))::kMaxLength;
inline constexpr DoubleTerm term(double s) { return DoubleTerm(s); }
} // namespace detail
/**
* @brief Compile-time optimized formatting for high-performance code paths.
*
* This function provides ultra-fast string formatting by calculating buffer
* sizes at compile time and using specialized term handlers for each type.
* It's designed for performance-critical code where formatting overhead
* matters.
*
* Unlike the runtime `format()` function, `static_format()` processes all
* arguments at compile time to determine exact memory requirements and uses
* optimized term writers for maximum speed.
*
* ## Supported Types:
* - **String literals**: C-style string literals and arrays
* - **Integers**: All integral types (int, int64_t, uint32_t, etc.)
* - **Floating point**: double (uses high-precision Grisu2 algorithm)
* - **Custom types**: Via specialization of `detail::term()`
*
* ## Performance Characteristics:
* - **Compile-time buffer sizing**: Buffer size calculated at compile time (no
* runtime measurement)
* - **Optimized arena allocation**: Uses pre-calculated exact buffer sizes with
* arena allocator
* - **Specialized type handling**: Fast paths for common types via template
* specialization
* - **Memory efficient**: Uses arena.realloc() to return unused space to the
* arena
*
* @tparam Ts Types of the arguments to format (auto-deduced)
* @param arena Arena allocator for memory management
* @param ts Arguments to format - can be string literals, integers, doubles
* @return std::string_view pointing to arena-allocated formatted string
*
* ## Usage Examples:
* ```cpp
* ArenaAllocator arena(1024);
*
* // String concatenation
* auto result1 = static_format(arena, "Hello ", "World", "!");
* // result1 == "Hello World!"
*
* // Mixed types
* auto result2 = static_format(arena, "Count: ", 42, ", Rate: ", 3.14);
* // result2 == "Count: 42, Rate: 3.14"
*
* // Error messages
* auto error = static_format(arena, "Error ", 404, ": ", "Not found");
* // error == "Error 404: Not found"
* ```
*
* ## When to Use:
* - **Hot paths**: Performance-critical code where formatting speed matters
* - **Known types**: When argument types are known at compile time
* - **Simple formatting**: Concatenation and basic type conversion
* - **Template code**: Where compile-time optimization is beneficial
*
* ## When to Use format() Instead:
* - **Printf-style formatting**: When you need format specifiers like "%d",
* "%.2f"
* - **Runtime flexibility**: When format strings come from variables/config
* - **Complex formatting**: When you need padding, precision, etc.
* - **Convenience**: For quick debugging or non-critical paths
*
* @note All arguments are passed by forwarding reference for optimal
* performance
* @note Memory is arena-allocated and automatically sized to exact requirements
* @note Compile-time errors occur if unsupported types are used
* @note This function is constexpr-friendly and optimizes well in release
* builds
*/
template <class... Ts>
std::string_view static_format(ArenaAllocator &arena, Ts &&...ts) {
constexpr int upper_bound = (decltype(detail::term(ts))::kMaxLength + ...);
char *result = arena.allocate<char>(upper_bound);
char *buf = result;
(detail::term(ts).write(buf), ...);
const int size = static_cast<int>(buf - result);
return std::string_view(arena.realloc(result, upper_bound, size),
                        static_cast<std::size_t>(size));
}
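// Worked example of the compile-time sizing above: static_format(arena,
// "Count: ", 42) sums the per-term bounds ("Count: " -> 7, int -> 11) and
// allocates 18 bytes up front, writes the 9 bytes of "Count: 42", then
// realloc() returns the unused bytes to the arena.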

View File

@@ -8,7 +8,7 @@
#include "connection.hpp"
#include "connection_handler.hpp"
#include "loop_iterations.h"
#include "loop_iterations.hpp"
#include "perfetto_categories.hpp"
#include "server.hpp"
#include "thread_pipeline.hpp"
@@ -111,7 +111,7 @@ struct HttpHandler : ConnectionHandler {
if (nulls == 2) {
return;
}
for (volatile int i = 0; i < loopIterations; i = i + 1)
for (volatile int i = 0; i < loop_iterations; i = i + 1)
;
}
}

View File

@@ -1,3 +0,0 @@
#pragma once
constexpr int loopIterations = 1725;

src/loop_iterations.hpp Normal file (3 lines)
View File

@@ -0,0 +1,3 @@
#pragma once
constexpr int loop_iterations = 1725;

src/metric.cpp Normal file (874 lines)
View File

@@ -0,0 +1,874 @@
#include "metric.hpp"
#include <algorithm>
#include <atomic>
#include <bit>
#include <cassert>
#include <cctype>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
#include <immintrin.h>
#include <simdutf.h>
#include "format.hpp"
// TODO fix static initialization order fiasco
// Verify that the default allocator (operator new, used by std::vector for
// the bucket counts) provides sufficient alignment for atomic 128-bit
// operations
static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ >= 16,
"Default new alignment must be at least 16 bytes for atomic "
"128-bit stores");
// WeaselDB Metrics System Design:
//
// THREADING MODEL:
// - Counters and Histograms: Per-thread storage, single writer per thread
// - Gauges: Global storage with mutex protection (multi-writer)
//
// PRECISION STRATEGY:
// - Use atomic<uint64_t> for lock-free storage
// - Store doubles using std::bit_cast to uint64_t (preserves full IEEE 754
// precision)
// - Single writer assumption allows simple load/store without CAS loops
//
// MEMORY MODEL:
// - Thread-local metrics auto-cleanup on thread destruction
// - Global metrics (gauges) persist for application lifetime
// - Histogram buckets are sorted, deduplicated, and include +Inf bucket
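// Illustrative sketch of the bit_cast storage pattern described above
// (hypothetical helpers, not part of this file's API): a double is stored
// losslessly in an atomic<uint64_t> and recovered bit-for-bit on read.
//
//   inline void store_double(std::atomic<uint64_t> &cell, double v) {
//     cell.store(std::bit_cast<uint64_t>(v), std::memory_order_relaxed);
//   }
//   inline double load_double(const std::atomic<uint64_t> &cell) {
//     return std::bit_cast<double>(cell.load(std::memory_order_relaxed));
//   }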
namespace metric {
// Validation helper that works in both debug and release builds
static void validate_or_abort(bool condition, const char *message,
const char *value) {
if (!condition) {
std::fprintf(stderr, "WeaselDB metric validation failed: %s: '%s'\n",
message, value);
std::abort();
}
}
// Labels key for second level of map
struct LabelsKey {
std::vector<std::pair<std::string, std::string>> labels;
LabelsKey(std::vector<std::pair<std::string, std::string>> l)
: labels(std::move(l)) {
// Validate all label keys and values
for (const auto &[key, value] : labels) {
validate_or_abort(is_valid_label_key(key), "invalid label key",
key.c_str());
validate_or_abort(is_valid_label_value(value), "invalid label value",
value.c_str());
}
// Sort labels by key for Prometheus compatibility
std::sort(labels.begin(), labels.end(),
[](const auto &a, const auto &b) { return a.first < b.first; });
}
bool operator==(const LabelsKey &other) const {
return labels == other.labels;
}
};
} // namespace metric
namespace std {
template <> struct hash<metric::LabelsKey> {
std::size_t operator()(const metric::LabelsKey &k) const {
std::size_t hash_value = 0;
for (const auto &[key, value] : k.labels) {
// Combine hashes using a simple but effective method
hash_value ^= std::hash<std::string>{}(key) + 0x9e3779b9 +
(hash_value << 6) + (hash_value >> 2);
hash_value ^= std::hash<std::string>{}(value) + 0x9e3779b9 +
(hash_value << 6) + (hash_value >> 2);
}
return hash_value;
}
};
} // namespace std
namespace metric {
// DESIGN: Store doubles in atomic<uint64_t> for lock-free operations
// - Preserves full IEEE 754 double precision (no truncation)
// - Allows atomic load/store without locks
// - Use std::bit_cast for safe conversion between double and uint64_t
// Family::State structures own the second-level maps (labels -> instances)
template <> struct Family<Counter>::State {
std::string name;
std::string help;
struct PerThreadState {
std::unordered_map<LabelsKey, std::unique_ptr<Counter::State>> instances;
};
std::unordered_map<std::thread::id, PerThreadState> perThreadState;
// Callback-based metrics (global, not per-thread)
std::unordered_map<LabelsKey, MetricCallback<Counter>> callbacks;
};
template <> struct Family<Gauge>::State {
std::string name;
std::string help;
std::unordered_map<LabelsKey, std::unique_ptr<Gauge::State>> instances;
// Callback-based metrics
std::unordered_map<LabelsKey, MetricCallback<Gauge>> callbacks;
};
template <> struct Family<Histogram>::State {
std::string name;
std::string help;
std::vector<double> buckets;
struct PerThreadState {
std::unordered_map<LabelsKey, std::unique_ptr<Histogram::State>> instances;
};
std::unordered_map<std::thread::id, PerThreadState> perThreadState;
// Note: No callbacks map - histograms don't support callback-based metrics
};
// Counter: Thread-local, monotonically increasing, single writer per thread
struct Counter::State {
double value; // Single writer, no atomics needed
friend struct Metric;
};
// Gauge: Global, can increase/decrease, multiple writers (uses atomic CAS).
// TODO slow under contention.
struct Gauge::State {
std::atomic<uint64_t> value; // Stores double as uint64_t bits, lock-free
friend struct Metric;
};
// Histogram: Thread-local buckets, single writer per thread
struct Histogram::State {
std::vector<double>
thresholds; // Bucket boundaries (sorted, deduplicated, includes +Inf)
std::vector<uint64_t> counts; // Count per bucket - single writer, malloc
// provides 16-byte alignment
// TODO this should just be a double like in counter
std::atomic<uint64_t>
sum; // Sum of observations (double stored as uint64_t bits)
std::atomic<uint64_t> observations; // Total observation count (uint64_t)
friend struct Metric;
};
struct Metric {
static std::mutex mutex;
// Two-level map: name -> Family
static std::unordered_map<std::string,
std::unique_ptr<Family<Counter>::State>>
counterFamilies;
static std::unordered_map<std::string, std::unique_ptr<Family<Gauge>::State>>
gaugeFamilies;
static std::unordered_map<std::string,
std::unique_ptr<Family<Histogram>::State>>
histogramFamilies;
// Thread cleanup for per-family thread-local storage
struct ThreadInit {
ThreadInit() {
// Thread registration happens lazily when metrics are created
}
~ThreadInit() {
// TODO we need to accumulate this threads counts into a global. Otherwise
// it can go backwards when we destroy a thread.
// Clean up this thread's storage from all families
std::unique_lock<std::mutex> _{mutex};
auto thread_id = std::this_thread::get_id();
// Clean up counter families
for (auto &[name, family] : counterFamilies) {
family->perThreadState.erase(thread_id);
}
// Clean up histogram families
for (auto &[name, family] : histogramFamilies) {
family->perThreadState.erase(thread_id);
}
// Gauges are global, no per-thread cleanup needed
}
};
static thread_local ThreadInit thread_init;
// Thread cleanup now handled by ThreadInit RAII
static Counter create_counter_instance(
Family<Counter> *family,
const std::vector<std::pair<std::string, std::string>> &labels) {
std::unique_lock<std::mutex> _{mutex};
LabelsKey key{labels};
// Validate that labels aren't already registered as callback
validate_or_abort(
family->p->callbacks.find(key) == family->p->callbacks.end(),
"labels already registered as callback",
key.labels.empty() ? "(no labels)" : key.labels[0].first.c_str());
auto &ptr =
family->p->perThreadState[std::this_thread::get_id()].instances[key];
if (!ptr) {
ptr = std::make_unique<Counter::State>();
ptr->value = 0.0;
}
Counter result;
result.p = ptr.get();
return result;
}
static Gauge create_gauge_instance(
Family<Gauge> *family,
const std::vector<std::pair<std::string, std::string>> &labels) {
std::unique_lock<std::mutex> _{mutex};
LabelsKey key{labels};
// Validate that labels aren't already registered as callback
validate_or_abort(
family->p->callbacks.find(key) == family->p->callbacks.end(),
"labels already registered as callback",
key.labels.empty() ? "(no labels)" : key.labels[0].first.c_str());
auto &ptr = family->p->instances[key];
if (!ptr) {
ptr = std::make_unique<Gauge::State>();
ptr->value.store(0, std::memory_order_relaxed);
}
Gauge result;
result.p = ptr.get();
return result;
}
static Histogram create_histogram_instance(
Family<Histogram> *family,
const std::vector<std::pair<std::string, std::string>> &labels) {
std::unique_lock<std::mutex> _{mutex};
LabelsKey key{labels};
auto &ptr =
family->p->perThreadState[std::this_thread::get_id()].instances[key];
if (!ptr) {
ptr = std::make_unique<Histogram::State>();
// DESIGN: Prometheus-compatible histogram buckets
// Use buckets from family configuration
ptr->thresholds = family->p->buckets; // Already sorted and deduplicated
// Single writer semantics - no atomics needed for bucket counts
ptr->counts = std::vector<uint64_t>(ptr->thresholds.size(), 0);
ptr->sum.store(0, std::memory_order_relaxed);
ptr->observations.store(0, std::memory_order_relaxed);
}
Histogram result;
result.p = ptr.get();
return result;
}
};
Counter::Counter() = default;
void Counter::inc(double x) {
// DESIGN: Single writer per thread allows a plain read-modify-write with no
// CAS loop; the relaxed atomic store below only makes the value safely
// readable by the concurrent render thread
auto new_value = p->value + x;
// Validate monotonic property (counter never decreases)
if (new_value < p->value) [[unlikely]] {
validate_or_abort(false, "counter value overflow/wraparound detected",
std::to_string(new_value).c_str());
}
__atomic_store(&p->value, &new_value, __ATOMIC_RELAXED);
}
Gauge::Gauge() = default;
void Gauge::inc(double x) {
// Lock-free increment using CAS loop
uint64_t expected = p->value.load(std::memory_order_relaxed);
uint64_t desired;
do {
double current_value = std::bit_cast<double>(expected);
double new_value = current_value + x;
desired = std::bit_cast<uint64_t>(new_value);
} while (!p->value.compare_exchange_weak(expected, desired,
std::memory_order_relaxed));
}
void Gauge::dec(double x) {
// Lock-free decrement using CAS loop
uint64_t expected = p->value.load(std::memory_order_relaxed);
uint64_t desired;
do {
double current_value = std::bit_cast<double>(expected);
double new_value = current_value - x;
desired = std::bit_cast<uint64_t>(new_value);
} while (!p->value.compare_exchange_weak(expected, desired,
std::memory_order_relaxed));
}
void Gauge::set(double x) {
// Simple atomic store for set operation
p->value.store(std::bit_cast<uint64_t>(x), std::memory_order_relaxed);
}
Histogram::Histogram() = default;
// Vectorized histogram bucket updates using single-writer + atomic-read design
// Since histograms have single-writer semantics, we can use architecturally
// atomic stores
#ifdef __x86_64__
// x86-64: 128-bit vectorized with inline assembly atomic stores
__attribute__((target("avx"))) static void
update_histogram_buckets(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
const size_t size = thresholds.size();
size_t i = start_idx;
// Process 2 buckets at a time with 128-bit vectors + inline assembly
const __m128d x_vec = _mm_set1_pd(x);
for (; i + 2 <= size; i += 2) {
// Ensure alignment for atomic guarantee (malloc provides 16-byte alignment)
assert((reinterpret_cast<uintptr_t>(static_cast<void *>(&counts[i])) &
15) == 0 &&
"counts array must be 16-byte aligned for atomic 128-bit stores");
// 128-bit vectorized comparison and arithmetic
__m128d thresholds_vec = _mm_loadu_pd(&thresholds[i]);
__m128d cmp_result = _mm_cmp_pd(x_vec, thresholds_vec, _CMP_LE_OQ);
__m128i cmp_as_int = _mm_castpd_si128(cmp_result);
__m128i ones = _mm_set1_epi64x(1);
__m128i increments = _mm_and_si128(cmp_as_int, ones);
// Load current counts and add increments
__m128i current_counts = _mm_load_si128((__m128i *)&counts[i]);
__m128i updated_counts = _mm_add_epi64(current_counts, increments);
// Processors that enumerate support for Intel® AVX (by setting the feature
// flag CPUID.01H:ECX.AVX[bit 28]) guarantee that the 16-byte memory
// operations performed by the following instructions will always be
// carried out atomically:
// - MOVAPD, MOVAPS, and MOVDQA.
// - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
// - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with EVEX.128
//   and k0 (masking disabled).
// (Note that these instructions require the linear addresses of their
// memory operands to be 16-byte aligned.)
__asm__ __volatile__(
"vmovdqa %1, %0"
: "=m"(*((__m128i *)&counts[i])) // Output: aligned memory location
: "x"(updated_counts) // Input: any xmm register
: "memory" // Memory clobber
);
// We don't actually need it to be atomic across 128-bits, but that's
// sufficient to guarantee each 64 bit half is atomic.
}
// Handle remainder with atomic stores
for (; i < size; ++i) {
if (x <= thresholds[i]) {
__atomic_store_n(&counts[i], counts[i] + 1, __ATOMIC_RELAXED);
}
}
}
#else
// Fallback implementation for non-x86 architectures
static void
update_histogram_buckets(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
const size_t size = thresholds.size();
// Scalar implementation with atomic stores for TSAN compatibility
for (size_t i = start_idx; i < size; ++i) {
if (x <= thresholds[i]) {
__atomic_store_n(&counts[i], counts[i] + 1, __ATOMIC_RELAXED);
}
}
}
#endif
void Histogram::observe(double x) {
assert(p->thresholds.size() == p->counts.size());
update_histogram_buckets(p->thresholds, p->counts, x, 0);
// DESIGN: Single writer per thread allows simple load-modify-store for sum
// No CAS loop needed since only one thread writes to this histogram
auto current_sum =
std::bit_cast<double>(p->sum.load(std::memory_order_relaxed));
p->sum.store(std::bit_cast<uint64_t>(current_sum + x),
std::memory_order_relaxed);
p->observations.fetch_add(1, std::memory_order_relaxed);
}
template <> Family<Counter>::Family() = default;
template <> Family<Gauge>::Family() = default;
template <> Family<Histogram>::Family() = default;
template <>
Counter Family<Counter>::create(
std::vector<std::pair<std::string, std::string>> labels) {
return Metric::create_counter_instance(this, labels);
}
template <>
Gauge Family<Gauge>::create(
std::vector<std::pair<std::string, std::string>> labels) {
return Metric::create_gauge_instance(this, labels);
}
template <>
Histogram Family<Histogram>::create(
std::vector<std::pair<std::string, std::string>> labels) {
return Metric::create_histogram_instance(this, labels);
}
Family<Counter> create_counter(std::string name, std::string help) {
validate_or_abort(is_valid_metric_name(name), "invalid counter name",
name.c_str());
std::unique_lock<std::mutex> _{Metric::mutex};
auto &familyPtr = Metric::counterFamilies[name];
if (!familyPtr) {
familyPtr = std::make_unique<Family<Counter>::State>();
familyPtr->name = std::move(name);
familyPtr->help = std::move(help);
} else {
validate_or_abort(
familyPtr->help == help,
"metric family already registered with different help text",
name.c_str());
}
Family<Counter> family;
family.p = familyPtr.get();
return family;
}
Family<Gauge> create_gauge(std::string name, std::string help) {
validate_or_abort(is_valid_metric_name(name), "invalid gauge name",
name.c_str());
std::unique_lock<std::mutex> _{Metric::mutex};
auto &familyPtr = Metric::gaugeFamilies[name];
if (!familyPtr) {
familyPtr = std::make_unique<Family<Gauge>::State>();
familyPtr->name = std::move(name);
familyPtr->help = std::move(help);
} else {
validate_or_abort(
familyPtr->help == help,
"metric family already registered with different help text",
name.c_str());
}
Family<Gauge> family;
family.p = familyPtr.get();
return family;
}
Family<Histogram> create_histogram(std::string name, std::string help,
std::span<const double> buckets) {
validate_or_abort(is_valid_metric_name(name), "invalid histogram name",
name.c_str());
std::unique_lock<std::mutex> _{Metric::mutex};
auto &familyPtr = Metric::histogramFamilies[name];
if (!familyPtr) {
familyPtr = std::make_unique<Family<Histogram>::State>();
familyPtr->name = std::move(name);
familyPtr->help = std::move(help);
// DESIGN: Prometheus-compatible histogram buckets
familyPtr->buckets = std::vector<double>(buckets.begin(), buckets.end());
std::sort(familyPtr->buckets.begin(), familyPtr->buckets.end());
familyPtr->buckets.erase(
std::unique(familyPtr->buckets.begin(), familyPtr->buckets.end()),
familyPtr->buckets.end());
// +Inf bucket captures all observations (Prometheus requirement)
if (familyPtr->buckets.empty() ||
familyPtr->buckets.back() != std::numeric_limits<double>::infinity()) {
familyPtr->buckets.push_back(std::numeric_limits<double>::infinity());
}
} else {
validate_or_abort(
familyPtr->help == help,
"metric family already registered with different help text",
name.c_str());
std::vector<double> new_buckets_vec(buckets.begin(), buckets.end());
std::sort(new_buckets_vec.begin(), new_buckets_vec.end());
new_buckets_vec.erase(
std::unique(new_buckets_vec.begin(), new_buckets_vec.end()),
new_buckets_vec.end());
if (new_buckets_vec.empty() ||
new_buckets_vec.back() != std::numeric_limits<double>::infinity()) {
new_buckets_vec.push_back(std::numeric_limits<double>::infinity());
}
validate_or_abort(familyPtr->buckets == new_buckets_vec,
"metric family already registered with different buckets",
name.c_str());
}
Family<Histogram> family;
family.p = familyPtr.get();
return family;
}
std::vector<double> linear_buckets(double start, double width, int count) {
validate_or_abort(width > 0, "linear bucket width must be positive",
std::to_string(width).c_str());
validate_or_abort(count >= 0, "linear bucket count must be non-negative",
std::to_string(count).c_str());
std::vector<double> buckets;
buckets.reserve(count);
for (int i = 0; i < count; ++i) {
buckets.push_back(start + i * width);
}
return buckets;
}
std::vector<double> exponential_buckets(double start, double factor,
int count) {
validate_or_abort(start > 0, "exponential bucket start must be positive",
std::to_string(start).c_str());
validate_or_abort(factor > 1, "exponential bucket factor must be > 1",
std::to_string(factor).c_str());
validate_or_abort(count >= 0, "exponential bucket count must be non-negative",
std::to_string(count).c_str());
std::vector<double> buckets;
buckets.reserve(count);
double current = start;
for (int i = 0; i < count; ++i) {
buckets.push_back(current);
current *= factor;
}
return buckets;
}
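// Worked example (the configuration used by bench_metric.cpp above):
// exponential_buckets(0.001, 5, 7) yields thresholds of approximately
// 0.001, 0.005, 0.025, 0.125, 0.625, 3.125, 15.625; create_histogram()
// then appends the +Inf bucket.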
// Prometheus validation functions
// Metric names must match [a-zA-Z_:][a-zA-Z0-9_:]*
bool is_valid_metric_name(const std::string &name) {
if (name.empty())
return false;
// First character must be letter, underscore, or colon
char first = name[0];
if (!std::isalpha(first) && first != '_' && first != ':') {
return false;
}
// Remaining characters must be alphanumeric, underscore, or colon
for (size_t i = 1; i < name.size(); ++i) {
char c = name[i];
if (!std::isalnum(c) && c != '_' && c != ':') {
return false;
}
}
return true;
}
// Label keys must match [a-zA-Z_][a-zA-Z0-9_]*
bool is_valid_label_key(const std::string &key) {
if (key.empty())
return false;
// First character must be letter or underscore
char first = key[0];
if (!std::isalpha(first) && first != '_') {
return false;
}
// Remaining characters must be alphanumeric or underscore
for (size_t i = 1; i < key.size(); ++i) {
char c = key[i];
if (!std::isalnum(c) && c != '_') {
return false;
}
}
// Label keys starting with __ are reserved for internal use
if (key.size() >= 2 && key[0] == '_' && key[1] == '_') {
return false;
}
return true;
}
// Label values can contain any UTF-8 characters (no specific restrictions)
bool is_valid_label_value(const std::string &value) {
// Prometheus allows any UTF-8 string as label value
// Validate UTF-8 encoding for correctness using simdutf
return simdutf::validate_utf8(value.c_str(), value.size());
}
std::span<std::string_view> render(ArenaAllocator &arena) {
std::unique_lock<std::mutex> _{Metric::mutex};
std::vector<std::string_view> output;
auto format_labels =
[&](const std::vector<std::pair<std::string_view, std::string_view>>
&labels) -> std::string_view {
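        // Produces e.g. {method="GET",status="200"}; backslash, double quote,
        // and newline in label values are escaped per the Prometheus text
        // format.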
if (labels.empty()) {
return "";
}
size_t required_size = 2; // {}
for (const auto &[key, value] : labels) {
required_size += key.length() + 3 + value.length(); // key="value"
for (char c : value) {
if (c == '\\' || c == '"' || c == '\n') {
required_size++;
}
}
}
if (!labels.empty()) {
required_size += labels.size() - 1; // commas
}
char *buf = arena.allocate<char>(required_size);
char *p = buf;
*p++ = '{';
for (size_t i = 0; i < labels.size(); ++i) {
if (i > 0)
*p++ = ',';
std::memcpy(p, labels[i].first.data(), labels[i].first.length());
p += labels[i].first.length();
*p++ = '=';
*p++ = '"';
for (char c : labels[i].second) {
switch (c) {
case '\\':
*p++ = '\\';
*p++ = '\\';
break;
case '"':
*p++ = '\\';
*p++ = '"';
break;
case '\n':
*p++ = '\\';
*p++ = 'n';
break;
default:
*p++ = c;
break;
}
}
*p++ = '"';
}
*p++ = '}';
return std::string_view(buf, p - buf);
};
// Render counters
for (const auto &[name, family] : Metric::counterFamilies) {
output.push_back(
format(arena, "# HELP %s %s\n", name.c_str(), family->help.c_str()));
output.push_back(format(arena, "# TYPE %s counter\n", name.c_str()));
std::vector<std::pair<std::string_view, std::string_view>> labels_sv;
for (const auto &[labels_key, callback] : family->callbacks) {
auto value = callback();
labels_sv.clear();
for (const auto &l : labels_key.labels)
labels_sv.push_back(l);
auto labels = format_labels(labels_sv);
output.push_back(format(arena, "%.*s%.*s %.17g\n",
static_cast<int>(name.length()), name.data(),
static_cast<int>(labels.length()), labels.data(),
value));
}
for (const auto &[thread_id, per_thread] : family->perThreadState) {
for (const auto &[labels_key, instance] : per_thread.instances) {
        // Relaxed atomic read from the render thread; the owning thread is
        // the single writer and updates this value with plain stores.
double value;
__atomic_load(&instance->value, &value, __ATOMIC_RELAXED);
labels_sv.clear();
for (const auto &l : labels_key.labels)
labels_sv.push_back(l);
auto labels = format_labels(labels_sv);
output.push_back(format(arena, "%.*s%.*s %.17g\n",
static_cast<int>(name.length()), name.data(),
static_cast<int>(labels.length()),
labels.data(), value));
}
}
}
// Render gauges
for (const auto &[name, family] : Metric::gaugeFamilies) {
output.push_back(
format(arena, "# HELP %s %s\n", name.c_str(), family->help.c_str()));
output.push_back(format(arena, "# TYPE %s gauge\n", name.c_str()));
std::vector<std::pair<std::string_view, std::string_view>> labels_sv;
for (const auto &[labels_key, callback] : family->callbacks) {
auto value = callback();
labels_sv.clear();
for (const auto &l : labels_key.labels)
labels_sv.push_back(l);
auto labels = format_labels(labels_sv);
output.push_back(format(arena, "%.*s%.*s %.17g\n",
static_cast<int>(name.length()), name.data(),
static_cast<int>(labels.length()), labels.data(),
value));
}
for (const auto &[labels_key, instance] : family->instances) {
auto value = std::bit_cast<double>(
instance->value.load(std::memory_order_relaxed));
labels_sv.clear();
for (const auto &l : labels_key.labels)
labels_sv.push_back(l);
auto labels = format_labels(labels_sv);
output.push_back(format(arena, "%.*s%.*s %.17g\n",
static_cast<int>(name.length()), name.data(),
static_cast<int>(labels.length()), labels.data(),
value));
}
}
// Render histograms
for (const auto &[name, family] : Metric::histogramFamilies) {
output.push_back(
format(arena, "# HELP %s %s\n", name.c_str(), family->help.c_str()));
output.push_back(format(arena, "# TYPE %s histogram\n", name.c_str()));
std::vector<std::pair<std::string_view, std::string_view>> bucket_labels_sv;
for (const auto &[thread_id, per_thread] : family->perThreadState) {
for (const auto &[labels_key, instance] : per_thread.instances) {
for (size_t i = 0; i < instance->thresholds.size(); ++i) {
bucket_labels_sv.clear();
for (const auto &l : labels_key.labels)
bucket_labels_sv.push_back(l);
if (std::isinf(instance->thresholds[i])) {
bucket_labels_sv.push_back({"le", "+Inf"});
} else {
bucket_labels_sv.push_back(
{"le", format(arena, "%.17g", instance->thresholds[i])});
}
          // Relaxed atomic read from the render thread; the owning thread is
          // the single writer and updates these counts with plain stores.
auto count = __atomic_load_n(&instance->counts[i], __ATOMIC_RELAXED);
auto labels = format_labels(bucket_labels_sv);
output.push_back(format(arena, "%s_bucket%.*s %llu\n", name.c_str(),
static_cast<int>(labels.length()),
labels.data(),
static_cast<unsigned long long>(count)));
}
auto sum_value = std::bit_cast<double>(
instance->sum.load(std::memory_order_relaxed));
bucket_labels_sv.clear();
for (const auto &l : labels_key.labels)
bucket_labels_sv.push_back(l);
auto labels = format_labels(bucket_labels_sv);
output.push_back(format(arena, "%s_sum%.*s %.17g\n", name.c_str(),
static_cast<int>(labels.length()),
labels.data(), sum_value));
auto count_value =
instance->observations.load(std::memory_order_relaxed);
output.push_back(format(arena, "%s_count%.*s %llu\n", name.c_str(),
static_cast<int>(labels.length()),
labels.data(),
static_cast<unsigned long long>(count_value)));
}
}
}
auto result = arena.allocate<std::string_view>(output.size());
std::copy(output.begin(), output.end(), result);
return std::span<std::string_view>(result, output.size());
}
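// Illustrative helper (a sketch, not part of the public API): concatenate the
// rendered chunks into one std::string. Chunks are arena-backed and not
// null-terminated, so they are appended by pointer and length.
std::string render_to_string() {
  ArenaAllocator arena;
  std::string out;
  for (std::string_view chunk : render(arena)) {
    out.append(chunk.data(), chunk.size());
  }
  return out;
}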
// Template specialization implementations for register_callback
template <>
void Family<Counter>::register_callback(
std::vector<std::pair<std::string, std::string>> labels,
MetricCallback<Counter> callback) {
std::unique_lock<std::mutex> _{Metric::mutex};
LabelsKey key{std::move(labels)};
// Validate that labels aren't already in use by create() calls
for (const auto &[thread_id, per_thread] : p->perThreadState) {
validate_or_abort(
per_thread.instances.find(key) == per_thread.instances.end(),
"labels already registered as static instance",
key.labels.empty() ? "(no labels)" : key.labels[0].first.c_str());
}
// Validate that callback isn't already registered for these labels
validate_or_abort(p->callbacks.find(key) == p->callbacks.end(),
"callback already registered for labels",
key.labels.empty() ? "(no labels)"
: key.labels[0].first.c_str());
p->callbacks[std::move(key)] = std::move(callback);
}
template <>
void Family<Gauge>::register_callback(
std::vector<std::pair<std::string, std::string>> labels,
MetricCallback<Gauge> callback) {
std::unique_lock<std::mutex> _{Metric::mutex};
LabelsKey key{std::move(labels)};
// Validate that labels aren't already in use by create() calls
validate_or_abort(p->instances.find(key) == p->instances.end(),
"labels already registered as static instance",
key.labels.empty() ? "(no labels)"
: key.labels[0].first.c_str());
// Validate that callback isn't already registered for these labels
validate_or_abort(p->callbacks.find(key) == p->callbacks.end(),
"callback already registered for labels",
key.labels.empty() ? "(no labels)"
: key.labels[0].first.c_str());
p->callbacks[std::move(key)] = std::move(callback);
}
// Static member definitions
std::mutex Metric::mutex;
std::unordered_map<std::string, std::unique_ptr<Family<Counter>::State>>
Metric::counterFamilies;
std::unordered_map<std::string, std::unique_ptr<Family<Gauge>::State>>
Metric::gaugeFamilies;
std::unordered_map<std::string, std::unique_ptr<Family<Histogram>::State>>
Metric::histogramFamilies;
thread_local Metric::ThreadInit Metric::thread_init;
} // namespace metric

200
src/metric.hpp Normal file
View File

@@ -0,0 +1,200 @@
#pragma once
// WeaselDB Metrics System
//
// High-performance metrics collection with Prometheus-compatible output.
//
// DESIGN PRINCIPLES:
// - Single-writer semantics: Each metric instance bound to creating thread
// - Lock-free operations using atomic<uint64_t> storage for doubles
// - Full IEEE 754 double precision preservation via bit reinterpretation
// - Single global registry: All metrics registered in one global namespace
//
// CRITICAL THREAD SAFETY CONSTRAINT:
// Each metric instance has exactly ONE writer thread (the creating thread).
// It is undefined behavior to call inc()/dec()/set()/observe() from a different
// thread.
//
// REGISTRY MODEL:
// This implementation uses a single global registry for all metrics, unlike
// typical Prometheus client libraries that support multiple registries.
// This design choice prioritizes simplicity and performance over flexibility.
//
// METRIC LIFECYCLE:
// Metrics are created once and persist for the application lifetime. There is
// no unregistration mechanism - this prevents accidental metric loss and
// simplifies the implementation.
//
// USAGE:
//   auto counter_family =
//       metric::create_counter("requests_total", "Total requests");
//   auto counter = counter_family.create({{"method", "GET"}});  // Bound to this thread
//   counter.inc(1.0);  // ONLY call from creating thread
//
//   auto histogram_family = metric::create_histogram(
//       "latency", "Request latency", {0.1, 0.5, 1.0});
//   auto histogram = histogram_family.create({{"endpoint", "/api"}});  // Bound to this thread
//   histogram.observe(0.25);  // ONLY call from creating thread
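//
// VALUE STORAGE (illustrative sketch of the bit-reinterpretation principle
// above): doubles are held in std::atomic<uint64_t> via std::bit_cast so
// loads/stores stay lock-free and preserve every bit:
//   std::atomic<uint64_t> raw{0};
//   raw.store(std::bit_cast<uint64_t>(3.14), std::memory_order_relaxed);
//   double v = std::bit_cast<double>(raw.load(std::memory_order_relaxed));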
#include <functional>
#include <initializer_list>
#include <span>
#include <string>
#include <type_traits>
#include <vector>
#include "arena_allocator.hpp"
namespace metric {
// Forward declarations
template <typename T> struct Family;
// Callback function type for dynamic metric values
// Called during render() to get current metric value
// THREAD SAFETY: May be called from arbitrary thread, but serialized by
// render() mutex - no need to be thread-safe internally
template <typename T> using MetricCallback = std::function<double()>;
// Counter: A metric value that only increases.
//
// THREAD SAFETY RULES:
// 1. Do not call inc() on the same Counter object from multiple threads.
// Each object must have only one writer thread.
// 2. To use Counters concurrently, each thread must create its own Counter
// object.
// 3. When rendered, the values of all Counter objects with the same labels
// are summed together into a single total.
struct Counter {
void inc(double = 1.0); // Increment counter (must be >= 0)
private:
Counter();
friend struct Metric;
template <class> friend struct Family;
struct State;
State *p;
};
// Gauge: A metric value that can be set, increased, or decreased.
//
// THREAD SAFETY RULES:
// 1. Do not call inc(), dec(), or set() on the same Gauge object from
// multiple threads. Each object must have only one writer thread.
// 2. To use Gauges concurrently, each thread must create its own Gauge object.
// 3. If multiple Gauge objects are created with the same labels, their
// operations are combined. For example, increments from different objects
// are cumulative.
// 4. For independent gauges, create them with unique labels.
struct Gauge {
void inc(double = 1.0);
void dec(double = 1.0);
void set(double);
private:
Gauge();
friend struct Metric;
template <class> friend struct Family;
struct State;
State *p;
};
// Histogram: A metric that samples observations into buckets.
//
// THREAD SAFETY RULES:
// 1. Do not call observe() on the same Histogram object from multiple
// threads. Each object must have only one writer thread.
// 2. To use Histograms concurrently, each thread must create its own
// Histogram object.
// 3. When rendered, the observations from all Histogram objects with the
// same labels are combined into a single histogram.
struct Histogram {
void observe(double); // Record observation in appropriate bucket
private:
Histogram();
friend struct Metric;
template <class> friend struct Family;
struct State;
State *p;
};
// Family: Factory for creating metric instances with different label
// combinations. Each family represents one metric name with varying labels.
template <class T> struct Family {
static_assert(std::is_same_v<T, Counter> || std::is_same_v<T, Gauge> ||
std::is_same_v<T, Histogram>);
// Create metric instance with specific labels
// Labels are sorted by key for Prometheus compatibility
// ERROR: Will abort if labels already registered via register_callback()
// OK: Multiple calls with same labels return same instance (idempotent)
T create(std::vector<std::pair<std::string, std::string>> labels);
// Register callback-based metric (Counter and Gauge only)
// Validates that label set isn't already taken
void
register_callback(std::vector<std::pair<std::string, std::string>> labels,
MetricCallback<T> callback);
private:
Family();
friend struct Metric;
friend Family<Counter> create_counter(std::string, std::string);
friend Family<Gauge> create_gauge(std::string, std::string);
friend Family<Histogram> create_histogram(std::string, std::string,
std::span<const double>);
struct State;
State *p;
};
// Factory functions for creating metric families
// name and help are copied into the registry, so temporaries are fine
// Create counter family (monotonically increasing values)
// ERROR: Aborts if family with same name is registered with different help
// text.
Family<Counter> create_counter(std::string name, std::string help);
// Create gauge family (can increase/decrease)
// ERROR: Aborts if family with same name is registered with different help
// text.
Family<Gauge> create_gauge(std::string name, std::string help);
// Create histogram family with custom buckets
// Buckets will be sorted, deduplicated, and +Inf will be added automatically
// ERROR: Aborts if family with same name is registered with different help text
// or buckets.
Family<Histogram> create_histogram(std::string name, std::string help,
std::span<const double> buckets);
// Helper functions for generating standard histogram buckets
// Following Prometheus client library conventions
// Generate linear buckets: start, start+width, start+2*width, ...,
// start+(count-1)*width
// Example: linear_buckets(0, 10, 5) = {0, 10, 20, 30, 40}
std::vector<double> linear_buckets(double start, double width, int count);
// Generate exponential buckets: start, start*factor, start*factor^2, ...,
// start*factor^(count-1)
// Example: exponential_buckets(1, 2, 5) = {1, 2, 4, 8, 16}
std::vector<double> exponential_buckets(double start, double factor, int count);
// Render all metrics in Prometheus text format.
// Returns chunks of the Prometheus exposition format (includes # HELP and
// # TYPE lines). Each string_view may contain multiple lines separated by
// '\n'. String views are NOT null-terminated - use .size() for length.
// All string data is allocated in the provided arena.
// THREAD SAFETY: Serialized by global mutex - callbacks need not be thread-safe
std::span<std::string_view> render(ArenaAllocator &arena);
// Validation functions for Prometheus compatibility
bool is_valid_metric_name(const std::string &name);
bool is_valid_label_key(const std::string &key);
bool is_valid_label_value(const std::string &value);
// Note: Histograms do not support callbacks due to their multi-value nature
// (buckets + sum + count). Use static histogram metrics only.
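//
// Callback example (illustrative; get_heap_bytes() stands in for any function
// returning the current value as a double):
//   auto fam = metric::create_gauge("heap_bytes", "Heap usage in bytes");
//   fam.register_callback({{"type", "heap"}},
//                         [] { return static_cast<double>(get_heap_bytes()); });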
} // namespace metric

View File

@@ -99,7 +99,7 @@ auto addr = reinterpret_cast<uintptr_t>(ptr); // Pointer to integer conv
- **Complexity must be justified with benchmarks** - measure performance impact before adding complexity
- **Strive for 0% CPU usage when idle** - avoid polling, busy waiting, or unnecessary background activity
- Use **inline functions** for performance-critical code (e.g., `allocate_raw`)
- **Zero-copy operations** with `std::string_view` over string copying
- **String views** with `std::string_view` to minimize unnecessary copying
- **Arena allocation** for efficient memory management (~1ns vs ~20-270ns for malloc)
### Complexity Control
@@ -249,7 +249,7 @@ private:
- **Parameter passing:**
- Pass by value for types ≤ 16 bytes (int, pointers, string_view, small structs)
- Pass by const reference for types > 16 bytes (containers, large objects)
- **Return by value** for small types (≤ 16 bytes), **string_view** for zero-copy over strings
- **Return by value** for small types (≤ 16 bytes), **string_view** to avoid copying strings
- **noexcept specification** for move operations and non-throwing functions
```cpp
std::span<const Operation> operations() const { return operations_; }
@@ -301,13 +301,34 @@ for (auto &precondition : preconditions_) {
}
```
### Atomic Operations
- **Never use assignment operators** with `std::atomic` - always use explicit `store()` and `load()`
- **Always specify memory ordering** explicitly for atomic operations
- **Use the least restrictive correct memory ordering** - choose the weakest ordering that maintains correctness
```cpp
// Preferred - explicit store/load with precise memory ordering
std::atomic<uint64_t> counter;
counter.store(42, std::memory_order_relaxed); // Single-writer metric updates
auto value = counter.load(std::memory_order_relaxed); // Reading metrics for display
counter.store(1, std::memory_order_release); // Publishing initialization
auto ready = counter.load(std::memory_order_acquire); // Synchronizing with publisher
counter.store(42, std::memory_order_seq_cst); // When sequential consistency needed
// Avoid - assignment operators (implicit memory ordering)
std::atomic<uint64_t> counter;
counter = 42; // Implicit - memory ordering not explicit
auto value = counter; // Implicit - memory ordering not explicit
```
---
## Memory Management
### Ownership & Allocation
- **Arena allocators** for request-scoped memory with **STL allocator adapters** (see Performance Focus section for characteristics)
- **String views** pointing to arena-allocated memory for zero-copy operations
- **String views** pointing to arena-allocated memory to avoid unnecessary copying
- **STL containers with arena allocators require default construction after arena reset** - `clear()` is not sufficient
```cpp
// STL containers with arena allocators - correct reset pattern
@@ -547,6 +568,24 @@ TEST_CASE("Server accepts connections") {
- `std::latch`, `std::barrier`, futures/promises
- **Force concurrent execution** using `std::latch` to synchronize thread startup
#### Threading Checklist for Tests/Benchmarks
**MANDATORY: Before writing any `std::thread` or `threads.emplace_back()`:**
1. **Count total threads** - Include main/benchmark thread in count
2. **Always assume concurrent execution needed** - Tests/benchmarks require real concurrency
3. **Add `std::latch start_latch{N}`** where N = total concurrent threads
4. **Each thread calls `start_latch.arrive_and_wait()`** before doing work
5. **Main/benchmark thread calls `start_latch.arrive_and_wait()`** before measurement
**Red flags to catch immediately:**
- ❌ Creating threads in a loop without `std::latch`
- ❌ Background threads starting work immediately
- ❌ Benchmark measuring before all threads synchronized
- ❌ Any use of `sleep_for`, `wait_for`, or timeouts
**Simple rule:** Multiple threads = `std::latch` synchronization. No exceptions, even for "simple" background threads.
```cpp
// BAD: Race likely over before threads start
std::atomic<int> counter{0};

View File

@@ -1,7 +1,9 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include "arena_allocator.hpp"
#include "format.hpp"
#include <cstring>
#include <doctest/doctest.h>
#include <string>
#include <vector>
TEST_CASE("ArenaAllocator basic construction") {
@@ -532,3 +534,67 @@ TEST_CASE("ArenaAllocator realloc functionality") {
}
}
}
TEST_CASE("format function fallback codepath") {
SUBCASE("single-pass optimization success") {
ArenaAllocator arena(128);
auto result = format(arena, "Hello %s! Number: %d", "World", 42);
CHECK(result == "Hello World! Number: 42");
CHECK(result.length() == 23);
}
SUBCASE("fallback when speculative formatting fails") {
// Create arena with limited space to force fallback
ArenaAllocator arena(16);
// Consume most space to leave insufficient room for speculative formatting
arena.allocate<char>(10);
CHECK(arena.available_in_current_block() == 6);
// Format string larger than available space - should trigger fallback
std::string long_string = "This is a very long string that won't fit";
auto result = format(arena, "Prefix: %s with %d", long_string.c_str(), 123);
std::string expected =
"Prefix: This is a very long string that won't fit with 123";
CHECK(result == expected);
CHECK(result.length() == expected.length());
}
SUBCASE("edge case - exactly available space") {
ArenaAllocator arena(32);
arena.allocate<char>(20); // Leave 12 bytes
CHECK(arena.available_in_current_block() == 12);
// Format that needs exactly available space (should still use fallback due
// to null terminator)
auto result = format(arena, "Test%d", 123); // "Test123" = 7 chars
CHECK(result == "Test123");
CHECK(result.length() == 7);
}
SUBCASE("allocate_remaining_space postcondition") {
// Test empty arena
ArenaAllocator empty_arena(64);
auto space1 = empty_arena.allocate_remaining_space();
CHECK(space1.allocated_bytes >= 1);
CHECK(space1.allocated_bytes == 64);
// Test full arena (should create new block)
ArenaAllocator full_arena(32);
full_arena.allocate<char>(32); // Fill completely
auto space2 = full_arena.allocate_remaining_space();
CHECK(space2.allocated_bytes >= 1);
CHECK(space2.allocated_bytes == 32); // New block created
}
SUBCASE("format error handling") {
ArenaAllocator arena(64);
// Test with invalid format (should return empty string_view)
// Note: This is hard to trigger reliably across platforms,
// so we focus on successful cases in the other subcases
auto result = format(arena, "Valid format: %d", 42);
CHECK(result == "Valid format: 42");
}
}

552
tests/test_metric.cpp Normal file
View File

@@ -0,0 +1,552 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest/doctest.h>
#include "arena_allocator.hpp"
#include "metric.hpp"
#include <atomic>
#include <chrono>
#include <cmath>
#include <latch>
#include <thread>
#include <vector>
TEST_CASE("metric validation functions") {
SUBCASE("valid metric names") {
CHECK(metric::is_valid_metric_name("valid_name"));
CHECK(metric::is_valid_metric_name("ValidName"));
CHECK(metric::is_valid_metric_name("valid:name"));
CHECK(metric::is_valid_metric_name("_valid"));
CHECK(metric::is_valid_metric_name("valid_123"));
CHECK(metric::is_valid_metric_name("prometheus_metric_name"));
}
SUBCASE("invalid metric names") {
CHECK_FALSE(metric::is_valid_metric_name(""));
CHECK_FALSE(metric::is_valid_metric_name("123invalid"));
CHECK_FALSE(metric::is_valid_metric_name("invalid-name"));
CHECK_FALSE(metric::is_valid_metric_name("invalid.name"));
CHECK_FALSE(metric::is_valid_metric_name("invalid name"));
}
SUBCASE("valid label keys") {
CHECK(metric::is_valid_label_key("valid_key"));
CHECK(metric::is_valid_label_key("ValidKey"));
CHECK(metric::is_valid_label_key("valid123"));
CHECK(metric::is_valid_label_key("_valid"));
}
SUBCASE("invalid label keys") {
CHECK_FALSE(metric::is_valid_label_key(""));
CHECK_FALSE(metric::is_valid_label_key("123invalid"));
CHECK_FALSE(metric::is_valid_label_key("invalid:key"));
CHECK_FALSE(metric::is_valid_label_key("invalid-key"));
CHECK_FALSE(metric::is_valid_label_key("__reserved"));
CHECK_FALSE(metric::is_valid_label_key("__internal"));
}
SUBCASE("valid label values") {
CHECK(metric::is_valid_label_value("any_value"));
CHECK(metric::is_valid_label_value("123"));
CHECK(metric::is_valid_label_value("special-chars.allowed"));
CHECK(metric::is_valid_label_value(""));
CHECK(metric::is_valid_label_value("unicode测试"));
}
}
TEST_CASE("counter basic functionality") {
auto counter_family =
metric::create_counter("test_counter", "Test counter help");
SUBCASE("create counter with no labels") {
auto counter = counter_family.create({});
counter.inc(1.0);
counter.inc(2.5);
counter.inc(); // Default increment of 1.0
}
SUBCASE("create counter with labels") {
auto counter =
counter_family.create({{"method", "GET"}, {"status", "200"}});
counter.inc(5.0);
// Same labels should return same instance (idempotent)
auto counter2 =
counter_family.create({{"method", "GET"}, {"status", "200"}});
counter2.inc(3.0);
}
SUBCASE("label sorting") {
// Labels should be sorted by key
auto counter1 =
counter_family.create({{"z_key", "value"}, {"a_key", "value"}});
auto counter2 =
counter_family.create({{"a_key", "value"}, {"z_key", "value"}});
// These should be the same instance due to label sorting
counter1.inc(1.0);
counter2.inc(2.0); // Should add to same counter
}
}
TEST_CASE("gauge basic functionality") {
auto gauge_family = metric::create_gauge("test_gauge", "Test gauge help");
SUBCASE("gauge operations") {
auto gauge = gauge_family.create({{"instance", "test"}});
gauge.set(10.0);
gauge.inc(5.0);
gauge.dec(3.0);
gauge.inc(); // Default increment
gauge.dec(); // Default decrement
}
SUBCASE("gauge with multiple instances") {
auto gauge1 = gauge_family.create({{"instance", "test1"}});
auto gauge2 = gauge_family.create({{"instance", "test2"}});
gauge1.set(100.0);
gauge2.set(200.0);
gauge1.inc(50.0);
gauge2.dec(25.0);
}
}
TEST_CASE("histogram basic functionality") {
auto hist_family =
metric::create_histogram("test_latency", "Test latency histogram",
metric::exponential_buckets(0.1, 2.0, 5));
SUBCASE("histogram observations") {
auto hist = hist_family.create({{"endpoint", "/api"}});
hist.observe(0.05); // Below first bucket
hist.observe(0.3); // Between buckets
hist.observe(1.5); // Between buckets
hist.observe(10.0); // Above all explicit buckets (goes in +Inf)
}
SUBCASE("histogram bucket validation") {
// Buckets should be sorted and deduplicated, with +Inf added
auto hist_family2 = metric::create_histogram(
"test_hist2", "Test",
std::initializer_list<double>{5.0, 1.0, 2.5, 1.0,
0.5}); // Unsorted with duplicate
auto hist = hist_family2.create({});
hist.observe(0.1);
hist.observe(1.5);
hist.observe(100.0); // Should go in +Inf bucket
}
}
TEST_CASE("histogram bucket generators") {
SUBCASE("linear_buckets basic functionality") {
// Linear buckets: start=0, width=10, count=5 -> {0, 10, 20, 30, 40}
auto buckets = metric::linear_buckets(0.0, 10.0, 5);
CHECK(buckets.size() == 5); // exactly count buckets
CHECK(buckets[0] == 0.0);
CHECK(buckets[1] == 10.0);
CHECK(buckets[2] == 20.0);
CHECK(buckets[3] == 30.0);
CHECK(buckets[4] == 40.0);
}
SUBCASE("linear_buckets with non-zero start") {
// Linear buckets: start=5, width=2.5, count=3 -> {5, 7.5, 10}
auto buckets = metric::linear_buckets(5.0, 2.5, 3);
CHECK(buckets.size() == 3);
CHECK(buckets[0] == 5.0);
CHECK(buckets[1] == 7.5);
CHECK(buckets[2] == 10.0);
}
SUBCASE("linear_buckets edge cases") {
// Zero count should give empty vector
auto zero_buckets = metric::linear_buckets(100.0, 10.0, 0);
CHECK(zero_buckets.size() == 0);
// Negative start should work
auto negative_buckets = metric::linear_buckets(-10.0, 5.0, 2);
CHECK(negative_buckets.size() == 2);
CHECK(negative_buckets[0] == -10.0);
CHECK(negative_buckets[1] == -5.0);
}
SUBCASE("exponential_buckets basic functionality") {
// Exponential buckets: start=1, factor=2, count=5 -> {1, 2, 4, 8, 16}
auto buckets = metric::exponential_buckets(1.0, 2.0, 5);
CHECK(buckets.size() == 5); // exactly count buckets
CHECK(buckets[0] == 1.0);
CHECK(buckets[1] == 2.0);
CHECK(buckets[2] == 4.0);
CHECK(buckets[3] == 8.0);
CHECK(buckets[4] == 16.0);
}
SUBCASE("exponential_buckets different factor") {
// Exponential buckets: start=0.1, factor=10, count=3 -> {0.1, 1, 10}
auto buckets = metric::exponential_buckets(0.1, 10.0, 3);
CHECK(buckets.size() == 3);
CHECK(buckets[0] == doctest::Approx(0.1));
CHECK(buckets[1] == doctest::Approx(1.0));
CHECK(buckets[2] == doctest::Approx(10.0));
}
SUBCASE("exponential_buckets typical latency pattern") {
// Typical web service latency buckets: 5ms, 10ms, 20ms, 40ms, 80ms, etc.
auto buckets = metric::exponential_buckets(0.005, 2.0, 8);
CHECK(buckets.size() == 8);
CHECK(buckets[0] == doctest::Approx(0.005)); // 5ms
CHECK(buckets[1] == doctest::Approx(0.010)); // 10ms
CHECK(buckets[2] == doctest::Approx(0.020)); // 20ms
CHECK(buckets[3] == doctest::Approx(0.040)); // 40ms
CHECK(buckets[4] == doctest::Approx(0.080)); // 80ms
CHECK(buckets[5] == doctest::Approx(0.160)); // 160ms
CHECK(buckets[6] == doctest::Approx(0.320)); // 320ms
CHECK(buckets[7] == doctest::Approx(0.640)); // 640ms
}
SUBCASE("exponential_buckets edge cases") {
// Zero count should give empty vector
auto zero_buckets = metric::exponential_buckets(5.0, 3.0, 0);
CHECK(zero_buckets.size() == 0);
}
SUBCASE("bucket generators with histogram creation") {
// Test that generated buckets work correctly with histogram creation
auto linear_hist = metric::create_histogram(
"linear_test", "Linear test", metric::linear_buckets(0, 100, 5));
auto linear_instance = linear_hist.create({{"type", "linear"}});
// Test observations fall into expected buckets
linear_instance.observe(50); // Should fall into 100 bucket
linear_instance.observe(150); // Should fall into 200 bucket
linear_instance.observe(1000); // Should fall into +Inf bucket
auto exp_hist =
metric::create_histogram("exp_test", "Exponential test",
metric::exponential_buckets(0.001, 10.0, 4));
auto exp_instance = exp_hist.create({{"type", "exponential"}});
// Test typical latency measurements
exp_instance.observe(0.0005); // Should fall into 0.001 bucket (1ms)
exp_instance.observe(0.005); // Should fall into 0.01 bucket (10ms)
exp_instance.observe(0.05); // Should fall into 0.1 bucket (100ms)
exp_instance.observe(5.0); // Should fall into +Inf bucket
}
SUBCASE("prometheus compatibility verification") {
// Verify our bucket generation matches Prometheus Go client behavior
// Linear buckets equivalent to Prometheus LinearBuckets(0, 10, 5)
auto our_linear = metric::linear_buckets(0, 10, 5);
std::vector<double> expected_linear = {0, 10, 20, 30, 40};
CHECK(our_linear == expected_linear);
// Exponential buckets equivalent to Prometheus ExponentialBuckets(1, 2, 5)
auto our_exp = metric::exponential_buckets(1, 2, 5);
std::vector<double> expected_exp = {1, 2, 4, 8, 16};
CHECK(our_exp == expected_exp);
    // Exponential buckets spanning roughly the same range as the default
    // Prometheus histogram buckets
auto default_buckets = metric::exponential_buckets(0.005, 2.5, 9);
// Should be: .005, .0125, .03125, .078125, .1953125,
// .48828125, 1.220703125, 3.0517578125, 7.62939453125
CHECK(default_buckets.size() == 9);
CHECK(default_buckets[0] == doctest::Approx(0.005));
CHECK(default_buckets[1] == doctest::Approx(0.0125));
CHECK(default_buckets[8] == doctest::Approx(7.62939453125));
}
}
TEST_CASE("callback-based metrics") {
auto counter_family =
metric::create_counter("callback_counter", "Callback counter");
auto gauge_family = metric::create_gauge("callback_gauge", "Callback gauge");
SUBCASE("counter callback") {
std::atomic<double> counter_value{42.0};
counter_family.register_callback(
{{"type", "callback"}},
[&counter_value]() { return counter_value.load(); });
// Callback should be called during render
ArenaAllocator arena;
auto output = metric::render(arena);
CHECK(output.size() > 0);
}
SUBCASE("gauge callback") {
std::atomic<double> gauge_value{123.5};
gauge_family.register_callback({{"type", "callback"}}, [&gauge_value]() {
return gauge_value.load();
});
ArenaAllocator arena;
auto output = metric::render(arena);
CHECK(output.size() > 0);
}
SUBCASE("callback conflict detection") {
// First create a static instance
auto counter = counter_family.create({{"conflict", "test"}});
counter.inc(1.0);
// Then try to register a callback with same labels - should abort
// This is a validation test that would abort in debug builds
}
}
TEST_CASE("prometheus text format rendering") {
ArenaAllocator arena;
// Create some metrics
auto counter_family =
metric::create_counter("http_requests_total", "Total HTTP requests");
auto counter = counter_family.create({{"method", "GET"}, {"status", "200"}});
counter.inc(1000);
auto gauge_family =
metric::create_gauge("memory_usage_bytes", "Memory usage");
auto gauge = gauge_family.create({{"type", "heap"}});
gauge.set(1048576);
auto hist_family = metric::create_histogram(
"request_duration_seconds", "Request duration",
      metric::exponential_buckets(0.1, 2.0, 3)); // 0.1, 0.2, 0.4 (+Inf added)
auto hist = hist_family.create({{"handler", "api"}});
hist.observe(0.25);
hist.observe(0.75);
hist.observe(1.5);
SUBCASE("render format validation") {
auto output = metric::render(arena);
CHECK(output.size() > 0);
// Basic format checks
bool found_help = false;
bool found_type = false;
bool found_metric_line = false;
for (const auto &line : output) {
if (line.starts_with("# HELP"))
found_help = true;
if (line.starts_with("# TYPE"))
found_type = true;
if (line.find("http_requests_total") != std::string_view::npos)
found_metric_line = true;
}
CHECK(found_help);
CHECK(found_type);
CHECK(found_metric_line);
}
SUBCASE("special value formatting") {
auto special_gauge_family =
metric::create_gauge("special_values", "Special value test");
auto special_gauge = special_gauge_family.create({});
special_gauge.set(std::numeric_limits<double>::infinity());
auto output = metric::render(arena);
// Should contain "+Inf" representation
bool found_inf = false;
for (const auto &line : output) {
if (line.find("+Inf") != std::string_view::npos) {
found_inf = true;
break;
}
}
CHECK(found_inf);
}
}
TEST_CASE("thread safety") {
constexpr int num_threads = 8;
constexpr int ops_per_thread = 1000;
SUBCASE("counter single-writer semantics") {
auto counter_family =
metric::create_counter("thread_test_counter", "Thread test");
std::vector<std::thread> threads;
std::latch start_latch{num_threads};
// Each thread creates its own counter instance (safe)
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back([&, i]() {
auto counter =
counter_family.create({{"thread_id", std::to_string(i)}});
start_latch.arrive_and_wait();
for (int j = 0; j < ops_per_thread; ++j) {
counter.inc(1.0);
}
});
}
for (auto &t : threads) {
t.join();
}
}
SUBCASE("gauge multi-writer contention") {
auto gauge_family =
metric::create_gauge("thread_test_gauge", "Thread test gauge");
std::vector<std::thread> threads;
std::latch start_latch{num_threads};
// Multiple threads create gauges with the same labels, writing to the same
// underlying state, testing CAS contention.
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back([&]() {
auto gauge = gauge_family.create({{"shared", "true"}});
start_latch.arrive_and_wait();
for (int j = 0; j < ops_per_thread; ++j) {
gauge.inc(1.0);
}
});
}
for (auto &t : threads) {
t.join();
}
}
SUBCASE("histogram single-writer per thread") {
auto hist_family =
metric::create_histogram("thread_test_hist", "Thread test histogram",
std::initializer_list<double>{0.1, 0.5, 1.0});
std::vector<std::thread> threads;
std::latch start_latch{num_threads};
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back([&, i]() {
auto hist = hist_family.create({{"thread_id", std::to_string(i)}});
start_latch.arrive_and_wait();
for (int j = 0; j < ops_per_thread; ++j) {
hist.observe(static_cast<double>(j) / ops_per_thread);
}
});
}
for (auto &t : threads) {
t.join();
}
}
SUBCASE("concurrent render calls") {
// Multiple threads calling render concurrently should be safe (serialized
// by mutex)
auto counter_family = metric::create_counter("render_test", "Render test");
auto counter = counter_family.create({});
counter.inc(100);
std::vector<std::thread> threads;
std::latch start_latch{num_threads};
std::atomic<int> success_count{0};
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back([&]() {
start_latch.arrive_and_wait();
ArenaAllocator arena;
auto output = metric::render(arena);
if (output.size() > 0) {
success_count.fetch_add(1);
}
});
}
for (auto &t : threads) {
t.join();
}
CHECK(success_count.load() == num_threads);
}
}
TEST_CASE("error conditions") {
SUBCASE("counter negative increment") {
auto counter_family = metric::create_counter("error_counter", "Error test");
auto counter = counter_family.create({});
// This should abort in debug builds due to validation
// In release builds, behavior is undefined
// counter.inc(-1.0); // Would abort
}
SUBCASE("invalid metric names") {
// These should abort due to validation
// auto bad_counter = metric::create_counter("123invalid", "help"); // Would
// abort auto bad_gauge = metric::create_gauge("invalid-name", "help"); //
// Would abort
}
SUBCASE("invalid label keys") {
auto counter_family = metric::create_counter("valid_name", "help");
// This should abort due to label validation
// auto counter = counter_family.create({{"123invalid", "value"}}); // Would
// abort
}
}
TEST_CASE("memory management") {
SUBCASE("arena allocation in render") {
ArenaAllocator arena;
auto initial_used = arena.used_bytes();
auto counter_family = metric::create_counter("memory_test", "Memory test");
auto counter = counter_family.create(
{{"large_label", "very_long_value_that_takes_space"}});
counter.inc(42);
auto output = metric::render(arena);
auto final_used = arena.used_bytes();
CHECK(output.size() > 0);
CHECK(final_used > initial_used); // Arena was used for string allocation
// All string_views should point to arena memory
for (const auto &line : output) {
CHECK(line.size() > 0);
}
}
SUBCASE("arena reset behavior") {
ArenaAllocator arena;
auto counter_family = metric::create_counter("reset_test", "Reset test");
auto counter = counter_family.create({});
counter.inc(1);
// Render multiple times with arena resets
for (int i = 0; i < 5; ++i) {
auto output = metric::render(arena);
CHECK(output.size() > 0);
arena.reset(); // Should not affect metric values, only arena memory
}
// Final render should still work
auto final_output = metric::render(arena);
CHECK(final_output.size() > 0);
}
}

View File

@@ -42,7 +42,7 @@
- [ ] Support for HTTP redirects (3xx responses) with redirect limits
- [ ] SSL/TLS support using OpenSSL for HTTPS connections
- [ ] Request/response logging and metrics integration
- [ ] Memory-efficient design with zero-copy where possible
- [ ] Memory-efficient design minimizing unnecessary copying
- [ ] Implement fake in-process S3 service using separate Server instance with S3 ConnectionHandler
- [ ] Use create_local_connection to get fd for in-process communication
- [ ] Implement `ListObjectsV2` API for object enumeration

View File

@@ -155,7 +155,7 @@ def main():
if violations:
for v in violations:
print(f"\n{filepath}:{v['line']}:{v['column']}")
print(f"\n{filepath}:{v['line']}:{v['column']}:")
print(f" {v['type']} '{v['camelCase']}' should be '{v['snake_case']}'")
print(f" Context: {v['context']}")
total_violations += len(violations)

View File

@@ -401,6 +401,7 @@ private:
double current_min = g_min_latency.load(std::memory_order_relaxed);
while (latency < current_min &&
!g_min_latency.compare_exchange_weak(current_min, latency,
std::memory_order_relaxed,
std::memory_order_relaxed)) {
// Retry if another thread updated min_latency
}
@@ -408,6 +409,7 @@ private:
double current_max = g_max_latency.load(std::memory_order_relaxed);
while (latency > current_max &&
!g_max_latency.compare_exchange_weak(current_max, latency,
std::memory_order_relaxed,
std::memory_order_relaxed)) {
// Retry if another thread updated max_latency
}
@@ -637,7 +639,7 @@ int main(int argc, char *argv[]) {
int epollfd = g_epoll_fds[i]; // Each thread uses its own epoll instance
pthread_setname_np(pthread_self(),
("network-" + std::to_string(i)).c_str());
while (g_connect_threads.load() != 0) {
while (g_connect_threads.load(std::memory_order_acquire) != 0) {
struct epoll_event events[256]; // Use a reasonable max size
int batch_size = std::min(int(sizeof(events) / sizeof(events[0])),
g_config.event_batch_size);
@@ -779,7 +781,7 @@ int main(int argc, char *argv[]) {
continue;
}
}
g_connect_threads.fetch_sub(1);
g_connect_threads.fetch_sub(1, std::memory_order_release);
});
}