Clarify threading model for metrics

2025-08-30 17:29:39 -04:00
parent 21ddcb75fb
commit affeeb674a
2 changed files with 62 additions and 21 deletions
--- a/design.md
+++ b/design.md
@@ -154,6 +154,26 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
 - **Builder pattern** for constructing commit requests
 - **String views** pointing to arena-allocated memory to avoid unnecessary copying
 #### **Metrics System** (`src/metric.{hpp,cpp}`)
 **High-Performance Metrics Implementation:**
 - **Thread-local counters/histograms** with single writer for performance
 - **Global gauges** with lock-free atomic CAS operations for multi-writer scenarios
 - **SIMD-optimized histogram bucket updates** using AVX instructions for high throughput
 - **Arena allocator integration** for efficient memory management during rendering
 **Threading Model:**
 - **Counters**: Per-thread storage, single writer, atomic write in `Counter::inc()`, atomic read in render thread
 - **Histograms**: Per-thread storage, single writer, per-histogram mutex serializes all access (observe and render)
 - **Gauges**: Lock-free atomic operations using `std::bit_cast` for double precision
 - **Thread cleanup**: Automatic accumulation of thread-local state into global state on destruction
 **Prometheus Compatibility:**
 - **Standard metric types** with proper label handling and validation
 - **Bucket generation helpers** for linear/exponential histogram distributions
 - **Callback-based metrics** for dynamic values
 - **UTF-8 validation** using simdutf for label values
 #### **Configuration & Optimization**
 **Configuration System** (`src/config.{hpp,cpp}`):
--- a/src/metric.cpp
+++ b/src/metric.cpp
@@ -11,7 +11,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <functional>
 #include <limits>
 #include <memory>
 #include <mutex>
 #include <string>
@@ -35,20 +34,30 @@ static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ >= 16,
 // WeaselDB Metrics System Design:
 //
 // THREADING MODEL:
-// - Counters: Per-thread storage, single writer per thread
+// - Counters: Per-thread storage, single writer, atomic write/read coordination
-// - Histograms: Per-thread storage with mutex protection for consistent reads
+// with render thread
-// - Gauges: Global storage with mutex protection (multi-writer)
+// - Histograms: Per-thread storage, single writer, mutex protection for all
 // access (both observe and render)
 // - Gauges: Global storage with atomic CAS operations (multi-writer, no mutex
 // needed)
 //
 // SYNCHRONIZATION STRATEGY:
 // - Counters: Atomic store in Counter::inc(), atomic load in render thread
 // - Histograms: Mutex serializes all access - updates in observe(), reads in
 // render
 // - Gauges: Lock-free atomic operations for all updates and reads
 //
 // PRECISION STRATEGY:
 // - Use atomic<uint64_t> for lock-free storage
 // - Store doubles using std::bit_cast to uint64_t (preserves full IEEE 754
 // precision)
-// - Single writer assumption allows simple load/store without CAS loops
+// - Single writer for counters enables simple atomic store/load
 //
 // MEMORY MODEL:
 // - Thread-local metrics auto-cleanup on thread destruction
 // - Global metrics (gauges) persist for application lifetime
-// - Histogram buckets are sorted, deduplicated, and include +Inf bucket
+// - Histogram buckets are sorted, deduplicated, sizes never change after
 // creation
 namespace metric {
@@ -155,27 +164,29 @@ template <> struct Family<Histogram>::State {
  // Note: No callbacks map - histograms don't support callback-based metrics
 };
-// Counter: Thread-local, monotonically increasing, single writer per thread
+// Counter: Thread-local, monotonically increasing, single writer
 struct Counter::State {
-  double value; // Single writer, no atomics needed
+  double value; // Single writer, atomic coordination with render thread
  friend struct Metric;
 };
-// Gauge: Global, can increase/decrease, multiple writers (uses atomic CAS).
+// Gauge: Global, can increase/decrease, multiple writers (uses atomic CAS)
 // TODO slow under contention.
 struct Gauge::State {
-  std::atomic<uint64_t> value; // Stores double as uint64_t bits, lock-free
+  std::atomic<uint64_t>
      value; // Stores double as uint64_t bits, lock-free CAS operations
  friend struct Metric;
 };
-// Histogram: Thread-local buckets with mutex protection per thread
+// Histogram: Thread-local buckets, single writer, mutex protection per thread,
 // per histogram
 struct Histogram::State {
-  std::vector<double>
+  std::vector<double> thresholds; // Bucket boundaries (sorted, deduplicated,
-      thresholds; // Bucket boundaries (sorted, deduplicated, includes +Inf)
+                                  // sizes never change)
-  std::vector<uint64_t> counts; // Count per bucket
+  std::vector<uint64_t> counts;   // Count per bucket
-  double sum;                   // Sum of observations
+  double sum;                     // Sum of observations
-  uint64_t observations;        // Total observation count
+  uint64_t observations;          // Total observation count
-  std::mutex mutex; // Per-histogram mutex for consistent reads/writes
+  std::mutex
      mutex; // Per-thread, per-histogram mutex for consistent reads/writes
  friend struct Metric;
 };
@@ -358,7 +369,7 @@ struct Metric {
 Counter::Counter() = default;
 void Counter::inc(double x) {
-  // DESIGN: Single writer per thread, but render thread reads concurrently
+  // DESIGN: Single writer, but render thread reads concurrently
  // Need atomic store since render thread reads without writer's coordination
  auto new_value = p->value + x;
@@ -816,15 +827,25 @@ std::span<std::string_view> render(ArenaAllocator &arena) {
    for (const auto &[thread_id, per_thread] : family->perThreadState) {
      for (const auto &[labels_key, instance] : per_thread.instances) {
        // Extract data under lock - minimize critical section
        // Pre-allocate vectors to avoid malloc inside critical section
        // Note: thresholds and counts sizes never change after histogram
        // creation
        std::vector<double> thresholds_snapshot;
        std::vector<uint64_t> counts_snapshot;
        double sum_snapshot;
        uint64_t observations_snapshot;
        // Pre-allocate outside critical section using immutable sizes
        thresholds_snapshot.resize(instance->thresholds.size());
        counts_snapshot.resize(instance->counts.size());
        // Copy data with minimal critical section
        {
          std::lock_guard<std::mutex> lock(instance->mutex);
-          thresholds_snapshot = instance->thresholds;
+          std::memcpy(thresholds_snapshot.data(), instance->thresholds.data(),
-          counts_snapshot = instance->counts;
+                      instance->thresholds.size() * sizeof(double));
          std::memcpy(counts_snapshot.data(), instance->counts.data(),
                      instance->counts.size() * sizeof(uint64_t));
          sum_snapshot = instance->sum;
          observations_snapshot = instance->observations;
        }