Clarify threading model for metrics

2025-08-30 17:29:39 -04:00
parent 21ddcb75fb
commit affeeb674a
2 changed files with 62 additions and 21 deletions
--- a/design.md
+++ b/design.md
@@ -154,6 +154,26 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati
 - **Builder pattern** for constructing commit requests
 - **String views** pointing to arena-allocated memory to avoid unnecessary copying

+#### **Metrics System** (`src/metric.{hpp,cpp}`)
+
+**High-Performance Metrics Implementation:**
+- **Thread-local counters/histograms** with single writer for performance
+- **Global gauges** with lock-free atomic CAS operations for multi-writer scenarios
+- **SIMD-optimized histogram bucket updates** using AVX instructions for high throughput
+- **Arena allocator integration** for efficient memory management during rendering
+
+**Threading Model:**
+- **Counters**: Per-thread storage, single writer, atomic write in `Counter::inc()`, atomic read in render thread
+- **Histograms**: Per-thread storage, single writer; a per-thread, per-histogram mutex serializes all access (both `observe()` and render)
+- **Gauges**: Global storage, multiple writers; lock-free atomic CAS on `std::atomic<uint64_t>`, with doubles stored via `std::bit_cast` to preserve full IEEE 754 precision
+- **Thread cleanup**: Thread-local state is automatically accumulated into global state on thread destruction
+
+**Prometheus Compatibility:**
+- **Standard metric types** with proper label handling and validation
+- **Bucket generation helpers** for linear/exponential histogram distributions
+- **Callback-based metrics** for dynamic values
+- **UTF-8 validation** using simdutf for label values
+
 #### **Configuration & Optimization**

 **Configuration System** (`src/config.{hpp,cpp}`):
--- a/src/metric.cpp
+++ b/src/metric.cpp
@@ -11,7 +11,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <functional>
-#include <limits>
 #include <memory>
 #include <mutex>
 #include <string>
@@ -35,20 +34,30 @@ static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ >= 16,
 // WeaselDB Metrics System Design:
 //
 // THREADING MODEL:
-// - Counters: Per-thread storage, single writer per thread
-// - Histograms: Per-thread storage with mutex protection for consistent reads
-// - Gauges: Global storage with mutex protection (multi-writer)
+// - Counters: Per-thread storage, single writer, atomic write/read coordination
+// with render thread
+// - Histograms: Per-thread storage, single writer, mutex protection for all
+// access (both observe and render)
+// - Gauges: Global storage with atomic CAS operations (multi-writer, no mutex
+// needed)
+//
+// SYNCHRONIZATION STRATEGY:
+// - Counters: Atomic store in Counter::inc(), atomic load in render thread
+// - Histograms: Mutex serializes all access - updates in observe(), reads in
+// render
+// - Gauges: Lock-free atomic operations for all updates and reads
 //
 // PRECISION STRATEGY:
 // - Use atomic<uint64_t> for lock-free storage
 // - Store doubles using std::bit_cast to uint64_t (preserves full IEEE 754
 // precision)
-// - Single writer assumption allows simple load/store without CAS loops
+// - Single writer for counters enables simple atomic store/load
 //
 // MEMORY MODEL:
 // - Thread-local metrics auto-cleanup on thread destruction
 // - Global metrics (gauges) persist for application lifetime
-// - Histogram buckets are sorted, deduplicated, and include +Inf bucket
+// - Histogram buckets are sorted and deduplicated; the number of buckets
+// never changes after creation

 namespace metric {

@@ -155,27 +164,29 @@ template <> struct Family<Histogram>::State {
  // Note: No callbacks map - histograms don't support callback-based metrics
 };

-// Counter: Thread-local, monotonically increasing, single writer per thread
+// Counter: Thread-local, monotonically increasing, single writer
 struct Counter::State {
-  double value; // Single writer, no atomics needed
+  double value; // Single writer, atomic coordination with render thread
  friend struct Metric;
 };

-// Gauge: Global, can increase/decrease, multiple writers (uses atomic CAS).
-// TODO slow under contention.
+// Gauge: Global, can increase/decrease, multiple writers (uses atomic CAS)
 struct Gauge::State {
-  std::atomic<uint64_t> value; // Stores double as uint64_t bits, lock-free
+  std::atomic<uint64_t>
+      value; // Stores double as uint64_t bits, lock-free CAS operations
  friend struct Metric;
 };

-// Histogram: Thread-local buckets with mutex protection per thread
+// Histogram: Thread-local buckets, single writer, mutex protection per thread,
+// per histogram
 struct Histogram::State {
-  std::vector<double>
-      thresholds; // Bucket boundaries (sorted, deduplicated, includes +Inf)
+  std::vector<double> thresholds; // Bucket boundaries (sorted, deduplicated,
+                                  // sizes never change)
  std::vector<uint64_t> counts;   // Count per bucket
  double sum;                     // Sum of observations
  uint64_t observations;          // Total observation count
-  std::mutex mutex; // Per-histogram mutex for consistent reads/writes
+  std::mutex
+      mutex; // Per-thread, per-histogram mutex for consistent reads/writes
  friend struct Metric;
 };

@@ -358,7 +369,7 @@ struct Metric {
 Counter::Counter() = default;

 void Counter::inc(double x) {
-  // DESIGN: Single writer per thread, but render thread reads concurrently
+  // DESIGN: Single writer, but render thread reads concurrently
  // Need atomic store since render thread reads without writer's coordination
  auto new_value = p->value + x;

@@ -816,15 +827,25 @@ std::span<std::string_view> render(ArenaAllocator &arena) {
    for (const auto &[thread_id, per_thread] : family->perThreadState) {
      for (const auto &[labels_key, instance] : per_thread.instances) {
        // Extract data under lock - minimize critical section
+        // Pre-allocate vectors to avoid malloc inside critical section
+        // Note: thresholds and counts sizes never change after histogram
+        // creation
        std::vector<double> thresholds_snapshot;
        std::vector<uint64_t> counts_snapshot;
        double sum_snapshot;
        uint64_t observations_snapshot;

+        // Pre-allocate outside critical section using immutable sizes
+        thresholds_snapshot.resize(instance->thresholds.size());
+        counts_snapshot.resize(instance->counts.size());
+
+        // Copy data with minimal critical section
        {
          std::lock_guard<std::mutex> lock(instance->mutex);
-          thresholds_snapshot = instance->thresholds;
-          counts_snapshot = instance->counts;
+          std::memcpy(thresholds_snapshot.data(), instance->thresholds.data(),
+                      instance->thresholds.size() * sizeof(double));
+          std::memcpy(counts_snapshot.data(), instance->counts.data(),
+                      instance->counts.size() * sizeof(uint64_t));
          sum_snapshot = instance->sum;
          observations_snapshot = instance->observations;
        }