From affeeb674a1e362ee8418e9dcdc777418938a0cd Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Sat, 30 Aug 2025 17:29:39 -0400 Subject: [PATCH] Clarify threading model for metrics --- design.md | 20 ++++++++++++++++ src/metric.cpp | 63 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/design.md b/design.md index 7244af2..190f08f 100644 --- a/design.md +++ b/design.md @@ -154,6 +154,26 @@ A high-performance, multi-stage, lock-free pipeline for inter-thread communicati - **Builder pattern** for constructing commit requests - **String views** pointing to arena-allocated memory to avoid unnecessary copying +#### **Metrics System** (`src/metric.{hpp,cpp}`) + +**High-Performance Metrics Implementation:** +- **Thread-local counters/histograms** with single writer for performance +- **Global gauges** with lock-free atomic CAS operations for multi-writer scenarios +- **SIMD-optimized histogram bucket updates** using AVX instructions for high throughput +- **Arena allocator integration** for efficient memory management during rendering + +**Threading Model:** +- **Counters**: Per-thread storage, single writer, atomic write in `Counter::inc()`, atomic read in render thread +- **Histograms**: Per-thread storage, single writer, per-histogram mutex serializes all access (observe and render) +- **Gauges**: Lock-free atomic operations using `std::bit_cast` for double precision +- **Thread cleanup**: Automatic accumulation of thread-local state into global state on destruction + +**Prometheus Compatibility:** +- **Standard metric types** with proper label handling and validation +- **Bucket generation helpers** for linear/exponential histogram distributions +- **Callback-based metrics** for dynamic values +- **UTF-8 validation** using simdutf for label values + #### **Configuration & Optimization** **Configuration System** (`src/config.{hpp,cpp}`): diff --git a/src/metric.cpp b/src/metric.cpp index d982cae..9e12a38 100644 --- a/src/metric.cpp +++ b/src/metric.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -35,20 +34,30 @@ static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ >= 16, // WeaselDB Metrics System Design: // // THREADING MODEL: -// - Counters: Per-thread storage, single writer per thread -// - Histograms: Per-thread storage with mutex protection for consistent reads -// - Gauges: Global storage with mutex protection (multi-writer) +// - Counters: Per-thread storage, single writer, atomic write/read coordination +// with render thread +// - Histograms: Per-thread storage, single writer, mutex protection for all +// access (both observe and render) +// - Gauges: Global storage with atomic CAS operations (multi-writer, no mutex +// needed) +// +// SYNCHRONIZATION STRATEGY: +// - Counters: Atomic store in Counter::inc(), atomic load in render thread +// - Histograms: Mutex serializes all access - updates in observe(), reads in +// render +// - Gauges: Lock-free atomic operations for all updates and reads // // PRECISION STRATEGY: // - Use atomic for lock-free storage // - Store doubles using std::bit_cast to uint64_t (preserves full IEEE 754 // precision) -// - Single writer assumption allows simple load/store without CAS loops +// - Single writer for counters enables simple atomic store/load // // MEMORY MODEL: // - Thread-local metrics auto-cleanup on thread destruction // - Global metrics (gauges) persist for application lifetime -// - Histogram buckets are sorted, deduplicated, and include +Inf bucket +// - Histogram buckets are sorted, deduplicated, sizes never change after +// creation namespace metric { @@ -155,27 +164,29 @@ template <> struct Family::State { // Note: No callbacks map - histograms don't support callback-based metrics }; -// Counter: Thread-local, monotonically increasing, single writer per thread +// Counter: Thread-local, monotonically increasing, single writer struct Counter::State { - double value; // Single writer, no atomics needed + double value; // Single writer, atomic coordination with render thread friend struct Metric; }; -// Gauge: Global, can increase/decrease, multiple writers (uses atomic CAS). -// TODO slow under contention. +// Gauge: Global, can increase/decrease, multiple writers (uses atomic CAS) struct Gauge::State { - std::atomic value; // Stores double as uint64_t bits, lock-free + std::atomic + value; // Stores double as uint64_t bits, lock-free CAS operations friend struct Metric; }; -// Histogram: Thread-local buckets with mutex protection per thread +// Histogram: Thread-local buckets, single writer, mutex protection per thread, +// per histogram struct Histogram::State { - std::vector - thresholds; // Bucket boundaries (sorted, deduplicated, includes +Inf) - std::vector counts; // Count per bucket - double sum; // Sum of observations - uint64_t observations; // Total observation count - std::mutex mutex; // Per-histogram mutex for consistent reads/writes + std::vector thresholds; // Bucket boundaries (sorted, deduplicated, + // sizes never change) + std::vector counts; // Count per bucket + double sum; // Sum of observations + uint64_t observations; // Total observation count + std::mutex + mutex; // Per-thread, per-histogram mutex for consistent reads/writes friend struct Metric; }; @@ -358,7 +369,7 @@ struct Metric { Counter::Counter() = default; void Counter::inc(double x) { - // DESIGN: Single writer per thread, but render thread reads concurrently + // DESIGN: Single writer, but render thread reads concurrently // Need atomic store since render thread reads without writer's coordination auto new_value = p->value + x; @@ -816,15 +827,25 @@ std::span render(ArenaAllocator &arena) { for (const auto &[thread_id, per_thread] : family->perThreadState) { for (const auto &[labels_key, instance] : per_thread.instances) { // Extract data under lock - minimize critical section + // Pre-allocate vectors to avoid malloc inside critical section + // Note: thresholds and counts sizes never change after histogram + // creation std::vector thresholds_snapshot; std::vector counts_snapshot; double sum_snapshot; uint64_t observations_snapshot; + // Pre-allocate outside critical section using immutable sizes + thresholds_snapshot.resize(instance->thresholds.size()); + counts_snapshot.resize(instance->counts.size()); + + // Copy data with minimal critical section { std::lock_guard lock(instance->mutex); - thresholds_snapshot = instance->thresholds; - counts_snapshot = instance->counts; + std::memcpy(thresholds_snapshot.data(), instance->thresholds.data(), + instance->thresholds.size() * sizeof(double)); + std::memcpy(counts_snapshot.data(), instance->counts.data(), + instance->counts.size() * sizeof(uint64_t)); sum_snapshot = instance->sum; observations_snapshot = instance->observations; }