diff --git a/benchmarks/bench_metric.cpp b/benchmarks/bench_metric.cpp
index d424719..74b45f0 100644
--- a/benchmarks/bench_metric.cpp
+++ b/benchmarks/bench_metric.cpp
@@ -34,9 +34,12 @@ struct ContentionEnvironment {
       : counter_family(
             metric::create_counter("bench_counter", "Benchmark counter")),
         gauge_family(metric::create_gauge("bench_gauge", "Benchmark gauge")),
-        histogram_family(metric::create_histogram(
-            "bench_histogram", "Benchmark histogram",
-            std::initializer_list<double>{0.1, 0.5, 1.0, 2.5, 5.0})),
+        histogram_family(
+            metric::create_histogram("bench_histogram", "Benchmark histogram",
+                                     // 7 explicit buckets + automatic +Inf = 8
+                                     // total (optimal for SIMD: 2x4 buckets)
+                                     std::initializer_list<double>{
+                                         0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0})),
         counter(counter_family.create({{"benchmark", "contention"}})),
         gauge(gauge_family.create({{"benchmark", "contention"}})),
         histogram(histogram_family.create({{"benchmark", "contention"}})) {}
@@ -133,9 +136,9 @@ int main() {
       ankerl::nanobench::doNotOptimizeAway(gauge);
     });
 
-    auto histogram_family =
-        metric::create_histogram("baseline_histogram", "Baseline histogram",
-                                 std::initializer_list<double>{0.1, 0.5, 1.0});
+    auto histogram_family = metric::create_histogram(
+        "baseline_histogram", "Baseline histogram",
+        std::initializer_list<double>{0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0});
     auto histogram = histogram_family.create({{"type", "baseline"}});
 
     bench.run("histogram.observe() - no contention", [&]() {
@@ -244,9 +247,9 @@ int main() {
     auto counter_family =
         metric::create_counter("scale_counter", "Scale counter");
     auto gauge_family = metric::create_gauge("scale_gauge", "Scale gauge");
-    auto histogram_family =
-        metric::create_histogram("scale_histogram", "Scale histogram",
-                                 std::initializer_list<double>{0.1, 0.5, 1.0});
+    auto histogram_family = metric::create_histogram(
+        "scale_histogram", "Scale histogram",
+        std::initializer_list<double>{0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0});
 
     // Create varying numbers of metrics
     for (int scale : {10, 100, 1000}) {
diff --git a/src/metric.cpp b/src/metric.cpp
index 4aa5e74..1cd5af4 100644
--- a/src/metric.cpp
+++ b/src/metric.cpp
@@ -20,6 +20,7 @@
 
 #include
 #include
+#include <immintrin.h>
 #include
 
 #include "format.hpp"
@@ -142,7 +143,7 @@ template <> struct Family::State {
 
 // Counter: Thread-local, monotonically increasing, single writer per thread
 struct Counter::State {
-  AtomicWord value; // Stores double as uint64_t bits
+  double value; // Single writer, no atomics needed
   friend struct Metric;
 };
@@ -156,7 +157,8 @@ struct Gauge::State {
 struct Histogram::State {
   std::vector<double>
       thresholds; // Bucket boundaries (sorted, deduplicated, includes +Inf)
-  std::vector<AtomicWord> counts;
+  std::vector<uint64_t>
+      counts; // Count per bucket - single writer, no atomics needed
   AtomicWord sum;          // Sum of observations (double stored as uint64_t bits)
   AtomicWord observations; // Total observation count (uint64_t)
   friend struct Metric;
@@ -218,7 +220,7 @@ struct Metric {
         family->p->perThreadState[std::this_thread::get_id()].instances[key];
     if (!ptr) {
       ptr = std::make_unique<Counter::State>();
-      ptr->value.store(0, std::memory_order_relaxed);
+      ptr->value = 0.0;
     }
     Counter result;
     result.p = ptr.get();
@@ -260,13 +262,8 @@ struct Metric {
 
       // Use buckets from family configuration
       ptr->thresholds = family->p->buckets; // Already sorted and deduplicated
-      // DESIGN: std::atomic is not copy-constructible
-      // Initialize vector with correct size, all atomics explicitly initialized
-      // to 0
-      ptr->counts = std::vector<AtomicWord>(ptr->thresholds.size());
-      for (auto &count : ptr->counts) {
-        count.store(0, std::memory_order_relaxed);
-      }
+      // Single writer semantics - no atomics needed for bucket counts
+      ptr->counts = std::vector<uint64_t>(ptr->thresholds.size(), 0);
       ptr->sum.store(0, std::memory_order_relaxed);
       ptr->observations.store(0, std::memory_order_relaxed);
     }
@@ -279,19 +276,17 @@
 Counter::Counter() = default;
 
 void Counter::inc(double x) {
-  // DESIGN: Single writer per thread allows simple load-modify-store
-  // No CAS loop needed since only one thread writes to this counter
-  auto current_value =
-      std::bit_cast<double>(p->value.load(std::memory_order_relaxed));
-  auto new_value = current_value + x;
+  // DESIGN: Single writer per thread allows simple increment
+  // No atomics needed since only one thread writes to this counter
+  auto new_value = p->value + x;
 
   // Validate monotonic property (counter never decreases)
-  if (new_value < current_value) [[unlikely]] {
+  if (new_value < p->value) [[unlikely]] {
     validate_or_abort(false, "counter value overflow/wraparound detected",
                       std::to_string(new_value).c_str());
   }
 
-  p->value.store(std::bit_cast<uint64_t>(new_value), std::memory_order_relaxed);
+  p->value = new_value;
 }
 
 Gauge::Gauge() = default;
@@ -325,14 +320,67 @@ void Gauge::set(double x) {
 
 Histogram::Histogram() = default;
 
+// Vectorized histogram bucket updates using single-writer + atomic-read design
+// Since histograms have single-writer semantics, we can bypass atomic writes!
+
+// Default implementation
+__attribute__((target("default"))) static void
+update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
+                                    std::vector<uint64_t> &counts, double x,
+                                    size_t start_idx) {
+  const size_t size = thresholds.size();
+
+  // Single writer - simple increment, no atomics needed
+  for (size_t i = start_idx; i < size; ++i) {
+    if (x <= thresholds[i]) {
+      counts[i]++;
+    }
+  }
+}
+
+// AVX2 version - true vectorization with direct memory access
+#ifdef __x86_64__
+__attribute__((target("avx2"))) static void
+update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
+                                    std::vector<uint64_t> &counts, double x,
+                                    size_t start_idx) {
+  const size_t size = thresholds.size();
+  size_t i = start_idx;
+
+  // Process 4 buckets at a time with AVX2
+  const __m256d x_vec = _mm256_set1_pd(x);
+
+  for (; i + 4 <= size; i += 4) {
+    // Vectorized comparison
+    __m256d thresholds_vec = _mm256_loadu_pd(&thresholds[i]);
+    __m256d cmp_result = _mm256_cmp_pd(x_vec, thresholds_vec, _CMP_LE_OQ);
+
+    // Convert to increment mask
+    __m256i cmp_as_int = _mm256_castpd_si256(cmp_result);
+    __m256i ones = _mm256_set1_epi64x(1);
+    __m256i increments = _mm256_and_si256(cmp_as_int, ones);
+
+    // Vectorized 4-lane add directly to memory
+    __m256i current_counts = _mm256_loadu_si256((__m256i *)&counts[i]);
+    __m256i updated_counts = _mm256_add_epi64(current_counts, increments);
+    _mm256_storeu_si256((__m256i *)&counts[i], updated_counts);
+  }
+
+  // Handle remainder
+  for (; i < size; ++i) {
+    if (x <= thresholds[i]) {
+      counts[i]++;
+    }
+  }
+}
+#endif
+
 void Histogram::observe(double x) {
   assert(p->thresholds.size() == p->counts.size());
 
-  // Increment bucket counts (cumulative: each bucket counts all values <=
-  // threshold)
-  for (size_t i = 0; i < p->thresholds.size(); ++i) {
-    p->counts[i].fetch_add(x <= p->thresholds[i], std::memory_order_relaxed);
-  }
+  // Use the multiversioned bucket-update function; the ifunc dispatcher
+  // selects the best implementation for the current CPU at load time
+  update_histogram_buckets_vectorized(p->thresholds, p->counts, x, 0);
 
   // DESIGN: Single writer per thread allows simple load-modify-store for sum
   // No CAS loop needed since only one thread writes to this histogram
@@ -630,8 +678,10 @@ std::span render(ArenaAllocator &arena) {
     for (const auto &[thread_id, per_thread] : family->perThreadState) {
       for (const auto &[labels_key, instance] : per_thread.instances) {
-        auto value = std::bit_cast<double>(
-            instance->value.load(std::memory_order_relaxed));
+        // Atomic read from render thread - single writer doesn't need atomic
+        // writes
+        double value;
+        __atomic_load(&instance->value, &value, __ATOMIC_RELAXED);
 
         labels_sv.clear();
         for (const auto &l : labels_key.labels)
           labels_sv.push_back(l);
@@ -697,7 +747,9 @@ std::span render(ArenaAllocator &arena) {
           bucket_labels_sv.push_back(
              {"le", format(arena, "%.17g", instance->thresholds[i])});
         }
-        auto count = instance->counts[i].load(std::memory_order_relaxed);
+        // Atomic read from render thread - single writer doesn't need atomic
+        // writes
+        auto count = __atomic_load_n(&instance->counts[i], __ATOMIC_RELAXED);
         auto labels = format_labels(bucket_labels_sv);
         output.push_back(format(arena, "%s_bucket%.*s %llu\n", name.c_str(),
                                 static_cast<int>(labels.length()),
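
Reviewer note: below is a minimal standalone sketch of the single-writer pattern this patch adopts for counters, assuming GCC/Clang (__atomic builtins) and a naturally aligned 8-byte double. The names OwnedCounter, inc, and render_read are illustrative, not part of the library.

// Sketch only: single-writer value with an atomic reader, mirroring the
// patch's plain-write / __atomic_load split. Illustrative names throughout.
#include <cstdio>
#include <thread>

struct OwnedCounter {
  double value = 0.0; // written by exactly one thread, with plain stores
};

// Writer side: plain read-modify-write; no atomic RMW is needed because
// only one thread ever writes this counter.
void inc(OwnedCounter &c, double x) { c.value += x; }

// Reader side (the render thread): relaxed atomic load of the same
// location, so the 8-byte double is read in one shot rather than torn.
double render_read(OwnedCounter &c) {
  double v;
  __atomic_load(&c.value, &v, __ATOMIC_RELAXED);
  return v;
}

int main() {
  OwnedCounter c;
  std::thread writer([&] {
    for (int i = 0; i < 1'000'000; ++i) inc(c, 1.0);
  });
  writer.join(); // the sketch joins first; the real render thread reads live
  std::printf("final = %g\n", render_read(c));
}

The asymmetry is the point: the hot path (inc) stays fence-free and branch-light, and only the cold render path pays for the atomic load.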
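The multiversioned bucket update can likewise be exercised in isolation. The sketch below mirrors the patch's technique (a portable default plus a hand-vectorized AVX2 body, dispatched through GCC/Clang function multiversioning on x86-64); update_buckets is an illustrative name, not the patch's symbol. With 7 explicit thresholds plus the automatic +Inf bucket, the 8-element arrays are covered in exactly two 4-lane iterations, which is the rationale for the new bucket lists in the benchmarks.

// Sketch only: multiversioned cumulative-bucket update (x86-64, GCC/Clang).
// The loader's ifunc resolver picks the AVX2 body when the CPU supports it.
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>
#include <immintrin.h>

__attribute__((target("default"))) void
update_buckets(const std::vector<double> &t, std::vector<uint64_t> &c,
               double x) {
  for (size_t i = 0; i < t.size(); ++i)
    c[i] += (x <= t[i]); // cumulative: every bucket whose threshold >= x
}

__attribute__((target("avx2"))) void
update_buckets(const std::vector<double> &t, std::vector<uint64_t> &c,
               double x) {
  const __m256d xv = _mm256_set1_pd(x);
  size_t i = 0;
  for (; i + 4 <= t.size(); i += 4) {
    // x <= threshold yields an all-ones 64-bit lane; AND with 1 turns each
    // true lane into an increment of exactly 1.
    __m256d cmp = _mm256_cmp_pd(xv, _mm256_loadu_pd(&t[i]), _CMP_LE_OQ);
    __m256i inc =
        _mm256_and_si256(_mm256_castpd_si256(cmp), _mm256_set1_epi64x(1));
    __m256i cur = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&c[i]));
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(&c[i]),
                        _mm256_add_epi64(cur, inc));
  }
  for (; i < t.size(); ++i) // scalar tail for sizes not divisible by 4
    c[i] += (x <= t[i]);
}

int main() {
  // 7 explicit thresholds + automatic +Inf = 8 buckets -> two full AVX2 passes.
  std::vector<double> t{0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0,
                        std::numeric_limits<double>::infinity()};
  std::vector<uint64_t> c(t.size(), 0);
  for (double x : {0.05, 0.7, 3.0, 100.0})
    update_buckets(t, c, x);
  for (size_t i = 0; i < t.size(); ++i)
    std::printf("le=%g count=%llu\n", t[i], (unsigned long long)c[i]);
}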