Actually have contention in benchmark

2025-08-29 17:05:21 -04:00
parent 91e799aae8
commit 5592d065de
2 changed files with 100 additions and 155 deletions


@@ -25,6 +25,12 @@
#include "format.hpp"
// Verify that malloc provides sufficient alignment for atomic 128-bit
// operations
static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ >= 16,
"Default new alignment must be at least 16 bytes for atomic "
"128-bit stores");
// WeaselDB Metrics System Design:
//
// THREADING MODEL:
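A note on the static_assert above: std::vector's heap buffer comes from operator new, so its base address inherits __STDCPP_DEFAULT_NEW_ALIGNMENT__. A minimal sketch of the runtime counterpart of that guarantee (the helper is illustrative, not part of this commit):
#include <cstddef>
#include <cstdint>
#include <vector>
// Hypothetical helper: counts[i] starts a 16-byte-aligned pair whenever the
// buffer honors the default new alignment and i is even, which is the
// precondition for the atomic 128-bit stores used later in this file.
static bool is_16b_aligned(const std::vector<uint64_t> &counts,
                           std::size_t i) {
  return (reinterpret_cast<std::uintptr_t>(&counts[i]) & 15) == 0;
}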
@@ -157,8 +163,8 @@ struct Gauge::State {
struct Histogram::State {
std::vector<double>
thresholds; // Bucket boundaries (sorted, deduplicated, includes +Inf)
std::vector<uint64_t>
counts; // Count per bucket - single writer, no atomics needed
std::vector<uint64_t> counts; // Count per bucket - single writer, malloc
// provides 16-byte alignment
AtomicWord sum; // Sum of observations (double stored as uint64_t bits)
AtomicWord observations; // Total observation count (uint64_t)
friend struct Metric;
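Given this single-writer layout, a concurrent reader could snapshot the buckets roughly as in the sketch below, assuming the writer publishes each bucket with a relaxed atomic store as the code later in this diff does (snapshot_counts is illustrative, not part of this commit):
#include <cstddef>
#include <cstdint>
#include <vector>
// Hypothetical reader-side snapshot: relaxed atomic loads pair with the
// writer's relaxed atomic stores, so each bucket value is read torn-free.
static std::vector<uint64_t>
snapshot_counts(const std::vector<uint64_t> &counts) {
  std::vector<uint64_t> out(counts.size());
  for (std::size_t i = 0; i < counts.size(); ++i)
    out[i] = __atomic_load_n(&counts[i], __ATOMIC_RELAXED);
  return out;
}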
@@ -286,7 +292,7 @@ void Counter::inc(double x) {
std::to_string(new_value).c_str());
}
p->value = new_value;
__atomic_store(&p->value, &new_value, __ATOMIC_RELAXED);
}
Gauge::Gauge() = default;
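For reference, the reader side that pairs with the relaxed __atomic_store in Counter::inc above might look like this sketch (read_counter_value is illustrative, not an API from this commit):
// Hypothetical reader: the generic __atomic_load pairs with the writer's
// relaxed __atomic_store, yielding a torn-free double without locking.
static double read_counter_value(const double *value) {
  double out;
  __atomic_load(value, &out, __ATOMIC_RELAXED);
  return out;
}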
@@ -321,55 +327,74 @@ void Gauge::set(double x) {
Histogram::Histogram() = default;
// Vectorized histogram bucket updates using single-writer + atomic-read design
// Since histograms have single-writer semantics, we can bypass atomic writes!
// Since histograms have single-writer semantics, we can use architecturally
// atomic stores
// Default implementation
__attribute__((target("default"))) static void
update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
const size_t size = thresholds.size();
// Single writer - simple increment, no atomics needed
for (size_t i = start_idx; i < size; ++i) {
if (x <= thresholds[i]) {
counts[i]++;
}
}
}
// AVX2 version - true vectorization with direct memory access
#ifdef __x86_64__
__attribute__((target("avx2"))) static void
update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
// x86-64: 128-bit vectorized with inline assembly atomic stores
__attribute__((target("avx"))) static void
update_histogram_buckets(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
const size_t size = thresholds.size();
size_t i = start_idx;
// Process 4 buckets at a time with AVX2
const __m256d x_vec = _mm256_set1_pd(x);
// Process 2 buckets at a time with 128-bit vectors + inline assembly
const __m128d x_vec = _mm_set1_pd(x);
for (; i + 4 <= size; i += 4) {
// Vectorized comparison
__m256d thresholds_vec = _mm256_loadu_pd(&thresholds[i]);
__m256d cmp_result = _mm256_cmp_pd(x_vec, thresholds_vec, _CMP_LE_OQ);
for (; i + 2 <= size; i += 2) {
// Ensure alignment for atomic guarantee (malloc provides 16-byte alignment)
assert((reinterpret_cast<uintptr_t>(static_cast<void *>(&counts[i])) &
15) == 0 &&
"counts array must be 16-byte aligned for atomic 128-bit stores");
// Convert to increment mask
__m256i cmp_as_int = _mm256_castpd_si256(cmp_result);
__m256i ones = _mm256_set1_epi64x(1);
__m256i increments = _mm256_and_si256(cmp_as_int, ones);
// 128-bit vectorized comparison and arithmetic
__m128d thresholds_vec = _mm_loadu_pd(&thresholds[i]);
__m128d cmp_result = _mm_cmp_pd(x_vec, thresholds_vec, _CMP_LE_OQ);
__m128i cmp_as_int = _mm_castpd_si128(cmp_result);
__m128i ones = _mm_set1_epi64x(1);
__m128i increments = _mm_and_si128(cmp_as_int, ones);
// Vectorized 4-lane add directly to memory
__m256i current_counts = _mm256_loadu_si256((__m256i *)&counts[i]);
__m256i updated_counts = _mm256_add_epi64(current_counts, increments);
_mm256_storeu_si256((__m256i *)&counts[i], updated_counts);
// Load current counts and add increments
__m128i current_counts = _mm_load_si128((__m128i *)&counts[i]);
__m128i updated_counts = _mm_add_epi64(current_counts, increments);
// Per the Intel SDM: processors that enumerate support for Intel® AVX (by
// setting the feature flag CPUID.01H:ECX.AVX[bit 28]) guarantee that the
// 16-byte memory operations performed by the following instructions will
// always be carried out atomically:
//   • MOVAPD, MOVAPS, and MOVDQA.
//   • VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
//   • VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with
//     EVEX.128 and k0 (masking disabled).
// (Note that these instructions require the linear addresses of their
// memory operands to be 16-byte aligned.)
__asm__ __volatile__(
"vmovdqa %1, %0" // %1: compiler-chosen xmm register holding updated_counts
: "=m"(*((__m128i *)&counts[i])) // Output: aligned memory location
: "x"(updated_counts) // Input: xmm register
: "memory" // Memory clobber
);
}
// Handle remainder
// Handle remainder with atomic stores
for (; i < size; ++i) {
if (x <= thresholds[i]) {
counts[i]++;
__atomic_store_n(&counts[i], counts[i] + 1, __ATOMIC_RELAXED);
}
}
}
#else
// Fallback implementation for non-x86 architectures
static void
update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
const size_t size = thresholds.size();
// Scalar implementation with atomic stores for TSAN compatibility
for (size_t i = start_idx; i < size; ++i) {
if (x <= thresholds[i]) {
__atomic_store_n(&counts[i], counts[i] + 1, __ATOMIC_RELAXED);
}
}
}
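The SDM passage quoted in the x86-64 path covers 16-byte loads as well as stores, so a reader could in principle snapshot a bucket pair atomically with an aligned VEX.128 load. A sketch under that assumption (load_bucket_pair is illustrative, not part of this commit):
#ifdef __x86_64__
#include <cstdint>
#include <immintrin.h>
// Hypothetical AVX reader: an aligned VEX.128 vmovdqa load is atomic on
// AVX-capable CPUs, mirroring the writer's vmovdqa store above.
__attribute__((target("avx"))) static __m128i
load_bucket_pair(const uint64_t *p) { // p must be 16-byte aligned
  __m128i out;
  __asm__ __volatile__("vmovdqa %1, %0"
                       : "=x"(out)                // Output: xmm register
                       : "m"(*(const __m128i *)p) // Input: aligned memory
                       : "memory");               // Order against other ops
  return out;
}
#endif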
@@ -378,9 +403,7 @@ update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
void Histogram::observe(double x) {
assert(p->thresholds.size() == p->counts.size());
// Use multiversioned auto-vectorized function
// Compiler automatically selects best implementation for current CPU
update_histogram_buckets_vectorized(p->thresholds, p->counts, x, 0);
update_histogram_buckets(p->thresholds, p->counts, x, 0);
// DESIGN: Single writer per thread allows simple load-modify-store for sum
// No CAS loop needed since only one thread writes to this histogram
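That load-modify-store might look like the following sketch, assuming sum holds the double's bit pattern in a uint64_t as the State comment says (add_to_sum is illustrative, not part of this commit):
#include <cstdint>
#include <cstring>
// Hypothetical single-writer update of the sum word: no CAS loop is needed
// because only this thread writes; concurrent readers only load.
static void add_to_sum(uint64_t *sum_bits, double x) {
  uint64_t old_bits = __atomic_load_n(sum_bits, __ATOMIC_RELAXED);
  double sum;
  std::memcpy(&sum, &old_bits, sizeof sum); // bits -> double
  sum += x;
  uint64_t new_bits;
  std::memcpy(&new_bits, &sum, sizeof new_bits); // double -> bits
  __atomic_store_n(sum_bits, new_bits, __ATOMIC_RELAXED);
}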
@@ -830,11 +853,6 @@ void Family<Gauge>::register_callback(
}
// Explicit template instantiations to provide member implementations
template void Family<Counter>::register_callback(
std::vector<std::pair<std::string, std::string>>, MetricCallback<Counter>);
template void Family<Gauge>::register_callback(
std::vector<std::pair<std::string, std::string>>, MetricCallback<Gauge>);
// Static member definitions
std::mutex Metric::mutex;