Actually have contention in benchmark

2025-08-29 17:05:21 -04:00
parent 91e799aae8
commit 5592d065de
2 changed files with 100 additions and 155 deletions


@@ -25,6 +25,12 @@
#include "format.hpp"
// Verify that malloc provides sufficient alignment for atomic 128-bit
// operations
static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ >= 16,
"Default new alignment must be at least 16 bytes for atomic "
"128-bit stores");
// WeaselDB Metrics System Design:
//
// THREADING MODEL:
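A note on the static_assert above: std::vector's heap buffer comes from operator new, so its base address inherits __STDCPP_DEFAULT_NEW_ALIGNMENT__. A minimal sketch of the runtime counterpart of that guarantee (the helper is illustrative, not part of this commit):
#include <cstddef>
#include <cstdint>
#include <vector>
// Hypothetical helper: counts[i] starts a 16-byte-aligned pair whenever the
// buffer honors the default new alignment and i is even, which is the
// precondition for the atomic 128-bit stores used later in this file.
static bool is_16b_aligned(const std::vector<uint64_t> &counts,
                           std::size_t i) {
  return (reinterpret_cast<std::uintptr_t>(&counts[i]) & 15) == 0;
}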
@@ -157,8 +163,8 @@ struct Gauge::State {
struct Histogram::State {
std::vector<double>
thresholds; // Bucket boundaries (sorted, deduplicated, includes +Inf)
std::vector<uint64_t>
counts; // Count per bucket - single writer, no atomics needed
std::vector<uint64_t> counts; // Count per bucket - single writer, malloc
// provides 16-byte alignment
AtomicWord sum; // Sum of observations (double stored as uint64_t bits)
AtomicWord observations; // Total observation count (uint64_t)
friend struct Metric;
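Given this single-writer layout, a concurrent reader could snapshot the buckets roughly as in the sketch below, assuming the writer publishes each bucket with a relaxed atomic store as the code later in this diff does (snapshot_counts is illustrative, not part of this commit):
#include <cstddef>
#include <cstdint>
#include <vector>
// Hypothetical reader-side snapshot: relaxed atomic loads pair with the
// writer's relaxed atomic stores, so each bucket value is read torn-free.
static std::vector<uint64_t>
snapshot_counts(const std::vector<uint64_t> &counts) {
  std::vector<uint64_t> out(counts.size());
  for (std::size_t i = 0; i < counts.size(); ++i)
    out[i] = __atomic_load_n(&counts[i], __ATOMIC_RELAXED);
  return out;
}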
@@ -286,7 +292,7 @@ void Counter::inc(double x) {
std::to_string(new_value).c_str());
}
p->value = new_value;
__atomic_store(&p->value, &new_value, __ATOMIC_RELAXED);
}
Gauge::Gauge() = default;
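For reference, the reader side that pairs with the relaxed __atomic_store in Counter::inc above might look like this sketch (read_counter_value is illustrative, not an API from this commit):
// Hypothetical reader: the generic __atomic_load pairs with the writer's
// relaxed __atomic_store, yielding a torn-free double without locking.
static double read_counter_value(const double *value) {
  double out;
  __atomic_load(value, &out, __ATOMIC_RELAXED);
  return out;
}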
@@ -321,55 +327,74 @@ void Gauge::set(double x) {
Histogram::Histogram() = default;
// Vectorized histogram bucket updates using single-writer + atomic-read design
// Since histograms have single-writer semantics, we can bypass atomic writes!
// Since histograms have single-writer semantics, we can use architecturally
// atomic stores
// Default implementation
__attribute__((target("default"))) static void
update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
const size_t size = thresholds.size();
// Single writer - simple increment, no atomics needed
for (size_t i = start_idx; i < size; ++i) {
if (x <= thresholds[i]) {
counts[i]++;
}
}
}
// AVX2 version - true vectorization with direct memory access
#ifdef __x86_64__
__attribute__((target("avx2"))) static void
update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
// x86-64: 128-bit vectorized with inline assembly atomic stores
__attribute__((target("avx"))) static void
update_histogram_buckets(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
const size_t size = thresholds.size();
size_t i = start_idx;
// Process 4 buckets at a time with AVX2
const __m256d x_vec = _mm256_set1_pd(x);
// Process 2 buckets at a time with 128-bit vectors + inline assembly
const __m128d x_vec = _mm_set1_pd(x);
for (; i + 4 <= size; i += 4) {
// Vectorized comparison
__m256d thresholds_vec = _mm256_loadu_pd(&thresholds[i]);
__m256d cmp_result = _mm256_cmp_pd(x_vec, thresholds_vec, _CMP_LE_OQ);
for (; i + 2 <= size; i += 2) {
// Ensure alignment for atomic guarantee (malloc provides 16-byte alignment)
assert((reinterpret_cast<uintptr_t>(static_cast<void *>(&counts[i])) &
15) == 0 &&
"counts array must be 16-byte aligned for atomic 128-bit stores");
// Convert to increment mask
__m256i cmp_as_int = _mm256_castpd_si256(cmp_result);
__m256i ones = _mm256_set1_epi64x(1);
__m256i increments = _mm256_and_si256(cmp_as_int, ones);
// 128-bit vectorized comparison and arithmetic
__m128d thresholds_vec = _mm_loadu_pd(&thresholds[i]);
__m128d cmp_result = _mm_cmp_pd(x_vec, thresholds_vec, _CMP_LE_OQ);
__m128i cmp_as_int = _mm_castpd_si128(cmp_result);
__m128i ones = _mm_set1_epi64x(1);
__m128i increments = _mm_and_si128(cmp_as_int, ones);
// Vectorized 4-lane add directly to memory
__m256i current_counts = _mm256_loadu_si256((__m256i *)&counts[i]);
__m256i updated_counts = _mm256_add_epi64(current_counts, increments);
_mm256_storeu_si256((__m256i *)&counts[i], updated_counts);
// Load current counts and add increments
__m128i current_counts = _mm_load_si128((__m128i *)&counts[i]);
__m128i updated_counts = _mm_add_epi64(current_counts, increments);
// Per the Intel SDM: processors that enumerate support for Intel® AVX (by
// setting the feature flag CPUID.01H:ECX.AVX[bit 28]) guarantee that the
// 16-byte memory operations performed by the following instructions will
// always be carried out atomically:
//   • MOVAPD, MOVAPS, and MOVDQA.
//   • VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
//   • VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with
//     EVEX.128 and k0 (masking disabled).
// (Note that these instructions require the linear addresses of their
// memory operands to be 16-byte aligned.)
__asm__ __volatile__(
"vmovdqa %1, %0" // %1: compiler-chosen xmm register holding updated_counts
: "=m"(*((__m128i *)&counts[i])) // Output: aligned memory location
: "x"(updated_counts) // Input: xmm register
: "memory" // Memory clobber
);
}
// Handle remainder
// Handle remainder with atomic stores
for (; i < size; ++i) {
if (x <= thresholds[i]) {
counts[i]++;
__atomic_store_n(&counts[i], counts[i] + 1, __ATOMIC_RELAXED);
}
}
}
#else
// Fallback implementation for non-x86 architectures
static void
update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
std::vector<uint64_t> &counts, double x,
size_t start_idx) {
const size_t size = thresholds.size();
// Scalar implementation with atomic stores for TSAN compatibility
for (size_t i = start_idx; i < size; ++i) {
if (x <= thresholds[i]) {
__atomic_store_n(&counts[i], counts[i] + 1, __ATOMIC_RELAXED);
}
}
}
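The SDM passage quoted in the x86-64 path covers 16-byte loads as well as stores, so a reader could in principle snapshot a bucket pair atomically with an aligned VEX.128 load. A sketch under that assumption (load_bucket_pair is illustrative, not part of this commit):
#ifdef __x86_64__
#include <cstdint>
#include <immintrin.h>
// Hypothetical AVX reader: an aligned VEX.128 vmovdqa load is atomic on
// AVX-capable CPUs, mirroring the writer's vmovdqa store above.
__attribute__((target("avx"))) static __m128i
load_bucket_pair(const uint64_t *p) { // p must be 16-byte aligned
  __m128i out;
  __asm__ __volatile__("vmovdqa %1, %0"
                       : "=x"(out)                // Output: xmm register
                       : "m"(*(const __m128i *)p) // Input: aligned memory
                       : "memory");               // Order against other ops
  return out;
}
#endif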
@@ -378,9 +403,7 @@ update_histogram_buckets_vectorized(const std::vector<double> &thresholds,
void Histogram::observe(double x) {
assert(p->thresholds.size() == p->counts.size());
// Use multiversioned auto-vectorized function
// Compiler automatically selects best implementation for current CPU
update_histogram_buckets_vectorized(p->thresholds, p->counts, x, 0);
update_histogram_buckets(p->thresholds, p->counts, x, 0);
// DESIGN: Single writer per thread allows simple load-modify-store for sum
// No CAS loop needed since only one thread writes to this histogram
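That load-modify-store might look like the following sketch, assuming sum holds the double's bit pattern in a uint64_t as the State comment says (add_to_sum is illustrative, not part of this commit):
#include <cstdint>
#include <cstring>
// Hypothetical single-writer update of the sum word: no CAS loop is needed
// because only this thread writes; concurrent readers only load.
static void add_to_sum(uint64_t *sum_bits, double x) {
  uint64_t old_bits = __atomic_load_n(sum_bits, __ATOMIC_RELAXED);
  double sum;
  std::memcpy(&sum, &old_bits, sizeof sum); // bits -> double
  sum += x;
  uint64_t new_bits;
  std::memcpy(&new_bits, &sum, sizeof new_bits); // double -> bits
  __atomic_store_n(sum_bits, new_bits, __ATOMIC_RELAXED);
}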
@@ -830,11 +853,6 @@ void Family<Gauge>::register_callback(
}
// Explicit template instantiations to provide member implementations
template void Family<Counter>::register_callback(
std::vector<std::pair<std::string, std::string>>, MetricCallback<Counter>);
template void Family<Gauge>::register_callback(
std::vector<std::pair<std::string, std::string>>, MetricCallback<Gauge>);
// Static member definitions
std::mutex Metric::mutex;