From b3e48b904a6be09ce0617b5c3fd0e32b7c0722fd Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 3 Sep 2025 11:01:01 -0400 Subject: [PATCH] Add some clarifying implementation comments --- src/metric.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/metric.cpp b/src/metric.cpp index 6dfa1b2..c7eb05d 100644 --- a/src/metric.cpp +++ b/src/metric.cpp @@ -421,6 +421,10 @@ struct Metric { ~ThreadInit() { // Accumulate thread-local state into global state before cleanup std::unique_lock _{mutex}; + // NOTE: registration_version increment is REQUIRED here because: + // - Cached render plan has pre-resolved pointers to thread-local state + // - When threads disappear, these pointers become invalid + // - Cache invalidation forces rebuild with updated pointer sets ++Metric::registration_version; auto thread_id = std::this_thread::get_id(); @@ -994,6 +998,7 @@ struct Metric { // Sum thread-local values for (auto *state_ptr : instruction.aggregate_counter.thread_states) { double value; + // NOTE: __atomic_load works on plain double (not atomic) __atomic_load(&state_ptr->value, &value, __ATOMIC_RELAXED); total_value += value; } @@ -1079,6 +1084,12 @@ struct Metric { for (size_t i = 0; i < static_text.size(); ++i) { // Copy static text into caller's arena + // NOTE: This copying is REQUIRED for memory safety: + // - static_text lives in cached_plan's arena (persistent across renders) + // - dynamic_text lives in caller's arena (single render lifetime) + // - output must live entirely in caller's arena for consistent lifetime + // Without copying, output would have mixed arena ownership causing + // use-after-free output.push_back(arena_copy_string(static_text[i], arena)); // Add corresponding dynamic text @@ -1279,9 +1290,15 @@ update_histogram_buckets_simd(std::span thresholds, __m128d cmp_result = _mm_cmp_pd(x_vec, thresholds_vec, _CMP_LE_OQ); __m128i cmp_as_int = _mm_castpd_si128(cmp_result); __m128i ones = _mm_set1_epi64x(1); + // NOTE: 
_mm_cmp_pd returns 0xFFFFFFFFFFFFFFFF for true, 0x0000000000000000 + // for false. The AND with ones converts this to 1 or 0 per 64-bit lane + // (extracts low bit) __m128i increments = _mm_and_si128(cmp_as_int, ones); // Load current counts and add increments + // NOTE: Using unaligned load/store (_mm_loadu_si128/_mm_storeu_si128) + // because uint64_t arrays from arena allocator are not guaranteed 16-byte + // aligned __m128i current_counts = _mm_loadu_si128((__m128i *)&counts[i]); __m128i updated_counts = _mm_add_epi64(current_counts, increments);