Add some clarifying implementation comments
@@ -421,6 +421,10 @@ struct Metric {
   ~ThreadInit() {
     // Accumulate thread-local state into global state before cleanup
     std::unique_lock<std::mutex> _{mutex};
+    // NOTE: registration_version increment is REQUIRED here because:
+    // - Cached render plan has pre-resolved pointers to thread-local state
+    // - When threads disappear, these pointers become invalid
+    // - Cache invalidation forces rebuild with updated pointer sets
     ++Metric::registration_version;
     auto thread_id = std::this_thread::get_id();
 
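The invalidation contract described by the new comment has a consumer side on the render path. A minimal, self-contained sketch of that pattern follows; Registry, Plan, and plan() are illustrative names, not the actual members of this file:

#include <cstdint>
#include <memory>
#include <mutex>

// Sketch: a cached plan holding raw pointers into per-thread state is rebuilt
// whenever the global registration version has moved since the plan was built.
struct Plan { /* pre-resolved pointers to thread-local state would live here */ };

struct Registry {
  static inline std::mutex mutex;
  static inline uint64_t registration_version = 0;  // bumped when threads register/exit

  std::unique_ptr<Plan> cached_plan;
  uint64_t cached_version = ~0ull;                   // force a rebuild on first use

  Plan &plan() {
    std::unique_lock<std::mutex> lock{mutex};        // same mutex the destructor holds
    if (!cached_plan || cached_version != registration_version) {
      cached_plan = std::make_unique<Plan>();        // re-resolve per-thread pointers here
      cached_version = registration_version;
    }
    return *cached_plan;                             // no dangling pointers to dead threads
  }
};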
@@ -994,6 +998,7 @@ struct Metric {
     // Sum thread-local values
     for (auto *state_ptr : instruction.aggregate_counter.thread_states) {
       double value;
+      // NOTE: __atomic_load works on plain double (not atomic<double>)
       __atomic_load(&state_ptr->value, &value, __ATOMIC_RELAXED);
       total_value += value;
     }
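The __atomic builtins used here are the GCC/Clang generic versions, which accept a plain double. A minimal sketch of how a counter can stay a raw double on the write path while still being readable tear-free from other threads; Counter is an illustrative type and the writer-side scheme is an assumption, not taken from this file:

#include <cstdio>

// Sketch: only the owning thread writes, so a relaxed load/modify/store is enough;
// readers on other threads use a relaxed __atomic_load to avoid torn 8-byte reads.
struct Counter {
  double value = 0.0;

  void add(double delta) {               // called only by the owning thread (assumption)
    double v;
    __atomic_load(&value, &v, __ATOMIC_RELAXED);
    v += delta;
    __atomic_store(&value, &v, __ATOMIC_RELAXED);
  }

  double read() {                         // may be called from any thread
    double v;
    __atomic_load(&value, &v, __ATOMIC_RELAXED);
    return v;
  }
};

int main() {
  Counter c;
  c.add(2.5);
  std::printf("%g\n", c.read());          // prints 2.5
}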
@@ -1079,6 +1084,12 @@ struct Metric {
 
     for (size_t i = 0; i < static_text.size(); ++i) {
       // Copy static text into caller's arena
+      // NOTE: This copying is REQUIRED for memory safety:
+      // - static_text lives in cached_plan's arena (persistent across renders)
+      // - dynamic_text lives in caller's arena (single render lifetime)
+      // - output must live entirely in caller's arena for consistent lifetime
+      // Without copying, output would have mixed arena ownership causing
+      // use-after-free
       output.push_back(arena_copy_string(static_text[i], arena));
 
       // Add corresponding dynamic text
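A toy sketch of the ownership rule the new comment spells out: everything pushed into the output must reference memory owned by the caller's arena, so plan-owned text is copied into it first. Arena and append_static below are stand-ins, not the real allocator or render code:

#include <cstring>
#include <memory>
#include <string_view>
#include <vector>

// Toy arena: copying a string into it yields a view whose lifetime matches the arena.
struct Arena {
  std::vector<std::unique_ptr<char[]>> blocks;

  std::string_view copy(std::string_view s) {
    auto buf = std::make_unique<char[]>(s.size());
    std::memcpy(buf.get(), s.data(), s.size());
    std::string_view out{buf.get(), s.size()};
    blocks.push_back(std::move(buf));
    return out;                            // valid for as long as this arena is alive
  }
};

// All views in `output` now reference the caller's arena, regardless of where
// the originals lived, so the whole output shares one lifetime.
void append_static(std::string_view plan_owned_text, Arena &arena,
                   std::vector<std::string_view> &output) {
  output.push_back(arena.copy(plan_owned_text));
}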
@@ -1279,9 +1290,15 @@ update_histogram_buckets_simd(std::span<const double> thresholds,
     __m128d cmp_result = _mm_cmp_pd(x_vec, thresholds_vec, _CMP_LE_OQ);
     __m128i cmp_as_int = _mm_castpd_si128(cmp_result);
     __m128i ones = _mm_set1_epi64x(1);
+    // NOTE: _mm_cmp_pd returns 0xFFFFFFFFFFFFFFFF for true, 0x0000000000000000
+    // for false. The AND with ones converts this to 1 or 0 per 64-bit lane
+    // (extracts the low bit).
     __m128i increments = _mm_and_si128(cmp_as_int, ones);
 
     // Load current counts and add increments
+    // NOTE: Using unaligned load/store (_mm_loadu_si128/_mm_storeu_si128)
+    // because uint64_t arrays from the arena allocator are not guaranteed to
+    // be 16-byte aligned
     __m128i current_counts = _mm_loadu_si128((__m128i *)&counts[i]);
     __m128i updated_counts = _mm_add_epi64(current_counts, increments);
 
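For reference, a scalar equivalent of the two-lane SIMD step above, written out to make the mask trick explicit: the all-ones/all-zeros compare mask ANDed with 1 is exactly the 0-or-1 increment added to each bucket. The function name and loop are illustrative, not part of this file:

#include <cstddef>
#include <cstdint>
#include <span>

// Scalar version: one bucket per iteration instead of two per __m128i lane pair.
void update_histogram_buckets_scalar(std::span<const double> thresholds,
                                     std::span<uint64_t> counts, double x) {
  for (std::size_t i = 0; i < thresholds.size(); ++i) {
    // Compare produces an all-ones or all-zeros 64-bit mask, mirroring _mm_cmp_pd.
    uint64_t mask = (x <= thresholds[i]) ? ~uint64_t{0} : uint64_t{0};
    // AND with 1 extracts the low bit, turning the mask into a 0/1 increment.
    counts[i] += mask & 1;
  }
}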