Add some clarifying implementation comments
@@ -421,6 +421,10 @@ struct Metric {
   ~ThreadInit() {
     // Accumulate thread-local state into global state before cleanup
     std::unique_lock<std::mutex> _{mutex};
+    // NOTE: registration_version increment is REQUIRED here because:
+    // - Cached render plan has pre-resolved pointers to thread-local state
+    // - When threads disappear, these pointers become invalid
+    // - Cache invalidation forces rebuild with updated pointer sets
     ++Metric::registration_version;
     auto thread_id = std::this_thread::get_id();
 
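The invalidation contract described by the new comment has a consumer side on the render path. A minimal, self-contained sketch of that pattern follows; Registry, Plan, and plan() are illustrative names, not the actual members of this file:

#include <cstdint>
#include <memory>
#include <mutex>

// Sketch: a cached plan holding raw pointers into per-thread state is rebuilt
// whenever the global registration version has moved since the plan was built.
struct Plan { /* pre-resolved pointers to thread-local state would live here */ };

struct Registry {
  static inline std::mutex mutex;
  static inline uint64_t registration_version = 0;  // bumped when threads register/exit

  std::unique_ptr<Plan> cached_plan;
  uint64_t cached_version = ~0ull;                   // force a rebuild on first use

  Plan &plan() {
    std::unique_lock<std::mutex> lock{mutex};        // same mutex the destructor holds
    if (!cached_plan || cached_version != registration_version) {
      cached_plan = std::make_unique<Plan>();        // re-resolve per-thread pointers here
      cached_version = registration_version;
    }
    return *cached_plan;                             // no dangling pointers to dead threads
  }
};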
@@ -994,6 +998,7 @@ struct Metric {
     // Sum thread-local values
     for (auto *state_ptr : instruction.aggregate_counter.thread_states) {
       double value;
+      // NOTE: __atomic_load works on plain double (not atomic<double>)
       __atomic_load(&state_ptr->value, &value, __ATOMIC_RELAXED);
       total_value += value;
     }
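The __atomic builtins used here are the GCC/Clang generic versions, which accept a plain double. A minimal sketch of how a counter can stay a raw double on the write path while still being readable tear-free from other threads; Counter is an illustrative type and the writer-side scheme is an assumption, not taken from this file:

#include <cstdio>

// Sketch: only the owning thread writes, so a relaxed load/modify/store is enough;
// readers on other threads use a relaxed __atomic_load to avoid torn 8-byte reads.
struct Counter {
  double value = 0.0;

  void add(double delta) {               // called only by the owning thread (assumption)
    double v;
    __atomic_load(&value, &v, __ATOMIC_RELAXED);
    v += delta;
    __atomic_store(&value, &v, __ATOMIC_RELAXED);
  }

  double read() {                         // may be called from any thread
    double v;
    __atomic_load(&value, &v, __ATOMIC_RELAXED);
    return v;
  }
};

int main() {
  Counter c;
  c.add(2.5);
  std::printf("%g\n", c.read());          // prints 2.5
}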
@@ -1079,6 +1084,12 @@ struct Metric {
 
     for (size_t i = 0; i < static_text.size(); ++i) {
       // Copy static text into caller's arena
+      // NOTE: This copying is REQUIRED for memory safety:
+      // - static_text lives in cached_plan's arena (persistent across renders)
+      // - dynamic_text lives in caller's arena (single render lifetime)
+      // - output must live entirely in caller's arena for consistent lifetime
+      // Without copying, output would have mixed arena ownership causing
+      // use-after-free
       output.push_back(arena_copy_string(static_text[i], arena));
 
       // Add corresponding dynamic text
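A toy sketch of the ownership rule the new comment spells out: everything pushed into the output must reference memory owned by the caller's arena, so plan-owned text is copied into it first. Arena and append_static below are stand-ins, not the real allocator or render code:

#include <cstring>
#include <memory>
#include <string_view>
#include <vector>

// Toy arena: copying a string into it yields a view whose lifetime matches the arena.
struct Arena {
  std::vector<std::unique_ptr<char[]>> blocks;

  std::string_view copy(std::string_view s) {
    auto buf = std::make_unique<char[]>(s.size());
    std::memcpy(buf.get(), s.data(), s.size());
    std::string_view out{buf.get(), s.size()};
    blocks.push_back(std::move(buf));
    return out;                            // valid for as long as this arena is alive
  }
};

// All views in `output` now reference the caller's arena, regardless of where
// the originals lived, so the whole output shares one lifetime.
void append_static(std::string_view plan_owned_text, Arena &arena,
                   std::vector<std::string_view> &output) {
  output.push_back(arena.copy(plan_owned_text));
}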
@@ -1279,9 +1290,15 @@ update_histogram_buckets_simd(std::span<const double> thresholds,
     __m128d cmp_result = _mm_cmp_pd(x_vec, thresholds_vec, _CMP_LE_OQ);
     __m128i cmp_as_int = _mm_castpd_si128(cmp_result);
     __m128i ones = _mm_set1_epi64x(1);
+    // NOTE: _mm_cmp_pd returns 0xFFFFFFFFFFFFFFFF for true, 0x0000000000000000
+    // for false. The AND with ones converts this to 1 or 0 per 64-bit lane
+    // (extracts the low bit).
     __m128i increments = _mm_and_si128(cmp_as_int, ones);
 
     // Load current counts and add increments
+    // NOTE: Using unaligned load/store (_mm_loadu_si128/_mm_storeu_si128)
+    // because uint64_t arrays from the arena allocator are not guaranteed to
+    // be 16-byte aligned
     __m128i current_counts = _mm_loadu_si128((__m128i *)&counts[i]);
     __m128i updated_counts = _mm_add_epi64(current_counts, increments);
 
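For reference, a scalar equivalent of the two-lane SIMD step above, written out to make the mask trick explicit: the all-ones/all-zeros compare mask ANDed with 1 is exactly the 0-or-1 increment added to each bucket. The function name and loop are illustrative, not part of this file:

#include <cstddef>
#include <cstdint>
#include <span>

// Scalar version: one bucket per iteration instead of two per __m128i lane pair.
void update_histogram_buckets_scalar(std::span<const double> thresholds,
                                     std::span<uint64_t> counts, double x) {
  for (std::size_t i = 0; i < thresholds.size(); ++i) {
    // Compare produces an all-ones or all-zeros 64-bit mask, mirroring _mm_cmp_pd.
    uint64_t mask = (x <= thresholds[i]) ? ~uint64_t{0} : uint64_t{0};
    // AND with 1 extracts the low bit, turning the mask into a 0/1 increment.
    counts[i] += mask & 1;
  }
}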