Fix histogram thread death bug

This commit is contained in:
2025-09-18 12:39:47 -04:00
parent cb66d65479
commit 9f8562e30f
2 changed files with 121 additions and 0 deletions

View File

@@ -627,6 +627,113 @@ TEST_CASE("memory management") {
}
}
// Regression test for the histogram "thread death" bug.
//
// Failure mode described by the original test: observations still sitting in a
// thread's pending buffer were lost when the thread exited, because the
// ThreadInit destructor did not flush pending data into shared state before
// accumulating into global state.
//
// Each iteration:
//   1. resets all metrics and creates a single-bucket histogram (le="1.0");
//   2. runs a background thread calling metric::render() in a tight loop
//      (original intent: keep the global mutex held while observers exit);
//   3. spawns `num_threads` threads that each observe one value and exit;
//   4. renders once more and requires the le="1.0" bucket of the observer
//      series to report exactly `num_threads` observations.
TEST_CASE("histogram pending buffer thread cleanup bug") {
  for (int iterations = 0; iterations < 1000; ++iterations) {
    metric::reset_metrics_for_testing();
    auto hist_family = metric::create_histogram(
        "pending_bug_test", "Test histogram for pending buffer bug",
        {1.0}); // Single bucket for simplicity

    std::atomic<bool> keep_rendering{true};
    constexpr int num_threads = 100;
    std::latch ready{2};

    // Background thread that calls render in a tight loop until told to stop.
    std::thread render_thread([&]() {
      ready.arrive_and_wait();
      Arena arena;
      while (keep_rendering.load(std::memory_order_relaxed)) {
        metric::render(arena);
        arena.reset();
      }
    });

    // Don't spawn observers until the render thread is running.
    ready.arrive_and_wait();

    // Spawn threads that observe once and exit.
    std::vector<std::thread> observer_threads;
    for (int i = 0; i < num_threads; ++i) {
      observer_threads.emplace_back([&hist_family]() {
        auto hist = hist_family.create({{"test", "observer"}});
        hist.observe(0.5); // Goes into the only bucket (le="1.0")
        // The thread exits right here; its observation must still be counted.
      });
    }
    for (auto &t : observer_threads) {
      t.join();
    }

    // Stop the render thread before inspecting the final state.
    keep_rendering.store(false, std::memory_order_relaxed);
    render_thread.join();

    Arena arena;
    auto output = metric::render(arena);

    // The render output alternates between metric name and value in separate
    // string_views, so locate the series first and read the element after it.
    constexpr std::string_view bucket_series =
        "pending_bug_test_bucket{test=\"observer\",le=\"1.0\"}";
    size_t series_index = output.size();
    for (size_t i = 0; i < output.size(); ++i) {
      if (output[i].find(bucket_series) != std::string_view::npos) {
        series_index = i;
        break;
      }
    }

    const bool found_worker_metric = series_index < output.size();
    REQUIRE(found_worker_metric); // The metric should exist

    // A series name without a following value element is malformed output;
    // fail on that explicitly instead of reporting a bogus count of zero.
    const bool found_worker_value = series_index + 1 < output.size();
    REQUIRE(found_worker_value);

    // Strip any trailing line terminator before parsing the number.
    auto value_str = output[series_index + 1];
    while (!value_str.empty() &&
           (value_str.back() == '\n' || value_str.back() == '\r')) {
      value_str.remove_suffix(1);
    }

    uint64_t observed_count = 0;
    try {
      observed_count = std::stoull(std::string(value_str));
    } catch (const std::exception &) {
      // Only build the full dump on the failure path; doing it on every one
      // of the 1000 iterations is wasted work.
      std::ostringstream full_output;
      for (const auto &line : output) {
        full_output << line;
      }
      MESSAGE("Failed to parse value: '" << value_str
                                         << "' from metric line: '"
                                         << output[series_index] << "'");
      MESSAGE("Full output:\n" << full_output.str());
      throw;
    }

    // Expected: num_threads observations (each thread made 1 observation).
    // Fewer means observations stuck in pending were dropped on thread exit.
    CHECK_MESSAGE(
        observed_count == static_cast<uint64_t>(num_threads),
        "Expected "
            << num_threads << " observations but got " << observed_count
            << ". This indicates the pending buffer bug where observations "
            << "stuck in pending are lost when thread dies.");
  }
}
TEST_CASE("render output deterministic order golden test") {
// Clean slate - reset all metrics before this test
metric::reset_metrics_for_testing();