diff --git a/benchmarks/bench_metric.cpp b/benchmarks/bench_metric.cpp
index 85c6a1c..d424719 100644
--- a/benchmarks/bench_metric.cpp
+++ b/benchmarks/bench_metric.cpp
@@ -4,7 +4,6 @@
 #include "metric.hpp"
 
 #include
-#include
 #include
 #include
 #include
@@ -17,6 +16,10 @@ struct ContentionEnvironment {
   std::vector<std::thread> background_threads;
   std::atomic<bool> stop_flag{false};
 
+  // Synchronization latches - must be members to avoid use-after-return
+  std::unique_ptr<std::latch> contention_latch;
+  std::unique_ptr<std::latch> render_latch;
+
   // Metrics for testing
   metric::Family counter_family;
   metric::Family gauge_family;
@@ -40,6 +43,7 @@ struct ContentionEnvironment {
 
   void start_background_contention(int num_threads = 4) {
     stop_flag.store(false);
+    contention_latch = std::make_unique<std::latch>(num_threads + 1);
 
     for (int i = 0; i < num_threads; ++i) {
       background_threads.emplace_back([this, i]() {
@@ -53,6 +57,9 @@ struct ContentionEnvironment {
         std::mt19937 rng(i);
         std::uniform_real_distribution<double> dist(0.0, 10.0);
 
+        contention_latch
+            ->arrive_and_wait(); // All background threads start together
+
         while (!stop_flag.load(std::memory_order_relaxed)) {
           // Simulate mixed workload
          bg_counter.inc(1.0);
@@ -61,18 +68,27 @@
         }
       });
     }
+
+    contention_latch
+        ->arrive_and_wait(); // Wait for all background threads to be ready
   }
 
   void start_render_thread() {
+    render_latch = std::make_unique<std::latch>(2);
+
    background_threads.emplace_back([this]() {
       ArenaAllocator arena;
 
+      render_latch->arrive_and_wait(); // Render thread signals it's ready
+
       while (!stop_flag.load(std::memory_order_relaxed)) {
         auto output = metric::render(arena);
         static_cast<void>(output); // Suppress unused variable warning
         arena.reset();
       }
     });
+
+    render_latch->arrive_and_wait(); // Wait for render thread to be ready
   }
 
   void stop_background_threads() {
@@ -190,17 +206,23 @@ int main() {
     std::atomic<bool> stop_shared{false};
     std::vector<std::thread> shared_threads;
+    std::latch start_latch{
+        9}; // Force threads to start concurrently (8 background + 1 benchmark)
 
     for (int i = 0; i < 8; ++i) {
-      shared_threads.emplace_back([&gauge_family, &stop_shared]() {
-        auto gauge = gauge_family.create({{"shared", "true"}});
-        while (!stop_shared.load(std::memory_order_relaxed)) {
-          gauge.inc(1.0);
-        }
-      });
+      shared_threads.emplace_back(
+          [&gauge_family, &stop_shared, &start_latch]() {
+            auto gauge = gauge_family.create({{"shared", "true"}});
+            start_latch.arrive_and_wait(); // All threads start together
+            while (!stop_shared.load(std::memory_order_relaxed)) {
+              gauge.inc(1.0);
+            }
+          });
     }
 
     auto gauge_for_benchmark = gauge_family.create({{"shared", "true"}});
+    start_latch
+        .arrive_and_wait(); // Benchmark thread waits for all background threads
 
     bench.run("gauge.inc() - 8 threads, same labels (contention)", [&]() {
       gauge_for_benchmark.inc(1.0);
       ankerl::nanobench::doNotOptimizeAway(gauge_for_benchmark);
@@ -277,7 +299,10 @@ int main() {
 
     // Background thread updating callback values
     std::atomic<bool> stop_callback{false};
+    std::latch start_latch{2}; // Background thread + benchmark thread
+
     std::thread callback_updater([&]() {
+      start_latch.arrive_and_wait(); // Wait for benchmark to start
       while (!stop_callback.load()) {
         counter_value.fetch_add(1);
         gauge_value.store(gauge_value.load() + 1);
@@ -286,6 +311,7 @@
 
     ArenaAllocator arena;
 
+    start_latch.arrive_and_wait(); // Wait for background thread to be ready
     bench.run("render() - with callback metrics", [&]() {
       auto output = metric::render(arena);
       ankerl::nanobench::doNotOptimizeAway(output);
diff --git a/style.md b/style.md
index f2da1e0..689ab94 100644
--- a/style.md
+++ b/style.md
@@ -568,6 +568,24 @@ TEST_CASE("Server accepts connections") {
 - `std::latch`, `std::barrier`, futures/promises
 - **Force concurrent execution** using `std::latch` to synchronize thread startup
 
+#### Threading Checklist for Tests/Benchmarks
+
+**MANDATORY: Before writing any `std::thread` or `threads.emplace_back()`:**
+
+1. **Count the total threads** - include the main/benchmark thread in the count
+2. **Always assume concurrent execution is needed** - tests and benchmarks require real concurrency
+3. **Add `std::latch start_latch{N}`**, where N = the total number of concurrent threads
+4. **Each thread calls `start_latch.arrive_and_wait()`** before doing work
+5. **The main/benchmark thread calls `start_latch.arrive_and_wait()`** before measurement
+
+**Red flags to catch immediately:**
+- ❌ Creating threads in a loop without a `std::latch`
+- ❌ Background threads starting work immediately
+- ❌ The benchmark measuring before all threads are synchronized
+- ❌ Any use of `sleep_for`, `wait_for`, or timeouts
+
+**Simple rule:** multiple threads = `std::latch` synchronization. No exceptions, even for "simple" background threads.
+
 ```cpp
 // BAD: Race likely over before threads start
 std::atomic<int> counter{0};
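
For reference, here is the pattern the checklist mandates as a minimal, self-contained sketch. The atomic-counter workload, the `kWorkers` constant, and the placeholder measured section are illustrative assumptions, not part of this patch:

```cpp
// Sketch of the startup-latch pattern from the checklist above; the workload
// (an atomic counter) and the name kWorkers are hypothetical examples.
#include <atomic>
#include <latch>
#include <thread>
#include <vector>

int main() {
  constexpr int kWorkers = 4;
  std::atomic<int> counter{0};
  std::atomic<bool> stop{false};

  // Steps 1 and 3: count every concurrent thread (4 workers + 1 main thread)
  // and size the latch to match.
  std::latch start_latch{kWorkers + 1};

  std::vector<std::thread> workers;
  for (int i = 0; i < kWorkers; ++i) {
    workers.emplace_back([&]() {
      start_latch.arrive_and_wait(); // Step 4: no work before the latch opens
      while (!stop.load(std::memory_order_relaxed)) {
        counter.fetch_add(1, std::memory_order_relaxed);
      }
    });
  }

  start_latch.arrive_and_wait(); // Step 5: main arrives before measuring
  // ...the measured section would run here, with all workers guaranteed live...
  stop.store(true, std::memory_order_relaxed);

  for (auto &t : workers) {
    t.join();
  }
}
```

Sizing the latch at `kWorkers + 1` and having the main thread arrive last is what guarantees the measured section never starts before every worker is actually running.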