diff --git a/src/metric.cpp b/src/metric.cpp
index a2dab1b..287f957 100644
--- a/src/metric.cpp
+++ b/src/metric.cpp
@@ -306,16 +306,15 @@ struct Gauge::State {
 // Histogram: Thread-local buckets, single writer, mutex protection per thread,
 // per histogram
 struct Histogram::State {
-  ArenaVector<double> thresholds; // Bucket boundaries (sorted, deduplicated,
-                                  // sizes never change)
-  ArenaVector<uint64_t> counts;   // Count per bucket
-  double sum;                     // Sum of observations
-  uint64_t observations;          // Total observation count
+  std::span<const double> thresholds; // Sorted, deduplicated bucket bounds;
+                                      // non-owning view, size never changes
+  std::span<uint64_t> counts;         // Count per bucket; non-owning view
+  double sum;                         // Sum of observations
+  uint64_t observations;              // Total observation count
   std::mutex
       mutex; // Per-thread, per-histogram mutex for consistent reads/writes
 
-  State(ArenaAllocator &arena)
-      : thresholds(&arena), counts(&arena), sum(0.0), observations(0) {}
+  State() : sum(0.0), observations(0) {} // Spans default to empty
   friend struct Metric;
 };
 
@@ -428,20 +427,10 @@ struct Metric {
             // Acquire lock to get consistent snapshot
             std::lock_guard<std::mutex> lock(instance->mutex);
 
-            // Ensure global accumulator exists
+            // Global accumulator is created together with the per-thread
+            // histogram instance, so it must already exist here
             auto &global_state = family->global_accumulated_values[labels_key];
-            if (!global_state) {
-              global_state = get_global_arena().construct<Histogram::State>(
-                  get_global_arena());
-              // Copy thresholds from instance
-              for (size_t i = 0; i < instance->thresholds.size(); ++i) {
-                global_state->thresholds.push_back(instance->thresholds[i]);
-              }
-              // Initialize counts with zeros
-              for (size_t i = 0; i < instance->counts.size(); ++i) {
-                global_state->counts.push_back(0);
-              }
-            }
+            assert(global_state);
 
             // Accumulate bucket counts (mutex already held)
             for (size_t i = 0; i < instance->counts.size(); ++i) {
@@ -560,33 +549,43 @@ struct Metric {
 
     auto &ptr = family->p->per_thread_state[thread_id].instances[key];
     if (!ptr) {
-      ptr = get_thread_local_arena().construct<Histogram::State>(
-          get_thread_local_arena());
+      ptr = get_thread_local_arena().construct<Histogram::State>();
 
       // DESIGN: Prometheus-compatible histogram buckets
       // Use buckets from family configuration
-      for (size_t i = 0; i < family->p->buckets.size(); ++i) {
-        ptr->thresholds.push_back(family->p->buckets[i]);
-      }
+      size_t bucket_count = family->p->buckets.size();
+      double *thresholds_data =
+          get_thread_local_arena().allocate<double>(bucket_count);
+      uint64_t *counts_data =
+          get_thread_local_arena().allocate<uint64_t>(bucket_count);
 
-      // Initialize with zero values, mutex protects all operations
-      for (size_t i = 0; i < ptr->thresholds.size(); ++i) {
-        ptr->counts.push_back(0);
-      }
+      // Copy thresholds and initialize counts
+      std::memcpy(thresholds_data, family->p->buckets.data(),
+                  bucket_count * sizeof(double));
+      std::memset(counts_data, 0, bucket_count * sizeof(uint64_t));
+
+      ptr->thresholds = std::span<const double>(thresholds_data, bucket_count);
+      ptr->counts = std::span<uint64_t>(counts_data, bucket_count);
 
       // Ensure global accumulator exists for this label set
       auto &global_state = family->p->global_accumulated_values[key];
       if (!global_state) {
-        global_state =
-            get_global_arena().construct<Histogram::State>(get_global_arena());
-        // Copy thresholds
-        for (size_t i = 0; i < ptr->thresholds.size(); ++i) {
-          global_state->thresholds.push_back(ptr->thresholds[i]);
-        }
-        // Initialize counts with zeros
-        for (size_t i = 0; i < ptr->thresholds.size(); ++i) {
-          global_state->counts.push_back(0);
-        }
+        global_state = get_global_arena().construct<Histogram::State>();
+
+        // Allocate and copy thresholds, initialize counts
+        double *global_thresholds_data =
+            get_global_arena().allocate<double>(bucket_count);
+        uint64_t *global_counts_data =
+            get_global_arena().allocate<uint64_t>(bucket_count);
+
+        std::memcpy(global_thresholds_data, ptr->thresholds.data(),
+                    bucket_count * sizeof(double));
+        std::memset(global_counts_data, 0, bucket_count * sizeof(uint64_t));
+
+        global_state->thresholds =
+            std::span<const double>(global_thresholds_data, bucket_count);
+        global_state->counts =
+            std::span<uint64_t>(global_counts_data, bucket_count);
       }
     }
     Histogram result;
@@ -804,8 +803,8 @@ Histogram::Histogram() = default;
 // AVX-optimized implementation for high performance
 
 __attribute__((target("avx"))) static void
-update_histogram_buckets_simd(const ArenaVector<double> &thresholds,
-                              ArenaVector<uint64_t> &counts, double x,
+update_histogram_buckets_simd(std::span<const double> thresholds,
+                              std::span<uint64_t> counts, double x,
                               size_t start_idx) {
   const size_t size = thresholds.size();
   size_t i = start_idx;
@@ -1131,17 +1130,17 @@ compute_metric_values(ArenaAllocator &arena,
 
   // Compute histogram values - ITERATION ORDER MUST MATCH FORMAT PHASE
   size_t histogram_family_idx = 0;
-  for (const auto &family_pair : Metric::get_histogram_families()) {
+  for ([[maybe_unused]] const auto &[_name, _family] :
+       Metric::get_histogram_families()) {
     // Use pre-computed data with resolved pointers - no hash lookups!
     const auto &family_data = label_sets.histogram_data[histogram_family_idx++];
 
     for (const auto &data : family_data) {
       size_t bucket_count = data.bucket_count; // Use cached bucket count
 
-      ArenaVector<uint64_t> total_counts(&arena);
-      for (size_t i = 0; i < bucket_count; ++i) {
-        total_counts.push_back(0);
-      }
+      uint64_t *total_counts_data = arena.allocate<uint64_t>(bucket_count);
+      std::memset(total_counts_data, 0, bucket_count * sizeof(uint64_t));
+      std::span<uint64_t> total_counts(total_counts_data, bucket_count);
       double total_sum = 0.0;
       uint64_t total_observations = 0;