Add latency sim. Not reviewed

Update defaults based on simulation
Make defaults for batch_size_threshold, max_in_flight_requests
2025-08-24 22:32:47 -04:00 · 2025-08-24 21:34:46 -04:00 · 2025-08-24 20:23:17 -04:00 · 2025-08-24 20:02:11 -04:00 · 2025-08-24 19:47:29 -04:00
4 changed files with 2290 additions and 0 deletions
--- a/latency_sim/database_comparison.py
+++ b/latency_sim/database_comparison.py
@@ -0,0 +1,970 @@
+#!/usr/bin/env python3
+"""
+Database Persistence Pattern Comparison
+
+Compares WeaselDB's batched S3 persistence approach against traditional
+database persistence patterns to understand trade-offs in latency,
+throughput, consistency, and operational complexity.
+
+Simulated approaches:
+1. WeaselDB: Batched async S3 persistence with optimistic concurrency
+2. Traditional WAL: Write-ahead log with periodic sync to disk
+3. Synchronous: Immediate disk sync per transaction (like PostgreSQL serializable)
+4. Group Commit: Batched disk writes with configurable group size
+5. Async Replication: Immediate response with async durability
+"""
+
+import heapq
+import random
+import statistics
+from dataclasses import dataclass, field
+from typing import List, Optional, Dict, Tuple, Any
+import numpy as np
+import matplotlib.pyplot as plt
+from collections import defaultdict, deque
+from abc import ABC, abstractmethod
+from enum import Enum
+import time
+
+
+class PersistencePattern(Enum):
+    """Persistence strategies that simulation results can be labelled with.
+
+    Each member's value is the human-readable display name of the pattern.
+    """
+    WEASELDB_S3 = "WeaselDB S3 Batched"
+    TRADITIONAL_WAL = "Traditional WAL"
+    SYNCHRONOUS = "Synchronous Disk"
+    # NOTE(review): no simulator returning the two members below is visible in
+    # this part of the file - confirm they are implemented further down.
+    GROUP_COMMIT = "Group Commit"
+    ASYNC_REPLICATION = "Async Replication"
+
+
+@dataclass
+class Transaction:
+    """Represents a single database transaction/commit fed into a simulator."""
+    txn_id: int  # Identifier assigned by the simulator that generated it
+    arrival_time: float  # Simulated arrival time, in seconds
+    size_bytes: int = 1024  # Payload size in bytes
+    requires_durability: bool = True  # Some txns can be async
+
+
+@dataclass
+class PersistenceMetrics:
+    """Summary of one simulation run of one persistence approach.
+
+    Instances are built by ``PersistenceSimulator._calculate_metrics`` from
+    the transactions that completed during the run, plus a handful of fixed,
+    pattern-specific characteristics supplied by the concrete simulator.
+    """
+    pattern: PersistencePattern
+    total_transactions: int  # Transactions generated (arrivals)
+    completed_transactions: int  # Transactions that finished during the run
+
+    # Latency metrics (milliseconds), measured arrival -> completion
+    min_latency: float
+    mean_latency: float
+    median_latency: float
+    p95_latency: float
+    p99_latency: float
+    max_latency: float
+
+    # Throughput metrics
+    avg_throughput_tps: float  # Completed transactions / simulated duration
+    peak_throughput_tps: float  # Most completions within any whole second
+
+    # Resource metrics
+    # Both averages are plain means over whatever samples the simulator logged.
+    avg_disk_iops: float
+    # NOTE(review): WeaselDBSimulator logs MB per uploaded batch into this
+    # series, not Mbit/s - confirm the intended unit.
+    avg_network_mbps: float
+    storage_efficiency: float  # GB stored / GB logical data
+
+    # Consistency metrics
+    durability_guarantee: str
+    consistency_model: str
+    recovery_time_estimate: float  # NOTE(review): unit not stated; presumably seconds
+
+    # Operational metrics
+    operational_complexity: int  # 1-10 scale
+    infrastructure_cost: float  # Relative cost per transaction
+
+
+class PersistenceSimulator(ABC):
+    """Abstract base class for persistence pattern simulators.
+
+    Implements a small discrete-event simulation.  Poisson transaction
+    arrivals are generated up front onto a heap-based event queue and events
+    are then dispatched in time order.  All simulated times are in seconds.
+
+    Subclasses implement ``process_transaction`` (and usually override
+    ``_handle_custom_event``) and append one dict per finished transaction to
+    ``self.completed_transactions``; each dict must at least carry the keys
+    ``'latency_ms'`` and ``'completion_time'``.
+    """
+
+    def __init__(self,
+                 simulation_duration: float = 60.0,
+                 arrival_rate_per_sec: float = 1000.0):
+        """Set up an empty simulation.
+
+        Args:
+            simulation_duration: Simulated horizon in seconds.
+            arrival_rate_per_sec: Mean Poisson arrival rate (transactions/s).
+        """
+        self.simulation_duration = simulation_duration
+        self.arrival_rate_per_sec = arrival_rate_per_sec
+
+        # Simulation state
+        self.current_time = 0.0
+        self.event_queue = []
+        # Monotonic counter stored in every heap entry so that two events
+        # with the same timestamp never fall through to comparing payloads
+        # (dicts / Transaction objects are not orderable -> TypeError).
+        self._event_seq = 0
+        self.completed_transactions = []
+        self.next_txn_id = 0
+
+        # Metrics tracking
+        self.disk_iops = []
+        self.network_usage = []
+        self.throughput_samples = []
+
+        # Random number generator (fixed seed keeps runs reproducible)
+        self.rng = np.random.RandomState(42)
+
+    def generate_transaction(self, arrival_time: float) -> Transaction:
+        """Generate a transaction with a randomly drawn size.
+
+        Size classes: 80% small (100B - 2KB), 15% medium (2KB - 20KB) and
+        5% large (20KB - 100KB).
+
+        Args:
+            arrival_time: Simulated arrival time in seconds.
+
+        Returns:
+            A ``Transaction`` carrying the current ``next_txn_id``; the
+            caller is responsible for advancing that counter.
+        """
+        # A single draw selects the size class.  Drawing again for the second
+        # branch would compare a fresh uniform sample against the cumulative
+        # 0.95 cut-off and skew the mix to roughly 80/19/1.
+        size_class = self.rng.random()
+        if size_class < 0.8:  # 80% small transactions
+            size = self.rng.randint(100, 2048)  # 100B - 2KB
+        elif size_class < 0.95:  # 15% medium
+            size = self.rng.randint(2048, 20480)  # 2KB - 20KB
+        else:  # 5% large transactions
+            size = self.rng.randint(20480, 102400)  # 20KB - 100KB
+
+        return Transaction(
+            txn_id=self.next_txn_id,
+            arrival_time=arrival_time,
+            size_bytes=size,
+            requires_durability=True
+        )
+
+    @abstractmethod
+    def process_transaction(self, txn: Transaction) -> None:
+        """Process a transaction according to the persistence pattern"""
+        pass
+
+    @abstractmethod
+    def get_pattern_name(self) -> PersistencePattern:
+        """Return the persistence pattern this simulator implements"""
+        pass
+
+    def run_simulation(self) -> PersistenceMetrics:
+        """Run the simulation and return metrics.
+
+        Only events scheduled strictly before ``simulation_duration`` are
+        processed; work still outstanding at the horizon is not counted.
+
+        Raises:
+            ValueError: If no transaction completed within the horizon.
+        """
+        # Generate arrival events
+        self._generate_arrivals()
+
+        # Peek at the head of the queue before popping so that no event at or
+        # beyond the horizon is processed.
+        while self.event_queue and self.event_queue[0][0] < self.simulation_duration:
+            # Entries are (time, seq, type, data); the starred slot also
+            # tolerates legacy 3-tuples pushed straight onto the heap.
+            event_time, *_, event_type, data = heapq.heappop(self.event_queue)
+            self.current_time = event_time
+
+            if event_type == 'transaction_arrival':
+                self.process_transaction(data)
+            elif event_type == 'custom':
+                self._handle_custom_event(data)
+
+        return self._calculate_metrics()
+
+    def _generate_arrivals(self):
+        """Generate Poisson arrival events for the whole simulated horizon"""
+        arrival_time = 0.0
+        while arrival_time < self.simulation_duration:
+            # Exponential inter-arrival gaps give a Poisson arrival process
+            arrival_time += self.rng.exponential(1.0 / self.arrival_rate_per_sec)
+
+            if arrival_time >= self.simulation_duration:
+                break
+
+            txn = self.generate_transaction(arrival_time)
+            self.next_txn_id += 1
+
+            self.schedule_event(arrival_time, 'transaction_arrival', txn)
+
+    def _handle_custom_event(self, data):
+        """Handle custom events - override in subclasses"""
+        pass
+
+    def schedule_event(self, time: float, event_type: str, data: Any):
+        """Schedule an event.
+
+        Args:
+            time: Simulated time (seconds) at which the event fires.
+            event_type: ``'transaction_arrival'`` or ``'custom'``.
+            data: Payload handed to the matching handler.
+        """
+        heapq.heappush(self.event_queue, (time, self._event_seq, event_type, data))
+        self._event_seq += 1
+
+    def _calculate_metrics(self) -> PersistenceMetrics:
+        """Calculate performance metrics from completed transactions.
+
+        Raises:
+            ValueError: If ``completed_transactions`` is empty.
+        """
+        if not self.completed_transactions:
+            raise ValueError("No transactions completed")
+
+        latencies = [txn['latency_ms'] for txn in self.completed_transactions]
+
+        return PersistenceMetrics(
+            pattern=self.get_pattern_name(),
+            total_transactions=self.next_txn_id,
+            completed_transactions=len(self.completed_transactions),
+
+            # Latency metrics
+            min_latency=min(latencies),
+            mean_latency=statistics.mean(latencies),
+            median_latency=statistics.median(latencies),
+            p95_latency=np.percentile(latencies, 95),
+            p99_latency=np.percentile(latencies, 99),
+            max_latency=max(latencies),
+
+            # Throughput metrics
+            avg_throughput_tps=len(self.completed_transactions) / self.simulation_duration,
+            peak_throughput_tps=self._calculate_peak_throughput(),
+
+            # Resource metrics - samples are recorded by subclasses
+            avg_disk_iops=statistics.mean(self.disk_iops) if self.disk_iops else 0,
+            avg_network_mbps=statistics.mean(self.network_usage) if self.network_usage else 0,
+            storage_efficiency=self._calculate_storage_efficiency(),
+
+            # Pattern-specific characteristics
+            durability_guarantee=self._get_durability_guarantee(),
+            consistency_model=self._get_consistency_model(),
+            recovery_time_estimate=self._get_recovery_time(),
+            operational_complexity=self._get_operational_complexity(),
+            infrastructure_cost=self._get_infrastructure_cost()
+        )
+
+    def _calculate_peak_throughput(self) -> float:
+        """Return the largest number of completions in any whole second"""
+        if not self.completed_transactions:
+            return 0.0
+
+        # Bucket completions by the integer second in which they finished
+        throughput_by_second = defaultdict(int)
+        for txn in self.completed_transactions:
+            throughput_by_second[int(txn['completion_time'])] += 1
+
+        return float(max(throughput_by_second.values()))
+
+    # Abstract methods for pattern-specific characteristics
+    @abstractmethod
+    def _calculate_storage_efficiency(self) -> float:
+        """Return stored bytes relative to logical bytes"""
+        pass
+
+    @abstractmethod
+    def _get_durability_guarantee(self) -> str:
+        """Return a short description of the durability guarantee"""
+        pass
+
+    @abstractmethod
+    def _get_consistency_model(self) -> str:
+        """Return a short description of the consistency model"""
+        pass
+
+    @abstractmethod
+    def _get_recovery_time(self) -> float:
+        """Return the estimated recovery time"""
+        pass
+
+    @abstractmethod
+    def _get_operational_complexity(self) -> int:
+        """Return the operational complexity on a 1-10 scale"""
+        pass
+
+    @abstractmethod
+    def _get_infrastructure_cost(self) -> float:
+        """Return the relative infrastructure cost per transaction"""
+        pass
+
+
+class WeaselDBSimulator(PersistenceSimulator):
+    """WeaselDB's batched S3 persistence simulation.
+
+    Transactions accumulate in an open batch.  The batch is uploaded once its
+    size reaches ``batch_size_threshold`` bytes or ``batch_timeout_ms`` has
+    elapsed since its first transaction, provided fewer than
+    ``max_in_flight`` uploads are outstanding.  A transaction completes when
+    the upload of its batch completes.
+    """
+
+    def __init__(self,
+                 batch_timeout_ms: float = 1.0,
+                 batch_size_threshold: int = 800000,
+                 max_in_flight: int = 50,
+                 **kwargs):
+        """
+        Args:
+            batch_timeout_ms: Maximum age of an open batch, in milliseconds.
+            batch_size_threshold: Batch size in bytes that triggers an upload.
+            max_in_flight: Maximum number of concurrent uploads.
+            **kwargs: Forwarded to ``PersistenceSimulator``.
+        """
+        super().__init__(**kwargs)
+
+        self.batch_timeout_ms = batch_timeout_ms
+        self.batch_size_threshold = batch_size_threshold
+        self.max_in_flight = max_in_flight
+
+        # State
+        self.current_batch = []
+        self.batch_start_time = None
+        self.in_flight_batches = {}
+        self.next_batch_id = 0
+        # Set once the open batch's timer has fired; stays set while the
+        # batch is held back by the in-flight limit.
+        self._batch_timed_out = False
+
+        # S3 characteristics
+        self.s3_base_latency_ms = 60.0  # From our simulation
+        self.s3_size_penalty_per_mb = 20.0
+
+    def get_pattern_name(self) -> PersistencePattern:
+        return PersistencePattern.WEASELDB_S3
+
+    def process_transaction(self, txn: Transaction):
+        """Add a transaction to the open batch and send the batch if due"""
+        if not self.current_batch:
+            self.batch_start_time = self.current_time
+            self._batch_timed_out = False
+            # Arm a timer for this batch.  Without it the timeout would only
+            # be noticed when the *next* transaction arrives, so at low load
+            # a batch waits a whole inter-arrival gap and the final batch of
+            # a run is never sent.  The open batch will be sent under
+            # ``next_batch_id``, which lets the handler spot stale timers.
+            self.schedule_event(
+                self.current_time + self.batch_timeout_ms / 1000.0,
+                'custom',
+                {'type': 'batch_timeout', 'batch_id': self.next_batch_id}
+            )
+
+        self.current_batch.append(txn)
+        self._maybe_send_batch()
+
+    def _maybe_send_batch(self):
+        """Send the open batch if a trigger fired and an upload slot is free"""
+        if self._should_send_batch() and len(self.in_flight_batches) < self.max_in_flight:
+            self._send_current_batch()
+
+    def _should_send_batch(self) -> bool:
+        """Check batch triggers (timer fired, size reached, or age reached)"""
+        if not self.current_batch:
+            return False
+
+        # Timer already fired for this batch
+        if self._batch_timed_out:
+            return True
+
+        # Size trigger
+        batch_size = sum(txn.size_bytes for txn in self.current_batch)
+        if batch_size >= self.batch_size_threshold:
+            return True
+
+        # Time trigger ("is not None": a start time of 0.0 is still a start)
+        if self.batch_start_time is not None:
+            age = self.current_time - self.batch_start_time
+            if age >= self.batch_timeout_ms / 1000.0:
+                return True
+
+        return False
+
+    def _send_current_batch(self):
+        """Send current batch to S3 and schedule its completion"""
+        if not self.current_batch:
+            return
+
+        batch_id = self.next_batch_id
+        self.next_batch_id += 1
+
+        batch_size = sum(txn.size_bytes for txn in self.current_batch)
+
+        # Sample S3 latency (ms) and convert to seconds
+        s3_latency = self._sample_s3_latency(batch_size) / 1000.0
+        completion_time = self.current_time + s3_latency
+
+        # Track batch
+        self.in_flight_batches[batch_id] = {
+            'transactions': self.current_batch.copy(),
+            'sent_time': self.current_time,
+            'completion_time': completion_time,
+            'size_bytes': batch_size
+        }
+
+        # Schedule completion
+        self.schedule_event(completion_time, 'custom', {'type': 'batch_complete', 'batch_id': batch_id})
+
+        # Track network usage
+        self.network_usage.append(batch_size / (1024 * 1024))  # MB
+
+        # Clear batch
+        self.current_batch.clear()
+        self.batch_start_time = None
+        self._batch_timed_out = False
+
+    def _sample_s3_latency(self, batch_size_bytes: int) -> float:
+        """Sample S3 latency in milliseconds, scaled by batch size"""
+        size_penalty = (batch_size_bytes / (1024 * 1024)) * self.s3_size_penalty_per_mb
+        variable = self.rng.gamma(2.0, 15.0)  # Variable component, mean 30ms
+        return 30.0 + variable + size_penalty  # 30ms RTT + variable + size
+
+    def _handle_custom_event(self, data):
+        """Handle batch timer and batch completion events"""
+        event_kind = data['type']
+
+        if event_kind == 'batch_timeout':
+            # Ignore timers of batches that were already sent by another trigger
+            if data['batch_id'] == self.next_batch_id and self.current_batch:
+                self._batch_timed_out = True
+                self._maybe_send_batch()
+
+        elif event_kind == 'batch_complete':
+            batch = self.in_flight_batches.pop(data['batch_id'], None)
+            if batch is None:
+                return
+
+            # Mark transactions complete
+            for txn in batch['transactions']:
+                latency_ms = (self.current_time - txn.arrival_time) * 1000
+                self.completed_transactions.append({
+                    'txn_id': txn.txn_id,
+                    'arrival_time': txn.arrival_time,
+                    'completion_time': self.current_time,
+                    'latency_ms': latency_ms,
+                    'size_bytes': txn.size_bytes
+                })
+
+            # An upload slot just freed up; release a batch that was held
+            # back by the in-flight limit.
+            self._maybe_send_batch()
+
+    def _calculate_storage_efficiency(self) -> float:
+        return 1.0  # S3 has no storage overhead
+
+    def _get_durability_guarantee(self) -> str:
+        return "Eventually durable (S3 11 9's)"
+
+    def _get_consistency_model(self) -> str:
+        return "Optimistic concurrency, eventual consistency"
+
+    def _get_recovery_time(self) -> float:
+        return 0.1  # Fast recovery, just reconnect to S3
+
+    def _get_operational_complexity(self) -> int:
+        return 3  # Moderate - S3 managed, but need batching logic
+
+    def _get_infrastructure_cost(self) -> float:
+        return 1.0  # Baseline cost
+
+
+class TraditionalWALSimulator(PersistenceSimulator):
+    """Traditional Write-Ahead Log with periodic sync.
+
+    Arriving transactions are appended to an in-memory WAL buffer.  A sync
+    event fires every ``wal_sync_interval_ms``; it flushes the whole buffer
+    and the flushed transactions complete once the sampled sync latency has
+    elapsed.
+    """
+
+    def __init__(self,
+                 wal_sync_interval_ms: float = 10.0,
+                 checkpoint_interval_sec: float = 30.0,
+                 **kwargs):
+        """
+        Args:
+            wal_sync_interval_ms: Period between WAL syncs, in milliseconds.
+            checkpoint_interval_sec: Stored on the instance; not otherwise
+                used by this class.
+            **kwargs: Forwarded to ``PersistenceSimulator``.
+        """
+        super().__init__(**kwargs)
+
+        self.wal_sync_interval_ms = wal_sync_interval_ms
+        self.checkpoint_interval_sec = checkpoint_interval_sec
+
+        # WAL state
+        self.wal_buffer = []
+        self.last_sync_time = 0.0
+        self.pending_transactions = {}  # Waiting for sync
+
+        # EBS characteristics
+        self.disk_latency_ms = 1.0  # EBS base latency
+        self.disk_iops_limit = 10000  # Typical SSD
+
+        # Schedule periodic syncs
+        self._schedule_periodic_syncs()
+
+    def get_pattern_name(self) -> PersistencePattern:
+        return PersistencePattern.TRADITIONAL_WAL
+
+    def process_transaction(self, txn: Transaction):
+        """Write to WAL buffer, wait for sync"""
+        # Write to WAL buffer (immediate)
+        self.wal_buffer.append(txn)
+        self.pending_transactions[txn.txn_id] = txn
+
+        # Track disk IOPS (write to WAL)
+        self.disk_iops.append(1.0)
+
+    def _schedule_periodic_syncs(self):
+        """Schedule periodic WAL sync events for the whole simulation"""
+        interval = self.wal_sync_interval_ms / 1000.0
+        sync_time = interval
+        while sync_time < self.simulation_duration:
+            self.schedule_event(sync_time, 'custom', {'type': 'wal_sync'})
+            sync_time += interval
+
+    def _perform_wal_sync(self):
+        """Sync WAL buffer to disk and schedule the completion event"""
+        if not self.wal_buffer:
+            return
+
+        # Calculate sync latency based on buffer size
+        buffer_size = sum(txn.size_bytes for txn in self.wal_buffer)
+        sync_latency = self._calculate_disk_sync_latency(buffer_size)
+
+        completion_time = self.current_time + sync_latency / 1000.0
+
+        # Snapshot the buffer: it is cleared below, before the event fires
+        syncing_txns = list(self.wal_buffer)
+        self.schedule_event(completion_time, 'custom', {
+            'type': 'sync_complete',
+            'transactions': syncing_txns
+        })
+
+        # Track IOPS for sync operation
+        self.disk_iops.append(len(syncing_txns))
+
+        # Clear buffer
+        self.wal_buffer.clear()
+        self.last_sync_time = self.current_time
+
+    def _calculate_disk_sync_latency(self, size_bytes: int) -> float:
+        """Return the latency in milliseconds of syncing ``size_bytes`` of WAL.
+
+        Models page-cache write + transfer + file fsync.  No directory fsync
+        is included (see the note at the end of the method).
+        """
+        # Data write to page cache
+        write_latency = 0.1  # Fast write to page cache
+
+        # EBS sequential write throughput
+        throughput_mbps = 1000.0  # EBS gp3 throughput
+        size_mb = size_bytes / (1024 * 1024)
+        transfer_latency = size_mb * (1000.0 / throughput_mbps)
+
+        # WAL file fsync latency on EBS
+        # fdatasync on EBS with network replication
+        file_fsync_base = 0.8  # Higher base latency on EBS
+        file_fsync_variable = self.rng.gamma(2.2, 0.4)  # More variable due to network
+
+        # Size penalty for large WAL syncs
+        size_penalty = min(size_mb * 0.05, 1.0)  # Smaller penalty than batched writes
+
+        file_fsync_latency = file_fsync_base + file_fsync_variable + size_penalty
+
+        # WAL typically appends to existing files, so no directory fsync needed
+        # Directory fsync only required when creating new WAL segments
+        return write_latency + transfer_latency + file_fsync_latency
+
+    def _handle_custom_event(self, data):
+        """Handle periodic sync triggers and sync completions"""
+        if data['type'] == 'wal_sync':
+            self._perform_wal_sync()
+        elif data['type'] == 'sync_complete':
+            # Mark transactions as durable
+            for txn in data['transactions']:
+                if txn.txn_id in self.pending_transactions:
+                    del self.pending_transactions[txn.txn_id]
+
+                    latency_ms = (self.current_time - txn.arrival_time) * 1000
+                    self.completed_transactions.append({
+                        'txn_id': txn.txn_id,
+                        'arrival_time': txn.arrival_time,
+                        'completion_time': self.current_time,
+                        'latency_ms': latency_ms,
+                        'size_bytes': txn.size_bytes
+                    })
+
+    def _calculate_storage_efficiency(self) -> float:
+        return 2.0  # WAL + main storage
+
+    def _get_durability_guarantee(self) -> str:
+        return "ACID durable after WAL sync"
+
+    def _get_consistency_model(self) -> str:
+        return "Strict ACID consistency"
+
+    def _get_recovery_time(self) -> float:
+        return 30.0  # WAL replay time
+
+    def _get_operational_complexity(self) -> int:
+        return 7  # Complex - WAL management, checkpoints, recovery
+
+    def _get_infrastructure_cost(self) -> float:
+        return 1.5  # Higher due to disk I/O overhead
+
+
+class SynchronousSimulator(PersistenceSimulator):
+    """Synchronous disk persistence (like PostgreSQL with synchronous_commit=on)
+
+    Every transaction gets its own disk write plus fsync and completes as
+    soon as that individual write finishes.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.disk_latency_ms = 1.0  # EBS base latency
+
+    def get_pattern_name(self) -> PersistencePattern:
+        return PersistencePattern.SYNCHRONOUS
+
+    def process_transaction(self, txn: Transaction):
+        """Start an individual disk sync for the transaction right away"""
+        # Sampled latency is in milliseconds; simulation time is in seconds
+        write_seconds = self._calculate_disk_latency(txn.size_bytes) / 1000.0
+        done_at = self.current_time + write_seconds
+
+        completion_event = {
+            'type': 'disk_write_complete',
+            'transaction': txn
+        }
+        self.schedule_event(done_at, 'custom', completion_event)
+
+        # One disk operation per transaction
+        self.disk_iops.append(1.0)
+
+    def _calculate_disk_latency(self, size_bytes: int) -> float:
+        """Return the write + fsync latency of one transaction, in milliseconds"""
+        # Landing the data in the page cache is cheap
+        write_latency = 0.1
+
+        # Each transaction pays for its own fsync on EBS - expensive on
+        # network storage - with a randomly sampled variable part.
+        fsync_base = 1.5
+        fsync_variable = self.rng.gamma(2.3, 0.5)
+
+        # Writes below 4KB are less efficient than batched writes
+        penalty = 0.5 if size_bytes < 4096 else 0.0
+
+        fsync_latency = fsync_base + fsync_variable + penalty
+
+        # Commits go to existing database files, so no directory fsync is
+        # modelled; that is only needed when new files/tablespaces are created.
+        return write_latency + fsync_latency
+
+    def _handle_custom_event(self, data):
+        """Record the transaction as complete when its disk write finishes"""
+        if data['type'] != 'disk_write_complete':
+            return
+
+        finished = data['transaction']
+        now = self.current_time
+
+        self.completed_transactions.append({
+            'txn_id': finished.txn_id,
+            'arrival_time': finished.arrival_time,
+            'completion_time': now,
+            'latency_ms': (now - finished.arrival_time) * 1000,
+            'size_bytes': finished.size_bytes
+        })
+
+    def _calculate_storage_efficiency(self) -> float:
+        # Some overhead for metadata
+        return 1.5
+
+    def _get_durability_guarantee(self) -> str:
+        return "Immediate ACID durability"
+
+    def _get_consistency_model(self) -> str:
+        return "Strict ACID with immediate consistency"
+
+    def _get_recovery_time(self) -> float:
+        # Fast recovery, data already on disk
+        return 5.0
+
+    def _get_operational_complexity(self) -> int:
+        # Moderate - standard database operations
+        return 5
+
+    def _get_infrastructure_cost(self) -> float:
+        # High due to disk I/O per transaction
+        return 2.0
+
+
+class WeaselDBDiskSimulator(PersistenceSimulator):
+    """WeaselDB's batched disk persistence simulation.
+
+    Uses the same batching rules as the S3 variant: the open batch is written
+    once its size reaches ``batch_size_threshold`` bytes or
+    ``batch_timeout_ms`` has elapsed since its first transaction, provided
+    fewer than ``max_in_flight`` writes are outstanding.  Write latency is
+    sampled from an EBS-style disk model instead of S3.
+    """
+
+    def __init__(self,
+                 batch_timeout_ms: float = 1.0,
+                 batch_size_threshold: int = 800000,
+                 max_in_flight: int = 50,
+                 **kwargs):
+        """
+        Args:
+            batch_timeout_ms: Maximum age of an open batch, in milliseconds.
+            batch_size_threshold: Batch size in bytes that triggers a write.
+            max_in_flight: Maximum number of concurrent batch writes.
+            **kwargs: Forwarded to ``PersistenceSimulator``.
+        """
+        super().__init__(**kwargs)
+
+        self.batch_timeout_ms = batch_timeout_ms
+        self.batch_size_threshold = batch_size_threshold
+        self.max_in_flight = max_in_flight
+
+        # State
+        self.current_batch = []
+        self.batch_start_time = None
+        self.in_flight_batches = {}
+        self.next_batch_id = 0
+        # Set once the open batch's timer has fired; stays set while the
+        # batch is held back by the in-flight limit.
+        self._batch_timed_out = False
+
+        # EBS characteristics (gp3 or io2 volumes)
+        self.disk_base_latency_ms = 0.5  # EBS has higher base latency than local NVMe
+        self.disk_throughput_mbps = 1000.0  # EBS gp3 max throughput
+
+    def get_pattern_name(self) -> PersistencePattern:
+        return PersistencePattern.WEASELDB_S3  # Reuse enum, but it's actually disk
+
+    def process_transaction(self, txn: Transaction):
+        """Add a transaction to the open batch and write the batch if due"""
+        if not self.current_batch:
+            self.batch_start_time = self.current_time
+            self._batch_timed_out = False
+            # Arm a timer for this batch.  Without it the timeout would only
+            # be noticed when the *next* transaction arrives, so at low load
+            # a batch waits a whole inter-arrival gap and the final batch of
+            # a run is never written.  The open batch will be sent under
+            # ``next_batch_id``, which lets the handler spot stale timers.
+            self.schedule_event(
+                self.current_time + self.batch_timeout_ms / 1000.0,
+                'custom',
+                {'type': 'batch_timeout', 'batch_id': self.next_batch_id}
+            )
+
+        self.current_batch.append(txn)
+        self._maybe_send_batch()
+
+    def _maybe_send_batch(self):
+        """Write the open batch if a trigger fired and a write slot is free"""
+        if self._should_send_batch() and len(self.in_flight_batches) < self.max_in_flight:
+            self._send_current_batch()
+
+    def _should_send_batch(self) -> bool:
+        """Check batch triggers (timer fired, size reached, or age reached)"""
+        if not self.current_batch:
+            return False
+
+        # Timer already fired for this batch
+        if self._batch_timed_out:
+            return True
+
+        # Size trigger
+        batch_size = sum(txn.size_bytes for txn in self.current_batch)
+        if batch_size >= self.batch_size_threshold:
+            return True
+
+        # Time trigger ("is not None": a start time of 0.0 is still a start)
+        if self.batch_start_time is not None:
+            age = self.current_time - self.batch_start_time
+            if age >= self.batch_timeout_ms / 1000.0:
+                return True
+
+        return False
+
+    def _send_current_batch(self):
+        """Send current batch to disk and schedule its completion"""
+        if not self.current_batch:
+            return
+
+        batch_id = self.next_batch_id
+        self.next_batch_id += 1
+
+        batch_size = sum(txn.size_bytes for txn in self.current_batch)
+
+        # Sample disk write latency (ms) and convert to seconds
+        disk_latency = self._sample_disk_latency(batch_size) / 1000.0
+        completion_time = self.current_time + disk_latency
+
+        # Track batch
+        self.in_flight_batches[batch_id] = {
+            'transactions': self.current_batch.copy(),
+            'sent_time': self.current_time,
+            'completion_time': completion_time,
+            'size_bytes': batch_size
+        }
+
+        # Schedule completion
+        self.schedule_event(completion_time, 'custom', {'type': 'batch_complete', 'batch_id': batch_id})
+
+        # Track disk IOPS (one write operation per batch)
+        self.disk_iops.append(1.0)
+
+        # Clear batch
+        self.current_batch.clear()
+        self.batch_start_time = None
+        self._batch_timed_out = False
+
+    def _sample_disk_latency(self, batch_size_bytes: int) -> float:
+        """Sample the latency in milliseconds of durably writing one batch.
+
+        Sums the base write latency, the throughput-limited transfer time, a
+        file fsync and a directory fsync.
+        """
+        # Base latency for the write command (data goes to page cache)
+        write_latency = self.disk_base_latency_ms
+
+        # Throughput-based latency for data transfer to page cache
+        size_mb = batch_size_bytes / (1024 * 1024)
+        transfer_latency = (size_mb / self.disk_throughput_mbps) * 1000.0  # Convert to ms
+
+        # FSYNC latency for EBS - forces write to replicated storage
+        # EBS has higher fsync latency due to network replication
+        file_fsync_base = 1.0  # Higher base fsync latency for EBS
+        file_fsync_variable = self.rng.gamma(2.5, 0.6)  # More variable due to network
+
+        # Size-dependent fsync penalty for large batches
+        size_penalty = min(size_mb * 0.1, 2.0)  # Max 2ms penalty
+
+        file_fsync_latency = file_fsync_base + file_fsync_variable + size_penalty
+
+        # Directory fsync latency - required for WeaselDB batch file creation on EBS
+        # EBS directory metadata sync is also network-replicated
+        dir_fsync_base = 0.5  # Higher directory metadata sync latency on EBS
+        dir_fsync_variable = self.rng.gamma(1.8, 0.3)  # More variable due to network
+
+        dir_fsync_latency = dir_fsync_base + dir_fsync_variable
+
+        # Total latency: write + file fsync + directory fsync
+        # WeaselDB needs directory fsync for batch file durability guarantees
+        return write_latency + transfer_latency + file_fsync_latency + dir_fsync_latency
+
+    def _handle_custom_event(self, data):
+        """Handle batch timer and batch completion events"""
+        event_kind = data['type']
+
+        if event_kind == 'batch_timeout':
+            # Ignore timers of batches that were already sent by another trigger
+            if data['batch_id'] == self.next_batch_id and self.current_batch:
+                self._batch_timed_out = True
+                self._maybe_send_batch()
+
+        elif event_kind == 'batch_complete':
+            batch = self.in_flight_batches.pop(data['batch_id'], None)
+            if batch is None:
+                return
+
+            # Mark transactions complete
+            for txn in batch['transactions']:
+                latency_ms = (self.current_time - txn.arrival_time) * 1000
+                self.completed_transactions.append({
+                    'txn_id': txn.txn_id,
+                    'arrival_time': txn.arrival_time,
+                    'completion_time': self.current_time,
+                    'latency_ms': latency_ms,
+                    'size_bytes': txn.size_bytes
+                })
+
+            # A write slot just freed up; release a batch that was held back
+            # by the in-flight limit.
+            self._maybe_send_batch()
+
+    def _calculate_storage_efficiency(self) -> float:
+        return 1.2  # Some overhead for batching metadata
+
+    def _get_durability_guarantee(self) -> str:
+        return "ACID durable after EBS replication"
+
+    def _get_consistency_model(self) -> str:
+        return "Strict serializable with optimistic concurrency"
+
+    def _get_recovery_time(self) -> float:
+        return 10.0  # EBS volume attachment + recovery
+
+    def _get_operational_complexity(self) -> int:
+        return 4  # Moderate - batching logic + disk management
+
+    def _get_infrastructure_cost(self) -> float:
+        return 1.4  # Higher than S3 due to EBS provisioned storage + replication
+
+
+class DatabaseComparisonFramework:
+    """Framework for comparing different database persistence patterns"""
+
+    def __init__(self, simulation_duration: float = 30.0):
+        # Horizon handed to every simulator this framework creates
+        # (forwarded as their ``simulation_duration`` argument).
+        self.simulation_duration = simulation_duration
+
+    def run_comparison(self,
+                      arrival_rates: Optional[List[float]] = None) -> Dict[str, List[PersistenceMetrics]]:
+        """Run every simulator at each arrival rate and collect its metrics.
+
+        Args:
+            arrival_rates: Arrival rates (transactions/s) to test.  Defaults
+                to ``[100, 500, 1000, 2000]``; ``None`` is the sentinel so
+                that no mutable list is shared between calls.
+
+        Returns:
+            Mapping from pattern label to one ``PersistenceMetrics`` per
+            arrival rate, in the order the rates were given.
+        """
+        if arrival_rates is None:
+            arrival_rates = [100, 500, 1000, 2000]
+
+        # Batching configuration shared by both WeaselDB variants
+        weasel_config = {
+            'batch_timeout_ms': 1.0,
+            'batch_size_threshold': 800000,
+            'max_in_flight': 50,
+        }
+
+        # (results key, label used in progress output, simulator class, extra kwargs)
+        contenders = [
+            ('WeaselDB S3', 'WeaselDB S3', WeaselDBSimulator, weasel_config),
+            ('WeaselDB EBS', 'WeaselDB EBS', WeaselDBDiskSimulator, weasel_config),
+            ('Traditional WAL', 'WAL', TraditionalWALSimulator, {'wal_sync_interval_ms': 10.0}),
+            ('Synchronous', 'Synchronous', SynchronousSimulator, {}),
+        ]
+
+        results = defaultdict(list)
+
+        for rate in arrival_rates:
+            print(f"\nTesting at {rate} TPS...")
+
+            for result_key, label, simulator_cls, extra_kwargs in contenders:
+                simulator = simulator_cls(
+                    simulation_duration=self.simulation_duration,
+                    arrival_rate_per_sec=rate,
+                    **extra_kwargs
+                )
+                metrics = simulator.run_simulation()
+                results[result_key].append(metrics)
+                print(f"  {label} P95: {metrics.p95_latency:.1f}ms")
+
+        return dict(results)
+
+    def print_comparison_report(self, results: Dict[str, List[PersistenceMetrics]],
+                                arrival_rates: Optional[List[float]] = None):
+        """Print comprehensive comparison report.
+
+        Args:
+            results: Mapping from pattern label to one ``PersistenceMetrics``
+                per tested arrival rate, as returned by ``run_comparison``.
+            arrival_rates: The rates the columns of ``results`` correspond
+                to, in order.  Defaults to ``[100, 500, 1000, 2000]``.  Only
+                as many rates as there are result columns are used, so a run
+                with fewer rates no longer raises ``IndexError``.
+        """
+        if arrival_rates is None:
+            arrival_rates = [100, 500, 1000, 2000]
+
+        # Never label more columns than there are results for
+        column_count = max((len(metrics_list) for metrics_list in results.values()), default=0)
+        rates = list(arrival_rates)[:column_count]
+
+        # A header cell is f"{rate:>8} TPS" = 12 characters; value cells use
+        # the same width so the columns line up under their headers.
+        cell_width = 12
+        rule = "-" * (20 + cell_width * len(rates))
+
+        def print_rate_table(title, value_of):
+            """Print one pattern-by-rate table of ``value_of(metrics)``"""
+            print(f"\n{title}")
+            print(f"{'Pattern':<20}", end="")
+            for rate in rates:
+                print(f"{rate:>8} TPS", end="")
+            print()
+            print(rule)
+
+            for pattern_name, metrics_list in results.items():
+                print(f"{pattern_name:<20}", end="")
+                for metrics in metrics_list:
+                    print(f"{value_of(metrics):>{cell_width}.1f}", end="")
+                print()
+
+        print("\n" + "="*80)
+        print("DATABASE PERSISTENCE PATTERN COMPARISON")
+        print("="*80)
+
+        # Latency comparison
+        print_rate_table("P95 LATENCY COMPARISON (ms)", lambda m: m.p95_latency)
+
+        # Throughput comparison
+        print_rate_table("THROUGHPUT ACHIEVED (TPS)", lambda m: m.avg_throughput_tps)
+
+        # Characteristics comparison
+        print("\nSYSTEM CHARACTERISTICS")
+        print(f"{'Pattern':<20} {'Durability':<25} {'Consistency':<20} {'OpComplx':<8} {'Cost':<6}")
+        print("-" * 85)
+
+        for pattern_name, metrics_list in results.items():
+            if not metrics_list:
+                continue  # Nothing was measured for this pattern
+            metrics = metrics_list[0]  # Characteristics do not depend on the rate
+            print(f"{pattern_name:<20} {metrics.durability_guarantee:<25} "
+                  f"{metrics.consistency_model:<20} {metrics.operational_complexity:<8} "
+                  f"{metrics.infrastructure_cost:<6.1f}")
+
+        # Performance sweet spots
+        print("\nPERFORMANCE SWEET SPOTS")
+        print("-" * 40)
+
+        # Index by column position rather than by searching for the rate
+        # value, which breaks on repeated rates or mismatched rate lists.
+        for column, rate in enumerate(rates):
+            print(f"\nAt {rate} TPS:")
+            rate_results = [(name, metrics_list[column])
+                            for name, metrics_list in results.items()
+                            if column < len(metrics_list)]
+
+            # Sort by P95 latency
+            rate_results.sort(key=lambda entry: entry[1].p95_latency)
+
+            for i, (name, metrics) in enumerate(rate_results):
+                rank_symbol = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else "  "
+                print(f"  {rank_symbol} {name}: {metrics.p95_latency:.1f}ms P95, "
+                      f"{metrics.avg_throughput_tps:.0f} TPS achieved")
+
+    def plot_comparison_results(self, results: Dict[str, List[PersistenceMetrics]],
+                               save_path: Optional[str] = None):
+        """Plot the comparison as a 2x2 grid of charts.
+
+        Args:
+            results: Maps a pattern name to its metrics, one entry per
+                arrival rate, in the order of ``arrival_rates`` below.
+            save_path: When given, the figure is written to this path and
+                closed; otherwise it is shown with ``plt.show()``.
+
+        Plotting is best-effort: any failure is reported on stdout instead
+        of being raised.
+        """
+        # Use different markers for different patterns (unknown names fall back to 'o')
+        markers = {'WeaselDB': 'o', 'Traditional WAL': 's', 'Synchronous': '^'}
+        fig = None
+        try:
+            # NOTE(review): presumably the same rates run_comparison() simulates - confirm.
+            arrival_rates = [100, 500, 1000, 2000]
+
+            fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
+            fig.suptitle('Database Persistence Pattern Comparison', fontsize=16)
+
+            # Plot 1: P95 Latency vs Load
+            for pattern_name, metrics_list in results.items():
+                p95_latencies = [m.p95_latency for m in metrics_list]
+                # Trim the x axis so a pattern with fewer runs can still be drawn
+                ax1.plot(arrival_rates[:len(p95_latencies)], p95_latencies,
+                         marker='o', linewidth=2, label=pattern_name)
+
+            ax1.set_xlabel('Arrival Rate (TPS)')
+            ax1.set_ylabel('P95 Latency (ms)')
+            ax1.set_title('P95 Latency vs Load')
+            ax1.legend()
+            ax1.grid(True, alpha=0.3)
+            ax1.set_yscale('log')
+
+            # Plot 2: Throughput Achieved
+            for pattern_name, metrics_list in results.items():
+                throughputs = [m.avg_throughput_tps for m in metrics_list]
+                ax2.plot(arrival_rates[:len(throughputs)], throughputs,
+                         marker='s', linewidth=2, label=pattern_name)
+
+            # Perfect throughput line
+            ax2.plot(arrival_rates, arrival_rates, 'k--', alpha=0.5, label='Perfect (no loss)')
+
+            ax2.set_xlabel('Target Rate (TPS)')
+            ax2.set_ylabel('Achieved Throughput (TPS)')
+            ax2.set_title('Throughput: Target vs Achieved')
+            ax2.legend()
+            ax2.grid(True, alpha=0.3)
+
+            # Plot 3: Latency Distribution at 1000 TPS
+            rate_idx = 2  # 1000 TPS
+            percentiles = [50, 95, 99]
+            for pattern_name, metrics_list in results.items():
+                if rate_idx >= len(metrics_list):
+                    continue  # this pattern has no 1000 TPS run
+                metrics = metrics_list[rate_idx]
+                values = [metrics.median_latency, metrics.p95_latency, metrics.p99_latency]
+                ax3.bar([f"{pattern_name}\nP{p}" for p in percentiles], values,
+                       alpha=0.7, label=pattern_name)
+
+            ax3.set_ylabel('Latency (ms)')
+            ax3.set_title('Latency Percentiles at 1000 TPS')
+            ax3.set_yscale('log')
+            ax3.grid(True, alpha=0.3)
+
+            # Plot 4: Cost vs Performance
+            for pattern_name, metrics_list in results.items():
+                costs = [m.infrastructure_cost for m in metrics_list]
+                p95s = [m.p95_latency for m in metrics_list]
+                marker = markers.get(pattern_name, 'o')
+
+                ax4.scatter(costs, p95s, s=100, marker=marker, alpha=0.7, label=pattern_name)
+
+            ax4.set_xlabel('Infrastructure Cost (relative)')
+            ax4.set_ylabel('P95 Latency (ms)')
+            ax4.set_title('Cost vs Performance Trade-off')
+            ax4.legend()
+            ax4.grid(True, alpha=0.3)
+            ax4.set_yscale('log')
+
+            plt.tight_layout()
+
+            if save_path:
+                plt.savefig(save_path, dpi=300, bbox_inches='tight')
+                print(f"Comparison plots saved to {save_path}")
+            else:
+                plt.show()
+
+        except Exception as e:
+            print(f"Could not generate plots: {e}")
+        finally:
+            # pyplot keeps every figure alive until it is closed; release it
+            # once it has been written to disk (or failed while being built
+            # for a file) so repeated calls do not accumulate figures.
+            if fig is not None and save_path:
+                plt.close(fig)
+
+
+def main():
+    """Simulate every persistence pattern, print the report and save the plots."""
+    banner = (
+        "Database Persistence Pattern Comparison",
+        "Comparing WeaselDB vs Traditional Database Approaches",
+        "",
+    )
+    for banner_line in banner:
+        print(banner_line)
+
+    framework = DatabaseComparisonFramework(simulation_duration=20.0)
+    outcome = framework.run_comparison()
+    framework.print_comparison_report(outcome)
+
+    # Plotting is optional: a failure here must not discard the text report.
+    try:
+        framework.plot_comparison_results(outcome, 'database_comparison.png')
+    except Exception as plot_error:
+        print(f"Could not generate plots: {plot_error}")
+
+
+# Run the comparison only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/latency_sim/persistence_optimizer.py
+++ b/latency_sim/persistence_optimizer.py
@@ -0,0 +1,461 @@
+#!/usr/bin/env python3
+"""
+Persistence Thread Parameter Optimization
+
+Uses Bayesian Optimization to automatically find the optimal configuration
+parameters that minimize commit latency. This is much more efficient than
+grid search since it uses a probabilistic model to guide parameter exploration.
+
+Key advantages:
+- Efficiently explores high-dimensional parameter spaces
+- Uses previous simulation results to guide future parameter choices
+- Handles expensive objective function evaluations (our simulation)
+- Reports a simple correlation-based estimate of parameter importance
+"""
+
+import numpy as np
+from typing import Dict, List, Tuple, Optional
+import time
+from persistence_simulation import PersistenceSimulation, print_results
+
+# Try to import scikit-optimize for Bayesian optimization
+try:
+    from skopt import gp_minimize, forest_minimize
+    from skopt.space import Real, Integer
+    from skopt.utils import use_named_args
+    from skopt.plots import plot_convergence, plot_objective
+    import matplotlib.pyplot as plt
+    OPTIMIZE_AVAILABLE = True
+except ImportError:
+    print("scikit-optimize not available. Install with: pip install scikit-optimize")
+    print("Falling back to grid search...")
+    OPTIMIZE_AVAILABLE = False
+
+
+class PersistenceOptimizer:
+    """
+    Automated parameter optimization for the persistence thread using Bayesian optimization.
+
+    This class finds the optimal configuration parameters to minimize commit latency
+    by intelligently exploring the parameter space using Gaussian Process models.
+    """
+
+    def __init__(self,
+                 optimization_budget: int = 50,
+                 simulation_duration: float = 20.0,
+                 arrival_rate: float = 1000.0,
+                 objective_metric: str = "p95_latency",
+                 random_seed: int = 42):
+        """Store the optimization settings and build the search space.
+
+        Args:
+            optimization_budget: Number of evaluations handed to the optimizer.
+            simulation_duration: Duration passed to every simulation run.
+            arrival_rate: Arrival rate passed to every simulation run.
+            objective_metric: Name of the latency metric to minimize.
+            random_seed: Seed handed to the Bayesian optimizer.
+        """
+        # Bookkeeping filled in while the optimization runs
+        self.best_score = float('inf')
+        self.best_params = None
+        self.optimization_history = []
+
+        # User-supplied settings
+        self.random_seed = random_seed
+        self.objective_metric = objective_metric
+        self.arrival_rate = arrival_rate
+        self.simulation_duration = simulation_duration
+        self.optimization_budget = optimization_budget
+
+        # Search space, plus the dimension names in the same order
+        search_space = self._define_search_space()
+        self.parameter_space = search_space
+        self.parameter_names = [dimension.name for dimension in search_space]
+
+    def _define_search_space(self) -> List:
+        """
+        Define the parameter search space for optimization.
+
+        Focus on the 3 core parameters that matter for persistence thread performance
+        with 100% reliable S3. Retry parameters removed since S3 never fails.
+
+        Returns:
+            The scikit-optimize dimensions, or an empty list when
+            scikit-optimize is not installed (the grid-search fallback does
+            not use the search space).
+        """
+        if not OPTIMIZE_AVAILABLE:
+            # Real/Integer are only bound when the scikit-optimize import
+            # succeeded; touching them here would raise NameError in the
+            # constructor and make the grid-search fallback unreachable.
+            return []
+
+        return [
+            # Core batching parameters
+            Real(1.0, 50.0, name='batch_timeout_ms',
+                 prior='log-uniform'),  # Log scale since small changes matter
+            Integer(64 * 1024, 4 * 1024 * 1024, name='batch_size_threshold',  # 64KB - 4MB
+                    prior='log-uniform'),
+
+            # Flow control parameters - likely the most impactful
+            Integer(1, 50, name='max_in_flight_requests'),
+        ]
+
+    def _run_simulation_with_params(self, params: Dict[str, float]) -> Dict:
+        """Run one simulation for ``params`` and return its result dictionary.
+
+        Any exception raised while building or running the simulation is
+        reported on stdout and turned into a result whose latencies are all
+        10000, so a failing configuration scores as a heavy penalty.
+        """
+        # Settings that are not tuned. S3 is modeled as 100% reliable, so the
+        # retry values are irrelevant but still have to be supplied.
+        fixed_settings = {
+            'max_retry_attempts': 0,
+            'retry_base_delay_ms': 100.0,
+            's3_latency_shape': 2.0,   # Fixed Gamma shape
+            's3_latency_scale': 15.0,  # Fixed Gamma scale
+            's3_failure_rate': 0.0,
+        }
+
+        try:
+            simulation = PersistenceSimulation(
+                batch_timeout_ms=params['batch_timeout_ms'],
+                batch_size_threshold=int(params['batch_size_threshold']),
+                max_in_flight_requests=int(params['max_in_flight_requests']),
+                arrival_rate_per_sec=self.arrival_rate,
+                simulation_duration_sec=self.simulation_duration,
+                **fixed_settings
+            )
+            outcome = simulation.run_simulation()
+        except Exception as failure:
+            print(f"Simulation failed with params {params}: {failure}")
+            # High penalty score for failed simulations
+            penalty = {stat: 10000 for stat in ('mean', 'p95', 'p99')}
+            return {
+                'commit_metrics': {'latency_ms': penalty},
+                'error': str(failure)
+            }
+
+        return outcome
+
+    def _extract_objective_value(self, results: Dict) -> float:
+        """Return the value to minimize for one simulation result.
+
+        ``self.objective_metric`` selects the mean, P95 or P99 commit latency,
+        or a weighted blend of the three; any other name falls back to P95.
+        A result that lacks the required keys scores 10000.
+        """
+        try:
+            latency = results['commit_metrics']['latency_ms']
+            metric = self.objective_metric
+
+            if metric == "weighted_latency":
+                # Weighted combination emphasizing tail latencies
+                return (0.3 * latency['mean'] +
+                        0.5 * latency['p95'] +
+                        0.2 * latency['p99'])
+
+            stat_for_metric = {
+                "mean_latency": 'mean',
+                "p95_latency": 'p95',
+                "p99_latency": 'p99',
+            }
+            # Unknown metric names default to P95
+            return latency[stat_for_metric.get(metric, 'p95')]
+
+        except KeyError as missing_key:
+            print(f"Failed to extract objective from results: {missing_key}")
+            return 10000  # High penalty for invalid results
+
+    def optimize_with_bayesian(self) -> Tuple[Dict, float]:
+        """
+        Use Bayesian Optimization to find optimal parameters.
+
+        This uses Gaussian Process models to build a probabilistic model
+        of the objective function and intelligently choose where to sample next.
+        Falls back to grid search when scikit-optimize is not installed.
+
+        Returns:
+            ``(best_parameters, best_objective_value)`` as reported by
+            ``gp_minimize``. Every evaluation is also appended to
+            ``self.optimization_history``, and ``self.best_params`` /
+            ``self.best_score`` track the best one seen.
+        """
+        if not OPTIMIZE_AVAILABLE:
+            return self.optimize_with_grid_search()
+
+        print(f"Starting Bayesian Optimization with {self.optimization_budget} evaluations")
+        print(f"Objective: Minimize {self.objective_metric}")
+        print(f"Parameter space: {len(self.parameter_space)} dimensions")
+        print()
+
+        @use_named_args(self.parameter_space)
+        def objective(**params):
+            """Objective function for Bayesian optimization"""
+            print(f"Evaluating: {params}")
+
+            start_time = time.time()
+            results = self._run_simulation_with_params(params)
+            eval_time = time.time() - start_time
+
+            objective_value = self._extract_objective_value(results)
+
+            # Track optimization history
+            history_entry = {
+                'params': params.copy(),
+                'objective_value': objective_value,
+                'results': results,
+                'eval_time': eval_time,
+                'iteration': len(self.optimization_history) + 1
+            }
+            self.optimization_history.append(history_entry)
+
+            # Update best if improved
+            if objective_value < self.best_score:
+                self.best_score = objective_value
+                self.best_params = params.copy()
+                print(f"✓ NEW BEST: {objective_value:.2f}ms (evaluation {history_entry['iteration']})")
+            else:
+                print(f"  Score: {objective_value:.2f}ms")
+
+            print(f"  Time: {eval_time:.1f}s")
+            print()
+
+            return objective_value
+
+        # gp_minimize rejects n_calls < n_initial_points, so a budget below
+        # 10 must shrink the random-exploration phase accordingly.
+        n_initial_points = min(10, self.optimization_budget)
+
+        # Run Bayesian optimization
+        result = gp_minimize(
+            func=objective,
+            dimensions=self.parameter_space,
+            n_calls=self.optimization_budget,
+            n_initial_points=n_initial_points,  # Random exploration first
+            acq_func='EI',        # Expected Improvement acquisition
+            random_state=self.random_seed
+        )
+
+        # result.x lists the best values in the order of the search space
+        best_params_dict = dict(zip(self.parameter_names, result.x))
+        best_objective = result.fun
+
+        return best_params_dict, best_objective
+
+    def optimize_with_grid_search(self) -> Tuple[Dict, float]:
+        """Fallback grid search optimization if scikit-optimize not available.
+
+        Evaluates a small fixed list of configurations. Like the Bayesian
+        path, every evaluation is appended to ``self.optimization_history``
+        and ``self.best_params`` / ``self.best_score`` are updated, so the
+        summary, the importance analysis and the evaluation count also work
+        after a grid search.
+
+        Returns:
+            ``(best_parameters, best_objective_value)`` over the grid.
+        """
+        print("Using grid search optimization (install scikit-optimize for better results)")
+        print()
+
+        # Values used for any parameter a grid entry does not specify
+        default_params = {
+            'batch_timeout_ms': 5.0,
+            'batch_size_threshold': 1024 * 1024,
+            'max_in_flight_requests': 5
+        }
+
+        # Define a smaller grid for key parameters
+        grid_configs = [
+            # Vary max_in_flight and batch_timeout
+            {'max_in_flight_requests': 5, 'batch_timeout_ms': 5.0},
+            {'max_in_flight_requests': 10, 'batch_timeout_ms': 5.0},
+            {'max_in_flight_requests': 20, 'batch_timeout_ms': 5.0},
+            {'max_in_flight_requests': 10, 'batch_timeout_ms': 2.0},
+            {'max_in_flight_requests': 10, 'batch_timeout_ms': 10.0},
+            {'max_in_flight_requests': 15, 'batch_timeout_ms': 3.0},
+            {'max_in_flight_requests': 25, 'batch_timeout_ms': 7.0},
+        ]
+
+        best_params = None
+        best_score = float('inf')
+
+        for i, config in enumerate(grid_configs):
+            print(f"Evaluating config {i+1}/{len(grid_configs)}: {config}")
+
+            full_params = {**default_params, **config}
+
+            start_time = time.time()
+            results = self._run_simulation_with_params(full_params)
+            eval_time = time.time() - start_time
+            objective_value = self._extract_objective_value(results)
+
+            # Same history record layout as the Bayesian objective
+            self.optimization_history.append({
+                'params': full_params.copy(),
+                'objective_value': objective_value,
+                'results': results,
+                'eval_time': eval_time,
+                'iteration': len(self.optimization_history) + 1
+            })
+            if objective_value < self.best_score:
+                self.best_score = objective_value
+                self.best_params = full_params.copy()
+
+            if objective_value < best_score:
+                best_score = objective_value
+                best_params = full_params.copy()
+                print(f"✓ NEW BEST: {objective_value:.2f}ms")
+            else:
+                print(f"  Score: {objective_value:.2f}ms")
+            print()
+
+        return best_params, best_score
+
+    def analyze_parameter_importance(self):
+        """Analyze which parameters have the most impact on performance.
+
+        Prints the Pearson correlation between each parameter and the
+        objective over ``self.optimization_history``, then ranks the
+        parameters by absolute correlation. A correlation is undefined when
+        the parameter or the objective never varied (or fewer than two
+        evaluations exist); such parameters are reported as "n/a" and left
+        out of the ranking instead of producing nan.
+        """
+        if not self.optimization_history:
+            print("No optimization history available")
+            return
+
+        print("Parameter Importance Analysis")
+        print("=" * 50)
+
+        # Extract parameter values and objectives
+        param_data = {}
+        objectives = []
+
+        for entry in self.optimization_history:
+            objectives.append(entry['objective_value'])
+            for param_name, param_value in entry['params'].items():
+                param_data.setdefault(param_name, []).append(param_value)
+
+        objectives = np.asarray(objectives, dtype=float)
+        objective_varies = len(objectives) >= 2 and objectives.max() != objectives.min()
+
+        # Simple correlation analysis
+        print("Parameter correlations with objective (lower is better):")
+        correlations = []
+
+        for param_name, values in param_data.items():
+            values = np.asarray(values, dtype=float)
+            # np.corrcoef divides by the standard deviations, so a constant
+            # series would yield nan and corrupt the ranking below.
+            if not objective_varies or values.max() == values.min():
+                print(f"  {param_name:<25}: n/a (no variation)")
+                continue
+            correlation = np.corrcoef(values, objectives)[0, 1]
+            correlations.append((param_name, correlation))
+            print(f"  {param_name:<25}: {correlation:+.3f}")
+
+        print("\nMost impactful parameters (by absolute correlation):")
+        correlations.sort(key=lambda x: abs(x[1]), reverse=True)
+        for param_name, correlation in correlations[:5]:
+            impact = "reduces latency" if correlation < 0 else "increases latency"
+            print(f"  {param_name:<25}: {impact} (r={correlation:+.3f})")
+
+    def plot_optimization_progress(self, save_path: Optional[str] = None):
+        """Plot optimization convergence as a 2x2 grid of charts.
+
+        Does nothing when scikit-optimize (and with it matplotlib) could not
+        be imported or when no evaluation has been recorded.
+
+        Args:
+            save_path: When given, the figure is written to this path and
+                then closed; otherwise it is shown with ``plt.show()``.
+        """
+        if not OPTIMIZE_AVAILABLE or not self.optimization_history:
+            return
+
+        iterations = [entry['iteration'] for entry in self.optimization_history]
+        objectives = [entry['objective_value'] for entry in self.optimization_history]
+
+        # Running minimum (best so far)
+        running_min = np.minimum.accumulate(objectives).tolist()
+
+        fig = plt.figure(figsize=(12, 8))
+
+        # Plot 1: Objective value over iterations
+        plt.subplot(2, 2, 1)
+        plt.scatter(iterations, objectives, alpha=0.6, s=30)
+        plt.plot(iterations, running_min, 'r-', linewidth=2, label='Best so far')
+        plt.xlabel('Iteration')
+        plt.ylabel(f'{self.objective_metric} (ms)')
+        plt.title('Optimization Progress')
+        plt.legend()
+        plt.grid(True, alpha=0.3)
+
+        # Plot 2: Parameter evolution for key parameters
+        plt.subplot(2, 2, 2)
+        key_params = ['max_in_flight_requests', 'batch_timeout_ms']
+        for param in key_params:
+            if param in self.optimization_history[0]['params']:
+                values = [entry['params'][param] for entry in self.optimization_history]
+                plt.scatter(iterations, values, alpha=0.6, label=param, s=30)
+        plt.xlabel('Iteration')
+        plt.ylabel('Parameter Value')
+        plt.title('Key Parameter Evolution')
+        plt.legend()
+        plt.grid(True, alpha=0.3)
+
+        # Plot 3: Objective distribution
+        plt.subplot(2, 2, 3)
+        plt.hist(objectives, bins=20, alpha=0.7, edgecolor='black')
+        plt.axvline(self.best_score, color='red', linestyle='--',
+                   label=f'Best: {self.best_score:.1f}ms')
+        plt.xlabel(f'{self.objective_metric} (ms)')
+        plt.ylabel('Count')
+        plt.title('Objective Value Distribution')
+        plt.legend()
+        plt.grid(True, alpha=0.3)
+
+        # Plot 4: Convergence rate - how much each evaluation lowered the
+        # best-so-far value (0 for the first evaluation)
+        plt.subplot(2, 2, 4)
+        improvements = [0] + [prev_best - curr_best
+                              for prev_best, curr_best in zip(running_min, running_min[1:])]
+
+        plt.plot(iterations, improvements, 'g-', marker='o', markersize=3)
+        plt.xlabel('Iteration')
+        plt.ylabel('Improvement (ms)')
+        plt.title('Per-Iteration Improvement')
+        plt.grid(True, alpha=0.3)
+
+        plt.tight_layout()
+
+        if save_path:
+            plt.savefig(save_path, dpi=300, bbox_inches='tight')
+            # pyplot keeps figures alive until closed; this method is called
+            # once per objective, so release the figure after saving it.
+            plt.close(fig)
+            print(f"Optimization plots saved to {save_path}")
+        else:
+            plt.show()
+
+    def run_optimization(self) -> Dict:
+        """Run the search, re-simulate the winner and return a summary dict.
+
+        The summary holds the best parameters and score, the wall-clock time
+        of the search, the number of recorded evaluations, the results of a
+        final simulation with the best parameters, and the full history.
+        """
+        # Bayesian search when scikit-optimize is importable, grid search otherwise
+        search = (self.optimize_with_bayesian if OPTIMIZE_AVAILABLE
+                  else self.optimize_with_grid_search)
+
+        started_at = time.time()
+        best_params, best_score = search()
+        elapsed = time.time() - started_at
+
+        # Re-run the winning configuration to get its detailed results
+        print("Running final simulation with optimal parameters...")
+        final_results = self._run_simulation_with_params(best_params)
+
+        return {
+            'best_parameters': best_params,
+            'best_objective_value': best_score,
+            'optimization_time': elapsed,
+            'evaluations_performed': len(self.optimization_history),
+            'final_simulation_results': final_results,
+            'optimization_history': self.optimization_history
+        }
+
+    def print_optimization_summary(self, summary: Dict):
+        """Print the summary returned by ``run_optimization``.
+
+        Shows the timing and score headline, the best parameters, the
+        detailed results of the final simulation, and the parameter
+        importance analysis.
+        """
+        rule = "=" * 80
+        print(rule)
+        print("BAYESIAN OPTIMIZATION RESULTS")
+        print(rule)
+
+        print(f"Optimization completed in {summary['optimization_time']:.1f} seconds")
+        print(f"Performed {summary['evaluations_performed']} parameter evaluations")
+        print(f"Best {self.objective_metric}: {summary['best_objective_value']:.2f}ms")
+        print()
+
+        print("OPTIMAL PARAMETERS:")
+        print("-" * 40)
+        for param, value in summary['best_parameters'].items():
+            # Floats get a fixed precision (finer for "*_rate" names);
+            # everything else is printed as-is.
+            if not isinstance(value, float):
+                rendered = f"{value}"
+            elif param.endswith('_rate'):
+                rendered = f"{value:.4f}"
+            else:
+                rendered = f"{value:.2f}"
+            print(f"  {param:<25}: {rendered}")
+
+        print("\nDETAILED PERFORMANCE WITH OPTIMAL PARAMETERS:")
+        print("-" * 50)
+        print_results(summary['final_simulation_results'])
+
+        print("\nPARAMETER IMPACT ANALYSIS:")
+        print("-" * 30)
+        self.analyze_parameter_importance()
+
+
+def main():
+    """Optimize the persistence parameters once per objective metric."""
+    for intro_line in ("Persistence Thread Parameter Optimization",
+                       "Using Bayesian Optimization for intelligent parameter search",
+                       ""):
+        print(intro_line)
+
+    rule = '=' * 80
+
+    # Each objective gets its own optimizer, summary and plot file
+    for objective in ("p95_latency", "weighted_latency"):
+        print(f"\n{rule}")
+        print(f"OPTIMIZING FOR: {objective.upper()}")
+        print(f"{rule}")
+
+        optimizer = PersistenceOptimizer(
+            optimization_budget=30,    # Reasonable for demo
+            simulation_duration=15.0,  # Shorter sims for faster optimization
+            arrival_rate=1000.0,
+            objective_metric=objective,
+            random_seed=42
+        )
+
+        optimizer.print_optimization_summary(optimizer.run_optimization())
+
+        # Plotting is optional; report a failure and carry on
+        try:
+            optimizer.plot_optimization_progress(f'optimization_{objective}.png')
+        except Exception as plot_error:
+            print(f"Could not generate plots: {plot_error}")
+
+        print(f"\nOptimization for {objective} completed!")
+        print(rule)
+
+
+# Run the optimization only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/latency_sim/persistence_simulation.py
+++ b/latency_sim/persistence_simulation.py
@@ -0,0 +1,748 @@
+#!/usr/bin/env python3
+"""
+Persistence Thread Simulation
+
+Simulates the persistence thread design from persistence.md to analyze
+commit latency distributions with Poisson arrival times and realistic
+S3 response characteristics.
+
+Key metrics tracked:
+- End-to-end commit latency (arrival to acknowledgment)
+- Batch processing latencies
+- Queue depths and flow control behavior
+- Retry patterns and failure handling
+"""
+
+import heapq
+import random
+import statistics
+from dataclasses import dataclass, field
+from typing import List, Optional, Dict, Tuple
+import numpy as np
+import matplotlib.pyplot as plt
+from collections import defaultdict, deque
+import time
+
+
+@dataclass
+class Commit:
+    """Represents a single commit request"""
+    # Identifier of this commit
+    commit_id: int
+    # Simulation time at which the commit arrived
+    arrival_time: float
+    # Payload size in bytes
+    size_bytes: int = 1024  # Default 1KB per commit
+
+
+@dataclass
+class Batch:
+    """A group of commits that is sent to storage as one request.
+
+    ``size_bytes`` is not a constructor argument; it is derived from the
+    commits right after construction.
+    """
+    batch_id: int
+    commits: List[Commit]
+    created_time: float
+    size_bytes: int = field(init=False)
+    retry_count: int = 0
+
+    def __post_init__(self):
+        # Total payload is the sum of the member commits' sizes
+        total_bytes = 0
+        for commit in self.commits:
+            total_bytes += commit.size_bytes
+        self.size_bytes = total_bytes
+
+
+@dataclass
+class InFlightRequest:
+    """Tracks an in-flight S3 request"""
+    # The batch carried by this request
+    batch: Batch
+    # Simulation time at which the request was sent
+    start_time: float
+    # Simulation time at which the request is scheduled to finish
+    expected_completion: float
+    # Key under which the request is stored in the in-flight table
+    connection_id: int
+
+
+class PersistenceSimulation:
+    """
+    Simulates the persistence thread behavior described in persistence.md
+
+    For S3 latency, we use a Gamma distribution which is recommended for
+    modeling network service response times because:
+    - It is bounded below by zero, so adding a fixed minimum network RTT gives a hard latency floor
+    - It can model the right-skewed tail typical of network services
+    - It captures both typical fast responses and occasional slow responses
+    - Shape parameter controls the heaviness of the tail
+    """
+
+    def __init__(self,
+                 # Configuration from persistence.md defaults
+                 batch_timeout_ms: float = 5.0,
+                 batch_size_threshold: int = 1024 * 1024,  # 1MB
+                 max_in_flight_requests: int = 5,
+                 max_retry_attempts: int = 3,
+                 retry_base_delay_ms: float = 100.0,
+
+                 # S3 latency modeling (Gamma distribution parameters)
+                 s3_latency_shape: float = 2.0,      # Shape parameter (α)
+                 s3_latency_scale: float = 25.0,     # Scale parameter (β)
+                 s3_failure_rate: float = 0.01,      # 1% failure rate
+
+                 # Arrival rate modeling
+                 arrival_rate_per_sec: float = 1000.0,  # Lambda for Poisson
+
+                 # Simulation parameters
+                 simulation_duration_sec: float = 60.0,
+
+                 # Seed for both random number generators
+                 random_seed: int = 42):
+        """Store the configuration and reset all simulation state.
+
+        Args:
+            batch_timeout_ms: Age in milliseconds at which a batch is due.
+            batch_size_threshold: Size in bytes at which a batch is due.
+            max_in_flight_requests: Upper bound on concurrent S3 requests.
+            max_retry_attempts: Retry limit for a batch.
+            retry_base_delay_ms: Base delay between retries, in milliseconds.
+            s3_latency_shape: Shape of the Gamma latency component.
+            s3_latency_scale: Scale of the Gamma latency component.
+            s3_failure_rate: Probability that an S3 request fails.
+            arrival_rate_per_sec: Mean commit arrivals per second.
+            simulation_duration_sec: Simulated time span in seconds.
+            random_seed: Seed for the Python and NumPy generators. The
+                default of 42 reproduces the previously hard-coded seed;
+                pass other values to run independent replications.
+        """
+        # Configuration
+        self.batch_timeout_ms = batch_timeout_ms
+        self.batch_size_threshold = batch_size_threshold
+        self.max_in_flight_requests = max_in_flight_requests
+        self.max_retry_attempts = max_retry_attempts
+        self.retry_base_delay_ms = retry_base_delay_ms
+
+        # S3 modeling parameters
+        self.s3_latency_shape = s3_latency_shape
+        self.s3_latency_scale = s3_latency_scale
+        self.s3_failure_rate = s3_failure_rate
+
+        # Arrival modeling
+        self.arrival_rate_per_sec = arrival_rate_per_sec
+        self.simulation_duration_sec = simulation_duration_sec
+
+        # Simulation state
+        self.current_time = 0.0
+        self.event_queue = []  # Priority queue of (time, event_type, event_data)
+        self.pending_commits = deque()
+        self.current_batch = []
+        self.batch_start_time = None  # Will be set when first commit added to batch
+        self.in_flight_requests: Dict[int, InFlightRequest] = {}
+        self.next_batch_id = 0
+        self.next_connection_id = 0
+        self.next_commit_id = 0
+
+        # Metrics collection
+        self.completed_commits = []
+        self.batch_metrics = []
+        self.retry_counts = defaultdict(int)
+        self.queue_depth_samples = []
+        self.timeline_events = []  # For debugging/visualization
+
+        # Random number generators (same seed => reproducible results)
+        self.random_seed = random_seed
+        self.rng = random.Random(random_seed)
+        self.np_rng = np.random.RandomState(random_seed)
+
+    def sample_s3_latency(self, batch_size_bytes: int = 0) -> float:
+        """
+        Sample one S3 response latency in milliseconds.
+
+        The latency is the sum of three parts:
+        - a fixed 30ms minimum round-trip time,
+        - a Gamma(s3_latency_shape, s3_latency_scale) draw, whose mean is
+          shape * scale and whose right-skewed tail models occasional slow
+          responses,
+        - a size penalty of 20ms per MB of batch payload.
+
+        With the constructor defaults (shape 2.0, scale 25.0) an empty batch
+        averages 30 + 50 = 80ms; a scale of 15.0 gives 30 + 30 = 60ms. A 1MB
+        batch adds another 20ms on average.
+
+        Args:
+            batch_size_bytes: Payload size used for the size penalty.
+
+        Returns:
+            Latency in milliseconds.
+        """
+        # Minimum network RTT (realistic for cloud storage)
+        min_rtt = 30.0  # 30ms minimum round-trip time
+
+        # Variable latency component from Gamma distribution
+        variable_latency = self.np_rng.gamma(self.s3_latency_shape, self.s3_latency_scale)
+
+        # Size-dependent scaling: +20ms per MB
+        size_mb = batch_size_bytes / (1024 * 1024)
+        size_penalty = size_mb * 20.0  # 20ms per MB
+
+        return min_rtt + variable_latency + size_penalty
+
+    def sample_inter_arrival_time(self) -> float:
+        """Draw the gap to the next commit arrival, in seconds.
+
+        Gaps are exponentially distributed with mean 1 / arrival rate, which
+        makes the arrivals a Poisson process.
+        """
+        mean_gap_sec = 1.0 / self.arrival_rate_per_sec
+        return self.np_rng.exponential(mean_gap_sec)
+
+    def sample_commit_size(self) -> int:
+        """
+        Draw a commit size in bytes from a three-tier mix.
+
+        - 70% small:  500B  - 10KB
+        - 25% medium: 10KB  - 100KB
+        - 5%  large:  100KB - 1MB
+
+        One uniform draw picks the tier, a second draw picks the size
+        (bounds inclusive) inside it.
+        """
+        kib = 1024
+        # (cumulative probability, low, high) for every tier but the last
+        lower_tiers = (
+            (0.70, 500, 10 * kib),
+            (0.95, 10 * kib, 100 * kib),
+        )
+
+        roll = self.rng.random()
+        for cumulative_share, low, high in lower_tiers:
+            if roll < cumulative_share:
+                return self.rng.randint(low, high)
+
+        # Remaining 5%: large commits
+        return self.rng.randint(100 * kib, 1024 * kib)
+
+    def schedule_event(self, time: float, event_type: str, data=None):
+        """Queue an event so that events come off the heap in time order.
+
+        NOTE(review): entries are plain (time, event_type, data) tuples, so
+        two events with equal time and type are ordered by comparing their
+        data - confirm every payload type supports that comparison.
+        """
+        entry = (time, event_type, data)
+        heapq.heappush(self.event_queue, entry)
+
+    def should_process_batch(self) -> bool:
+        """Tell whether the open batch has hit its size or its age trigger.
+
+        An empty batch is never due. The age trigger only applies once the
+        batch has a start time.
+        """
+        pending = self.current_batch
+        if not pending:
+            return False
+
+        # Size trigger
+        total_bytes = 0
+        for commit in pending:
+            total_bytes += commit.size_bytes
+        if total_bytes >= self.batch_size_threshold:
+            return True
+
+        # Time trigger
+        started_at = self.batch_start_time
+        if started_at is None:
+            return False
+
+        timeout_sec = self.batch_timeout_ms / 1000.0
+        return bool((self.current_time - started_at) >= timeout_sec)
+
+    def can_start_new_request(self) -> bool:
+        """Flow control: True while fewer requests are in flight than the limit."""
+        in_flight_count = len(self.in_flight_requests)
+        return self.max_in_flight_requests > in_flight_count
+
+    def process_current_batch(self):
+        """Seal the open batch and send it to S3 when it is due.
+
+        Nothing happens when the batch is empty, when flow control forbids a
+        new request, or when neither batching trigger has fired.
+        """
+        if not self.current_batch:
+            return
+        if not self.can_start_new_request():
+            return
+        if not self.should_process_batch():
+            return
+
+        batch_id = self.next_batch_id
+        sealed = Batch(
+            batch_id=batch_id,
+            commits=list(self.current_batch),  # snapshot; the open batch is reused
+            created_time=self.current_time
+        )
+        self.next_batch_id = batch_id + 1
+
+        # Start over with an empty batch; its start time is set by the next commit
+        self.current_batch.clear()
+        self.batch_start_time = None
+
+        self.send_batch_to_s3(sealed)
+
+    def send_batch_to_s3(self, batch: Batch, is_retry: bool = False):
+        """Start a simulated S3 upload for ``batch`` and register it as in flight.
+
+        Args:
+            batch: The batch to upload.
+            is_retry: Recorded in the timeline entry for this send.
+
+        If every in-flight slot is taken, the send is deferred by scheduling
+        a 'retry_batch' event at ``current_time + 0.001``. Otherwise a
+        latency and a success/failure outcome are drawn, the request is
+        stored in ``in_flight_requests`` under a new connection id, and a
+        'batch_completed' or 'batch_failed' event is scheduled for the
+        moment the simulated request finishes.
+        """
+        if not self.can_start_new_request():
+            # Flow control normally prevents this; defer rather than drop.
+            self.schedule_event(self.current_time + 0.001, 'retry_batch', batch)
+            return
+
+        # Draw the outcome: latency first (batch size feeds the latency
+        # model; the sample is converted from ms to seconds), then whether
+        # this attempt fails.
+        latency_sec = self.sample_s3_latency(batch.size_bytes) / 1000.0
+        failed = self.rng.random() < self.s3_failure_rate
+        if failed:
+            latency_sec *= 2  # Failed requests typically take longer
+
+        finish_time = self.current_time + latency_sec
+
+        # Allocate a connection id and record the in-flight request
+        conn_id = self.next_connection_id
+        self.next_connection_id += 1
+        self.in_flight_requests[conn_id] = InFlightRequest(
+            batch=batch,
+            start_time=self.current_time,
+            expected_completion=finish_time,
+            connection_id=conn_id,
+        )
+
+        # Schedule the event that resolves this request
+        outcome_event = 'batch_failed' if failed else 'batch_completed'
+        self.schedule_event(finish_time, outcome_event, conn_id)
+
+        # Log event for analysis
+        self.timeline_events.append({
+            'time': self.current_time,
+            'event': 'batch_sent',
+            'batch_id': batch.batch_id,
+            'batch_size': batch.size_bytes,
+            'commit_count': len(batch.commits),
+            'retry_count': batch.retry_count,
+            'is_retry': is_retry,
+        })
+
+    def handle_batch_completed(self, connection_id: int):
+        """Finish a successful upload: record metrics and free its slot.
+
+        Args:
+            connection_id: Key of the request in ``in_flight_requests``.
+                Unknown ids are ignored.
+        """
+        try:
+            request = self.in_flight_requests.pop(connection_id)
+        except KeyError:
+            return
+
+        done = request.batch
+        now = self.current_time
+
+        # Batch-level latency is measured from batch creation
+        elapsed = now - done.created_time
+        self.batch_metrics.append({
+            'batch_id': done.batch_id,
+            'latency': elapsed,
+            'size_bytes': done.size_bytes,
+            'commit_count': len(done.commits),
+            'retry_count': done.retry_count,
+        })
+
+        # Commit-level latency is end-to-end, measured from arrival
+        self.completed_commits.extend(
+            {
+                'commit_id': c.commit_id,
+                'arrival_time': c.arrival_time,
+                'completion_time': now,
+                'latency': now - c.arrival_time,
+                'batch_id': done.batch_id,
+                'retry_count': done.retry_count,
+            }
+            for c in done.commits
+        )
+
+        self.timeline_events.append({
+            'time': now,
+            'event': 'batch_completed',
+            'batch_id': done.batch_id,
+            'latency': elapsed,
+            'retry_count': done.retry_count,
+        })
+
+        # A slot just opened up, so give queued commits a chance to go out
+        self.process_pending_work()
+
+    def handle_batch_failed(self, connection_id: int):
+        """Handle a failed upload: free its slot, then retry or abandon the batch.
+
+        Args:
+            connection_id: Key of the request in ``in_flight_requests``.
+                Unknown ids are ignored.
+
+        While ``retry_count`` is below ``max_retry_attempts`` the batch is
+        rescheduled with exponential backoff
+        (``retry_base_delay_ms * 2 ** (retry_count - 1)``); otherwise a
+        'batch_abandoned' timeline entry is recorded and the batch is dropped.
+        """
+        if connection_id not in self.in_flight_requests:
+            return
+
+        in_flight = self.in_flight_requests.pop(connection_id)
+        batch = in_flight.batch
+
+        self.timeline_events.append({
+            'time': self.current_time,
+            'event': 'batch_failed',
+            'batch_id': batch.batch_id,
+            'retry_count': batch.retry_count
+        })
+
+        if batch.retry_count < self.max_retry_attempts:
+            # Exponential backoff retry
+            batch.retry_count += 1
+            backoff_delay = (self.retry_base_delay_ms / 1000.0) * (2 ** (batch.retry_count - 1))
+            retry_time = self.current_time + backoff_delay
+
+            self.schedule_event(retry_time, 'retry_batch', batch)
+            self.retry_counts[batch.retry_count] += 1
+        else:
+            # Max retries exhausted - this would be a fatal error in real system
+            self.timeline_events.append({
+                'time': self.current_time,
+                'event': 'batch_abandoned',
+                'batch_id': batch.batch_id,
+                'retry_count': batch.retry_count
+            })
+
+        # The failed request no longer holds an in-flight slot (it was popped
+        # above), so queued commits can use the freed capacity, exactly as
+        # after a successful completion. Without this they stayed stranded in
+        # pending_commits until some other batch completed, while newer
+        # arrivals were batched ahead of them.
+        self.process_pending_work()
+
+    def handle_retry_batch(self, batch: Batch):
+        """Re-submit ``batch`` to S3, flagging the send as a retry."""
+        self.send_batch_to_s3(batch=batch, is_retry=True)
+
+    def process_pending_work(self):
+        """Move queued commits into the open batch and flush whatever is due.
+
+        Commits are taken from ``pending_commits`` in FIFO order for as long
+        as an in-flight slot is free. The drain stops after the first batch
+        it sends. If the drain ends without sending anything, the open batch
+        is still offered to ``process_current_batch()`` so that a batch whose
+        trigger fired while every slot was busy goes out as soon as capacity
+        returns.
+        """
+        while self.pending_commits and self.can_start_new_request():
+            if not self.current_batch:
+                self.batch_start_time = self.current_time
+                # A batch opened here gets its own timeout event, the same as
+                # one opened by a commit arrival; otherwise nothing would
+                # flush it until another commit happened to arrive.
+                self.schedule_event(
+                    self.current_time + (self.batch_timeout_ms / 1000.0),
+                    'batch_timeout',
+                    None,
+                )
+
+            commit = self.pending_commits.popleft()
+            self.current_batch.append(commit)
+
+            # Check if we should process this batch immediately
+            if self.should_process_batch():
+                self.process_current_batch()
+                break
+        else:
+            # Nothing was sent by the loop (possibly because there was
+            # nothing pending). process_current_batch() re-checks capacity
+            # and triggers itself, so this is a no-op unless a batch is due.
+            self.process_current_batch()
+
+    def handle_commit_arrival(self, commit: Commit):
+        """Accept a newly arrived commit.
+
+        Args:
+            commit: The arriving commit.
+
+        With a free in-flight slot the commit joins the open batch (opening
+        one if necessary) and the batch is flushed if a trigger has fired.
+        Without a free slot the commit waits in ``pending_commits``.
+        """
+        opened_batch = False
+
+        # If we have in-flight capacity, try to add to current batch
+        if self.can_start_new_request():
+            if not self.current_batch:
+                self.batch_start_time = self.current_time
+                opened_batch = True
+
+            self.current_batch.append(commit)
+            self.process_current_batch()  # Check if we should process now
+        else:
+            # Add to pending queue due to flow control
+            self.pending_commits.append(commit)
+
+        # Schedule the timeout for a batch this arrival opened and that is
+        # still open. Tracking "opened" explicitly, instead of inferring it
+        # from len(current_batch) == 1 with an empty pending queue, ensures
+        # a batch opened while commits are still pending also gets a timeout.
+        if opened_batch and self.current_batch:
+            timeout_time = self.current_time + (self.batch_timeout_ms / 1000.0)
+            self.schedule_event(timeout_time, 'batch_timeout', None)
+
+    def handle_batch_timeout(self):
+        """React to a 'batch_timeout' event by trying to flush the open batch."""
+        if not self.current_batch:
+            return
+        if not self.can_start_new_request():
+            return
+        self.process_current_batch()
+
+    def sample_queue_depth(self):
+        """Append a snapshot of the queue sizes at the current simulated time.
+
+        'total_unacknowledged' counts commits that are pending, in the open
+        batch, or inside a batch that is currently in flight.
+        """
+        queued = len(self.pending_commits)
+        batching = len(self.current_batch)
+        uploading = sum(
+            len(req.batch.commits) for req in self.in_flight_requests.values()
+        )
+
+        snapshot = {
+            'time': self.current_time,
+            'pending_commits': queued,
+            'current_batch_size': batching,
+            'in_flight_requests': len(self.in_flight_requests),
+            'total_unacknowledged': queued + batching + uploading,
+        }
+        self.queue_depth_samples.append(snapshot)
+
+    def generate_arrivals(self):
+        """Pre-schedule every commit arrival and every queue-depth sample.
+
+        Arrival times are built by accumulating gaps drawn from
+        ``sample_inter_arrival_time()`` until ``simulation_duration_sec`` is
+        reached; each arrival becomes a 'commit_arrival' event carrying a
+        new ``Commit``. A 'sample_queue_depth' event is then scheduled every
+        0.1 time units from 0 up to (but excluding) the simulation duration.
+        """
+        current_time = 0.0
+
+        while current_time < self.simulation_duration_sec:
+            # Sample next arrival time
+            inter_arrival = self.sample_inter_arrival_time()
+            current_time += inter_arrival
+
+            if current_time >= self.simulation_duration_sec:
+                break
+
+            # Create commit
+            commit = Commit(
+                commit_id=self.next_commit_id,
+                arrival_time=current_time,
+                size_bytes=self.sample_commit_size()  # Realistic size distribution
+            )
+            self.next_commit_id += 1
+
+            # Schedule arrival event
+            self.schedule_event(current_time, 'commit_arrival', commit)
+
+        # Schedule periodic queue depth sampling. Sample times are derived
+        # from an integer tick rather than by repeatedly adding 0.1:
+        # accumulated rounding error makes the running sum drift (ten
+        # additions give 0.9999999999999999, not 1.0), which both skews the
+        # timestamps and schedules one sample too many at the end of the run.
+        sample_interval = 0.1  # Sample every 100ms
+        tick = 0
+        while tick * sample_interval < self.simulation_duration_sec:
+            self.schedule_event(tick * sample_interval, 'sample_queue_depth', None)
+            tick += 1
+
+    def run_simulation(self):
+        """Drive the event loop from start to finish and return the analysis.
+
+        All arrivals are generated up front, then events are popped in time
+        order and dispatched by type until the queue is empty, an event lies
+        beyond ``simulation_duration_sec``, or 1,000,000 events have been
+        handled. Event types without a handler are counted but otherwise
+        ignored.
+
+        Returns:
+            Whatever ``analyze_results()`` returns for the finished run.
+        """
+        print("Starting persistence simulation...")
+        print(f"Arrival rate: {self.arrival_rate_per_sec} commits/sec")
+        print(f"Duration: {self.simulation_duration_sec} seconds")
+        expected_commits = int(self.arrival_rate_per_sec * self.simulation_duration_sec)
+        print(f"Expected commits: ~{expected_commits}")
+        print()
+
+        # Generate all arrival events
+        self.generate_arrivals()
+
+        # Handlers that receive the event payload ...
+        payload_handlers = {
+            'commit_arrival': self.handle_commit_arrival,
+            'batch_completed': self.handle_batch_completed,
+            'batch_failed': self.handle_batch_failed,
+            'retry_batch': self.handle_retry_batch,
+        }
+        # ... and handlers that take no argument
+        plain_handlers = {
+            'batch_timeout': self.handle_batch_timeout,
+            'sample_queue_depth': self.sample_queue_depth,
+        }
+
+        max_events = 1000000  # Safety limit
+        events_processed = 0
+        while self.event_queue and events_processed < max_events:
+            event_time, event_type, payload = heapq.heappop(self.event_queue)
+            self.current_time = event_time
+
+            if event_time > self.simulation_duration_sec:
+                break
+
+            if event_type in payload_handlers:
+                payload_handlers[event_type](payload)
+            elif event_type in plain_handlers:
+                plain_handlers[event_type]()
+
+            events_processed += 1
+
+        print(f"Simulation completed. Processed {events_processed} events.")
+        return self.analyze_results()
+
+    def analyze_results(self) -> Dict:
+        """Summarise the finished run as a nested dict of metrics.
+
+        Returns:
+            ``{"error": ...}`` when no commit completed. Otherwise a dict
+            with the sections 'simulation_config', 'commit_metrics'
+            (latencies in ms), 'batch_metrics', 'retry_analysis' and
+            'queue_depth_analysis'.
+        """
+        if not self.completed_commits:
+            return {"error": "No commits completed during simulation"}
+
+        # Per-commit latencies, converted from seconds to ms
+        latencies = [c['latency'] * 1000 for c in self.completed_commits]
+
+        config_summary = {
+            'duration_sec': self.simulation_duration_sec,
+            'arrival_rate_per_sec': self.arrival_rate_per_sec,
+            'batch_timeout_ms': self.batch_timeout_ms,
+            'batch_size_threshold': self.batch_size_threshold,
+            'max_in_flight_requests': self.max_in_flight_requests,
+            's3_latency_params': f"Gamma(shape={self.s3_latency_shape}, scale={self.s3_latency_scale})",
+            's3_failure_rate': self.s3_failure_rate,
+        }
+
+        latency_summary = {
+            'mean': statistics.mean(latencies),
+            'median': statistics.median(latencies),
+            'std': statistics.stdev(latencies) if len(latencies) > 1 else 0,
+            'min': min(latencies),
+            'max': max(latencies),
+            'p95': np.percentile(latencies, 95),
+            'p99': np.percentile(latencies, 99),
+        }
+
+        commits_per_batch = [b['commit_count'] for b in self.batch_metrics]
+        bytes_per_batch = [b['size_bytes'] for b in self.batch_metrics]
+        batch_latencies_ms = [b['latency'] * 1000 for b in self.batch_metrics]
+        batch_summary = {
+            'total_batches': len(self.batch_metrics),
+            'avg_commits_per_batch': statistics.mean(commits_per_batch),
+            'avg_batch_size_bytes': statistics.mean(bytes_per_batch),
+            'avg_batch_latency_ms': statistics.mean(batch_latencies_ms),
+        }
+
+        return {
+            'simulation_config': config_summary,
+            'commit_metrics': {
+                'total_commits': len(self.completed_commits),
+                'latency_ms': latency_summary,
+            },
+            'batch_metrics': batch_summary,
+            'retry_analysis': dict(self.retry_counts),
+            'queue_depth_analysis': self._analyze_queue_depths(),
+        }
+
+    def _analyze_queue_depths(self) -> Dict:
+        """Summarise the sampled queue depths as mean, max and 95th percentile.
+
+        Returns:
+            An empty dict when no samples were taken; otherwise one
+            ``{'mean', 'max', 'p95'}`` dict per tracked series.
+        """
+        samples = self.queue_depth_samples
+        if not samples:
+            return {}
+
+        def summarize(series_name):
+            series = [s[series_name] for s in samples]
+            return {
+                'mean': statistics.mean(series),
+                'max': max(series),
+                'p95': np.percentile(series, 95),
+            }
+
+        tracked = ('pending_commits', 'in_flight_requests', 'total_unacknowledged')
+        return {series_name: summarize(series_name) for series_name in tracked}
+
+    def plot_results(self, results: Dict, save_path: Optional[str] = None):
+        """Render a 2x2 summary figure for this simulation run.
+
+        Args:
+            results: Result dict; its ``commit_metrics.latency_ms`` entry
+                supplies the mean and P95 marker lines on the histogram.
+            save_path: File to write the figure to. When falsy the figure
+                is shown with ``plt.show()`` instead.
+        """
+        if not self.completed_commits:
+            print("No data to plot")
+            return
+
+        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
+        (ax1, ax2), (ax3, ax4) = axes
+        fig.suptitle('Persistence Thread Simulation Results', fontsize=16)
+
+        # Plot 1: Commit latency histogram with mean / P95 markers
+        latencies_ms = [c['latency'] * 1000 for c in self.completed_commits]
+        ax1.hist(latencies_ms, bins=50, alpha=0.7, edgecolor='black')
+        ax1.set_xlabel('Commit Latency (ms)')
+        ax1.set_ylabel('Count')
+        ax1.set_title('Commit Latency Distribution')
+        latency_stats = results['commit_metrics']['latency_ms']
+        for stat_key, colour, tag in (('mean', 'red', 'Mean'), ('p95', 'orange', 'P95')):
+            ax1.axvline(latency_stats[stat_key], color=colour,
+                        linestyle='--', label=f"{tag}: {latency_stats[stat_key]:.1f}ms")
+        ax1.legend()
+
+        # Plot 2: Timeline of commit completions
+        finished_at = [c['completion_time'] for c in self.completed_commits]
+        ax2.scatter(finished_at, latencies_ms, alpha=0.6, s=10)
+        ax2.set_xlabel('Time (seconds)')
+        ax2.set_ylabel('Commit Latency (ms)')
+        ax2.set_title('Latency Over Time')
+
+        # Plot 3: Queue depth over time
+        depth_samples = self.queue_depth_samples
+        if depth_samples:
+            sample_times = [s['time'] for s in depth_samples]
+            series = (
+                ('pending_commits', 'Pending Commits'),
+                ('in_flight_requests', 'In-Flight Requests'),
+                ('total_unacknowledged', 'Total Unacknowledged'),
+            )
+            # Extract every series before drawing any of them
+            curves = [([s[key] for s in depth_samples], label) for key, label in series]
+            for values, label in curves:
+                ax3.plot(sample_times, values, label=label, alpha=0.8)
+            ax3.axhline(self.max_in_flight_requests, color='red', linestyle='--',
+                        label=f'Max In-Flight Limit ({self.max_in_flight_requests})')
+            ax3.set_xlabel('Time (seconds)')
+            ax3.set_ylabel('Count')
+            ax3.set_title('Queue Depths Over Time')
+            ax3.legend()
+
+        # Plot 4: Batch size distribution (commits per batch)
+        if self.batch_metrics:
+            commits_per_batch = [b['commit_count'] for b in self.batch_metrics]
+            ax4.hist(commits_per_batch, bins=20, alpha=0.7, edgecolor='black')
+            ax4.set_xlabel('Commits per Batch')
+            ax4.set_ylabel('Count')
+            ax4.set_title('Batch Size Distribution')
+
+        plt.tight_layout()
+
+        if not save_path:
+            plt.show()
+            return
+
+        plt.savefig(save_path, dpi=300, bbox_inches='tight')
+        print(f"Plots saved to {save_path}")
+
+
+def print_results(results: Dict):
+    """Write a human-readable report of one simulation's results to stdout.
+
+    Args:
+        results: Dict with the 'simulation_config', 'commit_metrics',
+            'batch_metrics', 'retry_analysis' and 'queue_depth_analysis'
+            sections. The last two are only printed when non-empty.
+    """
+    banner = "=" * 80
+    print(banner)
+    print("PERSISTENCE THREAD SIMULATION RESULTS")
+    print(banner)
+
+    # Configuration
+    cfg = results['simulation_config']
+    print("\nConfiguration:")
+    print(f"  Duration: {cfg['duration_sec']}s")
+    print(f"  Arrival Rate: {cfg['arrival_rate_per_sec']} commits/sec")
+    print(f"  Batch Timeout: {cfg['batch_timeout_ms']}ms")
+    print(f"  Batch Size Threshold: {cfg['batch_size_threshold']:,} bytes")
+    print(f"  Max In-Flight: {cfg['max_in_flight_requests']}")
+    print(f"  S3 Latency: {cfg['s3_latency_params']}")
+    print(f"  S3 Failure Rate: {cfg['s3_failure_rate']:.1%}")
+
+    # Commit metrics
+    commits = results['commit_metrics']
+    lat = commits['latency_ms']
+    print("\nCommit Performance:")
+    print(f"  Total Commits: {commits['total_commits']:,}")
+    print(f"  Latency Mean: {lat['mean']:.2f}ms")
+    print(f"  Latency Median: {lat['median']:.2f}ms")
+    print(f"  Latency P95: {lat['p95']:.2f}ms")
+    print(f"  Latency P99: {lat['p99']:.2f}ms")
+    print(f"  Latency Std: {lat['std']:.2f}ms")
+    print(f"  Latency Range: {lat['min']:.2f}ms - {lat['max']:.2f}ms")
+
+    # Batch metrics
+    batches = results['batch_metrics']
+    print("\nBatching Performance:")
+    print(f"  Total Batches: {batches['total_batches']:,}")
+    print(f"  Avg Commits/Batch: {batches['avg_commits_per_batch']:.1f}")
+    print(f"  Avg Batch Size: {batches['avg_batch_size_bytes']/1024:.1f}KB")
+    print(f"  Avg Batch Latency: {batches['avg_batch_latency_ms']:.2f}ms")
+
+    # Retry analysis
+    retries = results['retry_analysis']
+    if retries:
+        print("\nRetry Analysis:")
+        for attempt, occurrences in retries.items():
+            print(f"  {occurrences:,} batches required {attempt} retries")
+
+    # Queue depth analysis: one line per series that is present
+    depths = results['queue_depth_analysis']
+    if depths:
+        print("\nQueue Depth Analysis:")
+        sections = (
+            ('pending_commits', 'Pending Commits'),
+            ('in_flight_requests', 'In-Flight Requests'),
+            ('total_unacknowledged', 'Total Unacknowledged'),
+        )
+        for key, label in sections:
+            if key not in depths:
+                continue
+            stats = depths[key]
+            print(f"  {label} - Mean: {stats['mean']:.1f}, Max: {stats['max']}, P95: {stats['p95']:.1f}")
+
+
+if __name__ == "__main__":
+    # Script entry point: run the simulation under several configurations,
+    # print a side-by-side comparison and plot the run with the lowest P95.
+    print("Running Persistence Thread Configuration Analysis")
+    print("S3 Latency Modeling: Gamma distribution (shape=2.0, scale=25ms)")
+    print("Testing different configurations to optimize latency...")
+    print()
+
+    # Test configurations with different max_in_flight values
+    configs = [
+        {"name": "Baseline (max_in_flight=5)", "max_in_flight_requests": 5},
+        {"name": "Higher Parallelism (max_in_flight=10)", "max_in_flight_requests": 10},
+        {"name": "Much Higher (max_in_flight=20)", "max_in_flight_requests": 20},
+        {"name": "Lower Timeout (max_in_flight=10, timeout=2ms)", "max_in_flight_requests": 10, "batch_timeout_ms": 2.0},
+        {"name": "Higher Timeout (max_in_flight=10, timeout=10ms)", "max_in_flight_requests": 10, "batch_timeout_ms": 10.0},
+    ]
+
+    # One (results, simulation) pair per configuration that completed commits.
+    # The simulation object is kept so the winning run can be plotted from
+    # the same data its metrics were computed from. Re-running the
+    # configuration for plotting costs a second full simulation and, unless
+    # the simulation is deterministically seeded, draws the histogram from a
+    # different run than the mean/P95 marker lines.
+    completed_runs = []
+
+    for config in configs:
+        print(f"\n{'='*60}")
+        print(f"Testing: {config['name']}")
+        print(f"{'='*60}")
+
+        sim = PersistenceSimulation(
+            arrival_rate_per_sec=1000.0,
+            simulation_duration_sec=30.0,
+            s3_latency_shape=2.0,
+            s3_latency_scale=25.0,
+            s3_failure_rate=0.01,
+            max_in_flight_requests=config.get("max_in_flight_requests", 5),
+            batch_timeout_ms=config.get("batch_timeout_ms", 5.0)
+        )
+
+        results = sim.run_simulation()
+        if 'error' in results:
+            # analyze_results() returns {"error": ...} instead of metrics
+            # when nothing completed; such a run cannot be printed or ranked.
+            print(f"\nSkipping configuration: {results['error']}")
+            continue
+
+        results["config_name"] = config["name"]
+        completed_runs.append((results, sim))
+
+        # Print key metrics for quick comparison
+        commit_metrics = results['commit_metrics']
+        batch_metrics = results['batch_metrics']
+        queue_metrics = results.get('queue_depth_analysis', {})
+
+        print(f"\nKey Metrics:")
+        print(f"  Mean Latency: {commit_metrics['latency_ms']['mean']:.1f}ms")
+        print(f"  P95 Latency: {commit_metrics['latency_ms']['p95']:.1f}ms")
+        print(f"  P99 Latency: {commit_metrics['latency_ms']['p99']:.1f}ms")
+        print(f"  Avg Commits/Batch: {batch_metrics['avg_commits_per_batch']:.1f}")
+        print(f"  Avg Batch Size: {batch_metrics['avg_batch_size_bytes']/1024:.1f}KB")
+        if queue_metrics:
+            print(f"  Avg Queue Depth: {queue_metrics.get('total_unacknowledged', {}).get('mean', 0):.1f}")
+            print(f"  Max Queue Depth: {queue_metrics.get('total_unacknowledged', {}).get('max', 0)}")
+
+    if not completed_runs:
+        # min() below would raise ValueError on an empty sequence
+        raise SystemExit("No configuration completed any commits; nothing to compare")
+
+    # Summary comparison
+    print(f"\n{'='*80}")
+    print("CONFIGURATION COMPARISON SUMMARY")
+    print(f"{'='*80}")
+    print(f"{'Configuration':<40} {'Mean':<8} {'P95':<8} {'P99':<8} {'AvgQueue':<10}")
+    print(f"{'-'*80}")
+
+    for result, _ in completed_runs:
+        name = result["config_name"]
+        commit_metrics = result['commit_metrics']
+        queue_metrics = result.get('queue_depth_analysis', {})
+        mean_lat = commit_metrics['latency_ms']['mean']
+        p95_lat = commit_metrics['latency_ms']['p95']
+        p99_lat = commit_metrics['latency_ms']['p99']
+        avg_queue = queue_metrics.get('total_unacknowledged', {}).get('mean', 0)
+
+        print(f"{name:<40} {mean_lat:<8.1f} {p95_lat:<8.1f} {p99_lat:<8.1f} {avg_queue:<10.1f}")
+
+    print(f"\nRecommendation: Choose config with lowest P95/P99 latencies")
+    print(f"Note: Higher in-flight allows more parallelism but may increase queue variability")
+
+    # Generate plots for best configuration (lowest P95 commit latency)
+    best_results, best_sim = min(
+        completed_runs,
+        key=lambda run: run[0]['commit_metrics']['latency_ms']['p95'],
+    )
+    print(f"\nGenerating plots for best configuration: {best_results['config_name']}")
+
+    try:
+        best_sim.plot_results(best_results, 'persistence_optimization_results.png')
+    except Exception as e:
+        # Plotting is best-effort: report the problem, keep the text summary
+        print(f"\nCould not generate plots: {e}")
+        print("Install matplotlib and numpy to enable visualization")
--- a/persistence.md
+++ b/persistence.md
@@ -0,0 +1,111 @@
+# Persistence Thread Design
+
+## Overview
+
+The persistence thread receives commit batches from the main processing pipeline and uploads them to S3. It uses a single-threaded design with connection pooling and batching for optimal performance.
+
+## Architecture
+
+**Input**: Commits arrive via `ThreadPipeline` interface from upstream processing
+**Output**: Batched commits uploaded to S3 persistence backend
+**Transport**: Single-threaded TCP client with connection pooling
+**Protocol**: Higher layers handle HTTP, authentication, and S3-specific details
+
+## Batching Strategy
+
+The persistence thread collects commits into batches using two trigger conditions:
+
+1. **Time Trigger**: `batch_timeout_ms` elapsed since batch collection started
+2. **Size Trigger**: accumulated batch size reaches `batch_size_threshold` bytes (can be exceeded by the final commit)
+
+**Flow Control**: When `max_in_flight_requests` is reached, block until responses are received.
+
+## Main Processing Loop
+
+### 1. Batch Collection
+
+**No In-Flight Requests**:
+- Use blocking acquire to get first commit batch
+- Process immediately (no batching delay)
+
+**With In-Flight Requests**:
+- Check flow control: if at `max_in_flight_requests`, block for responses
+- Collect commits using non-blocking acquire until trigger condition:
+  - Check for available commits (non-blocking)
+  - If `batch_size_threshold` reached → process batch immediately
+  - If below threshold → use `epoll_wait(batch_timeout_ms)` for I/O and timeout
+  - On timeout → process collected commits
+- If no commits available and no in-flight requests → switch to blocking acquire
+
+### 2. Connection Management
+
+- Acquire healthy connection from pool
+- Create new connections if pool below `target_pool_size`
+- If no healthy connections available, block until one becomes available
+- Maintain automatic pool replenishment
+
+### 3. Data Transmission
+
+- Write batch data to S3 connection using appropriate protocol
+- Publish accepted transactions to subscriber system
+- Track request as in-flight for flow control
+
+### 4. I/O Event Processing
+
+- Handle epoll events for all in-flight connections
+- Monitor connection health via heartbeats
+- Process incoming responses and detect connection failures
+
+### 5. Response Handling
+
+- **Ordered Acknowledgment**: Only acknowledge batch after all prior batches are durable
+- Release batch via `StageGuard` destructor (publishes to next pipeline stage)
+- Publish durability events to subscriber system
+- Return healthy connection to pool
+
+### 6. Failure Handling
+
+- Remove failed connection from pool
+- Retry batch with exponential backoff (up to `max_retry_attempts`)
+- Backoff delays only affect the specific failing batch
+- If retries exhausted, abort process or escalate error
+- Initiate pool replenishment if below target
+
+## Connection Pool
+
+**Target Size**: `target_pool_size` connections (recommended: 2x `max_in_flight_requests`)
+**Replenishment**: Automatic creation when below target
+**Health Monitoring**: Heartbeat-based connection validation
+**Sizing Rationale**: 2x multiplier ensures availability during peak load and connection replacement
+
+## Key Design Properties
+
+**Batch Ordering**: Batches may be retried out-of-order for performance, but acknowledgment to next pipeline stage maintains strict ordering.
+
+**Backpressure**: Retry delays for failing batches create natural backpressure that eventually blocks the persistence thread when in-flight limits are reached.
+
+**Graceful Shutdown**: On shutdown signal, drain all in-flight batches to completion before terminating.
+
+## Configuration Parameters
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `batch_timeout_ms` | 1ms | Maximum time to wait collecting commits for batching |
+| `batch_size_threshold` | 1MB | Accumulated batch size (in bytes) that triggers batch processing |
+| `max_in_flight_requests` | 50 | Maximum concurrent requests to persistence backend |
+| `target_pool_size` | 2x in-flight | Target number of connections to maintain |
+| `max_retry_attempts` | 3 | Maximum retries for failed batches before aborting |
+| `retry_base_delay_ms` | 100ms | Base delay for exponential backoff retries |
+
+## Configuration Validation
+
+**Required Constraints**:
+- `batch_size_threshold` > 0 (size threshold in bytes must be positive; every batch still contains at least one commit)
+- `max_in_flight_requests` > 0 (must allow at least one concurrent request)
+- `target_pool_size` >= `max_in_flight_requests` (pool must accommodate all in-flight requests)
+- `batch_timeout_ms` > 0 (timeout must be positive)
+- `max_retry_attempts` >= 0 (zero disables retries)
+- `retry_base_delay_ms` > 0 (delay must be positive if retries enabled)
+
+**Performance Recommendations**:
+- `target_pool_size` <= 2x `max_in_flight_requests` (optimal for performance)
Author	SHA1	Message	Date
Andrew Noyes	1a4e8d5761	Add latency sim. Not reviewed	2025-08-24 22:32:47 -04:00
Andrew Noyes	506bbbb528	Update defaults based on simulation	2025-08-24 21:34:46 -04:00
Andrew Noyes	da69a99cf4	Make defaults for batch_size_threshold, max_in_flight_requests	2025-08-24 20:23:17 -04:00
Andrew Noyes	333148bb5a	Improve clarity	2025-08-24 20:02:11 -04:00
Andrew Noyes	f54d1e0dc1	Initial persistence thread design	2025-08-24 19:47:29 -04:00