@@ -86,75 +86,100 @@ def benchmark_table(tmp_path_factory: pytest.TempPathFactory) -> Table:
8686
@pytest.mark.benchmark
@pytest.mark.parametrize(
    "streaming,concurrent_files,batch_size,max_workers",
    [
        pytest.param(False, 1, None, None, id="default"),
        pytest.param(False, 1, None, 4, id="default-4threads"),
        pytest.param(True, 1, None, None, id="streaming-cf1"),
        pytest.param(True, 2, None, None, id="streaming-cf2"),
        pytest.param(True, 4, None, None, id="streaming-cf4"),
        pytest.param(True, 8, None, None, id="streaming-cf8"),
        pytest.param(True, 16, None, None, id="streaming-cf16"),
    ],
)
def test_read_throughput(
    benchmark_table: Table,
    streaming: bool,
    concurrent_files: int,
    batch_size: int | None,
    max_workers: int | None,
) -> None:
    """Measure records/sec, time to first record, and peak Arrow memory for a scan configuration.

    Args:
        benchmark_table: Pre-populated Iceberg table fixture to scan.
        streaming: Whether to use the streaming batch reader path.
        concurrent_files: Number of files read concurrently (streaming mode only).
        batch_size: Arrow record-batch size; None uses the PyArrow default.
        max_workers: If set, temporarily overrides the shared ExecutorFactory
            thread pool with one of this size for the duration of the test.
    """
    from pyiceberg.utils.concurrent import ExecutorFactory

    effective_batch_size = batch_size or 131_072  # PyArrow default
    if streaming:
        config_str = f"streaming=True, concurrent_files={concurrent_files}, batch_size={effective_batch_size}"
    else:
        workers_str = f", max_workers={max_workers}" if max_workers else ""
        config_str = f"streaming=False (executor.map, all files parallel), batch_size={effective_batch_size}{workers_str}"
    print("\n--- ArrowScan Read Throughput Benchmark ---")
    print(f"Config: {config_str}")
    print(f"  Files: {NUM_FILES}, Rows per file: {ROWS_PER_FILE}, Total rows: {TOTAL_ROWS}")

    elapsed_times: list[float] = []
    throughputs: list[float] = []
    peak_memories: list[int] = []
    ttfr_times: list[float] = []

    # Override the shared executor singleton when the parametrization requests a
    # specific thread count. BUG FIX: track *whether* we overrode (boolean), not
    # whether the previous value was non-None — otherwise, when the factory had
    # never been initialized (original _instance is None), the temporary pool was
    # left installed forever and leaked into subsequent tests. Also shut the
    # temporary pool down on exit so its worker threads don't leak.
    original_instance = None
    temp_executor = None
    overridden = False
    if max_workers is not None:
        from concurrent.futures import ThreadPoolExecutor

        original_instance = ExecutorFactory._instance
        temp_executor = ThreadPoolExecutor(max_workers=max_workers)
        ExecutorFactory._instance = temp_executor
        overridden = True

    try:
        for run in range(NUM_RUNS):
            # Start each run from a clean allocator state so the per-run
            # peak-memory numbers are comparable.
            gc.collect()
            pa.default_memory_pool().release_unused()
            baseline_mem = pa.total_allocated_bytes()
            peak_mem = baseline_mem

            start = timeit.default_timer()
            total_rows = 0
            first_batch_time = None  # time-to-first-record (TTFR), seconds
            for batch in benchmark_table.scan().to_arrow_batch_reader(
                batch_size=batch_size,
                streaming=streaming,
                concurrent_files=concurrent_files,
            ):
                if first_batch_time is None:
                    first_batch_time = timeit.default_timer() - start
                total_rows += len(batch)
                # Sample allocator usage every batch to approximate the peak.
                current_mem = pa.total_allocated_bytes()
                if current_mem > peak_mem:
                    peak_mem = current_mem
            elapsed = timeit.default_timer() - start

            peak_above_baseline = peak_mem - baseline_mem
            rows_per_sec = total_rows / elapsed if elapsed > 0 else 0
            elapsed_times.append(elapsed)
            throughputs.append(rows_per_sec)
            peak_memories.append(peak_above_baseline)
            # A scan that yielded no batches records a TTFR of 0.0.
            ttfr_times.append(first_batch_time or 0.0)

            print(
                f"  Run {run + 1}: {elapsed:.2f}s, {rows_per_sec:,.0f} rows/s, "
                f"TTFR: {(first_batch_time or 0) * 1000:.1f}ms, "
                f"peak arrow mem: {peak_above_baseline / (1024 * 1024):.1f}MB"
            )

            assert total_rows == TOTAL_ROWS, f"Expected {TOTAL_ROWS} rows, got {total_rows}"
    finally:
        if overridden:
            # Restore the saved singleton unconditionally (it may legitimately
            # have been None) and release the temporary pool's threads.
            ExecutorFactory._instance = original_instance
            temp_executor.shutdown(wait=False)

    mean_elapsed = statistics.mean(elapsed_times)
    stdev_elapsed = statistics.stdev(elapsed_times) if len(elapsed_times) > 1 else 0.0
    mean_throughput = statistics.mean(throughputs)
    mean_peak_mem = statistics.mean(peak_memories)
    mean_ttfr = statistics.mean(ttfr_times)

    print(
        f"  Mean: {mean_elapsed:.2f}s ± {stdev_elapsed:.2f}s, {mean_throughput:,.0f} rows/s, "
        f"TTFR: {mean_ttfr * 1000:.1f}ms, "
        f"peak arrow mem: {mean_peak_mem / (1024 * 1024):.1f}MB"
    )
0 commit comments