supermodeltools · greynewell · Mar 26, 2026 · Mar 25, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+
+- **Remove ground-truth (GT) and analysis caching from `SupermodelBenchmark`** (supermodeltools/supermodel-public-api#714):
+  Both caches had no invalidation mechanism, causing stale data to persist silently across runs.
+  The GT cache bypassed all fixes applied to `extract_ground_truth` (FP filters, pattern additions).
+  The analysis cache was keyed by zip hash, so server-side idempotency-key version bumps did not
+  invalidate it — the server was never reached and old results were served indefinitely. Both caches are
+  now removed. GT extraction is a single GitHub API call (cheap). The Supermodel API handles
+  server-side deduplication via the idempotency key. Also removes the `DEFAULT_GT_DIR` constant,
+  `ground_truth_dir` constructor parameter, and the `cached_analysis` task config field — all of
+  which existed solely to support the now-removed caching paths.
+
+- **Dead code benchmark: filter feature-removal false positives from ground truth** (supermodeltools/supermodel-public-api#714):
+  The ground truth extractor now applies the existing `_is_feature_removal_fp` filter (which
+  was implemented but never called). Symbols deleted in a PR that are also imported by other
+  files deleted in the same PR are excluded from the ground truth (GT) — they were live code co-removed with
+  their consumers, not dead code. Genuinely orphaned symbols with no deleted importer are
+  kept. This fixes 0-recall scores for PRs like n8n #23572 and prisma #28485 where whole
+  files were removed as part of a feature deletion.
+
 ## [0.14.0] - 2026-02-13
 
 ### Added

diff --git a/src/mcpbr/benchmarks/supermodel/api_client.py b/src/mcpbr/benchmarks/supermodel/api_client.py
@@ -4,10 +4,9 @@
 import hashlib
 import json
 import logging
-import os
 import sys
-import tempfile
 import time
+from typing import Any
 
 logger = logging.getLogger("mcpbr.supermodel")
 
@@ -44,29 +43,19 @@ async def call_supermodel_api(
         with open(zip_path, "rb") as f:
             zip_hash = hashlib.sha256(f.read()).hexdigest()[:12]
         ep_name = endpoint_path.strip("/").replace("/", "-")
-        idempotency_key = f"bench:{ep_name}:{zip_hash}:v2"
+        idempotency_key = f"bench:{ep_name}:{zip_hash}:v3"
 
     headers = [
         "-H",
         "Accept: application/json",
         "-H",
         f"Idempotency-Key: {idempotency_key}",
     ]
-
-    # Pass API key via curl config file to avoid exposure in process table (ps aux)
-    api_key_config_path: str | None = None
     if api_key:
-        with tempfile.NamedTemporaryFile(
-            mode="w", suffix=".cfg", prefix="mcpbr_curl_", delete=False
-        ) as api_key_fd:
-            api_key_fd.write(f'header = "X-Api-Key: {api_key}"\n')
-            api_key_config_path = api_key_fd.name
-        os.chmod(api_key_config_path, 0o600)
+        headers.extend(["-H", f"X-Api-Key: {api_key}"])
 
     # Initial request with file upload
     upload_cmd = ["curl", "-s", "-X", "POST", url, "-F", f"file=@{zip_path}", *headers]
-    if api_key_config_path:
-        upload_cmd.extend(["--config", api_key_config_path])
 
     start_time = time.time()
     print(
@@ -83,10 +72,7 @@ async def call_supermodel_api(
     if proc.returncode != 0:
         raise RuntimeError(f"Supermodel API request failed: {stderr.decode()}")
 
-    try:
-        response = json.loads(stdout.decode())
-    except json.JSONDecodeError as e:
-        raise RuntimeError(f"Non-JSON response from Supermodel API: {stdout.decode()[:500]}") from e
+    response = json.loads(stdout.decode())
 
     # Poll if async — use lightweight requests (1-byte dummy file instead of
     # re-uploading the full zip). The API recognizes the idempotency key and
@@ -102,6 +88,8 @@ async def call_supermodel_api(
 
             # Create poll dummy on first iteration only
             if poll_dummy_path is None:
+                import tempfile
+
                 with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as poll_dummy:
                     poll_dummy.write(b"\n")
                     poll_dummy_path = poll_dummy.name
@@ -116,8 +104,6 @@ async def call_supermodel_api(
                 f"file=@{poll_dummy_path}",
                 *headers,
             ]
-            if api_key_config_path:
-                poll_cmd.extend(["--config", api_key_config_path])
 
             retry_after = response.get("retryAfter", 10)
             poll_count += 1
@@ -138,17 +124,12 @@ async def call_supermodel_api(
 
             if proc.returncode != 0:
                 raise RuntimeError(f"Supermodel API poll failed: {stderr.decode()}")
-            try:
-                response = json.loads(stdout.decode())
-            except json.JSONDecodeError as e:
-                raise RuntimeError(
-                    f"Non-JSON poll response from Supermodel API: {stdout.decode()[:500]}"
-                ) from e
+            response = json.loads(stdout.decode())
     finally:
         if poll_dummy_path is not None:
-            os.unlink(poll_dummy_path)
-        if api_key_config_path is not None:
-            os.unlink(api_key_config_path)
+            import os as _os
+
+            _os.unlink(poll_dummy_path)
 
     elapsed = time.time() - start_time
 
@@ -161,6 +142,6 @@ async def call_supermodel_api(
     if isinstance(status, int) and status >= 400:
         raise RuntimeError(f"Supermodel API HTTP {status}: {response.get('message', response)}")
 
-    api_result = response.get("result", response)
+    api_result: dict[str, Any] = response.get("result", response)
     print(f"  Supermodel API: completed in {elapsed:.1f}s", file=sys.stderr, flush=True)
-    return dict(api_result)
+    return api_result