-
Notifications
You must be signed in to change notification settings - Fork 17
Add validation to run_benchmark.sh #275
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
48ac749
73e58f0
6f938a0
4d6ce30
d63df04
a7e5b39
cae9795
532e804
3fa9796
2f65d9a
7d8a178
e8ed42f
06d1be0
b7705e8
7aa1684
8452b1d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -355,6 +355,7 @@ def _build_submission_payload( | |
| is_official: bool, | ||
| asset_ids: list[int] | None = None, | ||
| concurrency_streams: int = 1, | ||
| validation_results: dict | None = None, | ||
| ) -> dict: | ||
| """Build a BenchmarkSubmission payload from parsed dataclasses. | ||
|
|
||
|
|
@@ -396,10 +397,22 @@ def _query_sort_key(name: str): | |
|
|
||
| query_names = sorted(raw_times.keys(), key=_query_sort_key) | ||
|
|
||
| per_query_validation = (validation_results or {}).get("queries", {}) | ||
|
|
||
| def _get_validation_result(query_name): | ||
| # Look up validation result for this query (keys are lowercase e.g. "q1") | ||
| vkey = "q" + query_name.lstrip("Q").lower() | ||
| vdata = per_query_validation.get(vkey) | ||
| if vdata: | ||
| return {"status": vdata["status"], "message": vdata.get("message")} | ||
| return {"status": "not-validated"} | ||
|
|
||
| for query_name in query_names: | ||
| times = raw_times[query_name] | ||
| is_failed = query_name in failed_queries | ||
|
|
||
| validation_result = _get_validation_result(query_name) | ||
|
|
||
| # Each execution becomes a separate query log entry | ||
| for exec_idx, runtime_ms in enumerate(times): | ||
| if is_failed: | ||
|
|
@@ -416,11 +429,14 @@ def _query_sort_key(name: str): | |
| "extra_info": { | ||
| "execution_number": exec_idx + 1, | ||
| }, | ||
| "validation_result": validation_result, | ||
| } | ||
| ) | ||
| execution_order += 1 | ||
|
|
||
| # Handle failed queries that may not have times | ||
| # Handle failed queries that may not have times. | ||
| # Queries that failed before producing a result file are always "not-validated" | ||
| # since validate_results.py never ran for them (no parquet to compare against). | ||
| for query_name, error_info in failed_queries.items(): | ||
| if query_name not in raw_times: | ||
| query_logs.append( | ||
|
|
@@ -432,6 +448,7 @@ def _query_sort_key(name: str): | |
| "extra_info": { | ||
| "error": str(error_info), | ||
| }, | ||
| "validation_result": _get_validation_result(query_name), | ||
| } | ||
| ) | ||
| execution_order += 1 | ||
|
|
@@ -469,6 +486,7 @@ def _query_sort_key(name: str): | |
| "extra_info": extra_info, | ||
| "is_official": is_official, | ||
| "asset_ids": asset_ids, | ||
| "validation_status": (validation_results or {}).get("overall_status", "not-validated"), | ||
| } | ||
|
|
||
|
|
||
|
|
@@ -616,6 +634,14 @@ async def _process_benchmark_dir( | |
| print(" Warning: no config directory found. Use --config-dir to specify one.", file=sys.stderr) | ||
| engine_config = None | ||
|
|
||
| validation_results_path = benchmark_dir / "validation_results.json" | ||
| if validation_results_path.exists(): | ||
| print(" Loading validation results...", file=sys.stderr) | ||
| validation_results = json.loads(validation_results_path.read_text()) | ||
| else: | ||
| print(" No validation results found.", file=sys.stderr) | ||
| validation_results = None | ||
|
|
||
| # Resolve logs directory: explicit override → auto-detect from repo | ||
| effective_logs_dir = logs_dir | ||
| if effective_logs_dir is None: | ||
|
|
@@ -669,6 +695,7 @@ async def _process_benchmark_dir( | |
| is_official=is_official, | ||
| asset_ids=asset_ids, | ||
| concurrency_streams=concurrency_streams, | ||
| validation_results=validation_results, | ||
| ) | ||
| except Exception as e: | ||
| print(f" Error building payload for '{bench_name}': {e}", file=sys.stderr) | ||
|
|
@@ -680,6 +707,15 @@ async def _process_benchmark_dir( | |
| print(f" Identifier hash: {payload['query_engine']['identifier_hash']}", file=sys.stderr) | ||
| print(f" Node count: {payload['node_count']}", file=sys.stderr) | ||
| print(f" Query logs: {len(payload['query_logs'])}", file=sys.stderr) | ||
| print(f" Validation status: {payload['validation_status']}", file=sys.stderr) | ||
| xfail_queries = [ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What does the `x` prefix in `xfail_queries` stand for?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The "x" prefix was short for "expected". As in, this is an "expected failure". This convention was set in the Benchmarking DB, where one of the possible validation states is "XFAIL" for this particular case. |
||
| ql["query_name"] | ||
| for ql in payload["query_logs"] | ||
| if ql.get("validation_result", {}).get("status") == "expected-failure" | ||
| ] | ||
| if xfail_queries: | ||
| unique_xfail = sorted(set(xfail_queries), key=lambda x: int(x)) | ||
| print(f" Expected-failure queries: {unique_xfail}", file=sys.stderr) | ||
|
|
||
| if dry_run: | ||
| print("\n [DRY RUN] Payload:", file=sys.stderr) | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| pandas>=2.0 | ||
| pyarrow>=10.0 | ||
| sqlglot |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happens if `per_query_validation` is an empty dictionary (from line 400)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If `per_query_validation` is empty, then `per_query_validation.get(vkey)` should return `None` for each query, and the status will be returned as "not-validated".