# Source: PassNet/pass_bench/analysis_util.py (develop branch), PaddlePaddle/PassNet on GitHub.
import os
import re
import sys
from scipy.stats import gmean
from pass_bench.config.datatype_tolerance_config import get_precision
from pass_bench.positive_tolerance_interpretation import (
    PositiveToleranceInterpretation,
)
from pass_bench.verify_aggregated_params import determine_tolerances
from pass_bench.positive_tolerance_interpretation_manager import (
    get_positive_tolerance_interpretation,
)


def detect_sample_status(log_text: str) -> str:
    """
    Detect the status for a single sample from log text.

    The log is scanned line by line for marker substrings. Each line is
    classified by the first marker that matches (the checks form a single
    if/elif chain), and a separate scan flags any line containing
    "Exception:" or "Error:" as a runtime failure.

    Args:
        log_text: Log text content (can be a string or list of lines)

    Returns:
        The first failing condition, checked in this order:
        - "eager_fail": no "[Datatype][eager]" (or "all_close") line was seen
        - "compile_fail": no "[Datatype][compiled]" (or "all_close") line was seen
        - "shape_mismatch": no "[Shape] ... match:True" (or "all_close") line was seen
        - "type_mismatch": no "[DataType] ... match:True" line was seen
        - "runtime_fail": some line contains "Exception:" or "Error:"
        - "pass_miss": a "[PassMgrBackend]" line reports "No passes modified"
        - "correct": none of the above
    """
    lines = log_text.split("\n") if isinstance(log_text, str) else log_text

    # Track phase status and mismatch types
    eager_success = False
    compile_success = False
    shape_match = False
    type_match = False
    pass_miss = False

    # Scan for status and mismatch markers
    for line in lines:
        if "[Datatype][eager]" in line:
            eager_success = True
        elif "[Datatype][compiled]" in line:
            compile_success = True
        elif "[Shape]" in line and "match:True" in line:
            shape_match = True
        elif "[DataType]" in line and "match:True" in line:
            type_match = True
        elif "all_close" in line:
            # An all_close result can only be produced when both the eager and
            # the compiled run succeeded and the output shapes were comparable.
            # Note that it does not imply a data type match.
            shape_match = True
            eager_success = True
            compile_success = True
        elif "[PassMgrBackend]" in line and "No passes modified" in line:
            # Treat pass manager no-op as a sample-level miss so scoring can
            # penalize it. Both substrings must be present in the line; testing
            # a bare string literal here would always be truthy.
            pass_miss = True

    runtime_fail = any("Exception:" in line or "Error:" in line for line in lines)

    # Determine error type; earlier pipeline stages take precedence.
    if not eager_success:
        return "eager_fail"
    elif not compile_success:
        return "compile_fail"
    elif not shape_match:
        return "shape_mismatch"
    elif not type_match:
        return "type_mismatch"
    elif runtime_fail:
        return "runtime_fail"
    elif pass_miss:
        return "pass_miss"
    else:
        return "correct"


def parse_single_sample_log_to_data(log_text: str) -> dict:
    """
    Turn the log of one sample into a structured dictionary.

    Every line is tested against the tagged-line patterns below in a fixed
    order ([Processing], [Config], [Performance], [Datatype], [Correctness],
    [Speedup]); the first pattern that matches decides how the line is stored.
    The sample status is derived from the same log via detect_sample_status.

    Args:
        log_text: Log text content (can be string or list of lines)

    Returns:
        Dictionary with the keys "model_path", "configuration", "correctness",
        "performance" and "status" describing a single model-compiler run.
    """
    lines = log_text.split("\n") if isinstance(log_text, str) else log_text

    processing_re = re.compile(r"\[Processing\] (.+)")
    config_re = re.compile(r"\[Config\] (\S+): (.+)")
    performance_re = re.compile(r"\[Performance\]\[(\w+)\]: (.+)")
    datatype_re = re.compile(r"\[Datatype\]\[(\w+)\]: (.+)")
    correctness_re = re.compile(r"\[Correctness\](\[.+\]): (.+)")
    speedup_re = re.compile(r"\[Speedup\]\[(\w+)\]: (.+)")

    def to_number(token):
        # Tokens without a dot are tried as int first; anything int() rejects
        # (e.g. scientific notation) falls back to float().
        try:
            return float(token) if "." in token else int(token)
        except ValueError:
            return float(token)

    data = {
        "model_path": None,
        "configuration": {},
        "correctness": {},
        "performance": {
            "eager": {},
            "compiled": {},
            "datatype": {},
            "speedup": {},
        },
        "status": None,
    }

    for line in lines:
        if processing_re.search(line):
            # The model path is taken as the last whitespace-separated token.
            data["model_path"] = line.split()[-1]
        elif found := config_re.search(line):
            name, raw = found.groups()
            data["configuration"][name.strip()] = raw.strip()
        elif found := performance_re.search(line):
            name, raw = found.groups()
            data["performance"][name.strip()] = raw.strip()
        elif found := datatype_re.search(line):
            name, raw = found.groups()
            data["performance"]["datatype"][name.strip()] = raw.strip().split()
        elif found := correctness_re.search(line):
            name, raw = found.groups()
            numbers = []
            for token in raw.strip().split():
                numbers.append(to_number(token))
            data["correctness"][name.strip()] = numbers
        elif found := speedup_re.search(line):
            name, raw = found.groups()
            data["performance"]["speedup"][name.strip()] = float(raw)

    data["status"] = detect_sample_status(log_text)

    return data


def parse_logs_to_data(log_file: str) -> list:
    """
    Read a benchmark log file and split it into per-sample records.

    A new sample starts at every line containing "[Processing]". Lines that
    appear before the first such marker are ignored. Each group of lines is
    handed to parse_single_sample_log_to_data.

    Args:
        log_file: Path to the benchmark log file

    Returns:
        List of data dictionaries (one per model-compiler run). An empty list
        is returned when the file cannot be read or has no content.
    """
    try:
        with open(log_file, "r", encoding="utf-8") as handle:
            raw_lines = list(handle)
    except Exception as e:
        print(f"Error reading {log_file}: {e}")
        return []

    if not raw_lines:
        print(f"No content in {log_file}")
        return []

    samples = []
    pending = []

    def flush(block):
        # Convert the buffered lines of one sample into a record.
        samples.append(parse_single_sample_log_to_data(block))

    for raw in raw_lines:
        if "[Processing]" not in raw:
            # Only collect lines once a sample has been opened.
            if pending:
                pending.append(raw)
            continue

        if pending:
            # A new marker closes the previous sample.
            flush(pending)
        pending = [raw]

    if pending:
        # Emit the last sample of the file.
        flush(pending)

    print(f"Parsed {len(samples)} samples from {log_file}")
    return samples


def scan_all_folders(benchmark_path: str) -> dict:
    """
    Load benchmark curves from either a single log file or a directory.

      - A file path is parsed directly and yields one curve named after the
        file (without extension).
      - A directory path is searched (non-recursively) for .log and .txt
        files; each file that yields samples becomes its own curve.

    Returns dict[curve_name] -> list_of_samples, or an empty dict when the
    path is invalid or nothing could be parsed.
    """
    # Single log file: one curve.
    if os.path.isfile(benchmark_path):
        print(f"Detected log file: '{benchmark_path}'")
        parsed = parse_logs_to_data(benchmark_path)
        if not parsed:
            print("  - No valid data found in log file.")
            return {}

        stem = os.path.splitext(os.path.basename(benchmark_path))[0]
        folder_name = stem or "benchmark"
        print(
            f"  - Parsed log file → 1 curve '{folder_name}' with {len(parsed)} samples."
        )
        return {folder_name: parsed}

    # Anything that is neither a file nor a directory is rejected.
    if not os.path.isdir(benchmark_path):
        print(
            f"Error: Provided path '{benchmark_path}' is neither a valid file nor directory."
        )
        return {}

    print(f"Scanning '{benchmark_path}' ...")

    # Collect the regular files carrying a log-like extension.
    candidates = []
    for entry in os.listdir(benchmark_path):
        if not os.path.isfile(os.path.join(benchmark_path, entry)):
            continue
        if entry.endswith((".log", ".txt")):
            candidates.append(entry)
    candidates.sort()

    if not candidates:
        print("  - No log files (.log or .txt) found in directory.")
        return {}

    print(f"  - Found {len(candidates)} log file(s) → each becomes a curve.")
    curves = {}
    for entry in candidates:
        parsed = parse_logs_to_data(os.path.join(benchmark_path, entry))
        if parsed:
            curve_name = os.path.splitext(entry)[0] or "benchmark"
            curves[curve_name] = parsed
            print(f"    - Curve '{curve_name}': {len(parsed)} samples.")

    if not curves:
        print("  - No valid data found in any log file.")
        return {}

    print(f"Total curves loaded: {len(curves)}")
    return curves


def get_correctness(dtype: str, t: int, correctness_data: dict, index: int) -> bool:
    """
    Look up the correctness flag of one output at a given tolerance level.

    The tolerance level and data type are resolved through get_precision; the
    first element of its result is used as rtol and the second as atol. When
    both are zero the "[equal]" metric is consulted, otherwise the matching
    "[all_close_atol_..._rtol_...]" metric. A missing metric, a non-list
    entry, or an out-of-range index all yield False.
    """
    precision = get_precision(t, dtype)
    atol = precision[1]
    rtol = precision[0]

    exact_comparison = atol == 0 and rtol == 0
    metric_key = (
        "[equal]"
        if exact_comparison
        else f"[all_close_atol_{atol:.2E}_rtol_{rtol:.2E}]"
    )

    flags = correctness_data.get(metric_key)
    if not isinstance(flags, list) or len(flags) <= index:
        return False
    return bool(flags[index])


def fake_perf_degrad(
    tolerance,
    error_code,
    positive_tolerance_interpretation: PositiveToleranceInterpretation,
) -> str:
    """
    Map an error code to the effective status at the given tolerance.

    The decision is delegated to the interpretation object: when its
    is_error_tolerated(tolerance, error_code) call is truthy the result is
    "correct", otherwise the error code is returned unchanged.
    """
    tolerated = positive_tolerance_interpretation.is_error_tolerated(
        tolerance, error_code
    )
    return "correct" if tolerated else error_code


def calculate_scores(
    samples: list,
    positive_tolerance_interpretation: PositiveToleranceInterpretation,
    p: float = 0,
    b: float = 0.1,
    type: str = "ESt",
) -> dict:
    """
    Evaluate all samples at every tolerance level and compute the S(t) or
    ES(t) score (geometric mean of rectified speedups) for each level.

    Args:
        samples: Sample dictionaries as produced by the log parser.
        positive_tolerance_interpretation: Strategy object; passed to
            determine_tolerances and asked, via is_error_tolerated, whether a
            failure recorded at tolerance 1 is tolerated at tolerance >= 1.
        p: Penalty exponent; a correct sample with speedup < 1 contributes
            speedup ** (p + 1).
        b: Value contributed by a sample that is not correct.
        type: "St" or "ESt", selecting which score is stored and printed.

    Returns:
        Dict mapping each tolerance to its score. A tolerance with no samples
        scores 0.

    Exits the process via sys.exit() if ``type`` is neither "St" nor "ESt".
    """

    def rectify(value):
        # Slowdowns are penalised with the exponent p + 1; speedups are kept.
        return value ** (p + 1) if value < 1 else value

    total_samples = len(samples)
    # Per-sample state captured while processing tolerance == 1 and reused for
    # every tolerance >= 1 in the ES(t) branch.
    # NOTE(review): this relies on determine_tolerances yielding 1 before any
    # larger tolerance; otherwise the defaults below are used. Confirm.
    is_correct_at_t1 = [False] * total_samples
    speedup_at_t1 = [None] * total_samples
    fail_type_at_t1 = ["correct"] * total_samples

    scores = {}

    strategy = positive_tolerance_interpretation
    tolerances = determine_tolerances(samples, positive_tolerance_interpretation)

    for tolerance in tolerances:
        rectified_speedups = []
        rectified_speedups_fake_degrad = []

        for idx, sample in enumerate(samples):
            is_correct, fail_type = check_sample_correctness(sample, tolerance)

            # The speedup is only meaningful for correct samples; binding it
            # unconditionally avoids reusing a stale value from a previous
            # sample.
            # NOTE(review): a correct sample without an "e2e" speedup yields
            # None here and will raise TypeError in rectify(); confirm whether
            # such samples can occur.
            speedup = (
                sample.get("performance", {}).get("speedup", {}).get("e2e")
                if is_correct
                else None
            )

            if tolerance == 1:
                is_correct_at_t1[idx] = is_correct
                speedup_at_t1[idx] = speedup
                fail_type_at_t1[idx] = fail_type if fail_type is not None else "correct"

            # The value is identical for S(t) and for ES(t) below tolerance 1.
            # Gating on is_correct (rather than on fail_type) also covers an
            # incorrect sample whose fail type is None.
            plain_value = rectify(speedup) if is_correct else b

            # S(t) calculation
            rectified_speedups.append(plain_value)

            # ES(t) calculation
            if tolerance < 1:
                rec_speedup_fake_degrad = plain_value
            elif is_correct_at_t1[idx]:
                rec_speedup_fake_degrad = rectify(speedup_at_t1[idx])
            else:
                is_tolerated = strategy.is_error_tolerated(
                    tolerance, fail_type_at_t1[idx]
                )
                rec_speedup_fake_degrad = 1 if is_tolerated else b
            rectified_speedups_fake_degrad.append(rec_speedup_fake_degrad)

        if type == "St":
            selected = rectified_speedups
        elif type == "ESt":
            selected = rectified_speedups_fake_degrad
        else:
            print("Invalid type specified. Please choose either 'ESt' or 'St'.")
            sys.exit()

        if selected:
            scores[tolerance] = gmean(selected)
        else:
            # Do not fall through to gmean([]), which would replace the 0
            # with nan.
            print("  - No speedup data found.")
            scores[tolerance] = 0
        print(f"  - {type}={scores[tolerance]:.3f} for tolerance={tolerance}.")

    return scores


def check_sample_correctness(sample: dict, tolerance: int) -> tuple[bool, str]:
    """
    Decide whether one sample counts as correct at a tolerance level.

    Args:
        sample: Sample data dictionary
        tolerance: Tolerance level

    Returns:
        Tuple of (is_correct, fail_type)
        - (False, status) when the recorded status is anything but "correct"
        - (False, "accuracy") when any output fails the tolerance check
        - (True, None) when every output passes (including the case of no
          recorded outputs)
    """
    recorded_status = sample.get("status")

    # A sample that already failed keeps its recorded failure type.
    if recorded_status != "correct":
        return False, recorded_status

    dtypes = sample.get("performance", {}).get("datatype", {}).get("eager", [])
    correctness = sample.get("correctness", {})

    # The number of outputs is taken from the length of the "[equal]" entry.
    num_outputs = len(correctness.get("[equal]", []))

    # Stop at the first output that does not satisfy the tolerance.
    for position in range(num_outputs):
        if not get_correctness(dtypes[position], tolerance, correctness, position):
            return False, "accuracy"

    return True, None

def get_incorrect_models(
    tolerance: int,
    log_file_path: str,
    type: str = "ESt",
    positive_tolerance_interpretation_type: str = "default",
) -> set[str]:
    """
    Collect the model paths that fail the check at the given tolerance.

    The log file is parsed into samples. For type "ESt" with tolerance >= 1,
    each sample is first evaluated at tolerance 1: samples that fail there are
    judged by whether the interpretation tolerates their failure type at the
    requested tolerance, while samples that pass there are re-checked at the
    requested tolerance. In every other case each sample is simply checked at
    the requested tolerance.

    Args:
        tolerance (int): Accuracy tolerance threshold for model validation
        log_file_path (str): Path to the log file containing model test results
        type (str): "ESt" or "St" indicating the type of accuracy check to perform
        positive_tolerance_interpretation_type (str): Name used to obtain the
            interpretation object in the "ESt" / tolerance >= 1 case

    Returns:
        Set of model paths failing the check; empty set if none found
    """
    failed_models = set()
    samples = parse_logs_to_data(log_file_path)

    if not (type == "ESt" and tolerance >= 1):
        # Plain check: each sample is judged directly at the given tolerance.
        for sample in samples:
            passed, _ = check_sample_correctness(sample, tolerance)
            if not passed:
                failed_models.add(sample.get("model_path"))
        return failed_models

    # Baseline verdicts at tolerance 1 for every sample.
    baseline = [check_sample_correctness(sample, 1) for sample in samples]

    positive_tolerance_interpretation = get_positive_tolerance_interpretation(
        positive_tolerance_interpretation_type
    )

    for sample, (passed_at_t1, fail_type) in zip(samples, baseline):
        if passed_at_t1:
            passed, _ = check_sample_correctness(sample, tolerance)
            has_failed = not passed
        else:
            code_at_t1 = fail_type if fail_type is not None else "correct"
            verdict = fake_perf_degrad(
                tolerance, code_at_t1, positive_tolerance_interpretation
            )
            has_failed = verdict != "correct"

        if has_failed:
            failed_models.add(sample.get("model_path"))

    return failed_models