arguzz/scripts/process_csv.py at main · Rigorous-Software-Engineering/arguzz · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
from pathlib import Path

from gen_bar_plot import generate_injection_bar_plot, generate_summary_bar_plot
from gen_bug_refind_table import generate_bug_refind_table
from gen_error_statistic import generate_error_statistic
from gen_pie_plot import generate_pie_charts
from gen_RQ3 import generate_RQ3
from gen_RQ4 import generate_RQ4
from gen_RQ5 import generate_RQ5
from gen_size_time_scatter import generate_summary_scatter_plot

# Sub-directory names (one per zkVM) that process_csv looks for, in order.
ZKVM_NAME_LIST: list[str] = [
    "jolt",
    "nexus",
    "openvm",
    "pico",
    "risc0",
    "sp1",
]

# Known action names. No function visible in this module reads this list.
ACTION_LIST: list[str] = [
    "explore",
    "refind",
    "check",
]

# Directory names accepted as a configuration below a commit directory.
CONFIG_LIST: list[str] = [
    "default",
    "no-inline",
    "no-modification",
    "no-schedular",
]

# zkVM name -> first 7 characters of the commit used for the explore
# experiments.
ZKVM_TO_EXPLORE_COMMITS_TABLE: dict[str, str] = dict(
    jolt="1687134",
    nexus="8f4ba56",
    openvm="ca36de3",
    pico="dd5b7d1",
    risc0="ebd64e4",
    sp1="429e95e",
)

# First 7 characters of a "fix" commit -> bug id.
FIX_COMMIT_TO_BUG_ID_TABLE: dict[str, int] = {
    # ---- risc0 ----
    # Missing constraint in three-register instructions
    "67f2d81": 1,
    # Off-by-one error in cycle-counting logic
    "31f6570": 2,
    # ---- nexus ----
    # Unconstrained store operand in load-store instructions
    "62e3abc": 3,
    # completeness MT Out-of-bounds panic due to memory size misestimation
    "54cebc7": 4,
    # completeness MT Carry overflow in multiplication extension
    "c684c4e": 5,
    # ---- jolt ----
    # Unconstrained immediate operand in lui
    "85bf51d": 6,
    # Incorrect RAM size calculation
    "20ac6eb": 7,
    # Out-of-bounds panic for high-address bytecode
    "55b9830": 8,
    # Dory-commitment failure for traces shorter than 256 cycles
    "0582b2a": 9,
    # Sumcheck-verification failure for mulhsu
    "0369981": 10,
}

# zkVM name -> its "fix" commits (keys of FIX_COMMIT_TO_BUG_ID_TABLE).
ZKVM_TO_FIX_COMMITS: dict[str, list[str]] = dict(
    risc0=["67f2d81", "31f6570"],
    nexus=["62e3abc", "54cebc7", "c684c4e"],
    jolt=["85bf51d", "20ac6eb", "55b9830", "0582b2a", "0369981"],
)


def process_explore(timestamp: str, zkvm_name: str, explore_dir: Path, out_dir: Path):
    """Generate the explore artifacts for every commit/config of one zkVM.

    Walks ``explore_dir/<commit>/<config>/<file>`` and hands the known CSV
    files to the matching generators. Output paths are placed below
    ``out_dir / timestamp / zkvm_name / "explore" / <commit> / <config>``.

    Args:
        timestamp: Name of the processed extraction folder; used in log
            lines and as part of the output path.
        zkvm_name: Name of the zkVM; used in log lines and the output path.
        explore_dir: Directory named "explore" holding the commit folders.
        out_dir: Root of the output tree.

    Raises:
        AssertionError: If ``explore_dir`` is not a directory named "explore".
        ValueError: On a non-directory at commit or config level, a config
            name that is not in CONFIG_LIST, a non-file inside a config
            folder, or a file with an unknown name.
    """
    assert explore_dir.name == "explore", f"probably invalid explore dir {explore_dir}"
    assert explore_dir.is_dir(), f"{explore_dir} is not a directory"

    # files that are valid in a config folder but produce no explore output
    ignored_names = (
        "build.csv",
        "findings.csv",
        "checked_findings.csv",
        "pipeline.csv",
        "run.csv",
    )

    for commit_path in explore_dir.iterdir():
        if not commit_path.is_dir():
            raise ValueError(f"unexpected non directory {commit_path}!")
        commit_id = commit_path.name

        for config_path in commit_path.iterdir():
            if not config_path.is_dir():
                raise ValueError(f"unexpected non directory {config_path}!")

            config_name = config_path.name
            if config_name not in CONFIG_LIST:
                raise ValueError(f"unexpected config '{config_name}'!")

            banner = f"{timestamp} {zkvm_name} {config_name} {commit_id}"
            print(f"== EXPLORE PROCESSING: {banner} ==")

            # every artifact of this commit/config pair goes below this folder
            target_dir = out_dir / timestamp / zkvm_name / "explore" / commit_id / config_name
            seen_names = set()

            for entry in config_path.iterdir():
                if not entry.is_file():
                    raise ValueError(f"unexpected non file {entry}!")

                entry_name = entry.name
                seen_names.add(entry_name)

                if entry_name == "injection.csv":
                    generate_summary_scatter_plot(entry, target_dir / "size_time_injection.pdf")
                    generate_error_statistic(entry, target_dir / "fault_errors.txt")
                    generate_pie_charts(entry, target_dir / "categories.pdf")
                    generate_injection_bar_plot(entry, target_dir / "injection.pdf")
                elif entry_name == "normal.csv":
                    generate_summary_scatter_plot(entry, target_dir / "size_time_normal.pdf")
                    generate_error_statistic(entry, target_dir / "normal_errors.txt")
                elif entry_name == "summary.csv":
                    generate_summary_bar_plot(entry, target_dir / "summary.pdf")
                elif entry_name not in ignored_names:
                    raise ValueError(f"unexpected csv file '{entry_name}'!")

            # report the inputs whose generators could not run
            absent = {"injection.csv", "normal.csv", "summary.csv"} - seen_names
            if absent:
                print(f" - WARNING MISSING CSVs: {absent}")
                print(" - Some processing steps were skipped ...")

            print(f"== LEAVING EXPLORE: {banner} ==")


def process_check(timestamp: str, zkvm_name: str, check_dir: Path, out_dir: Path):
    """Generate the bug-refind tables for every commit/config of one zkVM.

    Walks ``check_dir/<commit>/<config>/<file>`` and passes each
    ``checked_findings.csv`` to generate_bug_refind_table. The output path is
    ``out_dir / timestamp / zkvm_name / "check" / <commit> / <config> /
    "found-bugs.txt"``. Config folders whose name is not in CONFIG_LIST are
    skipped with a warning.

    Args:
        timestamp: Name of the processed extraction folder; used in log
            lines and as part of the output path.
        zkvm_name: Name of the zkVM; used in log lines and the output path.
        check_dir: Directory named "check" holding the commit folders.
        out_dir: Root of the output tree.

    Raises:
        AssertionError: If ``check_dir`` is not a directory named "check".
        ValueError: On a non-directory at commit or config level, a non-file
            inside a config folder, or a file with an unknown name.
    """
    assert check_dir.name == "check", f"probably invalid check dir {check_dir}"
    assert check_dir.is_dir(), f"{check_dir} is not a directory"

    # files that are valid in a config folder but produce no check output
    ignored_names = (
        "build.csv",
        "findings.csv",
        "injection.csv",
        "normal.csv",
        "pipeline.csv",
        "run.csv",
        "summary.csv",
    )

    for commit_path in check_dir.iterdir():
        if not commit_path.is_dir():
            raise ValueError(f"unexpected non directory {commit_path}!")
        commit_id = commit_path.name

        for config_path in commit_path.iterdir():
            if not config_path.is_dir():
                raise ValueError(f"unexpected non directory {config_path}!")

            config_name = config_path.name
            if config_name not in CONFIG_LIST:
                # unknown configurations are tolerated here, just not processed
                print(f"WARNING unknown configuration: {config_name}")
                continue

            banner = f"{timestamp} {zkvm_name} {config_name} {commit_id}"
            print(f"== CHECK PROCESSING: {banner} ==")

            # every artifact of this commit/config pair goes below this folder
            target_dir = out_dir / timestamp / zkvm_name / "check" / commit_id / config_name
            seen_names = set()

            for entry in config_path.iterdir():
                if not entry.is_file():
                    raise ValueError(f"unexpected non file {entry}!")

                entry_name = entry.name
                seen_names.add(entry_name)

                if entry_name == "checked_findings.csv":
                    generate_bug_refind_table(entry, target_dir / "found-bugs.txt")
                elif entry_name not in ignored_names:
                    raise ValueError(f"unexpected csv file '{entry_name}'!")

            # report the input whose generator could not run
            absent = {"checked_findings.csv"} - seen_names
            if absent:
                print(f" - WARNING MISSING CSVs: {absent}")
                print(" - Some processing steps were skipped ...")

            print(f"== LEAVING CHECK: {banner} ==")


def process_actions(
    timestamp: str,
    zkvm_name: str,
    zkvm_dir: Path,
    out_dir: Path,
    enable_explore: bool,
    enable_refind: bool,
):
    """Run the enabled per-zkVM processing steps for one zkVM folder.

    Args:
        timestamp: Name of the processed extraction folder.
        zkvm_name: Name of the zkVM whose data lives in ``zkvm_dir``.
        zkvm_dir: Folder expected to contain "explore" and/or "check".
        out_dir: Root of the output tree.
        enable_explore: Process ``zkvm_dir / "explore"`` when true.
        enable_refind: Process ``zkvm_dir / "check"`` when true.

    A missing sub-folder is reported on stdout and skipped; it is not an
    error.
    """
    # -- explore --
    if enable_explore:
        explore_path = zkvm_dir / "explore"
        if not explore_path.is_dir():
            print(f"WARNING: unable to find an 'explore' folder for {timestamp} / {zkvm_name}")
            print("Skipping 'explore' data processing...")
        else:
            process_explore(timestamp, zkvm_name, explore_path, out_dir)

    # -- refind / check --
    if enable_refind:
        check_path = zkvm_dir / "check"
        if not check_path.is_dir():
            print(f"WARNING: unable to find an 'check' folder for {timestamp} / {zkvm_name}")
            print("Skipping 'check' data processing...")
        else:
            process_check(timestamp, zkvm_name, check_path, out_dir)


def process_csv(csv_dir: Path, out_dir: Path, enable_explore: bool, enable_refind: bool):
    """Process one extracted CSV folder: per-zkVM outputs plus RQ3/RQ4/RQ5.

    For every zkVM in ZKVM_NAME_LIST that has a sub-folder in ``csv_dir``:

    * with ``enable_explore``: record the "default"/"no-inline" ``summary.csv``
      pair for RQ4 and the "default"/"no-schedular" ``injection.csv`` pair for
      RQ5, both taken from the zkVM's explore commit. A pair is recorded only
      when both files exist; otherwise a warning is printed.
    * with ``enable_refind``: for each fix commit of the zkVM, record whichever
      of the "default"/"no-modification" ``checked_findings.csv`` files exist
      for RQ3, keyed by bug id.
    * run the per-zkVM processing through process_actions.

    Afterwards the collected lookups are passed to generate_RQ4/generate_RQ5
    (explore) and generate_RQ3 (refind), writing below
    ``out_dir / csv_dir.name``.

    Args:
        csv_dir: Input folder with one sub-folder per zkVM. Its name is used
            as the "timestamp" component of log lines and output paths.
        out_dir: Root of the output tree.
        enable_explore: Enables the explore processing and RQ4/RQ5.
        enable_refind: Enables the check processing and RQ3.

    Raises:
        AssertionError: If ``csv_dir`` is not a directory.
    """
    assert csv_dir.is_dir(), f"not a directory {csv_dir}"

    # although it must not necessarily be a timestamp folder, in a normal usecase it is.
    timestamp = csv_dir.name

    rq3_csv_lookup: dict[int, dict[str, Path]] = {}
    rq4_csv_lookup: dict[str, dict[str, Path]] = {}
    rq5_csv_lookup: dict[str, dict[str, Path]] = {}

    for zkvm_name in ZKVM_NAME_LIST:
        zkvm_dir = csv_dir / zkvm_name
        if not zkvm_dir.is_dir():
            print(f"WARNING: unable to find data for {zkvm_name}!")
            print(f"         No directory at {zkvm_dir} ...")
            continue

        if enable_explore:
            explore_commit_dir = zkvm_dir / "explore" / ZKVM_TO_EXPLORE_COMMITS_TABLE[zkvm_name]

            # == RQ4 data collection: needs both summary files ==
            default_summary_csv = explore_commit_dir / "default" / "summary.csv"
            no_inline_summary_csv = explore_commit_dir / "no-inline" / "summary.csv"
            if default_summary_csv.is_file() and no_inline_summary_csv.is_file():
                rq4_csv_lookup[zkvm_name] = {
                    "default": default_summary_csv,
                    "no-inline": no_inline_summary_csv,
                }
            else:
                print(f"WARNING: unable to produce RQ4 for {zkvm_name}")
                print(f"            - {default_summary_csv}")
                print(f"            - {no_inline_summary_csv}")

            # == RQ5 data collection: needs both injection files ==
            default_injection_csv = explore_commit_dir / "default" / "injection.csv"
            no_schedular_injection_csv = explore_commit_dir / "no-schedular" / "injection.csv"
            if default_injection_csv.is_file() and no_schedular_injection_csv.is_file():
                rq5_csv_lookup[zkvm_name] = {
                    "default": default_injection_csv,
                    "no-schedular": no_schedular_injection_csv,
                }
            else:
                print(f"WARNING: unable to produce RQ5 for {zkvm_name}")
                print(f"            - {default_injection_csv}")
                print(f"            - {no_schedular_injection_csv}")

        if enable_refind:
            # == RQ3 data collection; zkVMs without fix commits contribute nothing ==
            for refind_commit in ZKVM_TO_FIX_COMMITS.get(zkvm_name, []):
                bug_id = FIX_COMMIT_TO_BUG_ID_TABLE[refind_commit]
                # the bug id is registered even when neither CSV exists
                rq3_csv_lookup[bug_id] = {}

                check_commit_dir = zkvm_dir / "check" / refind_commit
                for config in ("default", "no-modification"):
                    checked_findings_csv = check_commit_dir / config / "checked_findings.csv"
                    if checked_findings_csv.is_file():
                        rq3_csv_lookup[bug_id][config] = checked_findings_csv

        # legacy computation per zkvm
        # Called exactly once: an additional refind-only call would run the
        # 'check' processing (and its warnings) a second time.
        process_actions(timestamp, zkvm_name, zkvm_dir, out_dir, enable_explore, enable_refind)

    if enable_explore:
        generate_RQ4(rq4_csv_lookup, out_dir / timestamp / "RQ4")
        generate_RQ5(rq5_csv_lookup, out_dir / timestamp / "RQ5")

    if enable_refind:
        generate_RQ3(rq3_csv_lookup, out_dir / timestamp / "RQ3")


if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the flags and the two folders, verify
    # that the input folder exists, then run the full processing.
    cli = argparse.ArgumentParser(
        description=(
            "Helps to process data extracted using the extract_csv.py. "
            "If using the extraction script as recommended, make sure to provide the "
            "timestamp folder (e.g. '2025-08-20_20-19-50') inside of the extraction "
            "folder and not the outer folder or any other sub-folders!"
        )
    )

    # opt-out switches; both processing kinds are enabled by default
    for switch, switch_help in (
        ("--disable-explore", "Disables explore processing"),
        ("--disable-refind", "Disables bug refinding processing"),
    ):
        cli.add_argument(switch, action="store_true", help=switch_help)

    # positional folders
    cli.add_argument("csv_dir", help="Input folder containing the extracted CSV files.")
    cli.add_argument(
        "out_dir",
        help=(
            "Output folder for the processed data. The name of "
            "the provided csv_dir is always used as subfolder!"
        ),
    )

    options = cli.parse_args()
    input_dir = Path(options.csv_dir)
    output_dir = Path(options.out_dir)

    if not input_dir.is_dir():
        cli.error(f"unable to find csv directory {input_dir}")

    process_csv(
        input_dir,
        output_dir,
        not options.disable_explore,
        not options.disable_refind,
    )