diff --git a/operation/benchmarks/drivers/calculate.py b/operation/benchmarks/drivers/calculate.py
index cde6329b1..ce542d7fa 100644
--- a/operation/benchmarks/drivers/calculate.py
+++ b/operation/benchmarks/drivers/calculate.py
@@ -4,16 +4,17 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 import time
+from loguru import logger
 from triton.testing import do_bench as kernel_bench
 import os
 import subprocess
 
 
+# test operation correctness
 def do_correctness(operation):
     flaggems_dir = os.getenv("FLAGGEMS_WORK_DIR", "/")
     gems_repo = subprocess.check_output(
         ["find", flaggems_dir, "-type", "d", "-name", "FlagGems"], text=True).strip()
-
     p = subprocess.Popen(
         f"cd {os.path.join(gems_repo, 'tests')} && python3 test_named_ops.py --name {operation} --device cpu ",
         shell=True
@@ -22,8 +23,44 @@ def do_correctness(operation):
     return p.returncode
 
 
+# test operation performance
+def do_performance(mode, warmup, result_log_dir):
+    flaggems_dir = os.getenv("FLAGGEMS_WORK_DIR", "/")
+    gems_repo = subprocess.check_output(
+        ["find", flaggems_dir, "-type", "d", "-name", "FlagGems"], text=True).strip()
+    del_file_path = os.path.join(gems_repo, 'benchmark')
+    # remove the log left over from a previous run
+    # del_file = os.path.join(del_file_path,
+    #                         f"result_test_distribution_perf--level_core--mode_{mode}--warmup_{warmup}--record_log.log")
+    del_file = os.path.join(del_file_path,
+                            f"result--level_core--mode_{mode}--warmup_{warmup}--record_log.log")
+    del_process = subprocess.Popen(["rm", del_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    del_process.communicate()
+    p = subprocess.Popen(
+        # run the full operator suite
+        f"cd {os.path.join(gems_repo, 'benchmark')} && pytest --level core --mode {mode} --warmup {warmup} --record log",
+        # run a single operator
+        # f"cd {os.path.join(gems_repo, 'benchmark')} && pytest -m mm --level core --mode {mode} --warmup {warmup} --record log -s",
+        # run a single test file
+        # f"cd {os.path.join(gems_repo, 'benchmark')} && pytest test_distribution_perf.py --level core --mode {mode} --warmup {warmup} --record log",
+        shell=True
+    )
+    p.wait()
+
+    # log path for the full-suite run
+    log_dir = os.path.join(gems_repo, "benchmark",
+                           f"result--level_core--mode_{mode}--warmup_{warmup}--record_log.log")
+    # log path when only a single file was executed
+    # log_dir = os.path.join(gems_repo, "benchmark",
+    #                        f"result_test_distribution_perf--level_core--mode_{mode}--warmup_{warmup}--record_log.log")
+    cp_subprocess = subprocess.run(["cp", f"{log_dir}", f"{result_log_dir}/result.log.txt"], check=True)
+    return p.returncode, cp_subprocess.returncode
+
+
 grad_outputs = None
+
 
 def do(exec_func, exec_args, bp=False):
     global grad_outputs
     if bp:
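Note: `do_performance` shells out to `rm` and `cp` for log cleanup and collection. For reference, a shell-free sketch of the same two steps using only the stdlib; `del_file`, `log_path`, and `result_log_dir` mirror the variables above, and this is an illustrative alternative rather than part of the patch:

```python
# Illustrative alternative to the rm/cp subprocess calls in do_performance.
# Assumes the same del_file / log path / result_log_dir values as above.
import os
import shutil

def clean_and_collect(del_file, log_path, result_log_dir):
    try:
        os.remove(del_file)        # stale log from an earlier run; may not exist
    except FileNotFoundError:
        pass
    # copyfile raises if the pytest run produced no log, matching cp with check=True
    shutil.copyfile(log_path, os.path.join(result_log_dir, "result.log.txt"))
```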
diff --git a/operation/benchmarks/drivers/parse_log.py b/operation/benchmarks/drivers/parse_log.py
new file mode 100644
index 000000000..d3ff61907
--- /dev/null
+++ b/operation/benchmarks/drivers/parse_log.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+
+import json
+import os
+from collections import defaultdict
+from loguru import logger
+
+
+def parse_log_file(spectflops, mode, warmup, log_dir, result_log_path):
+    log_file = os.path.join(log_dir, "result.log.txt")
+    save_log_path = os.path.join(result_log_path, "result.json")
+    if os.path.isfile(save_log_path):
+        with open(save_log_path, 'r+', encoding='utf-8') as file_r:
+            try:
+                file_r_json = file_r.read()
+                res = defaultdict(dict, json.loads(file_r_json))
+                result_data = get_result_data(log_file, res, spectflops, mode, warmup)
+                file_r.seek(0)
+                file_r.write(json.dumps(result_data, ensure_ascii=False))
+                file_r.truncate()
+            except json.decoder.JSONDecodeError:
+                logger.error("JSONDecodeError: existing result.json is empty or malformed")
+    else:
+        with open(save_log_path, 'w') as file_w:
+            res = defaultdict(dict)
+            result_data = get_result_data(log_file, res, spectflops, mode, warmup)
+            file_w.write(json.dumps(result_data, ensure_ascii=False))
+
+
+"""Field reference
+# Latency: 1. Latency-No warmup: no_warmup_latency; 2. Latency-Warmup: warmup_latency
+# Throughput: 3. Raw-Throughput: raw_throughput; 4. Core-Throughput: core_throughput
+# Compute: 5. achieved compute (TFLOPS): ctflops; 6. achieved compute utilization: cfu;
+#          7. achieved compute over kernel time: ktflops; 8. achieved utilization over kernel time: kfu
+"""
+def get_result_data(log_file, res, spectflops, mode, warmup):
+    with open(log_file, 'r') as file_r:
+        lines = file_r.readlines()
+        for line in lines:
+            if line.startswith("[INFO]"):
+                json_data = line[6:].strip()
+                try:
+                    data = json.loads(json_data)
+                    op_name = data.get("op_name")
+                    dtype = data.get("dtype")
+                    results = data.get("result")
+                    for result in results:
+                        shape_detail = result.get("shape_detail")
+                        latency_base = result.get("latency_base")
+                        if mode == "cpu" and warmup == "0":
+                            no_warmup_latency = result.get("latency")
+                            parse_data = {
+                                "op_name": op_name,
+                                "dtype": dtype,
+                                "shape_detail": shape_detail,
+                                "latency_base_cpu_nowarm": latency_base,
+                                "no_warmup_latency": no_warmup_latency
+                            }
+                            res[f"{op_name}_{dtype}_{shape_detail}"].update(parse_data)
+                        elif mode == "cpu" and warmup != "0":
+                            warmup_latency = result.get("latency")
+                            raw_throughput = 1 / float(warmup_latency)
+                            ctflops = result.get("tflops")
+                            if ctflops is None:
+                                cfu = None
+                            else:
+                                cfu = round(100.0 * float(ctflops) / 1e12 / float(spectflops), 2)
+                            parse_data = {
+                                "op_name": op_name,
+                                "dtype": dtype,
+                                "shape_detail": shape_detail,
+                                "latency_base_cpu_warm": latency_base,
+                                "warmup_latency": warmup_latency,
+                                "raw_throughput": raw_throughput,
+                                "ctflops": ctflops,
+                                "cfu": cfu
+                            }
+                            res[f"{op_name}_{dtype}_{shape_detail}"].update(parse_data)
+                        elif mode == "cuda" and warmup != "0":
+                            kerneltime = result.get("latency")
+                            core_throughput = 1 / float(kerneltime)
+                            ktflops = result.get("tflops")
+                            if ktflops is None:
+                                kfu = None
+                            else:
+                                kfu = round(100.0 * float(ktflops) / 1e12 / float(spectflops), 2)
+                            parse_data = {
+                                "op_name": op_name,
+                                "dtype": dtype,
+                                "shape_detail": shape_detail,
+                                "latency_base_cuda_warm": latency_base,
+                                "kerneltime": kerneltime,
+                                "core_throughput": core_throughput,
+                                "ktflops": ktflops,
+                                "kfu": kfu
+                            }
+                            res[f"{op_name}_{dtype}_{shape_detail}"].update(parse_data)
+                except json.JSONDecodeError as e:
+                    logger.error(f"Error decoding JSON: {e}")
+    return res
\ No newline at end of file
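The `cfu`/`kfu` computation above divides the logged `tflops` field by 1e12 before comparing it with `spectflops`, which implies the field is recorded in FLOPS while `spectflops` is in TFLOPS (an inference from the code; the FlagGems log format is not documented here). A worked example:

```python
# Worked example of the utilization formula in get_result_data.
# Assumption inferred from the 1e12 divisor: "tflops" is logged in FLOPS.
ctflops = 1.56e14                # 156 TFLOPS achieved, recorded as FLOPS
spectflops = "312"               # A100 FP16 peak, passed through argparse as a string
cfu = round(100.0 * float(ctflops) / 1e12 / float(spectflops), 2)
assert cfu == 50.0               # 156 / 312 = 50% of peak
```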
diff --git a/operation/benchmarks/opv2/main.py b/operation/benchmarks/opv2/main.py
new file mode 100644
index 000000000..e74123ac4
--- /dev/null
+++ b/operation/benchmarks/opv2/main.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+import torch
+import os
+import time
+from argparse import ArgumentParser, Namespace
+import yaml
+import sys
+import subprocess
+
+sys.path.append("..")
+from drivers.utils import *
+from drivers.calculate import *
+from drivers.parse_log import *
+
+
+def parse_args():
+    parser = ArgumentParser(description=" ")
+
+    parser.add_argument("--vendor",
+                        type=str,
+                        required=True,
+                        help="vendor name like nvidia")
+
+    parser.add_argument("--case_name",
+                        type=str,
+                        required=True,
+                        help="op name like mm")
+
+    parser.add_argument("--spectflops",
+                        type=str,
+                        required=True,
+                        help="spec TFLOPS of the current dataformat")
+
+    parser.add_argument("--dataformat",
+                        type=str,
+                        required=True,
+                        help="like FP32, FP16")
+
+    parser.add_argument("--oplib",
+                        type=str,
+                        required=True,
+                        help="impl like pytorch/flaggems/cpp")
+
+    parser.add_argument("--chip",
+                        type=str,
+                        required=True,
+                        help="chip like A100_40_SXM")
+
+    parser.add_argument("--mode",
+                        type=str,
+                        required=True,
+                        help="mode like cpu")
+
+    parser.add_argument("--warmup",
+                        type=str,
+                        required=True,
+                        help="warmup iterations")
+
+    parser.add_argument("--log_dir",
+                        type=str,
+                        required=True,
+                        help="abs log dir")
+
+    parser.add_argument("--result_log_path",
+                        type=str,
+                        required=True,
+                        help="result log path for FlagPerf/operation/result")
+
+    args, unknown_args = parser.parse_known_args()
+    args.unknown_args = unknown_args
+    return args
+
+
+def main(config):
+    correctness = do_correctness(config.case_name)
+    correctness = correctness == 0
+
+    # test operation performance; do_performance returns the pytest and cp return codes
+    perf_returncode, cp_returncode = do_performance(config.mode, config.warmup, config.log_dir)
+    performance = perf_returncode == 0 and cp_returncode == 0
+    parse_log_file(config.spectflops, config.mode, config.warmup, config.log_dir, config.result_log_path)
+
+    # dtype = {
+    #     "FP32": torch.float32,
+    #     "FP16": torch.float16,
+    #     "BF16": torch.bfloat16,
+    #     "INT32": torch.int32,
+    #     "INT16": torch.int16,
+    #     "BOOL": torch.bool
+    # }
+    # set_ieee_float32(config.vendor)
+    #
+    #
+    # m = case_config.Melements
+    #
+    #
+    # a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0)
+    #
+    # latency_nowarm, latency_warm, cputime, kerneltime = do_test(
+    #     torch.abs, (a, ), host_device_sync, config, case_config)
+    #
+    # op2flops = lambda x: x * m * 1024 * 1024
+    #
+    # perf_result = cal_perf(cputime, kerneltime, op2flops,
+    #                        config.spectflops)
+    # print_result(config, config.case_name, *perf_result, correctness,
+    #              latency_nowarm, latency_warm)
+
+if __name__ == "__main__":
+    config = parse_args()
+    # with open("case_config.yaml", "r") as file:
+    #     case_config = yaml.safe_load(file)
+    # adapt_torch(config.vendor)
+    # with open(os.path.join(config.vendor, config.chip, "case_config.yaml"),
+    #           "r") as file:
+    #     case_config_vendor = yaml.safe_load(file)
+    # case_config.update(case_config_vendor)
+    # case_config = Namespace(**case_config)
+
+    if config.oplib == "flaggems":
+        import flag_gems
+        flag_gems.enable()
+        print("Using flaggems")
+    else:
+        print("Using nativetorch")
+    main(config)
\ No newline at end of file
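main.py switches backends with `flag_gems.enable()`. A quick way to see the effect, as a sketch only: it assumes a CUDA device, an installed FlagGems build, and that `enable()` patches the relevant aten ops in place, and it times `torch.mm` with the same `do_bench` helper that calculate.py imports as `kernel_bench`:

```python
# Sketch: compare nativetorch vs. flaggems dispatch for torch.mm.
# Assumes a CUDA device and that flag_gems.enable() patches aten ops in-place.
import torch
import flag_gems
from triton.testing import do_bench

a = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
b = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")

native_ms = do_bench(lambda: torch.mm(a, b))
flag_gems.enable()                 # reroute supported aten ops to Triton kernels
gems_ms = do_bench(lambda: torch.mm(a, b))
print(f"nativetorch: {native_ms:.3f} ms  flaggems: {gems_ms:.3f} ms")
```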
diff --git a/operation/configs/host.yaml b/operation/configs/host.yaml
index 4895e4d37..f301bf466 100644
--- a/operation/configs/host.yaml
+++ b/operation/configs/host.yaml
@@ -30,8 +30,11 @@ CLEAR_CACHES: True
 ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
 # "operation:dataFormat:chip": "docker_images"
 # now only support flaggems and nativepytorch
+MODE: "cpu"
+WARMUP: 1000
 CASES:
-    "mm:FP16:312:nativetorch:A100_40_SXM": "ngctorch2403"
+#    "mm:FP16:312:nativetorch:A100_40_SXM": "ngctorch2403"
+    "opv2:mm:FP16:312:flaggems:A100_40_SXM": "ngctorch2403"
 #    "mm:FP16:flaggems:A100_40_SXM": "ngctorch2403"
 #    "mm:FP16:nativetorch:A100_40_SXM": "ngctorch2403"
 #    'exp:FP32:nativetorch:R300p" : "xpytorch029"
diff --git a/operation/container_main.py b/operation/container_main.py
index 787e99e9a..d09d94d06 100644
--- a/operation/container_main.py
+++ b/operation/container_main.py
@@ -39,6 +39,16 @@ def parse_args():
                         required=True,
                         help="vendor name like nvidia")
 
+    parser.add_argument("--mode",
+                        type=str,
+                        required=True,
+                        help="mode like cpu")
+
+    parser.add_argument("--warmup",
+                        type=str,
+                        required=True,
+                        help="warmup iterations")
+
     parser.add_argument("--log_level",
                         type=str,
                         required=True,
@@ -63,6 +73,11 @@ def parse_args():
                         required=True,
                         help="abs path for FlagPerf/base")
 
+    parser.add_argument("--result_log_path",
+                        type=str,
+                        required=True,
+                        help="result log path for FlagPerf/operation/result")
+
     args, unknown_args = parser.parse_known_args()
     args.unknown_args = unknown_args
     return args
@@ -96,9 +111,9 @@ def write_pid_file(pid_file_path, pid_file):
     logger.info("Success Writing PID file at " +
                 os.path.join(config.log_dir, "start_base_task.pid"))
 
-    op, dataformat, spectflops, oplib, chip = config.case_name.split(":")
+    test_file, op, dataformat, spectflops, oplib, chip = config.case_name.split(":")
 
-    case_dir = os.path.join(config.perf_path, "benchmarks", op)
+    case_dir = os.path.join(config.perf_path, "benchmarks", test_file)
     start_cmd = "cd " + case_dir + ";python3 main.py "
     start_cmd += " --vendor=" + config.vendor
     start_cmd += " --case_name=" + op
@@ -106,11 +121,12 @@ def write_pid_file(pid_file_path, pid_file):
     start_cmd += " --dataformat=" + dataformat
     start_cmd += " --oplib=" + oplib
     start_cmd += " --chip=" + chip
-
+    start_cmd += " --mode=" + config.mode
+    start_cmd += " --warmup=" + config.warmup
+    start_cmd += " --log_dir=" + config.log_dir
+    start_cmd += " --result_log_path=" + config.result_log_path
     script_log_file = os.path.join(os.path.dirname(logfile),
                                    "operation.log.txt")
-
-
     logger.info(start_cmd)
     logger.info(script_log_file)
     f = open(script_log_file, "w")
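The CASES key grows from five to six colon-separated fields; the new leading field selects the benchmark directory. This is how container_main.py (and run.py below) now unpack it, illustrated with the key from host.yaml:

```python
# The six-field case key introduced in host.yaml, split the way
# container_main.py and run.py now do.
case_name = "opv2:mm:FP16:312:flaggems:A100_40_SXM"
test_file, op, dataformat, spectflops, oplib, chip = case_name.split(":")
assert test_file == "opv2"        # benchmark dir: operation/benchmarks/opv2
assert (op, dataformat, oplib) == ("mm", "FP16", "flaggems")
print(f"spec TFLOPS for {chip}: {spectflops}")
```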
diff --git a/operation/result_data_format/formatMDfile.py b/operation/result_data_format/formatMDfile.py
new file mode 100644
index 000000000..99b0bafc9
--- /dev/null
+++ b/operation/result_data_format/formatMDfile.py
@@ -0,0 +1,21 @@
+import os
+
+
+def render(extracted_values, readme_file_path, vendor, shm_size, chip):
+    json_data = []
+    for key, value in extracted_values.items():
+        json_data.append(value)
+    dest_file_path = os.path.join(readme_file_path, "README.md")
+    markdown_table = create_markdown_table(json_data, vendor, shm_size, chip)
+    with open(dest_file_path, 'w') as file:
+        file.write(markdown_table)
+
+
+def create_markdown_table(data, vendor, shm_size, chip):
+    v_chip = f'{vendor}_{chip}'
+    table = f"# Evaluated AI Chip\n\n * Vendor: {vendor}\n * Product name: {v_chip}\n * Product model: {chip}\n * SHM_SIZE: {shm_size}\n\n\n\n"
+    table += "# Results\n\n"
+    table += "| op_name | dtype | shape_detail | Latency-No warmup | Latency-Warmup | Raw-Throughput | Core-Throughput | ctflops | cfu | ktflops | kfu |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n"
+    for row in data:
+        table += f"| {row.get('op_name')} | {row.get('dtype')} | {row.get('shape_detail')} | {row.get('no_warmup_latency')} | {row.get('warmup_latency')} | {row.get('raw_throughput')} | {row.get('core_throughput')} | {row.get('ctflops')} | {row.get('cfu')} | {row.get('ktflops')} | {row.get('kfu')} |\n"
+    return table
\ No newline at end of file
diff --git a/operation/result_data_format/format_result_main.py b/operation/result_data_format/format_result_main.py
new file mode 100644
index 000000000..613b58b12
--- /dev/null
+++ b/operation/result_data_format/format_result_main.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+import json
+import os
+import sys
+import yaml
+from argparse import Namespace
+
+CURR_PATH = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../")))
+OP_PATH = os.path.abspath(os.path.join(CURR_PATH, "../"))
+from formatMDfile import *
+
+
+def main(vendor, shm_size, chip):
+    result_json_file_path = os.path.join(OP_PATH, "result/result.json")
+    save_path = os.path.join(OP_PATH, "result")
+    # render_base(save_path, vendor, shm_size, chip)
+    with open(result_json_file_path, 'r') as f:
+        content = json.loads(f.read())
+        render(content, save_path, vendor, shm_size, chip)
+
+
+if __name__ == "__main__":
+    config_path = os.path.join(OP_PATH, "configs/host.yaml")
+    with open(config_path, "r") as file:
+        config_dict = yaml.safe_load(file)
+    config = Namespace(**config_dict)
+    cases = []
+    for case in config.CASES:
+        cases.append(case)
+    vendor = config.VENDOR
+    shm_size = config.SHM_SIZE
+    for run_case in cases:
+        case_name = run_case
+        test_file, op, dataformat, spectflops, oplib, chip = case_name.split(":")
+        main(vendor, shm_size, chip)
+    print("success!")
\ No newline at end of file
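For reference, `render()` consumes the dict that parse_log.py accumulates in result.json, one record per op/dtype/shape key. A minimal hypothetical input (field values invented for illustration, and assuming `render` is importable from formatMDfile):

```python
# Hypothetical record shaped like the entries parse_log.py writes;
# the values here are invented for illustration only.
from formatMDfile import render

sample = {
    "mm_float16_((4096, 4096), (4096, 4096))": {
        "op_name": "mm",
        "dtype": "float16",
        "shape_detail": "((4096, 4096), (4096, 4096))",
        "warmup_latency": 0.62,
        "raw_throughput": 1.61,
        "ctflops": 1.56e14,
        "cfu": 50.0,
    }
}
render(sample, "/tmp", "nvidia", "32g", "A100_40_SXM")  # writes /tmp/README.md
```

Fields a given run did not produce (for example `no_warmup_latency` on a warmed-up run) render as None, which is why the table row uses `row.get()` rather than indexing.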
===") - RUN_LOGGER.info("-== Testcase " + case + " starts ==-") RUN_LOGGER.info("1) Prepare container environments in cluster...") case_log_dir = os.path.join(curr_log_path, case) diff --git a/operation/vendors/nvidia/ngctorch2403/Dockerfile b/operation/vendors/nvidia/ngctorch2403/Dockerfile index d3bf8cbf0..92529da9a 100644 --- a/operation/vendors/nvidia/ngctorch2403/Dockerfile +++ b/operation/vendors/nvidia/ngctorch2403/Dockerfile @@ -6,4 +6,4 @@ RUN apt-get update RUN pip3 install loguru RUN pip3 install pycuda RUN pip3 install schedule -RUN pip3 install munch +RUN pip3 install munch \ No newline at end of file diff --git a/operation/vendors/nvidia/ngctorch2403/ngctorch2403_install.sh b/operation/vendors/nvidia/ngctorch2403/ngctorch2403_install.sh index d7e75df16..fa8f1d309 100644 --- a/operation/vendors/nvidia/ngctorch2403/ngctorch2403_install.sh +++ b/operation/vendors/nvidia/ngctorch2403/ngctorch2403_install.sh @@ -1,5 +1,5 @@ #!/bin/bash git clone https://mirror.ghproxy.com/https://github.com/FlagOpen/FlagGems.git cd FlagGems -git checkout +#git checkout pip install .