diff --git a/examples/guidellm_example.py b/examples/guidellm_example.py
index f09b1e6..fe2e297 100644
--- a/examples/guidellm_example.py
+++ b/examples/guidellm_example.py
@@ -9,11 +9,12 @@
     GUIDELLM__MAX_CONCURRENCY=256,
     GUIDELLM__REQUEST_TIMEOUT=21600,
     target="http://localhost:8000/v1",
-    data_type="emulated",
     max_seconds=30,
-    data="prompt_tokens=512,generated_tokens=256",
+    #scenario = "benchmarking_32k",
+    data="prompt_tokens=128,output_tokens=128",
+    branch = "update_guidellm",
     vllm_kwargs={"enable-chunked-prefill": True}
 )
 task.execute_remotely("oneshot-a100x1")
 
-#task.execute_locally()
\ No newline at end of file
+#task.execute_locally()
diff --git a/examples/lmeval_example.py b/examples/lmeval_example.py
index 8910aa2..688c355 100644
--- a/examples/lmeval_example.py
+++ b/examples/lmeval_example.py
@@ -6,8 +6,8 @@
     model_id="meta-llama/Llama-3.2-1B-Instruct",
     tasks="gsm8k",
     model_args="dtype=auto,max_model_len=8192",
-    batch_size="auto",
+    batch_size="auto",
 )
 task.execute_remotely("oneshot-a100x1")
 
-#task.execute_locally()
\ No newline at end of file
+#task.execute_locally()
diff --git a/src/automation/configs.py b/src/automation/configs.py
index 76dbe58..10aa396 100644
--- a/src/automation/configs.py
+++ b/src/automation/configs.py
@@ -1,2 +1,4 @@
-DEFAULT_DOCKER_IMAGE = "498127099666.dkr.ecr.us-east-1.amazonaws.com/mlops/k8s-research-cuda12_5:latest"
-DEFAULT_OUTPUT_URI = "gs://neuralmagic-clearml"
\ No newline at end of file
+DEFAULT_DOCKER_IMAGE = "498127099666.dkr.ecr.us-east-1.amazonaws.com/mlops/k8s-research-cuda12_8:latest"
+DEFAULT_OUTPUT_URI = "gs://neuralmagic-clearml"
+DEFAULT_RESEARCH_BRANCH = "main"
+DEFAULT_GUIDELLM_SCENARIO = "chat"
diff --git a/src/automation/standards/benchmarking/benchmarking_128k.json b/src/automation/standards/benchmarking/benchmarking_128k.json
new file mode 100644
index 0000000..13b8105
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_128k.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 128000,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 128000,
+        "output_tokens": 2048,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 2048
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_16k.json b/src/automation/standards/benchmarking/benchmarking_16k.json
new file mode 100644
index 0000000..f927a4a
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_16k.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 16000,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 16000,
+        "output_tokens": 2048,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 2048
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_32k.json b/src/automation/standards/benchmarking/benchmarking_32k.json
new file mode 100644
index 0000000..6543fd7
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_32k.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 32000,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 32000,
+        "output_tokens": 2048,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 2048
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_64k.json b/src/automation/standards/benchmarking/benchmarking_64k.json
new file mode 100644
index 0000000..871b210
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_64k.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 64000,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 64000,
+        "output_tokens": 2048,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 2048
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_chat.json b/src/automation/standards/benchmarking/benchmarking_chat.json
new file mode 100644
index 0000000..f4d0548
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_chat.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 512,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 512,
+        "output_tokens": 256,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 256
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_code_completion.json b/src/automation/standards/benchmarking/benchmarking_code_completion.json
new file mode 100644
index 0000000..6be35df
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_code_completion.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 256,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 256,
+        "output_tokens": 1024,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 1024
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_code_fixing.json b/src/automation/standards/benchmarking/benchmarking_code_fixing.json
new file mode 100644
index 0000000..bceff14
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_code_fixing.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 1024,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 1024,
+        "output_tokens": 1024,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 1024
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_docstring_generation.json b/src/automation/standards/benchmarking/benchmarking_docstring_generation.json
new file mode 100644
index 0000000..0eda212
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_docstring_generation.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 768,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 768,
+        "output_tokens": 128,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 128
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_instruction.json b/src/automation/standards/benchmarking/benchmarking_instruction.json
new file mode 100644
index 0000000..0fac491
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_instruction.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 256,
+        "prompt_tokens_stdev": 128,
+        "prompt_tokens_min": 1,
+        "prompt_tokens_max": 256,
+        "output_tokens": 128,
+        "output_tokens_stdev": 64,
+        "output_tokens_min": 1,
+        "output_tokens_max": 128
+    }
+}
diff --git a/src/automation/standards/benchmarking/benchmarking_long_rag.json b/src/automation/standards/benchmarking/benchmarking_long_rag.json
new file mode 100644
index 0000000..4fe719a
--- /dev/null
+++ b/src/automation/standards/benchmarking/benchmarking_long_rag.json
@@ -0,0 +1,13 @@
+{
+    "rate_type": "sweep",
+    "data": {
+        "prompt_tokens": 10240,
+        "prompt_tokens_stdev": 128,
+ "prompt_tokens_min": 1, + "prompt_tokens_max": 10240, + "output_tokens": 1536, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 1536 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_rag.json b/src/automation/standards/benchmarking/benchmarking_rag.json new file mode 100644 index 0000000..9525b09 --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_rag.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 1024, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 1024, + "output_tokens": 128, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 128 + } +} diff --git a/src/automation/standards/benchmarking/benchmarking_summarization.json b/src/automation/standards/benchmarking/benchmarking_summarization.json new file mode 100644 index 0000000..9525b09 --- /dev/null +++ b/src/automation/standards/benchmarking/benchmarking_summarization.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 1024, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 1024, + "output_tokens": 128, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 128 + } +} diff --git a/src/automation/standards/benchmarking/chat.json b/src/automation/standards/benchmarking/chat.json new file mode 100644 index 0000000..024438c --- /dev/null +++ b/src/automation/standards/benchmarking/chat.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 512, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 1024, + "output_tokens": 256, + "output_tokens_stdev": 64, + "output_tokens_min": 1, + "output_tokens_max": 1024 + } +} diff --git a/src/automation/standards/benchmarking/rag.json b/src/automation/standards/benchmarking/rag.json new file mode 100644 index 0000000..c7ee2f2 --- /dev/null +++ b/src/automation/standards/benchmarking/rag.json @@ -0,0 +1,13 @@ +{ + "rate_type": "sweep", + "data": { + "prompt_tokens": 4096, + "prompt_tokens_stdev": 512, + "prompt_tokens_min": 2048, + "prompt_tokens_max": 6144, + "output_tokens": 512, + "output_tokens_stdev": 128, + "output_tokens_min": 1, + "output_tokens_max": 1024 + } +} diff --git a/src/automation/tasks/base_task.py b/src/automation/tasks/base_task.py index d886599..74fa1ba 100644 --- a/src/automation/tasks/base_task.py +++ b/src/automation/tasks/base_task.py @@ -1,27 +1,33 @@ from clearml import Task from typing import Sequence, Optional -from automation.configs import DEFAULT_OUTPUT_URI +from automation.configs import DEFAULT_OUTPUT_URI, DEFAULT_RESEARCH_BRANCH from automation.standards import STANDARD_CONFIGS import yaml import os class BaseTask(): - base_packages = ["git+https://github.com/neuralmagic/research.git"] + #base_packages = ["git+https://github.com/neuralmagic/research.git"] + #base_packages = ["git+https://github.com/neuralmagic/research.git@update_guidellm"] def __init__( self, project_name: str, task_name: str, docker_image: str, + branch: Optional[str] = DEFAULT_RESEARCH_BRANCH, packages: Optional[Sequence[str]]=None, task_type: str="training", ): + branch_name = branch or DEFAULT_RESEARCH_BRANCH + base_packages = [f"git+https://github.com/neuralmagic/research.git@{branch_name}"] if packages is not None: - packages = list(set(packages + self.base_packages)) + packages = list(set(packages + base_packages)) else: - packages = self.base_packages + packages = base_packages + + print(packages) 
 
         self.project_name = project_name
         self.task_name = task_name
@@ -29,6 +35,7 @@ def __init__(
         self.packages = packages
         self.task_type = task_type
         self.task = None
+        self.branch= branch
         self.script_path = None
         self.callable_artifacts = None
 
@@ -50,8 +57,8 @@ def process_config(self, config):
             return yaml.safe_load(open(STANDARD_CONFIGS[config], "r"))
         elif os.path.exists(config):
             return yaml.safe_load(open(config, "r"))
-        elif os.path.exists(os.path.join("..", "standatrds", config)):
-            return yaml.safe_load(open(os.path.join("..", "standatrds", config)), "r")
+        elif os.path.exists(os.path.join("..", "standards", config)):
+            return yaml.safe_load(open(os.path.join("..", "standards", config)), "r")
         else:
             return yaml.safe_load(config)
 
@@ -91,7 +98,7 @@ def create_task(self):
             add_task_init_call=True,
             script=self.script_path,
             repo="https://github.com/neuralmagic/research.git",
-            branch="main",
+            branch=self.branch,
         )
         self.task.output_uri = DEFAULT_OUTPUT_URI
         self.set_arguments()
diff --git a/src/automation/tasks/guidellm.py b/src/automation/tasks/guidellm.py
index 390012b..a85eb83 100644
--- a/src/automation/tasks/guidellm.py
+++ b/src/automation/tasks/guidellm.py
@@ -1,10 +1,10 @@
 from automation.tasks import BaseTask
-from automation.configs import DEFAULT_DOCKER_IMAGE
+from automation.configs import DEFAULT_DOCKER_IMAGE, DEFAULT_RESEARCH_BRANCH
 from typing import Optional, Sequence
 import os
 
 DEFAULT_SERVER_WAIT_TIME = 600 # 600 seconds = 10 minutes
-GUIDELLM_PACKAGE = "git+https://github.com/neuralmagic/guidellm.git@http_backend"
+GUIDELLM_PACKAGE = "git+https://github.com/neuralmagic/guidellm.git"
 
 class GuideLLMTask(BaseTask):
 
@@ -23,6 +23,7 @@ def __init__(
         docker_image: str=DEFAULT_DOCKER_IMAGE,
         packages: Optional[Sequence[str]]=None,
         clearml_model: bool=False,
+        branch: str= DEFAULT_RESEARCH_BRANCH,
         task_type: str="training",
         vllm_kwargs: dict={},
         target: str="http://localhost:8000/v1",
@@ -52,6 +53,7 @@ def __init__(
             docker_image=docker_image,
             packages=packages,
             task_type=task_type,
+            branch = branch,
         )
 
         # Check for conflicts in configs and constructor arguments
diff --git a/src/automation/tasks/scripts/guidellm_script.py b/src/automation/tasks/scripts/guidellm_script.py
index 617b502..35269a9 100644
--- a/src/automation/tasks/scripts/guidellm_script.py
+++ b/src/automation/tasks/scripts/guidellm_script.py
@@ -1,35 +1,46 @@
-
 import os
+import sys
 from clearml import Task
 from automation.utils import resolve_model_id, cast_args, kill_process_tree
 from automation.vllm import start_vllm_server
 from pyhocon import ConfigFactory
+from automation.configs import DEFAULT_GUIDELLM_SCENARIO
 
-
-def main(configurations=None):
+def main():
     task = Task.current_task()
 
     args = task.get_parameters_as_dict(cast=True)
 
-    if configurations is None:
-        guidellm_args = ConfigFactory.parse_string(task.get_configuration_object("GuideLLM"))
-
-        environment_args = task.get_configuration_object("environment")
-        if environment_args is None:
-            environment_args = {}
-        else:
-            environment_args = ConfigFactory.parse_string(environment_args)
+    raw_config = task.get_configuration_object("GuideLLM")
+    if raw_config is None:
+        print("[DEBUG] `GuideLLM` config not found in configuration — checking parameters as fallback")
+        raw_config = task.get_parameters_as_dict().get("GuideLLM")
+        if raw_config is None:
+            raise RuntimeError("GuideLLM config is None. This likely means `get_configurations()` is not returning it or it's not passed via parameters.")
+        guidellm_args = ConfigFactory.from_dict(raw_config)
+    else:
+        guidellm_args = ConfigFactory.parse_string(raw_config)
+
+    def clean_hocon_value(v):
+        if isinstance(v, str) and v.startswith('"') and v.endswith('"'):
+            return v[1:-1]
+        return v
+
+    guidellm_args = {k: clean_hocon_value(v) for k, v in guidellm_args.items()}
+
+    print("[DEBUG] Guidellm_Args:", guidellm_args)
+
+    environment_args = task.get_configuration_object("environment")
+    if environment_args is None:
+        environment_args = {}
+    else:
+        environment_args = ConfigFactory.parse_string(environment_args)
 
-        vllm_args = task.get_configuration_object("vLLM")
-        if vllm_args is None:
-            vllm_args = {}
-        else:
-            vllm_args = ConfigFactory.parse_string(vllm_args)
+    vllm_args = task.get_configuration_object("vLLM")
+    if vllm_args is None:
+        vllm_args = {}
     else:
-        guidellm_args = configurations.get("GuideLLM", {})
-        environment_args = configurations.get("environment", {})
-        vllm_args = configurations.get("vLLM", {})
-
+        vllm_args = ConfigFactory.parse_string(vllm_args)
 
     clearml_model = args["Args"]["clearml_model"]
     if isinstance(clearml_model, str):
@@ -39,22 +50,24 @@ def main(configurations=None):
     if isinstance(force_download, str):
         force_download = force_download.lower() == "true"
 
-
     # Resolve model_id
     model_id = resolve_model_id(args["Args"]["model"], clearml_model, force_download)
 
+    gpu_count = int(guidellm_args.get("gpu_count", 1))
+
     # Start vLLM server
     server_process, server_initialized, server_log = start_vllm_server(
         vllm_args,
         model_id,
        guidellm_args["target"],
         args["Args"]["server_wait_time"],
+        gpu_count,
     )
 
     if not server_initialized:
         kill_process_tree(server_process.pid)
         task.upload_artifact(name="vLLM server log", artifact_object=server_log)
-        raise AssertionError("Server failed to intialize")
+        raise AssertionError("Server failed to initialize")
 
     # Parse through environment variables
     for k, v in environment_args.items():
@@ -62,13 +75,51 @@
     guidellm_args["model"] = model_id
 
-    from guidellm import generate_benchmark_report
-    guidellm_args = cast_args(guidellm_args, generate_benchmark_report)
-    report = generate_benchmark_report(**guidellm_args)
-    kill_process_tree(server_process.pid)
+    import json
+    import asyncio
+    from pathlib import Path
+    from guidellm.benchmark.entrypoints import benchmark_with_scenario
+    from guidellm.benchmark.scenario import GenerativeTextScenario, get_builtin_scenarios
+
+    user_scenario = guidellm_args.get("scenario", "")
+    if user_scenario:
+        filepath = Path(os.path.join(".", "src", "automation", "standards", "benchmarking", f"{user_scenario}.json"))
+        if os.path.exists(filepath):
+            current_scenario = GenerativeTextScenario.from_file(filepath, dict(guidellm_args))
+        else:
+            raise ValueError(f"Scenario path {filepath} does not exist")
+    #elif len(get_builtin_scenarios()) > 0:
+        # to be used when get_builtin_scenarios() bug is fixed
+        # current_scenario = GenerativeTextScenario.from_builtin(get_builtin_scenarios()[0], dict(guidellm_args))
+    else:
+        filepath = Path(os.path.join(".", "src", "automation", "standards", "benchmarking", f"{DEFAULT_GUIDELLM_SCENARIO}.json"))
+        current_scenario = GenerativeTextScenario.from_file(filepath, dict(guidellm_args))
+    print(current_scenario.model_fields)
+
+    # Ensure output_path is set and consistent
+    output_path = Path(guidellm_args.get("output_path", "guidellm-output.json"))
+    guidellm_args["output_path"] = str(output_path)
 
-    task.upload_artifact(name="guidellm guidance report", artifact_object=report.to_json())
-    task.upload_artifact(name="vLLM server log", artifact_object=server_log)
+    print("[DEBUG] Calling benchmark_with_scenario with:")
+    print(json.dumps(guidellm_args, indent=2))
+
+    executable_path = os.path.dirname(sys.executable)
+    vllm_path = os.path.join(executable_path, "vllm")
+    print(f"The vllm path is: {vllm_path}")
+
+    try:
+        asyncio.run(
+            benchmark_with_scenario(
+                current_scenario,
+                output_path= output_path,
+                output_extras= None
+            )
+        )
+
+    finally:
+        task.upload_artifact(name="guidellm guidance report", artifact_object=output_path)
+        task.upload_artifact(name="vLLM server log", artifact_object=server_log)
+        kill_process_tree(server_process.pid)
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/src/automation/vllm/server.py b/src/automation/vllm/server.py
index 6036d65..2e7d321 100644
--- a/src/automation/vllm/server.py
+++ b/src/automation/vllm/server.py
@@ -14,25 +14,34 @@ def start_vllm_server(
     vllm_args,
     model_id,
     target,
-    server_wait_time,
+    server_wait_time,
+    gpu_count,
 ):
     task = Task.current_task()
 
+    print("Inside start vllm server")
+
     executable_path = os.path.dirname(sys.executable)
     vllm_path = os.path.join(executable_path, "vllm")
 
-    num_gpus = torch.cuda.device_count()
+    available_gpus = list(range(torch.cuda.device_count()))
+    selected_gpus = available_gpus[:gpu_count]
+
+    subprocess_env = os.environ.copy()
+    subprocess_env["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in selected_gpus)
 
     parsed_target = urlparse(target)
 
+    print(f"vllm path is: {vllm_path}")
     server_command = [
         f"{vllm_path}", "serve",
         model_id,
         "--host", parsed_target.hostname,
         "--port", str(parsed_target.port),
-        "--tensor-parallel-size", str(num_gpus)
+        "--tensor-parallel-size", str(gpu_count),
     ]
+    print(server_command)
 
     subprocess_env = os.environ.copy()
 
     for k, v in vllm_args.items():
@@ -40,11 +49,15 @@
             subprocess_env[k] = str(v)
         else:
             if v == True or v == "True":
-                v = "true"
-            server_command.extend([f"--{k}", str(v)])
+                server_command.append(f"--{k}")
+            else:
+                server_command.extend([f"--{k}", str(v)])
+
 
     server_log_file_name = f"{SERVER_LOG_PREFIX}_{task.id}.txt"
     server_log_file = open(server_log_file_name, "w")
+    print("Server command:", " ".join(server_command))
+    print(f"VLLM logs are located at: {server_log_file} in {os.getcwd()}")
     server_process = subprocess.Popen(server_command, stdout=server_log_file, stderr=server_log_file, shell=False, env=subprocess_env)
 
     delay = 5
@@ -52,6 +65,7 @@
     for _ in range(server_wait_time // delay):
         try:
             response = requests.get(target + "/models")
+            print(f"response: {response}")
             if response.status_code == 200:
                 print("Server initialized")
                 server_initialized = True
@@ -64,4 +78,4 @@
     if server_initialized:
         return server_process, True, server_log_file_name
     else:
-        return server_process, False, server_log_file_name
\ No newline at end of file
+        return server_process, False, server_log_file_name
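
For reference, a minimal usage sketch (not part of the patch) of the knobs introduced above: branch, the commented-out scenario, and gpu_count. It is modeled on examples/guidellm_example.py and assumes GuideLLMTask forwards extra keyword arguments such as scenario and gpu_count into the GuideLLM configuration (as guidellm_args.get() in guidellm_script.py suggests); project_name, task_name, and model_id are placeholders mirroring the existing examples.

# Hypothetical example, not part of this diff.
from automation.tasks import GuideLLMTask

task = GuideLLMTask(
    project_name="benchmarking",                  # placeholder project
    task_name="llama-3.2-1b-chat-sweep",          # placeholder task name
    model_id="meta-llama/Llama-3.2-1B-Instruct",  # assumed parameter name, mirroring lmeval_example.py
    branch="update_guidellm",                     # research branch installed on the worker (new in this PR)
    scenario="benchmarking_chat",                 # assumed to resolve to one of the JSON standards added above
    gpu_count=1,                                  # read by guidellm_script.py and passed to start_vllm_server
    target="http://localhost:8000/v1",
    max_seconds=30,
    data="prompt_tokens=128,output_tokens=128",
    vllm_kwargs={"enable-chunked-prefill": True},
)

task.execute_remotely("oneshot-a100x1")
# task.execute_locally()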