Merge pull request #112 from VectorInstitute/bugfix/multinode
Misc small features and bug fixes:
- Fixed the multi-node launch GPU placement group issue: the `--exclusive` option is needed in the Slurm script, and the compilation config needs to stay at 0
- Set environment variables in the generated Slurm script instead of in the helper to ensure reusability
- Replaced `python3.10 -m vllm.entrypoints.openai.api_server` with `vllm serve` to support custom chat templates
- Added additional launch options: `--exclude` for excluding certain nodes, `--node-list` for targeting a specific list of nodes, and `--bind` for binding additional directories (illustrated in the sketch below)
- Added the remaining vLLM engine argument short/long name mappings for robustness
- Added notes to the README to capture some gotchas
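A rough sketch of the new launch options, assuming the usual `vec-inf launch` entrypoint. The flag names come from this PR, but the model names, node names, and argument formats below are illustrative placeholders, not confirmed CLI syntax.

```bash
# Skip known-bad nodes when scheduling the server job (node names are placeholders)
vec-inf launch Meta-Llama-3.1-8B-Instruct --exclude gpu047,gpu048

# Or pin the job to a specific set of nodes
vec-inf launch Meta-Llama-3.1-8B-Instruct --node-list gpu051,gpu052

# Bind an additional directory into the job environment, e.g. for custom chat templates
vec-inf launch Mistral-7B-Instruct-v0.3 --bind /scratch/chat_templates
```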
-Note that there are other parameters that can also be added to the config but not shown in this example, check the [`ModelConfig`](vec_inf/client/config.py) for details.
+**NOTE**
+* There are other parameters that can be added to the config but are not shown in this example; check [`ModelConfig`](vec_inf/client/config.py) for details.
+* Check [vLLM Engine Arguments](https://docs.vllm.ai/en/stable/serving/engine_args.html) for the full list of available vLLM engine arguments. The default parallel size for any type of parallelization is 1, so none of the sizes are set explicitly in this example.
+* For GPU partitions with non-Ampere architectures, e.g. `rtx6000` and `t4v2`, BF16 isn't supported. For models that have BF16 as the default dtype, use FP16 instead when running on a non-Ampere GPU, i.e. `--dtype: float16`.
+* Setting `--compilation-config` to `3` currently breaks multi-node model launches, so we don't set it for models that require multiple nodes of GPUs.
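The FP16 fallback noted above maps onto the standard vLLM `--dtype` engine argument. With `vllm serve` as the entrypoint (as of this PR), the underlying server command would look roughly like the sketch below; the model weights path is a placeholder assumption, not a confirmed location.

```bash
# Rough shape of the generated server command on a non-Ampere partition (e.g. rtx6000);
# the weights path is a placeholder.
vllm serve /model-weights/Meta-Llama-3.1-8B-Instruct --dtype float16
```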
#### Other commands
@@ -161,7 +165,7 @@ Once the inference server is ready, you can start sending in inference requests.
-**NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
+**NOTE**: Certain models don't adhere to OpenAI's chat template, e.g. the Mistral family. For these models, you can either change your prompt to follow the model's default chat template or provide your own chat template via `--chat-template: TEMPLATE_PATH`.
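As a hedged illustration of the second option, `vllm serve` (now used under the hood) accepts a chat template file directly; the model name and template path below are placeholders.

```bash
# Serve a Mistral-family model with a custom Jinja chat template;
# both the model identifier and the template path are illustrative placeholders.
vllm serve mistralai/Mistral-7B-Instruct-v0.3 --chat-template /path/to/mistral_chat_template.jinja
```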
## SSH tunnel from your local device
If you want to run inference from your local device, you can open an SSH tunnel to your cluster environment like the following:
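A minimal sketch, assuming the vLLM server listens on port 8080 on a compute node that is reachable through the cluster's login node; the hostnames, username, and port below are placeholders.

```bash
# Forward local port 8080 to the compute node running the server,
# going through the cluster's login node (all names and ports are placeholders).
ssh -N -L 8080:gpu-node-name:8080 username@cluster.login.node

# The server is then reachable locally, e.g.:
# curl http://localhost:8080/v1/models
```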