diff --git a/benchmarks/profiler/utils/profiler_argparse.py b/benchmarks/profiler/utils/profiler_argparse.py
index af0709a67c..5ae7b18bf1 100644
--- a/benchmarks/profiler/utils/profiler_argparse.py
+++ b/benchmarks/profiler/utils/profiler_argparse.py
@@ -158,13 +158,13 @@ def create_profiler_parser() -> argparse.Namespace:
     parser.add_argument(
         "--min-num-gpus-per-engine",
         type=int,
-        default=config.get("hardware", {}).get("min_num_gpus_per_engine", 0),
+        default=config.get("hardware", {}).get("min_num_gpus_per_engine", 1),
         help="minimum number of GPUs per engine",
     )
     parser.add_argument(
         "--max-num-gpus-per-engine",
         type=int,
-        default=config.get("hardware", {}).get("max_num_gpus_per_engine", 0),
+        default=config.get("hardware", {}).get("max_num_gpus_per_engine", 8),
         help="maximum number of GPUs per engine",
     )
     parser.add_argument(
@@ -245,9 +245,15 @@ def create_profiler_parser() -> argparse.Namespace:
     parser.add_argument(
         "--num-gpus-per-node",
         type=int,
-        default=config.get("hardware", {}).get("num_gpus_per_node", 0),
+        default=config.get("hardware", {}).get("num_gpus_per_node", 8),
        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
     )
+    parser.add_argument(
+        "--enable-gpu-discovery",
+        action="store_true",
+        default=config.get("hardware", {}).get("enable_gpu_discovery", False),
+        help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
+    )

     # Dynamically add all planner arguments from planner_argparse.py
     add_planner_arguments_to_parser(parser, prefix="planner-")
@@ -305,6 +311,9 @@ def create_profiler_parser() -> argparse.Namespace:
     if not args.model and not args.config:
         parser.error("--model or --config is required (provide at least one)")

-    auto_generate_search_space(args)
+    # Run auto-generation if GPU discovery is enabled
+    # This will override any manually specified hardware parameters
+    if args.enable_gpu_discovery:
+        auto_generate_search_space(args)

     return args
diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
index 0414564cec..73bfd51f2e 100644
--- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
@@ -138,6 +138,15 @@ spec:
                   Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
                 type: string
               type: object
+            enableGpuDiscovery:
+              default: false
+              description: |-
+                EnableGpuDiscovery controls whether the profiler should automatically discover GPU
+                resources from the Kubernetes cluster nodes. When enabled, the profiler will override
+                any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
+                num_gpus_per_node) with values detected from the cluster.
+                Requires cluster-wide node access permissions - only available with cluster-scoped operators.
+              type: boolean
             model:
              description: |-
                Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
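The new flag is backed by a matching `hardware.enable_gpu_discovery` key in the profiler config read above. A minimal config sketch, assuming the same YAML layout the parser consumes (the numeric values are simply the new static defaults; only `enable_gpu_discovery` is new):

```yaml
# Profiler config excerpt -- keys taken from the defaults wired up above.
hardware:
  min_num_gpus_per_engine: 1   # static default, used when discovery is off
  max_num_gpus_per_engine: 8
  num_gpus_per_node: 8
  enable_gpu_discovery: false  # opt-in; when true, discovered values win
```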
diff --git a/deploy/cloud/helm/platform/components/operator/templates/_validation.tpl b/deploy/cloud/helm/platform/components/operator/templates/_validation.tpl
index 0389d233ac..17860973de 100644
--- a/deploy/cloud/helm/platform/components/operator/templates/_validation.tpl
+++ b/deploy/cloud/helm/platform/components/operator/templates/_validation.tpl
@@ -37,7 +37,7 @@ Prevents all conflict scenarios:
 {{- end -}}

 {{- if $namespaceRestrictedOperators -}}
-  {{- fail (printf "VALIDATION ERROR: Cannot install cluster-wide Dynamo operator. Found existing namespace-restricted Dynamo operators in namespaces: %s. This would create resource conflicts as both the cluster-wide operator and namespace-restricted operators would manage the same DGDs/DCDs. Either:\n1. Use one of the existing namespace-restricted operators for your specific namespace, or\n2. Uninstall all existing namespace-restricted operators first, or\n3. Install this operator in namespace-restricted mode: --set namespaceRestriction.enabled=true" (join ", " ($namespaceRestrictedOperators | uniq))) -}}
+  {{- fail (printf "VALIDATION ERROR: Cannot install cluster-wide Dynamo operator. Found existing namespace-restricted Dynamo operators in namespaces: %s. This would create resource conflicts as both the cluster-wide operator and namespace-restricted operators would manage the same DGDs/DCDs. Either:\n1. Use one of the existing namespace-restricted operators for your specific namespace, or\n2. Uninstall all existing namespace-restricted operators first, or\n3. Install this operator in namespace-restricted mode: --set dynamo-operator.namespaceRestriction.enabled=true" (join ", " ($namespaceRestrictedOperators | uniq))) -}}
 {{- end -}}
 {{- end -}}
diff --git a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml
index b7194087b8..3e297009b8 100644
--- a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml
+++ b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml
@@ -124,9 +124,7 @@ spec:
            - --mpi-run-ssh-secret-name={{ .Values.dynamo.mpiRun.secretName }}
            - --mpi-run-ssh-secret-namespace={{ .Release.Namespace }}
            {{- end }}
-           {{- if .Values.namespaceRestriction.enabled }}
-           - --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
-           {{- else }}
+           {{- if not .Values.namespaceRestriction.enabled }}
            - --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-dgdr-profiling
            - --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
            {{- end }}
diff --git a/deploy/cloud/helm/platform/components/operator/templates/profiling-job-rbac.yaml b/deploy/cloud/helm/platform/components/operator/templates/profiling-job-rbac.yaml
index b3e11731e7..01982e75e8 100644
--- a/deploy/cloud/helm/platform/components/operator/templates/profiling-job-rbac.yaml
+++ b/deploy/cloud/helm/platform/components/operator/templates/profiling-job-rbac.yaml
@@ -70,35 +70,6 @@ roleRef:
   kind: Role
   name: dgdr-profiling-job
 subjects:
-- kind: ServiceAccount
-  name: dgdr-profiling-job
-  namespace: {{ .Release.Namespace }}
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
-  labels:
-    {{- include "dynamo-operator.labels" . | nindent 4 }}
-    app.kubernetes.io/component: dgdr-profiling
-rules:
-# Nodes - cluster-scoped resource needed for profiling
-- apiGroups: [""]
-  resources: ["nodes"]
-  verbs: ["get", "list", "watch"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
-  labels:
-    {{- include "dynamo-operator.labels" . | nindent 4 }}
-    app.kubernetes.io/component: dgdr-profiling
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
-subjects:
 - kind: ServiceAccount
   name: dgdr-profiling-job
   namespace: {{ .Release.Namespace }}
diff --git a/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentrequest_types.go b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
index 421004040a..e13f8e9a00 100644
--- a/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
+++ b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
@@ -114,6 +114,15 @@ type DynamoGraphDeploymentRequestSpec struct {
    // +kubebuilder:validation:Enum=vllm;sglang;trtllm
    Backend string `json:"backend"`

+   // EnableGpuDiscovery controls whether the profiler should automatically discover GPU
+   // resources from the Kubernetes cluster nodes. When enabled, the profiler will override
+   // any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
+   // num_gpus_per_node) with values detected from the cluster.
+   // Requires cluster-wide node access permissions - only available with cluster-scoped operators.
+   // +kubebuilder:default=false
+   // +kubebuilder:validation:Optional
+   EnableGpuDiscovery bool `json:"enableGpuDiscovery,omitempty"`
+
    // ProfilingConfig provides the complete configuration for the profiling job.
    // This configuration is passed directly to the profiler.
    // The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
index 0414564cec..73bfd51f2e 100644
--- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
@@ -138,6 +138,15 @@ spec:
                   Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
                 type: string
               type: object
+            enableGpuDiscovery:
+              default: false
+              description: |-
+                EnableGpuDiscovery controls whether the profiler should automatically discover GPU
+                resources from the Kubernetes cluster nodes. When enabled, the profiler will override
+                any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
+                num_gpus_per_node) with values detected from the cluster.
+                Requires cluster-wide node access permissions - only available with cluster-scoped operators.
+              type: boolean
             model:
              description: |-
                Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
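With the CRD field in place, a DGDR opting into discovery could look like the sketch below. The `apiVersion` and image tag are assumptions for illustration; `model`, `backend`, and `profilingConfig` are the required fields documented in the API reference further down:

```yaml
apiVersion: nvidia.com/v1alpha1  # assumed group/version for the DGDR CRD
kind: DynamoGraphDeploymentRequest
metadata:
  name: qwen3-profiling-request
spec:
  model: Qwen/Qwen3-0.6B
  backend: vllm
  enableGpuDiscovery: true       # rejected by validateSpec on namespace-restricted operators
  profilingConfig:
    profilerImage: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1
    config:
      hardware:                  # overridden when discovery is enabled
        min_num_gpus_per_engine: 1
        max_num_gpus_per_engine: 8
        num_gpus_per_node: 8
```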
diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go
index f2459fa4a9..093a28763f 100644
--- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -720,6 +720,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
        return errors.New("profilingConfig.config is required and must not be empty")
    }

+   // Validate enableGpuDiscovery is only true for cluster-wide operators
+   if dgdr.Spec.EnableGpuDiscovery && r.Config.RestrictedNamespace != "" {
+       return errors.New("enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in profilingConfig.config")
+   }
+
    // Validate ConfigMap if provided (for the DGD base config)
    if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
        cm := &corev1.ConfigMap{}
@@ -937,6 +942,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
        "--profile-config", string(configYAML),
    }

+   // Add --enable-gpu-discovery flag based on DGDR spec
+   // GPU discovery requires cluster-wide node access
+   if dgdr.Spec.EnableGpuDiscovery {
+       profilerArgs = append(profilerArgs, "--enable-gpu-discovery")
+   }
+
    // Use profiler image from profilingConfig
    imageName := dgdr.Spec.ProfilingConfig.ProfilerImage
    logger.Info("Using profiler image", "image", imageName)
diff --git a/docs/benchmarks/sla_driven_profiling.md b/docs/benchmarks/sla_driven_profiling.md
index bded79ef60..a9fec61324 100644
--- a/docs/benchmarks/sla_driven_profiling.md
+++ b/docs/benchmarks/sla_driven_profiling.md
@@ -42,16 +42,45 @@ The recommended way to profile models is through DGDRs. Sample configurations ar
 - **`profile_sla_moe_dgdr.yaml`**: MoE model profiling

 The Dynamo Operator automatically:
-1. Discovers GPU resources
+1. Discovers GPU resources (cluster-scoped operators only)
 2. Runs profiling (AIPerf on real engines or AI Configurator simulation)
 3. Generates optimal DGD configuration with SLA planner
 4. Deploys the DGD to your cluster

 See the [Quick Start Guide](/docs/planner/sla_planner_quickstart.md) for prerequisites and detailed instructions.

+## Hardware Configuration
+
+Hardware parameters have sensible defaults and are **optional** - you can override them if needed:
+
+```yaml
+profilingConfig:
+  config:
+    # Override hardware defaults if needed
+    hardware:
+      min_num_gpus_per_engine: 1
+      max_num_gpus_per_engine: 8
+      num_gpus_per_node: 8
+
+    # Only needed when using AI Configurator (sweep.use_ai_configurator: true)
+    sweep:
+      aic_system: h200_sxm  # GPU type for AI Configurator (h100_sxm, h200_sxm, etc.)
+```
+
+### Automatic GPU Discovery (Optional Feature)
+
+Cluster-scoped operators can optionally enable automatic GPU discovery to detect hardware from cluster nodes. When enabled, hardware config is auto-detected and overrides any manually specified values.
+
+```yaml
+spec:
+  enableGpuDiscovery: true
+```
+
+This feature is only available with cluster-scoped operators (`namespaceRestriction.enabled=false`) as it requires cluster-wide node access permissions. It is not available for namespace-restricted operators.
+
 ## Profiling Method

-1. **GPU Discovery**: Detects available GPUs and their specifications
+1. **Hardware Setup**: Uses defaults or user-specified hardware configuration. Optionally, cluster-scoped operators can enable automatic GPU discovery to detect specifications from cluster nodes.
 2. **Identify Sweep Ranges**: Automatically determine minimum and maximum number of GPUs per engine. Minimum is determined by the model size and GPU VRAM. Maximum is set to one node for dense models and 4 nodes for MoE models.
 3. **Parallelization Mapping Sweep**: Use the input ISL and OSL, test the performance of the engines with different parallelization mappings. For dense models, we test different TP sizes for both prefill and decode. For MoE models, we test different TEP sizes for prefill and DEP sizes for decode.
    - **Prefill**: For prefill, since there is no in-flight batching (assume isl is long enough to saturate the GPU), we directly measure the TTFT for a request with given isl without kv-reusing. For example, the below plot shows the prefill parallelization mapping sweep results for H100 for deepseek-ai/DeepSeek-R1-Distill-Llama-8B.
diff --git a/docs/kubernetes/api_reference.md b/docs/kubernetes/api_reference.md
index e7b0d8984a..00ac44dd93 100644
--- a/docs/kubernetes/api_reference.md
+++ b/docs/kubernetes/api_reference.md
@@ -77,7 +77,7 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: \{\} <br /> |
+| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: {} <br /> |
 | `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | |
@@ -95,11 +95,11 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment. <br /> If not specified, defaults to the DGDR name. | | Optional: \{\} <br /> |
-| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment. <br /> If not specified, defaults to the DGDR namespace. | | Optional: \{\} <br /> |
-| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata. <br /> These are merged with auto-generated labels from the profiling process. | | Optional: \{\} <br /> |
-| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: \{\} <br /> |
-| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components. <br /> This image is used for both temporary DGDs created during online profiling and the final DGD. <br /> If omitted, the image from the base config file (e.g., disagg.yaml) is used. <br /> Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: \{\} <br /> |
+| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment. <br /> If not specified, defaults to the DGDR name. | | Optional: {} <br /> |
+| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment. <br /> If not specified, defaults to the DGDR namespace. | | Optional: {} <br /> |
+| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata. <br /> These are merged with auto-generated labels from the profiling process. | | Optional: {} <br /> |
+| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: {} <br /> |
+| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components. <br /> This image is used for both temporary DGDs created during online profiling and the final DGD. <br /> If omitted, the image from the base config file (e.g., disagg.yaml) is used. <br /> Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: {} <br /> |


 #### DeploymentStatus
@@ -237,6 +237,7 @@
 DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
 It serves as the primary interface for users to request model deployments with specific
 performance and resource constraints, enabling SLA-driven deployments.
+
 Lifecycle:
 1. Initial → Pending: Validates spec and prepares for profiling
 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
@@ -245,6 +246,7 @@ Lifecycle:
 5. Ready: Terminal state when DGD is operational or spec is available
 6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
+
 The spec becomes immutable once profiling starts. Users must delete and recreate
 the DGDR to modify configuration after this point.
@@ -276,11 +278,12 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b"). <br /> This is a high-level identifier for easy reference in kubectl output and logs. <br /> The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\} <br /> |
-| `backend` _string_ | Backend specifies the inference backend to use. <br /> The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm] <br /> Required: \{\} <br /> |
-| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job. <br /> This configuration is passed directly to the profiler. <br /> The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema). <br /> Note: deployment.model and engine.backend are automatically set from the high-level <br /> modelName and backend fields and should not be specified in this config. | | Required: \{\} <br /> |
+| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b"). <br /> This is a high-level identifier for easy reference in kubectl output and logs. <br /> The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: {} <br /> |
+| `backend` _string_ | Backend specifies the inference backend to use. <br /> The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm] <br /> Required: {} <br /> |
+| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU <br /> resources from the Kubernetes cluster nodes. When enabled, the profiler will override <br /> any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine, <br /> num_gpus_per_node) with values detected from the cluster. <br /> Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: {} <br /> |
+| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job. <br /> This configuration is passed directly to the profiler. <br /> The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema). <br /> Note: deployment.model and engine.backend are automatically set from the high-level <br /> modelName and backend fields and should not be specified in this config. | | Required: {} <br /> |
 | `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment <br /> after profiling completes. If false, only the spec is generated and stored in status. <br /> Users can then manually create a DGD using the generated spec. | false | |
-| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD. <br /> Only applicable when AutoApply is true. | | Optional: \{\} <br /> |
+| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD. <br /> Only applicable when AutoApply is true. | | Optional: {} <br /> |


 #### DynamoGraphDeploymentRequestStatus
@@ -298,12 +301,12 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
 | `state` _string_ | State is a high-level textual status of the deployment request lifecycle. <br /> Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed" <br /> Empty string ("") represents the initial state before initialization. | | |
-| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes. <br /> This field is populated by the controller and shown in kubectl output. | | Optional: \{\} <br /> |
+| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes. <br /> This field is populated by the controller and shown in kubectl output. | | Optional: {} <br /> |
 | `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec. <br /> Used to detect spec changes and enforce immutability after profiling starts. | | |
 | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request. <br /> Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady. <br /> Conditions are merged by type on patch updates. | | |
-| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data. <br /> Format: "configmap/" | | Optional: \{\} <br /> |
-| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification <br /> including metadata, based on profiling results. Users can extract this to create <br /> a DGD manually, or it's used automatically when autoApply is true. <br /> Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: \{\} <br /> Optional: \{\} <br /> |
-| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true. <br /> Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\} <br /> |
+| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data. <br /> Format: "configmap/" | | Optional: {} <br /> |
+| `generatedDeployment` _[RawExtension](#rawextension)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification <br /> including metadata, based on profiling results. Users can extract this to create <br /> a DGD manually, or it's used automatically when autoApply is true. <br /> Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: {} <br /> Optional: {} <br /> |
+| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true. <br /> Contains name, namespace, state, and creation status of the managed DGD. | | Optional: {} <br /> |


 #### DynamoGraphDeploymentSpec
@@ -319,9 +322,9 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components. <br /> Each PVC must have a unique name that can be referenced in component specifications. | | Optional: \{\} <br /> |
-| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | Optional: \{\} <br /> |
-| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless <br /> overridden by service-specific configuration. | | Optional: \{\} <br /> |
+| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components. <br /> Each PVC must have a unique name that can be referenced in component specifications. | | Optional: {} <br /> |
+| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | Optional: {} <br /> |
+| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless <br /> overridden by service-specific configuration. | | Optional: {} <br /> |
 | `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). | | Enum: [sglang vllm trtllm] <br /> |
@@ -415,9 +418,9 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
 | `create` _boolean_ | Create indicates to create a new PVC | | |
-| `name` _string_ | Name is the name of the PVC | | Required: \{\} <br /> |
+| `name` _string_ | Name is the name of the PVC | | Required: {} <br /> |
 | `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. | | |
-| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | |
+| `size` _[Quantity](#quantity)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | |
 | `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. | | |
@@ -436,9 +439,9 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler. <br /> The profiler will validate the configuration and report any errors. | | Optional: \{\} <br /> Type: object <br /> |
-| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment <br /> base config file (disagg.yaml). This is separate from the profiling config above. <br /> The path to this config will be set as engine.config in the profiling config. | | Optional: \{\} <br /> |
-| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs. <br /> This image contains the profiler code and dependencies needed for SLA-based profiling. <br /> Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: \{\} <br /> |
+| `config` _[JSON](#json)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler. <br /> The profiler will validate the configuration and report any errors. | | Optional: {} <br /> Type: object <br /> |
+| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment <br /> base config file (disagg.yaml). This is separate from the profiling config above. <br /> The path to this config will be set as engine.config in the profiling config. | | Optional: {} <br /> |
+| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs. <br /> This image contains the profiler code and dependencies needed for SLA-based profiling. <br /> Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: {} <br /> |


 #### SharedMemorySpec
@@ -456,7 +459,7 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
 | `disabled` _boolean_ | | | |
-| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | | | |
+| `size` _[Quantity](#quantity)_ | | | |


 #### VolumeMount
@@ -473,7 +476,7 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: \{\} <br /> |
+| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: {} <br /> |
 | `mountPoint` _string_ | MountPoint specifies where to mount the volume. <br /> If useAsCompilationCache is true and mountPoint is not specified, <br /> a backend-specific default will be used. | | |
 | `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache. <br /> When true, backend-specific environment variables will be set and default mount points may be used. | false | |
diff --git a/docs/planner/sla_planner_quickstart.md b/docs/planner/sla_planner_quickstart.md
index 1fcb2a6ca9..d8fb253b53 100644
--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -229,26 +229,17 @@ sweep:
 # Offline Profiling (AI Configurator - TensorRT-LLM only)
 sweep:
   use_ai_configurator: true
-aic:
-  system: h200_sxm
-  model_name: QWEN3_32B
-  backend_version: "0.20.0"
+  aic_system: h200_sxm
+  aic_model_name: QWEN3_32B
+  aic_backend_version: "0.20.0"
 ```

 > [!NOTE]
 > For detailed comparison, supported configurations, and limitations, see [SLA-Driven Profiling Documentation](/docs/benchmarks/sla_driven_profiling.md#profiling-methods).

-### GPU Discovery
+### Hardware Configuration

-By default, the DGDR controller automatically discovers available GPU resources. Optionally specify preferences:
-
-```yaml
-spec:
-  gpu:
-    type: h200      # GPU type (e.g., h100, h200)
-    count: 8        # Number of GPUs to use
-    memoryGB: 141   # GPU memory in GB
-```
+For details on hardware configuration and GPU discovery options, see [Hardware Configuration in SLA-Driven Profiling](/docs/benchmarks/sla_driven_profiling.md#hardware-configuration).

 ### Advanced Configuration
diff --git a/tests/profiler/test_profile_sla_dryrun.py b/tests/profiler/test_profile_sla_dryrun.py
index 556cc0789e..8a9537136e 100644
--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -242,8 +242,12 @@ def __init__(self):
                 self.namespace = "test-namespace"
                 self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
                 self.dgd_image = ""
-                self.min_num_gpus_per_engine = 0  # Will be auto-generated
-                self.max_num_gpus_per_engine = 0  # Will be auto-generated
+                self.min_num_gpus_per_engine = (
+                    1  # Will be overridden by auto-generation
+                )
+                self.max_num_gpus_per_engine = (
+                    8  # Will be overridden by auto-generation
+                )
                 self.skip_existing_results = False
                 self.force_rerun = False
                 self.isl = 3000
@@ -261,7 +265,7 @@ def __init__(self):
                 self.aic_model_name = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
-                self.num_gpus_per_node = None  # Will be auto-generated
+                self.num_gpus_per_node = 8  # Will be overridden by auto-generation
                 self.deploy_after_profile = False

         return Args()
@@ -304,8 +308,12 @@ def __init__(self):
                 self.namespace = "test-namespace"
                 self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
                 self.dgd_image = ""
-                self.min_num_gpus_per_engine = 0  # Will be auto-generated
-                self.max_num_gpus_per_engine = 0  # Will be auto-generated
+                self.min_num_gpus_per_engine = (
+                    1  # Will be overridden by auto-generation
+                )
+                self.max_num_gpus_per_engine = (
+                    8  # Will be overridden by auto-generation
+                )
                 self.skip_existing_results = False
                 self.force_rerun = False
                 self.isl = 3000
@@ -323,7 +331,7 @@ def __init__(self):
                 self.aic_model_name = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
-                self.num_gpus_per_node = None  # Will be auto-generated
+                self.num_gpus_per_node = 8  # Will be overridden by auto-generation
                 self.deploy_after_profile = False

         return Args()
@@ -366,8 +374,12 @@ def __init__(self):
                 self.namespace = "test-namespace"
                 self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
                 self.dgd_image = ""
-                self.min_num_gpus_per_engine = 0  # Will be auto-generated
-                self.max_num_gpus_per_engine = 0  # Will be auto-generated
+                self.min_num_gpus_per_engine = (
+                    1  # Will be overridden by auto-generation
+                )
+                self.max_num_gpus_per_engine = (
+                    8  # Will be overridden by auto-generation
+                )
                 self.skip_existing_results = False
                 self.force_rerun = False
                 self.isl = 3000
@@ -385,7 +397,7 @@ def __init__(self):
                 self.aic_model_name = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
-                self.num_gpus_per_node = None  # Will be auto-generated
+                self.num_gpus_per_node = 8  # Will be overridden by auto-generation
                 self.deploy_after_profile = False

         return Args()
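The stubs above pin the new static defaults; a compact standalone check of the intended gating could look like the sketch below (`Args` here is a local stand-in for the test fixtures, not the repository's actual helper):

```python
# Sketch: with --enable-gpu-discovery off, the static defaults must pass
# through untouched, since auto_generate_search_space is no longer called
# unconditionally.
class Args:
    enable_gpu_discovery = False
    min_num_gpus_per_engine = 1  # new default from profiler_argparse.py
    max_num_gpus_per_engine = 8
    num_gpus_per_node = 8


def test_static_defaults_survive_without_discovery():
    args = Args()
    assert not args.enable_gpu_discovery
    assert (args.min_num_gpus_per_engine, args.max_num_gpus_per_engine) == (1, 8)
    assert args.num_gpus_per_node == 8
```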