Merged
17 changes: 13 additions & 4 deletions benchmarks/profiler/utils/profiler_argparse.py
@@ -158,13 +158,13 @@ def create_profiler_parser() -> argparse.Namespace:
parser.add_argument(
"--min-num-gpus-per-engine",
type=int,
-default=config.get("hardware", {}).get("min_num_gpus_per_engine", 0),
+default=config.get("hardware", {}).get("min_num_gpus_per_engine", 1),
help="minimum number of GPUs per engine",
)
parser.add_argument(
"--max-num-gpus-per-engine",
type=int,
-default=config.get("hardware", {}).get("max_num_gpus_per_engine", 0),
+default=config.get("hardware", {}).get("max_num_gpus_per_engine", 8),
help="maximum number of GPUs per engine",
)
parser.add_argument(
@@ -245,9 +245,15 @@ def create_profiler_parser() -> argparse.Namespace:
parser.add_argument(
"--num-gpus-per-node",
type=int,
-default=config.get("hardware", {}).get("num_gpus_per_node", 0),
+default=config.get("hardware", {}).get("num_gpus_per_node", 8),
help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
)
+parser.add_argument(
+    "--enable-gpu-discovery",
+    action="store_true",
+    default=config.get("hardware", {}).get("enable_gpu_discovery", False),
+    help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
+)

# Dynamically add all planner arguments from planner_argparse.py
add_planner_arguments_to_parser(parser, prefix="planner-")
@@ -305,6 +311,9 @@ def create_profiler_parser() -> argparse.Namespace:
if not args.model and not args.config:
parser.error("--model or --config is required (provide at least one)")

-auto_generate_search_space(args)
+# Run auto-generation if GPU discovery is enabled
+# This will override any manually specified hardware parameters
+if args.enable_gpu_discovery:
+    auto_generate_search_space(args)

return args
@@ -138,6 +138,15 @@ spec:
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
type: object
+enableGpuDiscovery:
+  default: false
+  description: |-
+    EnableGpuDiscovery controls whether the profiler should automatically discover GPU
+    resources from the Kubernetes cluster nodes. When enabled, the profiler will override
+    any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
+    num_gpus_per_node) with values detected from the cluster.
+    Requires cluster-wide node access permissions - only available with cluster-scoped operators.
+  type: boolean
model:
description: |-
Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
@@ -37,7 +37,7 @@ Prevents all conflict scenarios:
{{- end -}}

{{- if $namespaceRestrictedOperators -}}
-{{- fail (printf "VALIDATION ERROR: Cannot install cluster-wide Dynamo operator. Found existing namespace-restricted Dynamo operators in namespaces: %s. This would create resource conflicts as both the cluster-wide operator and namespace-restricted operators would manage the same DGDs/DCDs. Either:\n1. Use one of the existing namespace-restricted operators for your specific namespace, or\n2. Uninstall all existing namespace-restricted operators first, or\n3. Install this operator in namespace-restricted mode: --set namespaceRestriction.enabled=true" (join ", " ($namespaceRestrictedOperators | uniq))) -}}
+{{- fail (printf "VALIDATION ERROR: Cannot install cluster-wide Dynamo operator. Found existing namespace-restricted Dynamo operators in namespaces: %s. This would create resource conflicts as both the cluster-wide operator and namespace-restricted operators would manage the same DGDs/DCDs. Either:\n1. Use one of the existing namespace-restricted operators for your specific namespace, or\n2. Uninstall all existing namespace-restricted operators first, or\n3. Install this operator in namespace-restricted mode: --set dynamo-operator.namespaceRestriction.enabled=true" (join ", " ($namespaceRestrictedOperators | uniq))) -}}
{{- end -}}
{{- end -}}

@@ -124,9 +124,7 @@ spec:
- --mpi-run-ssh-secret-name={{ .Values.dynamo.mpiRun.secretName }}
- --mpi-run-ssh-secret-namespace={{ .Release.Namespace }}
{{- end }}
-{{- if .Values.namespaceRestriction.enabled }}
-- --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
-{{- else }}
+{{- if not .Values.namespaceRestriction.enabled }}
- --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-dgdr-profiling
- --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
{{- end }}
@@ -70,35 +70,6 @@ roleRef:
kind: Role
name: dgdr-profiling-job
subjects:
-- kind: ServiceAccount
-  name: dgdr-profiling-job
-  namespace: {{ .Release.Namespace }}
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
-  labels:
-    {{- include "dynamo-operator.labels" . | nindent 4 }}
-    app.kubernetes.io/component: dgdr-profiling
-rules:
-  # Nodes - cluster-scoped resource needed for profiling
-  - apiGroups: [""]
-    resources: ["nodes"]
-    verbs: ["get", "list", "watch"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
-  labels:
-    {{- include "dynamo-operator.labels" . | nindent 4 }}
-    app.kubernetes.io/component: dgdr-profiling
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-dgdr-profiling-nodes
-subjects:
- kind: ServiceAccount
name: dgdr-profiling-job
namespace: {{ .Release.Namespace }}
@@ -114,6 +114,15 @@ type DynamoGraphDeploymentRequestSpec struct {
// +kubebuilder:validation:Enum=vllm;sglang;trtllm
Backend string `json:"backend"`

+// EnableGpuDiscovery controls whether the profiler should automatically discover GPU
+// resources from the Kubernetes cluster nodes. When enabled, the profiler will override
+// any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
+// num_gpus_per_node) with values detected from the cluster.
+// Requires cluster-wide node access permissions - only available with cluster-scoped operators.
+// +kubebuilder:default=false
+// +kubebuilder:validation:Optional
+EnableGpuDiscovery bool `json:"enableGpuDiscovery,omitempty"`

// ProfilingConfig provides the complete configuration for the profiling job.
// This configuration is passed directly to the profiler.
// The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
@@ -138,6 +138,15 @@ spec:
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
type: object
+enableGpuDiscovery:
+  default: false
+  description: |-
+    EnableGpuDiscovery controls whether the profiler should automatically discover GPU
+    resources from the Kubernetes cluster nodes. When enabled, the profiler will override
+    any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
+    num_gpus_per_node) with values detected from the cluster.
+    Requires cluster-wide node access permissions - only available with cluster-scoped operators.
+  type: boolean
model:
description: |-
Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
@@ -720,6 +720,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
return errors.New("profilingConfig.config is required and must not be empty")
}

+// Validate enableGpuDiscovery is only true for cluster-wide operators
+if dgdr.Spec.EnableGpuDiscovery && r.Config.RestrictedNamespace != "" {
+	return errors.New("enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in profilingConfig.config")
+}

// Validate ConfigMap if provided (for the DGD base config)
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
cm := &corev1.ConfigMap{}
@@ -937,6 +942,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
"--profile-config", string(configYAML),
}

+// Add --enable-gpu-discovery flag based on DGDR spec
+// GPU discovery requires cluster-wide node access
+if dgdr.Spec.EnableGpuDiscovery {
+	profilerArgs = append(profilerArgs, "--enable-gpu-discovery")
+}

// Use profiler image from profilingConfig
imageName := dgdr.Spec.ProfilingConfig.ProfilerImage
logger.Info("Using profiler image", "image", imageName)
33 changes: 31 additions & 2 deletions docs/benchmarks/sla_driven_profiling.md
@@ -42,16 +42,45 @@ The recommended way to profile models is through DGDRs. Sample configurations ar
- **`profile_sla_moe_dgdr.yaml`**: MoE model profiling

The Dynamo Operator automatically:
-1. Discovers GPU resources
+1. Discovers GPU resources (cluster-scoped operators only)
2. Runs profiling (AIPerf on real engines or AI Configurator simulation)
3. Generates optimal DGD configuration with SLA planner
4. Deploys the DGD to your cluster

See the [Quick Start Guide](/docs/planner/sla_planner_quickstart.md) for prerequisites and detailed instructions.

+## Hardware Configuration
+
+Hardware parameters have sensible defaults and are **optional** - you can override them if needed:
+
+```yaml
+profilingConfig:
+  config:
+    # Override hardware defaults if needed
+    hardware:
+      min_num_gpus_per_engine: 1
+      max_num_gpus_per_engine: 8
+      num_gpus_per_node: 8
+
+    # Only needed when using AI Configurator (sweep.use_ai_configurator: true)
+    sweep:
+      aic_system: h200_sxm # GPU type for AI Configurator (h100_sxm, h200_sxm, etc.)
+```
+
+### Automatic GPU Discovery (Optional Feature)
+
+Cluster-scoped operators can optionally enable automatic GPU discovery to detect hardware from cluster nodes. When enabled, hardware config is auto-detected and overrides any manually specified values.
+
+```yaml
+spec:
+  enableGpuDiscovery: true
+```
+
+This feature is only available with cluster-scoped operators (`namespaceRestriction.enabled=false`) as it requires cluster-wide node access permissions. It is not available for namespace-restricted operators.

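To make the discovery behavior added in this docs section concrete, here is a small hypothetical sketch (not the operator's or profiler's actual code) of how per-node GPU counts, e.g. read from each node's `status.capacity["nvidia.com/gpu"]` via the Kubernetes API, could be folded into the three hardware parameters the profiler needs:

```python
def derive_hardware_config(node_gpu_counts):
    """Fold per-node GPU counts (e.g. each node's
    status.capacity["nvidia.com/gpu"]) into the profiler's hardware
    parameters. Hypothetical helper for illustration only."""
    gpu_nodes = [count for count in node_gpu_counts if count > 0]
    if not gpu_nodes:
        raise ValueError("no GPU nodes found in the cluster")
    gpus_per_node = max(gpu_nodes)
    return {
        "min_num_gpus_per_engine": 1,
        "max_num_gpus_per_engine": gpus_per_node,
        "num_gpus_per_node": gpus_per_node,
    }

# A cluster with two 8-GPU nodes and one CPU-only node:
print(derive_hardware_config([8, 8, 0]))
# {'min_num_gpus_per_engine': 1, 'max_num_gpus_per_engine': 8, 'num_gpus_per_node': 8}
```

Whatever values discovery produces replace any `hardware` settings given in `profilingConfig.config`, which is why the feature is opt-in.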
## Profiling Method

-1. **GPU Discovery**: Detects available GPUs and their specifications
+1. **Hardware Setup**: Uses defaults or user-specified hardware configuration. Optionally, cluster-scoped operators can enable automatic GPU discovery to detect specifications from cluster nodes.
2. **Identify Sweep Ranges**: Automatically determine the minimum and maximum number of GPUs per engine. The minimum is determined by the model size and GPU VRAM; the maximum is set to one node for dense models and 4 nodes for MoE models.
3. **Parallelization Mapping Sweep**: Using the input ISL and OSL, test the performance of the engines under different parallelization mappings. For dense models, we test different TP sizes for both prefill and decode. For MoE models, we test different TEP sizes for prefill and DEP sizes for decode.
- **Prefill**: For prefill, since there is no in-flight batching (assuming the ISL is long enough to saturate the GPU), we directly measure the TTFT for a request with the given ISL, without KV reuse. For example, the plot below shows the prefill parallelization mapping sweep results on H100 for deepseek-ai/DeepSeek-R1-Distill-Llama-8B.