kubeflow
diff --git a/‎pkg/constants/constants.go
+20-17 b/‎pkg/constants/constants.go
+20-17
diff --git a/‎pkg/controller/trainjob_controller.go
+2-11 b/‎pkg/controller/trainjob_controller.go
+2-11
diff --git a/‎pkg/runtime/core/clustertrainingruntime.go
+9-4 b/‎pkg/runtime/core/clustertrainingruntime.go
+9-4
diff --git a/‎pkg/runtime/core/trainingruntime.go
+37-22 b/‎pkg/runtime/core/trainingruntime.go
+37-22
diff --git a/‎pkg/runtime/framework/core/framework.go
+2-2 b/‎pkg/runtime/framework/core/framework.go
+2-2
diff --git a/‎pkg/runtime/framework/core/framework_test.go
+4-1 b/‎pkg/runtime/framework/core/framework_test.go
+4-1
diff --git a/‎pkg/runtime/framework/interface.go
+2-1 b/‎pkg/runtime/framework/interface.go
+2-1
diff --git a/‎pkg/runtime/framework/plugins/jobset/jobset.go
+43 b/‎pkg/runtime/framework/plugins/jobset/jobset.go
+43
diff --git a/‎pkg/runtime/framework/plugins/mpi/mpi.go
+16-3 b/‎pkg/runtime/framework/plugins/mpi/mpi.go
+16-3
@@ -4,6 +4,7 @@ import (
 	"fmt"
 
 	batchv1 "k8s.io/api/batch/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
 )
 
 const (
@@ -61,23 +62,6 @@ const (
 	// {"type": "Suspended", "status": "True", "reason": "Resumed"} condition.
 	TrainJobResumedMessage = "TrainJob is resumed"
 
-	// Distributed envs for torchrun.
-	// Ref: https://github.com/pytorch/pytorch/blob/3a0d0885171376ed610c8175a19ba40411fc6f3f/torch/distributed/argparse_util.py#L45
-	// TorchEnvNumNodes is the env name for the number of training nodes.
-	TorchEnvNumNodes string = "PET_NNODES"
-
-	// TorchEnvNumProcPerNode is the env name for the number of procs per node (e.g. number of GPUs per Pod).
-	TorchEnvNumProcPerNode string = "PET_NPROC_PER_NODE"
-
-	// TorchEnvNodeRank is the env name for the node RANK
-	TorchEnvNodeRank string = "PET_NODE_RANK"
-
-	// TorchEnvMasterAddr is the env name for the master node address.
-	TorchEnvMasterAddr string = "PET_MASTER_ADDR"
-
-	// TorchEnvMasterPort is the env name for the master node port.
-	TorchEnvMasterPort string = "PET_MASTER_PORT"
-
 	// JobLauncher is the Job name for the launcher.
 	JobLauncher string = "launcher"
 
@@ -131,9 +115,28 @@ const (
 
 	// OpenMPIEnvDefaultSlots is the OpenMPI default number of slots env key.
 	OpenMPIEnvDefaultSlots string = "OMPI_MCA_orte_set_default_slots"
+	// Distributed envs for torchrun.
+	// Ref: https://github.com/pytorch/pytorch/blob/3a0d0885171376ed610c8175a19ba40411fc6f3f/torch/distributed/argparse_util.py#L45
+	// TorchEnvNumNodes is the env name for the number of training nodes.
+	TorchEnvNumNodes string = "PET_NNODES"
+
+	// TorchEnvNumProcPerNode is the env name for the number of procs per node (e.g. number of GPUs per Pod).
+	TorchEnvNumProcPerNode string = "PET_NPROC_PER_NODE"
+
+	// TorchEnvNodeRank is the env name for the node RANK
+	TorchEnvNodeRank string = "PET_NODE_RANK"
+
+	// TorchEnvMasterAddr is the env name for the master node address.
+	TorchEnvMasterAddr string = "PET_MASTER_ADDR"
+
+	// TorchEnvMasterPort is the env name for the master node port.
+	TorchEnvMasterPort string = "PET_MASTER_PORT"
 )
 
 var (
 	// JobCompletionIndexFieldPath is the field path for the Job completion index annotation.
 	JobCompletionIndexFieldPath string = fmt.Sprintf("metadata.annotations['%s']", batchv1.JobCompletionIndexAnnotation)
+
+	// Torchrun reserved env names
+	TorchRunReservedEnvNames = sets.New(TorchEnvNumNodes, TorchEnvNumProcPerNode, TorchEnvNodeRank, TorchEnvMasterAddr, TorchEnvMasterPort)
 )
@@ -41,8 +41,6 @@ import (
 	jobruntimes "github.com/kubeflow/trainer/pkg/runtime"
 )
 
-var errorUnsupportedRuntime = errors.New("the specified runtime is not supported")
-
 type objsOpState int
 
 const (
@@ -85,10 +83,10 @@ func (r *TrainJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
 		return ctrl.Result{}, nil
 	}
 
-	runtimeRefGK := runtimeRefToGroupKind(trainJob.Spec.RuntimeRef).String()
+	runtimeRefGK := jobruntimes.RuntimeRefToRuntimeRegistryKey(trainJob.Spec.RuntimeRef)
 	runtime, ok := r.runtimes[runtimeRefGK]
 	if !ok {
-		return ctrl.Result{}, fmt.Errorf("%w: %s", errorUnsupportedRuntime, runtimeRefGK)
+		return ctrl.Result{}, fmt.Errorf("unsupported runtime: %s", runtimeRefGK)
 	}
 	opState, err := r.reconcileObjects(ctx, runtime, &trainJob)
 
@@ -215,13 +213,6 @@ func isTrainJobFinished(trainJob *trainer.TrainJob) bool {
 		meta.IsStatusConditionTrue(trainJob.Status.Conditions, trainer.TrainJobFailed)
 }
 
-func runtimeRefToGroupKind(runtimeRef trainer.RuntimeRef) schema.GroupKind {
-	return schema.GroupKind{
-		Group: ptr.Deref(runtimeRef.APIGroup, ""),
-		Kind:  ptr.Deref(runtimeRef.Kind, ""),
-	}
-}
-
 func (r *TrainJobReconciler) SetupWithManager(mgr ctrl.Manager, options controller.Options) error {
 	b := ctrl.NewControllerManagedBy(mgr).
 		WithOptions(options).
 
@@ -26,6 +26,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/validation/field"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+	jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 
 	trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
 	"github.com/kubeflow/trainer/pkg/runtime"
@@ -69,14 +70,21 @@ func (r *ClusterTrainingRuntime) EventHandlerRegistrars() []runtime.ReconcilerBu
 }
 
+// ValidateObjects fetches the ClusterTrainingRuntime referenced by the new TrainJob and runs the
+// custom validation plugins against that runtime's JobSet template and runtime info.
+// A failed lookup is reported as an invalid runtime reference on the new TrainJob.
 func (r *ClusterTrainingRuntime) ValidateObjects(ctx context.Context, old, new *trainer.TrainJob) (admission.Warnings, field.ErrorList) {
+	clusterTrainingRuntime := &trainer.ClusterTrainingRuntime{}
 	if err := r.client.Get(ctx, client.ObjectKey{
-		Namespace: old.Namespace,
-		Name:      old.Spec.RuntimeRef.Name,
-	}, &trainer.ClusterTrainingRuntime{}); err != nil {
+		Name: new.Spec.RuntimeRef.Name,
+	}, clusterTrainingRuntime); err != nil {
 		return nil, field.ErrorList{
-			field.Invalid(field.NewPath("spec", "RuntimeRef"), old.Spec.RuntimeRef,
+			field.Invalid(field.NewPath("spec", "RuntimeRef"), new.Spec.RuntimeRef,
 				fmt.Sprintf("%v: specified clusterTrainingRuntime must be created before the TrainJob is created", err)),
 		}
 	}
-	return r.framework.RunCustomValidationPlugins(old, new)
+	// The runtimeInfo error is ignored on the assumption that an existing runtime is valid (same choice as TrainingRuntime.ValidateObjects).
+	info, _ := r.runtimeInfo(ctx, new, clusterTrainingRuntime.Spec.Template, clusterTrainingRuntime.Spec.MLPolicy, clusterTrainingRuntime.Spec.PodGroupPolicy)
+	jobSetTemplate := jobsetv1alpha2.JobSet{
+		Spec: clusterTrainingRuntime.Spec.Template.Spec,
+	}
+	return r.framework.RunCustomValidationPlugins(jobSetTemplate.DeepCopy(), info, old, new)
 }
@@ -21,8 +21,6 @@ import (
 	"errors"
 	"fmt"
 
-	"github.com/kubeflow/trainer/pkg/apply"
-	"github.com/kubeflow/trainer/pkg/constants"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
@@ -35,6 +33,8 @@ import (
 	jobsetv1alpha2ac "sigs.k8s.io/jobset/client-go/applyconfiguration/jobset/v1alpha2"
 
 	trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
+	"github.com/kubeflow/trainer/pkg/apply"
+	"github.com/kubeflow/trainer/pkg/constants"
 	"github.com/kubeflow/trainer/pkg/runtime"
 	fwkcore "github.com/kubeflow/trainer/pkg/runtime/framework/core"
 	fwkplugins "github.com/kubeflow/trainer/pkg/runtime/framework/plugins"
@@ -89,6 +89,29 @@ func (r *TrainingRuntime) NewObjects(ctx context.Context, trainJob *trainer.Trai
 func (r *TrainingRuntime) buildObjects(
 	ctx context.Context, trainJob *trainer.TrainJob, jobSetTemplateSpec trainer.JobSetTemplateSpec, mlPolicy *trainer.MLPolicy, podGroupPolicy *trainer.PodGroupPolicy,
 ) ([]any, error) {
+
+	info, err := r.runtimeInfo(ctx, trainJob, jobSetTemplateSpec, mlPolicy, podGroupPolicy)
+	if err != nil {
+		return nil, err
+	}
+	if err = r.framework.RunEnforceMLPolicyPlugins(info, trainJob); err != nil {
+		return nil, err
+	}
+
+	if err = r.framework.RunEnforcePodGroupPolicyPlugins(info, trainJob); err != nil {
+		return nil, err
+	}
+
+	if err = r.framework.RunPodNetworkPlugins(info, trainJob); err != nil {
+		return nil, err
+	}
+
+	return r.framework.RunComponentBuilderPlugins(ctx, info, trainJob)
+}
+
+func (r *TrainingRuntime) runtimeInfo(
+	ctx context.Context, trainJob *trainer.TrainJob, jobSetTemplateSpec trainer.JobSetTemplateSpec, mlPolicy *trainer.MLPolicy, podGroupPolicy *trainer.PodGroupPolicy) (*runtime.Info, error) {
+
 	propagationLabels := jobSetTemplateSpec.Labels
 	if propagationLabels == nil && trainJob.Spec.Labels != nil {
 		propagationLabels = make(map[string]string, len(trainJob.Spec.Labels))
@@ -140,21 +163,7 @@ func (r *TrainingRuntime) buildObjects(
 		)
 	}
 
-	info := runtime.NewInfo(opts...)
-
-	if err = r.framework.RunEnforceMLPolicyPlugins(info, trainJob); err != nil {
-		return nil, err
-	}
-
-	if err = r.framework.RunEnforcePodGroupPolicyPlugins(info, trainJob); err != nil {
-		return nil, err
-	}
-
-	if err = r.framework.RunPodNetworkPlugins(info, trainJob); err != nil {
-		return nil, err
-	}
-
-	return r.framework.RunComponentBuilderPlugins(ctx, info, trainJob)
+	return runtime.NewInfo(opts...), nil
 }
 
 func syncPodSets(info *runtime.Info) {
@@ -198,14 +207,21 @@ func (r *TrainingRuntime) EventHandlerRegistrars() []runtime.ReconcilerBuilder {
 }
 
+// ValidateObjects loads the TrainingRuntime referenced by the new TrainJob (in the TrainJob's namespace)
+// and runs the custom validation plugins against that runtime's JobSet template and runtime info.
 func (r *TrainingRuntime) ValidateObjects(ctx context.Context, old, new *trainer.TrainJob) (admission.Warnings, field.ErrorList) {
+	var trainingRuntime trainer.TrainingRuntime
 	if err := r.client.Get(ctx, client.ObjectKey{
-		Namespace: old.Namespace,
-		Name:      old.Spec.RuntimeRef.Name,
-	}, &trainer.TrainingRuntime{}); err != nil {
+		Namespace: new.Namespace,
+		Name:      new.Spec.RuntimeRef.Name,
+	}, &trainingRuntime); err != nil {
 		return nil, field.ErrorList{
-			field.Invalid(field.NewPath("spec", "runtimeRef"), old.Spec.RuntimeRef,
+			field.Invalid(field.NewPath("spec", "runtimeRef"), new.Spec.RuntimeRef,
 				fmt.Sprintf("%v: specified trainingRuntime must be created before the TrainJob is created", err)),
 		}
 	}
-	return r.framework.RunCustomValidationPlugins(old, new)
+	runtimeSpec := trainingRuntime.Spec
+	// The error is ignored here as the configured runtime should be valid.
+	info, _ := r.runtimeInfo(ctx, new, runtimeSpec.Template, runtimeSpec.MLPolicy, runtimeSpec.PodGroupPolicy)
+	jobSetTemplate := &jobsetv1alpha2.JobSet{Spec: runtimeSpec.Template.Spec}
+	return r.framework.RunCustomValidationPlugins(jobSetTemplate.DeepCopy(), info, old, new)
 }
@@ -101,11 +101,11 @@ func (f *Framework) RunEnforcePodGroupPolicyPlugins(info *runtime.Info, trainJob
 	return nil
 }
 
-func (f *Framework) RunCustomValidationPlugins(oldObj, newObj *trainer.TrainJob) (admission.Warnings, field.ErrorList) {
+func (f *Framework) RunCustomValidationPlugins(runtimeJobTemplate client.Object, info *runtime.Info, oldObj, newObj *trainer.TrainJob) (admission.Warnings, field.ErrorList) {
 	var aggregatedWarnings admission.Warnings
 	var aggregatedErrors field.ErrorList
 	for _, plugin := range f.customValidationPlugins {
-		warnings, errs := plugin.Validate(oldObj, newObj)
+		warnings, errs := plugin.Validate(runtimeJobTemplate, info, oldObj, newObj)
 		if len(warnings) != 0 {
 			aggregatedWarnings = append(aggregatedWarnings, warnings...)
 		}
 
@@ -87,6 +87,7 @@ func TestNew(t *testing.T) {
 				customValidationPlugins: []framework.CustomValidationPlugin{
 					&mpi.MPI{},
 					&torch.Torch{},
+					&jobset.JobSet{},
 				},
 				watchExtensionPlugins: []framework.WatchExtensionPlugin{
 					&coscheduling.CoScheduling{},
@@ -379,7 +380,9 @@ func TestRunCustomValidationPlugins(t *testing.T) {
 			if err != nil {
 				t.Fatal(err)
 			}
-			warnings, errs := fwk.RunCustomValidationPlugins(tc.oldObj, tc.newObj)
+			runtimeInfo := runtime.NewInfo()
+			jobSetTemplate := testingutil.MakeJobSetWrapper(metav1.NamespaceDefault, "test")
+			warnings, errs := fwk.RunCustomValidationPlugins(jobSetTemplate, runtimeInfo, tc.oldObj, tc.newObj)
 			if diff := cmp.Diff(tc.wantWarnings, warnings, cmpopts.SortSlices(func(a, b string) bool { return a < b })); len(diff) != 0 {
 				t.Errorf("Unexpected warninigs (-want,+got):\n%s", diff)
 			}
 
@@ -21,6 +21,7 @@ import (
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/validation/field"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 
 	trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
@@ -33,7 +34,7 @@ type Plugin interface {
 
 type CustomValidationPlugin interface {
 	Plugin
-	Validate(oldObj, newObj *trainer.TrainJob) (admission.Warnings, field.ErrorList)
+	Validate(runtimeJobTemplate client.Object, info *runtime.Info, oldObj, newObj *trainer.TrainJob) (admission.Warnings, field.ErrorList)
 }
 
 type WatchExtensionPlugin interface {
 
@@ -27,12 +27,15 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	apiruntime "k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/apimachinery/pkg/util/validation/field"
 	metav1ac "k8s.io/client-go/applyconfigurations/meta/v1"
 	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/builder"
 	"sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 	jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 	jobsetv1alpha2ac "sigs.k8s.io/jobset/client-go/applyconfiguration/jobset/v1alpha2"
 
@@ -53,6 +56,7 @@ var _ framework.WatchExtensionPlugin = (*JobSet)(nil)
 var _ framework.PodNetworkPlugin = (*JobSet)(nil)
 var _ framework.ComponentBuilderPlugin = (*JobSet)(nil)
 var _ framework.TerminalConditionPlugin = (*JobSet)(nil)
+var _ framework.CustomValidationPlugin = (*JobSet)(nil)
 
 const Name = constants.JobSetKind
 
@@ -71,6 +75,49 @@ func (j *JobSet) Name() string {
 	return Name
 }
 
+// Validate verifies that the runtime's JobSet template provides the initializer job and the
+// initializer containers required by the model and dataset configuration of the new TrainJob.
+// A template that is not a JobSet is skipped without warnings or errors.
+func (j *JobSet) Validate(runtimeJobTemplate client.Object, runtimeInfo *runtime.Info, oldObj, newObj *trainer.TrainJob) (admission.Warnings, field.ErrorList) {
+	jobSet, isJobSet := runtimeJobTemplate.(*jobsetv1alpha2.JobSet)
+	if !isJobSet {
+		return nil, nil
+	}
+
+	// Collect the container names of every replicated job, keyed by the job name.
+	containersByJob := make(map[string]sets.Set[string])
+	for _, rJob := range jobSet.Spec.ReplicatedJobs {
+		names := sets.New[string]()
+		for _, ctr := range rJob.Template.Spec.Template.Spec.Containers {
+			names.Insert(ctr.Name)
+		}
+		containersByJob[rJob.Name] = names
+	}
+	initializerContainers, hasInitializer := containersByJob[constants.JobInitializer]
+
+	var allErrs field.ErrorList
+	runtimeRefPath := field.NewPath("spec").Child("runtimeRef")
+
+	if modelConfig := newObj.Spec.ModelConfig; modelConfig != nil && modelConfig.Input != nil {
+		switch {
+		case !hasInitializer:
+			allErrs = append(allErrs, field.Invalid(runtimeRefPath, newObj.Spec.RuntimeRef, fmt.Sprintf("must have %s job when trainJob is configured with input modelConfig", constants.JobInitializer)))
+		case !initializerContainers.Has(constants.ContainerModelInitializer):
+			allErrs = append(allErrs, field.Invalid(runtimeRefPath, newObj.Spec.RuntimeRef, fmt.Sprintf("must have container with name - %s in the %s job", constants.ContainerModelInitializer, constants.JobInitializer)))
+		}
+	}
+
+	if newObj.Spec.DatasetConfig != nil {
+		switch {
+		case !hasInitializer:
+			allErrs = append(allErrs, field.Invalid(runtimeRefPath, newObj.Spec.RuntimeRef, fmt.Sprintf("must have %s job when trainJob is configured with input datasetConfig", constants.JobInitializer)))
+		case !initializerContainers.Has(constants.ContainerDatasetInitializer):
+			allErrs = append(allErrs, field.Invalid(runtimeRefPath, newObj.Spec.RuntimeRef, fmt.Sprintf("must have container with name - %s in the %s job", constants.ContainerDatasetInitializer, constants.JobInitializer)))
+		}
+	}
+	return nil, allErrs
+}
+
 func (j *JobSet) ReconcilerBuilders() []runtime.ReconcilerBuilder {
 	if _, err := j.restMapper.RESTMapping(
 		schema.GroupKind{Group: jobsetv1alpha2.GroupVersion.Group, Kind: constants.JobSetKind},
 
@@ -31,6 +31,7 @@ import (
 
 	corev1 "k8s.io/api/core/v1"
 	apiruntime "k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/apimachinery/pkg/util/validation/field"
 	corev1ac "k8s.io/client-go/applyconfigurations/core/v1"
 	metav1ac "k8s.io/client-go/applyconfigurations/meta/v1"
@@ -75,13 +76,27 @@ func (m *MPI) Name() string {
 	return Name
 }
 
-// TODO: Need to implement validations for MPI Policy.
 // TODO (andreyvelich): Add validation to check that TrainJob doesn't have MPI envs.
 // TODO (andreyvelich): We should validate that envs from different plugins don't conflict with each other.
 // Ref: https://github.com/kubeflow/trainer/pull/2308#discussion_r1823229940
 
-func (m *MPI) Validate(oldObj, newObj *trainer.TrainJob) (admission.Warnings, field.ErrorList) {
-	return nil, nil
+// Validate reports an error when spec.trainer.numProcPerNode of the new TrainJob is set to a non-integer value.
+// It returns no errors when the runtime info is nil or carries no MPI policy.
+func (m *MPI) Validate(runtimeJobTemplate client.Object, runtimeInfo *runtime.Info, oldJobObj, newJobObj *trainer.TrainJob) (admission.Warnings, field.ErrorList) {
+	if runtimeInfo == nil || runtimeInfo.RuntimePolicy.MLPolicy == nil || runtimeInfo.RuntimePolicy.MLPolicy.MPI == nil {
+		return nil, nil
+	}
+	trainerSpec := newJobObj.Spec.Trainer
+	if trainerSpec == nil || trainerSpec.NumProcPerNode == nil {
+		return nil, nil
+	}
+
+	var allErrs field.ErrorList
+	if trainerSpec.NumProcPerNode.Type != intstr.Int {
+		numProcPerNodePath := field.NewPath("spec", "trainer", "numProcPerNode")
+		allErrs = append(allErrs, field.Invalid(numProcPerNodePath, trainerSpec.NumProcPerNode, "must have an int value"))
+	}
+	return nil, allErrs
 }
 
 func (m *MPI) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob) error {
Original file line number	Diff line number	Diff line change
`@@ -101,11 +101,11 @@ func (f *Framework) RunEnforcePodGroupPolicyPlugins(info *runtime.Info, trainJob`
`101`	`101`	`return nil`
`102`	`102`	`}`
`103`	`103`
`104`		`-func (f *Framework) RunCustomValidationPlugins(oldObj, newObj *trainer.TrainJob) (admission.Warnings, field.ErrorList) {`
	`104`	`+func (f *Framework) RunCustomValidationPlugins(runtimeJobTemplate client.Object, info *runtime.Info, oldObj, newObj *trainer.TrainJob) (admission.Warnings, field.ErrorList) {`
`105`	`105`	`var aggregatedWarnings admission.Warnings`
`106`	`106`	`var aggregatedErrors field.ErrorList`
`107`	`107`	`for _, plugin := range f.customValidationPlugins {`
`108`		`- warnings, errs := plugin.Validate(oldObj, newObj)`
	`108`	`+ warnings, errs := plugin.Validate(runtimeJobTemplate, info, oldObj, newObj)`
`109`	`109`	`if len(warnings) != 0 {`
`110`	`110`	`aggregatedWarnings = append(aggregatedWarnings, warnings...)`
`111`	`111`	`}`