Merge branch 'main' into feature/bump-aqua-version

mrDzurb · web-flow · commit 113df5c16a2d · 2025-08-27T16:15:50.000-07:00
diff --git a/ads/jobs/builders/infrastructure/dsc_job.py b/ads/jobs/builders/infrastructure/dsc_job.py
@@ -1751,6 +1751,7 @@ def is_multi_node_job(runtime):
         return (
             MULTI_NODE_JOB_SUPPORT
             and isinstance(runtime, MultiNodeRuntime)
+            and runtime.replica
             and runtime.replica > 1
         )
 
diff --git a/ads/jobs/builders/infrastructure/dsc_job_runtime.py b/ads/jobs/builders/infrastructure/dsc_job_runtime.py
@@ -365,6 +365,11 @@ def _get_node_group(self, dsc_job):
             dsc_job,
             "job_node_configuration_details.job_node_group_configuration_details_list",
         )
+        if node_groups is None:
+            node_groups = get_value(
+                dsc_job,
+                "job_node_configuration_details.jobNodeGroupConfigurationDetailsList",
+            )
         if node_groups and len(node_groups) == 1:
             return node_groups[0]
         return None
@@ -373,6 +378,7 @@ def _get_replica(self, dsc_job, envs):
         node_group = self._get_node_group(dsc_job)
         if node_group:
             replica = get_value(node_group, "replicas")
+            envs.pop(self.CONST_NODE_COUNT, None)
         elif not envs:
             replica = None
         elif self.CONST_WORKER_COUNT in envs:
@@ -399,7 +405,9 @@ def _extract_envs(self, dsc_job):
         env_attr = "job_configuration_details.environment_variables"
         node_group = self._get_node_group(dsc_job)
         if node_group:
-            envs = get_value(node_group, env_attr)
+            envs = get_value(node_group, env_attr) or get_value(
+                node_group, "jobConfigurationDetails.environment_variables"
+            )
         else:
             envs = get_value(dsc_job, env_attr)
         if envs:
diff --git a/ads/opctl/operator/lowcode/forecast/const.py b/ads/opctl/operator/lowcode/forecast/const.py
@@ -91,3 +91,4 @@ class ForecastOutputColumns(ExtendedEnum):
 AUTO_SELECT = "auto-select"
 AUTO_SELECT_SERIES = "auto-select-series"
 BACKTEST_REPORT_NAME = "back_test.csv"
+TROUBLESHOOTING_GUIDE = "https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/ai-operators/troubleshooting.md"
diff --git a/ads/opctl/operator/lowcode/forecast/errors.py b/ads/opctl/operator/lowcode/forecast/errors.py
@@ -4,6 +4,9 @@
 # Copyright (c) 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
+from ads.opctl.operator.lowcode.forecast.const import TROUBLESHOOTING_GUIDE
+
+
 class ForecastSchemaYamlError(Exception):
     """Exception raised when there is an issue with the schema."""
 
@@ -12,6 +15,7 @@ def __init__(self, error: str):
             "Invalid forecast operator specification. Check the YAML structure and ensure it "
             "complies with the required schema for forecast operator. \n"
             f"{error}"
+            f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
         )
 
 
@@ -23,4 +27,5 @@ def __init__(self, error: str):
             "Invalid input data. Check the input data and ensure it "
             "complies with the validation criteria. \n"
             f"{error}"
+            f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
         )
diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py
@@ -51,6 +51,7 @@
     SpeedAccuracyMode,
     SupportedMetrics,
     SupportedModels,
+    TROUBLESHOOTING_GUIDE,
 )
 from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec
 from .forecast_datasets import ForecastDatasets, ForecastResults
@@ -743,6 +744,7 @@ def _validate_automlx_explanation_mode(self):
             raise ValueError(
                 "AUTOMLX explanation accuracy mode is only supported for AutoMLX models. "
                 "Please select mode other than AUTOMLX from the available explanations_accuracy_mode options"
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
             )
 
     @runtime_dependency(
diff --git a/ads/opctl/operator/lowcode/forecast/model/factory.py b/ads/opctl/operator/lowcode/forecast/model/factory.py
@@ -4,7 +4,14 @@
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 from ads.opctl.operator.lowcode.common.transformations import Transformations
-from ..const import AUTO_SELECT, SpeedAccuracyMode, SupportedModels, AUTO_SELECT_SERIES
+
+from ..const import (
+    AUTO_SELECT,
+    AUTO_SELECT_SERIES,
+    TROUBLESHOOTING_GUIDE,
+    SpeedAccuracyMode,
+    SupportedModels,
+)
 from ..meta_selector import MetaSelector
 from ..model_evaluator import ModelEvaluator
 from ..operator_config import ForecastOperatorConfig
@@ -23,6 +30,7 @@ def __init__(self, model_type: str):
         super().__init__(
             f"Model: `{model_type}` "
             f"is not supported. Supported models: {SupportedModels.values()}"
+            f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
         )
 
 
diff --git a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py
@@ -18,7 +18,7 @@
     get_frequency_of_datetime,
 )
 
-from ..const import ForecastOutputColumns, SupportedModels
+from ..const import ForecastOutputColumns, SupportedModels, TROUBLESHOOTING_GUIDE
 from ..operator_config import ForecastOperatorConfig
 
 
@@ -49,7 +49,8 @@ def _verify_dt_col(self, spec):
                 f"{SupportedModels.AutoMLX} requires data with a frequency of at least one hour. Please try using a different model,"
                 " or select the 'auto' option."
             )
-            raise InvalidParameterError(message)
+            raise InvalidParameterError(f"{message}"
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps.")
 
 
 class AdditionalData(AbstractData):
@@ -65,10 +66,12 @@ def __init__(self, spec, historical_data, additional_data=None, subset=None):
             if historical_data.get_max_time() > add_dates[-spec.horizon]:
                 raise DataMismatchError(
                     f"The Historical Data ends on {historical_data.get_max_time()}. The additional data horizon starts on {add_dates[-spec.horizon]}. The horizon should have exactly {spec.horizon} dates after the Historical at a frequency of {historical_data.freq}"
+                    f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
                 )
             elif historical_data.get_max_time() != add_dates[-(spec.horizon + 1)]:
                 raise DataMismatchError(
                     f"The Additional Data must be present for all historical data and the entire horizon. The Historical Data ends on {historical_data.get_max_time()}. The additonal data horizon starts after {add_dates[-(spec.horizon+1)]}. These should be the same date."
+                    f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
                 )
         else:
             self.name = "additional_data"
@@ -215,6 +218,7 @@ def get_data_at_series(self, s_id, include_horizon=True):
         except Exception as e:
             raise InvalidParameterError(
                 f"Unable to retrieve series id: {s_id} from data. Available series ids are: {self.list_series_ids()}"
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
             ) from e
 
     def get_horizon_at_series(self, s_id):
@@ -296,6 +300,7 @@ def add_series_id(
         if not overwrite and series_id in self.series_id_map:
             raise ValueError(
                 f"Attempting to update ForecastOutput for series_id {series_id} when this already exists. Set overwrite to True."
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
             )
         forecast = self._check_forecast_format(forecast)
         self.series_id_map[series_id] = forecast
@@ -336,6 +341,7 @@ def populate_series_output(
         except KeyError as e:
             raise ValueError(
                 f"Attempting to update output for series: {series_id}, however no series output has been initialized."
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
             ) from e
 
         if (output_i.shape[0] - self.horizon) == len(fit_val):
@@ -356,18 +362,21 @@ def populate_series_output(
         if len(forecast_val) != self.horizon:
             raise ValueError(
                 f"Attempting to set forecast along horizon ({self.horizon}) for series: {series_id}, however forecast is only length {len(forecast_val)}"
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
             )
         output_i["forecast_value"].iloc[-self.horizon :] = forecast_val
 
         if len(upper_bound) != self.horizon:
             raise ValueError(
                 f"Attempting to set upper_bound along horizon ({self.horizon}) for series: {series_id}, however upper_bound is only length {len(upper_bound)}"
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
             )
         output_i[self.upper_bound_name].iloc[-self.horizon :] = upper_bound
 
         if len(lower_bound) != self.horizon:
             raise ValueError(
                 f"Attempting to set lower_bound along horizon ({self.horizon}) for series: {series_id}, however lower_bound is only length {len(lower_bound)}"
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
             )
         output_i[self.lower_bound_name].iloc[-self.horizon :] = lower_bound
 
diff --git a/ads/opctl/operator/lowcode/forecast/model_evaluator.py b/ads/opctl/operator/lowcode/forecast/model_evaluator.py
@@ -10,7 +10,10 @@
 from ads.opctl import logger
 from ads.opctl.operator.lowcode.common.const import DataColumns
 from ads.opctl.operator.lowcode.common.errors import InsufficientDataError
-from ads.opctl.operator.lowcode.forecast.const import BACKTEST_REPORT_NAME
+from ads.opctl.operator.lowcode.forecast.const import (
+    BACKTEST_REPORT_NAME,
+    TROUBLESHOOTING_GUIDE,
+)
 from ads.opctl.operator.lowcode.forecast.model.factory import SupportedModels
 
 from .model.forecast_datasets import ForecastDatasets
@@ -79,6 +82,7 @@ def generate_k_fold_data(
             raise InsufficientDataError(
                 "Insufficient data to evaluate multiple models. Please specify a model "
                 "instead of using auto-select."
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
             )
         training_datasets = [
             sampled_historical_data[sampled_historical_data[date_col] <= cut_off_date]
@@ -223,6 +227,7 @@ def find_best_model(
             model = SupportedModels.Prophet
             logger.error(
                 f"Running {model} model as auto-select failed with the following error: {e.message}"
+                f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."
             )
             return model
         nonempty_metrics = {
diff --git a/ads/pipeline/ads_pipeline.py b/ads/pipeline/ads_pipeline.py
@@ -1728,15 +1728,19 @@ def __step_details(self, pipeline_details: Dict) -> list:
 
     def __step_infrastructure_configuration_details(self, step) -> dict:
         step_infrastructure_configuration_details = {}
-        step_infrastructure_configuration_details[
-            "blockStorageSizeInGBs"
-        ] = step.infrastructure.block_storage_size
-        step_infrastructure_configuration_details[
-            "shapeName"
-        ] = step.infrastructure.shape_name
-        step_infrastructure_configuration_details[
-            "shapeConfigDetails"
-        ] = step.infrastructure.shape_config_details
+        step_infrastructure_configuration_details["blockStorageSizeInGBs"] = (
+            step.infrastructure.block_storage_size
+        )
+        step_infrastructure_configuration_details["shapeName"] = (
+            step.infrastructure.shape_name
+        )
+        step_infrastructure_configuration_details["shapeConfigDetails"] = (
+            step.infrastructure.shape_config_details
+        )
+        if getattr(step.infrastructure, "subnet_id", ""):
+            step_infrastructure_configuration_details["subnetId"] = (
+                step.infrastructure.subnet_id
+            )
         return step_infrastructure_configuration_details
 
     def __step_configuration_details(self, pipeline_details: Dict, step) -> dict:
diff --git a/tests/unitary/default_setup/jobs/test_jobs_pytorch_ddp.py b/tests/unitary/default_setup/jobs/test_jobs_pytorch_ddp.py
@@ -10,6 +10,9 @@
 from unittest import mock
 
 from ads.jobs import DataScienceJob, DataScienceJobRun, PyTorchDistributedRuntime
+from ads.jobs.builders.infrastructure.dsc_job_runtime import (
+    MULTI_NODE_JOB_SUPPORT,
+)
 from ads.jobs.builders.infrastructure.dsc_job_runtime import (
     PyTorchDistributedRuntimeHandler as Handler,
 )
@@ -133,23 +136,26 @@ def test_create_job_runs(self, patched_run, *args):
         self.assertIsInstance(main_run, DataScienceJobRun)
         self.assertEqual(main_run.id, test_ocid)
         kwarg_list = [call_args.kwargs for call_args in patched_run.call_args_list]
-        self.assertEqual(
-            kwarg_list,
-            [
-                {
-                    "display_name": "None-0",
-                    "environment_variables": {"NODE_RANK": "0", "NODE_COUNT": "2"},
-                },
-                {
-                    "display_name": "None-1",
-                    "environment_variables": {
-                        "NODE_RANK": "1",
-                        "NODE_COUNT": "2",
-                        "MAIN_JOB_RUN_OCID": test_ocid,
+        if MULTI_NODE_JOB_SUPPORT:
+            self.assertEqual(kwarg_list, [{}])
+        else:
+            self.assertEqual(
+                kwarg_list,
+                [
+                    {
+                        "display_name": "None-0",
+                        "environment_variables": {"NODE_RANK": "0", "NODE_COUNT": "2"},
                     },
-                },
-            ],
-        )
+                    {
+                        "display_name": "None-1",
+                        "environment_variables": {
+                            "NODE_RANK": "1",
+                            "NODE_COUNT": "2",
+                            "MAIN_JOB_RUN_OCID": test_ocid,
+                        },
+                    },
+                ],
+            )
 
     @mock.patch.dict(
         os.environ, {utils.CONST_ENV_INPUT_MAPPINGS: json.dumps({INPUT_SRC: INPUT_DST})}

Original file line number	Diff line number	Diff line change
`@@ -1751,6 +1751,7 @@ def is_multi_node_job(runtime):`
`1751`	`1751`	`return (`
`1752`	`1752`	`MULTI_NODE_JOB_SUPPORT`
`1753`	`1753`	`and isinstance(runtime, MultiNodeRuntime)`
	`1754`	`+ and runtime.replica`
`1754`	`1755`	`and runtime.replica > 1`
`1755`	`1756`	`)`
`1756`	`1757`
Original file line number	Diff line number	Diff line change
`@@ -51,6 +51,7 @@`
`51`	`51`	`SpeedAccuracyMode,`
`52`	`52`	`SupportedMetrics,`
`53`	`53`	`SupportedModels,`
	`54`	`+ TROUBLESHOOTING_GUIDE,`
`54`	`55`	`)`
`55`	`56`	`from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec`
`56`	`57`	`from .forecast_datasets import ForecastDatasets, ForecastResults`
`@@ -743,6 +744,7 @@ def _validate_automlx_explanation_mode(self):`
`743`	`744`	`raise ValueError(`
`744`	`745`	`"AUTOMLX explanation accuracy mode is only supported for AutoMLX models. "`
`745`	`746`	`"Please select mode other than AUTOMLX from the available explanations_accuracy_mode options"`
	`747`	`+ f"\nPlease refer to the troubleshooting guide at {TROUBLESHOOTING_GUIDE} for resolution steps."`
`746`	`748`	`)`
`747`	`749`
`748`	`750`	`@runtime_dependency(`