Description
Describe the bug
SageMaker Local lets you train models using data stored either on your local machine or in S3. When using `ModelTrainer`, these artifacts are stored in a local directory under `local_container_root`, where, for each channel, a `TemporaryDirectory` is created and the files are copied into it.
However, I've noticed that if the input data is a single S3 file, the temporary directory is not created, leading to a "File or directory not found" error.
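I found that adding an `os.makedirs(local_dir, exist_ok=True)` call in `_get_data_source_local_path` (`sagemaker/modules/local_core/local_container.py`), before the `download_folder(...)` call, fixes the problem, but there's probably a cleaner way.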
To reproduce
```python
trainer = ModelTrainer(
    training_mode=Mode.LOCAL_CONTAINER,
    local_container_root='.smlocal',
    ...
)
trainer.train(
    input_data_config=[
        InputData(channel_name="input", data_source="s3://my-bucket/my-single-file.csv")
    ],
)
```
Expected behavior
The channel's temporary directory is created and the single S3 file is downloaded into it, so training can proceed.
Screenshots or logs
See stack trace below:
Stack trace (click to expand)
│ ❱ 153 │ trainer = start_training_job( │ │ 154 │ │ num=num, │ │ 155 │ │ data_config=input_data, │ │ 156 │ │ config=config_dict, │ │ │ │ /Users/aigars/workspace/capture-v2/sm/main.py:76 in start_training_job │ │ │ │ 73 │ │ # instance_count=1, │ │ 74 │ │ # metric_definitions=METRIC_DEFINITIONS, │ │ 75 │ ) │ │ ❱ 76 │ trainer.train( │ │ 77 │ │ input_data_config=data_config, │ │ 78 │ │ wait=not run_in_background, # whether to wait for job to finish │ │ 79 │ │ logs=not run_in_background, # whether to log job logs to stdout │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/te │ │ lemetry/telemetry_logging.py:175 in wrapper │ │ │ │ 172 │ │ │ │ │ "sagemaker_session is not provided or not valid.", │ │ 173 │ │ │ │ │ func_name, │ │ 174 │ │ │ │ ) │ │ ❱ 175 │ │ │ │ return func(*args, **kwargs) │ │ 176 │ │ │ │ 177 │ │ return wrapper │ │ 178 │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/pydantic/val │ │ idate_call_decorator.py:60 in wrapper_function │ │ │ │ 57 │ │ │ │ 58 │ │ @functools.wraps(function) │ │ 59 │ │ def wrapper_function(*args, **kwargs): │ │ ❱ 60 │ │ │ return validate_call_wrapper(*args, **kwargs) │ │ 61 │ │ │ │ 62 │ │ wrapper_function.raw_function = function # type: ignore │ │ 63 │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/pydantic/_in │ │ ternal/_validate_call.py:96 in __call__ │ │ │ │ 93 │ │ │ self.__return_pydantic_validator__ = None │ │ 94 │ │ │ 95 │ def __call__(self, *args: Any, **kwargs: Any) -> Any: │ │ ❱ 96 │ │ res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, │ │ 97 │ │ if self.__return_pydantic_validator__: │ │ 98 │ │ │ return self.__return_pydantic_validator__(res) │ │ 99 │ │ return res │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/mo │ │ dules/train/model_trainer.py:649 in train │ │ │ │ 646 │ │ │ │ 
hyper_parameters=string_hyper_parameters, │ │ 647 │ │ │ │ environment=self.environment, │ │ 648 │ │ │ ) │ │ ❱ 649 │ │ │ local_container.train(wait) │ │ 650 │ │ │ 651 │ def create_input_data_channel( │ │ 652 │ │ self, channel_name: str, data_source: DataSourceType, key_prefix: Optional[str] │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/mo │ │ dules/local_core/local_container.py:166 in train │ │ │ │ 163 │ │ │ │ 164 │ │ data_dir = os.path.join(self.container_root, "input", "data") │ │ 165 │ │ os.makedirs(data_dir, exist_ok=True) │ │ ❱ 166 │ │ volumes = self._prepare_training_volumes( │ │ 167 │ │ │ data_dir, self.input_data_config, self.hyper_parameters │ │ 168 │ │ ) │ │ 169 │ │ # If local, source directory needs to be updated to mounted /opt/ml/code path │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/mo │ │ dules/local_core/local_container.py:519 in _prepare_training_volumes │ │ │ │ 516 │ │ │ channel_dir = os.path.join(data_dir, channel_name) │ │ 517 │ │ │ os.makedirs(channel_dir, exist_ok=True) │ │ 518 │ │ │ │ │ ❱ 519 │ │ │ data_source_local_path = self._get_data_source_local_path(channel.data_sourc │ │ 520 │ │ │ volumes.append(_Volume(data_source_local_path, channel=channel_name).map) │ │ 521 │ │ │ │ 522 │ │ # If there is a training script directory and it is a local directory, │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/mo │ │ dules/local_core/local_container.py:555 in _get_data_source_local_path │ │ │ │ 552 │ │ │ # make sure local_dir exists │ │ 553 │ │ │ # os.makedirs(local_dir, exist_ok=True) │ │ 554 │ │ │ self._temporary_folders.append(local_dir) │ │ ❱ 555 │ │ │ download_folder(parsed_uri.netloc, parsed_uri.path, local_dir, self.sagemake │ │ 556 │ │ │ return local_dir │ │ 557 │ │ else: │ │ 558 │ │ │ return os.path.abspath(data_source.file_system_data_source.directory_path) │ │ │ │ 
/opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/ut │ │ ils.py:410 in download_folder │ │ │ │ 407 │ if not prefix.endswith("/"): │ │ 408 │ │ try: │ │ 409 │ │ │ file_destination = os.path.join(target, os.path.basename(prefix)) │ │ ❱ 410 │ │ │ s3.Object(bucket_name, prefix).download_file(file_destination) │ │ 411 │ │ │ return │ │ 412 │ │ except botocore.exceptions.ClientError as e: │ │ 413 │ │ │ err_info = e.response["Error"] │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/boto3/s3/inj │ │ ect.py:361 in object_download_file │ │ │ │ 358 │ :param Config: The transfer configuration to be used when performing the │ │ 359 │ │ transfer. │ │ 360 │ """ │ │ ❱ 361 │ return self.meta.client.download_file( │ │ 362 │ │ Bucket=self.bucket_name, │ │ 363 │ │ Key=self.key, │ │ 364 │ │ Filename=Filename, │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/boto3/s3/inj │ │ ect.py:192 in download_file │ │ │ │ 189 │ │ transfer. 
│ │ 190 │ """ │ │ 191 │ with S3Transfer(self, Config) as transfer: │ │ ❱ 192 │ │ return transfer.download_file( │ │ 193 │ │ │ bucket=Bucket, │ │ 194 │ │ │ key=Key, │ │ 195 │ │ │ filename=Filename, │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/boto3/s3/tra │ │ nsfer.py:406 in download_file │ │ │ │ 403 │ │ │ bucket, key, filename, extra_args, subscribers │ │ 404 │ │ ) │ │ 405 │ │ try: │ │ ❱ 406 │ │ │ future.result() │ │ 407 │ │ # This is for backwards compatibility where when retries are │ │ 408 │ │ # exceeded we need to throw the same error from boto3 instead of │ │ 409 │ │ # s3transfer's built in RetriesExceededError as current users are │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/f │ │ utures.py:103 in result │ │ │ │ 100 │ │ │ # Usually the result() method blocks until the transfer is done, │ │ 101 │ │ │ # however if a KeyboardInterrupt is raised we want want to exit │ │ 102 │ │ │ # out of this and propagate the exception. │ │ ❱ 103 │ │ │ return self._coordinator.result() │ │ 104 │ │ except KeyboardInterrupt as e: │ │ 105 │ │ │ self.cancel() │ │ 106 │ │ │ raise e │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/f │ │ utures.py:264 in result │ │ │ │ 261 │ │ # Once done waiting, raise an exception if present or return the │ │ 262 │ │ # final result. │ │ 263 │ │ if self._exception: │ │ ❱ 264 │ │ │ raise self._exception │ │ 265 │ │ return self._result │ │ 266 │ │ │ 267 │ def cancel(self, msg='', exc_type=CancelledError): │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/t │ │ asks.py:135 in __call__ │ │ │ │ 132 │ │ │ # task to the TransferFuture had failed) then execute the task's │ │ 133 │ │ │ # main() method. 
│ │ 134 │ │ │ if not self._transfer_coordinator.done(): │ │ ❱ 135 │ │ │ │ return self._execute_main(kwargs) │ │ 136 │ │ except Exception as e: │ │ 137 │ │ │ self._log_and_set_exception(e) │ │ 138 │ │ finally: │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/t │ │ asks.py:158 in _execute_main │ │ │ │ 155 │ │ # Log what is about to be executed. │ │ 156 │ │ logger.debug(f"Executing task {self} with kwargs {kwargs_to_display}") │ │ 157 │ │ │ │ ❱ 158 │ │ return_value = self._main(**kwargs) │ │ 159 │ │ # If the task is the final task, then set the TransferFuture's │ │ 160 │ │ # value to the return value from main(). │ │ 161 │ │ if self._is_final: │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/d │ │ ownload.py:640 in _main │ │ │ │ 637 │ │ :param data: The data to write │ │ 638 │ │ :param offset: The offset to write the data to. │ │ 639 │ │ """ │ │ ❱ 640 │ │ fileobj.seek(offset) │ │ 641 │ │ fileobj.write(data) │ │ 642 │ │ 643 │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/u │ │ tils.py:393 in seek │ │ │ │ 390 │ │ self._fileobj.write(data) │ │ 391 │ │ │ 392 │ def seek(self, where, whence=0): │ │ ❱ 393 │ │ self._open_if_needed() │ │ 394 │ │ self._fileobj.seek(where, whence) │ │ 395 │ │ │ 396 │ def tell(self): │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/u │ │ tils.py:376 in _open_if_needed │ │ │ │ 373 │ │ │ 374 │ def _open_if_needed(self): │ │ 375 │ │ if self._fileobj is None: │ │ ❱ 376 │ │ │ self._fileobj = self._open_function(self._filename, self._mode) │ │ 377 │ │ │ if self._start_byte != 0: │ │ 378 │ │ │ │ self._fileobj.seek(self._start_byte) │ │ 379 │ │ │ │ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/u │ │ tils.py:287 in open │ │ │ │ 284 │ │ ) │ │ 285 │ │ │ 286 │ def open(self, filename, mode): │ │ ❱ 
287 │ │ return open(filename, mode) │ │ 288 │ │ │ 289 │ def remove_file(self, filename): │ │ 290 │ │ """Remove a file, noop if file does not exist."""
System information
A description of your system. Please provide:
- SageMaker Python SDK version: 2.239.0
- Framework name (eg. PyTorch) or algorithm (eg. KMeans): n/a
- Framework version: n/a
- Python version: 3.12
- CPU or GPU: CPU
- Custom Docker image (Y/N): Y
Additional context
Add any other context about the problem here.