diff --git a/.env.example b/.env.example index e3728e89..abb006d8 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,15 @@ -GLOBUS_CLIENT_ID= -GLOBUS_CLIENT_SECRET= -PREFECT_API_URL= -PREFECT_API_KEY= -PUSHGATEWAY_URL= -JOB_NAME= -INSTANCE_LABEL= \ No newline at end of file +GLOBUS_CLIENT_ID= # For Globus Transfer +GLOBUS_CLIENT_SECRET= # For Globus Transfer +GLOBUS_COMPUTE_CLIENT_ID= # For ALCF Jobs +GLOBUS_COMPUTE_CLIENT_SECRET= # For ALCF Jobs +GLOBUS_COMPUTE_ENDPOINT= # For ALCF Jobs +PREFECT_API_URL= # For Prefect Flows +PREFECT_API_KEY= # For Prefect Flows +SCICAT_API_URL= # For SciCat Ingest +SCICAT_INGEST_USER= # For SciCat Ingest +SCICAT_INGEST_PASSWORD= # For SciCat Ingest +PATH_NERSC_CLIENT_ID= # For NERSC SFAPI +PATH_NERSC_PRI_KEY= # For NERSC SFAPI +PUSHGATEWAY_URL= # For Grafana Pushgateway +JOB_NAME= # For Grafana Pushgateway +INSTANCE_LABEL= # For Grafana Pushgateway diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 1164dede..e160e455 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -12,10 +12,10 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python 3.11 + - name: Set up Python 3.12.5 uses: actions/setup-python@v5 with: - python-version: 3.11 + python-version: 3.12.5 cache: 'pip' - name: Install dependencies run: | diff --git a/README.md b/README.md index d1745784..ba1e4453 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,18 @@ $ pip3 install -e . Use `.env.example` as a template. ``` -GLOBUS_CLIENT_ID= -GLOBUS_CLIENT_SECRET= -PREFECT_API_URL= -PREFECT_API_KEY= +GLOBUS_CLIENT_ID= # For Globus Transfer +GLOBUS_CLIENT_SECRET= # For Globus Transfer +GLOBUS_COMPUTE_CLIENT_ID= # For ALCF Jobs +GLOBUS_COMPUTE_CLIENT_SECRET= # For ALCF Jobs +GLOBUS_COMPUTE_ENDPOINT= # For ALCF Jobs +PREFECT_API_URL= # For Prefect Flows +PREFECT_API_KEY= # For Prefect Flows +SCICAT_API_URL= # For SciCat Ingest +SCICAT_INGEST_USER= # For SciCat Ingest +SCICAT_INGEST_PASSWORD= # For SciCat Ingest +PATH_NERSC_CLIENT_ID= # For NERSC SFAPI, generate on https://iris.nersc.gov/ +PATH_NERSC_PRI_KEY= # For NERSC SFAPI ``` ## Current workflow overview and status: diff --git a/config.yml b/config.yml index e9936adc..d17d856f 100644 --- a/config.yml +++ b/config.yml @@ -107,6 +107,11 @@ globus: client_id: ${GLOBUS_CLIENT_ID} client_secret: ${GLOBUS_CLIENT_SECRET} +hpss_alsdev: + root_path: /home/a/alsdev/data_mover + uri: nersc.gov + name: hpss_alsdev + harbor_images832: recon_image: tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c multires_image: tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c diff --git a/create_deployment_832_dispatcher.sh b/create_deployment_832_dispatcher.sh index afaae432..6ab20cb5 100755 --- a/create_deployment_832_dispatcher.sh +++ b/create_deployment_832_dispatcher.sh @@ -3,4 +3,14 @@ export $(grep -v '^#' .env | xargs) prefect work-pool create 'dispatcher_pool' prefect deployment build ./orchestration/flows/bl832/dispatcher.py:dispatcher -n run_832_dispatcher -q bl832 -p dispatcher_pool -prefect deployment apply dispatcher-deployment.yaml \ No newline at end of file +prefect deployment apply dispatcher-deployment.yaml + +prefect work-pool create 'hpss_pool' +prefect deployment build ./orchestration/flows/bl832/dispatcher.py:archive_832_project_dispatcher -n run_archive_832_project_dispatcher -q hpss_dispatcher_queue -p hpss_pool +prefect deployment apply 
archive_832_project_dispatcher-deployment.yaml + +prefect deployment build ./orchestration/flows/bl832/dispatcher.py:archive_832_projects_from_previous_cycle_dispatcher -n run_archive_832_projects_from_previous_cycle_dispatcher -q hpss_dispatcher_queue -p hpss_pool +prefect deployment apply archive_832_projects_from_previous_cycle_dispatcher-deployment.yaml + +prefect deployment build ./orchestration/flows/bl832/dispatcher.py:archive_all_832_raw_projects_dispatcher -n run_archive_all_832_raw_projects_dispatcher -q hpss_dispatcher_queue -p hpss_pool +prefect deployment apply archive_all_832_raw_projects_dispatcher-deployment.yaml \ No newline at end of file diff --git a/create_deployment_832_hpss.sh b/create_deployment_832_hpss.sh new file mode 100644 index 00000000..972b5dba --- /dev/null +++ b/create_deployment_832_hpss.sh @@ -0,0 +1,7 @@ +export $(grep -v '^#' .env | xargs) + + +prefect work-pool create 'hpss_pool' + +prefect deployment build ./orchestration/flows/bl832/hpss.py:cfs_to_hpss_flow -n cfs_to_hpss_flow -q cfs_to_hpss_queue -p hpss_pool +prefect deployment apply cfs_to_hpss_flow-deployment.yaml \ No newline at end of file diff --git a/docs/bl832_ALCF.md b/docs/bl832_ALCF.md index b6f9f8c0..6eda4975 100644 --- a/docs/bl832_ALCF.md +++ b/docs/bl832_ALCF.md @@ -428,7 +428,7 @@ def prune_alcf832_raw(relative_path: str): prune_one_safe( file=relative_path, if_older_than_days=0, - tranfer_client=tc, + transfer_client=tc, source_endpoint=config.alcf832_raw, check_endpoint=config.nersc832_alsdev_raw, logger=p_logger, diff --git a/docs/mkdocs/docs/733.md b/docs/mkdocs/docs/733.md new file mode 100644 index 00000000..74f2fc0f --- /dev/null +++ b/docs/mkdocs/docs/733.md @@ -0,0 +1,88 @@ +# Beamline 7.3.3 + + +## Flow Diagram +```mermaid +sequenceDiagram + participant DET as Detector/
File Watcher + participant DISP as Prefect
Dispatcher + participant D733 as data733
Storage + participant GLOB as Globus
Transfer + participant CFS as NERSC
CFS + participant CAT as SciCat
Metadata + participant SFAPI as SFAPI + participant HPC as HPC
Compute + participant HPSS as HPSS
Tape + + %% Initial Trigger + DET->>DET: Monitor filesystem + DET->>DISP: Trigger on new file + DISP->>DISP: Coordinate flows + + %% Flow 1: new_file_733 + rect rgb(220, 230, 255) + note over DISP,CAT: FLOW 1: new_file_733 + DISP->>GLOB: Init transfer + activate GLOB + GLOB->>D733: Initiate copy + activate D733 + D733-->>GLOB: Copy initiated + deactivate D733 + %% note right of GLOB: Transfer in progress + GLOB-->>DISP: Transfer complete + deactivate GLOB + + DISP->>CAT: Register metadata + end + + %% Flow 2: HPSS Transfer + rect rgb(220, 255, 230) + note over DISP,CAT: FLOW 2: Scheduled HPSS Transfer + DISP->>SFAPI: Submit tape job + activate SFAPI + SFAPI->>HPSS: Initiate archive + activate HPSS + HPSS-->>SFAPI: Archive complete + deactivate HPSS + SFAPI-->>DISP: Job complete + deactivate SFAPI + + DISP->>CAT: Update metadata + end + + %% Flow 3: HPC Analysis + rect rgb(255, 230, 230) + note over DISP,HPC: FLOW 3: HPC Downstream Analysis + DISP->>SFAPI: Submit compute job + activate SFAPI + SFAPI->>HPC: Execute job + activate HPC + HPC->>HPC: Process data + HPC-->>SFAPI: Compute complete + deactivate HPC + SFAPI-->>DISP: Job complete + deactivate SFAPI + + DISP->>CAT: Update metadata + end + + %% Flow 4: Scheduled Pruning + rect rgb(255, 255, 220) + note over DISP,CAT: FLOW 4: Scheduled Pruning + DISP->>DISP: Scheduled pruning trigger + + DISP->>D733: Prune old files + activate D733 + D733->>D733: Delete expired data + D733-->>DISP: Pruning complete + deactivate D733 + + DISP->>CFS: Prune old files + activate CFS + CFS->>CFS: Delete expired data + CFS-->>DISP: Pruning complete + deactivate CFS + + DISP->>CAT: Update metadata + end + ``` \ No newline at end of file diff --git a/docs/mkdocs/docs/alcf832.md b/docs/mkdocs/docs/alcf832.md index 3774832f..3d441e0d 100644 --- a/docs/mkdocs/docs/alcf832.md +++ b/docs/mkdocs/docs/alcf832.md @@ -389,7 +389,7 @@ def prune_alcf832_raw(relative_path: str): prune_one_safe( file=relative_path, if_older_than_days=0, - tranfer_client=tc, + transfer_client=tc, source_endpoint=config.alcf832_raw, check_endpoint=config.nersc832_alsdev_raw, logger=p_logger, diff --git a/docs/mkdocs/docs/assets/images/sfapi_step1.png b/docs/mkdocs/docs/assets/images/sfapi_step1.png new file mode 100644 index 00000000..7c1c3d7f Binary files /dev/null and b/docs/mkdocs/docs/assets/images/sfapi_step1.png differ diff --git a/docs/mkdocs/docs/assets/images/sfapi_step2.png b/docs/mkdocs/docs/assets/images/sfapi_step2.png new file mode 100644 index 00000000..3f127a6c Binary files /dev/null and b/docs/mkdocs/docs/assets/images/sfapi_step2.png differ diff --git a/docs/mkdocs/docs/assets/images/sfapi_step3.png b/docs/mkdocs/docs/assets/images/sfapi_step3.png new file mode 100644 index 00000000..ac152b69 Binary files /dev/null and b/docs/mkdocs/docs/assets/images/sfapi_step3.png differ diff --git a/docs/mkdocs/docs/assets/images/sfapi_step4.png b/docs/mkdocs/docs/assets/images/sfapi_step4.png new file mode 100644 index 00000000..9c67f2b6 Binary files /dev/null and b/docs/mkdocs/docs/assets/images/sfapi_step4.png differ diff --git a/docs/mkdocs/docs/common_infrastructure.md b/docs/mkdocs/docs/common_infrastructure.md new file mode 100644 index 00000000..3f448d56 --- /dev/null +++ b/docs/mkdocs/docs/common_infrastructure.md @@ -0,0 +1,47 @@ +# Common Infrastructure + +## Overview +The common infrastructure for this project includes: +- **Shared Code**: There are general functions and classes used across beamline workflows to reduce code duplication. 
+- **Beamline Specific Implementation Patterns**: We organize each beamline's implementation in a similar way, making it easier to understand and maintain. + +## Shared Code +Shared code is organized into modules that can be imported in beamline specific implementations. Key modules include: +- **`orchestration/config.py`** + - Contains an Abstract Base Class (ABC) called `BeamlineConfig()` which serves as the base for all beamline-specific configuration classes. It uses the `Dynaconf` package to load the configuration file, `config.yml`, which contains information about endpoints, containers, and more. +- **`orchestration/transfer_endpoints.py`** + - Contains an ABC called `TransferEndpoint()`, which is extended by `FileSystemEndpoint`, `HPSSEndpoint` and `GlobusEndpoint`. These definitions are used to enforce typing and ensure the correct transfer and pruning implementations are used. +- **`orchestration/transfer_controller.py`**: + - Contains an ABC called `TransferController()` with specific implementations for Globus, Local File Systems, and NERSC HPSS. +- **`orchestration/prune_controller.py`** + - This module is responsible for managing the pruning of data off of storage systems. It uses a configurable retention policy to determine when to remove files. It contains an ABC called `PruneController()` that is extended by specific implementations for `FileSystemEndpoint`, `GlobusEndpoint`, and `HPSSEndpoint`. +- **`orchestration/sfapi.py`**: Create an SFAPI Client to launch remote jobs at NERSC. +- **`orchestration/flows/scicat/ingest.py`**: Ingests datasets into SciCat, our metadata management system. +- **`orchestration/hpss.py`**: Schedule a Prefect Flow to copy data between NERSC CFS and HPSS. These call the relevant TransferControllers for HPSS, which handle the underlying tape-safe logic. + + +## Beamline Specific Implementation Patterns +In order to balance generalizability, maintainability, and scalability of this project to multiple beamlines, we try to organize specific implementations in a similar way. We keep specific implementations in the directory `orchestration/flows/bl{beamline_id}/`, which generally contains a few things: +- **`config.py`** + - Extend `BeamlineConfig()` from `orchestration/config.py` for specific implementations (e.g. `Config832`, `Config733`, etc.). This ensures only the relevant beamline specific configurations are used in each case. +- **`dispatcher.py`** + - This script is the starting point for each beamline's data transfer and analysis workflow. The Prefect Flow it contains is generally invoked by a File Watcher script on the beamline computer. The Dispatcher contains the logic for calling subflows, ensures that steps are completed in the correct order, and prevents subsequent steps from being called if there is a failure along the way. +- **`move.py`** + - This script is usually the first one the Dispatcher calls synchronously, and contains the logic for immediately moving data, scheduling pruning flows, and ingesting into SciCat. Downstream steps typically rely on this action completing first. +- **`job_controller.py`** + - For beamlines that trigger remote analysis workflows, the `JobController()` ABC allows us to define HPC or machine specific implementations, which may differ in how code can be deployed. For example, it can be extended to define how to run tomography reconstruction at ALCF and NERSC. +- **`{hpc}.py`** + - We separate HPC implementations for `JobController()` in their own files.
+- **`ingest.py`** + - This is where we define SciCat implementations for each beamline, as each technique will have specific metadata fields that are important to capture. + +## Testing +We write Unit Tests using [pytest](https://pytest.org/) for individual components, which can be found in `orchestration/_tests/`. We run these tests as part of our GitHub Actions. + +## CI/CD +The project is integrated with [GitHub Actions](https://github.com/features/actions) for continuous integration and deployment. The specifics for these can be found in `.github/workflows/`. The features we support here include: + +- **Automated Test Execution**: All the unit tests are run automatically with every Git Push. +- **Linting**: `flake8` is used to check for syntax and styling errors. +- **MkDocs**: The documentation site is automatically updated whenever a Pull Request is merged into the main branch. +- **Docker**: A Docker image is automatically created and registered on the GitHub Container Registry (ghcr.io) when a new release is made. \ No newline at end of file diff --git a/docs/mkdocs/docs/hpss.md b/docs/mkdocs/docs/hpss.md new file mode 100644 index 00000000..6c56164b --- /dev/null +++ b/docs/mkdocs/docs/hpss.md @@ -0,0 +1,410 @@ + + +# Developing for the High Performance Storage System (HPSS) at NERSC + +HPSS is the tape-based data storage system we use for long term storage of experimental data at the ALS. Tape storage, while it may seem antiquated, is still a very economical and secure medium for infrequently accessed data, as tape does not need to be powered except for reading and writing. This requires certain considerations when working with this system. + + +## Overview + +**Purpose:** Archive and retrieve large experimental datasets using HPSS. +**Approach:** Use HPSS tools (hsi and htar) within a structured transfer framework orchestrated via SFAPI and SLURM jobs. +**Key Considerations:** File sizes should typically be between 100 GB and 2 TB. Larger projects are segmented into multiple archives. + +### "User" in this context + +It is important to clarify who users are when we talk about transferring to tape. In terms of the flows we support, that includes beamline scientists, visiting users, and computing staff. In this context, it's important to differentiate between who is collecting the data and who is doing the work of moving to and from tape. + +**NERSC Users** + - Can move data to and from HPSS via `htar` and `hsi` commands on Perlmutter in a terminal, in Jupyter, or in a script via SFAPI as outlined below. + - There are limitations and caveats to interacting with the tape system that users should be aware of. + +**ALS Users** + - Generate data! + - Sometimes they are also NERSC users, and can move data to HPSS if they want. + - Either way, we support the long term storage of data that they collect by archiving it on HPSS. + +**Splash Flows Globus Users** + - Can use the Prefect Flows and Slurm scripts provided to help perform transfers to HPSS in an automated way. + - Have transparent and reproducible knowledge of where data is stored on tape. + - Perform transfers that bundle data in a way that is optimized for tape storage and retrieval. + - Apply the same approach across different beamlines. + +**Collaboration Account** + - We use a collaboration account at NERSC for automating our transfers. This "service" user can perform the same set of tasks as other NERSC users, but has wider access to data systems. ALS Users benefit from, but do not directly interact with, this account. 
For more information, see the NERSC [documentation](https://docs.nersc.gov/accounts/collaboration_accounts/). To see how to create a NERSC SFAPI Client, check out the [Run Tomography Reconstruction Remotely at NERSC](nersc832.md) example. + +In `orchestration/transfer_controller.py` we have included two transfer classes for moving data from CFS to HPSS and vice versa (HPSS to CFS). We are following the [HPSS best practices](https://docs.nersc.gov/filesystems/HPSS-best-practices/) outlined in the NERSC documentation. + +HPSS is intended for long-term storage of data that is not frequently accessed, and users should aim for file sizes between 100 GB and 2 TB. Since HPSS is a tape system, we need to ensure storage and retrieval commands are done efficiently, as it is a mechanical process to load in a tape and then scroll to the correct region on the tape. + +While there are Globus endpoints for HPSS, the NERSC documentation recommends against using them, as they do not handle certain conditions (i.e. network disconnection) as robustly as the recommended HPSS tools `hsi` and `htar`, which NERSC says are the fastest approach. Together, these tools allow us to work with the HPSS filesystem and carefully bundle our projects into `tar` archives that are built directly on HPSS. Another couple of drawbacks to using Globus here are that 1) if you have small files, you need to tar them regardless before transferring, and 2) HPSS does not support collab accounts (i.e. alsdev). + +### Storing and Retrieving from Tape +#### Important note about retrieval + +In production, we are using the `alsdev` collab account. To run these commands, you must have a valid SFAPI client/key pair from Iris at NERSC in your environment. + +**Files are stored on HPSS in the following location:** +- `/home/a/alsdev/data_mover` + +**Data retrieved from tape will be found here on NERSC CFS:** +- `/global/cfs/cdirs/als/data_mover/8.3.2/retrieved_from_tape` + +**Logs about data transfers to/from tape are organized here on CFS:** +- `/global/cfs/cdirs/als/data_mover/hpss_transfer_logs/{beamline_id}` +- Find details about whether files were stored via hsi or htar, and what the path is on HPSS, etc. + +----------------------- + + +In `orchestration/hpss.py` there are two Prefect flows, which utilize two special TransferController classes for interacting with HPSS: + +#### `cfs_to_hpss_flow` + Prefect flow for transferring data from CFS to HPSS tape archive. + + This flow handles the transfer of files or directories from NERSC's Community + File System (CFS) to the High Performance Storage System (HPSS) tape archive. + For directories, files are bundled into tar archives based on time periods. + + Args: + file_path (Union[str, List[str]]): A single file path or a list of file paths to transfer + source (FileSystemEndpoint): The CFS source endpoint + destination (HPSSEndpoint): The HPSS destination endpoint + config (BeamlineConfig): The beamline configuration containing endpoints and credentials + + Returns: + bool: True if all transfers succeeded, False otherwise + +#### `hpss_to_cfs_flow` + Prefect flow for retrieving data from HPSS tape archive to CFS. + + This flow handles the retrieval of files or tar archives from NERSC's High + Performance Storage System (HPSS) to the Community File System (CFS). + For tar archives, you can optionally specify specific files to extract. 
+ + Args: + file_path (str): The path of the file or tar archive on HPSS + source (HPSSEndpoint): The HPSS source endpoint + destination (FileSystemEndpoint): The CFS destination endpoint + files_to_extract (Optional[List[str]]): Specific files to extract from the tar archive + config (BeamlineConfig): The beamline configuration containing endpoints and credentials + + Returns: + bool: True if the transfer succeeded, False otherwise + +#### `CFSToHPSSTransferController` + Use SFAPI, Slurm, hsi, and htar to move data from CFS to HPSS at NERSC. + + This controller requires the source to be a FileSystemEndpoint on CFS and the + destination to be an HPSSEndpoint. For a single file, the transfer is done using hsi (via hsi cput). + For a directory, the transfer is performed with htar. In this updated version, if the source is a + directory then the files are bundled into tar archives based on their modification dates as follows: + - Files with modification dates between Jan 1 and Jul 15 (inclusive) are grouped together + (Cycle 1 for that year). + - Files with modification dates between Jul 16 and Dec 31 are grouped together (Cycle 2). + + Within each group, if the total size exceeds 2 TB the files are partitioned into multiple tar bundles. + The resulting naming convention on HPSS is: + + /home/a/alsdev/data_mover/[beamline]/raw/[proposal_name]/ + [proposal_name]_[year]-[cycle].tar + [proposal_name]_[year]-[cycle]_part0.tar + [proposal_name]_[year]-[cycle]_part1.tar + ... + + At the end of the SLURM script, the directory tree for both the source (CFS) and destination (HPSS) + is echoed for logging purposes. + +#### `HPSSToCFSTransferController` + Use SFAPI, Slurm, hsi and htar to move data between HPSS and CFS at NERSC. + + This controller retrieves data from an HPSS source endpoint and places it on a CFS destination endpoint. + It supports the following modes: + - "single": Single file retrieval via hsi get. + - "tar": Full tar archive extraction via htar -xvf. + - "partial": Partial extraction from a tar archive: if a list of files is provided (via files_to_extract), + only the specified files will be extracted. + + A single SLURM job script is generated that branches based on the mode. + + + +## Developer Notes about HPSS + +### Working with `hsi` + +We use `hsi` for handling individual files on HPSS. [Here is the official NERSC documentation for `hsi`.](https://docs.nersc.gov/filesystems/hsi/) + + +**Login to HPSS using `hsi`** + +``` +nersc$ hsi +``` + + +**Common `hsi` commands** +``` +hsi ls: show the contents of your HPSS home directory +hsi mkdir [new_dir]: create a remote directory in your home +hsi put [local_file_name]: Transfer a single file into HPSS with the same name +hsi put -R [local_directory]: Transfer a directory tree into HPSS, creating sub-dirs when needed +hsi get [/path/to/hpss_file]: Transfer a single file from HPSS into the local directory without renaming +hsi rm [/path/to/hpss_file]: Prune a file from HPSS +hsi rm -r [/path/to/hpss_file]: Prune a directory from HPSS +hsi rmdir /path/to/my_hpss_dir/: Prune an empty directory + +``` + +**Examples** + +Find files that are more than 20 days old and redirects the output to the file temp.txt: + +``` +hsi -q "find . -ctime 20" > temp.txt 2>&1 +``` + +### Working with `htar` + +We can use `htar` to efficiently work with groups of files on HPSS. 
The basic syntax of `htar` is similar to the standard `tar` utility: + +``` +htar -{c|K|t|x|X} -f tarfile [directories] [files] + +-c : Create +-K : Verify existing tarfile in HPSS +-t : List +-x : Extract +-X : re-create the index file for an existing archive +``` + +You cannot add or append files to an existing htar file. The following examples [can also be found here](https://docs.nersc.gov/filesystems/htar/#htar-usage-examples). + +**Create an archive with a directory and file** + +``` +nersc$ htar -cvf archive.tar project_directory some_extra_file.json +``` +**List the contents of a `tar` archive** +``` +nersc$ htar -tf archive.tar +HTAR: drwx------ als/als 0 2010-09-24 14:24 project_directory/cool_scan1 +HTAR: -rwx------ als/als 9331200 2010-09-24 14:24 project_directory/cool_scan2 +HTAR: -rwx------ als/als 9331200 2010-09-24 14:24 project_directory/cool_scan3 +HTAR: -rwx------ als/als 9331200 2010-09-24 14:24 project_directory/cool_scan4 +HTAR: -rwx------ als/als 398552 2010-09-24 17:35 some_extra_file.json +HTAR: HTAR SUCCESSFUL + +``` + +**Extract the entire `htar` file** + +``` +htar -xvf archive.tar +``` + +**Extract a single file from `htar`** + +``` +htar -xvf archive.tar project_directory/cool_scan4 +``` + +**`-Hnostage` option** + +If your `htar` files are >100GB, and you only want to extract one or two small member files, you may find faster retrieval rates by skipping staging the file to the HPSS disk cache with `-Hnostage`. + +``` +htar -Hnostage -xvf archive.tar project_directory/cool_scan4 +``` + +### Transferring Data from CFS to HPSS + +NERSC provides a special `xfer` QOS ("Quality of Service") for interacting with HPSS, which we can use with our SFAPI Slurm job scripts. + +#### Single Files + +We can transfer single files over to HPSS using `hsi put` in a Slurm script: + +**Example `hsi` transfer job** + +``` +#SBATCH --qos=xfer +#SBATCH -C cron +#SBATCH --time=12:00:00 +#SBATCH --job-name=my_transfer +#SBATCH --licenses=SCRATCH +#SBATCH --mem=20GB + +# Archive a single file from a user's project folder to HPSS +hsi put /global/cfs/cdirs/als/data_mover/8.3.2/raw/als_user_project_folder/cool_scan1.h5 +``` + +Notes: +- `xfer` jobs specifying -N nodes will be rejected at submission time. By default, `xfer` jobs get 2GB of memory allocated. The memory footprint scales somewhat with the size of the file, so if you're archiving larger files, you'll need to request more memory. You can do this by adding `#SBATCH --mem=XGB` to the above script (where X in the range of 5 - 10 GB is a good starting point for large files). +- NERSC users are allowed at most 15 concurrent `xfer` sessions, which can be used strategically for parallel transfers and reads. + + +#### Multiple Files + +NERSC recommends that when storing many files smaller than 100 GB we use `htar` to bundle them together before archiving. Since individual scans within a project may not be this large, we try to archive all of the scans in a project into a single `tar` file. If projects end up being larger than 2 TB, we can create multiple `tar` files. + +One great part about `htar` is that it builds the archive directly on `HPSS`, so you do not need additional storage allocation on the CFS side for the `tar` file. 
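+
+As a companion to the Slurm example below, here is a minimal Python sketch of the grouping logic described above (an illustrative helper only; in the production `CFSToHPSSTransferController` this logic is generated inside the Slurm script submitted via SFAPI). Files are grouped by beam cycle based on modification date, and each group is split into bundles of at most 2 TB, with one `htar` archive created per bundle.
+
+```python
+from datetime import datetime
+from pathlib import Path
+
+TWO_TB = 2 * 1024**4  # bundle size limit in bytes
+
+
+def group_files_for_tape(directory: str) -> dict:
+    """Group files by beam cycle (year-cycle), then split each group into <= 2 TB bundles."""
+    groups = {}
+    for path in Path(directory).rglob("*"):
+        if not path.is_file():
+            continue
+        mtime = datetime.fromtimestamp(path.stat().st_mtime)
+        cycle = 1 if (mtime.month, mtime.day) <= (7, 15) else 2  # Cycle 1: Jan 1 - Jul 15
+        groups.setdefault(f"{mtime.year}-{cycle}", []).append(path)
+
+    bundles = {}
+    for label, files in groups.items():
+        current, size = [], 0
+        for f in sorted(files):
+            fsize = f.stat().st_size
+            if current and size + fsize > TWO_TB:
+                bundles.setdefault(label, []).append(current)  # close the full bundle
+                current, size = [], 0
+            current.append(f)
+            size += fsize
+        if current:
+            bundles.setdefault(label, []).append(current)
+    return bundles
+```
+
+Each resulting bundle then maps to one `htar -cvf` invocation, following the `[proposal_name]_[year]-[cycle]` / `_partN.tar` naming convention used by `CFSToHPSSTransferController`.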
+ +**Example `xfer` transfer job** +``` +#SBATCH --qos=xfer +#SBATCH -C cron +#SBATCH --time=12:00:00 +#SBATCH --job-name=my_transfer +#SBATCH --licenses=SCRATCH +#SBATCH --mem=100GB + +# Archive a user's project folder to HPSS +htar -cvf als_user_project.tar /global/cfs/cdirs/als/data_mover/8.3.2/raw/als_user_project_folder +``` + +### Transferring Data from HPSS to CFS + +At some point you may want to access data from HPSS. An important thing to consider is whether you need to access single or multiple files. + +You could extract an entire `htar` file + +``` +htar -xvf als_user_project_folder.tar +``` + +Or maybe a single file + +``` +htar -xvf als_user_project_folder.tar cool_scan1.h5 +``` + +### Prefect Flows for HPSS Transfers + +Most of the time we expect transfers to occur from CFS to HPSS on a scheduled basis, after users have completed scanning during their allotted beamtime. + +#### Transfer to HPSS Implementation +**`orchestration/transfer_controller.py`:** + - **`CFSToHPSSTransferController()`**: This controller uses a Slurm Job Script and SFAPI to launch the tape transfer job. The Slurm script handles the specific logic for handling single and multiple files, on a project by project basis. It reads the file sizes and creates bundles that are <= 2 TB. The groups within each tar archive are saved in a log on NERSC CFS for posterity. + + Here is a high level overview of the steps taken within the SFAPI Slurm Job: + 1. Define the source (CFS) and destination (HPSS) paths. + 2. Create the destination directory on HPSS if it doesn't exist. + - Recursively check whether each part of the incoming file path exists on HPSS + - If the folder does not exist, use `hsi mkdir` + - Repeat until the file path is built + - Note: In the [hsi documentation](https://hpss-collaboration.org/wp-content/uploads/2023/09/hpss_hsi_10.2_reference_manual.pdf), there is a command `mkdir -p` to create missing intermediate path name directories. If the -p flag is not specified, the parent directory of each newly-created directory must already exist. This is not currently implemented, but something to consider in the future. + 3. Determine if the source is a file or a directory. + - If a file, transfer it using 'hsi cput'. + - If a directory, group files by beam cycle and archive them. + * Cycle 1: Jan 1 - Jul 15 + * Cycle 2: Jul 16 - Dec 31 + * If a group exceeds 2 TB, it is partitioned into multiple tar archives. + * Archive names: + `[proposal_name]_[year]-[cycle].tar` + `[proposal_name]_[year]-[cycle]_part0.tar, _part1.tar, etc.` + + +**`orchestration/hpss.py`:** +- **`cfs_to_hpss_flow()`** This Prefect Flow sets up the CFSToHPSSTransferController() and calls the copy command. By registering this Flow, the HPSS transfers can be easily scheduled. + + +**HPSS SFAPI/Slurm Job Logic**: +```mermaid + +flowchart TD + subgraph "Parameter & Path Setup" + A["Validate Params:
file_path, source, destination"] + B["Compute CFS Path
and get beamline_id"] + C["Build HPSS Root Path
and determine proposal name"] + D["Set Logs Path"] + end + + subgraph "SLURM Script" + E["Set SLURM Header Directives"] + F["Enable Strict Error Handling"] + G["Define Variables:
SOURCE_PATH, DEST_ROOT,
FOLDER_NAME, DEST_PATH"] + H["Check if Destination Directory Exists"] + I{"Directory Exists?"} + J["If Yes: Log Exists"] + K["If No: Create Directory"] + L["Determine Source Type"] + M{"File or Directory?"} + N["If File:
Transfer via hsi cput"] + O["If Directory:
List files, group by date,
bundle and create tar archives"] + end + + subgraph "Job Submission" + P["Log Directory Trees"] + Q["Submit Job via Perlmutter"] + R["Update Job Status & Wait"] + S{"Job Successful?"} + T["Return True"] + U["Attempt Recovery & Log Error
Return False"] + end + + %% Connections + A --> B + B --> C + C --> D + D --> E + E --> F + F --> G + G --> H + H --> I + I -- "Yes" --> J + I -- "No" --> K + J --> L + K --> L + L --> M + M -- "File" --> N + M -- "Directory" --> O + N --> P + O --> P + P --> Q + Q --> R + R --> S + S -- "Yes" --> T + S -- "No" --> U +``` + +#### Transfer to CFS Implementation + +**`orchestration/transfer_controller.py`:** + - **`HPSSToCFSTransferController()`**: This controller uses a Slurm Job Script and SFAPI to copy data from tape to NERSC CFS. The Slurm script handles the specific logic for handling single and multiple files, on a project by project basis. Based on the file path, the Slurm job determines whether a single file or a tar archive has been requested (or even specific files within a tar archive), and runs the correct routine to copy the data to CFS. + +**`orchestration/hpss.py`:** +- **`hpss_to_cfs_flow()`** This Prefect Flow sets up the HPSSToCFSTransferController() and calls the copy command. By registering this Flow, the HPSS transfers to CFS can be easily scheduled. While copying from HPSS to CFS is likely not going to be automated, it is still helpful to have this as a Prefect Flow to simplify data access in a low-code manner. + + +### Update SciCat with HPSS file paths + +`BeamlineIngestorController()` in `orchestration/flows/scicat/ingestor_controller.py` contains a method `add_new_dataset_location()` that can be used to update the source folder and host metadata in SciCat with the new HPSS location: + +```python + def add_new_dataset_location( + self, + dataset_id: str, + source_folder: str, + source_folder_host: str, + ) -> str: + """ + Add a new location to an existing dataset in SciCat. + + :param dataset_id: SciCat ID of the dataset. + :param source_folder: "Absolute file path on file server containing the files of this dataset, + e.g. /some/path/to/sourcefolder. In case of a single file dataset, e.g. HDF5 data, + it contains the path up to, but excluding the filename. Trailing slashes are removed.", + + :param source_folder_host: "DNS host name of file server hosting sourceFolder, + optionally including a protocol e.g. [protocol://]fileserver1.example.com", + + """ + dataset = self.scicat_client.datasets_get_one(dataset_id) + # sourceFolder and sourceFolderHost are each a string + dataset["sourceFolder"] = source_folder + dataset["sourceFolderHost"] = source_folder_host + self.scicat_client.datasets_update(dataset, dataset_id) + logger.info(f"Added location {source_folder} to dataset {dataset_id}") + return dataset_id +``` \ No newline at end of file diff --git a/docs/mkdocs/docs/nersc832.md b/docs/mkdocs/docs/nersc832.md index 9eec997d..b6c381ff 100644 --- a/docs/mkdocs/docs/nersc832.md +++ b/docs/mkdocs/docs/nersc832.md @@ -13,9 +13,14 @@ NERSC user accounts are managed in the [Iris](https://iris.nersc.gov/login) syst Superfacility API Clients (SFAPI) is the gateway to running remote tasks on NERSC, and the method we support for scheduling jobs via SLURM. To create a new SFAPI client: 1. Login to [Iris](https://iris.nersc.gov/login) and navigate to the Profile section, found either on the menu bar at the top or under Settings on the left panel. -2. Scroll all the way to the bottom to the **Superfacility API Clients** section. -3. Press the '+ New Client' button - Fill out the form + +![Iris Dashboard](assets/images/sfapi_step1.png) + +2. 
Scroll all the way to the bottom to the **Superfacility API Clients** section and press the '+ New Client' button + +![Superfacility API Clients](assets/images/sfapi_step2.png) + +3. Fill out the form a. Client Name (ex: 'tomo-sfapi') @@ -27,8 +32,12 @@ Superfacility API Clients (SFAPI) is the gateway to running remote tasks on NERS e. IP address ranges (Your IP, perlmutter nodes) +![SFAPI Client Form](assets/images/sfapi_step3.png) + 4. Save the SFAPI keys in a safe place. You will need these to launch jobs on NERSC. +![SFAPI keys](assets/images/sfapi_step4.png) + ## NERSC System Status Sometimes the system is down. [Check the status](https://www.nersc.gov/live-status/motd/) for unexpected and routine maintainence. diff --git a/docs/mkdocs/docs/scicat.md b/docs/mkdocs/docs/scicat.md new file mode 100644 index 00000000..ef456d6c --- /dev/null +++ b/docs/mkdocs/docs/scicat.md @@ -0,0 +1,422 @@ +# SciCat + +## Overview +SciCat is a data management system for scientific data. It provides tools to manage, organize, and share research data effectively. + +## Features +- **Data Management**: Efficient storage and organization of scientific datasets. +- **User Authentication**: Secure access control for users. +- **Metadata Management**: Manage metadata associated with your data. +- **Data Sharing**: Share data securely with collaborators. +- **Integration**: Integrate with other tools and workflows. + +## Workflow Diagram + +```mermaid +flowchart TD + A["Start Ingest Flow"] --> B["ingest_dataset Flow"] + B --> C["ingest_dataset_task Task"] + C --> D["Load Environment Variables"] + D --> E["Initialize SciCat Client"] + E --> F["Dynamic Import of Beamline Ingestor Module"] + F --> G["Call ingestor.ingest() in ingest_tomo832.py"] + G --> H["Open HDF5 File (h5py)"] + H --> I["Extract SciCat & Scientific Metadata"] + I --> J["Compute Access Controls & Clean Data"] + J --> K["Upload Raw Dataset (Create Dataset Object)"] + K --> L["Upload Data Block (File Info Mapping)"] + L --> M["Build Thumbnail from Data Array"] + M --> N["Encode Thumbnail to Base64"] + N --> O["Upload Attachment (Thumbnail)"] + O --> P["Return Dataset ID"] +``` + +## Getting Started + +In `splash_flows_globus`, we "ingest" our datasets into SciCat during our file movement workflows. In the directory `orchestration/flows/scicat/` there are two general scripts: `ingest.py` and `utils.py`. Since the data from each beamline is different, we define specific ingest implementations, such as `orchestration/flows/bl832/ingest_tomo832.py`. + +# SciCat Client API Documentation + +This document details the API provided by the `ScicatClient` class and its associated utility functions for interacting with the SciCat Catamel server. + +--- + +## Overview + +The `ScicatClient` class offers a comprehensive interface for communicating with the SciCat server via HTTP. It supports operations such as creating, updating, retrieving, and deleting datasets, samples, instruments, proposals, and published data. The client utilizes token-based authentication and provides helper functions to work with file metadata and image encoding. + +--- + +## ScicatClient Class + +### Initialization + + ScicatClient( + base_url: str, + token: Optional[str] = None, + username: Optional[str] = None, + password: Optional[str] = None, + timeout_seconds: Optional[int] = None + ) + +- **Parameters:** + - `base_url`: Base URL for the SciCat API (e.g., "http://localhost:3000/api/v3/"). + - `token`: (Optional) A pre-obtained authentication token. 
+ - `username`: (Optional) Username for login. + - `password`: (Optional) Password for login. + - `timeout_seconds`: (Optional) Timeout in seconds for HTTP requests. +- **Behavior:** If no token is provided, the client attempts to log in using the provided username and password, retrieves a token, and sets the appropriate HTTP headers. +- **Raises:** An assertion error if neither a token nor valid login credentials are provided. + +--- + +### Internal Methods + +#### _send_to_scicat + + _send_to_scicat(cmd: str, endpoint: str, data: Optional[BaseModel] = None) + +- **Purpose:** Sends an HTTP request to the SciCat server. +- **Parameters:** + - `cmd`: The HTTP method (e.g., "post", "patch", "get", "delete"). + - `endpoint`: API endpoint to append to the base URL. + - `data`: (Optional) A `pydantic.BaseModel` instance representing the payload. +- **Returns:** The HTTP response object from the request. + +#### _call_endpoint + + _call_endpoint( + cmd: str, + endpoint: str, + data: Optional[BaseModel] = None, + operation: str = "" + ) -> Optional[dict] + +- **Purpose:** Calls a specific API endpoint, handles JSON parsing, and checks for errors. +- **Parameters:** + - `cmd`: The HTTP method. + - `endpoint`: The specific endpoint to call. + - `data`: (Optional) Data to include in the request body. + - `operation`: (Optional) A string identifier for the operation, used in logging. +- **Returns:** A dictionary containing the parsed JSON response. +- **Raises:** `ScicatCommError` if the server responds with an error status. + +--- + +## Dataset Operations + +### Create Dataset + + datasets_create(dataset: Dataset) -> str + +- **Purpose:** Uploads a new dataset. +- **Parameters:** + - `dataset`: An instance of the `Dataset` model. +- **Returns:** A string representing the unique identifier (PID) of the created dataset. +- **Aliases:** `upload_new_dataset`, `create_dataset`. + +### Update Dataset + + datasets_update(dataset: Dataset, pid: str) -> str + +- **Purpose:** Updates an existing dataset. +- **Parameters:** + - `dataset`: An instance of the `Dataset` model with updated fields. + - `pid`: The unique identifier of the dataset to update. +- **Returns:** A string representing the updated dataset's PID. +- **Alias:** `update_dataset`. + +### Create Dataset OrigDatablock + + datasets_origdatablock_create( + dataset_id: str, + datablockDto: CreateDatasetOrigDatablockDto + ) -> dict + +- **Purpose:** Creates an original datablock for a specified dataset. +- **Parameters:** + - `dataset_id`: The unique identifier of the dataset. + - `datablockDto`: A data transfer object containing the datablock details. +- **Returns:** A dictionary representing the created datablock. +- **Aliases:** `upload_dataset_origdatablock`, `create_dataset_origdatablock`. + +### Create Dataset Attachment + + datasets_attachment_create( + attachment: Attachment, + datasetType: str = "Datasets" + ) -> dict + +- **Purpose:** Uploads an attachment to a dataset. +- **Parameters:** + - `attachment`: An instance of the `Attachment` model. + - `datasetType`: (Optional) The type of dataset; default is "Datasets". +- **Returns:** A dictionary containing details of the uploaded attachment. +- **Aliases:** `upload_attachment`, `create_dataset_attachment`. + +### Find Datasets (Full Query) + + datasets_find( + skip: int = 0, + limit: int = 25, + query_fields: Optional[dict] = None + ) -> Optional[dict] + +- **Purpose:** Retrieves datasets using a full text search query. 
+- **Parameters:** + - `skip`: Number of records to skip (for pagination). + - `limit`: Maximum number of records to return. + - `query_fields`: (Optional) A dictionary specifying search criteria. +- **Returns:** A dictionary with the query results. +- **Aliases:** `get_datasets_full_query`, `find_datasets_full_query`. + +### Get Many Datasets (Simple Filter) + + datasets_get_many(filter_fields: Optional[dict] = None) -> Optional[dict] + +- **Purpose:** Retrieves datasets based on simple filtering criteria. +- **Parameters:** + - `filter_fields`: A dictionary containing the filter conditions. +- **Returns:** A dictionary with the filtered datasets. +- **Aliases:** `get_datasets`, `find_datasets`. + +### Get Single Dataset + + datasets_get_one(pid: str) -> Optional[dict] + +- **Purpose:** Retrieves a single dataset by its PID. +- **Parameters:** + - `pid`: The unique identifier of the dataset. +- **Returns:** A dictionary with the dataset details. +- **Alias:** `get_dataset_by_pid`. + +### Delete Dataset + + datasets_delete(pid: str) -> Optional[dict] + +- **Purpose:** Deletes a dataset identified by its PID. +- **Parameters:** + - `pid`: The unique identifier of the dataset to delete. +- **Returns:** A dictionary containing the server's response. +- **Alias:** `delete_dataset`. + +--- + +## Sample Operations + +### Create Sample + + samples_create(sample: Sample) -> str + +- **Purpose:** Creates a new sample. +- **Parameters:** + - `sample`: An instance of the `Sample` model. +- **Returns:** A string representing the newly created sample ID. +- **Alias:** `upload_sample`. + +### Update Sample + + samples_update(sample: Sample, sampleId: Optional[str] = None) -> str + +- **Purpose:** Updates an existing sample. +- **Parameters:** + - `sample`: An instance of the `Sample` model with updated values. + - `sampleId`: (Optional) The unique identifier of the sample; if omitted, the sample’s own `sampleId` is used. +- **Returns:** A string representing the updated sample ID. + +### Get Single Sample + + samples_get_one(pid: str) -> Optional[dict] + +- **Purpose:** Retrieves a sample by its PID. +- **Parameters:** + - `pid`: The unique sample identifier. +- **Returns:** A dictionary with the sample details. +- **Alias:** `get_sample`. + +--- + +## Instrument Operations + +### Create Instrument + + instruments_create(instrument: Instrument) -> str + +- **Purpose:** Creates a new instrument. Admin rights may be required. +- **Parameters:** + - `instrument`: An instance of the `Instrument` model. +- **Returns:** A string representing the instrument's unique identifier (PID). +- **Alias:** `upload_instrument`. + +### Update Instrument + + instruments_update(instrument: Instrument, pid: Optional[str] = None) -> str + +- **Purpose:** Updates an existing instrument. +- **Parameters:** + - `instrument`: An instance of the `Instrument` model with updated fields. + - `pid`: (Optional) The unique identifier of the instrument; if omitted, the instrument’s own `pid` is used. +- **Returns:** A string representing the updated instrument PID. + +### Get Single Instrument + + instruments_get_one(pid: Optional[str] = None, name: Optional[str] = None) -> Optional[dict] + +- **Purpose:** Retrieves an instrument by its PID or by name. +- **Parameters:** + - `pid`: (Optional) The unique instrument identifier. + - `name`: (Optional) The instrument name (used if PID is not provided). +- **Returns:** A dictionary with the instrument details. +- **Alias:** `get_instrument`. 
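+
+For example, a hedged usage sketch of the instrument lookup above (the import path, URL, credentials, and instrument name are illustrative assumptions, not taken from the repository):
+
+```python
+from orchestration.flows.scicat.utils import ScicatClient  # module path assumed
+
+client = ScicatClient(
+    base_url="http://localhost:3000/api/v3/",
+    username="ingestor",      # placeholder credentials
+    password="change-me",
+)
+# Look up an instrument by name when the PID is not known
+instrument = client.instruments_get_one(name="bl832-tomography")  # name is illustrative
+print(instrument)
+```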
+ +--- + +## Proposal Operations + +### Create Proposal + + proposals_create(proposal: Proposal) -> str + +- **Purpose:** Creates a new proposal. Admin rights may be required. +- **Parameters:** + - `proposal`: An instance of the `Proposal` model. +- **Returns:** A string representing the newly created proposal ID. +- **Alias:** `upload_proposal`. + +### Update Proposal + + proposals_update(proposal: Proposal, proposalId: Optional[str] = None) -> str + +- **Purpose:** Updates an existing proposal. +- **Parameters:** + - `proposal`: An instance of the `Proposal` model with updated information. + - `proposalId`: (Optional) The unique identifier of the proposal; if omitted, the proposal’s own `proposalId` is used. +- **Returns:** A string representing the updated proposal ID. + +### Get Single Proposal + + proposals_get_one(pid: str) -> Optional[dict] + +- **Purpose:** Retrieves a proposal by its PID. +- **Parameters:** + - `pid`: The unique proposal identifier. +- **Returns:** A dictionary with the proposal details. +- **Alias:** `get_proposal`. + +--- + +## Published Data Operations + +### Get Published Data + + published_data_get_many(filter=None) -> Optional[dict] + +- **Purpose:** Retrieves published datasets based on optional filter criteria. +- **Parameters:** + - `filter`: (Optional) A dictionary specifying filter conditions. +- **Returns:** A dictionary containing the published data. +- **Aliases:** `get_published_data`, `find_published_data`. + +--- + +## Additional Dataset Operations + +### Get Dataset OrigDatablocks + + datasets_origdatablocks_get_one(pid: str) -> Optional[dict] + +- **Purpose:** Retrieves the original datablocks associated with a dataset. +- **Parameters:** + - `pid`: The unique identifier of the dataset. +- **Returns:** A dictionary with the original datablock details. +- **Alias:** `get_dataset_origdatablocks`. + +--- + +## Utility Functions + +### File Utilities + +#### Get File Size + + get_file_size(pathobj: Path) + +- **Purpose:** Returns the size of a file in bytes. +- **Parameters:** + - `pathobj`: A `Path` object representing the file. +- **Returns:** The file size as an integer. + +#### Get Checksum + + get_checksum(pathobj: Path) + +- **Purpose:** Computes the MD5 checksum of a file. +- **Parameters:** + - `pathobj`: A `Path` object representing the file. +- **Returns:** The MD5 checksum as a hexadecimal string. + +#### Encode Thumbnail + + encode_thumbnail(filename, imType="jpg") + +- **Purpose:** Encodes an image file as a Base64 data URL, suitable for use as a thumbnail. +- **Parameters:** + - `filename`: Path to the image file. + - `imType`: (Optional) Image format (default is "jpg"). +- **Returns:** A string containing the Base64 encoded image prefixed with the appropriate data URL header. + +#### Get File Modification Time + + get_file_mod_time(pathobj: Path) + +- **Purpose:** Retrieves the last modification time of a file. +- **Parameters:** + - `pathobj`: A `Path` object representing the file. +- **Returns:** A string representation of the file's modification time. + +### Authentication Helpers + +#### Create Client from Token + + from_token(base_url: str, token: str) + +- **Purpose:** Instantiates a `ScicatClient` using an existing authentication token. +- **Parameters:** + - `base_url`: Base URL for the SciCat API. + - `token`: A valid authentication token. +- **Returns:** An instance of `ScicatClient`. 
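+
+For example, a hedged sketch of token-based usage (module path, URL, token, and PID below are illustrative assumptions):
+
+```python
+from orchestration.flows.scicat.utils import from_token  # module path assumed
+
+client = from_token("https://scicat.example.com/api/v3/", "my-jwt-token")  # placeholder token
+dataset = client.datasets_get_one("PID.example/12345")  # PID is illustrative
+if dataset is not None:
+    print(dataset.get("sourceFolder"), dataset.get("sourceFolderHost"))
+```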
+ +#### Create Client from Credentials + + from_credentials(base_url: str, username: str, password: str) + +- **Purpose:** Instantiates a `ScicatClient` by logging in with username and password. +- **Parameters:** + - `base_url`: Base URL for the SciCat API. + - `username`: Login username. + - `password`: Login password. +- **Returns:** An instance of `ScicatClient`. + +#### Retrieve Token + + get_token(base_url, username, password) + +- **Purpose:** Logs in using provided credentials and retrieves an authentication token. +- **Parameters:** + - `base_url`: Base URL for the SciCat API. + - `username`: Login username. + - `password`: Login password. +- **Returns:** An authentication token as a string. +- **Behavior:** Attempts login via the `Users/login` and `auth/msad` endpoints. + +--- + +## Exception Classes + +- **ScicatLoginError** + - Raised when an error occurs during the login process. + - Contains an error message describing the issue. + +- **ScicatCommError** + - Raised when communication with the SciCat server fails (non-20x HTTP responses). + - Contains an error message describing the issue. diff --git a/docs/mkdocs/docs/tomography_workflow.md b/docs/mkdocs/docs/tomography_workflow.md new file mode 100644 index 00000000..108e0b2d --- /dev/null +++ b/docs/mkdocs/docs/tomography_workflow.md @@ -0,0 +1,135 @@ +```mermaid + +--- +config: + theme: neo + layout: elk + look: neo +--- +flowchart LR + subgraph s2["ALCF Reconstruction [Prefect Flow]"] + n17["data832"] + n18["ALCF Eagle
[Filesystem]"] + n26["Reconstruction on ALCF Polaris with Globus Compute Endpoint"] + end + subgraph s1["new_file_832 [Prefect Flow]"] + n20["data832"] + n21["NERSC CFS"] + n22@{ label: "SciCat
[Metadata Database]" } + n46["spot832"] + end + subgraph s3["NERSC Reconstruction [Prefect Flow]"] + n28["NERSC CFS"] + n29["NERSC Scratch"] + n30["Reconstruction on NERSC Perlmutter with SFAPI, Slurm, Docker"] + n42["data832"] + end + subgraph s4["Scheduled HPSS Transfer [Prefect Flow]"] + n38["NERSC CFS"] + n39["HPSS Tape Archive"] + n40["SciCat
[Metadata Database]"] + end + subgraph s5["Data Visualization at the Beamline"] + n41["data832"] + n43["Tiled Server
[Reconstruction Database]"] + n44(["ITK-VTK-Viewer
[Web Application]"]) + n45["SciCat
[Metadata Database]"] + end + n17 -- Raw Data [Globus Transfer] --> n18 + n23["spot832"] -- File Watcher --> n24["Dispatcher
[Prefect Worker]"] + n25["Detector"] -- Raw Data --> n23 + n24 --> s2 & s1 & s3 & s4 + n20 -- Raw Data [Globus Transfer] --> n21 + n21 -- "Metadata [SciCat Ingestion]" --> n22 + n18 -- Raw Data --> n26 + n26 -- Recon Data --> n18 + n18 -- Recon Data [Globus Transfer] --> n17 + n28 -- Raw Data --> n29 + n29 -- Raw Data --> n30 + n29 -- Recon Data --> n28 + n30 -- Recon Data --> n29 + s1 --> n32["Scheduled Pruning
[Prefect Workers]"] + s3 --> n32 + s2 --> n32 + n32 --> n33["ALCF Eagle"] & n35["NERSC CFS"] & n34["data832"] & n36["spot832"] + n38 -- Raw Data [SFAPI Slurm htar Transfer] --> n39 + s4 --> n32 + n39 -- "Metadata [SciCat Ingestion]" --> n40 + n28 -- "Recon Data" --> n42 + n41 -- Recon Data --> n43 + n43 -- Recon Data --> n44 + n43 -- Metadata [SciCat Ingestion] --> n45 + n45 -- Hyperlink --> n44 + n46 -- "Raw Data [Globus Transfer]" --> n20 + n17@{ shape: internal-storage} + n18@{ shape: disk} + n20@{ shape: internal-storage} + n21@{ shape: disk} + n22@{ shape: db} + n46@{ shape: internal-storage} + n28@{ shape: disk} + n29@{ shape: disk} + n42@{ shape: internal-storage} + n38@{ shape: disk} + n39@{ shape: paper-tape} + n40@{ shape: db} + n41@{ shape: internal-storage} + n43@{ shape: db} + n45@{ shape: db} + n23@{ shape: internal-storage} + n24@{ shape: rect} + n25@{ shape: rounded} + n33@{ shape: disk} + n35@{ shape: disk} + n34@{ shape: internal-storage} + n36@{ shape: internal-storage} + n17:::storage + n17:::Peach + n18:::storage + n18:::Sky + n26:::compute + n20:::storage + n20:::Peach + n21:::Sky + n22:::Sky + n46:::collection + n46:::storage + n46:::Peach + n28:::Sky + n29:::storage + n29:::Sky + n30:::compute + n42:::Peach + n38:::Sky + n39:::storage + n40:::Sky + n41:::Peach + n43:::Sky + n44:::visualization + n45:::Sky + n23:::collection + n23:::storage + n23:::Peach + n24:::collection + n24:::Rose + n25:::Ash + n32:::Rose + n33:::Sky + n35:::Sky + n34:::Peach + n36:::Peach + classDef collection fill:#D3A6A1, stroke:#D3A6A1, stroke-width:2px, color:#000000 + classDef compute fill:#A9C0C9, stroke:#A9C0C9, stroke-width:2px, color:#000000 + classDef Rose stroke-width:1px, stroke-dasharray:none, stroke:#FF5978, fill:#FFDFE5, color:#8E2236 + classDef storage fill:#A3C1DA, stroke:#A3C1DA, stroke-width:2px, color:#000000 + classDef Ash stroke-width:1px, stroke-dasharray:none, stroke:#999999, fill:#EEEEEE, color:#000000 + classDef Peach stroke-width:1px, stroke-dasharray:none, stroke:#FBB35A, fill:#FFEFDB, color:#8F632D + classDef visualization fill:#E8D5A6, stroke:#E8D5A6, stroke-width:2px, color:#000000 + classDef Sky stroke-width:1px, stroke-dasharray:none, stroke:#374D7C, fill:#E2EBFF, color:#374D7C + style s2 stroke:#757575 + style s1 stroke:#757575 + style s3 stroke:#757575 + style s4 stroke:#757575 + style s5 stroke:#757575 + +``` \ No newline at end of file diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index 728a990e..6e6c84fc 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -13,10 +13,13 @@ nav: - Home: index.md - Installation and Requirements: install.md - Getting Started: getting_started.md -- Compute at ALCF: alcf832.md -- Compute at NERSC: nersc832.md +- Common Infrastructure: common_infrastructure.md +- Beamline Implementations: + - 8.3.2 Micro Tomography - Compute at ALCF: alcf832.md + - 8.3.2 Micro Tomography - Compute at NERSC: nersc832.md - Orchestration: orchestration.md - Configuration: configuration.md +- HPSS Tape Archive Access: hpss.md # - Troubleshooting: troubleshooting.md - Glossary: glossary.md - About: about.md diff --git a/orchestration/_tests/test_prune_controller.py b/orchestration/_tests/test_prune_controller.py new file mode 100644 index 00000000..6f72db4f --- /dev/null +++ b/orchestration/_tests/test_prune_controller.py @@ -0,0 +1,404 @@ +# tests/orchestration/_tests/test_prune_controllers.py + +from pathlib import Path +from typing import Any, Dict, Optional + +import pytest +from prefect.blocks.system import JSON 
+from prefect.testing.utilities import prefect_test_harness + +from orchestration.config import BeamlineConfig +from orchestration.prune_controller import ( + PruneController, + FileSystemPruneController, + GlobusPruneController, + get_prune_controller, + PruneMethod, + prune_filesystem_endpoint, + prune_globus_endpoint, +) +from orchestration.transfer_endpoints import FileSystemEndpoint +from orchestration.globus.transfer import GlobusEndpoint + + +############################################################################### +# Shared Fixtures & Helpers +############################################################################### + +@pytest.fixture(autouse=True, scope="session") +def prefect_test_fixture(): + """Set up the Prefect test harness and register our JSON block.""" + with prefect_test_harness(): + JSON(value={"max_wait_seconds": 600}).save(name="globus-settings") + yield + + +class MockConfig(BeamlineConfig): + """Minimal concrete BeamlineConfig for tests (no real I/O).""" + def __init__(self, beamline_id: str = "0.0.0") -> None: + super().__init__(beamline_id=beamline_id) + # Test stubs that the controllers/flows expect to exist + self.tc = None + + def _beam_specific_config(self) -> None: + # Keep it no-op for tests; you can set other attributes here if needed + # e.g., self.some_endpoint = ... + pass + + +@pytest.fixture +def mock_config() -> MockConfig: + """Provides a fresh MockConfig per test.""" + return MockConfig(beamline_id="0.0.0") + + +@pytest.fixture +def fs_endpoint(tmp_path: Path) -> FileSystemEndpoint: + """A FileSystemEndpoint rooted at our tmp directory.""" + return FileSystemEndpoint( + name="fs_endpoint", + root_path=str(tmp_path), + uri=str(tmp_path), + ) + + +@pytest.fixture +def globus_endpoint(tmp_path: Path) -> GlobusEndpoint: + """A real GlobusEndpoint with a mock UUID.""" + return GlobusEndpoint( + uuid="mock-uuid", + uri=str(tmp_path), + root_path=str(tmp_path), + name="globus_endpoint", + ) + + +@pytest.fixture +def fs_controller(mock_config: MockConfig) -> FileSystemPruneController: + """FileSystemPruneController using mock_config.""" + return FileSystemPruneController(config=mock_config) + + +@pytest.fixture +def globus_controller(mock_config: MockConfig) -> GlobusPruneController: + """GlobusPruneController using mock_config.""" + return GlobusPruneController(config=mock_config) + + +############################################################################### +# Mock Fixtures for External Calls +############################################################################### + +@pytest.fixture +def mock_scheduler(monkeypatch): + """ + Monkeypatches schedule_prefect_flow → a mock that records its args and returns True. + Returns the dict where call args are recorded. + """ + recorded: Dict[str, Any] = {} + + def _scheduler(deployment_name, flow_run_name, parameters, duration_from_now): + recorded.update( + deployment_name=deployment_name, + flow_run_name=flow_run_name, + parameters=parameters, + duration=duration_from_now, + ) + return True + + monkeypatch.setattr( + "orchestration.prune_controller.schedule_prefect_flow", + _scheduler, + ) + return recorded + + +@pytest.fixture +def mock_scheduler_raises(monkeypatch): + """ + Monkeypatches schedule_prefect_flow → a mock that always raises. 
+ """ + def _scheduler_raises(*args, **kwargs): + raise RuntimeError("scheduler failure") + + monkeypatch.setattr( + "orchestration.prune_controller.schedule_prefect_flow", + _scheduler_raises, + ) + + +@pytest.fixture +def mock_prune_one_safe(monkeypatch): + """ + Monkeypatches prune_one_safe → a mock that records its kwargs and returns True. + Returns the dict where call args are recorded. + """ + recorded: Dict[str, Any] = {} + + def _prune_one_safe( + file: str, + if_older_than_days: int, + transfer_client: Any, + source_endpoint: GlobusEndpoint, + check_endpoint: Optional[GlobusEndpoint], + logger: Any, + max_wait_seconds: int, + ) -> bool: + recorded.update( + file=file, + if_older_than_days=if_older_than_days, + transfer_client=transfer_client, + source_endpoint=source_endpoint, + check_endpoint=check_endpoint, + max_wait_seconds=max_wait_seconds, + ) + return True + + monkeypatch.setattr( + "orchestration.prune_controller.prune_one_safe", + _prune_one_safe, + ) + return recorded + + +############################################################################### +# Tests +############################################################################### + +def test_prunecontroller_is_abstract(): + """PruneController must be abstract (cannot be instantiated directly).""" + with pytest.raises(TypeError): + PruneController(config=MockConfig()) + + +def test_get_prune_controller_factory_correct_types(mock_config): + """get_prune_controller returns the right subclass or raises on invalid.""" + assert isinstance(get_prune_controller(PruneMethod.SIMPLE, mock_config), FileSystemPruneController) + assert isinstance(get_prune_controller(PruneMethod.GLOBUS, mock_config), GlobusPruneController) + with pytest.raises((AttributeError, ValueError)): + get_prune_controller("invalid", mock_config) # type: ignore + + +def test_fs_prune_immediate_deletes_file_directly(fs_controller, fs_endpoint, tmp_path: Path): + """Immediate FileSystem prune should delete an existing file.""" + rel = "subdir/foo.txt" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + assert p.exists() + + fn = prune_filesystem_endpoint.fn # type: ignore + assert fn(relative_path=rel, source_endpoint=fs_endpoint, check_endpoint=None, config=fs_controller.config) + assert not p.exists() + + +def test_fs_prune_immediate_returns_false_if_missing(fs_controller, fs_endpoint, tmp_path: Path): + """Immediate FileSystem prune should return False for missing path.""" + rel = "no/such/file.txt" + assert not (tmp_path / rel).exists() + + fn = prune_filesystem_endpoint.fn # type: ignore + assert fn(relative_path=rel, source_endpoint=fs_endpoint, check_endpoint=None, config=fs_controller.config) is False + + +def test_fs_prune_schedules_when_days_from_now_positive(fs_controller, fs_endpoint, tmp_path: Path, mock_scheduler): + """Calling prune with days_from_now>0 should schedule a Prefect flow.""" + rel = "to_schedule.txt" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + + result = fs_controller.prune( + file_path=rel, + source_endpoint=fs_endpoint, + check_endpoint=None, + days_from_now=1.5, + ) + assert result is True + + assert mock_scheduler["flow_run_name"] == f"prune_from_{fs_endpoint.name}" + assert mock_scheduler["parameters"]["relative_path"] == rel + assert pytest.approx(mock_scheduler["duration"].total_seconds()) == 1.5 * 86400 + + +def test_fs_prune_schedules_when_days_from_now_zero(fs_controller, fs_endpoint, tmp_path: Path, mock_scheduler): + """Calling prune with 
days_from_now==0 should schedule a Prefect flow.""" + rel = "to_schedule.txt" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + + result = fs_controller.prune( + file_path=rel, + source_endpoint=fs_endpoint, + check_endpoint=None, + days_from_now=0.0, + ) + assert result is True + assert not p.exists() + + +def test_fs_prune_schedules_when_days_from_now_negative(fs_controller, fs_endpoint, tmp_path: Path, mock_scheduler): + """Calling prune with days_from_now<0 should return False.""" + rel = "to_schedule.txt" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + + result = fs_controller.prune( + file_path=rel, + source_endpoint=fs_endpoint, + check_endpoint=None, + days_from_now=-4.0, + ) + assert result is False + + +def test_fs_prune_returns_false_if_schedule_raises(fs_controller, fs_endpoint, tmp_path: Path, mock_scheduler_raises): + """If scheduling fails, fs_controller.prune should return False.""" + rel = "error.txt" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + + assert fs_controller.prune( + file_path=rel, + source_endpoint=fs_endpoint, + check_endpoint=None, + days_from_now=2.0, + ) is False + + +def test_globus_prune_immediate_calls_prune_one_safe_directly( + globus_controller, + globus_endpoint, + tmp_path: Path, + mock_prune_one_safe +): + """Immediate Globus prune should invoke prune_one_safe with correct arguments.""" + rel = "data.bin" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + + fn = prune_globus_endpoint.fn # type: ignore + _ = fn( + relative_path=rel, + source_endpoint=globus_endpoint, + check_endpoint=None, + config=globus_controller.config, + ) + + assert mock_prune_one_safe["file"] == rel + assert mock_prune_one_safe["if_older_than_days"] == 0 + assert mock_prune_one_safe["transfer_client"] is None + assert mock_prune_one_safe["source_endpoint"] is globus_endpoint + assert mock_prune_one_safe["max_wait_seconds"] == 600 + + +@pytest.mark.parametrize("invalid_fp", [None, ""]) +def test_globus_prune_rejects_missing_file_path_directly(globus_controller, globus_endpoint, invalid_fp): + """Globus prune should return False when file_path is None or empty.""" + assert globus_controller.prune( + file_path=invalid_fp, + source_endpoint=globus_endpoint, + check_endpoint=None, + days_from_now=0.0, + ) is False + + +def test_globus_prune_rejects_missing_endpoint_directly(globus_controller, tmp_path: Path): + """Globus prune should return False when source_endpoint is None.""" + (tmp_path / "whatever").touch() + + assert globus_controller.prune( + file_path="whatever", + source_endpoint=None, # type: ignore + check_endpoint=None, + days_from_now=0.0, + ) is False + + +def test_globus_prune_schedules_when_days_from_now_positive( + globus_controller, + globus_endpoint, + tmp_path: Path, + mock_scheduler): + """Calling Globus prune with days_from_now>0 should schedule a Prefect flow.""" + rel = "sched.txt" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + + result = globus_controller.prune( + file_path=rel, + source_endpoint=globus_endpoint, + check_endpoint=None, + days_from_now=3.0, + ) + assert result is True + + assert mock_scheduler["flow_run_name"] == f"prune_from_{globus_endpoint.name}" + assert pytest.approx(mock_scheduler["duration"].total_seconds()) == 3.0 * 86400 + + +def test_globus_prune_schedules_when_days_from_now_negative( + globus_controller, + globus_endpoint, + tmp_path: Path,): + """Calling Globus prune 
with days_from_now<0 should return False.""" + rel = "sched.txt" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + + result = globus_controller.prune( + file_path=rel, + source_endpoint=globus_endpoint, + check_endpoint=None, + days_from_now=-4.0, + ) + assert result is False + + +def test_globus_prune_schedules_when_days_from_now_zero( + globus_controller, + globus_endpoint, + tmp_path: Path, + mock_scheduler, + mock_prune_one_safe): + """Calling Globus prune with days_from_now==0 should schedule a Prefect flow.""" + rel = "sched.txt" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + + result = globus_controller.prune( + file_path=rel, + source_endpoint=globus_endpoint, + check_endpoint=None, + days_from_now=0.0 + ) + assert result is True + assert mock_prune_one_safe["file"] == rel + assert mock_prune_one_safe["if_older_than_days"] == 0 + assert mock_prune_one_safe["source_endpoint"] is globus_endpoint + + +def test_globus_prune_returns_false_if_schedule_raises( + globus_controller, + globus_endpoint, + tmp_path: Path, + mock_scheduler_raises): + """If scheduling fails, globus_controller.prune should return False.""" + rel = "err.txt" + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.touch() + + assert globus_controller.prune( + file_path=rel, + source_endpoint=globus_endpoint, + check_endpoint=None, + days_from_now=4.0, + ) is False diff --git a/orchestration/_tests/test_scicat.py b/orchestration/_tests/test_scicat.py index 6cc68cdb..b204fa85 100644 --- a/orchestration/_tests/test_scicat.py +++ b/orchestration/_tests/test_scicat.py @@ -60,6 +60,8 @@ def test_np_encoder(): test_dict = {"dont_panic": np.full((1, 1), np.inf)} encoded_np = json.loads(json.dumps(test_dict, cls=NPArrayEncoder)) + # requests doesn't allow strings that have np.inf or np.nan + # so the NPArrayEncoder needs to return both as None assert json.dumps(encoded_np, allow_nan=False) diff --git a/orchestration/_tests/test_sfapi_flow.py b/orchestration/_tests/test_sfapi_flow.py index a1809686..ee424f47 100644 --- a/orchestration/_tests/test_sfapi_flow.py +++ b/orchestration/_tests/test_sfapi_flow.py @@ -2,7 +2,7 @@ from pathlib import Path import pytest -from unittest.mock import MagicMock, patch, mock_open +from unittest.mock import MagicMock, patch from uuid import uuid4 from prefect.blocks.system import Secret @@ -37,81 +37,55 @@ def test_create_sfapi_client_success(): """ Test successful creation of the SFAPI client. 
""" - from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + from orchestration.sfapi import create_sfapi_client + + # Define fake credential file paths + fake_client_id_path = "/path/to/client_id" + fake_client_secret_path = "/path/to/client_secret" - # Mock data for client_id and client_secret files + # Mock file contents mock_client_id = 'value' mock_client_secret = '{"key": "value"}' # Create separate mock_open instances for each file - mock_open_client_id = mock_open(read_data=mock_client_id) - mock_open_client_secret = mock_open(read_data=mock_client_secret) - - with patch("orchestration.flows.bl832.nersc.os.getenv") as mock_getenv, \ - patch("orchestration.flows.bl832.nersc.os.path.isfile") as mock_isfile, \ - patch("builtins.open", side_effect=[ - mock_open_client_id.return_value, - mock_open_client_secret.return_value - ]), \ - patch("orchestration.flows.bl832.nersc.JsonWebKey.import_key") as mock_import_key, \ - patch("orchestration.flows.bl832.nersc.Client") as MockClient: - - # Mock environment variables - mock_getenv.side_effect = lambda x: { - "PATH_NERSC_CLIENT_ID": "/path/to/client_id", - "PATH_NERSC_PRI_KEY": "/path/to/client_secret" - }.get(x, None) - - # Mock file existence - mock_isfile.return_value = True - - # Mock JsonWebKey.import_key to return a mock secret + with patch("orchestration.sfapi.Path.is_file", return_value=True), \ + patch("orchestration.sfapi.Path.read_text", side_effect=[mock_client_id, mock_client_secret]), \ + patch("orchestration.sfapi.JsonWebKey.import_key") as mock_import_key, \ + patch("orchestration.sfapi.Client") as MockClient: + + # Mock key import to return a fake secret mock_import_key.return_value = "mock_secret" - # Create the client - client = NERSCTomographyHPCController.create_sfapi_client() + # Create the client using the provided fake paths + create_sfapi_client(fake_client_id_path, fake_client_secret_path) - # Assert that Client was instantiated with 'value' and 'mock_secret' + # Verify that Client was instantiated with the expected arguments MockClient.assert_called_once_with("value", "mock_secret") - # Assert that the returned client is the mocked client - assert client == MockClient.return_value, "Client should be the mocked sfapi_client.Client instance" - def test_create_sfapi_client_missing_paths(): """ Test creation of the SFAPI client with missing credential paths. """ - from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + from orchestration.sfapi import create_sfapi_client - with patch("orchestration.flows.bl832.nersc.os.getenv", return_value=None): - with pytest.raises(ValueError, match="Missing NERSC credentials paths."): - NERSCTomographyHPCController.create_sfapi_client() + # Passing None for both paths should trigger a ValueError. + with pytest.raises(ValueError, match="NERSC credentials paths are missing."): + create_sfapi_client(None, None) def test_create_sfapi_client_missing_files(): """ Test creation of the SFAPI client with missing credential files. 
""" - with ( - # Mock environment variables - patch( - "orchestration.flows.bl832.nersc.os.getenv", - side_effect=lambda x: { - "PATH_NERSC_CLIENT_ID": "/path/to/client_id", - "PATH_NERSC_PRI_KEY": "/path/to/client_secret" - }.get(x, None) - ), - - # Mock file existence to simulate missing files - patch("orchestration.flows.bl832.nersc.os.path.isfile", return_value=False) - ): - # Import the module after applying patches to ensure mocks are in place - from orchestration.flows.bl832.nersc import NERSCTomographyHPCController - - # Expect a FileNotFoundError due to missing credential files + from orchestration.sfapi import create_sfapi_client + fake_client_id_path = "/path/to/client_id" + fake_client_secret_path = "/path/to/client_secret" + + # Simulate missing credential files by patching Path.is_file to return False. + with patch("orchestration.sfapi.Path.is_file", return_value=False): with pytest.raises(FileNotFoundError, match="NERSC credential files are missing."): - NERSCTomographyHPCController.create_sfapi_client() + create_sfapi_client(fake_client_id_path, fake_client_secret_path) # ---------------------------- # Fixture for Mocking SFAPI Client @@ -123,7 +97,7 @@ def mock_sfapi_client(): """ Mock the sfapi_client.Client class with necessary methods. """ - with patch("orchestration.flows.bl832.nersc.Client") as MockClient: + with patch("orchestration.sfapi.Client") as MockClient: mock_client_instance = MockClient.return_value # Mock the user method @@ -151,7 +125,7 @@ def mock_config832(): """ Mock the Config832 class to provide necessary configurations. """ - with patch("orchestration.flows.bl832.nersc.Config832") as MockConfig: + with patch("orchestration.flows.bl832.config.Config832") as MockConfig: mock_config = MockConfig.return_value mock_config.harbor_images832 = { "recon_image": "mock_recon_image", diff --git a/orchestration/_tests/test_transfer_controller.py b/orchestration/_tests/test_transfer_controller.py index a1cae916..fad96601 100644 --- a/orchestration/_tests/test_transfer_controller.py +++ b/orchestration/_tests/test_transfer_controller.py @@ -2,7 +2,8 @@ import pytest from pytest_mock import MockFixture -from unittest.mock import MagicMock, patch +import time +from unittest.mock import MagicMock, patch, Mock from uuid import uuid4 import globus_sdk @@ -12,6 +13,11 @@ from .test_globus import MockTransferClient +@pytest.fixture(autouse=True) +def fast_sleep(monkeypatch): + """Patch time.sleep to return immediately to speed up tests.""" + monkeypatch.setattr(time, "sleep", lambda x: None) + @pytest.fixture(autouse=True, scope="session") def prefect_test_fixture(): @@ -48,10 +54,18 @@ def transfer_controller_module(): get_transfer_controller, CopyMethod, ) + from orchestration.hpss import ( + CFSToHPSSTransferController, + HPSSToCFSTransferController, + HPSSEndpoint, + ) return { "FileSystemEndpoint": FileSystemEndpoint, "GlobusTransferController": GlobusTransferController, "SimpleTransferController": SimpleTransferController, + "CFSToHPSSTransferController": CFSToHPSSTransferController, + "HPSSToCFSTransferController": HPSSToCFSTransferController, + "HPSSEndpoint": HPSSEndpoint, "get_transfer_controller": get_transfer_controller, "CopyMethod": CopyMethod, } @@ -103,7 +117,8 @@ def mock_file_system_endpoint(transfer_controller_module): FileSystemEndpoint = transfer_controller_module["FileSystemEndpoint"] endpoint = FileSystemEndpoint( name="mock_filesystem_endpoint", - root_path="/mock_fs_root" + root_path="/mock_fs_root", + uri="mock_uri" ) return endpoint @@ -191,7 
+206,8 @@ def test_globus_transfer_controller_copy_failure( mocker.patch('prefect.blocks.system.Secret.load', return_value=MockSecretClass()) - with patch("orchestration.transfer_controller.start_transfer", return_value=(False, "mock-task-id")) as mock_start_transfer: + with patch("orchestration.transfer_controller.start_transfer", + return_value=(False, "mock-task-id")) as mock_start_transfer: controller = GlobusTransferController(mock_config832) result = controller.copy( file_path="some_dir/test_file.txt", @@ -226,6 +242,7 @@ def test_globus_transfer_controller_copy_exception( assert result is False, "Expected False when TransferAPIError is raised." mock_start_transfer.assert_called_once() + def test_globus_transfer_controller_with_metrics( mock_config832, mock_globus_endpoint, transfer_controller_module ): @@ -235,30 +252,30 @@ def test_globus_transfer_controller_with_metrics( GlobusTransferController = transfer_controller_module["GlobusTransferController"] from orchestration.prometheus_utils import PrometheusMetrics mock_prometheus = MagicMock(spec=PrometheusMetrics) - + with patch("orchestration.transfer_controller.start_transfer", return_value=(True, "mock-task-id")) as mock_start_transfer: # Create the controller with mock prometheus metrics controller = GlobusTransferController(mock_config832, prometheus_metrics=mock_prometheus) - + # Set up mock for get_transfer_file_info mock_transfer_info = {"bytes_transferred": 1024 * 1024} # 1MB controller.get_transfer_file_info = MagicMock(return_value=mock_transfer_info) - + # Execute the copy operation result = controller.copy( file_path="some_dir/test_file.txt", source=mock_globus_endpoint, destination=mock_globus_endpoint, ) - + # Verify transfer was successful assert result is True mock_start_transfer.assert_called_once() - + # Verify metrics were collected and pushed controller.get_transfer_file_info.assert_called_once_with("mock-task-id") mock_prometheus.push_metrics_to_prometheus.assert_called_once() - + # Verify the metrics data metrics_data = mock_prometheus.push_metrics_to_prometheus.call_args[0][0] assert metrics_data["bytes_transferred"] == 1024 * 1024 @@ -275,6 +292,7 @@ def test_globus_transfer_controller_with_metrics( # Tests for SimpleTransferController # -------------------------------------------------------------------------- + def test_simple_transfer_controller_no_file_path( mock_config832, mock_file_system_endpoint, transfer_controller_module ): @@ -299,53 +317,361 @@ def test_simple_transfer_controller_no_source_or_destination(mock_config832, tra assert result is False, "Expected False when either source or destination is None." -def test_simple_transfer_controller_copy_success( - mock_config832, mock_file_system_endpoint, transfer_controller_module +def test_simple_transfer_controller_copy_success_with_real_files( + tmp_path, mock_config832, transfer_controller_module ): SimpleTransferController = transfer_controller_module["SimpleTransferController"] - with patch("os.system", return_value=0) as mock_os_system: - controller = SimpleTransferController(mock_config832) - result = controller.copy( - file_path="some_dir/test_file.txt", - source=mock_file_system_endpoint, - destination=mock_file_system_endpoint, - ) - assert result is True, "Expected True when os.system returns 0." - mock_os_system.assert_called_once() - command_called = mock_os_system.call_args[0][0] - assert "cp -r" in command_called, "Expected cp command in os.system call." 
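+    # Unlike the removed mock-based test above, this version exercises the real copy path:
+    # it creates actual files under pytest's tmp_path and verifies the copied file's contents.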
+ # Create real directory structure + source_dir = tmp_path / "source" + dest_dir = tmp_path / "destination" + source_dir.mkdir() + dest_dir.mkdir() + # Create actual source file + source_file = source_dir / "experiment" / "data.txt" + source_file.parent.mkdir(parents=True) + source_file.write_text("test content") -def test_simple_transfer_controller_copy_failure( - mock_config832, mock_file_system_endpoint, transfer_controller_module + # Setup endpoints with real paths + source_endpoint = Mock() + source_endpoint.name = "source_storage" + source_endpoint.root_path = str(source_dir) + + dest_endpoint = Mock() + dest_endpoint.name = "dest_storage" + dest_endpoint.root_path = str(dest_dir) + + controller = SimpleTransferController(mock_config832) + result = controller.copy( + file_path="experiment/data.txt", + source=source_endpoint, + destination=dest_endpoint, + ) + + # Verify the result and actual file operations + assert result is True + + # Check that the file actually exists at destination + dest_file = dest_dir / "experiment" / "data.txt" + assert dest_file.exists(), "File should be copied to destination" + assert dest_file.read_text() == "test content", "File content should match" + + +# def test_simple_transfer_controller_copy_failure( +# mock_config832, mock_file_system_endpoint, transfer_controller_module +# ): +# SimpleTransferController = transfer_controller_module["SimpleTransferController"] +# with patch("orchestration.transfer_controller.os.path.exists", return_value=True): # ensure source file exists +# with patch("orchestration.transfer_controller.os.system", return_value=1) as mock_os_system: +# controller = SimpleTransferController(mock_config832) +# result = controller.copy( +# file_path="some_dir/test_file.txt", +# source=mock_file_system_endpoint, +# destination=mock_file_system_endpoint, +# ) +# assert result is False, "Expected False when os.system returns non-zero." +# mock_os_system.assert_called_once() +# command_called = mock_os_system.call_args[0][0] +# assert "cp -r" in command_called, "Expected cp command in os.system call." + +def test_simple_transfer_controller_copy_command_failure( + tmp_path, mock_config832, transfer_controller_module ): SimpleTransferController = transfer_controller_module["SimpleTransferController"] - with patch("os.system", return_value=1) as mock_os_system: + + # Create real directory structure and source file + source_dir = tmp_path / "source" + dest_dir = tmp_path / "destination" + source_dir.mkdir() + dest_dir.mkdir() + + # Create the actual source file + source_file = source_dir / "some_dir" / "test_file.txt" + source_file.parent.mkdir(parents=True) + source_file.write_text("test content") + + # Setup endpoints with real paths + source_endpoint = Mock() + source_endpoint.root_path = str(source_dir) + + dest_endpoint = Mock() + dest_endpoint.root_path = str(dest_dir) + + # Mock only os.system to simulate command failure + with patch("orchestration.transfer_controller.os.system", return_value=1) as mock_os_system: controller = SimpleTransferController(mock_config832) result = controller.copy( file_path="some_dir/test_file.txt", - source=mock_file_system_endpoint, - destination=mock_file_system_endpoint, + source=source_endpoint, + destination=dest_endpoint, ) + # Verify the copy failed assert result is False, "Expected False when os.system returns non-zero." 
+ + # Verify the command was called correctly mock_os_system.assert_called_once() command_called = mock_os_system.call_args[0][0] - assert "cp -r" in command_called, "Expected cp command in os.system call." + # More specific assertions about the command + expected_source = str(source_dir / "some_dir" / "test_file.txt") + expected_dest = str(dest_dir / "some_dir" / "test_file.txt") + expected_command = f"cp -r '{expected_source}' '{expected_dest}'" + assert command_called == expected_command, f"Expected exact command: {expected_command}" -def test_simple_transfer_controller_copy_exception( - mock_config832, mock_file_system_endpoint, transfer_controller_module + # Verify the destination file was NOT created (since command failed) + dest_file = dest_dir / "some_dir" / "test_file.txt" + assert not dest_file.exists(), "File should not exist when copy command fails" + + +def test_simple_transfer_controller_copy_exception_handling( + tmp_path, mock_config832, transfer_controller_module ): SimpleTransferController = transfer_controller_module["SimpleTransferController"] - with patch("os.system", side_effect=Exception("Mocked cp error")) as mock_os_system: + + # Create real directory structure and source file + source_dir = tmp_path / "source" + dest_dir = tmp_path / "destination" + source_dir.mkdir() + dest_dir.mkdir() + + # Create the actual source file + source_file = source_dir / "some_dir" / "test_file.txt" + source_file.parent.mkdir(parents=True) + source_file.write_text("test content") + + # Setup endpoints with real paths + source_endpoint = Mock() + source_endpoint.root_path = str(source_dir) + + dest_endpoint = Mock() + dest_endpoint.root_path = str(dest_dir) + + # Mock os.system to raise an exception + with patch("orchestration.transfer_controller.os.system", side_effect=Exception("Mocked cp error")) as mock_os_system: controller = SimpleTransferController(mock_config832) result = controller.copy( file_path="some_dir/test_file.txt", - source=mock_file_system_endpoint, - destination=mock_file_system_endpoint, + source=source_endpoint, + destination=dest_endpoint, ) + # Verify the copy failed due to exception assert result is False, "Expected False when an exception is raised during copy." 
+ + # Verify the system command was attempted mock_os_system.assert_called_once() + + # Verify the command that would have been called + command_called = mock_os_system.call_args[0][0] + expected_source = str(source_dir / "some_dir" / "test_file.txt") + expected_dest = str(dest_dir / "some_dir" / "test_file.txt") + expected_command = f"cp -r '{expected_source}' '{expected_dest}'" + assert command_called == expected_command, f"Expected exact command: {expected_command}" + + # Verify the destination file was NOT created (since exception occurred) + dest_file = dest_dir / "some_dir" / "test_file.txt" + assert not dest_file.exists(), "File should not exist when copy operation raises exception" + + # Verify destination directory was still created (this happens before the exception) + dest_parent_dir = dest_dir / "some_dir" + assert dest_parent_dir.exists(), "Destination directory should have been created before exception" + + +# -------------------------------------------------------------------------- +# Tests for CFSToHPSSTransferController +# -------------------------------------------------------------------------- + +def test_cfs_to_hpss_transfer_controller_success(mock_config832, transfer_controller_module, mocker: MockFixture): + """ + Test a successful copy() operation using CFSToHPSSTransferController. + We simulate a successful job submission and completion. + """ + CFSToHPSSTransferController = transfer_controller_module["CFSToHPSSTransferController"] + HPSSEndpoint = transfer_controller_module["HPSSEndpoint"] + FileSystemEndpoint = transfer_controller_module["FileSystemEndpoint"] + + # Create mock endpoints for source (CFS) and destination (HPSS) + source_endpoint = FileSystemEndpoint("mock_cfs_source", "/mock_cfs_source", "mock.uri") + destination_endpoint = HPSSEndpoint("mock_hpss_dest", "/mock_hpss_dest", "mock.uri") + + # Create a fake job object that simulates successful completion. + fake_job = MagicMock() + fake_job.jobid = "12345" + fake_job.state = "COMPLETED" + fake_job.complete.return_value = None + + # Create a fake compute object that returns the fake job. + fake_compute = MagicMock() + fake_compute.submit_job.return_value = fake_job + + # Create a fake client whose compute() returns our fake_compute. + fake_client = MagicMock() + fake_client.compute.return_value = fake_compute + + controller = CFSToHPSSTransferController(fake_client, mock_config832) + result = controller.copy( + file_path="test_dir/test_file.txt", + source=source_endpoint, + destination=destination_endpoint, + ) + assert result is True, "Expected True when CFSToHPSSTransferController transfer completes successfully." + fake_compute.submit_job.assert_called_once() + fake_job.complete.assert_called_once() + + +def test_cfs_to_hpss_transfer_controller_failure(mock_config832, transfer_controller_module): + """ + Test a failing copy() operation using CFSToHPSSTransferController when job submission raises an exception. + """ + CFSToHPSSTransferController = transfer_controller_module["CFSToHPSSTransferController"] + HPSSEndpoint = transfer_controller_module["HPSSEndpoint"] + FileSystemEndpoint = transfer_controller_module["FileSystemEndpoint"] + + source_endpoint = FileSystemEndpoint("mock_cfs_source", "/mock_cfs_source", "mock.uri") + destination_endpoint = HPSSEndpoint("mock_hpss_dest", "/mock_hpss_dest", "mock.uri") + + # Create a fake client whose compute().submit_job raises an exception. 
+ fake_client = MagicMock() + fake_compute = MagicMock() + fake_compute.submit_job.side_effect = Exception("Job submission failed") + fake_client.compute.return_value = fake_compute + + controller = CFSToHPSSTransferController(fake_client, mock_config832) + result = controller.copy( + file_path="test_dir/test_file.txt", + source=source_endpoint, + destination=destination_endpoint, + ) + assert result is False, "Expected False when CFSToHPSSTransferController transfer fails due to job submission error." + fake_compute.submit_job.assert_called_once() + + +# -------------------------------------------------------------------------- +# Tests for HPSSToCFSTransferController +# -------------------------------------------------------------------------- + +def test_hpss_to_cfs_transfer_controller_success(mock_config832, transfer_controller_module, mocker: MockFixture): + """ + Test a successful copy() operation using HPSSToCFSTransferController. + We simulate a successful job submission and completion. + """ + HPSSToCFSTransferController = transfer_controller_module["HPSSToCFSTransferController"] + HPSSEndpoint = transfer_controller_module["HPSSEndpoint"] + FileSystemEndpoint = transfer_controller_module["FileSystemEndpoint"] + + source_endpoint = HPSSEndpoint("mock_hpss_source", "/mock_hpss_source", "mock.uri") + destination_endpoint = FileSystemEndpoint("mock_cfs_dest", "/mock_cfs_dest", "mock.uri") + + # Create a fake job object for a successful transfer. + fake_job = MagicMock() + fake_job.jobid = "67890" + fake_job.state = "COMPLETED" + fake_job.complete.return_value = None + + fake_compute = MagicMock() + fake_compute.submit_job.return_value = fake_job + + fake_client = MagicMock() + fake_client.compute.return_value = fake_compute + + controller = HPSSToCFSTransferController(fake_client, mock_config832) + result = controller.copy( + file_path="archive.tar", + source=source_endpoint, + destination=destination_endpoint, + files_to_extract=["file1.txt", "file2.txt"] + ) + assert result is True, "Expected True when HPSSToCFSTransferController transfer completes successfully." + fake_compute.submit_job.assert_called_once() + fake_job.complete.assert_called_once() + + +def test_hpss_to_cfs_transfer_controller_missing_params(mock_config832, transfer_controller_module): + """ + Test that HPSSToCFSTransferController.copy() returns False when required parameters are missing. + """ + HPSSToCFSTransferController = transfer_controller_module["HPSSToCFSTransferController"] + fake_client = MagicMock() # Client is not used because the method returns early. + controller = HPSSToCFSTransferController(fake_client, mock_config832) + + result = controller.copy(file_path=None, source=None, destination=None) + assert result is False, "Expected False when required parameters are missing." + + +def test_hpss_to_cfs_transfer_controller_job_failure(mock_config832, transfer_controller_module): + """ + Test HPSSToCFSTransferController.transfer() returns False when job.complete() raises an exception. 
+ """ + HPSSToCFSTransferController = transfer_controller_module["HPSSToCFSTransferController"] + HPSSEndpoint = transfer_controller_module["HPSSEndpoint"] + FileSystemEndpoint = transfer_controller_module["FileSystemEndpoint"] + + source_endpoint = HPSSEndpoint("mock_hpss_source", "/mock_hpss_source", "mock.uri") + destination_endpoint = FileSystemEndpoint("mock_cfs_dest", "/mock_cfs_dest", "mock.uri") + + fake_job = MagicMock() + fake_job.jobid = "67891" + fake_job.state = "FAILED" + fake_job.complete.side_effect = Exception("Job completion failed") + + fake_compute = MagicMock() + fake_compute.submit_job.return_value = fake_job + + fake_client = MagicMock() + fake_client.compute.return_value = fake_compute + + controller = HPSSToCFSTransferController(fake_client, mock_config832) + result = controller.copy( + file_path="archive.tar", + source=source_endpoint, + destination=destination_endpoint, + ) + assert result is False, "Expected False when HPSSToCFSTransferController job fails to complete." + fake_compute.submit_job.assert_called_once() + fake_job.complete.assert_called_once() + + +def test_hpss_to_cfs_transfer_controller_recovery(mock_config832, transfer_controller_module): + """ + Test HPSSToCFSTransferController recovery scenario when initial job.complete() fails with 'Job not found:'. + The controller should attempt to recover the job and complete successfully. + """ + HPSSToCFSTransferController = transfer_controller_module["HPSSToCFSTransferController"] + HPSSEndpoint = transfer_controller_module["HPSSEndpoint"] + FileSystemEndpoint = transfer_controller_module["FileSystemEndpoint"] + + source_endpoint = HPSSEndpoint("mock_hpss_source", "/mock_hpss_source", "mock.uri") + destination_endpoint = FileSystemEndpoint("mock_cfs_dest", "/mock_cfs_dest", "mock.uri") + + # Fake job that fails initially with a "Job not found:" error. + fake_job_initial = MagicMock() + fake_job_initial.jobid = "11111" + fake_job_initial.state = "UNKNOWN" + fake_job_initial.complete.side_effect = Exception("Job not found: 11111") + + fake_compute = MagicMock() + fake_compute.submit_job.return_value = fake_job_initial + + # When recovery is attempted, return a job that completes successfully. + fake_job_recovered = MagicMock() + fake_job_recovered.jobid = "11111" + fake_job_recovered.state = "COMPLETED" + fake_job_recovered.complete.return_value = None + + fake_client = MagicMock() + fake_client.compute.return_value = fake_compute + fake_client.perlmutter.job.return_value = fake_job_recovered + + controller = HPSSToCFSTransferController(fake_client, mock_config832) + result = controller.copy( + file_path="archive.tar", + source=source_endpoint, + destination=destination_endpoint, + ) + assert result is True, "Expected True after successful job recovery in HPSSToCFSTransferController." 
+ fake_compute.submit_job.assert_called_once() + fake_job_initial.complete.assert_called_once() + fake_client.perlmutter.job.assert_called_once_with(jobid="11111") + fake_job_recovered.complete.assert_called_once() diff --git a/orchestration/config.py b/orchestration/config.py index 79083375..39946302 100644 --- a/orchestration/config.py +++ b/orchestration/config.py @@ -1,10 +1,18 @@ -import collections +from abc import ABC, abstractmethod import builtins -from pathlib import Path +import collections import os - +from pathlib import Path +import re import yaml +from dynaconf import Dynaconf + +# TODO: Add secrets management +settings = Dynaconf( + settings_files=["config.yml"], +) + def get_config(): return read_config(config_file=Path(__file__).parent.parent / "config.yml") @@ -41,3 +49,41 @@ def expand_environment_variables(config): return type(config)([expand_environment_variables(v) for v in config]) else: return config + + +class BeamlineConfig(ABC): + """ + Base class for beamline configurations. + + This class reads the common configuration from disk, builds endpoints and apps, + and initializes the Globus Transfer and Flows clients. Beamline-specific subclasses + must override the _setup_specific_config() method to assign their own attributes. + + Attributes: + beamline_id (str): Beamline number identifier with periods (e.g. "8.3.2" or "7.3.3"). + config (dict): The loaded configuration dictionary. + """ + + def __init__( + self, + beamline_id: str + ) -> None: + pattern = r'^\d+(\.\d+)+$' + if not re.match(pattern, beamline_id): + raise ValueError(f"Invalid beamline_id format: '{beamline_id}'." + f"Expected format: digits separated by dots (e.g., '8.3.2', '7.0.1.2', '12.3')") + self.beamline_id = beamline_id + self.config = settings + self._beam_specific_config() + self.config = None # Clear reference to config after beam-specific setup + + @abstractmethod + def _beam_specific_config(self) -> None: + """ + Set up beamline-specific configuration attributes. + + This method must be implemented by subclasses. Typical assignments + include selecting endpoints (using keys that include the beamline ID), + and other beamline-specific parameters. 
+ """ + pass diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index ff19a9c3..55aecd95 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -1,12 +1,17 @@ from globus_sdk import TransferClient -from orchestration.globus import transfer, flows +from orchestration.config import BeamlineConfig +from orchestration.globus import flows, transfer -class Config832: + +class Config832(BeamlineConfig): def __init__(self) -> None: - config = transfer.get_config() - self.endpoints = transfer.build_endpoints(config) - self.apps = transfer.build_apps(config) + super().__init__(beamline_id="8.3.2") + + def _beam_specific_config(self) -> None: + # config = transfer.get_config() + self.endpoints = transfer.build_endpoints(self.config) + self.apps = transfer.build_apps(self.config) self.tc: TransferClient = transfer.init_transfer_client(self.apps["als_transfer"]) self.flow_client = flows.get_flows_client() self.spot832 = self.endpoints["spot832"] @@ -22,5 +27,6 @@ def __init__(self) -> None: self.nersc832_alsdev_recon_scripts = self.endpoints["nersc832_alsdev_recon_scripts"] self.alcf832_raw = self.endpoints["alcf832_raw"] self.alcf832_scratch = self.endpoints["alcf832_scratch"] - self.scicat = config["scicat"] - self.ghcr_images832 = config["ghcr_images832"] + self.hpss_alsdev = self.config["hpss_alsdev"] + self.scicat = self.config["scicat"] + self.ghcr_images832 = self.config["ghcr_images832"] diff --git a/orchestration/flows/bl832/dispatcher.py b/orchestration/flows/bl832/dispatcher.py index cd9da2f0..31a30dab 100644 --- a/orchestration/flows/bl832/dispatcher.py +++ b/orchestration/flows/bl832/dispatcher.py @@ -1,13 +1,27 @@ import asyncio +from datetime import datetime +from dateutil.parser import isoparse + from prefect import flow, task, get_run_logger from prefect.blocks.system import JSON from prefect.deployments.deployments import run_deployment from pydantic import BaseModel, ValidationError, Field -from typing import Any, Optional, Union +from typing import Any, List, Optional, Union from orchestration.flows.bl832.move import process_new_832_file_task +from orchestration.flows.bl832.config import Config832 +from orchestration.flows.bl832.scicat_ingestor import TomographyIngestorController +# ------------------------------------------------------------------------------------------------------------------------ +# Decision Flow: Dispatcher +# ------------------------------------------------------------------------------------------------------------------------ +# This flow reads decision settings and launches tasks accordingly. +# ------------------------------------------------------------------------------------------------------------------------ +# The dispatcher flow reads decision settings and launches tasks accordingly. +# It first runs the new_832_file_flow/new_file_832 flow synchronously. +# Then, it prepares the ALCF and NERSC flows to run asynchronously based on the decision settings. +# ------------------------------------------------------------------------------------------------------------------------ class FlowParameterMapper: """ Class to define and map the parameters required for each flow. 
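# A minimal sketch (illustrative only, not part of the changes) of how the beamline_id
# validation added to BeamlineConfig in orchestration/config.py above behaves:
# the pattern requires digits separated by at least one dot.
import re

BEAMLINE_ID_PATTERN = r'^\d+(\.\d+)+$'  # same pattern used in BeamlineConfig.__init__

for candidate in ["8.3.2", "7.0.1.2", "12.3", "832", "8.3.2."]:
    if re.match(BEAMLINE_ID_PATTERN, candidate):
        print(f"{candidate}: accepted")
    else:
        print(f"{candidate}: rejected (BeamlineConfig raises ValueError)")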
@@ -105,7 +119,7 @@ async def dispatcher(
     file_path: Optional[str] = None,
     is_export_control: bool = False,
-    config: Optional[Union[dict, Any]] = None,
+    config: Optional[Union[dict, Any]] = None
 ) -> None:
     """
     Dispatcher flow that reads decision settings and launches tasks accordingly.
@@ -163,25 +177,286 @@ async def dispatcher(
     return None


-if __name__ == "__main__":
+# ---------------------------------------------------------------------------
+# Tape Transfer Flow: Archive a single 832 project (raw)
+# ---------------------------------------------------------------------------
+@flow(name="archive_832_project_dispatcher")
+def archive_832_project_dispatcher(
+    config: Config832,
+    file_path: Optional[Union[str, List[str]]] = None,
+    scicat_id: Optional[Union[str, List[str]]] = None
+) -> None:
+    """
+    Flow to archive one or more beamline 832 projects to tape.
+    Accepts a single file path (str) or a list of file paths and, for each one,
+    triggers the cfs_to_hpss_flow deployment (which uses the CFSToHPSSTransferController).
+
+    Parameters
+    ----------
+    config : Config832
+        Configuration object containing endpoint details.
+    file_path : Optional[Union[str, List[str]]]
+        A single file path or a list of file paths to be archived (path on CFS).
+    scicat_id : Optional[Union[str, List[str]]]
+        Optional SciCat ID(s) for the project(s). Must be in the same order as file_path(s).
+    """
+    logger = get_run_logger()
+
+    # Normalize single-string inputs into lists so we always iterate over whole paths.
+    if isinstance(file_path, str):
+        file_path = [file_path]
+    if isinstance(scicat_id, str):
+        scicat_id = [scicat_id]
+
+    # Build pairs with strict 1:1 length check (only if scicat_id provided)
+    if scicat_id is None:
+        projects = zip(file_path, [None] * len(file_path))
+    else:
+        if len(file_path) != len(scicat_id):
+            raise ValueError(
+                f"Length mismatch: file_path({len(file_path)}) != scicat_id({len(scicat_id)})"
+            )
+        projects = zip(file_path, scicat_id)
+
+    for fp, scid in projects:
+        try:
+            run_deployment(
+                "cfs_to_hpss_flow/cfs_to_hpss_flow",
+                {
+                    "file_path": fp,
+                    "source": config.nersc832,  # NERSC FileSystem Endpoint
+                    "destination": config.hpss_alsdev,  # HPSS Endpoint
+                    "config": config
+                }
+            )
+            logger.info(f"Scheduled tape transfer for project: {fp}")
+
+        except Exception as e:
+            logger.error(f"Error scheduling transfer for {fp}: {e}")
+
+        # Ingest the project into SciCat if scicat_id is provided.
+        ingestor = TomographyIngestorController(
+            config=config,
+            scicat_client=config.scicat
+        )
+
+        if scid:
+            logger.info("Ingesting new file path into SciCat...")
+
+            try:
+                ingestor.add_new_dataset_location(
+                    dataset_id=scid,
+                    datafile_path=config.hpss_alsdev.root_path + "/" + fp.split("/")[-1],
+                    source_folder_host="HPSS"
+                )
+                logger.info(f"Updated SciCat dataset {scid} with new location for project: {fp}")
+            except Exception as e:
+                logger.error(f"Error updating dataset location for project {fp} into SciCat: {e}")
+
+
+# ---------------------------------------------------------------------------
+# Tape Transfer Flow: Process pending projects
+# ---------------------------------------------------------------------------
+# Scheduled to run every 6 months to process tape transfers.
+# ---------------------------------------------------------------------------
+@flow(name="archive_832_projects_from_previous_cycle_dispatcher")
+def archive_832_projects_from_previous_cycle_dispatcher(
+    config: Config832,
+) -> None:
     """
-    This script defines the flow for the decision making process of the BL832 beamline.
-    It first sets up the decision settings, then executes the decision flow to run specific sub-flows as needed.
+    Archives the previous cycle's projects from the NERSC / CFS / 8.3.2 / RAW directory.
+
+    The schedule is as follows:
+    - On/around January 2 (assuming NERSC is up):
+        Archive projects with modification dates between July 16 and December 31 (previous year)
+    - On/around July 4 (assuming NERSC is up):
+        Archive projects with modification dates between January 1 and July 15 (current year)
+
+    The flow lists projects via Globus Transfer's operation_ls, filters them based on modification times,
+    and then calls the cfs_to_hpss_flow for each eligible project.
     """
+    logger = get_run_logger()
+    now = datetime.now()
+
+    # Determine the archive window from today's date.
+    # -------------------------
+    # Compute "last complete cycle" inline (no helpers).
+    # -------------------------
+    if (now.month < 7) or (now.month == 7 and now.day <= 15):
+        # Before or on Jul 15: most recent completed cycle is previous year's H2.
+        y = now.year - 1
+        label = "Cycle 2"
+        archive_start = datetime(y, 7, 16, 0, 0, 0)
+        archive_end = datetime(y, 12, 31, 23, 59, 59)
+    else:
+        # Jul 16 or later: most recent completed cycle is current year's H1.
+        y = now.year
+        label = "Cycle 1"
+        archive_start = datetime(y, 1, 1, 0, 0, 0)
+        archive_end = datetime(y, 7, 15, 23, 59, 59)
+
+    logger.info(f"Archive window for {label}: {archive_start} to {archive_end}")
+
+    # List projects using Globus Transfer's operation_ls.
     try:
-        # Setup decision settings based on input parameters
-        setup_decision_settings(alcf_recon=True, nersc_recon=True, new_file_832=True)
-        # Run the main decision flow with the specified parameters
-        # asyncio.run(dispatcher(
-        #     config={},  # PYTEST, ALCF, NERSC
-        #     is_export_control=False,  # ALCF & MOVE
-        #     folder_name="folder",  # ALCF
-        #     file_name="file",  # ALCF
-        #     file_path="/path/to/file",  # MOVE
-        #     send_to_alcf=True,  # ALCF
-        #     send_to_nersc=True,  # MOVE
-        #     )
-        # )
+        # config.tc: configured Globus Transfer client.
+        # config.nersc832.uuid: the NERSC endpoint UUID.
+        # config.nersc832_alsdev_raw.root_path: the NERSC CFS directory path.
+        projects = config.tc.operation_ls(
+            endpoint_id=config.nersc832.uuid,
+            path=config.nersc832_alsdev_raw.root_path,
+            orderby=["name", "last_modified"],
+        ).get("DATA", [])
     except Exception as e:
-        logger = get_run_logger()
-        logger.error(f"Failed to execute main flow: {e}")
+        logger.error(f"Failed to list projects: {e}")
+        return
+
+    logger.info(f"Found {len(projects)} items in the {config.nersc832_alsdev_raw.root_path} directory.")
+
+    # Process each project: check its modification time and trigger transfer if within the archive window.
+    for project in projects:
+        project_name = project.get("name")
+        last_mod_str = project.get("last_modified")
+        if not project_name or not last_mod_str:
+            logger.warning(f"Skipping project due to missing name or last_modified: {project}")
+            continue
+
+        try:
+            last_mod = isoparse(last_mod_str)
+        except Exception as e:
+            logger.warning(f"Error parsing modification time for project {project_name}: {e}")
+            continue
+
+        if archive_start <= last_mod <= archive_end:
+            logger.info(f"Project {project_name} last modified at {last_mod} is within the archive window.")
+            try:
+                # Call the transfer flow for this project.
+                # This should be blocking to ensure sequential processing (HPSS has limitations for concurrent transfers).
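+                # run_deployment is called without a timeout, so (with Prefect's default behavior)
+                # it should wait for the cfs_to_hpss_flow run to finish before the next project is
+                # submitted, keeping the HPSS transfers sequential.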
+                run_deployment(
+                    "cfs_to_hpss_flow/cfs_to_hpss_flow",
+                    {
+                        "file_path": config.nersc832_alsdev_raw.root_path + "/" + project['name'],
+                        "source": config.nersc832,
+                        "destination": config.hpss_alsdev,
+                        "config": config
+                    }
+                )
+
+            except Exception as e:
+                logger.error(
+                    f"Error archiving project {project_name}: {e}. Logs are available on NERSC at "
+                    f"/global/cfs/cdirs/als/data_mover/hpss_transfer_logs/{config.beamline_id}/{project_name}_to_hpss_*.log and "
+                    f"/global/cfs/cdirs/als/data_mover/hpss_transfer_logs/{config.beamline_id}/{project_name}_to_hpss_*.err"
+                )
+            # Ingest the project into SciCat.
+            logger.info("Ingesting new file path into SciCat...")
+            ingestor = TomographyIngestorController(
+                config=config,
+                scicat_client=config.scicat
+            )
+
+            # Loop over each scan in the project and update its path in SciCat.
+            for scan in config.tc.operation_ls(
+                endpoint_id=config.nersc832.uuid,
+                path=config.nersc832_alsdev_raw.root_path + "/" + project['name'],
+                orderby=["name", "last_modified"],
+            ):
+                logger.info(f"Found scan: {scan['name']}")
+
+                logger.info("Looking for dataset in SciCat...")
+
+                scicat_id = None
+                try:
+                    scicat_id = ingestor._find_dataset(
+                        file_name=scan['name']
+                    )
+                    logger.info(f"Found existing dataset in SciCat with ID: {scicat_id}")
+                except Exception as e:
+                    logger.warning(f"Error finding dataset in SciCat for scan {scan['name']}: {e}")
+
+                logger.info("Updating dataset location in SciCat...")
+                try:
+                    if scicat_id:
+                        ingestor.add_new_dataset_location(
+                            dataset_id=scicat_id,
+                            datafile_path=config.hpss_alsdev.root_path + "/" + project['name'] + "/" + scan['name'],
+                            source_folder_host="HPSS"
+                        )
+                    else:
+                        logger.warning(f"Skipping dataset location update for scan {scan['name']} as SciCat ID was not found.")
+                except Exception as e:
+                    logger.warning(f"Error updating dataset location for project {project_name} into SciCat: {e}")
+        else:
+            logger.info(f"Project {project_name} last modified at {last_mod} is outside the archive window.")
+
+
+# ---------------------------------------------------------------------------
+# Tape Transfer Flow: Archive all 832 projects (raw)
+# ---------------------------------------------------------------------------
+@flow(name="archive_all_832_raw_projects_dispatcher")
+def archive_all_832_raw_projects_dispatcher(
+    config: Config832,
+) -> None:
+    """
+    Archive every project in the NERSC CFS 8.3.2 raw directory to HPSS.
+    For each project, the cfs_to_hpss_flow deployment is triggered and the new HPSS
+    location of each scan is recorded in SciCat.
+    """
+    logger = get_run_logger()
+
+    logger.info(f"Checking for projects at {config.nersc832_alsdev_raw.root_path} to archive to tape...")
+
+    # ARCHIVE ALL PROJECTS IN THE NERSC / CFS / 8.3.2 / RAW DIRECTORY
+    # Use the Globus SDK TransferClient (config.tc) to list all projects.
+    # Note this is different from the controller classes in this repo.
+    for project in config.tc.operation_ls(
+        endpoint_id=config.nersc832.uuid,
+        path=config.nersc832_alsdev_raw.root_path,
+        orderby=["name", "last_modified"],
+    ):
+        logger.info(f"Found project: {project['name']}")
+        try:
+            # Call the transfer flow for this project.
+            # This should be blocking to ensure sequential processing (HPSS has limitations for concurrent transfers).
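+            # Failures here are logged and the loop continues with the next project, so one
+            # failed archive does not stop the full pass; the SciCat location update below still runs.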
+ run_deployment( + "cfs_to_hpss_flow/cfs_to_hpss_flow", + { + "file_path": config.nersc832_alsdev_raw.root_path + "/" + project['name'], + "source": config.nersc832, # NERSC FileSystem Endpoint (not globus) + "destination": config.hpss_alsdev, # HPSS Endpoint + "config": config + } + ) + except Exception as e: + logger.error( + f"Error archiving project {project['name']}: {e}. Logs are available on NERSC at " + f"/global/cfs/cdirs/als/data_mover/hpss_transfer_logs/{config.beamline_id}/{project['name']}_to_hpss_*.log" + f"/global/cfs/cdirs/als/data_mover/hpss_transfer_logs/{config.beamline_id}/{project['name']}_to_hpss_*.err" + ) + + ingestor = TomographyIngestorController( + config=config, + scicat_client=config.scicat + ) + + # Update the path for each scan within the project into SciCat. + for scan in config.tc.operation_ls( + endpoint_id=config.nersc832.uuid, + path=config.nersc832_alsdev_raw.root_path + "/" + project['name'], + orderby=["name", "last_modified"], + ): + try: + logger.info(f"Found scan: {scan['name']}") + + logger.info("Ingesting new file path into SciCat...") + scicat_id = ingestor._find_dataset( + file_name=scan['name'] + ) + except Exception as e: + logger.warning(f"Error finding dataset for scan {scan['name']}: {e}") + try: + if scicat_id: + logger.info(f"Found existing dataset in SciCat with ID: {scicat_id}") + ingestor.add_new_dataset_location( + dataset_id=scicat_id, + datafile_path=config.hpss_alsdev.root_path + "/" + project['name'] + "/" + scan['name'], + source_folder_host="HPSS" + ) + else: + logger.warning(f"Skipping dataset location update for scan {scan['name']} as SciCat ID was not found.") + except Exception as e: + logger.error(f"Error updating dataset location for project {project} into SciCat: {e}") diff --git a/orchestration/flows/bl832/ingest_tomo832.py b/orchestration/flows/bl832/ingest_tomo832.py index 096039e2..cbd454e5 100644 --- a/orchestration/flows/bl832/ingest_tomo832.py +++ b/orchestration/flows/bl832/ingest_tomo832.py @@ -105,7 +105,8 @@ def ingest( file_path, dataset_id, INGEST_STORAGE_ROOT_PATH, - INGEST_SOURCE_ROOT_PATH) + INGEST_SOURCE_ROOT_PATH + ) thumbnail_file = build_thumbnail(file["/exchange/data"][0]) encoded_thumbnail = encode_image_2_thumbnail(thumbnail_file) @@ -113,7 +114,8 @@ def ingest( scicat_client, encoded_thumbnail, dataset_id, - ownable) + ownable + ) return dataset_id @@ -182,7 +184,7 @@ def upload_data_block( source_root_path: str ) -> Datablock: "Creates a datablock of files" - # calcularte the path where the file will as known to SciCat + # calculate the path where the file will as known to SciCat storage_path = str(file_path).replace(source_root_path, storage_root_path) datafiles = create_data_files(file_path, storage_path) diff --git a/orchestration/flows/bl832/job_controller.py b/orchestration/flows/bl832/job_controller.py index b2ff064b..55522cd6 100644 --- a/orchestration/flows/bl832/job_controller.py +++ b/orchestration/flows/bl832/job_controller.py @@ -86,9 +86,10 @@ def get_controller( config=config ) elif hpc_type == HPC.NERSC: + from orchestration.sfapi import create_sfapi_client from orchestration.flows.bl832.nersc import NERSCTomographyHPCController return NERSCTomographyHPCController( - client=NERSCTomographyHPCController.create_sfapi_client(), + client=create_sfapi_client(), config=config ) elif hpc_type == HPC.OLCF: diff --git a/orchestration/flows/bl832/move.py b/orchestration/flows/bl832/move.py index b547a5c7..72a544ad 100644 --- a/orchestration/flows/bl832/move.py +++ 
b/orchestration/flows/bl832/move.py @@ -1,90 +1,21 @@ import datetime -import os +import logging from pathlib import Path import uuid -from globus_sdk import TransferClient -from prefect import flow, task, get_run_logger +from prefect import flow, task from prefect.blocks.system import JSON -from orchestration.flows.scicat.ingest import ingest_dataset +# from orchestration.flows.scicat.ingest import ingest_dataset +from orchestration.flows.bl832.scicat_ingestor import TomographyIngestorController from orchestration.flows.bl832.config import Config832 -from orchestration.globus.transfer import GlobusEndpoint, start_transfer -from orchestration.prefect import schedule_prefect_flow -from orchestration.prometheus_utils import PrometheusMetrics +from orchestration.globus.transfer import start_transfer +from orchestration.prune_controller import get_prune_controller, PruneMethod +from orchestration.transfer_controller import get_transfer_controller, CopyMethod -API_KEY = os.getenv("API_KEY") -TOMO_INGESTOR_MODULE = "orchestration.flows.bl832.ingest_tomo832" - - -@task(name="transfer_spot_to_data") -def transfer_spot_to_data( - file_path: str, - transfer_client: TransferClient, - spot832: GlobusEndpoint, - data832: GlobusEndpoint, -): - logger = get_run_logger() - - # if source_file begins with "/", it will mess up os.path.join - if file_path[0] == "/": - file_path = file_path[1:] - - source_path = os.path.join(spot832.root_path, file_path) - dest_path = os.path.join(data832.root_path, file_path) - success, _ = start_transfer( - transfer_client, - spot832, - source_path, - data832, - dest_path, - max_wait_seconds=600, - logger=logger, - ) - logger.info(f"spot832 to data832 globus task_id: {task}") - return success - - -@task(name="transfer_data_to_nersc") -def transfer_data_to_nersc( - file_path: str, - transfer_client: TransferClient, - data832: GlobusEndpoint, - nersc832: GlobusEndpoint, -): - logger = get_run_logger() - - # if source_file begins with "/", it will mess up os.path.join - if file_path[0] == "/": - file_path = file_path[1:] - - # Initialize config - config = Config832() - - # Import here to avoid circular imports - from orchestration.transfer_controller import get_transfer_controller, CopyMethod - - # Change prometheus_metrics=None if do not want to push metrics - # prometheus_metrics = None - prometheus_metrics = PrometheusMetrics() - # Get a Globus transfer controller - transfer_controller = get_transfer_controller( - transfer_type=CopyMethod.GLOBUS, - config=config, - prometheus_metrics=prometheus_metrics - ) - - # Use transfer controller to copy the file - # The controller automatically handles metrics collection and pushing - logger.info(f"Transferring {file_path} from data832 to nersc") - success = transfer_controller.copy( - file_path=file_path, - source=data832, - destination=nersc832 - ) - - return success +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) @flow(name="new_832_file_flow") @@ -102,13 +33,12 @@ def process_new_832_file_flow( ) -@task(name="new_832_file_task") +@task(name="process_new_832_file_task") def process_new_832_file_task( file_path: str, - is_export_control=False, send_to_nersc=True, - config=None -): + config: Config832 = None +) -> None: """ Sends a file along a path: - Copy from spot832 to data832 @@ -117,17 +47,11 @@ def process_new_832_file_task( - Schedule a job to delete from spot832 in the future - Schedule a job to delete from data832 in the future - The is_export_control and send_to_nersc flags are functionally identical, 
but - they are separate options at the beamlines, so we leave them as separate parameters - in case the desired behavior changes in the future. - :param file_path: path to file on spot832 - :param is_export_control: if True, do not send to NERSC ingest into SciCat :param send_to_nersc: if True, send to NERSC and ingest into SciCat """ - logger = get_run_logger() - logger.info("starting flow") + logger.info("Starting New 832 File Flow") if not config: config = Config832() @@ -136,100 +60,108 @@ def process_new_832_file_task( # to all 3 systems. logger.info(f"Transferring {file_path} from spot to data") relative_path = file_path.split("/global")[1] - transfer_spot_to_data(relative_path, config.tc, config.spot832, config.data832) - logger.info(f"Transferring {file_path} to spot to data") + transfer_controller = get_transfer_controller( + transfer_type=CopyMethod.GLOBUS, + config=config + ) - if not is_export_control and send_to_nersc: - transfer_data_to_nersc( - relative_path, config.tc, config.data832, config.nersc832 - ) - logger.info( - f"File successfully transferred from data832 to NERSC {file_path}. Task {task}" + data832_transfer_success = transfer_controller.copy( + file_path=relative_path, + source=config.spot832, + destination=config.data832, + ) + + if send_to_nersc and data832_transfer_success: + nersc_transfer_success = transfer_controller.copy( + file_path=relative_path, + source=config.data832, + destination=config.nersc832 ) - flow_name = f"ingest scicat: {Path(file_path).name}" - logger.info(f"Ingesting {file_path} with {TOMO_INGESTOR_MODULE}") - try: - ingest_dataset(file_path, TOMO_INGESTOR_MODULE) - except Exception as e: - logger.error(f"SciCat ingest failed with {e}") - - # schedule_prefect_flow( - # "ingest_scicat/ingest_scicat", - # flow_name, - # {"relative_path": relative_path}, - # datetime.timedelta(0.0), - # ) + + if nersc_transfer_success: + logger.info(f"File successfully transferred from data832 to NERSC {file_path}. 
Task {task}") + try: + ingestor = TomographyIngestorController(config) + # get_scicat_client assumes that the environment variables are set in the environment + ingestor.get_scicat_client() + ingestor.ingest_new_raw_dataset(file_path) + except Exception as e: + logger.error(f"SciCat ingest failed with {e}") bl832_settings = JSON.load("bl832-settings").value - flow_name = f"delete spot832: {Path(file_path).name}" schedule_spot832_delete_days = bl832_settings["delete_spot832_files_after_days"] schedule_data832_delete_days = bl832_settings["delete_data832_files_after_days"] - schedule_prefect_flow( - "prune_spot832/prune_spot832", - flow_name, - { - "relative_path": relative_path, - "source_endpoint": config.spot832, - "check_endpoint": config.data832, - }, - - datetime.timedelta(days=schedule_spot832_delete_days), + + prune_controller = get_prune_controller( + prune_type=PruneMethod.GLOBUS, + config=config + ) + + prune_controller.prune( + file_path=relative_path, + source_endpoint=config.spot832, + check_endpoint=config.data832, + days_from_now=schedule_spot832_delete_days ) logger.info( f"Scheduled delete from spot832 at {datetime.timedelta(days=schedule_spot832_delete_days)}" ) - flow_name = f"delete data832: {Path(file_path).name}" - schedule_prefect_flow( - "prune_data832/prune_data832", - flow_name, - { - "relative_path": relative_path, - "source_endpoint": config.data832, - "check_endpoint": config.nersc832, - }, - datetime.timedelta(days=schedule_data832_delete_days), + prune_controller.prune( + file_path=relative_path, + source_endpoint=config.data832, + check_endpoint=config.nersc832, + days_from_now=schedule_data832_delete_days ) logger.info( f"Scheduled delete from data832 at {datetime.timedelta(days=schedule_data832_delete_days)}" ) + return @flow(name="test_832_transfers") def test_transfers_832(file_path: str = "/raw/transfer_tests/test.txt"): - logger = get_run_logger() + """Test transfers between spot832, data832, and NERSC. + Note that the file must already exist on spot832. + Uses Globus transfer. + This flow is scheduled to run periodically to verify that transfers are working. + + :param file_path: path to file on spot832 + :return: None + + """ config = Config832() - # test_scicat(config) logger.info(f"{str(uuid.uuid4())}{file_path}") # copy file to a uniquely-named file in the same folder file = Path(file_path) new_file = str(file.with_name(f"test_{str(uuid.uuid4())}.txt")) logger.info(new_file) - success, _ = start_transfer( + + success = start_transfer( config.tc, config.spot832, file_path, config.spot832, new_file, logger=logger ) + logger.info(success) - spot832_path = transfer_spot_to_data( - new_file, config.tc, config.spot832, config.data832 - ) - logger.info(f"Transferred {spot832_path} to spot to data") - task = transfer_data_to_nersc(new_file, config.tc, config.data832, config.nersc832) - logger.info( - f"File successfully transferred from data832 to NERSC {spot832_path}. Task {task}" + transfer_controller = get_transfer_controller( + transfer_type=CopyMethod.GLOBUS, + config=config ) + dat832_success = transfer_controller.copy( + file_path=new_file, + source=config.spot832, + destination=config.data832, + ) + logger.info(f"Transferred {new_file} from spot to data. 
Success: {dat832_success}") -@flow(name="test_832_transfers_grafana") -def test_transfers_832_grafana(file_path: str = "/raw/transfer_tests/test/"): - logger = get_run_logger() - config = Config832() - - task = transfer_data_to_nersc(file_path, config.tc, config.data832, config.nersc_alsdev) - - logger.info( - f"File successfully transferred from data832 to NERSC {file_path}. Task {task}" - ) \ No newline at end of file + nersc_success = transfer_controller.copy( + file_path=new_file, + source=config.data832, + destination=config.nersc832, + ) + logger.info(f"File successfully transferred from data832 to NERSC {new_file}. Success: {nersc_success}") + pass diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 05680ed0..c4b9f720 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -1,13 +1,13 @@ import datetime from dotenv import load_dotenv -import json +# import json import logging -import os +# import os from pathlib import Path import re import time -from authlib.jose import JsonWebKey +# from authlib.jose import JsonWebKey from prefect import flow, get_run_logger from prefect.blocks.system import JSON from sfapi_client import Client @@ -42,38 +42,6 @@ def __init__( TomographyHPCController.__init__(self, config) self.client = client - @staticmethod - def create_sfapi_client() -> Client: - """Create and return an NERSC client instance""" - - # When generating the SFAPI Key in Iris, make sure to select "asldev" as the user! - # Otherwise, the key will not have the necessary permissions to access the data. - client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") - client_secret_path = os.getenv("PATH_NERSC_PRI_KEY") - - if not client_id_path or not client_secret_path: - logger.error("NERSC credentials paths are missing.") - raise ValueError("Missing NERSC credentials paths.") - if not os.path.isfile(client_id_path) or not os.path.isfile(client_secret_path): - logger.error("NERSC credential files are missing.") - raise FileNotFoundError("NERSC credential files are missing.") - - client_id = None - client_secret = None - with open(client_id_path, "r") as f: - client_id = f.read() - - with open(client_secret_path, "r") as f: - client_secret = JsonWebKey.import_key(json.loads(f.read())) - - try: - client = Client(client_id, client_secret) - logger.info("NERSC client created successfully.") - return client - except Exception as e: - logger.error(f"Failed to create NERSC client: {e}") - raise e - def reconstruct( self, file_path: str = "", diff --git a/orchestration/flows/bl832/prune.py b/orchestration/flows/bl832/prune.py index 1de05085..91ddb54f 100644 --- a/orchestration/flows/bl832/prune.py +++ b/orchestration/flows/bl832/prune.py @@ -36,7 +36,7 @@ def prune_files( prune_one_safe( file=relative_path, if_older_than_days=0, - tranfer_client=config.tc, + transfer_client=config.tc, source_endpoint=source_endpoint, check_endpoint=check_endpoint, logger=p_logger, diff --git a/orchestration/flows/bl832/scicat_ingestor.py b/orchestration/flows/bl832/scicat_ingestor.py new file mode 100644 index 00000000..b824b453 --- /dev/null +++ b/orchestration/flows/bl832/scicat_ingestor.py @@ -0,0 +1,798 @@ +import io +import json +from logging import getLogger +import os +from pathlib import Path +from typing import Any, Dict, List, Optional + +import h5py +from pyscicat.client import ScicatClient +from pyscicat.model import ( + Attachment, + CreateDatasetOrigDatablockDto, + Datablock, + DataFile, + DerivedDataset, + RawDataset, + 
DatasetType, + Ownable, +) + +from orchestration.flows.bl832.config import Config832 +from orchestration.flows.scicat.ingestor_controller import BeamlineIngestorController +from orchestration.flows.scicat.utils import ( + build_search_terms, + build_thumbnail, + clean_email, + encode_image_2_thumbnail, + get_file_size, + get_file_mod_time, + Issue, + NPArrayEncoder, + Severity +) + + +logger = getLogger(__name__) + + +class TomographyIngestorController(BeamlineIngestorController): + """ + Ingestor for 8.3.2 Microtomography beamline. Handles ingestion of raw and derived datasets. + Extends the BeamlineIngestorController with beamline-specific (8.3.2) metadata extraction + """ + DEFAULT_USER = "8.3.2" # In case there's not proposal number + INGEST_SPEC = "als832_dx_3" # Where is this spec defined? + + DATA_SAMPLE_KEYS = [ + "/measurement/instrument/sample_motor_stack/setup/axis1pos", + "/measurement/instrument/sample_motor_stack/setup/axis2pos", + "/measurement/instrument/sample_motor_stack/setup/sample_x", + "/measurement/instrument/sample_motor_stack/setup/axis5pos", + "/measurement/instrument/camera_motor_stack/setup/camera_elevation", + "/measurement/instrument/source/current", + "/measurement/instrument/camera_motor_stack/setup/camera_distance", + "/measurement/instrument/source/beam_intensity_incident", + "/measurement/instrument/monochromator/energy", + "/measurement/instrument/detector/exposure_time", + "/measurement/instrument/time_stamp", + "/measurement/instrument/monochromator/setup/turret2", + "/measurement/instrument/monochromator/setup/turret1", + ] + + SCICAT_METADATA_KEYS = [ + "/measurement/instrument/instrument_name", + "/measurement/sample/experiment/beamline", + "/measurement/sample/experiment/experiment_lead", + "/measurement/sample/experiment/pi", + "/measurement/sample/experiment/proposal", + "/measurement/sample/experimenter/email", + "/measurement/sample/experimenter/name", + "/measurement/sample/file_name", + ] + + SCIENTIFIC_METADATA_KEYS = [ + "/measurement/instrument/attenuator/setup/filter_y", + "/measurement/instrument/camera_motor_stack/setup/tilt_motor", + "/measurement/instrument/detection_system/objective/camera_objective", + "/measurement/instrument/detection_system/scintillator/scintillator_type", + "/measurement/instrument/detector/binning_x", + "/measurement/instrument/detector/binning_y", + "/measurement/instrument/detector/dark_field_value", + "/measurement/instrument/detector/delay_time", + "/measurement/instrument/detector/dimension_x", + "/measurement/instrument/detector/dimension_y", + "/measurement/instrument/detector/model", + "/measurement/instrument/detector/pixel_size", + "/measurement/instrument/detector/temperature", + "/measurement/instrument/monochromator/setup/Z2", + "/measurement/instrument/monochromator/setup/temperature_tc2", + "/measurement/instrument/monochromator/setup/temperature_tc3", + "/measurement/instrument/slits/setup/hslits_A_Door", + "/measurement/instrument/slits/setup/hslits_A_Wall", + "/measurement/instrument/slits/setup/hslits_center", + "/measurement/instrument/slits/setup/hslits_size", + "/measurement/instrument/slits/setup/vslits_Lead_Flag", + "/measurement/instrument/source/source_name", + "/process/acquisition/dark_fields/dark_num_avg_of", + "/process/acquisition/dark_fields/num_dark_fields", + "/process/acquisition/flat_fields/i0_move_x", + "/process/acquisition/flat_fields/i0_move_y", + "/process/acquisition/flat_fields/i0cycle", + "/process/acquisition/flat_fields/num_flat_fields", + 
"/process/acquisition/flat_fields/usebrightexpose", + "/process/acquisition/mosaic/tile_xmovedist", + "/process/acquisition/mosaic/tile_xnumimg", + "/process/acquisition/mosaic/tile_xorig", + "/process/acquisition/mosaic/tile_xoverlap", + "/process/acquisition/mosaic/tile_ymovedist", + "/process/acquisition/mosaic/tile_ynumimg", + "/process/acquisition/mosaic/tile_yorig", + "/process/acquisition/mosaic/tile_yoverlap", + "/process/acquisition/name", + "/process/acquisition/rotation/blur_limit", + "/process/acquisition/rotation/blur_limit", + "/process/acquisition/rotation/multiRev", + "/process/acquisition/rotation/nhalfCir", + "/process/acquisition/rotation/num_angles", + "/process/acquisition/rotation/range", + ] + + def __init__( + self, + config: Config832, + scicat_client: Optional[ScicatClient] = None + ) -> None: + """Initializes the TomographyIngestorController with beamline-specific settings. + :param config: Configuration object (Config832) for the 8.3.2 beamline. + :param scicat_client: An optional SciCat client instance. If not provided, it will be created.""" + super().__init__(config, scicat_client) + + def ingest_new_raw_dataset( + self, + file_path: str = "", + ) -> str: + """ + Ingest a new raw tomography dataset from the 8.3.2 beamline. + + This method integrates the full ingestion process: + - Reading and parsing the HDF5 file. + - Extracting SciCat and scientific metadata. + - Calculating access controls. + - Creating and uploading the RawDataset and datablock. + - Generating and uploading a thumbnail attachment. + + :param file_path: Path to the file to ingest. + :return: SciCat dataset ID. + :raises ValueError: If required environment variables are missing. + :raises Exception: If any issues are encountered during ingestion. + """ + issues: List[Issue] = [] + logger.setLevel("INFO") + + # Retrieve required environment variables for storage paths + INGEST_STORAGE_ROOT_PATH = os.getenv("INGEST_STORAGE_ROOT_PATH") + INGEST_SOURCE_ROOT_PATH = os.getenv("INGEST_SOURCE_ROOT_PATH") + if not INGEST_STORAGE_ROOT_PATH or not INGEST_SOURCE_ROOT_PATH: + raise ValueError( + "INGEST_STORAGE_ROOT_PATH and INGEST_SOURCE_ROOT_PATH must be set" + ) + + file_path_obj = Path(file_path) + with h5py.File(file_path, "r") as file: + # Extract metadata from the HDF5 file using beamline-specific keys + scicat_metadata = self._extract_fields(file, self.SCICAT_METADATA_KEYS, issues) + scientific_metadata = self._extract_fields(file, self.SCIENTIFIC_METADATA_KEYS, issues) + scientific_metadata["data_sample"] = self._get_data_sample(file) + + # Encode scientific metadata using NPArrayEncoder + encoded_scientific_metadata = json.loads( + json.dumps(scientific_metadata, cls=NPArrayEncoder) + ) + + # Calculate access controls + access_controls = self._calculate_access_controls( + self.DEFAULT_USER, + scicat_metadata.get("/measurement/sample/experiment/beamline"), + scicat_metadata.get("/measurement/sample/experiment/proposal"), + ) + logger.info( + f"Access controls for {file_path_obj} - access_groups: {access_controls.get('access_groups')} " + f"owner_group: {access_controls.get('owner_group')}" + ) + + ownable = Ownable( + ownerGroup=access_controls["owner_group"], + accessGroups=access_controls["access_groups"], + ) + + # Create and upload the raw dataset + dataset_id = self._upload_raw_dataset( + file_path_obj, + scicat_metadata, + encoded_scientific_metadata, + ownable, + ) + + # Upload the data block (associated files) + self._upload_data_block( + file_path_obj, + dataset_id, + 
INGEST_STORAGE_ROOT_PATH, + INGEST_SOURCE_ROOT_PATH, + ) + + # Generate and upload a thumbnail attachment + # The "/exchange/data" key is specific to the Microtomography (8.3.2) HDF5 file structure. + thumbnail_file = build_thumbnail(file["/exchange/data"][0]) + encoded_thumbnail = encode_image_2_thumbnail(thumbnail_file) + self._upload_attachment( + encoded_thumbnail, + dataset_id, + ownable, + ) + + if issues: + for issue in issues: + logger.error(issue) + raise Exception(f"SciCat ingest failed with {len(issues)} issues") + return dataset_id + + def ingest_new_derived_dataset( + self, + folder_path: str = "", + raw_dataset_id: str = "", + ) -> str: + """ + Ingest a new derived dataset from the Microtomography (8.3.2) beamline. + + This method handles ingestion of derived datasets generated during tomography reconstruction: + 1. A directory of TIFF slices + 2. A Zarr directory (derived from the TIFFs) + + :param folder_path: Path to the folder containing the derived data. + :param raw_dataset_id: ID of the raw dataset this derived data is based on. + :return: SciCat ID of the derived dataset. + :raises ValueError: If required environment variables are missing. + :raises Exception: If any issues are encountered during ingestion. + """ + issues: List[Issue] = [] + logger.setLevel("INFO") + + logger.info(f"Ingesting derived dataset from folder: {folder_path}") + # Retrieve required environment variables for storage paths + INGEST_STORAGE_ROOT_PATH = os.getenv("INGEST_STORAGE_ROOT_PATH") + INGEST_SOURCE_ROOT_PATH = os.getenv("INGEST_SOURCE_ROOT_PATH") + if not INGEST_STORAGE_ROOT_PATH or not INGEST_SOURCE_ROOT_PATH: + raise ValueError( + "INGEST_STORAGE_ROOT_PATH and INGEST_SOURCE_ROOT_PATH must be set" + ) + + logger.info("Getting raw dataset from SciCat to link to") + # Get the raw dataset to link to + try: + raw_dataset = self.scicat_client.datasets_get_one(raw_dataset_id) + logger.info(f"Found raw dataset to link: {raw_dataset_id}") + except Exception as e: + raise ValueError(f"Failed to find raw dataset with ID {raw_dataset_id}: {e}") + + folder_path_obj = Path(folder_path) + if not folder_path_obj.exists(): + raise ValueError(f"Folder path does not exist: {folder_path}") + + logger.info(raw_dataset) + # Calculate access controls - use the same as the raw dataset + access_controls = { + "owner_group": raw_dataset["ownerGroup"], + "access_groups": raw_dataset["accessGroups"] + } + logger.info(f"Using access controls from raw dataset: {access_controls}") + + ownable = Ownable( + ownerGroup=access_controls["owner_group"], + accessGroups=access_controls["access_groups"], + ) + + # Get main HDF5 file if exists, otherwise use first file + main_file = None + for file in folder_path_obj.glob("*.h5"): + main_file = file + break + + if not main_file: + # If no HDF5 file, use the first file in the directory + for file in folder_path_obj.iterdir(): + if file.is_file(): + main_file = file + break + if not main_file: + raise ValueError(f"No files found in directory: {folder_path}") + + # Extract scientific metadata + scientific_metadata = { + "derived_from": raw_dataset_id, + "processing_date": get_file_mod_time(main_file), + } + + # Try to extract metadata from HDF5 file if available + if main_file and main_file.suffix.lower() == ".h5": + try: + with h5py.File(main_file, "r") as file: + scientific_metadata.update( + self._extract_fields(file, self.SCIENTIFIC_METADATA_KEYS, issues) + ) + except Exception as e: + logger.warning(f"Could not extract metadata from HDF5 file: {e}") + + # Encode scientific 
metadata using NPArrayEncoder + encoded_scientific_metadata = json.loads( + json.dumps(scientific_metadata, cls=NPArrayEncoder) + ) + + # Create and upload the derived dataset + + # Use folder name as dataset name if nothing better + dataset_name = folder_path_obj.name + + # Determine if this is a TIFF directory or a Zarr directory + is_zarr = dataset_name.endswith('.zarr') + data_format = "Zarr" if is_zarr else "TIFF" + + # Build description/keywords from the folder name + description = build_search_terms(dataset_name) + keywords = description.split() + + # Add additional descriptive information + if is_zarr: + description = f"Multi-resolution Zarr dataset derived from reconstructed tomography slices: {description}" + keywords.extend(["zarr", "multi-resolution", "volume"]) + else: + description = f"Reconstructed tomography slices: {description}" + keywords.extend(["tiff", "slices", "reconstruction"]) + + # Create the derived dataset + dataset = DerivedDataset( + owner=raw_dataset.get("owner"), + contactEmail=raw_dataset.get("contactEmail"), + creationLocation=raw_dataset.get("creationLocation"), + datasetName=dataset_name, + type=DatasetType.derived, + proposalId=raw_dataset.get("proposalId"), + dataFormat=data_format, + principalInvestigator=raw_dataset.get("principalInvestigator"), + sourceFolder=str(folder_path_obj), + size=sum(f.stat().st_size for f in folder_path_obj.glob("**/*") if f.is_file()), + scientificMetadata=encoded_scientific_metadata, + sampleId=description, + isPublished=False, + description=description, + keywords=keywords, + creationTime=get_file_mod_time(folder_path_obj), + investigator=raw_dataset.get("owner"), + inputDatasets=[raw_dataset_id], + usedSoftware=["TomoPy", "Zarr"] if is_zarr else ["TomoPy"], + jobParameters={"source_folder": str(folder_path_obj)}, + **ownable.dict(), + ) + # Upload the derived dataset + dataset_id = self.scicat_client.upload_new_dataset(dataset) + logger.info(f"Created derived dataset with ID: {dataset_id}") + + # Upload datablock for all files in the directory + total_size = 0 + datafiles = [] + + for file_path in folder_path_obj.glob("**/*"): + if file_path.is_file(): + storage_path = str(file_path).replace(INGEST_SOURCE_ROOT_PATH, INGEST_STORAGE_ROOT_PATH) + datafile = DataFile( + path=storage_path, + size=get_file_size(file_path), + time=get_file_mod_time(file_path), + type="DerivedDatasets", + ) + datafiles.append(datafile) + total_size += datafile.size + + # Upload the datablock + datablock = CreateDatasetOrigDatablockDto( + size=total_size, + dataFileList=datafiles, + datasetId=dataset_id, + **ownable.dict(), + ) + self.scicat_client.upload_dataset_origdatablock(dataset_id, datablock) + logger.info(f"Uploaded datablock with {len(datafiles)} files") + + # Try to generate and upload a thumbnail if possible + try: + if is_zarr: + # For Zarr, generate the thumbnail in memory + thumb_buffer = self._generate_zarr_thumbnail(folder_path_obj) + if thumb_buffer: + encoded_thumbnail = encode_image_2_thumbnail(thumb_buffer) + self._upload_attachment(encoded_thumbnail, dataset_id, ownable) + logger.info("Uploaded thumbnail for Zarr dataset") + elif main_file and main_file.suffix.lower() == ".h5": + with h5py.File(main_file, "r") as file: + # Try to find a suitable dataset for thumbnail + for key in ["/exchange/data", "/data", "/reconstruction"]: + if key in file: + thumbnail_file = build_thumbnail(file[key][0]) + encoded_thumbnail = encode_image_2_thumbnail(thumbnail_file) + self._upload_attachment( + encoded_thumbnail, + dataset_id, + 
ownable, + ) + logger.info("Uploaded thumbnail for derived dataset") + break + else: + # For TIFF files, use a middle slice as thumbnail + tiff_files = sorted(list(folder_path_obj.glob("*.tiff"))) + sorted(list(folder_path_obj.glob("*.tif"))) + if tiff_files: + # Use a slice from the middle of the volume for the thumbnail + middle_slice = tiff_files[len(tiff_files) // 2] + from PIL import Image + import io + import numpy as np + image = Image.open(middle_slice) + # Convert image to a numpy array + arr = np.array(image, dtype=np.float32) + + # Compute min and max; if they are equal, use a default scaling to avoid division by zero. + arr_min = np.min(arr) + arr_max = np.max(arr) + if arr_max == arr_min: + # In case of no contrast, simply use a zeros array or leave the image unchanged. + arr_scaled = np.zeros(arr.shape, dtype=np.uint8) + else: + # Normalize the array to 0-255 + arr_scaled = ((arr - arr_min) / (arr_max - arr_min) * 255).astype(np.uint8) + + # Create a new image from the scaled array + scaled_image = Image.fromarray(arr_scaled) + thumbnail_buffer = io.BytesIO() + scaled_image.save(thumbnail_buffer, format="PNG") + thumbnail_buffer.seek(0) + encoded_thumbnail = encode_image_2_thumbnail(thumbnail_buffer) + self._upload_attachment(encoded_thumbnail, dataset_id, ownable) + + logger.info("Uploaded thumbnail from TIFF slice") + except Exception as e: + logger.warning(f"Failed to generate thumbnail: {e}") + + if issues: + for issue in issues: + logger.error(issue) + raise Exception(f"SciCat derived dataset ingest failed with {len(issues)} issues") + + return dataset_id + + def _generate_zarr_thumbnail(self, zarr_path: Path) -> io.BytesIO | None: + """ + Generate a thumbnail image from an NGFF Zarr dataset using ngff_zarr. + This implementation extracts a mid-slice and returns a BytesIO buffer. + + :param zarr_path: Path to the Zarr directory. + :return: A BytesIO object containing the PNG image data, or None on failure. + """ + try: + import ngff_zarr as nz + from PIL import Image + import numpy as np + + # Load the multiscale image from the Zarr store + multiscales = nz.from_ngff_zarr(str(zarr_path)) + # Here we assume a specific scale index (e.g. 3) and take the mid-slice along the first dimension + # Adjust this index as needed for your dataset. + image = multiscales.images[3].data + middle_index = image.shape[0] // 2 + mid_slice = image[middle_index, :, :] + # Ensure we have a NumPy array + mid_slice = mid_slice.compute() if hasattr(mid_slice, "compute") else np.array(mid_slice) + + # Normalize the image to 8-bit + mid_slice = mid_slice.astype(np.float32) + dmin, dmax = np.min(mid_slice), np.max(mid_slice) + if dmax != dmin: + norm_array = ((mid_slice - dmin) / (dmax - dmin) * 255).astype(np.uint8) + else: + norm_array = np.zeros_like(mid_slice, dtype=np.uint8) + + # Create a PIL image from the normalized array + img = Image.fromarray(norm_array) + if img.mode == "F": + img = img.convert("L") + + # Save the image to an in-memory bytes buffer + buffer = io.BytesIO() + img.save(buffer, format="PNG") + buffer.seek(0) + return buffer + except ImportError: + logger.warning("ngff_zarr package is not installed. Install it with `pip install ngff_zarr`.") + return None + except Exception as e: + logger.warning(f"Failed to generate Zarr thumbnail using ngff_zarr: {e}") + return None + + def _calculate_access_controls( + self, + username: str, + beamline: str, + proposal: str + ) -> Dict: + """ + Calculate access controls for a dataset. + + :param username: Username of the dataset owner. 
+ :param beamline: Beamline name. + :param proposal: Proposal number. + :return: Dictionary with 'owner_group' and 'access_groups'. + """ + + # make an access group list that includes the name of the proposal and the name of the beamline + access_groups = [] + # sometimes the beamline name is super dirty " '8.3.2', "" '8.3.2', " + beamline = beamline.replace(" '", "").replace("', ", "") if beamline else None + # set owner_group to username so that at least someone has access in case no proposal number is found + owner_group = username + if beamline: + access_groups.append(beamline) + # username lets the user see the Dataset in order to ingest objects after the Dataset + access_groups.append(username) + # temporary mapping while beamline controls process request to match beamline name with what comes + # from ALSHub + if beamline == "bl832" and "8.3.2" not in access_groups: + access_groups.append("8.3.2") + + if proposal and proposal != "None": + owner_group = proposal + + # this is a bit of a kludge. Add 8.3.2 into the access groups so that staff will be able to see it + return {"owner_group": owner_group, "access_groups": access_groups} + + def _create_data_files( + self, + file_path: Path, + storage_path: str + ) -> List[DataFile]: + """ + Builds a list of DataFile objects for SciCat from a single file. + + :param file_path: Path to the file. + :param storage_path: Path where the file is stored. + :return: List of DataFile objects. + """ + datafiles = [] + datafile = DataFile( + path=storage_path, + size=get_file_size(file_path), + time=get_file_mod_time(file_path), + type="RawDatasets", + ) + datafiles.append(datafile) + return datafiles + + def _extract_fields( + self, + file: h5py.File, + keys: List[str], + issues: List[Issue] + ) -> Dict[str, Any]: + metadata = {} + for md_key in keys: + dataset = file.get(md_key) + if not dataset: + issues.append( + Issue(msg=f"dataset not found {md_key}", severity=Severity.WARNING) + ) + continue + metadata[md_key] = self._get_dataset_value(file[md_key]) + return metadata + + def _get_dataset_value( + self, + data_set: h5py.Dataset + ) -> Any: + """ + Extracts the value of a dataset from an HDF5 file. + + :param data_set: HDF5 dataset object. + :return: The value of the dataset, or None if extraction fails. + """ + logger.debug(f"{data_set} {data_set.dtype}") + try: + if "S" in data_set.dtype.str: + if data_set.shape == (1,): + return data_set.asstr()[0] + elif data_set.shape == (): + return data_set[()].decode("utf-8") + else: + return list(data_set.asstr()) + else: + if data_set.maxshape == (1,): + logger.debug(f"{data_set} {data_set[()][0]}") + return data_set[()][0] + else: + logger.debug(f"{data_set} {data_set[()]}") + return data_set[()] + except Exception: + logger.exception("Exception extracting dataset value") + return None + + def _get_data_sample( + self, + file: h5py.File, + sample_size: int = 10 + ) -> Dict[str, Any]: + """ + Extracts a sample of the data from the HDF5 file. + + :param file: HDF5 file object. + :param sample_size: Number of samples to extract. + :return: Dictionary of sampled data arrays. 
+ """ + data_sample = {} + for key in self.DATA_SAMPLE_KEYS: + data_array = file.get(key) + if not data_array: + continue + step_size = int(len(data_array) / sample_size) + if step_size == 0: + step_size = 1 + sample = data_array[0::step_size] + data_sample[key] = sample + + return data_sample + + def _upload_data_block( + self, + file_path: Path, + dataset_id: str, + storage_root_path: str, + source_root_path: str + ) -> Datablock: + """ + Creates a datablock of files associated with a dataset and uploads it to SciCat. + + :param file_path: Path to the file to ingest. + :param dataset_id: SciCat ID of the dataset. + :param storage_root_path: Root path where files are stored. + :param source_root_path: Root path of the source files. + :return: Uploaded Datablock object. + """ + # calculate the path where the file will as known to SciCat + storage_path = str(file_path).replace(source_root_path, storage_root_path) + datafiles = self._create_data_files(file_path, storage_path) + + datablock = CreateDatasetOrigDatablockDto( + size=get_file_size(file_path), + dataFileList=datafiles + ) + return self.scicat_client.upload_dataset_origdatablock(dataset_id, datablock) + + def _upload_attachment( + self, + encoded_thumbnail: str, + dataset_id: str, + ownable: Ownable, + ) -> None: + """ + Creates a thumbnail png attachment and uploads it to SciCat. + + :param encoded_thumbnail: Base64 encoded thumbnail image. + :param dataset_id: SciCat ID of the dataset. + :param ownable: Ownable object. + :return: None + """ + attachment = Attachment( + datasetId=dataset_id, + thumbnail=encoded_thumbnail, + caption="raw image", + **ownable.dict(), + ) + self.scicat_client.upload_attachment(attachment) + + def _upload_raw_dataset( + self, + file_path: Path, + scicat_metadata: Dict, + scientific_metadata: Dict, + ownable: Ownable, + ) -> str: + """ + Create and upload a new raw dataset to SciCat. + + :param file_path: Path to the file to ingest. + :param scicat_metadata: SciCat metadata. + :param scientific_metadata: Scientific metadata. + :param ownable: Ownable object. 
+ :return: SciCat ID of the dataset + """ + file_size = get_file_size(file_path) + file_mod_time = get_file_mod_time(file_path) + file_name = scicat_metadata.get("/measurement/sample/file_name") + description = build_search_terms(file_name) + appended_keywords = description.split() + + dataset = RawDataset( + owner=scicat_metadata.get("/measurement/sample/experiment/pi") or "Unknown", + contactEmail=clean_email(scicat_metadata.get("/measurement/sample/experimenter/email")) + or "Unknown", + creationLocation=scicat_metadata.get("/measurement/instrument/instrument_name") + or "Unknown", + datasetName=file_name, + type=DatasetType.raw, + instrumentId=scicat_metadata.get("/measurement/instrument/instrument_name") + or "Unknown", + proposalId=scicat_metadata.get("/measurement/sample/experiment/proposal"), + dataFormat="DX", + principalInvestigator=scicat_metadata.get("/measurement/sample/experiment/pi") + or "Unknown", + sourceFolder=str(file_path.parent), + size=file_size, + scientificMetadata=scientific_metadata, + sampleId=description, + isPublished=False, + description=description, + keywords=appended_keywords, + creationTime=file_mod_time, + **ownable.dict(), + ) + logger.debug(f"dataset: {dataset}") + dataset_id = self.scicat_client.upload_new_dataset(dataset) + return dataset_id + + +if __name__ == "__main__": + + config = Config832() + file_path = "/Users/david/Documents/data/tomo/raw/20241216_153047_ddd.h5" + proposal_id = "test832" + ingestor = TomographyIngestorController(config) + + # get_scicat_client assumes that the environment variables are set in the environment + # in this test, just using the scicatlive (3.2.5) backend defaults (admin user) + + logger.info("Setting up metadata SciCat ingestion") + + ingestor.get_scicat_client( + scicat_base_url="http://localhost:3000/api/v3/", + scicat_user="admin", + scicat_password="2jf70TPNZsS" + ) + + # INGEST_STORAGE_ROOT_PATH and INGEST_SOURCE_ROOT_PATH must be set + os.environ["INGEST_STORAGE_ROOT_PATH"] = "/global/cfs/cdirs/als/data_mover/8.3.2" + os.environ["INGEST_SOURCE_ROOT_PATH"] = "/data832-raw" + + logger.info(f"Ingesting {file_path}") + id = ingestor.ingest_new_raw_dataset(file_path) + logger.info(f"Ingested with SciCat ID: {id}") + + # logger.info(f"Testing SciCat ID lookup after ingestion based on {file_path}") + # try: + # # Test lookup based on filename after ingestion + # id = ingestor._find_dataset(file_name=file_path) + # logger.info(f"Found dataset id {id}") + # except Exception as e: + # logger.error(f"Failed to find dataset {e}") + + # Pretend we moved to tape: + # /home/a/alsdev/data_mover/[beamline]/raw/[proposal_name]/[proposal_name]_[year]-[cycle].tar + ingestor.add_new_dataset_location( + dataset_id=id, + proposal_id=proposal_id, + file_name="20241216_153047_ddd.h5", + source_folder=f"/home/a/alsdev/data_mover/{config.beamline_id}/raw/{proposal_id}/{proposal_id}_2024-12.tar", + source_folder_host="HPSS", + ) + + # ingestor needs to add the derived dataset ingestion method + + # ingestor needs to add new "origdatablock" method for raw data on different filesystems + # ingestor needs to add new "datablock" method for raw data on HPSS system + # same for derived data + + ingestor.ingest_new_derived_dataset( + folder_path="/Users/david/Documents/data/tomo/scratch/" + "rec20230606_152011_jong-seto_fungal-mycelia_flat-AQ_fungi2_fast.zarr", + raw_dataset_id=id + ) + ingestor.ingest_new_derived_dataset( + folder_path="/Users/david/Documents/data/tomo/scratch/rec20230224_132553_sea_shell", + raw_dataset_id=id + ) + + 
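+    # --- Illustrative sketch (not part of the demo above) --------------------
+    # ingest_new_derived_dataset() returns the SciCat ID of the derived dataset;
+    # capturing it (instead of discarding it, as the two calls above do) would
+    # allow the same HPSS location bookkeeping applied to the raw dataset, e.g.:
+    #
+    # derived_id = ingestor.ingest_new_derived_dataset(
+    #     folder_path="/Users/david/Documents/data/tomo/scratch/rec20230224_132553_sea_shell",
+    #     raw_dataset_id=id,
+    # )
+    # ingestor.add_new_dataset_location(
+    #     dataset_id=derived_id,
+    #     datafile_path=f"/home/a/alsdev/data_mover/{config.beamline_id}/scratch/{proposal_id}.tar",
+    #     source_folder_host="HPSS",
+    # )
+    #
+    # The datafile_path above is hypothetical; the call follows the
+    # BeamlineIngestorController.add_new_dataset_location() signature
+    # (dataset_id, datafile_path, source_folder_host).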
admin_ingestor = TomographyIngestorController(config) + admin_ingestor.get_scicat_client( + scicat_base_url="http://localhost:3000/api/v3/", + scicat_user="archiveManager", + scicat_password="aman" + ) + admin_ingestor.remove_dataset_location( + dataset_id=id, + source_folder_host="HPSS", + ) diff --git a/orchestration/flows/scicat/ingestor_controller.py b/orchestration/flows/scicat/ingestor_controller.py new file mode 100644 index 00000000..5864f409 --- /dev/null +++ b/orchestration/flows/scicat/ingestor_controller.py @@ -0,0 +1,316 @@ +from abc import ABC, abstractmethod +import logging +import os +import requests +from typing import Optional +from urllib.parse import urljoin + +from pyscicat.client import ScicatClient +from pyscicat.model import ( + CreateDatasetOrigDatablockDto, + DataFile, +) +from orchestration.config import BeamlineConfig + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# NOTE: This BeamlineIngestorController will be relocated to https://github.com/als-computing/scicat_beamline soon. + + +class BeamlineIngestorController(ABC): + """ + Abstract class for beamline SciCat ingestors. + Provides interface methods for ingesting data. + """ + + def __init__( + self, + config: BeamlineConfig, + scicat_client: Optional[ScicatClient] = None + ) -> None: + self.config = config + self.scicat_client = scicat_client + + def get_scicat_client( + self, + scicat_base_url: Optional[str] = None, + scicat_user: Optional[str] = None, + scicat_password: Optional[str] = None + ) -> ScicatClient: + """ + Log in to SciCat using the provided credentials. + + :param scicat_base_url: Base URL of the SciCat instance. Defaults to the environment variable 'SCICAT_API_URL'. + :param scicat_user: Username for the SciCat instance. Defaults to the environment variable 'SCICAT_INGEST_USER'. + :param scicat_password: Password for the SciCat instance. Defaults to the environment variable 'SCICAT_INGEST_PASSWORD' + :return: An instance of ScicatClient with an authenticated session. + :raises ValueError: If any required credentials are missing. + """ + # Use environment variables as defaults if parameters are not provided. + scicat_base_url = scicat_base_url or os.getenv("SCICAT_API_URL") + scicat_user = scicat_user or os.getenv("SCICAT_INGEST_USER") + scicat_password = scicat_password or os.getenv("SCICAT_INGEST_PASSWORD") + + logger.info(f"Logging in to SciCat at {scicat_base_url} as {scicat_user}.") + + # Ensure that all required credentials are provided. + if not (scicat_base_url and scicat_user and scicat_password): + raise ValueError( + "Missing required SciCat credentials. Provide scicat_base_url, scicat_user, " + "and scicat_password as parameters or set them in the environment variables: " + "SCICAT_API_URL, SCICAT_INGEST_USER, SCICAT_INGEST_PASSWORD." + ) + + # Try to log in using the pyscicat client first. + # This method seems deprecated, but leaving it here for backwards compatability + # https://github.com/SciCatProject/pyscicat/issues/61 + try: + self.scicat_client = ScicatClient( + base_url=scicat_base_url, + username=scicat_user, + password=scicat_password, + auto_login=False + ) + # If using scicat on localhost (i.e. 
scicatlive), need to set Host header to backend.localhost + if "localhost" in scicat_base_url: + self.scicat_client._headers["Host"] = "backend.localhost" + self.scicat_client.login() + logger.info("Logged in to SciCat.") + return self.scicat_client + except Exception as e: + logger.warning(f"Failed to log in to SciCat: {e}, trying alternative method.") + + # This method works for scicatlive 3.2.5 + try: + url = urljoin(scicat_base_url, "auth/login") + logger.info(url) + response = requests.post( + url=url, + json={"username": scicat_user, "password": scicat_password}, + stream=False, + verify=True, + ) + logger.info(f"Login response: {response}") + + self.scicat_client = ScicatClient(scicat_base_url, response.json()["access_token"]) + logger.info("Logged in to SciCat.") + # logger.info(f"SciCat token: {response.json()['access_token']}") + return self.scicat_client + + except Exception as e: + logger.error(f"Failed to log in to SciCat: {e}") + raise e + + @abstractmethod + def ingest_new_raw_dataset( + self, + file_path: str = "", + ) -> str: + """Ingest data from the beamline. + + :param file_path: Path to the file to ingest. + :return: SciCat ID of the dataset. + """ + pass + + @abstractmethod + def ingest_new_derived_dataset( + self, + file_path: str = "", + raw_dataset_id: Optional[str] = "", + ) -> str: + """Ingest data from the beamline. + + :param file_path: Path to the file to ingest. + :return: SciCat ID of the dataset. + """ + pass + + def add_new_dataset_location( + self, + dataset_id: str = None, + datafile_path: str = None, + source_folder_host: Optional[str] = None + ) -> str: + """ + Add a new location to an existing dataset in SciCat. + + :param dataset_id: SciCat ID of the dataset. + :param datafile_path: Absolute file path to the data file (excluding protocol/host). + Caller is responsible for full path composition, including filename. + :param source_folder_host: "DNS host name of file server hosting sourceFolder, + optionally including a protocol e.g. [protocol://]fileserver1.example.com", + :return: The dataset ID after successful datablock addition. + :raises ValueError: If the dataset ID is not found or if the dataset does not have a valid 'pid'. 
+ """ + # Get the dataset to retrieve its metadata + dataset = self.scicat_client.datasets_get_one(dataset_id) + if not dataset: + raise ValueError(f"Dataset with ID {dataset_id} not found") + + logger.info(f"Creating new datablock for dataset {dataset_id} at location {datafile_path}") + + try: + # Create a datafile for the new location + file_path = datafile_path + if source_folder_host: + file_path = f"{source_folder_host}:{datafile_path}" + + # Get size from existing dataset if available + size = dataset.get("size", 0) + + # Create a single datafile + datafile = DataFile( + path=file_path, + size=size, + time=dataset.get("creationTime") + ) + + # Create a minimal datablock for the new location + datablock = CreateDatasetOrigDatablockDto( + size=size, + dataFileList=[datafile] + ) + + # Upload the datablock + self.scicat_client.upload_dataset_origdatablock(dataset_id, datablock) + logger.info(f"Created new datablock for dataset {dataset_id} at location {datafile_path}") + + # Note: We're skipping the dataset update since it's causing validation issues + + except Exception as e: + logger.error(f"Failed to create new datablock for dataset {dataset_id}: {e}") + # Continue without raising to maintain the workflow + + return dataset_id + + def remove_dataset_location( + self, + dataset_id: str = "", + source_folder_host: str = "", + ) -> bool: + """ + Remove a location from an existing dataset in SciCat. + We might want to do this after data was moved to a new location, + and has been pruned from the previous location. + + :param dataset_id: SciCat ID of the dataset. + :param source_folder_host: The source folder host to identify the location to remove. + :return: True if the location was successfully removed, False otherwise. + """ + logger.info(f"Removing location with host {source_folder_host} from dataset {dataset_id}") + + try: + # Get the datablocks directly + datablocks = self.scicat_client.datasets_origdatablocks_get_one(dataset_id) + if not datablocks: + logger.warning(f"No datablocks found for dataset {dataset_id}") + return False + + # Find datablock matching the specified source_folder_host + matching_datablock = None + for datablock in datablocks: + for datafile in datablock.get("dataFileList", []): + file_path = datafile.get("path", "") + if source_folder_host in file_path or ( + "sourceFolderHost" in datablock and + datablock["sourceFolderHost"] == source_folder_host + ): + matching_datablock = datablock + break + if matching_datablock: + break + + if not matching_datablock: + logger.warning( + f"No datablock found for dataset {dataset_id} with source folder host {source_folder_host}" + ) + return False + + # Delete the datablock using its ID + datablock_id = matching_datablock.get("id") + if not datablock_id: + logger.error(f"Datablock found but has no ID for dataset {dataset_id}") + return False + + # Delete the datablock using the appropriate endpoint + response = self.scicat_client.datasets_delete(datablock_id) + if response: + logger.info(f"Successfully removed datablock {datablock_id} from dataset {dataset_id}") + return True + else: + logger.error(f"Failed to delete datablock {datablock_id} from dataset {dataset_id}") + return False + + except requests.exceptions.HTTPError as e: + if e.response.status_code == 403: + logger.error(f"Forbidden: You do not have permission to delete the datablock {datablock_id}") + else: + logger.error(f"HTTP error occurred: {e}") + except Exception as e: + logger.error(f"Failed to remove datablock from dataset {dataset_id}: {e}") + return 
False + + def _find_dataset( + self, + proposal_id: Optional[str] = None, # The ALS proposal ID, not the SciCat ID + file_name: Optional[str] = None + ) -> str: + """ + Find a dataset in SciCat and return its ID based on proposal ID and file name. + The dataset name in SciCat is expected to be saved as the base filename without the extension, + e.g. '20241216_153047_ddd' for a file named '20241216_153047_ddd.h5'. + + Parameters: + proposal_id (Optional[str]): The proposal identifier used in ingestion. + file_name (Optional[str]): The full path to the file; its base name (without extension) will be used. + + Returns: + str: The SciCat ID of the dataset. + + Raises: + ValueError: If no dataset or multiple datasets are found, or if the found dataset does not have a valid 'pid'. + """ + + # TODO: I'm not sure if SciCat's advanced query API supports this, but, if we're actually searching by file_name, + # wouldn't it make more sense to look in all the dataFileList entries for all datasets? + # This comes to mind because at 733, scientists organize their data mostly by creating dated folders, + # and there's no guarantee that the files in those folders have unique names relative to the other folders. + # If they were searching for a data file, they would need to use a path fragment, e.g. '20241216_153047/new_run.h5' + # If this function could search dataFileLists by path fragment, it would be some future-proofing for those users... + + if file_name: + # Extract the datasetName from the file_name by stripping the directory and extension. + extracted_name = os.path.splitext(os.path.basename(file_name))[0] + else: + extracted_name = None + + query_fields = { + "proposalId": proposal_id, + "datasetName": extracted_name + } + results = self.scicat_client.datasets_find(query_fields=query_fields) + + # Assuming the client returns a list of datasets. + count = len(results) + + if count == 0: + raise ValueError(f"No dataset found for proposal '{proposal_id}' with dataset name '{extracted_name}'.") + elif count > 1: + # Log all found dataset IDs for human review. + dataset_ids = [d.get("pid", "N/A") for d in results] + logger.error( + f"Multiple datasets found for proposal '{proposal_id}' with dataset name '{extracted_name}': {dataset_ids}." + ) + # raise ValueError( + # f"Multiple datasets found for proposal '{proposal_id}' with dataset name '{extracted_name}'." 
+ # ) + + dataset = results[0] + dataset_id = dataset.get("pid") + if not dataset_id: + raise ValueError("The dataset returned does not have a valid 'pid' field.") + + return dataset_id diff --git a/orchestration/flows/scicat/utils.py b/orchestration/flows/scicat/utils.py index 7fc97012..205675bd 100644 --- a/orchestration/flows/scicat/utils.py +++ b/orchestration/flows/scicat/utils.py @@ -1,14 +1,14 @@ import base64 from dataclasses import dataclass +from datetime import datetime +from pathlib import Path from enum import Enum import io import json import logging -from pathlib import Path import re from typing import Dict, Optional, Union -from uuid import uuid4 import numpy as np import numpy.typing as npt @@ -17,19 +17,26 @@ logger = logging.getLogger("splash_ingest") can_debug = logger.isEnabledFor(logging.DEBUG) -class Severity(str, Enum): - warning = "warning" - error = "error" + +class Severity( + str, + Enum +): + """Enum for issue severity.""" + WARNING = "warning" + ERROR = "error" @dataclass class Issue: + """Dataclass for issues.""" severity: Severity msg: str exception: Optional[Union[str, None]] = None class NPArrayEncoder(json.JSONEncoder): + """Custom JSON encoder for numpy types.""" def default(self, obj): if isinstance(obj, np.integer): return int(obj) @@ -40,7 +47,53 @@ def default(self, obj): return json.JSONEncoder.default(self, obj) -def calculate_access_controls(username, beamline, proposal) -> Dict: +def build_search_terms( + sample_name: str +) -> str: + """extract search terms from sample name to provide something pleasing to search on""" + terms = re.split("[^a-zA-Z0-9]", sample_name) + description = [term.lower() for term in terms if len(term) > 0] + return " ".join(description) + + +def build_thumbnail( + image_array: npt.ArrayLike +) -> io.BytesIO: + """Create a thumbnail from an image array.""" + try: + image_array = image_array - np.min(image_array) + 1.001 + image_array = np.log(image_array) + image_array = 205 * image_array / (np.max(image_array)) + auto_contrast_image = Image.fromarray(image_array.astype("uint8")) + auto_contrast_image = ImageOps.autocontrast(auto_contrast_image, cutoff=0.1) + # filename = str(uuid4()) + ".png" + file = io.BytesIO() + # file = thumbnail_dir / Path(filename) + auto_contrast_image.save(file, format="png") + file.seek(0) + return file + except Exception as e: + logger.error(f"build_thumbnail failed; returning blank image. 
Error: {e}", exc_info=True) + # determine original size (height, width) + try: + h, w = image_array.shape[:2] + except Exception: + h, w = 1, 1 + # create blank RGB image of same dimensions + blank = Image.new("RGB", (w, h), color=(0, 0, 0)) + buf = io.BytesIO() + blank.save(buf, format="PNG") + buf.seek(0) + return buf + + +def calculate_access_controls( + username, + beamline, + proposal +) -> Dict: + """Calculate access controls for a dataset.""" + # make an access group list that includes the name of the proposal and the name of the beamline access_groups = [] # sometimes the beamline name is super dirty " '8.3.2', "" '8.3.2', " @@ -63,14 +116,29 @@ def calculate_access_controls(username, beamline, proposal) -> Dict: return {"owner_group": owner_group, "access_groups": access_groups} -def build_search_terms(sample_name): - """extract search terms from sample name to provide something pleasing to search on""" - terms = re.split("[^a-zA-Z0-9]", sample_name) - description = [term.lower() for term in terms if len(term) > 0] - return " ".join(description) +def clean_email(email: str): + """Clean up email addresses.""" + + if email: + if not email or email.upper() == "NONE": + # this is a brutal case, but the beamline sometimes puts in "None" and + # the new scicat backend hates that. + unknown_email = "unknown@example.com" + return unknown_email + return email.replace(" ", "").replace(",", "").replace("'", "") + return None + +def encode_image_2_thumbnail( + filebuffer, + imType="jpg" +) -> str: + """Encode an image file to a base 64 string for use as a thumbnail. -def encode_image_2_thumbnail(filebuffer, imType="jpg"): + "encode_thumbnail()" is now part of SciCat. + Not sure if this would conflict with the current production version we have deployed. 
+ https://www.scicatproject.org/pyscicat/howto/ingest.html?highlight=encode + """ logging.info("Creating thumbnail for dataset") header = "data:image/{imType};base64,".format(imType=imType) dataBytes = base64.b64encode(filebuffer.read()) @@ -78,17 +146,11 @@ def encode_image_2_thumbnail(filebuffer, imType="jpg"): return header + dataStr -def build_thumbnail(image_array: npt.ArrayLike): - image_array = image_array - np.min(image_array) + 1.001 - image_array = np.log(image_array) - image_array = 205 * image_array / (np.max(image_array)) - auto_contrast_image = Image.fromarray(image_array.astype("uint8")) - auto_contrast_image = ImageOps.autocontrast(auto_contrast_image, cutoff=0.1) - # filename = str(uuid4()) + ".png" - file = io.BytesIO() - # file = thumbnail_dir / Path(filename) - auto_contrast_image.save(file, format="png") - file.seek(0) - return file +def get_file_size(file_path: Path) -> int: + """Return the size of the file in bytes.""" + return file_path.lstat().st_size +def get_file_mod_time(file_path: Path) -> str: + """Return the file modification time in ISO format.""" + return datetime.fromtimestamp(file_path.lstat().st_mtime).isoformat() diff --git a/orchestration/globus/flows.py b/orchestration/globus/flows.py index a27b3889..bbf35780 100644 --- a/orchestration/globus/flows.py +++ b/orchestration/globus/flows.py @@ -10,6 +10,10 @@ from globus_sdk.tokenstorage import SimpleJSONFileAdapter from pprint import pprint from prefect.blocks.system import Secret +import logging + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) MY_FILE_ADAPTER = SimpleJSONFileAdapter(os.path.expanduser("~/.sdk-manage-flow.json")) @@ -19,8 +23,14 @@ dotenv_file = load_dotenv() -GLOBUS_CLIENT_ID = Secret.load("globus-client-id") -GLOBUS_CLIENT_SECRET = Secret.load("globus-client-secret") +if not os.getenv("PREFECT_API_URL") and not os.getenv("PREFECT_API_KEY"): + logger.warning("Prefect environment variables are not set.") + +try: + GLOBUS_CLIENT_ID = Secret.load("globus-client-id") + GLOBUS_CLIENT_SECRET = Secret.load("globus-client-secret") +except Exception as e: + logger.warning(f"Error loading Globus client credentials: {e}") def get_flows_client(): diff --git a/orchestration/globus/transfer.py b/orchestration/globus/transfer.py index 764ba6ad..e5ddfadd 100644 --- a/orchestration/globus/transfer.py +++ b/orchestration/globus/transfer.py @@ -1,3 +1,4 @@ +# orchestration/globus/transfer.py from dataclasses import dataclass from datetime import datetime, timezone, timedelta from dateutil import parser @@ -269,7 +270,7 @@ def task_wait( def prune_one_safe( file: str, if_older_than_days: int, - tranfer_client: TransferClient, + transfer_client: TransferClient, source_endpoint: GlobusEndpoint, check_endpoint: Union[GlobusEndpoint, None], max_wait_seconds: int = 120, @@ -281,7 +282,7 @@ def prune_one_safe( is also located at the check_endpoint. If not, raises """ # does the file exist at the source endpoint? 
- g_file_obj = get_globus_file_object(tranfer_client, source_endpoint, file) + g_file_obj = get_globus_file_object(transfer_client, source_endpoint, file) assert g_file_obj is not None, f"file not found {source_endpoint.uri}" logger.info(f"file: {file} found on {source_endpoint.uri}") @@ -289,7 +290,7 @@ def prune_one_safe( if check_endpoint is None: logger.info("No check endpoint provided, skipping check") else: - g_file_obj = get_globus_file_object(tranfer_client, check_endpoint, file) + g_file_obj = get_globus_file_object(transfer_client, check_endpoint, file) assert g_file_obj is not None, f"file not found {check_endpoint.uri}" logger.info(f"file: {file} found on {check_endpoint.uri}") @@ -306,14 +307,14 @@ def prune_one_safe( logger.info("Not checking dates, sent if_older_than_days==0") delete_id = prune_files( - tranfer_client, + transfer_client, source_endpoint, [file], max_wait_seconds=max_wait_seconds, logger=logger, ) - task_wait(tranfer_client, delete_id) + task_wait(transfer_client, delete_id) logger.info(f"file deleted from: {source_endpoint.uri}") diff --git a/orchestration/hpss.py b/orchestration/hpss.py new file mode 100644 index 00000000..ab236f16 --- /dev/null +++ b/orchestration/hpss.py @@ -0,0 +1,755 @@ +""" +HPSS Module - Handling transfers to and from NERSC's High Performance Storage System (HPSS). + +This module provides functionality for transferring data between NERSC's Community File System (CFS) +and the High Performance Storage System (HPSS) tape archive. It includes: + +1. Prefect flows for initiating transfers in both directions +2. Transfer controllers for CFS to HPSS and HPSS to CFS operations +3. HPSS-specific pruning controller for managing data lifecycle +4. Slurm job scripts for executing HPSS operations via SFAPI + +The module follows tape-safe practices as recommended in NERSC documentation: +https://docs.nersc.gov/filesystems/HPSS-best-practices/ +""" + +import datetime +import logging +import os +from pathlib import Path +import re +import time +from typing import Dict, List, Optional, Union + +from prefect import flow +from sfapi_client import Client +from sfapi_client.compute import Machine + +from orchestration.config import BeamlineConfig +from orchestration.prefect import schedule_prefect_flow +from orchestration.prune_controller import PruneController +from orchestration.transfer_controller import get_transfer_controller, CopyMethod, TransferController +from orchestration.transfer_endpoints import FileSystemEndpoint, HPSSEndpoint + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +""" +HPSS SLURM Template Loader + +Provides utilities for loading SLURM job script templates for HPSS operations. +""" + +# Directory containing SLURM template files + + +def load_slurm_job(job_name: str, **variables) -> str: + """ + Load and render a SLURM template with variable substitution. 
+ + Args: + job_name: Name of the job (without .slurm extension) + **variables: Variables to substitute using .format() + + Returns: + str: The rendered SLURM script + """ + # Read slurm files from orchestration/slurm/ + TEMPLATES_DIR = Path(__file__).parent / "slurm" + slurm_path = TEMPLATES_DIR / f"{job_name}.slurm" + job = slurm_path.read_text() + return job.format(**variables) + + +# --------------------------------- +# HPSS Prefect Flows +# --------------------------------- + +@flow(name="cfs_to_hpss_flow") +def cfs_to_hpss_flow( + file_path: Union[str, List[str]] = None, + source: FileSystemEndpoint = None, + destination: HPSSEndpoint = None, + config: BeamlineConfig = None +) -> bool: + """ + Prefect flow for transferring data from CFS to HPSS tape archive. + + This flow handles the transfer of files or directories from NERSC's Community + File System (CFS) to the High Performance Storage System (HPSS) tape archive. + For directories, files are bundled into tar archives based on time periods. + + Args: + file_path (Union[str, List[str]]): A single file path or a list of file paths to transfer + source (FileSystemEndpoint): The CFS source endpoint + destination (HPSSEndpoint): The HPSS destination endpoint + config (BeamlineConfig): The beamline configuration containing endpoints and credentials + + Returns: + bool: True if all transfers succeeded, False otherwise + """ + logger.info("Running cfs_to_hpss_flow") + + if not file_path: + logger.error("No file path provided for CFS to HPSS transfer") + return False + + if not source or not destination: + logger.error("Source or destination endpoint not provided for CFS to HPSS transfer") + return False + + if not config: + logger.error("No configuration provided for CFS to HPSS transfer") + return False + + # Log detailed information about the transfer + if isinstance(file_path, list): + logger.info(f"Transferring {len(file_path)} files/directories from {source.name} to {destination.name}") + for path in file_path: + logger.debug(f" - {path}") + else: + logger.info(f"Transferring {file_path} from {source.name} to {destination.name}") + + # Configure the transfer controller for CFS to HPSS + logger.info("Configuring transfer controller for CFS_TO_HPSS.") + try: + transfer_controller = get_transfer_controller( + transfer_type=CopyMethod.CFS_TO_HPSS, + config=config + ) + except Exception as e: + logger.error(f"Failed to initialize CFS to HPSS transfer controller: {str(e)}", exc_info=True) + return False + + logger.info("CFSToHPSSTransferController selected. Initiating transfer for all file paths.") + + try: + result = transfer_controller.copy( + file_path=file_path, + source=source, + destination=destination + ) + if result: + logger.info("CFS to HPSS transfer completed successfully") + else: + logger.error("CFS to HPSS transfer failed") + return result + except Exception as e: + logger.error(f"Error during CFS to HPSS transfer: {str(e)}", exc_info=True) + return False + + +@flow(name="hpss_to_cfs_flow") +def hpss_to_cfs_flow( + file_path: str = None, + source: HPSSEndpoint = None, + destination: FileSystemEndpoint = None, + files_to_extract: Optional[List[str]] = None, + config: BeamlineConfig = None +) -> bool: + """ + Prefect flow for retrieving data from HPSS tape archive to CFS. + + This flow handles the retrieval of files or tar archives from NERSC's High + Performance Storage System (HPSS) to the Community File System (CFS). + For tar archives, you can optionally specify specific files to extract. 
+ + Args: + file_path (str): The path of the file or tar archive on HPSS + source (HPSSEndpoint): The HPSS source endpoint + destination (FileSystemEndpoint): The CFS destination endpoint + files_to_extract (Optional[List[str]]): Specific files to extract from the tar archive + config (BeamlineConfig): The beamline configuration containing endpoints and credentials + + Returns: + bool: True if the transfer succeeded, False otherwise + """ + logger.info("Running hpss_to_cfs_flow") + + if not file_path: + logger.error("No file path provided for HPSS to CFS transfer") + return False + + if not source or not destination: + logger.error("Source or destination endpoint not provided for HPSS to CFS transfer") + return False + + if not config: + logger.error("No configuration provided for HPSS to CFS transfer") + return False + + logger.info(f"Transferring {file_path} from {source.name} to {destination.name}") + + # Log detailed information about the transfer + if files_to_extract: + logger.info(f"Extracting {len(files_to_extract)} specific files from tar archive:") + for file in files_to_extract: + logger.debug(f" - {file}") + + # Configure transfer controller for HPSS_TO_CFS + logger.info("Configuring transfer controller for HPSS_TO_CFS.") + try: + transfer_controller = get_transfer_controller( + transfer_type=CopyMethod.HPSS_TO_CFS, + config=config + ) + except Exception as e: + logger.error(f"Failed to initialize HPSS to CFS transfer controller: {str(e)}", exc_info=True) + return False + + logger.info("HPSSToCFSTransferController selected. Initiating transfer for all file paths.") + + # Initiate transfer + logger.info("HPSSToCFSTransferController selected. Initiating transfer.") + try: + result = transfer_controller.copy( + file_path=file_path, + source=source, + destination=destination, + files_to_extract=files_to_extract, + ) + + if result: + logger.info("HPSS to CFS transfer completed successfully") + else: + logger.error("HPSS to CFS transfer failed") + + return result + except Exception as e: + logger.error(f"Error during HPSS to CFS transfer: {str(e)}", exc_info=True) + return False + + +# ---------------------------------- +# HPSS ls Function +# ---------------------------------- + +def list_hpss_slurm( + client: Client, + endpoint: HPSSEndpoint, + remote_path: str, + recursive: bool = True +) -> str: + """ + Schedule and run a Slurm job on Perlmutter to list contents on HPSS, + then read back the result from the Slurm output file. + + If `remote_path` ends with '.tar', uses `htar -tvf` to list tar members; + otherwise uses `hsi ls [-R]` to list directory contents. + + Args: + client (Client): SFAPI client with compute permissions. + endpoint (HPSSEndpoint): HPSS endpoint (knows root_path & URI). + remote_path (str): Path relative to endpoint.root_path on HPSS. + recursive (bool): Recursively list directories (ignored for .tar). + + Returns: + . + + Raises: + RuntimeError: If job submission or output retrieval fails. 
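+
+    Example (illustrative; assumes an authenticated SFAPI ``Client`` and an
+    ``HPSSEndpoint`` created elsewhere):
+
+        out_file = list_hpss_slurm(
+            client=client,
+            endpoint=hpss_endpoint,
+            remote_path="bl832/raw/my_proposal/my_proposal_2024-1.tar",
+            recursive=False,
+        )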
+ """ + logger = logging.getLogger(__name__) + # Build logs directory on CFS + beamline_id = remote_path.split("/")[0] + logs_dir = f"/global/cfs/cdirs/als/data_mover/hpss_transfer_logs/{beamline_id}/ls" + + # Sanitize remote_path for filenames + safe_name = re.sub(r'[^A-Za-z0-9_]', '_', remote_path) + job_name = f"ls_hpss_{safe_name}" + out_pattern = f"{logs_dir}/{safe_name}_%j.out" + err_pattern = f"{logs_dir}/{safe_name}_%j.err" + + full_hpss = endpoint.full_path(remote_path) + + # for tar: list contents & then show .idx; otherwise do an hsi ls + if remote_path.lower().endswith(".tar"): + cmd = ( + f'echo "[LOG] TAR contents:" && htar -tvf "{full_hpss}"' + ) + else: + ls_flag = "-R" if recursive else "" + cmd = f'hsi ls {ls_flag} "{full_hpss}"' + + job_script = load_slurm_job( + "ls_hpss", + job_name=job_name, + out_pattern=out_pattern, + err_pattern=err_pattern, + full_hpss=full_hpss, + cmd=cmd + ) + + # submit & wait + perlmutter = client.compute(Machine.perlmutter) + job = perlmutter.submit_job(job_script) + try: + job.update() + except Exception: + logger.debug("Initial job.update() failed, proceeding to wait") + job.complete() + + # print where you can find the actual log + out_file = out_pattern.replace("%j", str(job.jobid)) + print(f"HPSS listing complete. See Slurm output at: {out_file}") + + return out_file + +# ---------------------------------- +# HPSS Prune Controller +# ---------------------------------- + + +class HPSSPruneController(PruneController[HPSSEndpoint]): + """ + Controller for pruning files from HPSS tape archive. + + This controller uses SFAPI, Slurm, and hsi to prune data from HPSS at NERSC. + It requires the source to be an HPSSEndpoint and the optional destination to + be a FileSystemEndpoint. It uses "hsi rm" to prune files from HPSS. + + Args: + client (Client): The SFAPI client for submitting jobs to NERSC + config (BeamlineConfig): Configuration object containing endpoints and credentials + """ + def __init__( + self, + client: Client, + config: BeamlineConfig, + ) -> None: + """ + Initialize the HPSS prune controller. + + Args: + client (Client): The SFAPI client for submitting jobs to NERSC + config (BeamlineConfig): Configuration object containing endpoints and credentials + """ + super().__init__(config) + self.client = client + logger.debug(f"Initialized HPSSPruneController with client for beamline {config.beamline_id}") + + def prune( + self, + file_path: str = None, + source_endpoint: HPSSEndpoint = None, + check_endpoint: Optional[FileSystemEndpoint] = None, + days_from_now: datetime.timedelta = 0 + ) -> bool: + """ + Prune (delete) data from HPSS tape archive. + + If days_from_now is 0, executes pruning immediately. + Otherwise, schedules pruning for future execution using Prefect. 
+
+        Args:
+            file_path (str): The path to the file or directory to prune on HPSS
+            source_endpoint (HPSSEndpoint): The HPSS endpoint containing the data
+            check_endpoint (Optional[FileSystemEndpoint]): If provided, verify data exists here before pruning
+            days_from_now (datetime.timedelta): Delay before pruning; if 0, prune immediately
+
+        Returns:
+            bool: True if pruning was successful or scheduled successfully, False otherwise
+        """
+        if not file_path:
+            logger.error("No file_path provided for HPSS pruning operation")
+            return False
+
+        if not source_endpoint:
+            logger.error("No source_endpoint provided for HPSS pruning operation")
+            return False
+
+        flow_name = f"prune_from_{source_endpoint.name}"
+        logger.info(f"Setting up pruning of '{file_path}' from HPSS endpoint '{source_endpoint.name}'")
+
+        # If days_from_now is 0 (or unset), prune immediately
+        if not days_from_now or days_from_now.total_seconds() == 0:
+            return self._prune_hpss_endpoint(
+                relative_path=file_path,
+                source_endpoint=source_endpoint,
+                check_endpoint=check_endpoint,
+            )
+        # Otherwise, schedule pruning for future execution
+        else:
+            logger.info(f"Scheduling pruning of '{file_path}' from '{source_endpoint.name}' "
+                        f"in {days_from_now.total_seconds()/86400:.1f} days")
+
+            try:
+                schedule_prefect_flow(
+                    deployment_name="prune_hpss_endpoint/prune_hpss_endpoint",
+                    flow_run_name=flow_name,
+                    parameters={
+                        "relative_path": file_path,
+                        "source_endpoint": source_endpoint,
+                        "check_endpoint": check_endpoint,
+                        "config": self.config
+                    },
+                    duration_from_now=days_from_now
+                )
+                logger.info(f"Successfully scheduled HPSS pruning task in {days_from_now.total_seconds()/86400:.1f} days")
+                return True
+            except Exception as e:
+                logger.error(f"Failed to schedule HPSS pruning task: {str(e)}", exc_info=True)
+                return False
+
+    @flow(name="prune_hpss_endpoint")
+    def _prune_hpss_endpoint(
+        self,
+        relative_path: str,
+        source_endpoint: HPSSEndpoint,
+        check_endpoint: Optional[FileSystemEndpoint] = None,
+    ) -> bool:
+        """
+        Prefect flow that performs the actual HPSS pruning operation.
+
+        Args:
+            relative_path (str): The HPSS path of the file or directory to prune
+            source_endpoint (HPSSEndpoint): The HPSS endpoint to prune from
+            check_endpoint (Optional[FileSystemEndpoint]): If provided, verify data exists here before pruning
+
+        Returns:
+            bool: True if the prune job completed successfully, False otherwise
+        """
+        logger.info("Pruning files from HPSS")
+        logger.info(f"Pruning {relative_path} from source endpoint: {source_endpoint.name}")
+
+        beamline_id = self.config.beamline_id
+        logs_path = f"/global/cfs/cdirs/als/data_mover/hpss_transfer_logs/{beamline_id}"
+        job_script = load_slurm_job(
+            "prune_hpss",
+            relative_path=relative_path,
+            logs_path=logs_path,
+            full_hpss_path=source_endpoint.full_path(relative_path)
+        )
+
+        try:
+            logger.info("Submitting HPSS prune job to Perlmutter.")
+            perlmutter = self.client.compute(Machine.perlmutter)
+            job = perlmutter.submit_job(job_script)
+            logger.info(f"Submitted job ID: {job.jobid}")
+
+            try:
+                job.update()
+            except Exception as update_err:
+                logger.warning(f"Initial job update failed, continuing: {update_err}")
+
+            time.sleep(60)
+            logger.info(f"Job {job.jobid} current state: {job.state}")
+
+            job.complete()  # Wait until the job completes.
+ logger.info("Transfer job completed successfully.") + return True + + except Exception as e: + logger.error(f"Error during job submission or completion: {e}") + match = re.search(r"Job not found:\s*(\d+)", str(e)) + if match: + jobid = match.group(1) + logger.info(f"Attempting to recover job {jobid}.") + try: + job = self.client.perlmutter.job(jobid=jobid) + time.sleep(30) + job.complete() + logger.info("Transfer job completed successfully after recovery.") + return True + except Exception as recovery_err: + logger.error(f"Failed to recover job {jobid}: {recovery_err}") + return False + else: + return False + + +# ---------------------------------- +# HPSS Transfer Controllers +# ---------------------------------- + +class CFSToHPSSTransferController(TransferController[HPSSEndpoint]): + """ + Use SFAPI, Slurm, hsi, and htar to move data from CFS to HPSS at NERSC. + + This controller requires the source to be a FileSystemEndpoint on CFS and the + destination to be an HPSSEndpoint. For a single file, the transfer is done using hsi (via hsi cput). + For a directory, the transfer is performed with htar. In this updated version, if the source is a + directory then the files are bundled into tar archives based on their modification dates as follows: + - Files with modification dates between Jan 1 and Jul 15 (inclusive) are grouped together + (Cycle 1 for that year). + - Files with modification dates between Jul 16 and Dec 31 are grouped together (Cycle 2). + + Within each group, if the total size exceeds 2 TB the files are partitioned into multiple tar bundles. + The resulting naming convention on HPSS is: + + /home/a/alsdev/data_mover/[beamline]/raw/[proposal_name]/ + [proposal_name]_[year]-[cycle].tar + [proposal_name]_[year]-[cycle]_part0.tar + [proposal_name]_[year]-[cycle]_part1.tar + ... + + At the end of the SLURM script, the directory tree for both the source (CFS) and destination (HPSS) + is echoed for logging purposes. + """ + + def __init__( + self, + client: Client, + config: BeamlineConfig + ) -> None: + super().__init__(config) + self.client = client + + def list_hpss( + self, + endpoint: HPSSEndpoint, + remote_path: str, + recursive: bool = True + ) -> List[str]: + """ + Schedule and run a Slurm job to list contents on HPSS. + + Args: + endpoint (HPSSEndpoint): HPSS endpoint (knows root_path & URI). + remote_path (str): Path under endpoint.root_path to list. + recursive (bool): If True, pass -R to `hsi ls` (ignored for tar). + + Returns: + List[str]: Lines of output from the listing command. + """ + return list_hpss_slurm( + client=self.client, + endpoint=endpoint, + remote_path=remote_path, + recursive=recursive + ) + + def copy( + self, + file_path: str = None, + source: FileSystemEndpoint = None, + destination: HPSSEndpoint = None, + days_from_now: datetime.timedelta = 0 + ) -> bool: + """ + Copy a file or directory from a CFS source endpoint to an HPSS destination endpoint. + + Args: + file_path (str): Path to the file or directory on CFS. + source (FileSystemEndpoint): The CFS source endpoint. + destination (HPSSEndpoint): The HPSS destination endpoint. + + Returns: + bool: True if the transfer job completes successfully, False otherwise. + """ + logger.info("Transferring data from CFS to HPSS") + if not file_path or not source or not destination: + logger.error("Missing required parameters for CFSToHPSSTransferController.") + return False + + # Compute the full path on CFS for the file/directory. 
+ full_cfs_path = source.full_path(file_path) + # Get the beamline_id from the configuration. + beamline_id = self.config.beamline_id + # Build the HPSS destination root path using the convention: [destination.root_path]/[beamline_id]/raw + hpss_root_path = f"{destination.root_path.rstrip('/')}/{beamline_id}/raw" + + # Determine the proposal (project) folder name from the file_path. + path = Path(file_path) + proposal_name = path.parent.name + if not proposal_name or proposal_name == ".": # if file_path is in the root directory + proposal_name = file_path + + logger.info(f"Proposal name derived from file path: {proposal_name}") + + logs_path = f"/global/cfs/cdirs/als/data_mover/hpss_transfer_logs/{beamline_id}" + logger.info(f"Logs will be saved to: {logs_path}") + # Build the SLURM job script with detailed inline comments for clarity. + + job_script = load_slurm_job( + "cfs_to_hpss", + full_cfs_path=full_cfs_path, + hpss_root_path=hpss_root_path, + proposal_name=proposal_name, + logs_path=logs_path + ) + + try: + logger.info("Submitting HPSS transfer job to Perlmutter.") + perlmutter = self.client.compute(Machine.perlmutter) + job = perlmutter.submit_job(job_script) + logger.info(f"Submitted job ID: {job.jobid}") + + try: + job.update() + except Exception as update_err: + logger.warning(f"Initial job update failed, continuing: {update_err}") + + time.sleep(60) + logger.info(f"Job {job.jobid} current state: {job.state}") + + job.complete() # Wait until the job completes. + logger.info("Transfer job completed successfully.") + return True + + except Exception as e: + logger.error(f"Error during job submission or completion: {e}") + match = re.search(r"Job not found:\s*(\d+)", str(e)) + if match: + jobid = match.group(1) + logger.info(f"Attempting to recover job {jobid}.") + try: + job = self.client.perlmutter.job(jobid=jobid) + time.sleep(30) + job.complete() + logger.info("Transfer job completed successfully after recovery.") + return True + except Exception as recovery_err: + logger.error(f"Failed to recover job {jobid}: {recovery_err}") + return False + else: + return False + + +class HPSSToCFSTransferController(TransferController[HPSSEndpoint]): + """ + Use SFAPI, Slurm, hsi and htar to move data between HPSS and CFS at NERSC. + + This controller retrieves data from an HPSS source endpoint and places it on a CFS destination endpoint. + It supports the following modes: + - "single": Single file retrieval via hsi get. + - "tar": Full tar archive extraction via htar -xvf. + - "partial": Partial extraction from a tar archive: if a list of files is provided (via files_to_extract), + only the specified files will be extracted. + + A single SLURM job script is generated that branches based on the mode. + """ + + def __init__( + self, + client: Client, + config: BeamlineConfig + ) -> None: + super().__init__(config) + self.client = client + + def list_hpss( + self, + endpoint: HPSSEndpoint, + remote_path: str, + recursive: bool = True + ) -> List[str]: + """ + Schedule and run a Slurm job to list contents on HPSS. + + Args: + endpoint (HPSSEndpoint): HPSS endpoint (knows root_path & URI). + remote_path (str): Path under endpoint.root_path to list. + recursive (bool): If True, pass -R to `hsi ls` (ignored for tar). + + Returns: + List[str]: Lines of output from the listing command. 
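+
+        Note: the listing itself is written to a Slurm output file under the
+        HPSS transfer logs directory on CFS; see ``list_hpss_slurm`` above for
+        where that file is created.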
+ """ + return list_hpss_slurm( + client=self.client, + endpoint=endpoint, + remote_path=remote_path, + recursive=recursive + ) + + def copy( + self, + file_path: str = None, + source: HPSSEndpoint = None, + destination: FileSystemEndpoint = None, + files_to_extract: Optional[List[str]] = None, + ) -> bool: + """ + Copy a file from an HPSS source endpoint to a CFS destination endpoint. + + Args: + file_path (str): Path to the file or tar archive on HPSS. + source (HPSSEndpoint): The HPSS source endpoint. + destination (FileSystemEndpoint): The CFS destination endpoint. + files_to_extract (List[str], optional): Specific files to extract from the tar archive. + If provided (and file_path ends with '.tar'), only these files will be extracted. + If not provided, the entire tar archive will be extracted. + If file_path is a single file, this parameter is ignored. + + Returns: + bool: True if the transfer job completes successfully, False otherwise. + """ + logger.info("Starting HPSS to CFS transfer.") + if not file_path or not source or not destination: + logger.error("Missing required parameters: file_path, source, or destination.") + return False + + # Sanitize the file_path for the log file names. + # Build a small ord→char map + translate_dict: Dict[int, str] = { + ord("/"): "_", # always replace forward slash + ord(" "): "_", # replace spaces + ord(os.sep): "_", # replace primary separator + ord("\\"): "_" # In case of Windows + } + + # One-pass replacement + sanitized_path = file_path.translate(translate_dict) + + # Compute the full HPSS path from the source endpoint. + hpss_path = source.full_path(file_path) + dest_root = destination.root_path + + # Get the beamline_id from the configuration. + beamline_id = self.config.beamline_id + + logs_path = f"/global/cfs/cdirs/als/data_mover/hpss_transfer_logs/{beamline_id}" + + # If files_to_extract is provided, join them as a space‐separated string. + files_to_extract_str = " ".join(files_to_extract) if files_to_extract else "" + + # The following SLURM script contains all logic to decide the transfer mode. + # It determines: + # - if HPSS_PATH ends with .tar, then if FILES_TO_EXTRACT is nonempty, MODE becomes "partial", + # else MODE is "tar". + # - Otherwise, MODE is "single" and hsi get is used. + + job_script = load_slurm_job( + "hpss_to_cfs", + logs_path=logs_path, + file_path=file_path, + sanitized_path=sanitized_path, + hpss_path=hpss_path, + dest_root=dest_root, + files_to_extract_str=files_to_extract_str + ) + + logger.info("Submitting HPSS to CFS transfer job to Perlmutter.") + try: + perlmutter = self.client.compute(Machine.perlmutter) + job = perlmutter.submit_job(job_script) + logger.info(f"Submitted job ID: {job.jobid}") + + try: + job.update() + except Exception as update_err: + logger.warning(f"Initial job update failed, continuing: {update_err}") + + time.sleep(60) + logger.info(f"Job {job.jobid} current state: {job.state}") + + job.complete() # Wait until the job completes. 
+ logger.info("HPSS to CFS transfer job completed successfully.") + return True + + except Exception as e: + logger.error(f"Error during job submission or completion: {e}") + match = re.search(r"Job not found:\s*(\d+)", str(e)) + if match: + jobid = match.group(1) + logger.info(f"Attempting to recover job {jobid}.") + try: + job = self.client.perlmutter.job(jobid=jobid) + time.sleep(30) + job.complete() + logger.info("HPSS to CFS transfer job completed successfully after recovery.") + return True + except Exception as recovery_err: + logger.error(f"Failed to recover job {jobid}: {recovery_err}") + return False + else: + return False diff --git a/orchestration/prometheus_utils.py b/orchestration/prometheus_utils.py index 9b3ca566..d988cea3 100644 --- a/orchestration/prometheus_utils.py +++ b/orchestration/prometheus_utils.py @@ -2,41 +2,52 @@ import uuid from prometheus_client import Gauge, CollectorRegistry, push_to_gateway + class PrometheusMetrics(): def __init__(self): try: # Create a new registry self.registry = CollectorRegistry() - + # Define the required metrics # 1. Count of requests - Gauge - self.request_counter = Gauge('nersc_transfer_request_count', - 'Number of times the flow has been executed', - ['execution_id'], - registry=self.registry) - - self.bytes_counter = Gauge('nersc_transfer_total_bytes', - 'Number of bytes for all the executed flows', - ['execution_id'], - registry=self.registry) - + self.request_counter = Gauge( + 'nersc_transfer_request_count', + 'Number of times the flow has been executed', + ['execution_id'], + registry=self.registry + ) + + self.bytes_counter = Gauge( + 'nersc_transfer_total_bytes', + 'Number of bytes for all the executed flows', + ['execution_id'], + registry=self.registry + ) + # 2. Total bytes transferred - Gauge - self.transfer_bytes = Gauge('nersc_transfer_file_bytes', - 'Total size of all file transfers to NERSC', - ['machine'], - registry=self.registry) - + self.transfer_bytes = Gauge( + 'nersc_transfer_file_bytes', + 'Total size of all file transfers to NERSC', + ['machine'], + registry=self.registry + ) + # 3. Transfer speed - Gauge - self.transfer_speed = Gauge('nersc_transfer_speed_bytes_per_second', - 'Transfer speed for NERSC file transfers in bytes per second', - ['machine'], - registry=self.registry) - + self.transfer_speed = Gauge( + 'nersc_transfer_speed_bytes_per_second', + 'Transfer speed for NERSC file transfers in bytes per second', + ['machine'], + registry=self.registry + ) + # 4. 
Transfer time - Gauge - self.transfer_time = Gauge('nersc_transfer_time_seconds', - 'Time taken for NERSC file transfers in seconds', - ['machine'], - registry=self.registry) + self.transfer_time = Gauge( + 'nersc_transfer_time_seconds', + 'Time taken for NERSC file transfers in seconds', + ['machine'], + registry=self.registry + ) except Exception as e: print(f"Error initializing Prometheus metrics: {e}") @@ -45,22 +56,22 @@ def push_metrics_to_prometheus(self, metrics, logger): PUSHGATEWAY_URL = os.getenv('PUSHGATEWAY_URL', 'http://localhost:9091') JOB_NAME = os.getenv('JOB_NAME', 'nersc_transfer') INSTANCE_LABEL = os.getenv('INSTANCE_LABEL', 'data_transfer') - + try: # Generate a unique execution ID for this transfer execution_id = f"exec_{str(uuid.uuid4())}" - + # Set the metrics self.request_counter.labels(execution_id=execution_id).set(1) self.bytes_counter.labels(execution_id=execution_id).set(metrics['bytes_transferred']) self.transfer_bytes.labels(machine=metrics['machine']).set(metrics['bytes_transferred']) self.transfer_time.labels(machine=metrics['machine']).set(metrics['duration_seconds']) self.transfer_speed.labels(machine=metrics['machine']).set(metrics['transfer_speed']) - + # Log metrics for debugging logger.info(f"Pushing metrics: transfer_bytes = {metrics['bytes_transferred']} bytes") logger.info(f"Pushing metrics: transfer_speed = {metrics['transfer_speed']} bytes/second") - + # Push to Pushgateway with error handling try: push_to_gateway( @@ -72,6 +83,6 @@ def push_metrics_to_prometheus(self, metrics, logger): logger.info(f"Successfully pushed metrics to Pushgateway at {PUSHGATEWAY_URL}") except Exception as push_error: logger.error(f"Error pushing to Pushgateway at {PUSHGATEWAY_URL}: {push_error}") - + except Exception as e: - logger.error(f"Error preparing metrics for Prometheus: {e}") \ No newline at end of file + logger.error(f"Error preparing metrics for Prometheus: {e}") diff --git a/orchestration/prune_controller.py b/orchestration/prune_controller.py new file mode 100644 index 00000000..8710b244 --- /dev/null +++ b/orchestration/prune_controller.py @@ -0,0 +1,415 @@ +from abc import ABC, abstractmethod +import datetime +from enum import Enum +import logging +import os +from typing import Generic, Optional, TypeVar + +from prefect import flow +from prefect.blocks.system import JSON + +from orchestration.config import BeamlineConfig +from orchestration.globus.transfer import GlobusEndpoint, prune_one_safe +from orchestration.prefect import schedule_prefect_flow +from orchestration.transfer_endpoints import FileSystemEndpoint, TransferEndpoint + + +logger = logging.getLogger(__name__) + +Endpoint = TypeVar("Endpoint", bound=TransferEndpoint) + + +class PruneController(Generic[Endpoint], ABC): + """ + Abstract base class for pruning controllers. + + This class defines the common interface that all prune controllers must implement, + regardless of the specific pruning mechanism they use. + + Args: + config (BeamlineConfig): Configuration object containing endpoints and credentials + """ + def __init__( + self, + config: BeamlineConfig, + ) -> None: + """ + Initialize the prune controller with configuration. 
+ + Args: + config (BeamlineConfig): Configuration object containing endpoints and credentials + """ + self.config = config + logger.debug(f"Initialized {self.__class__.__name__} with config for beamline {config.beamline_id}") + + @abstractmethod + def prune( + self, + file_path: str = None, + source_endpoint: Endpoint = None, + check_endpoint: Optional[Endpoint] = None, + days_from_now: float = 0.0 + ) -> bool: + """ + Prune (delete) data from the source endpoint. + + This method either executes the pruning immediately or schedules it for future execution, + depending on the days_from_now parameter. + + Args: + file_path (str): The path to the file or directory to prune + source_endpoint (Endpoint): The endpoint containing the data to be pruned + check_endpoint (Optional[Endpoint]): If provided, verify data exists here before pruning + days_from_now (float): Delay in days before pruning; if 0.0, prune immediately. + + Returns: + bool: True if pruning was successful or scheduled successfully, False otherwise + """ + pass + + +class FileSystemPruneController(PruneController[FileSystemEndpoint]): + """ + Controller for pruning files from local file systems. + + This controller handles pruning operations on local or mounted file systems + using standard file system operations. + + Args: + config (BeamlineConfig): Configuration object containing file system paths + """ + def __init__( + self, + config: BeamlineConfig + ) -> None: + """ + Initialize the file system prune controller. + + Args: + config (BeamlineConfig): Configuration object containing file system paths + """ + super().__init__(config) + logger.debug(f"Initialized FileSystemPruneController for beamline {config.beamline_id}") + + def prune( + self, + file_path: str = None, + source_endpoint: FileSystemEndpoint = None, + check_endpoint: Optional[FileSystemEndpoint] = None, + days_from_now: float = 0.0, + ) -> bool: + """ + Prune (delete) data from a file system endpoint. + + If days_from_now is 0, executes pruning immediately. + Otherwise, schedules pruning for future execution using Prefect. + + Args: + file_path (str): The path to the file or directory to prune + source_endpoint (FileSystemEndpoint): The file system endpoint containing the data + check_endpoint (Optional[FileSystemEndpoint]): If provided, verify data exists here before pruning + days_from_now (float): Delay in days before pruning; if 0.0, prune immediately. If <0, throws error. 
+ + Returns: + bool: True if pruning was successful or scheduled successfully, False otherwise + """ + if not file_path: + logger.error("No file_path provided for pruning operation") + return False + + if not source_endpoint: + logger.error("No source_endpoint provided for pruning operation") + return False + + if days_from_now < 0: + logger.error("days_from_now cannot be negative") + return False + + flow_name = f"prune_from_{source_endpoint.name}" + logger.info(f"Setting up pruning of '{file_path}' from '{source_endpoint.name}'") + + # convert float days → timedelta + days_from_now: datetime.timedelta = datetime.timedelta(days=days_from_now) + + # If days_from_now is 0, prune immediately + if days_from_now.total_seconds() == 0: + logger.info(f"Executing immediate pruning of '{file_path}' from '{source_endpoint.name}'") + try: + prune_filesystem_endpoint( + relative_path=file_path, + source_endpoint=source_endpoint, + check_endpoint=check_endpoint, + config=self.config + ) + return True + except Exception as e: + logger.error(f"Failed to prune file: {str(e)}", exc_info=True) + return False + else: + # Otherwise, schedule pruning for future execution + logger.info(f"Scheduling pruning of '{file_path}' from '{source_endpoint.name}' " + f"in {days_from_now.total_seconds()/86400:.1f} days") + + try: + schedule_prefect_flow( + deployment_name="prune_filesystem_endpoint/prune_filesystem_endpoint", + flow_run_name=flow_name, + parameters={ + "relative_path": file_path, + "source_endpoint": source_endpoint, + "check_endpoint": check_endpoint, + "config": self.config + }, + duration_from_now=days_from_now, + ) + logger.info(f"Successfully scheduled pruning task for {days_from_now.total_seconds()/86400:.1f} days from now") + return True + except Exception as e: + logger.error(f"Failed to schedule pruning task: {str(e)}", exc_info=True) + return False + + +@flow(name="prune_filesystem_endpoint") +def prune_filesystem_endpoint( + relative_path: str, + source_endpoint: FileSystemEndpoint, + check_endpoint: Optional[FileSystemEndpoint] = None, + config: BeamlineConfig = None +) -> None: + """ + Prefect flow that performs the actual filesystem pruning operation. 
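+
+    Example (illustrative; ``scratch_endpoint`` and ``cfs_endpoint`` stand in
+    for ``FileSystemEndpoint`` objects defined in the beamline configuration):
+
+        prune_filesystem_endpoint(
+            relative_path="my_proposal/scan_0001.h5",
+            source_endpoint=scratch_endpoint,
+            check_endpoint=cfs_endpoint,
+            config=config,
+        )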
+
+    Args:
+        relative_path (str): The path of the file or directory to prune
+        source_endpoint (FileSystemEndpoint): The source endpoint to prune from
+        check_endpoint (Optional[FileSystemEndpoint]): If provided, verify data exists here before pruning
+        config (Optional[BeamlineConfig]): Configuration object, if needed
+
+    Returns:
+        bool: True if pruning was successful, False otherwise
+    """
+    logger.info(f"Running flow: prune_from_{source_endpoint.name}")
+    logger.info(f"Pruning {relative_path} from source endpoint: {source_endpoint.name}")
+
+    # Check if the file exists at the source endpoint using os.path
+    source_full_path = source_endpoint.full_path(relative_path)
+    if not os.path.exists(source_full_path):
+        logger.warning(f"File {relative_path} does not exist at the source: {source_endpoint.name}.")
+        return False
+
+    # If check_endpoint is provided, verify file exists there before pruning
+    if check_endpoint is not None:
+        check_full_path = check_endpoint.full_path(relative_path)
+        if os.path.exists(check_full_path):
+            logger.info(f"File {relative_path} exists on the check endpoint: {check_endpoint.name}.")
+            logger.info("Safe to prune.")
+        else:
+            logger.warning(f"File {relative_path} does not exist at the check endpoint: {check_endpoint.name}.")
+            logger.warning("Not safe to prune.")
+            return False
+
+    # Now perform the pruning operation
+    if os.path.isdir(source_full_path):
+        logger.info(f"Pruning directory {relative_path}")
+        import shutil
+        shutil.rmtree(source_full_path)
+    else:
+        logger.info(f"Pruning file {relative_path}")
+        os.remove(source_full_path)
+
+    logger.info(f"Successfully pruned {relative_path} from {source_endpoint.name}")
+    return True
+
+
+class GlobusPruneController(PruneController[GlobusEndpoint]):
+    """
+    Controller for pruning files from Globus endpoints.
+
+    This controller handles pruning operations on Globus endpoints using
+    the Globus Transfer API.
+
+    Args:
+        config (BeamlineConfig): Configuration object containing Globus endpoints and credentials
+    """
+    def __init__(
+        self,
+        config: BeamlineConfig
+    ) -> None:
+        """
+        Initialize the Globus prune controller.
+
+        Args:
+            config (BeamlineConfig): Configuration object containing Globus endpoints and credentials
+        """
+        super().__init__(config)
+        logger.debug(f"Initialized GlobusPruneController for beamline {config.beamline_id}")
+
+    def prune(
+        self,
+        file_path: str = None,
+        source_endpoint: GlobusEndpoint = None,
+        check_endpoint: Optional[GlobusEndpoint] = None,
+        days_from_now: float = 0.0
+    ) -> bool:
+        """
+        Prune (delete) data from a Globus endpoint.
+
+        If days_from_now is 0, executes pruning immediately.
+        Otherwise, schedules pruning for future execution using Prefect.
+
+        Args:
+            file_path (str): The path to the file or directory to prune
+            source_endpoint (GlobusEndpoint): The Globus endpoint containing the data
+            check_endpoint (Optional[GlobusEndpoint]): If provided, verify data exists here before pruning
+            days_from_now (float): Delay in days before pruning; if 0.0, prune immediately. Negative values are rejected.
+ + Returns: + bool: True if pruning was successful or scheduled successfully, False otherwise + """ + if not file_path: + logger.error("No file_path provided for pruning operation") + return False + + if not source_endpoint: + logger.error("No source_endpoint provided for pruning operation") + return False + + if days_from_now < 0: + logger.error("days_from_now cannot be negative") + return False + + # globus_settings = JSON.load("globus-settings").value + # max_wait_seconds = globus_settings["max_wait_seconds"] + flow_name = f"prune_from_{source_endpoint.name}" + logger.info(f"Setting up pruning of '{file_path}' from '{source_endpoint.name}'") + + # convert float days → timedelta + days_from_now: datetime.timedelta = datetime.timedelta(days=days_from_now) + + # If days_from_now is 0, prune immediately + if days_from_now.total_seconds() == 0: + logger.info(f"Executing immediate pruning of '{file_path}' from '{source_endpoint.name}'") + try: + prune_globus_endpoint( + relative_path=file_path, + source_endpoint=source_endpoint, + check_endpoint=check_endpoint, + config=self.config + ) + return True + except Exception as e: + logger.error(f"Failed to prune file: {str(e)}", exc_info=True) + return False + else: + # Otherwise, schedule pruning for future execution + logger.info(f"Scheduling pruning of '{file_path}' from '{source_endpoint.name}' " + f"in {days_from_now.total_seconds()/86400:.1f} days") + + try: + schedule_prefect_flow( + deployment_name="prune_globus_endpoint/prune_globus_endpoint", + flow_run_name=flow_name, + parameters={ + "relative_path": file_path, + "source_endpoint": source_endpoint, + "check_endpoint": check_endpoint, + "config": self.config + }, + duration_from_now=days_from_now, + ) + logger.info(f"Successfully scheduled pruning task for {days_from_now.total_seconds()/86400:.1f} days from now") + return True + except Exception as e: + logger.error(f"Failed to schedule pruning task: {str(e)}", exc_info=True) + return False + + +@flow(name="prune_globus_endpoint") +def prune_globus_endpoint( + relative_path: str, + source_endpoint: GlobusEndpoint, + check_endpoint: Optional[GlobusEndpoint] = None, + config: BeamlineConfig = None +) -> None: + """ + Prefect flow that performs the actual Globus endpoint pruning operation. + + Args: + relative_path (str): The path of the file or directory to prune + source_endpoint (GlobusEndpoint): The Globus endpoint to prune from + check_endpoint (Optional[GlobusEndpoint]): If provided, verify data exists here before pruning + config (BeamlineConfig): Configuration object with transfer client + """ + logger.info(f"Running Globus pruning flow for '{relative_path}' from '{source_endpoint.name}'") + + globus_settings = JSON.load("globus-settings").value + max_wait_seconds = globus_settings["max_wait_seconds"] + flow_name = f"prune_from_{source_endpoint.name}" + logger.info(f"Running flow: {flow_name}") + logger.info(f"Pruning {relative_path} from source endpoint: {source_endpoint.name}") + prune_one_safe( + file=relative_path, + if_older_than_days=0, + transfer_client=config.tc, + source_endpoint=source_endpoint, + check_endpoint=check_endpoint, + logger=logger, + max_wait_seconds=max_wait_seconds + ) + + +class PruneMethod(Enum): + """ + Enum representing different prune methods. + + These values are used to select the appropriate prune controller + through the factory function get_prune_controller(). 
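+
+    Example (illustrative; ``config`` is an already-loaded ``BeamlineConfig``
+    and ``hpss_endpoint`` an ``HPSSEndpoint`` defined for the beamline):
+
+        controller = get_prune_controller(PruneMethod.HPSS, config)
+        controller.prune(
+            file_path="bl832/raw/my_proposal",
+            source_endpoint=hpss_endpoint,
+            days_from_now=datetime.timedelta(days=30),
+        )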
+ + Attributes: + GLOBUS: Use Globus Transfer API for pruning operations + SIMPLE: Use local file system operations for pruning + HPSS: Use HPSS tape archive specific commands for pruning + """ + GLOBUS = "globus" + SIMPLE = "simple" + HPSS = "hpss" + + +def get_prune_controller( + prune_type: PruneMethod, + config: BeamlineConfig +) -> PruneController: + """ + Factory function to get the appropriate prune controller based on the prune type. + + Args: + prune_type (PruneMethod): The type of pruning to perform + config (BeamlineConfig): The configuration object containing endpoint information + + Returns: + PruneController: The appropriate prune controller instance + + Raises: + ValueError: If an invalid prune type is provided + """ + logger.debug(f"Creating prune controller of type: {prune_type.name}") + + if prune_type == PruneMethod.GLOBUS: + logger.debug("Returning GlobusPruneController") + return GlobusPruneController(config) + elif prune_type == PruneMethod.SIMPLE: + logger.debug("Returning FileSystemPruneController") + return FileSystemPruneController(config) + elif prune_type == PruneMethod.HPSS: + logger.debug("Importing and returning HPSSPruneController") + # Import here to avoid circular dependencies + from orchestration.hpss import HPSSPruneController + from orchestration.sfapi import create_sfapi_client + return HPSSPruneController( + client=create_sfapi_client(), + config=config + ) + else: + error_msg = f"Invalid prune type: {prune_type}" + logger.error(error_msg) + raise ValueError(error_msg) diff --git a/orchestration/sfapi.py b/orchestration/sfapi.py new file mode 100644 index 00000000..f845443f --- /dev/null +++ b/orchestration/sfapi.py @@ -0,0 +1,41 @@ +from dotenv import load_dotenv +import json +import logging +import os +from pathlib import Path + +from authlib.jose import JsonWebKey +from sfapi_client import Client + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +load_dotenv() + + +# TODO: we need a better way to store the client_id and client_secret +def create_sfapi_client( + client_id_path: str = os.getenv("PATH_NERSC_CLIENT_ID"), + client_secret_path: str = os.getenv("PATH_NERSC_PRI_KEY"), +) -> Client: + """Create and return an NERSC client instance""" + + # When generating the SFAPI Key in Iris, make sure to select "asldev" as the user! + # Otherwise, the key will not have the necessary permissions to access the data. + + if not client_id_path or not client_secret_path: + logger.error("NERSC credentials paths are missing.") + raise ValueError("NERSC credentials paths are missing.") + if not Path(client_id_path).is_file() or not Path(client_secret_path).is_file(): + raise FileNotFoundError("NERSC credential files are missing.") + + client_id = Path(client_id_path).read_text(encoding="utf-8").strip() + secret_text = Path(client_secret_path).read_text(encoding="utf-8") + client_secret = JsonWebKey.import_key(json.loads(secret_text)) + + try: + client = Client(client_id, client_secret) + logger.info("NERSC client created successfully.") + return client + except Exception as e: + logger.error(f"Failed to create NERSC client: {e}") + raise e diff --git a/orchestration/slurm/cfs_to_hpss.slurm b/orchestration/slurm/cfs_to_hpss.slurm new file mode 100644 index 00000000..a672d16e --- /dev/null +++ b/orchestration/slurm/cfs_to_hpss.slurm @@ -0,0 +1,324 @@ +#!/bin/bash +# ------------------------------------------------------------------ +# SLURM Job Script for Transferring Data from CFS to HPSS +# This script will: +# 1. 
Define the source (CFS) and destination (HPSS) paths. +# 2. Create the destination directory on HPSS if it doesn't exist. +# 3. Determine if the source is a file or a directory. +# - If a file, transfer it using 'hsi cput'. +# - If a directory, group files by beam cycle and archive them. +# * Cycle 1: Jan 1 - Jul 15 +# * Cycle 2: Jul 16 - Dec 31 +# * If a group exceeds 2 TB, it is partitioned into multiple tar archives. +# * Archive names: +# [proposal_name]_[year]-[cycle].tar +# [proposal_name]_[year]-[cycle]_part0.tar, _part1.tar, etc. +# 4. Echo directory trees for both source and destination for logging. +# ------------------------------------------------------------------ + +#SBATCH -q xfer # Specify the SLURM queue to use. +#SBATCH -A als # Specify the account. +#SBATCH -C cron # Use the 'cron' constraint. +#SBATCH --time=12:00:00 # Maximum runtime of 12 hours. +#SBATCH --job-name=transfer_to_HPSS_{proposal_name} # Set a descriptive job name. +#SBATCH --output={logs_path}/{proposal_name}_to_hpss_%j.out # Standard output log file. +#SBATCH --error={logs_path}/{proposal_name}_to_hpss_%j.err # Standard error log file. +#SBATCH --licenses=SCRATCH # Request the SCRATCH license. +#SBATCH --mem=2GB # Request #GB of memory. Default 2GB. + +set -euo pipefail # Enable strict error checking. +echo "[LOG] Job started at: $(date)" + +# ------------------------------------------------------------------ +# Define source and destination variables. +# ------------------------------------------------------------------ + +echo "[LOG] Defining source and destination paths." + +# SOURCE_PATH: Full path of the file or directory on CFS. +SOURCE_PATH="{full_cfs_path}" +echo "[LOG] SOURCE_PATH set to: $SOURCE_PATH" + +# DEST_ROOT: Root destination on HPSS built from configuration. +DEST_ROOT="{hpss_root_path}" +echo "[LOG] DEST_ROOT set to: $DEST_ROOT" + +# FOLDER_NAME: Proposal name (project folder) derived from the file path. +FOLDER_NAME="{proposal_name}" +echo "[LOG] FOLDER_NAME set to: $FOLDER_NAME" + +# DEST_PATH: Final HPSS destination directory. +DEST_PATH="${{DEST_ROOT}}/${{FOLDER_NAME}}" +echo "[LOG] DEST_PATH set to: $DEST_PATH" + +# ------------------------------------------------------------------ +# Create destination directory on HPSS recursively using hsi mkdir. +# This section ensures that the entire directory tree specified in DEST_PATH +# exists on HPSS. Since HPSS hsi does not support a recursive mkdir option, +# we split the path into its components and create each directory one by one. +# ------------------------------------------------------------------ + +echo "[LOG] Checking if HPSS destination directory exists at $DEST_PATH." + +# Use 'hsi ls' to verify if the destination directory exists. +# The '-q' flag is used for quiet mode, and any output or errors are discarded. +if hsi -q "ls $DEST_PATH" >/dev/null 2>&1; then + echo "[LOG] Destination directory $DEST_PATH already exists." +else + # If the directory does not exist, begin the process to create it. + echo "[LOG] Destination directory $DEST_PATH does not exist. Attempting to create it recursively." + + # Initialize an empty variable 'current' that will store the path built so far. + current="" + + # Split the DEST_PATH using '/' as the delimiter. + # This creates an array 'parts' where each element is a directory level in the path. + IFS='/' read -ra parts <<< "$DEST_PATH" + + # Iterate over each directory component in the 'parts' array. + for part in "${{parts[@]}}"; do + # Skip any empty parts. 
An empty string may occur if the path starts with a '/'. + if [ -z "$part" ]; then + continue + fi + + # Append the current part to the 'current' path variable. + # This step incrementally reconstructs the full path one directory at a time. + current="$current/$part" + + # Check if the current directory exists on HPSS using 'hsi ls'. + if ! hsi -q "ls $current" >/dev/null 2>&1; then + # If the directory does not exist, attempt to create it using 'hsi mkdir'. + if hsi "mkdir $current" >/dev/null 2>&1; then + echo "[LOG] Created directory $current." + else + echo "[ERROR] Failed to create directory $current." + exit 1 + fi + else + echo "[LOG] Directory $current already exists." + fi + done +fi + +# List the final HPSS directory tree for logging purposes. +# For some reason this gets logged in the project.err file, not the .out file. +hsi ls $DEST_PATH + +# ------------------------------------------------------------------ +# Transfer Logic: Check if SOURCE_PATH is a file or directory. +# ------------------------------------------------------------------ + +echo "[LOG] Determining type of SOURCE_PATH: $SOURCE_PATH" +if [ -f "$SOURCE_PATH" ]; then + # Case: Single file detected. + echo "[LOG] Single file detected. Transferring via hsi cput." + FILE_NAME=$(basename "$SOURCE_PATH") + echo "[LOG] File name: $FILE_NAME" + # Note about hsi cput: If the file already exists on HPSS, hsi cput will skip the transfer. + hsi cput "$SOURCE_PATH" "$DEST_PATH/$FILE_NAME" + echo "[LOG] (Simulated) File transfer completed for $FILE_NAME." +elif [ -d "$SOURCE_PATH" ]; then + # Case: Directory detected. + echo "[LOG] Directory detected. Initiating bundling process." + + # ------------------------------------------------------------------ + # Define thresholds + # - THRESHOLD: maximum total size per HTAR archive (2 TB). + # - MEMBER_LIMIT: maximum size per member file in an HTAR (set to 65 GB). + # ------------------------------------------------------------------ + + THRESHOLD=2199023255552 # 2 TB in bytes. + MEMBER_LIMIT=$((65*1024**3)) # 65 GB in bytes. 68 GB is the htar limit. Move files >65 GB than this using hsi cput. + echo "[LOG] Threshold set to 2 TB (bytes): $THRESHOLD" + echo "[LOG] Threshold for individual file transfer (bytes): $MEMBER_LIMIT" + + # ------------------------------------------------------------------ + # Generate a list of relative file paths in the project directory. + # This list will be used to group files by their modification date. + # ------------------------------------------------------------------ + # Create a temporary file to store the list of relative file paths. + # Explanation: + # 1. FILE_LIST=$(mktemp) + # - mktemp creates a unique temporary file and its path is stored in FILE_LIST. + # + # 2. (cd "$SOURCE_PATH" && find . -type f | sed 's|^\./||') + # - The parentheses run the commands in a subshell, so the directory change does not affect the current shell. + # - cd "$SOURCE_PATH": Changes the working directory to the source directory. + # - find . -type f: Recursively finds all files starting from the current directory (which is now SOURCE_PATH), + # outputting paths prefixed with "./". + # - sed 's|^\./||': Removes the leading "./" from each file path, resulting in relative paths without the prefix. + # + # 3. The output is then redirected into the temporary file specified by FILE_LIST. + # ------------------------------------------------------------------ + + echo "[LOG] Grouping files by modification date." + + FILE_LIST=$(mktemp) + (cd "$SOURCE_PATH" && find . 
-type f | sed 's|^\./||') > "$FILE_LIST" + + echo "[LOG] List of files stored in temporary file: $FILE_LIST" + + # ------------------------------------------------------------------ + # Filter out oversized files (>65GB) for immediate transfer + # - For each file: + # • If fsize > MEMBER_LIMIT: transfer via hsi cput. + # • Else: add path to new list for bundling. + # ------------------------------------------------------------------ + + echo "[LOG] Beginning oversized-file filtering (> $MEMBER_LIMIT bytes)" + FILTERED_LIST=$(mktemp) + echo "[LOG] Writing remaining file paths to $FILTERED_LIST" + + while IFS= read -r f; do + # Absolute local path and size + full_local="$SOURCE_PATH/$f" + fsize=$(stat -c %s "$full_local") + + if (( fsize > MEMBER_LIMIT )); then + # Relative subdirectory and filename + rel_dir=$(dirname "$f") + fname=$(basename "$f") + + # Compute HPSS directory under project (create if needed) + if [ "$rel_dir" = "." ]; then + dest_dir="$DEST_PATH" + else + dest_dir="$DEST_PATH/$rel_dir" + fi + + if ! hsi -q "ls $dest_dir" >/dev/null 2>&1; then + echo "[LOG] Creating HPSS directory $dest_dir" + hsi mkdir "$dest_dir" + fi + + # Full remote file path (directory + filename) + remote_file="$dest_dir/$fname" + + # Transfer via conditional put + echo "[LOG] Transferring oversized file '$f' ($fsize bytes) to HPSS path $remote_file" + echo "[DEBUG] hsi cput \"$full_local\" : \"$remote_file\"" + hsi cput "$full_local" : "$remote_file" + echo "[LOG] Completed hsi cput for '$f'." + else + # Keep for bundling later + echo "$f" >> "$FILTERED_LIST" + fi + done < "$FILE_LIST" + + # Swap in the filtered list and report + mv "$FILTERED_LIST" "$FILE_LIST" + remaining=$(wc -l < "$FILE_LIST") + echo "[LOG] Oversized-file transfer done. Remaining for bundling: $remaining files." + + # ------------------------------------------------------------------ + # Cycle-based grouping & tar-bundling logic (unchanged). + # ------------------------------------------------------------------ + + # Declare associative arrays to hold grouped file paths and sizes. + declare -A group_files + declare -A group_sizes + + # ------------------------------------------------------------------ + # Group files by modification date. + # ------------------------------------------------------------------ + + cd "$SOURCE_PATH" && \ + while IFS= read -r file; do + mtime=$(stat -c %Y "$file") + year=$(date -d @"$mtime" +%Y) + month=$(date -d @"$mtime" +%m | sed 's/^0*//') + day=$(date -d @"$mtime" +%d | sed 's/^0*//') + # Determine cycle: Cycle 1 if month < 7 or (month == 7 and day <= 15), else Cycle 2. + if [ "$month" -lt 7 ] || {{ [ "$month" -eq 7 ] && [ "$day" -le 15 ]; }}; then + cycle=1 + else + cycle=2 + fi + key="${{year}}-${{cycle}}" + group_files["$key"]="${{group_files["$key"]:-}} $file" + fsize=$(stat -c %s "$file") + group_sizes["$key"]=$(( ${{group_sizes["$key"]:-0}} + fsize )) + done < "$FILE_LIST" + rm "$FILE_LIST" + echo "[LOG] Completed grouping files." + + # Print the files in each group at the end + for key in "${{!group_files[@]}}"; do + echo "[LOG] Group $key contains files:" + for f in ${{group_files["$key"]}}; do + echo " $f" + done + done + + # ------------------------------------------------------------------ + # Bundle files into tar archives. 
+ # ------------------------------------------------------------------ + for key in "${{!group_files[@]}}"; do + files=(${{group_files["$key"]}}) + total_group_size=${{group_sizes["$key"]}} + echo "[LOG] Processing group $key with ${{#files[@]}} files; total size: $total_group_size bytes." + + part=0 + current_size=0 + current_files=() + for f in "${{files[@]}}"; do + fsize=$(stat -c %s "$f") + # If adding this file exceeds the threshold, process the current bundle. + if (( current_size + fsize > THRESHOLD && ${{#current_files[@]}} > 0 )); then + if [ $part -eq 0 ]; then + tar_name="${{FOLDER_NAME}}_${{key}}.tar" + else + tar_name="${{FOLDER_NAME}}_${{key}}_part${{part}}.tar" + fi + echo "[LOG] Bundle reached threshold." + echo "[LOG] Files in current bundle:" + for file in "${{current_files[@]}}"; do + echo "$file" + done + echo "[LOG] Creating archive $tar_name with ${{#current_files[@]}} files; bundle size: $current_size bytes." + (cd "$SOURCE_PATH" && htar -cvf "${{DEST_PATH}}/${{tar_name}}" $(printf "%s " "${{current_files[@]}}")) + part=$((part+1)) + echo "[DEBUG] Resetting bundle variables." + current_files=() + current_size=0 + fi + current_files+=("$f") + current_size=$(( current_size + fsize )) + done + if [ ${{#current_files[@]}} -gt 0 ]; then + if [ $part -eq 0 ]; then + tar_name="${{FOLDER_NAME}}_${{key}}.tar" + else + tar_name="${{FOLDER_NAME}}_${{key}}_part${{part}}.tar" + fi + echo "[LOG] Final bundle for group $key:" + echo "[LOG] Files in final bundle:" + for file in "${{current_files[@]}}"; do + echo "$file" + done + echo "[LOG] Creating final archive $tar_name with ${{#current_files[@]}} files." + echo "[LOG] Bundle size: $current_size bytes." + (cd "$SOURCE_PATH" && htar -cvf "${{DEST_PATH}}/${{tar_name}}" $(printf "%s " "${{current_files[@]}}")) + fi + echo "[LOG] Completed processing group $key." + done +else + echo "[ERROR] $SOURCE_PATH is neither a file nor a directory. Exiting." + exit 1 +fi + +# ------------------------------------------------------------------ +# Logging: Display directory trees for both source and destination. +# ------------------------------------------------------------------ +echo "[LOG] Listing Source (CFS) Tree:" +if [ -d "$SOURCE_PATH" ]; then + find "$SOURCE_PATH" -print +else + echo "[LOG] $SOURCE_PATH is a file." +fi + +echo "[LOG] Listing Destination (HPSS) Tree:" +hsi ls -R "$DEST_PATH" || echo "[ERROR] Failed to list HPSS tree at $DEST_PATH" + +echo "[LOG] Job completed at: $(date)" \ No newline at end of file diff --git a/orchestration/slurm/hpss_to_cfs.slurm b/orchestration/slurm/hpss_to_cfs.slurm new file mode 100644 index 00000000..e5161193 --- /dev/null +++ b/orchestration/slurm/hpss_to_cfs.slurm @@ -0,0 +1,111 @@ +#!/bin/bash +#SBATCH -q xfer # Specify the SLURM queue +#SBATCH -A als # Specify the account. +#SBATCH -C cron # Use the 'cron' constraint. +#SBATCH --time=12:00:00 # Maximum runtime of 12 hours. +#SBATCH --job-name=transfer_from_HPSS_{file_path} # Set a descriptive job name. +#SBATCH --output={logs_path}/{sanitized_path}_from_hpss_%j.out # Standard output log file. +#SBATCH --error={logs_path}/{sanitized_path}_from_hpss_%j.err # Standard error log file. +#SBATCH --licenses=SCRATCH # Request the SCRATCH license. +#SBATCH --mem=2GB # Request #GB of memory. Default 2GB. +set -euo pipefail # Enable strict error checking. +echo "[LOG] Job started at: $(date)" + +# ------------------------------------------------------------------- +# Define source and destination variables. 
+# ------------------------------------------------------------------- + +echo "[LOG] Defining source and destination paths." + +# SOURCE_PATH: Full path of the file or directory on HPSS. +SOURCE_PATH="{hpss_path}" +echo "[LOG] SOURCE_PATH set to: $SOURCE_PATH" + +# DEST_ROOT: Root destination on CFS built from configuration. +DEST_ROOT="{dest_root}" +echo "[LOG] DEST_ROOT set to: $DEST_ROOT" + +# FILES_TO_EXTRACT: Specific files to extract from the tar archive, if any. +# If not provided, this will be empty. +FILES_TO_EXTRACT="{files_to_extract_str}" +echo "[LOG] FILES_TO_EXTRACT set to: $FILES_TO_EXTRACT" + +# ------------------------------------------------------------------- +# Verify that SOURCE_PATH exists on HPSS using hsi ls. +# ------------------------------------------------------------------- + +echo "[LOG] Verifying file existence with hsi ls." +if ! hsi ls "$SOURCE_PATH" >/dev/null 2>&1; then + echo "[ERROR] File not found on HPSS: $SOURCE_PATH" + exit 1 +fi + +# ------------------------------------------------------------------- +# Determine the transfer mode based on the type (file vs tar). +# ------------------------------------------------------------------- + +echo "[LOG] Determining transfer mode based on the type (file vs tar)." + +# Check if SOURCE_PATH ends with .tar +if [[ "$SOURCE_PATH" =~ \.tar$ ]]; then + # If FILES_TO_EXTRACT is nonempty, MODE becomes "partial", else MODE is "tar". + if [ -n "${{FILES_TO_EXTRACT}}" ]; then + MODE="partial" + else + MODE="tar" + fi +else + MODE="single" +fi + +echo "Transfer mode: $MODE" + +# ------------------------------------------------------------------- +# Transfer Logic: Based on the mode, perform the appropriate transfer. +# ------------------------------------------------------------------- + +if [ "$MODE" = "single" ]; then + echo "[LOG] Single file detected. Using hsi get." + mkdir -p "$DEST_ROOT" + hsi get "$SOURCE_PATH" "$DEST_ROOT/" +elif [ "$MODE" = "tar" ]; then + echo "[LOG] Tar archive detected. Extracting entire archive using htar." + ARCHIVE_BASENAME=$(basename "$SOURCE_PATH") + ARCHIVE_NAME="${{ARCHIVE_BASENAME%.tar}}" + DEST_PATH="${{DEST_ROOT}}/${{ARCHIVE_NAME}}" + echo "[LOG] Extracting to: $DEST_PATH" + mkdir -p "$DEST_PATH" + htar -xvf "$SOURCE_PATH" -C "$DEST_PATH" +elif [ "$MODE" = "partial" ]; then + echo "[LOG] Partial extraction detected. Extracting selected files using htar." + ARCHIVE_BASENAME=$(basename "$SOURCE_PATH") + ARCHIVE_NAME="${{ARCHIVE_BASENAME%.tar}}" + DEST_PATH="${{DEST_ROOT}}/${{ARCHIVE_NAME}}" + + # Verify that each requested file exists in the tar archive. + echo "[LOG] Verifying requested files are in the tar archive." + ARCHIVE_CONTENTS=$(htar -tvf "$SOURCE_PATH") + echo "[LOG] List: $ARCHIVE_CONTENTS" + for file in $FILES_TO_EXTRACT; do + echo "[LOG] Checking for file: $file" + if ! echo "$ARCHIVE_CONTENTS" | grep -q "$file"; then + echo "[ERROR] Requested file '$file' not found in archive $SOURCE_PATH" + exit 1 + else + echo "[LOG] File '$file' found in archive." + fi + done + + echo "[LOG] All requested files verified. Proceeding with extraction." + mkdir -p "$DEST_PATH" + (cd "$DEST_PATH" && htar -xvf "$SOURCE_PATH" -Hnostage $FILES_TO_EXTRACT) + + echo "[LOG] Extraction complete. 
Listing contents of $DEST_PATH:"
+    ls -l "$DEST_PATH"
+
+else
+    echo "[ERROR]: Unknown mode: $MODE"
+    exit 1
+fi
+
+date
\ No newline at end of file
diff --git a/orchestration/slurm/ls_hpss.slurm b/orchestration/slurm/ls_hpss.slurm
new file mode 100644
index 00000000..a47157fb
--- /dev/null
+++ b/orchestration/slurm/ls_hpss.slurm
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH -q xfer
+#SBATCH -A als
+#SBATCH -C cron
+#SBATCH --time=00:10:00
+#SBATCH --job-name={job_name}
+#SBATCH --output={out_pattern}
+#SBATCH --error={err_pattern}
+
+set -euo pipefail
+
+echo "[LOG] Listing HPSS path: {full_hpss}"
+{cmd}
\ No newline at end of file
diff --git a/orchestration/slurm/prune_hpss.slurm b/orchestration/slurm/prune_hpss.slurm
new file mode 100644
index 00000000..bb01f11e
--- /dev/null
+++ b/orchestration/slurm/prune_hpss.slurm
@@ -0,0 +1,30 @@
+#!/bin/bash
+# ------------------------------------------------------------------
+# SLURM Job Script for Pruning Data from HPSS
+# ------------------------------------------------------------------
+
+#SBATCH -q xfer                      # Specify the SLURM queue to use.
+#SBATCH -A als                       # Specify the account.
+#SBATCH -C cron                      # Use the 'cron' constraint.
+#SBATCH --time=12:00:00              # Maximum runtime of 12 hours.
+#SBATCH --job-name=prune_from_HPSS_{relative_path}   # Set a descriptive job name.
+#SBATCH --output={logs_path}/{relative_path}_prune_from_hpss_%j.out   # Standard output log file.
+#SBATCH --error={logs_path}/{relative_path}_prune_from_hpss_%j.err    # Standard error log file.
+#SBATCH --licenses=SCRATCH           # Request the SCRATCH license.
+#SBATCH --mem=20GB                   # Request 20GB of memory (default is 2GB).
+
+set -euo pipefail # Enable strict error checking.
+echo "[LOG] Job started at: $(date)"
+
+# Check if the file exists on HPSS
+if hsi "ls {full_hpss_path}" &> /dev/null; then
+    echo "[LOG] File {relative_path} exists on HPSS. Proceeding to prune."
+    # Prune the file from HPSS
+    hsi "rm {full_hpss_path}"
+    echo "[LOG] File {relative_path} has been pruned from HPSS."
+    hsi ls -R {full_hpss_path} || true  # Listing the removed path is expected to fail after deletion.
+else
+    echo "[LOG] File {relative_path} was not found on HPSS. Check the file path and try again."
+    exit 0
+fi
+echo "[LOG] Job completed at: $(date)"
\ No newline at end of file
diff --git a/orchestration/transfer_controller.py b/orchestration/transfer_controller.py
index 09e82649..809e957e 100644
--- a/orchestration/transfer_controller.py
+++ b/orchestration/transfer_controller.py
@@ -5,92 +5,40 @@
 import logging
 import os
 import time
-from typing import Generic, TypeVar, Optional
+from typing import Generic, Optional, TypeVar
 
 import globus_sdk
 
-from orchestration.flows.bl832.config import Config832
-from orchestration.flows.bl832.job_controller import HPC
+# Import the generic Beamline configuration class.
+from orchestration.config import BeamlineConfig
 from orchestration.globus.transfer import GlobusEndpoint, start_transfer
 from orchestration.prometheus_utils import PrometheusMetrics
+from orchestration.transfer_endpoints import FileSystemEndpoint, TransferEndpoint
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 load_dotenv()
 
 
-class TransferEndpoint(ABC):
-    """
-    Abstract base class for endpoints.
-    """
-    def __init__(
-        self,
-        name: str,
-        root_path: str
-    ) -> None:
-        self.name = name
-        self.root_path = root_path
-
-    def name(self) -> str:
-        """
-        A human-readable or reference name for the endpoint.
- """ - return self.name - - def root_path(self) -> str: - """ - Root path or base directory for this endpoint. - """ - return self.root_path - - -class FileSystemEndpoint(TransferEndpoint): - """ - A file system endpoint. - - Args: - TransferEndpoint: Abstract class for endpoints. - """ - def __init__( - self, - name: str, - root_path: str - ) -> None: - super().__init__(name, root_path) - - def full_path( - self, - path_suffix: str - ) -> str: - """ - Constructs the full path by appending the path_suffix to the root_path. - - Args: - path_suffix (str): The relative path to append. - - Returns: - str: The full absolute path. - """ - if path_suffix.startswith("/"): - path_suffix = path_suffix[1:] - return f"{self.root_path.rstrip('/')}/{path_suffix}" - - Endpoint = TypeVar("Endpoint", bound=TransferEndpoint) class TransferController(Generic[Endpoint], ABC): """ - Abstract class for transferring data. + Abstract base class for transferring data between endpoints. + + This class defines the common interface that all transfer controllers must implement, + regardless of the specific transfer mechanism they use. Args: - ABC: Abstract Base Class + config (BeamlineConfig): Configuration object containing endpoints and credentials """ def __init__( self, - config: Config832 + config: BeamlineConfig ) -> None: self.config = config + logger.debug(f"Initialized {self.__class__.__name__} with config for beamline {config.beamline_id}") @abstractmethod def copy( @@ -103,9 +51,9 @@ def copy( Copy a file from a source endpoint to a destination endpoint. Args: - file_path (str): The path of the file to copy. - source (Endpoint): The source endpoint. - destination (Endpoint): The destination endpoint. + file_path (str): The path of the file to copy, relative to the endpoint's root path + source (Endpoint): The source endpoint from which to copy the file + destination (Endpoint): The destination endpoint to which to copy the file Returns: bool: True if the transfer was successful, False otherwise. @@ -115,18 +63,28 @@ def copy( class GlobusTransferController(TransferController[GlobusEndpoint]): """ - Use Globus Transfer to move data between endpoints. + Use Globus Transfer to move data between Globus endpoints. + + This controller handles the transfer of files between Globus endpoints using the + Globus Transfer API. It manages authentication, transfer submissions, and status tracking. Args: - TransferController: Abstract class for transferring data. + config (BeamlineConfig): Configuration object containing Globus endpoints and credentials """ + def __init__( self, - config: Config832, + config: BeamlineConfig, prometheus_metrics: Optional[PrometheusMetrics] = None ) -> None: super().__init__(config) self.prometheus_metrics = prometheus_metrics + """ + Use Globus Transfer to move data between endpoints. + + Args: + TransferController: Abstract class for transferring data. + """ def get_transfer_file_info( self, @@ -135,23 +93,23 @@ def get_transfer_file_info( ) -> Optional[dict]: """ Get information about a completed transfer from the Globus API. 
- + Args: task_id (str): The Globus transfer task ID transfer_client (TransferClient, optional): TransferClient instance - + Returns: Optional[dict]: Task information including bytes_transferred, or None if unavailable """ if transfer_client is None: transfer_client = self.config.tc - + try: task_info = transfer_client.get_task(task_id) task_dict = task_info.data if task_dict.get('status') == 'SUCCEEDED': - bytes_transferred = task_dict.get('bytes_transferred', 0) + bytes_transferred = task_dict.get('bytes_transferred', 0) bytes_checksummed = task_dict.get('bytes_checksummed', 0) files_transferred = task_dict.get('files_transferred', 0) effective_bytes_per_second = task_dict.get('effective_bytes_per_second', 0) @@ -161,13 +119,13 @@ def get_transfer_file_info( 'files_transferred': files_transferred, 'effective_bytes_per_second': effective_bytes_per_second } - + return None - + except Exception as e: logger.error(f"Error getting transfer task info: {e}") return None - + def collect_and_push_metrics( self, start_time: float, @@ -181,7 +139,7 @@ def collect_and_push_metrics( ) -> None: """ Collect transfer metrics and push them to Prometheus. - + Args: start_time (float): Transfer start time as UNIX timestamp. end_time (float): Transfer end time as UNIX timestamp. @@ -201,10 +159,10 @@ def collect_and_push_metrics( end_datetime = datetime.datetime.fromtimestamp(end_time, tz=datetime.timezone.utc) start_timestamp = start_datetime.isoformat() end_timestamp = end_datetime.isoformat() - + # Calculate duration in seconds duration_seconds = end_time - start_time - + # Calculate transfer speed (bytes per second) # transfer_speed = file_size / duration_seconds if duration_seconds > 0 and file_size > 0 else 0 @@ -220,13 +178,13 @@ def collect_and_push_metrics( "status": "success" if success else "failed", "machine": machine_name } - + # Push metrics to Prometheus self.prometheus_metrics.push_metrics_to_prometheus(metrics, logger) - + except Exception as e: logger.error(f"Error collecting or pushing metrics: {e}") - + def copy( self, file_path: str = None, @@ -234,31 +192,46 @@ def copy( destination: GlobusEndpoint = None, ) -> bool: """ - Copy a file from a source endpoint to a destination endpoint. + Copy a file from a source Globus endpoint to a destination Globus endpoint. + + This method handles the full transfer process, including path normalization, + submission to the Globus Transfer API, and waiting for completion or error. Args: - file_path (str): The path of the file to copy. - source (GlobusEndpoint): The source endpoint. - destination (GlobusEndpoint): The destination endpoint. + file_path (str): The path of the file to copy, relative to the endpoint's root path + source (GlobusEndpoint): The source Globus endpoint from which to copy the file + destination (GlobusEndpoint): The destination Globus endpoint to which to copy the file Returns: - bool: True if the transfer was successful, False otherwise. 
+ bool: True if the transfer was successful, False otherwise + + Raises: + globus_sdk.services.transfer.errors.TransferAPIError: If there are issues with the Globus API """ - + if not file_path: + logger.error("No file path provided for transfer") + return False + + if not source or not destination: + logger.error("Missing source or destination endpoint for transfer") + return False + if not file_path: logger.error("No file_path provided") return False - + if not source or not destination: logger.error("Source or destination endpoint not provided") return False logger.info(f"Transferring {file_path} from {source.name} to {destination.name}") - # Remove leading slash if present + # Normalize the file path by removing leading slashes if present if file_path[0] == "/": file_path = file_path[1:] + logger.debug(f"Normalized file path to '{file_path}'") + # Build full paths for source and destination source_path = os.path.join(source.root_path, file_path) dest_path = os.path.join(destination.root_path, file_path) logger.info(f"Transferring {source_path} to {dest_path}") @@ -268,8 +241,9 @@ def copy( success = False task_id = None # Initialize task_id here to prevent UnboundLocalError file_size = 0 # Initialize file_size here as well - + try: + logger.info(f"Submitting Globus transfer task from {source.uuid} to {destination.uuid}") success, task_id = start_transfer( transfer_client=self.config.tc, source_endpoint=source, @@ -284,10 +258,16 @@ def copy( logger.info("Transfer completed successfully.") else: logger.error("Transfer failed.") - + except globus_sdk.services.transfer.errors.TransferAPIError as e: - logger.error(f"Failed to submit transfer: {e}") - + logger.error(f"Globus Transfer API error: {e}") + logger.error(f"Status code: {e.status_code if hasattr(e, 'status_code') else 'unknown'}") + logger.error(f"Error details: {e.data if hasattr(e, 'data') else e}") + return False + except Exception as e: + logger.error(f"Unexpected error during transfer: {str(e)}", exc_info=True) + return False + finally: # Stop the timer and calculate the duration transfer_end_time = time.time() @@ -300,7 +280,7 @@ def copy( transfer_speed = transfer_info.get('effective_bytes_per_second', 0) logger.info(f"Globus Task Info: Transferred {file_size} bytes ") logger.info(f"Globus Task Info: Effective speed: {transfer_speed} bytes/second") - + # Collect and push metrics if enabled if self.prometheus_metrics and file_size > 0: self.collect_and_push_metrics( @@ -313,49 +293,61 @@ def copy( transfer_speed=transfer_speed, success=success, ) - + return success class SimpleTransferController(TransferController[FileSystemEndpoint]): - def __init__(self, config: Config832) -> None: - super().__init__(config) """ Use a simple 'cp' command to move data within the same system. + This controller is suitable for transfers between directories on the same + file system, where network transfer protocols are not needed. + Args: - TransferController: Abstract class for transferring data. + config (BeamlineConfig): Configuration object containing file system paths """ + def __init__( + self, + config: BeamlineConfig + ) -> None: + super().__init__(config) def copy( self, - file_path: str = "", + file_path: str = None, source: FileSystemEndpoint = None, destination: FileSystemEndpoint = None, ) -> bool: """ - Copy a file from a source endpoint to a destination endpoint using the 'cp' command. + Copy a file from a source directory to a destination directory using the 'cp' command. 
+ + This method handles local file copying through the system's cp command, + including path normalization and status tracking. Args: - file_path (str): The path of the file to copy. - source (FileSystemEndpoint): The source endpoint. - destination (FileSystemEndpoint): The destination endpoint. + file_path (str): The path of the file to copy, relative to the endpoint's root path + source (FileSystemEndpoint): The source file system location + destination (FileSystemEndpoint): The destination file system location Returns: bool: True if the transfer was successful, False otherwise. """ if not file_path: - logger.error("No file_path provided.") + logger.error("No file_path provided for local copy operation") return False if not source or not destination: - logger.error("Source or destination endpoint not provided.") + logger.error("Source or destination endpoint not provided for local copy operation") return False logger.info(f"Transferring {file_path} from {source.name} to {destination.name}") + # Normalize file path by removing leading slash if present if file_path.startswith("/"): file_path = file_path[1:] + logger.debug(f"Normalized file path to '{file_path}'") + # Build full paths for source and destination source_path = os.path.join(source.root_path, file_path) dest_path = os.path.join(destination.root_path, file_path) logger.info(f"Transferring {source_path} to {dest_path}") @@ -364,72 +356,98 @@ def copy( start_time = time.time() try: + # Check if source file/directory exists + if not os.path.exists(source_path): + logger.error(f"Source path does not exist: {source_path}") + return False + + # Ensure destination directory exists + dest_dir = os.path.dirname(dest_path) + if not os.path.exists(dest_dir): + logger.debug(f"Creating destination directory: {dest_dir}") + os.makedirs(dest_dir, exist_ok=True) + + # Execute the cp command result = os.system(f"cp -r '{source_path}' '{dest_path}'") if result == 0: - logger.info("Transfer completed successfully.") + logger.info(f"Local copy of '{file_path}' completed successfully") return True else: - logger.error(f"Transfer failed with exit code {result}.") + logger.error(f"Local copy of '{file_path}' failed with exit code {result}") return False except Exception as e: - logger.error(f"Transfer failed: {e}") + logger.error(f"Unexpected error during local copy: {str(e)}", exc_info=True) return False finally: # Stop the timer and calculate the duration elapsed_time = time.time() - start_time - logger.info(f"Transfer process took {elapsed_time:.2f} seconds.") + logger.info(f"Local copy process took {elapsed_time:.2f} seconds") class CopyMethod(Enum): """ Enum representing different transfer methods. - Use enum names as strings to identify transfer methods, ensuring a standard set of values. + + These values are used to select the appropriate transfer controller + through the factory function get_transfer_controller(). """ - GLOBUS = "globus" - SIMPLE = "simple" + GLOBUS = "globus" # Transfer between Globus endpoints + SIMPLE = "simple" # Local filesystem copy + CFS_TO_HPSS = "cfs_to_hpss" # NERSC CFS to HPSS tape archive + HPSS_TO_CFS = "hpss_to_cfs" # HPSS tape archive to NERSC CFS def get_transfer_controller( transfer_type: CopyMethod, - config: Config832, + config: BeamlineConfig, prometheus_metrics: Optional[PrometheusMetrics] = None ) -> TransferController: """ - Get the appropriate transfer controller based on the transfer type. + Factory function to get the appropriate transfer controller based on the transfer type. 
Args: - transfer_type (str): The type of transfer to perform. - config (Config832): The configuration object. + transfer_type (CopyMethod): The type of transfer to perform + config (BeamlineConfig): The configuration object containing endpoint information Returns: - TransferController: The transfer controller object. + TransferController: The appropriate transfer controller instance + + Raises: + ValueError: If an invalid transfer type is provided """ + # Add explicit type checking to handle non-enum inputs + if not isinstance(transfer_type, CopyMethod): + error_msg = f"Invalid transfer type: {transfer_type}" + logger.error(error_msg) + raise ValueError(error_msg) + + logger.debug(f"Creating transfer controller of type: {transfer_type.name}") + if transfer_type == CopyMethod.GLOBUS: + logger.debug("Returning GlobusTransferController") return GlobusTransferController(config, prometheus_metrics) elif transfer_type == CopyMethod.SIMPLE: + logger.debug("Returning SimpleTransferController") return SimpleTransferController(config) + elif transfer_type == CopyMethod.CFS_TO_HPSS: + logger.debug("Importing and returning CFSToHPSSTransferController") + # Import here to avoid circular dependencies + from orchestration.hpss import CFSToHPSSTransferController + from orchestration.sfapi import create_sfapi_client + return CFSToHPSSTransferController( + client=create_sfapi_client(), + config=config + ) + elif transfer_type == CopyMethod.HPSS_TO_CFS: + logger.debug("Importing and returning HPSSToCFSTransferController") + # Import here to avoid circular dependencies + from orchestration.hpss import HPSSToCFSTransferController + from orchestration.sfapi import create_sfapi_client + return HPSSToCFSTransferController( + client=create_sfapi_client(), + config=config + ) else: - raise ValueError(f"Invalid transfer type: {transfer_type}") - - -if __name__ == "__main__": - config = Config832() - transfer_type = CopyMethod.GLOBUS - globus_transfer_controller = get_transfer_controller(transfer_type, config) - globus_transfer_controller.copy( - file_path="dabramov/test.txt", - source=config.alcf832_raw, - destination=config.alcf832_scratch - ) - - simple_transfer_controller = get_transfer_controller(CopyMethod.SIMPLE, config) - success = simple_transfer_controller.copy( - file_path="test.rtf", - source=FileSystemEndpoint("source", "/Users/david/Documents/copy_test/test_source/"), - destination=FileSystemEndpoint("destination", "/Users/david/Documents/copy_test/test_destination/") - ) - - if success: - logger.info("Simple transfer succeeded.") - else: - logger.error("Simple transfer failed.") \ No newline at end of file + error_msg = f"Invalid transfer type: {transfer_type}" + logger.error(error_msg) + raise ValueError(error_msg) diff --git a/orchestration/transfer_endpoints.py b/orchestration/transfer_endpoints.py new file mode 100644 index 00000000..40a04180 --- /dev/null +++ b/orchestration/transfer_endpoints.py @@ -0,0 +1,98 @@ +# orchestration/transfer_endpoints.py +from abc import ABC +from pathlib import Path + + +class TransferEndpoint(ABC): + """ + Abstract base class for endpoints. + """ + def __init__( + self, + name: str, + root_path: str, + uri: str + ) -> None: + self.name = name + self.root_path = root_path + self.uri = uri + + def name(self) -> str: + """ + A human-readable or reference name for the endpoint. + """ + return self.name + + def root_path(self) -> str: + """ + Root path or base directory for this endpoint. 
+ """ + return self.root_path + + def uri(self) -> str: + """ + Uri for this endpoint. + """ + return self.uri + + +class FileSystemEndpoint(TransferEndpoint): + """ + A file system endpoint. + + Args: + TransferEndpoint: Abstract class for endpoints. + """ + def __init__( + self, + name: str, + root_path: str, + uri: str + ) -> None: + super().__init__(name, root_path, uri) + + def full_path( + self, + path_suffix: str + ) -> str: + """ + Constructs the full path by appending the path_suffix to the root_path. + + Args: + path_suffix (str): The relative path to append. + + Returns: + str: The full absolute path. + """ + return str(Path(self.root_path) / path_suffix) + + +class HPSSEndpoint(TransferEndpoint): + """ + An HPSS endpoint. + + Args: + TransferEndpoint: Abstract class for endpoints. + """ + def __init__( + self, + name: str, + root_path: str, + uri: str + ) -> None: + super().__init__(name, root_path, uri) + + def full_path(self, path_suffix: str) -> str: + """ + Constructs the full path by appending the path_suffix to the HPSS endpoint's root_path. + This is used by the HPSS transfer controllers to compute the absolute path on HPSS. + + Args: + path_suffix (str): The relative path to append. + + Returns: + str: The full absolute path. + """ + if path_suffix.startswith("/"): + path_suffix = path_suffix[1:] + return f"{self.root_path.rstrip('/')}/{path_suffix}" diff --git a/requirements.txt b/requirements.txt index 24b8e9e4..4453a70f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,8 @@ +authlib +dynaconf +globus-compute-sdk @ git+https://github.com/globus/globus-compute.git@d1731340074be56861ec91d732bdff44f8e2b46e#subdirectory=compute_sdk globus-sdk>=3.0 +griffe>=0.49.0,<2.0.0 h5py httpx>=0.22.0 mkdocs @@ -9,10 +13,7 @@ pillow pydantic==2.11 python-dotenv prefect==2.20.17 +prometheus_client==0.21.1 pyscicat pyyaml -authlib -sfapi_client -globus-compute-sdk @ git+https://github.com/globus/globus-compute.git@d1731340074be56861ec91d732bdff44f8e2b46e#subdirectory=compute_sdk -griffe>=0.49.0,<2.0.0 -prometheus_client==0.21.1 +sfapi_client \ No newline at end of file diff --git a/scripts/check_hpss.py b/scripts/check_hpss.py new file mode 100644 index 00000000..d625da70 --- /dev/null +++ b/scripts/check_hpss.py @@ -0,0 +1,137 @@ +from orchestration.hpss import ( + cfs_to_hpss_flow, + hpss_to_cfs_flow, + get_prune_controller, + get_transfer_controller, + PruneMethod, + CopyMethod, +) +from orchestration.transfer_endpoints import FileSystemEndpoint, HPSSEndpoint +import datetime +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +TEST_HPSS_PRUNE = False +TEST_CFS_TO_HPSS = False +TEST_HPSS_TO_CFS = False +TEST_HPSS_LS = True + +# ------------------------------------------------------ +# Test pruning from HPSS +# ------------------------------------------------------ +if TEST_HPSS_PRUNE: + from orchestration.flows.bl832.config import Config832 + config = Config832() + file_name = "8.3.2/raw/ALS-11193_nbalsara/ALS-11193_nbalsara_2022-2.tar" + source = HPSSEndpoint( + name="HPSS", + root_path=config.hpss_alsdev["root_path"], + uri=config.hpss_alsdev["uri"] + ) + + days_from_now = datetime.timedelta(days=0) # Prune immediately + + prune_controller = get_prune_controller( + prune_type=PruneMethod.HPSS, + config=config + ) + prune_controller.prune( + file_path=f"{file_name}", + source_endpoint=source, + check_endpoint=None, + days_from_now=days_from_now + ) +# ------------------------------------------------------ +# Test transfer 
from CFS to HPSS +# ------------------------------------------------------ +if TEST_CFS_TO_HPSS: + from orchestration.flows.bl832.config import Config832 + config = Config832() + project_name = "ALS-11193_nbalsara" + source = FileSystemEndpoint( + name="CFS", + root_path="/global/cfs/cdirs/als/data_mover/8.3.2/raw/", + uri="nersc.gov" + ) + destination = HPSSEndpoint( + name="HPSS", + root_path=config.hpss_alsdev["root_path"], + uri=config.hpss_alsdev["uri"] + ) + cfs_to_hpss_flow( + file_path=f"{project_name}", + source=source, + destination=destination, + config=config + ) + +# ------------------------------------------------------ +# Test transfer from HPSS to CFS +# ------------------------------------------------------ +if TEST_HPSS_TO_CFS: + from orchestration.flows.bl832.config import Config832 + config = Config832() + relative_file_path = f"{config.beamline_id}/raw/ALS-11193_nbalsara/ALS-11193_nbalsara_2022-2.tar" + source = HPSSEndpoint( + name="HPSS", + root_path=config.hpss_alsdev["root_path"], # root_path: /home/a/alsdev/data_mover + uri=config.hpss_alsdev["uri"] + ) + destination = FileSystemEndpoint( + name="CFS", + root_path="/global/cfs/cdirs/als/data_mover/8.3.2/retrieved_from_tape", + uri="nersc.gov" + ) + + files_to_extract = [ + "20221109_012020_MSB_Book1_Proj33_Cell5_2pFEC_LiR2_6C_Rest3.h5", + "20221012_172023_DTH_100722_LiT_r01_cell3_10x_0_19_CP2.h5", + ] + + hpss_to_cfs_flow( + file_path=f"{relative_file_path}", + source=source, + destination=destination, + files_to_extract=files_to_extract, + config=config + ) + +# ------------------------------------------------------ +# Test listing HPSS files +# ------------------------------------------------------ +if TEST_HPSS_LS: + from orchestration.flows.bl832.config import Config832 + + # Build client, config, endpoint + config = Config832() + endpoint = HPSSEndpoint( + name="HPSS", + root_path=config.hpss_alsdev["root_path"], + uri=config.hpss_alsdev["uri"] + ) + + # Instantiate controller + transfer_controller = get_transfer_controller( + transfer_type=CopyMethod.CFS_TO_HPSS, + config=config + ) + + # Directory listing + project_path = f"{config.beamline_id}/raw/BLS-00564_dyparkinson" + logger.info("Controller-based directory listing on HPSS:") + output_file = transfer_controller.list_hpss( + endpoint=endpoint, + remote_path=project_path, + recursive=True + ) + + # TAR archive listing + archive_name = project_path.split("/")[-1] + tar_path = f"{project_path}/{archive_name}_2023-1.tar" + logger.info("Controller-based tar archive listing on HPSS:") + output_file = transfer_controller.list_hpss( + endpoint=endpoint, + remote_path=tar_path, + recursive=False + ) diff --git a/scripts/login_to_globus_and_prefect.sh b/scripts/login_to_globus_and_prefect.sh index dbc57f9e..75ae958f 100755 --- a/scripts/login_to_globus_and_prefect.sh +++ b/scripts/login_to_globus_and_prefect.sh @@ -17,4 +17,7 @@ export GLOBUS_CLI_CLIENT_SECRET="$GLOBUS_CLIENT_SECRET" export GLOBUS_COMPUTE_CLIENT_ID="$GLOBUS_CLIENT_ID" export GLOBUS_COMPUTE_CLIENT_SECRET="$GLOBUS_CLIENT_SECRET" export PREFECT_API_KEY="$PREFECT_API_KEY" -export PREFECT_API_URL="$PREFECT_API_URL" \ No newline at end of file +export PREFECT_API_URL="$PREFECT_API_URL" +export SCICAT_API_URL="$SCICAT_API_URL" +export SCICAT_INGEST_USER="$SCICAT_INGEST_USER" +export SCICAT_INGEST_PASSWORD="$SCICAT_INGEST_PASSWORD" \ No newline at end of file diff --git a/scripts/test_controllers_end_to_end.py b/scripts/test_controllers_end_to_end.py new file mode 100644 index 00000000..66ab0cec --- 
/dev/null +++ b/scripts/test_controllers_end_to_end.py @@ -0,0 +1,664 @@ +""" +End-to-end tests for transfer, prune, and ingest controllers. +These tests are designed to be as generic as possible and should work with any beamline configuration. + +""" + +from datetime import datetime +from dotenv import load_dotenv +import logging +import os +import shutil +from typing import Optional + +from pyscicat.client import ScicatClient + +from orchestration.config import BeamlineConfig +from orchestration.flows.scicat.ingestor_controller import BeamlineIngestorController +from orchestration.globus import transfer +from orchestration.globus.transfer import GlobusEndpoint +from orchestration.hpss import cfs_to_hpss_flow, hpss_to_cfs_flow +from orchestration.prune_controller import get_prune_controller, PruneMethod +from orchestration.transfer_controller import get_transfer_controller, CopyMethod +from orchestration.transfer_endpoints import FileSystemEndpoint, HPSSEndpoint +from globus_sdk import TransferClient + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +load_dotenv() + + +# ---------------------------------------------------------------------------------------------------------------------- +# Setup Environment Configuration and Test Classes +# ---------------------------------------------------------------------------------------------------------------------- + + +def check_required_envvars() -> bool: + """ + Check for required environment variables before running the end-to-end tests. + """ + to_check = ( + "GLOBUS_CLIENT_ID", "GLOBUS_CLIENT_SECRET", + "PREFECT_API_URL", "PREFECT_API_KEY", + "SCICAT_API_URL", "SCICAT_INGEST_USER", "SCICAT_INGEST_PASSWORD", + "PATH_NERSC_CLIENT_ID", "PATH_NERSC_PRI_KEY", + ) + missing = [] + for var in to_check: + logger.info(f"Checking environment variable: {var}") + if var not in os.environ or os.environ.get(var) is None: + logger.warning(f"Environment variable {var} is not set.") + missing.append(var) + if missing: + logger.error("Missing required environment variables: %s", ", ".join(missing)) + return False + logger.info("All required environment variables are set.") + return True + + +class TestConfig(BeamlineConfig): + """ + Test configuration class for a beamline + """ + def __init__(self) -> None: + super().__init__(beamline_id="0.0.0") + + def _beam_specific_config(self) -> None: + from orchestration.globus import flows # Wait to import here to ensure checked env vars first + self.endpoints = transfer.build_endpoints(self.config) + self.apps = transfer.build_apps(self.config) + self.tc: TransferClient = transfer.init_transfer_client(self.apps["als_transfer"]) + self.flow_client = flows.get_flows_client() + self.nersc_alsdev = self.endpoints["nersc_alsdev"] # root: /global/homes/a/alsdev/test_directory/ + self.hpss_alsdev = self.config["hpss_alsdev"] + self.scicat = self.config["scicat"] + + +class TestIngestorController(BeamlineIngestorController): + """ + Test ingestor controller class for SciCat that does very basic ingest operations. + + Works with scicatlive v3.2.5 + https://github.com/SciCatProject/scicatlive + """ + def __init__( + self, + config: TestConfig, + scicat_client: Optional[ScicatClient] = None + ) -> None: + super().__init__(config, scicat_client) + + def ingest_new_raw_dataset( + self, + file_path: str = "", + ) -> str: + """ + Create a minimal raw dataset in SciCat for the given file. + + Args: + file_path: Path to the file to ingest. + + Returns: + str: The SciCat ID of the created dataset. 
+ """ + if not self.scicat_client: + logger.error("SciCat client not initialized. Call get_scicat_client first.") + raise ValueError("SciCat client not initialized. Call get_scicat_client first.") + + # Create minimal metadata for the dataset + from pyscicat.model import CreateDatasetOrigDatablockDto, DataFile, RawDataset + + filename = os.path.basename(file_path) + basename = os.path.splitext(filename)[0] + + logger.info(f"Creating raw dataset for {filename}") + + try: + # Create a RawDataset object directly with parameters + # Making sure to include principalInvestigator as a string + dataset = RawDataset( + owner="ingestor", + contactEmail="test@example.com", + creationLocation=f"/test/location/{basename}", + sourceFolder="/test/source/folder", + datasetName=basename, + type="raw", + proposalId="test-proposal", + description=f"Test dataset for {filename}", + ownerGroup="admin", + accessGroups=["admin"], + creationTime=datetime.now().isoformat(), + principalInvestigator="Test Investigator" # Add this required field + ) + + # Upload dataset to SciCat + dataset_id = self.scicat_client.upload_new_dataset(dataset) + + logger.info(f"Created raw dataset with ID: {dataset_id}") + + # Add a dummy file to the datablock + dummy_file = DataFile( + path=file_path, + size=1024, # Dummy size + time=datetime.now().isoformat() + ) + + datablock = CreateDatasetOrigDatablockDto( + size=1024, # Dummy size + dataFileList=[dummy_file] + ) + + # Attach the datablock to the dataset + self.scicat_client.upload_dataset_origdatablock(dataset_id, datablock) + logger.info(f"Added datablock to dataset {dataset_id}") + + return dataset_id + + except Exception as e: + logger.error(f"Error creating raw dataset: {e}") + raise e + + def ingest_new_derived_dataset( + self, + file_path: str = "", + raw_dataset_id: str = "", + ) -> str: + """ + Create a minimal derived dataset in SciCat for the given file, + linked to the provided raw dataset. + + Args: + file_path: Path to the file to ingest. + raw_dataset_id: ID of the parent raw dataset. + + Returns: + str: The SciCat ID of the created dataset. + """ + if not self.scicat_client: + logger.error("SciCat client not initialized. Call get_scicat_client first.") + raise ValueError("SciCat client not initialized. 
Call get_scicat_client first.") + + # Create minimal metadata for the dataset + from pyscicat.model import CreateDatasetOrigDatablockDto, DataFile, DerivedDataset + + filename = os.path.basename(file_path) + basename = os.path.splitext(filename)[0] + + logger.info(f"Creating derived dataset for {filename} from {raw_dataset_id}") + + try: + # Create a DerivedDataset object + derived_dataset = DerivedDataset( + owner="ingestor", + contactEmail="test@example.com", + creationLocation=f"/test/location/{basename}_derived", + sourceFolder="/test/source/folder", + datasetName=f"{basename}_derived", + type="derived", + proposalId="test-proposal", + description=f"Derived dataset from {raw_dataset_id}", + ownerGroup="admin", + accessGroups=["admin"], + creationTime=datetime.now().isoformat(), + investigator="test-investigator", + inputDatasets=[raw_dataset_id], + principalInvestigator="Test Investigator", + usedSoftware=["TestSoftware"] + ) + + # Upload the dataset to SciCat + dataset_id = self.scicat_client.upload_new_dataset(derived_dataset) + + logger.info(f"Created derived dataset with ID: {dataset_id}") + + # Add a dummy file to the datablock + dummy_file = DataFile( + path=file_path, + size=1024, # Dummy size + time=datetime.now().isoformat() + ) + + datablock = CreateDatasetOrigDatablockDto( + size=1024, # Dummy size + dataFileList=[dummy_file] + ) + + # Attach the datablock to the dataset + self.scicat_client.upload_dataset_origdatablock(dataset_id, datablock) + logger.info(f"Added datablock to dataset {dataset_id}") + + return dataset_id + + except Exception as e: + logger.error(f"Error creating derived dataset: {e}") + raise e + + # TODO: Add methods to add and remove dataset locations + + +# ---------------------------------------------------------------------------------------------------------------------- +# End-to-end Tests +# ---------------------------------------------------------------------------------------------------------------------- + + +def test_transfer_controllers( + file_path: str, + test_globus: bool, + test_filesystem: bool, + test_hpss: bool, + config: BeamlineConfig, +) -> None: + """ + Test the transfer controller by transferring a file to each endpoint. + + Args: + file_path (str): The path to the file to transfer. + test_globus (bool): Whether to test the Globus transfer controller. + test_filesystem (bool): Whether to test the FileSystem transfer controller. + test_hpss (bool): Whether to test the HPSS transfer controller. + config (BeamlineConfig): The beamline configuration. 
+ + Returns: + None + """ + + logger.info("Testing transfer controllers...") + logger.info(f"File path: {file_path}") + logger.info(f"Test Globus: {test_globus}") + logger.info(f"Test Filesystem: {test_filesystem}") + logger.info(f"Test HPSS: {test_hpss}") + + if test_globus: + # Create a transfer controller for Globus transfers + globus_transfer_controller = get_transfer_controller( + transfer_type=CopyMethod.GLOBUS, + config=config + ) + + # Configure the source and destination endpoints + # Use the NERSC alsdev endpoint with root_path: /global/homes/a/alsdev/test_directory/source/ as the source + source_endpoint = GlobusEndpoint( + uuid=config.nersc_alsdev.uuid, + uri=config.nersc_alsdev.uri, + root_path=config.nersc_alsdev.root_path + "source/", + name="source_endpoint" + ) + + # Use the NERSC alsdev endpoint with root_path: /global/homes/a/alsdev/test_directory/destination/ as the destination + destination_endpoint = GlobusEndpoint( + uuid=config.nersc_alsdev.uuid, + uri=config.nersc_alsdev.uri, + root_path=config.nersc_alsdev.root_path + "destination/", + name="destination_endpoint" + ) + + globus_transfer_controller.copy( + file_path=file_path, + source=source_endpoint, + destination=destination_endpoint, + ) + + if test_filesystem: + # Create a transfer controller for filesystem transfers + filesystem_transfer_controller = get_transfer_controller( + transfer_type=CopyMethod.SIMPLE, + config=config + ) + + # Configure the source and destination endpoints + + # Create temporary directories for testing in current working directory + base_test_dir = os.path.join(os.getcwd(), "orchestration_test_dir") + source_dir = os.path.join(base_test_dir, "source") + dest_dir = os.path.join(base_test_dir, "destination") + + # Create directories + os.makedirs(source_dir, exist_ok=True) + os.makedirs(dest_dir, exist_ok=True) + + # Create a test file in the source directory + test_file_path = os.path.join(source_dir, file_path) + with open(test_file_path, "w") as f: + f.write("This is a test file for SimpleTransferController") + + logger.info(f"Created test file at {test_file_path}") + + # Use the defined FileSystemEndpoint + source_endpoint = FileSystemEndpoint( + name="source_endpoint", + root_path=source_dir, + uri="source.test" + ) + destination_endpoint = FileSystemEndpoint( + name="destination_endpoint", + root_path=dest_dir, + uri="destination.test" + ) + + result = filesystem_transfer_controller.copy( + file_path=file_path, + source=source_endpoint, + destination=destination_endpoint, + ) + + # Verify the transfer + dest_file_path = os.path.join(dest_dir, file_path) + if os.path.exists(dest_file_path): + logger.info(f"File successfully transferred to {dest_file_path}") + else: + logger.error(f"Transfer failed: file not found at {dest_file_path}") + + assert result is True, "Transfer operation returned False" + assert os.path.exists(dest_file_path), "File wasn't copied to destination" + + if test_hpss: + from orchestration.flows.bl832.config import Config832 + + config = Config832() + project_name = "BLS-00564_dyparkinson" + source = FileSystemEndpoint( + name="CFS", + root_path="/global/cfs/cdirs/als/data_mover/8.3.2/raw/", + uri="nersc.gov" + ) + destination = HPSSEndpoint( + name="HPSS", + root_path=config.hpss_alsdev["root_path"], + uri=config.hpss_alsdev["uri"] + ) + success = cfs_to_hpss_flow( + file_path=project_name, + source=source, + destination=destination, + config=config + ) + logger.info(f"Transfer success: {success}") + config = Config832() + relative_file_path = 
f"{config.beamline_id}/raw/BLS-00520_dyparkinson/BLS-00520_dyparkinson_2022-2.tar" + source = HPSSEndpoint( + name="HPSS", + root_path=config.hpss_alsdev["root_path"], # root_path: /home/a/alsdev/data_mover + uri=config.hpss_alsdev["uri"] + ) + destination = FileSystemEndpoint( + name="CFS", + root_path="/global/cfs/cdirs/als/data_mover/8.3.2/retrieved_from_tape", + uri="nersc.gov" + ) + + files_to_extract = [ + "20221028_101514_arun_JSC-1.h5", + "20220923_160531_ethan_robin_climbing-vine_x00y05.h5", + "20221222_082548_strangpresse_20pCFABS_800rpm_Non-vacuum.h5", + "20220923_160531_ethan_robin_climbing-vine_x00y04.h5" + ] + + hpss_to_cfs_flow( + file_path=f"{relative_file_path}", + source=source, + destination=destination, + files_to_extract=files_to_extract, + config=config + ) + + +def test_prune_controllers( + file_path: str, + test_globus: bool, + test_filesystem: bool, + test_hpss: bool, + config: BeamlineConfig, +) -> None: + """ + Test the prune controllers by pruning files from each endpoint. + + Note: not pruning the source endpoint test.txt file, so it can be used in future tests. + + Args: + file_path (str): Path to the file to prune. + test_globus (bool): Whether to test the Globus pruner. + test_filesystem (bool): Whether to test the filesystem pruner. + test_hpss (bool): Whether to test the HPSS pruner. + config (BeamlineConfig): Configuration object for the beam + + Returns: + None + """ + logger.info("Testing prune controllers...") + logger.info(f"File path: {file_path}") + logger.info(f"Test Globus: {test_globus}") + logger.info(f"Test Filesystem: {test_filesystem}") + logger.info(f"Test HPSS: {test_hpss}") + + if test_globus: + globus_prune_controller = get_prune_controller( + prune_type=PruneMethod.GLOBUS, + config=config + ) + + # PRUNE FROM SOURCE ENDPOINT + # Configure the source and destination endpoints + # Use the NERSC alsdev endpoint with root_path: /global/homes/a/alsdev/test_directory/source/ as the source + source_endpoint = GlobusEndpoint( + uuid=config.nersc_alsdev.uuid, + uri=config.nersc_alsdev.uri, + root_path=config.nersc_alsdev.root_path + "source/", + name="source_endpoint" + ) + + # Prune the source endpoint + globus_prune_controller.prune( + file_path=file_path, + source_endpoint=source_endpoint, + check_endpoint=None, + days_from_now=0.0 + ) + + # PRUNE FROM DESTINATION ENDPOINT + # Use the NERSC alsdev endpoint with root_path: /global/homes/a/alsdev/test_directory/destination/ as the destination + destination_endpoint = GlobusEndpoint( + uuid=config.nersc_alsdev.uuid, + uri=config.nersc_alsdev.uri, + root_path=config.nersc_alsdev.root_path + "destination/", + name="destination_endpoint" + ) + + # Assume files were created and transferred in the previous test + # Prune the destination endpoint + globus_prune_controller.prune( + file_path=file_path, + source_endpoint=destination_endpoint, + check_endpoint=None, + days_from_now=0.0 + ) + + if test_filesystem: + filesystem_prune_controller = get_prune_controller( + prune_type=PruneMethod.SIMPLE, + config=config + ) + + # Configure the source and destination endpoints to match the TransferController test + # Create temporary directories for testing + base_test_dir = os.path.join(os.getcwd(), "orchestration_test_dir") + source_dir = os.path.join(base_test_dir, "source") + dest_dir = os.path.join(base_test_dir, "destination") + + # Use the defined FileSystemEndpoint + source_endpoint = FileSystemEndpoint( + name="source_endpoint", + root_path=source_dir, + uri="source.test" + ) + + destination_endpoint = 
FileSystemEndpoint( + name="destination_endpoint", + root_path=dest_dir, + uri="destination.test" + ) + + # Assume files were created and transferred in the previous test + + # Prune the source endpoint + filesystem_prune_controller.prune( + file_path=file_path, + source_endpoint=source_endpoint, + check_endpoint=None, + days_from_now=0.0 + ) + + # Prune the destination endpoint + filesystem_prune_controller.prune( + file_path=file_path, + source_endpoint=destination_endpoint, + check_endpoint=None, + days_from_now=0.0 + ) + + # After pruning in the filesystem pruner test + source_file_path = os.path.join(source_dir, file_path) + assert not os.path.exists(source_file_path), "File wasn't removed from source" + if not os.path.exists(source_file_path): + logger.info(f"File successfully deleted from source: {source_file_path}") + dest_file_path = os.path.join(dest_dir, file_path) + assert not os.path.exists(dest_file_path), "File wasn't removed from destination" + if not os.path.exists(dest_file_path): + logger.info(f"File successfully deleted from destination: {dest_file_path}") + + if test_hpss: + hpss_prune_controller = get_prune_controller( + prune_type=PruneMethod.HPSS, + config=config + ) + + hpss_prune_controller.prune() + # TODO: Finish this test + + +def test_scicat_ingest( + file_path: str = "test.txt" + +) -> None: + """ + Test the SciCat ingestor controller by ingesting a file. + """ + config = TestConfig() + test_ingestor = TestIngestorController(config) + + # Login to SciCat, assuming credentials are saved in environment variables + # If not, and you are testing with scicatlive, use these defaults: + # SCICAT_API_URL="http://localhost:3000/api/v3/" + # SCICAT_INGEST_USER="admin" + # SCICAT_INGEST_PASSWORD="2jf70TPNZsS" + + test_ingestor.get_scicat_client( + scicat_base_url=os.getenv("SCICAT_API_URL"), + scicat_user=os.getenv("SCICAT_INGEST_USER"), + scicat_password=os.getenv("SCICAT_INGEST_PASSWORD") + ) + + raw_id = test_ingestor.ingest_new_raw_dataset( + file_path=file_path + ) + + test_ingestor.ingest_new_derived_dataset( + file_path=file_path, + raw_dataset_id=raw_id + ) + + test_ingestor.add_new_dataset_location( + dataset_id=raw_id, + source_folder="test_folder", + source_folder_host="test_host" + ) + + # This will probably fail. Need to figure out default scicatlive user permissions. + test_ingestor.remove_dataset_location( + dataset_id=raw_id, + source_folder_host="test_host" + ) + + +def test_it_all( + test_globus: bool = True, + test_filesystem: bool = False, + test_hpss: bool = False, + test_scicat: bool = False +) -> None: + """ + Run end-to-end tests for transfer and prune controllers." 
+ """ + try: + check_required_envvars() + except Exception as e: + logger.error(f"Error checking environment variables: {e}") + return + finally: + logger.info("Continuing with tests...") + + config = TestConfig() + + try: + test_transfer_controllers( + file_path="test.txt", + test_globus=test_globus, + test_filesystem=test_filesystem, + test_hpss=test_hpss, + config=config + ) + logger.info("Transfer controller tests passed.") + except Exception as e: + logger.error(f"Error running transfer controller tests: {e}") + return + + if test_scicat: + try: + test_scicat_ingest( + file_path="test.txt" + ) + except Exception as e: + logger.error(f"Error running SciCat ingestor tests: {e}") + return + + try: + test_prune_controllers( + file_path="test.txt", + test_globus=test_globus, + test_filesystem=test_filesystem, + test_hpss=test_hpss, + config=config + ) + logger.info("Prune controller tests passed.") + except Exception as e: + logger.error(f"Error running prune controller tests: {e}") + return + + logger.info("All tests passed. Cleaning up...") + base_test_dir = os.path.join(os.getcwd(), "orchestration_test_dir") + if os.path.exists(base_test_dir): + shutil.rmtree(base_test_dir) + + +if __name__ == "__main__": + + # Uncomment the following line to run all tests + # Set test_globus, test_filesystem, test_hpss, and test_scicat to True or False as needed + + # test_it_all( + # test_globus=False, + # test_filesystem=False, + # test_hpss=False, + # test_scicat=False + # ) + check_required_envvars() + + # Test individual transfer controllers directly + test_transfer_controllers( + file_path="test.txt", + test_globus=False, + test_filesystem=False, + test_hpss=False, + config=TestConfig() + )
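
Beyond the patch itself, a short usage sketch may help reviewers exercise the refactored transfer pieces locally. This is a non-authoritative sketch, not part of the change: it assumes a working `.env` and `config.yml` (including the new `hpss_alsdev` block) so that `Config832()` can initialize, and it uses only names introduced or touched in this patch (`get_transfer_controller`, `CopyMethod`, `FileSystemEndpoint`, `HPSSEndpoint`); the `/tmp/...` paths and file names are placeholders. The `CFS_TO_HPSS`/`HPSS_TO_CFS` controllers need an SFAPI client and NERSC access, so they are left to `scripts/check_hpss.py` and are not repeated here.

```python
"""Hypothetical smoke test for the refactored transfer controllers (paths are placeholders)."""
import logging
import os

from orchestration.flows.bl832.config import Config832
from orchestration.transfer_controller import CopyMethod, get_transfer_controller
from orchestration.transfer_endpoints import FileSystemEndpoint, HPSSEndpoint

logging.basicConfig(level=logging.INFO)

config = Config832()  # requires Globus/Prefect credentials from .env and config.yml

# 1) Local filesystem copy via the SIMPLE controller.
src_dir, dst_dir = "/tmp/copy_test/source", "/tmp/copy_test/destination"
os.makedirs(src_dir, exist_ok=True)
with open(os.path.join(src_dir, "test.txt"), "w") as f:
    f.write("hello")

simple = get_transfer_controller(transfer_type=CopyMethod.SIMPLE, config=config)
ok = simple.copy(
    file_path="test.txt",
    source=FileSystemEndpoint(name="source", root_path=src_dir, uri="localhost"),
    destination=FileSystemEndpoint(name="destination", root_path=dst_dir, uri="localhost"),
)
print(f"simple copy succeeded: {ok}")

# 2) Building an absolute HPSS path from the new hpss_alsdev config block.
hpss = HPSSEndpoint(
    name="HPSS",
    root_path=config.hpss_alsdev["root_path"],
    uri=config.hpss_alsdev["uri"],
)
print(hpss.full_path("8.3.2/raw/some_project/some_project_2023-1.tar"))
```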
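One detail of the new SLURM/bash templates that can trip up reviewers is the brace escaping: placeholders such as `{relative_path}` and `{hpss_path}` are substituted on the Python side before job submission, while shell parameter expansions are written as `${{VAR}}` so they survive rendering as `${VAR}`. The rendering code in `orchestration/hpss.py` is not shown in this diff, so the f-string mechanism below is an assumption (the embedded `{source_endpoint.full_path(relative_path)}` expression suggests f-strings rather than `str.format`); the snippet only illustrates the escaping behaviour with made-up values.

```python
# Minimal, self-contained illustration of the brace escaping used in the HPSS script templates.
# The real templates live in orchestration/slurm/*.slurm; this stand-alone string is illustrative.
relative_path = "BLS-00000_example/BLS-00000_example_2023-1.tar"  # placeholder project path

script = f"""#!/bin/bash
set -euo pipefail

# The {{relative_path}} placeholder below is filled in by Python before the job is submitted:
TARGET="{relative_path}"

# Doubled braces are NOT substituted by Python; they render as ordinary shell syntax:
if [ -n "${{TARGET}}" ]; then
    echo "[LOG] Would prune: $TARGET"
fi
"""
print(script)
```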