diff --git a/.github/workflows/apptainer-build-image.yml b/.github/workflows/apptainer-build-image.yml
new file mode 100644
index 0000000..ea6d63d
--- /dev/null
+++ b/.github/workflows/apptainer-build-image.yml
@@ -0,0 +1,163 @@
+name: build-image
+
+on:
+  workflow_call:
+    inputs:
+      image-name:
+        required: true
+        type: string
+      build-context:
+        required: true
+        type: string
+      dockerfile:
+        required: true
+        type: string
+      r-version:
+        required: true
+        type: string
+      parent-image:
+        required: false
+        default: ''
+        type: string
+      model-version:
+        required: false
+        default: ''
+        type: string
+      image-version:
+        required: false
+        default: "latest"
+        type: string
+      dockerhub-repo:
+        required: false
+        default: "hdpriest0uiuc"
+        type: string
+      platforms:
+        required: false
+        default: "linux/amd64"
+        type: string
+    secrets:
+      DOCKERHUB_USERNAME:
+        description: 'DockerHub username used to push images'
+        required: false
+      DOCKERHUB_PASSWORD:
+        description: 'DockerHub password used to push images'
+        required: false
+
+env:
+  DEFAULT_R_VERSION: "4.4"
+  GITHUB_PAT: ${{ secrets.GH_TOKEN }}
+
+jobs:
+  build:
+    runs-on: ubuntu-24.04
+    permissions:
+      packages: write
+
+    steps:
+
+      - name: lowercase image name
+        id: name
+        run: |
+          echo "image_name=$(echo ${{ inputs.image-name }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT
+          echo "repository=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT
+
+      - name: set PARENT_IMAGE only if specified
+        id: parent
+        shell: bash
+        run: |
+          echo "PARENT_IMAGE_IF_SET=$(
+            [[ -n '${{ inputs.parent-image }}' ]] &&
+            echo "PARENT_IMAGE=ghcr.io/${{ steps.name.outputs.repository }}/"'${{ inputs.parent-image }}'
+          )" >> $GITHUB_OUTPUT
+
+      - name: set MODEL_VERSION only if specified
+        id: modelver
+        shell: bash
+        run: |
+          echo "MODEL_VERSION_IF_SET=$(
+            [[ -n '${{ inputs.model-version }}' ]] &&
+            echo 'MODEL_VERSION=${{ inputs.model-version }}'
+          )" >> $GITHUB_OUTPUT
+
+      - uses: actions/checkout@v4
+
+      # create metadata for image
+      - name: Docker meta
+        env:
+          check_var: ${{ secrets.DOCKERHUB_USERNAME }}
+          is_default_R: ${{ inputs.r-version == env.DEFAULT_R_VERSION }}
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          # list of Docker images to use as base name for tags
+          images: |
+            name=ghcr.io/${{ steps.name.outputs.repository }}/${{ steps.name.outputs.image_name }}
+            name=${{ inputs.dockerhub-repo }}/${{ steps.name.outputs.image_name }},enable=${{ env.check_var != null }}
+          # generate Docker tags based on the following events/attributes
+          tags: |
+            type=raw,value=${{ inputs.image-version }}
+            # type=schedule
+            # type=ref,event=branch,enable=${{ env.is_default_R }}
+            # type=ref,event=branch,suffix=-R${{ inputs.r-version }}
+            # type=ref,event=pr
+            # type=semver,pattern={{version}},enable=${{ env.is_default_R }}
+            # type=semver,pattern={{major}}.{{minor}},enable=${{ env.is_default_R }}
+            # type=semver,pattern={{major}},enable=${{ env.is_default_R }}
+            # type=semver,pattern={{version}},suffix=-R${{ inputs.r-version }}
+
+      # setup docker build
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Inspect Builder
+        run: |
+          echo "Name: ${{ steps.buildx.outputs.name }}"
+          echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}"
+          echo "Status: ${{ steps.buildx.outputs.status }}"
+          echo "Flags: ${{ steps.buildx.outputs.flags }}"
+          echo "Platforms: ${{ steps.buildx.outputs.platforms }}"
+
+      # login to registries
+      - name: Login to DockerHub
+        env:
+          check_var: ${{ secrets.DOCKERHUB_USERNAME }}
+        if: env.check_var != null
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # build the docker images
+      - name: Build and push ${{ steps.name.outputs.image_name }}
+        uses: docker/build-push-action@v6
+        with:
+          context: ${{ inputs.build-context }}
+          file: ${{ inputs.dockerfile }}
+          push: true
+          platforms: ${{ inputs.platforms }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          build-args: |
+            VERSION=${{ steps.meta.outputs.version }}
+            IMAGE_VERSION=${{ steps.meta.outputs.version }}
+            PECAN_VERSION=${{ steps.meta.outputs.version }}
+            R_VERSION=${{ inputs.r-version }}
+            ${{ steps.parent.outputs.PARENT_IMAGE_IF_SET }}
+            ${{ steps.modelver.outputs.MODEL_VERSION_IF_SET }}
+            GITHUB_PAT=${{ secrets.GITHUB_TOKEN }}
+            PECAN_GIT_BRANCH=${{ github.head_ref || github.ref_name }}
+            PECAN_GIT_CHECKSUM=${{ github.sha }}
+            PECAN_GIT_DATE=${{ github.event.repository.updated_at }}
diff --git a/.github/workflows/apptainer-sipnet-carb.yml b/.github/workflows/apptainer-sipnet-carb.yml
new file mode 100644
index 0000000..3efee34
--- /dev/null
+++ b/.github/workflows/apptainer-sipnet-carb.yml
@@ -0,0 +1,73 @@
+name: Apptainer GHA CARB
+
+env:
+  DEFAULT_R_VERSION: 4.4
+  R_VERSION: 4.4
+  GITHUB_PAT: ${{ secrets.GH_TOKEN }}
+
+on:
+  push:
+    branches:
+      - main
+      - develop
+
+  pull_request:
+  merge_group:
+  workflow_dispatch:
+    inputs:
+      r_version:
+        description: 'R version to use'
+        required: true
+        type: choice
+        default: "4.4"  # workflow_dispatch defaults can't expand env vars; must be a literal option
+        options:
+          - 4.1
+          - 4.2
+          - 4.3
+          - 4.4
+          - devel
+      image_version:
+        description: 'version of sipnet container to use'
+        required: true
+        type: choice
+        default: "latest"
+        options:
+          - develop
+          - latest
+
+jobs:
+  # ----------------------------------------------------------------------
+  # Set R version.
+  # This is a hack: We really just want a global env var here, but it seems
+  # `env:` values can't be passed into a `jobs.<job_id>.with` context
+  # (see https://github.com/actions/runner/issues/2372).
+  # As an ugly workaround, we assign it to a job output instead.
+  # ----------------------------------------------------------------------
+  rversion:
+    runs-on: ubuntu-latest
+    steps:
+      - id: default
+        if: github.event_name != 'schedule'
+        run: echo "R_VERSION=4.4" >> "$GITHUB_OUTPUT"
+    outputs:
+      # Note: "steps.*" seems to mean "all step ids", not "all steps"
+      # If seeing weird results here, check that all steps above have an id set.
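+      # (Hypothetical alternative, shown for reference only: the hardcoded
+      # output below could instead point at the step, e.g.
+      #   R_VERSION: ${{ steps.default.outputs.R_VERSION }}
+      # so the step and the output cannot drift apart.)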
+ R_VERSION: 4.4 + +# ---------------------------------------------------------------------- +# Next are images that have specific layers added +# ---------------------------------------------------------------------- + sipnet-carb: + needs: [rversion] + uses: ./.github/workflows/apptainer-build-image.yml + with: + image-name: sipnet-carb + build-context: tools/apptainer-sipnet-carb + dockerfile: tools/apptainer-sipnet-carb/Dockerfile + r-version: ${{ needs.rversion.outputs.R_VERSION }} + parent-image: "pecan/model-sipnet-git" + image-version: ${{ inputs.image_version }} + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }} + diff --git a/.github/workflows/run-workflow-examples.yml b/.github/workflows/run-workflow-examples.yml new file mode 100644 index 0000000..ee304f6 --- /dev/null +++ b/.github/workflows/run-workflow-examples.yml @@ -0,0 +1,103 @@ +name: Run Workflow Examples + +env: + GITHUB_PAT: ${{ secrets.GH_TOKEN }} + +on: + push: + branches: + - main + - develop + paths: + - 'workflow_examples/**' + - '.github/workflows/run-workflow-examples.yml' + pull_request: + paths: + - 'workflow_examples/**' + - '.github/workflows/run-workflow-examples.yml' + workflow_dispatch: + inputs: + orchestration_version: + description: 'Orchestration XML version to use (devel or latest)' + required: true + type: choice + default: 'devel' + options: + - devel + - latest + +jobs: + # ---------------------------------------------------------------------- + # Workflow 01: Data Prep Workflow + # This is the first workflow that prepares the base data + # ---------------------------------------------------------------------- + workflow_01_data_prep: + runs-on: ['self-hosted', 'Linux', 'X64'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set orchestration XML version + id: orchestration + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "version=${{ github.event.inputs.orchestration_version }}" >> $GITHUB_OUTPUT + else + echo "version=devel" >> $GITHUB_OUTPUT + fi + + - name: Run 01_data_prep_workflow + working-directory: workflow_examples/01_simple_data_workflow + run: | + Rscript 01_data_prep_workflow.R -s ../../.github/workflows_resources/01_orchestration_${{ steps.orchestration.outputs.version }}.xml + + # ---------------------------------------------------------------------- + # Workflow 02: Data Reference Workflow + # This workflow references data from workflow 01 + # ---------------------------------------------------------------------- + workflow_02_data_reference: + needs: [workflow_01_data_prep] + runs-on: ['self-hosted', 'Linux', 'X64'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set orchestration XML version + id: orchestration + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "version=${{ github.event.inputs.orchestration_version }}" >> $GITHUB_OUTPUT + else + echo "version=devel" >> $GITHUB_OUTPUT + fi + + - name: Run 02_run_data_reference_workflow + working-directory: workflow_examples/02_referencing_data_workflow + run: | + Rscript 02_run_data_reference_workflow.R -s ../../.github/workflows_resources/02_orchestration_${{ steps.orchestration.outputs.version }}.xml + + # ---------------------------------------------------------------------- + # Workflow 03: Distributed Workflow + # This workflow runs the distributed analysis workflow + # 
---------------------------------------------------------------------- + workflow_03_distributed: + needs: [workflow_02_data_reference] + runs-on: ['self-hosted', 'Linux', 'X64'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set orchestration XML version + id: orchestration + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "version=${{ github.event.inputs.orchestration_version }}" >> $GITHUB_OUTPUT + else + echo "version=devel" >> $GITHUB_OUTPUT + fi + + - name: Run 03_run_distributed_workflow + working-directory: workflow_examples/03_distributed_workflow + run: | + Rscript 03_run_distributed_workflow.R -s ../../.github/workflows_resources/03_orchestration_${{ steps.orchestration.outputs.version }}.xml + diff --git a/.github/workflows_resources/01_orchestration_devel.xml b/.github/workflows_resources/01_orchestration_devel.xml new file mode 100644 index 0000000..8ba4be9 --- /dev/null +++ b/.github/workflows_resources/01_orchestration_devel.xml @@ -0,0 +1,19 @@ + + + + /project/60007/hpriest/data/workflow_runs_ci + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + diff --git a/.github/workflows_resources/01_pecan_config_devel.xml b/.github/workflows_resources/01_pecan_config_devel.xml new file mode 100644 index 0000000..da59ce5 --- /dev/null +++ b/.github/workflows_resources/01_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + 
IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + 
IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/.github/workflows_resources/02_orchestration_devel.xml b/.github/workflows_resources/02_orchestration_devel.xml new file mode 100644 index 0000000..ac755f1 --- /dev/null +++ b/.github/workflows_resources/02_orchestration_devel.xml @@ -0,0 +1,30 @@ + + + + /project/60007/hpriest/data/workflow_runs_ci + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_devel.sif + + ./02_pecan_config_devel.xml + + + diff --git a/.github/workflows_resources/02_pecan_config_devel.xml b/.github/workflows_resources/02_pecan_config_devel.xml new file mode 100644 index 0000000..da59ce5 --- /dev/null +++ b/.github/workflows_resources/02_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + 
IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + 
IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/.github/workflows_resources/03_orchestration_devel.xml b/.github/workflows_resources/03_orchestration_devel.xml new file mode 100644 index 0000000..dbc9339 --- /dev/null +++ b/.github/workflows_resources/03_orchestration_devel.xml @@ -0,0 +1,39 @@ + + + + /project/60007/hpriest/data/workflow_runs_ci + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_devel.sif + + ./02_pecan_config_devel.xml + + + analysis_run_identifier_03_sourcing + ./03_pecan_config_devel.xml + data_prep_run_01 + data_reference_run_02 + + sipnet-carb_develop.sif + + + + diff --git a/.github/workflows_resources/03_pecan_config_devel.xml b/.github/workflows_resources/03_pecan_config_devel.xml new file mode 100644 index 0000000..5804d53 --- /dev/null +++ b/.github/workflows_resources/03_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + 
IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + 
IC_files/losthills/IC_site_losthills_93.nc
+ IC_files/losthills/IC_site_losthills_94.nc
+ IC_files/losthills/IC_site_losthills_95.nc
+ IC_files/losthills/IC_site_losthills_96.nc
+ IC_files/losthills/IC_site_losthills_97.nc
+ IC_files/losthills/IC_site_losthills_98.nc
+ IC_files/losthills/IC_site_losthills_99.nc
+ IC_files/losthills/IC_site_losthills_100.nc
+
+
+
+ 1999/01/01
+ 2012/12/31
+
+
+ localhost
+ sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif
+ Submitted batch job ([0-9]+)
+
+ if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi
+ output/out
+ output/run
+
diff --git a/.gitignore b/.gitignore
index 0558174..0f814ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 # Analysis outcomes
 **/output/**
 
+# Workflow runs
+**/workflow_runs/**
+
 # container files
 **/**.sif
 
@@ -43,3 +46,4 @@ Thumbs.db
 # Temporary files
 *.tmp
 *.log
+**/data_raw/**
diff --git a/2a_grass/01_ERA5_nc_to_clim_adapter.R b/2a_grass/01_ERA5_nc_to_clim_adapter.R
new file mode 100755
index 0000000..7b96e33
--- /dev/null
+++ b/2a_grass/01_ERA5_nc_to_clim_adapter.R
@@ -0,0 +1,82 @@
+#!/usr/bin/env Rscript
+
+# Standalone command-line adapter for the convert_era5_nc_to_clim function from workflow_functions.R
+# This script sources workflow_functions.R, parses command-line arguments using the
+# exact same argument parsing as the original 01_ERA5_nc_to_clim.R, and builds the
+# required data structures to pass into the workflow_functions.R version.
+
+# Source the workflow functions
+source("../tools/workflow_functions.R")
+
+# Argument parsing section (exact copy from 01_ERA5_nc_to_clim.R)
+options <- list(
+  optparse::make_option("--site_era5_path",
+    default = "data_raw/ERA5_nc",
+    help = paste(
+      "Path to your existing ERA5 data in PEcAn CF format, organized as",
+      "single-site, single-year netcdfs in subdirectories per ensemble member.",
+      "Files should be named",
+      "'<site_era5_path>/ERA5_<siteid>_<ens>/ERA5.<ens>.<year>.nc'"
+    )
+  ),
+  optparse::make_option("--site_sipnet_met_path",
+    default = "data/ERA5_SIPNET",
+    help = paste(
+      "Output path:",
+      "single-site, multi-year Sipnet clim files, one per ensemble member.",
+      "Files will be named",
+      "'<site_sipnet_met_path>/<siteid>/ERA5.<ens>.<start_date>.<end_date>.clim'"
+    )
+  ),
+  optparse::make_option("--site_info_file",
+    default = "site_info.csv",
+    help = "CSV file with one row per location. Only the `id` column is used"
+  ),
+  optparse::make_option("--start_date",
+    default = "2016-01-01",
+    help = "Date to begin clim file"
+  ),
+  optparse::make_option("--end_date",
+    default = "2023-12-31",
+    help = "Date to end clim file"
+  ),
+  optparse::make_option("--n_cores",
+    default = 1L,
+    help = "number of CPUs to use in parallel"
+  ),
+  optparse::make_option("--parallel_strategy",
+    default = "multisession",
+    help = "Strategy for parallel conversion, passed to future::plan()"
+  )
+) |>
+  # Show default values in help message
+  purrr::modify(\(x) {
+    x@help <- paste(x@help, "[default: %default]")
+    x
+  })
+
+args <- optparse::OptionParser(option_list = options) |>
+  optparse::parse_args()
+
+## ---------------------------------------------------------
+# Build the site_combinations data frame using the helper function from workflow_functions.R.
+# This replicates the logic from the original script, which does:
+#   site_info |> dplyr::rename(site_id = id) |> dplyr::cross_join(data.frame(ens_id = 1:10))
+# The original script hardcodes ensemble members 1:10, so we do the same here.
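+# (A minimal sketch of the equivalent construction, assuming the helper
+# mirrors the original script's logic; build_era5_site_combinations() itself
+# lives in workflow_functions.R and also takes the date range:
+#   site_info <- utils::read.csv(args$site_info_file)
+#   site_combinations <- site_info |>
+#     dplyr::rename(site_id = id) |>
+#     dplyr::cross_join(data.frame(ens_id = 1:10))
+# )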
+ +site_combinations <- build_era5_site_combinations( + site_info_file = args$site_info_file, + start_date = args$start_date, + end_date = args$end_date, + ensemble_members = 1:10 +) + +## --------------------------------------------------------- +# Call the convert_era5_nc_to_clim function from workflow_functions.R +convert_era5_nc_to_clim( + site_combinations = site_combinations, + site_era5_path = args$site_era5_path, + site_sipnet_met_path = args$site_sipnet_met_path, + n_workers = as.integer(args$n_cores) +) + diff --git a/2a_grass/02_ic_build_adapter.R b/2a_grass/02_ic_build_adapter.R new file mode 100755 index 0000000..ea0f41c --- /dev/null +++ b/2a_grass/02_ic_build_adapter.R @@ -0,0 +1,127 @@ +#!/usr/bin/env Rscript + +# Standalone command-line adapter for build_ic_files function from workflow_functions.R +# This script sources workflow_functions.R, parses command-line arguments using the +# exact same argument parsing as the original 02_ic_build.R, and builds an in-memory +# XML structure to pass into the workflow_functions.R version of build_ic_files. + +# Source the workflow functions +source("../tools/workflow_functions.R") + +# Argument parsing section (exact copy from 02_ic_build.R) +options <- list( + optparse::make_option("--site_info_path", + default = "site_info.csv", + help = "CSV giving ids, locations, and PFTs for sites of interest" + ), + optparse::make_option("--field_shape_path", + default = "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb", + help = "file containing site geometries, used for extraction from rasters" + ), + optparse::make_option("--ic_ensemble_size", + default = 100, + help = "number of files to generate for each site" + ), + optparse::make_option("--run_start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "For now, start date must be same for all sites,", + "and some download/extraction functions rely on this.", + "Workaround: Call this script separately for sites whose dates differ" + ) + ), + optparse::make_option("--run_LAI_date", + default = "2016-07-01", + help = "Date to look near (up to 30 days each direction) for initial LAI" + ), + optparse::make_option("--ic_outdir", + default = "IC_files", + help = "Directory to write completed initial conditions as nc files" + ), + optparse::make_option("--data_dir", + default = "data/IC_prep", + help = "Directory to store data retrieved/computed in the IC build process" + ), + optparse::make_option("--pft_dir", + default = "pfts", + help = paste( + "path to parameter distributions used for PFT-specific conversions", + "from LAI to estimated leaf carbon.", + "Must be path to a dir whose child subdirectory names match the", + "`site.pft` column of site_info and that contain a file", + "`post.distns.Rdata`" + ) + ), + optparse::make_option("--params_read_from_pft", + default = "SLA,leafC", # SLA units are m2/kg, leafC units are % + help = "Parameters to read from the PFT file, comma separated" + ), + optparse::make_option("--landtrendr_raw_files", + default = paste0( + "data_raw/ca_biomassfiaald_2016_median.tif,", + "data_raw/ca_biomassfiaald_2016_stdv.tif" + ), + help = paste( + "Paths to two geotiffs, with a comma between them.", + "These should contain means and standard deviations of aboveground", + "biomass on the start date.", + "We used Landtrendr-based values from the Kennedy group at Oregon State,", + "which require manual download.", + "Medians are available by anonymous FTP at islay.ceoas.oregonstate.edu", + "and by web (but possibly this is a different version?) 
from", + "https://emapr.ceoas.oregonstate.edu/pages/data/viz/index.html", + "The uncertainty layer was formerly distributed by FTP but I cannot find", + "it on the ceoas server at the moment.", + "TODO find out whether this is available from a supported source.", + "", + "Demo used a subset (year 2016 clipped to the CA state boundaries)", + "of the 30-m CONUS median and stdev maps that are stored on the Dietze", + "lab server" + ) + ), + optparse::make_option("--additional_params", + # Wood C fraction isn't in these PFTs, so just using my estimate. + # TODO update from a citeable source, + # and consider adding to PFT when calibrating + default = + "varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005", + help = paste( + "Further params not available from site or PFT data,", + "as a comma-separated named list with names `varname`, `distn`,", + "`parama`, and `paramb`. Currently used only for `wood_carbon_fraction`" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + +## --------------------------------------------------------- +# Build in-memory XML structure to pass to build_ic_files +# This mimics the structure that would come from parsing workflow.create.clim.files +# section of the orchestration XML + +orchestration_xml <- list( + site.info.file = args$site_info_path, + field.shape.path = args$field_shape_path, + ic.ensemble.size = as.character(args$ic_ensemble_size), + start.date = args$run_start_date, + run_LAI.date = args$run_LAI_date, + ic.outdir = args$ic_outdir, + data.dir = args$data_dir, + pft.dir = args$pft_dir, + params.from.pft = args$params_read_from_pft, + landtrendr.raw.files = args$landtrendr_raw_files, + additional.params = args$additional_params +) + +## --------------------------------------------------------- +# Call the build_ic_files function from workflow_functions.R +build_ic_files(orchestration_xml = orchestration_xml) + diff --git a/2a_grass/03_xml_build_adapter.R b/2a_grass/03_xml_build_adapter.R new file mode 100755 index 0000000..3c4e503 --- /dev/null +++ b/2a_grass/03_xml_build_adapter.R @@ -0,0 +1,96 @@ +#!/usr/bin/env Rscript + +# Standalone command-line adapter for build_pecan_xml function from workflow_functions.R +# This script sources workflow_functions.R, parses command-line arguments using the +# exact same argument parsing as the original 03_xml_build.R, and builds an in-memory +# XML structure to pass into the workflow_functions.R version. 
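+#
+# Example invocation (illustrative paths; every flag falls back to the
+# defaults declared below):
+#   Rscript 03_xml_build_adapter.R --n_ens 20 --site_file site_info.csv \
+#     --template_file template.xml --output_file settings.xml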
+ +# Source the workflow functions +source("../tools/workflow_functions.R") + +# Argument parsing section (exact copy from 03_xml_build.R) +options <- list( + optparse::make_option("--n_ens", + default = 20, + help = "number of ensemble simulations per site" + ), + optparse::make_option("--n_met", + default = 10, + help = "number of met files available (ensemble will sample from all)" + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "Ensure your IC files are valid for this date" + ) + ), + optparse::make_option("--end_date", + default = "2024-12-31", + help = "Date to end simulations" + ), + optparse::make_option("--ic_dir", + default = "IC_files", + help = paste( + "Directory containing initial conditions.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--met_dir", + default = "data/ERA5_CA_SIPNET", + help = paste( + "Directory containing climate data.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--site_file", + default = "site_info.csv", + help = paste( + "CSV file containing one row for each site to be simulated.", + "Must contain at least columns `id`, `lat`, `lon`, and `site.pft`" + ) + ), + optparse::make_option("--template_file", + default = "template.xml", + help = paste( + "XML file containing whole-run settings,", + "Will be expanded to contain all sites at requested ensemble size" + ) + ), + optparse::make_option("--output_file", + default = "settings.xml", + help = "path to write output XML" + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + +## --------------------------------------------------------- +# Build in-memory XML structure to pass to build_pecan_xml +# This mimics the structure that would come from parsing workflow.build.xml +# section of the orchestration XML + +orchestration_xml <- list( + site.info.file = args$site_file, + n.ens = as.character(args$n_ens), + n.met = as.character(args$n_met), + start.date = args$start_date, + end.date = args$end_date, + ic.dir = args$ic_dir, + met.dir = args$met_dir, + output.xml = args$output_file +) + +## --------------------------------------------------------- +# Call the build_pecan_xml function from workflow_functions.R +build_pecan_xml( + orchestration_xml = orchestration_xml, + template_file = args$template_file +) + diff --git a/orchestration/01_get_base_data.R b/orchestration/01_get_base_data.R new file mode 100644 index 0000000..70ec13b --- /dev/null +++ b/orchestration/01_get_base_data.R @@ -0,0 +1,123 @@ +library(targets) +library(tarchetypes) +library(XML) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow configuration XML" + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + optparse::parse_args(parser) +} + +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("An Orchestration settings XML must be provided via --settings.") +} + +this_workflow_name <- "workflow.get.base.data" + +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) + +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) 
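+# (For orientation: XML::xmlToList() returns a nested named list, so
+# orchestration keys are addressed with `$` paths, e.g. the two used in
+# this file:
+#   settings$orchestration$functions.source
+#   settings$orchestration$workflow.base.run.directory
+# )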
+source(workflow_function_source) + +ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=this_workflow_name) + + + +analysis_run_directory = ret_obj$run_dir +run_id = ret_obj$run_id + +dir.create(paste0(analysis_run_directory,"/data_raw"), recursive = TRUE) + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) + +setwd(analysis_run_directory) +tar_config_set(store = "./") +tar_script_path <- file.path("./executed_pipeline.R") + +tar_script({ + library(targets) + library(tarchetypes) + library(XML) + + function_sourcefile = "@FUNCTIONPATH@" + workflow_name = "@WORKFLOWNAME@" + tar_source(function_sourcefile) + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + + check_orchestration_keys(orchestration_xml = orchestration_settings$orchestration, key_list = c(workflow_name, "workflow.base.run.directory")) + + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + + check_orchestration_keys(orchestration_xml = workflow_settings, key_list = c("apptainer", "ccmmf.s3.artifact.01.url", "ccmmf.s3.artifact.01.filename", "ccmmf.s3.artifact.02.url", "ccmmf.s3.artifact.02.filename")) + check_orchestration_keys(orchestration_xml = workflow_settings$apptainer, key_list = c("remote.url", "container.name", "tag", "sif")) + + + apptainer_url = workflow_settings$apptainer$remote.url + apptainer_name = workflow_settings$apptainer$container.name + apptainer_tag = workflow_settings$apptainer$tag + apptainer_sif = workflow_settings$apptainer$sif + + artifact1_url <- workflow_settings$ccmmf.s3.artifact.01.url + artifact1_filename <- workflow_settings$ccmmf.s3.artifact.01.filename + artifact2_url <- workflow_settings$ccmmf.s3.artifact.02.url + artifact2_filename <- workflow_settings$ccmmf.s3.artifact.02.filename + median_tif_url <- workflow_settings$ccmmf.s3.median_tif.url + median_tif_filename <- workflow_settings$ccmmf.s3.median_tif.filename + stdv_tif_filename <- workflow_settings$ccmmf.s3.stdv_tif.filename + + tar_option_set(packages = character(0)) + + list( + # tar_target( + # ccmmf_artifact_01_file, + # download_ccmmf_data(prefix_url = artifact1_url, local_path = tar_path_store(), prefix_filename = artifact1_filename) + # ), + # tar_target( + # ccmmf_artifact_01_contents, + # untar(ccmmf_artifact_01_file, exdir = tar_path_store()) + # ), + tar_target( + ccmmf_artifact_02_file, + download_ccmmf_data(prefix_url = artifact2_url,local_path = tar_path_store(),prefix_filename = artifact2_filename) + ), + tar_target( + ccmmf_artifact_02_contents, + untar(ccmmf_artifact_02_file, exdir = tar_path_store()) + ), + tar_target( + ccmmf_median_tif_file, + download_ccmmf_data(prefix_url = median_tif_url, local_path = paste0(tar_path_store(),"data_raw/"), prefix_filename = median_tif_filename) + ), + tar_target( + ccmmf_stdv_tif_file, + download_ccmmf_data(prefix_url = median_tif_url, local_path = paste0(tar_path_store(),"data_raw/"), prefix_filename = stdv_tif_filename) + ), + tar_target( + apptainer_reference, + pull_apptainer_container(apptainer_url_base=apptainer_url, apptainer_image_name=apptainer_name, apptainer_tag=apptainer_tag, apptainer_disk_sif=apptainer_sif) + ) + ) +}, ask = FALSE, script = tar_script_path) + +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", 
settings_path, script_content, fixed = TRUE)
+script_content <- gsub("@WORKFLOWNAME@", this_workflow_name, script_content, fixed = TRUE)
+
+writeLines(script_content, tar_script_path)
+
+tar_make(script = tar_script_path)
+
diff --git a/orchestration/02_create_clim_files_dist.R b/orchestration/02_create_clim_files_dist.R
new file mode 100644
index 0000000..a14b46f
--- /dev/null
+++ b/orchestration/02_create_clim_files_dist.R
@@ -0,0 +1,122 @@
+library(targets)
+library(tarchetypes)
+library(XML)
+
+get_workflow_args <- function() {
+  option_list <- list(
+    optparse::make_option(
+      c("-s", "--settings"),
+      default = NULL,
+      type = "character",
+      help = "Workflow configuration XML"
+    )
+  )
+
+  parser <- optparse::OptionParser(option_list = option_list)
+  optparse::parse_args(parser)
+}
+
+args <- get_workflow_args()
+
+if (is.null(args$settings)) {
+  stop("An Orchestration settings XML must be provided via --settings.")
+}
+
+workflow_name = "workflow.create.clim.files"
+
+settings_path = normalizePath(file.path(args$settings))
+settings = XML::xmlToList(XML::xmlParse(args$settings))
+
+workflow_function_source = file.path(settings$orchestration$functions.source)
+workflow_function_path = normalizePath(workflow_function_source)
+source(workflow_function_source)
+
+# hopefully we can find a more elegant way to do this
+pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path))
+
+ret_obj <- workflow_run_directory_setup(orchestration_settings = settings, workflow_name = workflow_name)
+
+analysis_run_directory = ret_obj$run_dir
+run_id = ret_obj$run_id
+
+message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory))
+
+setwd(analysis_run_directory)
+tar_config_set(store = "./")
+tar_script_path <- file.path("./executed_pipeline.R")
+
+tar_script({
+  library(targets)
+  library(tarchetypes)
+  library(uuid)
+  library(XML)
+
+  function_sourcefile = "@FUNCTIONPATH@"
+  tar_source(function_sourcefile)
+
+  orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@")
+  pecan_xml_path = "@PECANXMLPATH@"
+  workflow_name = "@WORKFLOWNAME@"
+  workflow_settings = orchestration_settings$orchestration[[workflow_name]]
+  base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory
+  if (is.null(workflow_settings)) {
+    stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", workflow_name))
+  }
+
+  site_era5_path <- normalizePath(workflow_settings$site.era5.path, mustWork = FALSE)
+  site_sipnet_met_path <- normalizePath(workflow_settings$site.sipnet.met.path, mustWork = FALSE)
+  site_info_filename = workflow_settings$site.info.file
+  start_date <- workflow_settings$start.date
+  end_date <- workflow_settings$end.date
+  num_cores <- workflow_settings$n.workers
+  parallel_strategy <- workflow_settings$parallel.strategy
+  data_download_directory = file.path(base_workflow_directory, workflow_settings$data.download.reference)
+  apptainer_sif = workflow_settings$apptainer$sif
+  ensemble_literal <- sprintf(
+    "c(%s)",
+    paste(sprintf("%sL", seq_len(10)), collapse = ", ")
+  )
+  tar_option_set(
+    packages = c()
+  )
+
+  list(
+    tar_target(pecan_xml_file, pecan_xml_path, format = "file"),
+    tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)),
+
+    step__resolve_data_routing(
+      workflow_data_source_directory = data_download_directory,
+      target_artifact_names = c("reference_era5_path", "data_raw", "site_info_file", "pfts", "data"),
+      external_name_list =
c("data_raw/ERA5_nc", "data_raw", site_info_filename, "pfts", "data"), + localized_name_list = c("ERA5_nc", "data_raw", "site_info.csv", "pfts", "data"), + action_list = c("reference","reference","reference","reference", "copy") + ), + step__resolve_apptainer(apptainer_source_directory=data_download_directory, workflow_xml=workflow_settings), + + step__create_clim_files( + pecan_settings=quote(pecan_settings), + container=quote(apptainer_reference), + workflow_settings=workflow_settings, + reference_path = quote(reference_era5_path), + data_raw = quote(data_raw), + site_info = quote(site_info_file), + dependencies = c("pecan_settings", "apptainer_reference", "site_info_file", "reference_era5_path", "data_raw", "data") + ), + step__build_ic_files( + workflow_settings = workflow_settings, + orchestration_settings = orchestration_settings, + container = quote(apptainer_reference), + dependencies = c("era5_clim_conversion", "apptainer_reference") + ) + ) +}, ask = FALSE, script = tar_script_path) + +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) +writeLines(script_content, tar_script_path) + +tar_make(script = tar_script_path) + diff --git a/orchestration/03_build_xml_and_run.R b/orchestration/03_build_xml_and_run.R new file mode 100644 index 0000000..fda436d --- /dev/null +++ b/orchestration/03_build_xml_and_run.R @@ -0,0 +1,128 @@ +library(targets) +library(tarchetypes) +library(XML) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow configuration XML" + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + optparse::parse_args(parser) +} + +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("An Orchestration settings XML must be provided via --settings.") +} + +workflow_name = "workflow.build.xml" + +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) + +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) +source(workflow_function_source) + +# hopefully can find a more elegant way to do this +pecan_template_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.template)) +pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) + +ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) + +analysis_run_directory = ret_obj$run_dir +run_id = ret_obj$run_id + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) + +setwd(analysis_run_directory) +tar_config_set(store = "./") +tar_script_path <- file.path("./executed_pipeline.R") + +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + library(XML) + + function_sourcefile = "@FUNCTIONPATH@" + tar_source(function_sourcefile) + + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + pecan_template_path = "@PECANTEMPLATEPATH@" + pecan_xml_path = "@PECANXMLPATH@" + workflow_name = "@WORKFLOWNAME@" + 
+  workflow_settings = orchestration_settings$orchestration[[workflow_name]]
+  base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory
+  if (is.null(workflow_settings)) {
+    stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", workflow_name))
+  }
+
+  site_info_filename = workflow_settings$site.info.file
+  start_date <- workflow_settings$start.date
+  end_date <- workflow_settings$end.date
+  data_download_directory = normalizePath(file.path(base_workflow_directory, workflow_settings$data.download.reference))
+  clim_data_directory = normalizePath(file.path(base_workflow_directory, workflow_settings$data.clim.reference))
+
+  check_orchestration_keys(orchestration_xml = workflow_settings$apptainer, key_list = c("sif"))
+  apptainer_sif = workflow_settings$apptainer$sif
+
+  tar_option_set(
+    packages = c()
+  )
+
+  list(
+    step__resolve_apptainer(apptainer_source_directory = data_download_directory, workflow_xml = workflow_settings),
+
+    tar_target(pecan_template_file, pecan_template_path, format = "file"),
+
+    step__resolve_data_routing(
+      workflow_data_source_directory = data_download_directory,
+      target_artifact_names = c("site_info_file", "pfts"),
+      external_name_list = c(site_info_filename, "pfts"),
+      localized_name_list = c("site_info.csv", "pfts"),
+      action_list = c("reference", "reference")
+    ),
+    step__resolve_data_routing(
+      workflow_data_source_directory = clim_data_directory,
+      target_artifact_names = c("IC_files", "ERA5"),
+      external_name_list = c("IC_files", "data"),
+      localized_name_list = c("IC_files", "data"),
+      action_list = c("reference", "copy")
+    ),
+
+    step__build_pecan_xml(),
+    tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings = pecan_built_xml)),
+
+    step__run_distributed_write_configs(
+      container = quote(apptainer_reference),
+      pecan_settings = quote(pecan_built_xml),
+      use_abstraction = TRUE,
+      dependencies = c("apptainer_reference", "pecan_settings", "pecan_built_xml", "IC_files", "ERA5", "site_info_file", "pfts")
+    ),
+    step__run_model_2a(
+      container = quote(apptainer_reference),
+      pecan_settings = quote(pecan_built_xml),
+      use_abstraction = TRUE,
+      dependencies = c("apptainer_reference", "settings_job_outcome", "pecan_built_xml", "IC_files", "ERA5", "site_info_file", "pfts")
+    )
+  )
+}, ask = FALSE, script = tar_script_path)
+
+script_content <- readLines(tar_script_path)
+script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE)
+script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE)
+script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed = TRUE)
+script_content <- gsub("@PECANTEMPLATEPATH@", pecan_template_path, script_content, fixed = TRUE)
+script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed = TRUE)
+writeLines(script_content, tar_script_path)
+
+tar_make(script = tar_script_path)
+
diff --git a/orchestration/grass_template.xml b/orchestration/grass_template.xml
new file mode 100644
index 0000000..bc042cf
--- /dev/null
+++ b/orchestration/grass_template.xml
@@ -0,0 +1,76 @@
+
+
+
+
+ -1
+
+
+ output
+ output/out
+ output/run
+
+
+ temperate.deciduous
+ pfts/temperate.deciduous/post.distns.Rdata
+
+
+ grass
+ pfts/grass/post.distns.Rdata
+
+
+
+
+ NPP
+ TotSoilCarb
+ AbvGrndWood
+ Qle
+ SoilMoistFrac
+
+
+ uniform
+
+
+ sampling
+
+
+ sampling
+
+
+
+
+
+
+
+ 99000000003
+ SIPNET
+ git
+ TRUE
+ sipnet.git
+ cp data/events.in @RUNDIR@
+
+
+
+
+
+
+
+
+
+ RS_veg
+ poolinitcond
+
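+ <!-- Note (inferred from how this template is used elsewhere in this
+      diff): tokens such as @RUNDIR@ above, and @NAME@, @STDOUT@, @STDERR@,
+      and @JOBID@ in the host block below, are placeholders filled in at
+      run time when jobs are written and submitted. -->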
+ + + + + + + localhost + output/out + output/run + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + diff --git a/orchestration/pecan_base_config.xml b/orchestration/pecan_base_config.xml new file mode 100644 index 0000000..f08a90f --- /dev/null +++ b/orchestration/pecan_base_config.xml @@ -0,0 +1,208 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + grass + pfts/grass/post.distns.Rdata + output/pfts/grass + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + 
IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/orchestration/workflow_orchestration.xml b/orchestration/workflow_orchestration.xml new file mode 100644 index 0000000..a522eaa --- /dev/null +++ b/orchestration/workflow_orchestration.xml @@ -0,0 +1,79 @@ + + + + /project/60007/hpriest/data/workflow_runs_devel + ../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + base_data_01 + ./pecan_workflow_with_orchestration.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + 
s3://carb/data/workflows/phase_2a + ccmmf_phase_2a_input_artifacts.tgz + s3://carb/data_raw + ca_biomassfiaald_2016_median.tif + s3://carb/data_raw + ca_biomassfiaald_2016_stdv.tif + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_develop.sif + + + + clim_run_01 + base_data_01 + ./pecan_base_config.xml + 1 + site_info.csv + data/ERA5_SIPNET + data_raw/ERA5_nc + 100 + data_raw/dwr_map/i15_Crop_Mapping_2018.gdb + IC_files + data/IC_prep + pfts + SLA,leafC + data_raw/ca_biomassfiaald_2016_median.tif,data_raw/ca_biomassfiaald_2016_stdv.tif + varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005 + multisession + 2016-01-01 + 2024-12-31 + 2016-07-01 + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_develop.sif + + + + build_xml_03 + base_data_01 + clim_run_01 + ./grass_template.xml + ./pecan_base_config.xml + site_info.csv + 10 + 20 + data/ERA5_CA_SIPNET + IC_files + 2016-01-01 + 2024-12-31 + pecan_built_config.xml + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_develop.sif + + + + \ No newline at end of file diff --git a/tools/apptainer-sipnet-carb/Dockerfile b/tools/apptainer-sipnet-carb/Dockerfile new file mode 100644 index 0000000..fac8f74 --- /dev/null +++ b/tools/apptainer-sipnet-carb/Dockerfile @@ -0,0 +1,12 @@ +# these build arguments need to be declared at the top: which version are we building? +# ARG IMAGE_VERSION="develop" +# ARG PARENT_IMAGE="pecan/model-sipnet-git" + +# FROM ${PARENT_IMAGE}:${IMAGE_VERSION} +# NOTE: the develop tag appears to be published only from builds of a branch called develop. +FROM pecan/model-sipnet-git:develop +# ---------------------------------------------------------------------- +# ADD IN TARGETS FOR CCMMF NEEDS +# ---------------------------------------------------------------------- + +RUN Rscript --vanilla -e "install.packages(c('targets', 'uuid', 'tarchetypes'), repos = c(CRAN = 'https://cloud.r-project.org'))" \ No newline at end of file diff --git a/tools/setup_workflows.sh b/tools/setup_workflows.sh new file mode 100755 index 0000000..e4c5d40 --- /dev/null +++ b/tools/setup_workflows.sh @@ -0,0 +1,400 @@ +#!/bin/bash + +# CARB PEcAn Environment Setup Script +# This script automates the setup process described in CARB-Slurm-Pecan.md +# with defensive checking for all required components. + +set -euo pipefail # Exit on error, undefined vars, pipe failures + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Logging functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Configuration variables +WORKFLOWS_REPO="https://github.com/ccmmf/workflows.git" +S3_ENDPOINT="https://s3.garage.ccmmf.ncsa.cloud" +S3_BUCKET="carb" +CONDA_ENV_NAME="PEcAn-head" +WORKFLOW_DIR="workflows/1a_single_site/slurm_distributed_workflow" +INPUT_DATA_FILE="00_cccmmf_phase_1a_input_artifacts.tgz" +EXPECTED_MD5="a3822874c7dd78cbb2de1be2aca76be3" + +# Function to check if a command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Function to check if a file exists +file_exists() { + [[ -f "$1" ]] +} + +# Function to check if a directory exists +dir_exists() { + [[ -d "$1" ]] +} + +# Function to validate AWS credentials +check_aws_credentials() { + log_info "Checking AWS credentials..." + + if ! command_exists aws; then + log_error "AWS CLI is not installed. 
Please install it first." + log_info "Installation instructions: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html" + exit 1 + fi + + # Check if AWS credentials are configured + if ! aws configure list | grep -q "access_key"; then + log_warning "AWS credentials not configured. You will need to configure them." + log_info "Run: aws configure" + log_info "Use these values:" + log_info " AWS Access Key ID: GK8bb0d9c6b355c9a25b0b67fa" + log_info " AWS Secret Access Key: [provided separately]" + log_info " Default region name: garage" + log_info " Default output format: [leave blank]" + + read -p "Have you configured AWS credentials? (y/n): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + log_error "Please configure AWS credentials first and run this script again." + exit 1 + fi + fi + + # Test S3 access + log_info "Testing S3 access..." + if ! aws s3 ls --endpoint-url "$S3_ENDPOINT" "s3://$S3_BUCKET" >/dev/null 2>&1; then + log_error "Cannot access S3 bucket. Please check your credentials and network connection." + exit 1 + fi + + log_success "AWS credentials and S3 access verified" +} + +# Function to check and install conda if needed +check_conda() { + log_info "Checking Conda installation..." + + if command_exists conda; then + log_success "Conda is already installed" + return 0 + fi + + log_warning "Conda is not installed. Installing Miniconda..." + + # Download and install Miniconda + local miniconda_installer="Miniconda3-latest-Linux-x86_64.sh" + + if ! file_exists "$miniconda_installer"; then + log_info "Downloading Miniconda installer..." + wget -q "https://repo.anaconda.com/miniconda/$miniconda_installer" + fi + + log_info "Installing Miniconda..." + bash "$miniconda_installer" -b -p "$HOME/miniconda3" + + # Add conda to PATH + export PATH="$HOME/miniconda3/bin:$PATH" + echo 'export PATH="$HOME/miniconda3/bin:$PATH"' >> "$HOME/.bashrc" + + # Initialize conda + "$HOME/miniconda3/bin/conda" init bash + + log_success "Miniconda installed successfully" + log_warning "Please restart your shell or run 'source ~/.bashrc' to use conda" +} + +# Function to check required software modules +check_software_modules() { + log_info "Checking required software modules..." + + # Check for module command + if ! command_exists module; then + log_error "Environment Modules system is not available." + log_error "Please ensure the Environment Modules system is installed on this HPC cluster." + exit 1 + fi + + # Check for apptainer module by attempting to load it + log_info "Checking for apptainer module..." + if module load apptainer 2>/dev/null; then + log_success "Apptainer module loaded successfully" + # Unload it for now - we'll load it again when needed + module unload apptainer + else + log_error "Failed to load apptainer module." + log_error "Please contact your system administrator to make the apptainer module available." + exit 1 + fi + + log_success "Required software modules are available" +} + +# Function to setup conda environment +setup_conda_environment() { + log_info "Setting up Conda environment..." + + # Ensure conda is in PATH + if ! command_exists conda; then + if [[ -f "$HOME/miniconda3/bin/conda" ]]; then + export PATH="$HOME/miniconda3/bin:$PATH" + else + log_error "Conda is not available. Please install it first." 
+ exit 1 + fi + fi + + # Create conda directories if they don't exist + mkdir -p "$HOME/.conda/envs" + + # Check if environment already exists + if conda env list | grep -q "$CONDA_ENV_NAME"; then + log_warning "Conda environment '$CONDA_ENV_NAME' already exists." + read -p "Do you want to recreate it? (y/n): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + log_info "Removing existing environment..." + conda env remove -n "$CONDA_ENV_NAME" -y + else + log_info "Using existing environment..." + return 0 + fi + fi + + # Download and extract the environment tarball + local env_tarball="PEcAn-head.tar.gz" + + if ! file_exists "$env_tarball"; then + log_info "Downloading PEcAn environment tarball..." + aws s3 cp --endpoint-url "$S3_ENDPOINT" \ + "s3://$S3_BUCKET/environments/PEcAn-head.tar.gz" "./$env_tarball" + fi + + # Create environment directory + mkdir -p "$HOME/.conda/envs/$CONDA_ENV_NAME" + + # Extract the tarball + log_info "Extracting environment tarball..." + tar -xzf "$env_tarball" -C "$HOME/.conda/envs/$CONDA_ENV_NAME" + + # Configure environment paths using conda run + log_info "Configuring environment paths..." + + if conda run -n "$CONDA_ENV_NAME" conda-unpack; then + log_success "conda-unpack completed successfully" + else + log_warning "conda-unpack failed or not found. Environment may need manual path configuration." + fi + + # Verify R installation + log_info "Verifying R installation..." + if conda run -n "$CONDA_ENV_NAME" Rscript -e '.libPaths()' >/dev/null 2>&1; then + log_success "R installation verified" + else + log_error "R installation verification failed" + exit 1 + fi + + # Verify PEcAn libraries + log_info "Verifying PEcAn libraries..." + if conda run -n "$CONDA_ENV_NAME" Rscript -e 'library("PEcAn.workflow")' >/dev/null 2>&1; then + log_success "PEcAn.workflow library verified" + else + log_error "PEcAn.workflow library not available" + exit 1 + fi + + if conda run -n "$CONDA_ENV_NAME" Rscript -e 'library("PEcAn.remote")' >/dev/null 2>&1; then + log_success "PEcAn.remote library verified" + else + log_error "PEcAn.remote library not available" + exit 1 + fi + + log_success "Conda environment setup completed" +} + +# Function to clone workflows repository +clone_workflows() { + log_info "Cloning workflows repository..." + + if dir_exists "workflows"; then + log_warning "Workflows directory already exists." + read -p "Do you want to remove and re-clone? (y/n): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + rm -rf workflows + else + log_info "Using existing workflows directory..." + return 0 + fi + fi + + git clone "$WORKFLOWS_REPO" + + if [[ ! -d "$WORKFLOW_DIR" ]]; then + log_error "Expected workflow directory not found: $WORKFLOW_DIR" + exit 1 + fi + + log_success "Workflows repository cloned successfully" +} + +# Function to download and setup workflow data +setup_workflow_data() { + log_info "Setting up workflow data..." + + cd "$WORKFLOW_DIR" + + # Download input data + if ! file_exists "$INPUT_DATA_FILE"; then + log_info "Downloading workflow input data..." + aws s3 cp --endpoint-url "$S3_ENDPOINT" \ + "s3://$S3_BUCKET/data/workflows/phase_1a/$INPUT_DATA_FILE" "./$INPUT_DATA_FILE" + fi + + # Verify download + log_info "Verifying data integrity..." + local actual_md5 + actual_md5=$(md5sum "$INPUT_DATA_FILE" | cut -d' ' -f1) + + if [[ "$actual_md5" != "$EXPECTED_MD5" ]]; then + log_error "MD5 checksum mismatch!" 
+ log_error "Expected: $EXPECTED_MD5" + log_error "Actual: $actual_md5" + exit 1 + fi + + log_success "Data integrity verified" + + # Extract data + log_info "Extracting workflow data..." + tar -xf "$INPUT_DATA_FILE" + + log_success "Workflow data setup completed" +} + +# Function to setup apptainer +setup_apptainer() { + log_info "Setting up Apptainer..." + + # Load apptainer module + module load apptainer + + # Verify apptainer is available + if ! command_exists apptainer; then + log_error "Apptainer is not available after loading module" + exit 1 + fi + + # Pull the required Docker image + local sif_file="model-sipnet-git_latest.sif" + + if ! file_exists "$sif_file"; then + log_info "Pulling PEcAn SIPNET model container..." + apptainer pull docker://pecan/model-sipnet-git:latest + else + log_info "Apptainer image already exists: $sif_file" + fi + + log_success "Apptainer setup completed" +} + +# Function to create activation script +create_activation_script() { + log_info "Creating environment activation script..." + + cat > "activate_carb_pecan.sh" << 'EOF' +#!/bin/bash +# CARB PEcAn Environment Activation Script + +# Load required modules +module load apptainer + +# Activate conda environment +source ~/.conda/envs/PEcAn-head/bin/activate + +echo "CARB PEcAn environment activated!" +echo "Available commands:" +echo " - conda activate PEcAn-head (if not already active)" +echo " - module load apptainer (if not already loaded)" +echo " - sbatch commands for running workflows" +EOF + + chmod +x "activate_carb_pecan.sh" + + log_success "Activation script created: activate_carb_pecan.sh" +} + +# Function to display final instructions +display_final_instructions() { + log_success "Setup completed successfully!" + echo + log_info "Next steps:" + echo "1. Activate the environment:" + echo " source activate_carb_pecan.sh" + echo + echo "2. Navigate to the workflow directory:" + echo " cd $WORKFLOW_DIR" + echo + echo "3. Run the workflow setup step:" + echo " sbatch -n1 --mem-per-cpu=1G --time=01:00:00 \\" + echo " --output=pecan_workflow_runlog_\"\$(date +%Y%m%d%H%M%S)_%j.log\" \\" + echo " apptainer run model-sipnet-git_latest.sif ./04a_run_model.R \\" + echo " --settings=slurm_distributed_single_site_almond.xml" + echo + echo "4. Run the main workflow:" + echo " sbatch -n1 --mem-per-cpu=1G --time=01:00:00 \\" + echo " --output=pecan_workflow_runlog_\"\$(date +%Y%m%d%H%M%S)_%j.log\" \\" + echo " ./04b_run_model.R \\" + echo " --settings=slurm_distributed_single_site_almond.xml" + echo + log_info "For more information, see: CARB-Slurm-Pecan.md" +} + +# Main execution +main() { + log_info "Starting CARB PEcAn environment setup..." + echo + + # Check prerequisites + check_aws_credentials + check_conda + check_software_modules + + # Setup environment + setup_conda_environment + clone_workflows + setup_workflow_data + setup_apptainer + create_activation_script + + # Return to original directory + cd - >/dev/null + + display_final_instructions +} + +# Run main function +main "$@" diff --git a/tools/workflow_functions.R b/tools/workflow_functions.R new file mode 100644 index 0000000..a213cc8 --- /dev/null +++ b/tools/workflow_functions.R @@ -0,0 +1,1982 @@ +################## +# workflow functions for targets-based PEcAn workflows +# Note that these functions will be executed in different environments depending on the context, so it is not safe to assume that dependencies are always present in the namespace from which the function is called. 
+# other functions will be abstracted by the targets framework, and loaded into a novel namespace on a different node. +# function authors are encouraged to think carefully about the dependencies of their functions. +# if dependencies are not present, it would be ideal for functions to error informatively rather than fail on imports. + +#' Download CCMMF Data +#' +#' Downloads data from the CCMMF S3-compatible storage using AWS CLI. +#' +#' @param prefix_url Character string specifying the S3 URL prefix for the data. +#' @param local_path Character string specifying the local directory path where the file will be downloaded. +#' @param prefix_filename Character string specifying the filename to download. +#' +#' @return Character string containing the full path to the downloaded file. +#' +#' @examples +#' \dontrun{ +#' file_path <- download_ccmmf_data("s3://bucket/path", "/local/path", "data.nc") +#' } +#' +#' @export +download_ccmmf_data <- function(prefix_url, local_path, prefix_filename) { + system2("aws", args = c("s3", "cp", "--endpoint-url", "https://s3.garage.ccmmf.ncsa.cloud", paste0(prefix_url, "/", prefix_filename), local_path)) + return(file.path(local_path, prefix_filename)) +} + +#' Build ERA5 Site/Ensemble Combinations +#' +#' Reads the site metadata file and constructs a data frame of site / ensemble +#' combinations with associated start and end dates. Intended to be used with a +#' downstream targets dynamic branching step. +#' +#' @param site_info_file Character. Path to the CSV containing site metadata. +#' Must include an `id` column. +#' @param start_date Character (YYYY-MM-DD). Start date for each combination. +#' @param end_date Character (YYYY-MM-DD). End date for each combination. +#' @param ensemble_members Integer vector identifying ensemble member indices. +#' +#' @return Data frame with columns `site_id`, `start_date`, `end_date`, and +#' `ens_id`. Any additional columns from `site_info_file` are preserved and +#' repeated across ensemble members. +#' @export +build_era5_site_combinations <- function( + site_info_file = "site_info.csv", + start_date = "2016-01-01", + end_date = "2023-12-31", + ensemble_members = 1:10, + dependencies = NULL +) { + + if (!file.exists(site_info_file)) { + stop(sprintf("Site info file not found: %s", site_info_file), call. = FALSE) + } + + site_info <- utils::read.csv(site_info_file, stringsAsFactors = FALSE) + if (!"id" %in% names(site_info)) { + stop("`site_info_file` must contain an `id` column.", call. = FALSE) + } + + site_info$site_id <- site_info$id + site_info$start_date <- start_date + site_info$end_date <- end_date + + if (!is.numeric(ensemble_members)) { + stop("`ensemble_members` must be numeric.", call. = FALSE) + } + + if (length(ensemble_members) == 0) { + return(site_info[0, , drop = FALSE]) + } + + replicated_info <- site_info[rep(seq_len(nrow(site_info)), each = length(ensemble_members)), , drop = FALSE] + replicated_info$ens_id <- rep(ensemble_members, times = nrow(site_info)) + + rownames(replicated_info) <- NULL + return(replicated_info) +} + + +build_era5_site_combinations_args <- function( + site_info_file = "site_info.csv", + start_date = "2016-01-01", + end_date = "2023-12-31", + ensemble_members = 1:10, + reference_path = "", + sipnet_met_path = "", + dependencies = NULL +) { + if (!file.exists(site_info_file)) { + stop(sprintf("Site info file not found: %s", site_info_file), call. 
= FALSE) + } + + site_info <- utils::read.csv(site_info_file, stringsAsFactors = FALSE) + if (!"id" %in% names(site_info)) { + stop("`site_info_file` must contain an `id` column.", call. = FALSE) + } + + site_info$site_id <- site_info$id + site_info$start_date <- start_date + site_info$end_date <- end_date + site_info$reference_path <- reference_path + site_info$sipnet_met_path <- sipnet_met_path + + if (!is.numeric(ensemble_members)) { + stop("`ensemble_members` must be numeric.", call. = FALSE) + } + + if (length(ensemble_members) == 0) { + return(site_info[0, , drop = FALSE]) + } + + replicated_info <- site_info[rep(seq_len(nrow(site_info)), each = length(ensemble_members)), , drop = FALSE] + replicated_info$ens_id <- rep(ensemble_members, times = nrow(site_info)) + + rownames(replicated_info) <- NULL + return(replicated_info) +} + +#' Convert ERA5 Site/Ensemble Combinations to SIPNET Clim Drivers +#' +#' Runs `PEcAn.SIPNET::met2model.SIPNET()` for each site / ensemble +#' combination. Designed for use within a dynamic branching target fed by +#' `build_era5_site_combinations()`. +#' +#' @param site_combinations Data frame of site / ensemble combinations, as +#' produced by `build_era5_site_combinations()`; must contain `site_id`, +#' `ens_id`, `start_date`, and `end_date` columns. +#' @param site_era5_path Character. Base directory containing ERA5 NetCDF +#' inputs organised as `ERA5_<site_id>_<ens_id>/ERA5.<ens_id>.*.nc`. +#' @param site_sipnet_met_path Character. Directory where SIPNET `clim` files +#' should be written (one subdirectory per site). +#' @param n_workers Integer. Number of parallel `future` workers used for the +#' conversions. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return Character string giving the output directory used for the `clim` +#' files. +#' @export +convert_era5_nc_to_clim <- function( + site_combinations, + site_era5_path = NULL, + site_sipnet_met_path = NULL, + n_workers = 2, + dependencies = NULL +) { + + if (is.null(site_combinations$site_id) + || is.null(site_combinations$ens_id) + || is.null(site_combinations$start_date) + || is.null(site_combinations$end_date)) { + stop("`site_id`, `ens_id`, `start_date`, and `end_date` must all be supplied.", call. = FALSE) + } + + if (!dir.exists(site_era5_path)) { + stop(sprintf("Input ERA5 directory not found: %s", site_era5_path), call. = FALSE) + } + + if (!dir.exists(site_sipnet_met_path)) { + dir.create(site_sipnet_met_path, recursive = TRUE) + } + + output_directory <- file.path(site_sipnet_met_path) + if (!dir.exists(output_directory)) { + dir.create(output_directory, recursive = TRUE) + } + + parallel_strategy = "multisession" + future::plan(parallel_strategy, workers = n_workers) + furrr::future_pwalk( + site_combinations, + function(site_id, start_date, end_date, ens_id, ...) { + PEcAn.SIPNET::met2model.SIPNET( + in.path = file.path( + site_era5_path, + paste("ERA5", site_id, ens_id, sep = "_") + ), + start_date = start_date, + end_date = end_date, + in.prefix = paste0("ERA5.", ens_id), + outfolder = file.path(site_sipnet_met_path, site_id) + ) + } + ) + output_directory +} + + +#' Prepare PEcAn Run Directory +#' +#' Creates the output directory for a PEcAn workflow run if it doesn't exist. +#' Stops execution if the directory already exists to prevent overwriting. +#' +#' @param pecan_settings List containing PEcAn settings including the output directory path. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. 
+#' +#' @examples +#' \dontrun{ +#' settings <- prepare_pecan_run_directory(pecan_settings) +#' } +#' +#' @export +prepare_pecan_run_directory <- function(pecan_settings, dependencies = NULL) { + print(getwd()) + pecan_run_directory = pecan_settings$outdir + if (!dir.exists(file.path(pecan_run_directory))) { + print(paste("Creating run directory", pecan_run_directory)) + dir.create(file.path(pecan_run_directory), recursive = TRUE) + } else { + stop(paste("Run directory", pecan_run_directory, "already exists")) + } + return(pecan_settings) +} + +#' Check PEcAn Continue Directive +#' +#' Checks if a PEcAn workflow should continue from a previous run by examining +#' the STATUS file in the output directory. +#' +#' @param pecan_settings List containing PEcAn settings including the output directory path. +#' @param continue Logical indicating whether to continue from a previous run. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return Logical value indicating whether to continue the workflow. +#' +#' @examples +#' \dontrun{ +#' should_continue <- check_pecan_continue_directive(pecan_settings, continue=TRUE) +#' } +#' +#' @export +check_pecan_continue_directive <- function(pecan_settings, continue=FALSE, dependencies = NULL) { + status_file <- file.path(pecan_settings$outdir, "STATUS") + if (continue && file.exists(status_file)) { + file.remove(status_file) + } + return(continue) +} + +#' Monitor PEcAn Cluster Job +#' +#' Monitors the status of cluster jobs submitted via PEcAn's remote execution system. +#' Continuously checks job status until all jobs are completed. +#' +#' @param pecan_settings List containing PEcAn settings including host configuration. +#' @param job_id_list Named list of job IDs to monitor. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return Logical TRUE when all jobs are completed. +#' +#' @details +#' This function is adapted from PEcAn.remote::start_qsub and PEcAn.workflow::start_model_runs. +#' It polls job status every 10 seconds and removes completed jobs from the monitoring list. +#' +#' @examples +#' \dontrun{ +#' job_ids <- list("job1" = "12345", "job2" = "12346") +#' pecan_monitor_cluster_job(pecan_settings, job_ids) +#' } +#' +#' @export +pecan_monitor_cluster_job <- function(pecan_settings, job_id_list, dependencies = NULL){ + # adapted heavily from + ## pecan.remote:start_qsub + ## pecan.workflow:start_model_runs + # list of job IDs (may be list of 1) + while (length(job_id_list) > 0) { + Sys.sleep(10) + for (run in names(job_id_list)) { + job_finished = FALSE + job_finished = PEcAn.remote::qsub_run_finished( + run = job_id_list[run], + host = pecan_settings$host$name, + qstat = pecan_settings$host$qstat + ) + if(job_finished){ + job_id_list[run] = NULL + } + } + } + return(TRUE) +} + +monitor_cluster_job <- function(distribution_adapter, job_id_list, dependencies = NULL){ + # adapted heavily from + ## pecan.remote:start_qsub + ## pecan.workflow:start_model_runs + # list of job IDs (may be list of 1) + while (length(job_id_list) > 0) { + Sys.sleep(10) + for (run in names(job_id_list)) { + job_finished = FALSE + job_finished = PEcAn.remote::qsub_run_finished( + run = job_id_list[run], + host = distribution_adapter$name, + qstat = distribution_adapter$qstat + ) + if(job_finished){ + job_id_list[run] = NULL + } + } + } + return(TRUE) +} + +#' Start PEcAn Ecosystem Model Runs +#' +#' Initiates ecosystem model runs using PEcAn's workflow system. 
+#' Handles both single runs and ensemble runs with appropriate error handling. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @details +#' This function uses PEcAn.utils, PEcAn.logger, and PEcAn.workflow packages. +#' It determines whether to stop on error based on ensemble size and settings. +#' For single runs, it stops on error; for ensemble runs, it continues on error. +#' +#' @examples +#' \dontrun{ +#' settings <- pecan_start_ecosystem_model_runs(pecan_settings) +#' } +#' +#' @export +pecan_start_ecosystem_model_runs <- function(pecan_settings, dependencies = NULL) { + # pecan.utils + # pecan.logger + # pecan.workflow + # Start ecosystem model runs + if (PEcAn.utils::status.check("MODEL") == 0) { + PEcAn.utils::status.start("MODEL") + stop_on_error <- as.logical(pecan_settings[[c("run", "stop_on_error")]]) + if (length(stop_on_error) == 0) { + # If we're doing an ensemble run, don't stop. If only a single run, we + # should be stopping. + if (is.null(pecan_settings[["ensemble"]]) || + as.numeric(pecan_settings[[c("ensemble", "size")]]) == 1) { + stop_on_error <- TRUE + } else { + stop_on_error <- FALSE + } + } + PEcAn.logger::logger.setUseConsole(TRUE) + PEcAn.logger::logger.setLevel("ALL") + PEcAn.workflow::runModule_start_model_runs(pecan_settings, stop.on.error = stop_on_error) + PEcAn.utils::status.end() + } + return(pecan_settings) +} + +#' Get PEcAn Model Results +#' +#' Retrieves and processes the results from completed PEcAn model runs. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @details +#' This function uses PEcAn.uncertainty::runModule.get.results to process +#' model output and prepare it for further analysis. +#' +#' @examples +#' \dontrun{ +#' settings <- pecan_get_model_results(pecan_settings) +#' } +#' +#' @export +pecan_get_model_results <- function(pecan_settings, dependencies = NULL) { + # Get results of model runs + if (PEcAn.utils::status.check("OUTPUT") == 0) { + PEcAn.utils::status.start("OUTPUT") + PEcAn.uncertainty::runModule.get.results(pecan_settings) + PEcAn.utils::status.end() + } + return(pecan_settings) +} + +#' Run PEcAn Ensemble Analysis +#' +#' Performs ensemble analysis on PEcAn model output if ensemble settings are configured. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @details +#' This function runs ensemble analysis using PEcAn.uncertainty::runModule.run.ensemble.analysis +#' only if ensemble configuration is present in the settings. +#' +#' @examples +#' \dontrun{ +#' settings <- pecan_run_ensemble_analysis(pecan_settings) +#' } +#' +#' @export +pecan_run_ensemble_analysis <- function(pecan_settings, dependencies = NULL) { + # Run ensemble analysis on model output. 
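+ # PEcAn.utils::status.check("ENSEMBLE") reads the STATUS file in the run's
+ # output directory and returns 0 only when this stage has not yet been
+ # recorded there, so re-running the pipeline skips stages that already finished.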
+ if ("ensemble" %in% names(pecan_settings) && PEcAn.utils::status.check("ENSEMBLE") == 0) { + PEcAn.utils::status.start("ENSEMBLE") + PEcAn.uncertainty::runModule.run.ensemble.analysis(pecan_settings, TRUE) + PEcAn.utils::status.end() + } + return(pecan_settings) +} + +#' Run PEcAn Sensitivity Analysis +#' +#' Performs sensitivity analysis and variance decomposition on PEcAn model output +#' if sensitivity analysis settings are configured. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @details +#' This function runs sensitivity analysis using PEcAn.uncertainty::runModule.run.sensitivity.analysis +#' only if sensitivity analysis configuration is present in the settings. +#' +#' @examples +#' \dontrun{ +#' settings <- pecan_run_sensitivity_analysis(pecan_settings) +#' } +#' +#' @export +pecan_run_sensitivity_analysis <- function(pecan_settings, dependencies = NULL) { + # Run sensitivity analysis and variance decomposition on model output + if ("sensitivity.analysis" %in% names(pecan_settings) && PEcAn.utils::status.check("SENSITIVITY") == 0) { + PEcAn.utils::status.start("SENSITIVITY") + PEcAn.uncertainty::runModule.run.sensitivity.analysis(pecan_settings) + PEcAn.utils::status.end() + } + return(pecan_settings) +} + +#' Complete PEcAn Workflow +#' +#' Finalizes a PEcAn workflow by cleaning up resources and sending notification emails. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @details +#' This function performs final cleanup tasks including: +#' - Killing SSH tunnels +#' - Sending completion email notifications (if configured) +#' - Updating workflow status +#' +#' @examples +#' \dontrun{ +#' settings <- pecan_workflow_complete(pecan_settings) +#' } +#' +#' @export +pecan_workflow_complete <- function(pecan_settings, dependencies = NULL) { + if (PEcAn.utils::status.check("FINISHED") == 0) { + PEcAn.utils::status.start("FINISHED") + PEcAn.remote::kill.tunnel(pecan_settings) + + # Send email if configured + if (!is.null(pecan_settings$email) + && !is.null(pecan_settings$email$to) + && (pecan_settings$email$to != "")) { + sendmail( + pecan_settings$email$from, + pecan_settings$email$to, + paste0("Workflow has finished executing at ", base::date()), + paste0("You can find the results on ", pecan_settings$email$url) + ) + } + PEcAn.utils::status.end() + } + + print("---------- PEcAn Workflow Complete ----------") + return(pecan_settings) +} + +#' Write PEcAn Configuration Files +#' +#' Writes PEcAn configuration files for model runs, either by generating new configs +#' or loading existing ones if they already exist. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param xml_file Character string specifying the path to the XML settings file. +#' +#' @return Updated pecan_settings list with configuration information. +#' +#' @details +#' This function either generates new configuration files using PEcAn.workflow::runModule.run.write.configs +#' or loads existing configuration files if they are already present in the output directory. 
+#' +#' @examples +#' \dontrun{ +#' settings <- pecan_write_configs(pecan_settings, "settings.xml") +#' } +#' +#' @export +pecan_write_configs <- function(pecan_settings, xml_file) { + # print(xml_file) + # pecan_settings <- PEcAn.settings::read.settings(xml_file) + pecan_settings = xml_file + PEcAn.logger::logger.setLevel("ALL") + if (PEcAn.utils::status.check("CONFIG") == 0) { + PEcAn.utils::status.start("CONFIG") + print("Writing configs via PEcAn.workflow::runModule.run.write.configs") + pecan_settings <- PEcAn.workflow::runModule.run.write.configs(pecan_settings) + print(paste("Writing configs to", file.path(pecan_settings$outdir, "pecan.CONFIGS.xml"))) + PEcAn.settings::write.settings(pecan_settings, outputfile = "pecan.CONFIGS.xml") + PEcAn.utils::status.end() + } else if (file.exists(file.path(pecan_settings$outdir, "pecan.CONFIGS.xml"))) { + pecan_settings <- PEcAn.settings::read.settings(file.path(pecan_settings$outdir, "pecan.CONFIGS.xml")) + } + return(pecan_settings) +} + +#' Resolve Data Routing +#' +#' Routes external data resources to local targets store using either symbolic links +#' (reference) or file/directory copying based on the specified action. +#' +#' @param external_workflow_directory Character string specifying the directory containing the external data resource. +#' @param external_name Character string specifying the name of the external data file or directory. +#' @param localized_name Character string specifying the name for the local file or directory. +#' @param action Character string specifying the routing action. Must be either "reference" (creates symbolic link) or "copy" (copies the resource). Default is "reference". +#' +#' @return Character string containing the path to the localized resource (symbolic link or copied file/directory), or NULL if external_name is NULL. +#' +#' @details +#' This function provides a unified interface for routing external data resources to the +#' targets store. It supports two modes: +#' \itemize{ +#' \item \code{"reference"}: Creates a symbolic link to the external resource using \code{reference_external_data_entity()} +#' \item \code{"copy"}: Copies the external resource to the targets store using \code{localize_data_resource()} +#' } +#' The function automatically detects whether the resource is a file or directory when using +#' the "copy" action. If an invalid action is specified, the function will throw an error. +#' +#' @examples +#' \dontrun{ +#' # Create a symbolic link to external data +#' link_path <- resolve_data_routing("/external/path", "data.nc", "local_data.nc", action="reference") +#' +#' # Copy external data to targets store +#' copy_path <- resolve_data_routing("/external/path", "data_dir", "local_data_dir", action="copy") +#' } +#' +#' @export +resolve_data_routing <- function(external_workflow_directory, external_name, localized_name, action="reference"){ + final_path = NULL + if(action=="reference"){ + final_path = reference_external_data_entity( + external_workflow_directory = external_workflow_directory, + external_name = external_name, + localized_name = localized_name + ) + } else if (action=="copy"){ + final_path = localize_data_resource( + external_workflow_directory = external_workflow_directory, + external_name = external_name, + localized_name = localized_name + ) + } else { + stop(paste0("Could not determine action for data routing. Passed action must be 'reference' or 'copy'. 
Passed action: ", action)) + } + return(final_path) +} + + +#' Reference External Data Entity +#' +#' Creates a symbolic link to an external data entity within the targets store. +#' +#' @param external_workflow_directory Character string specifying the directory containing the external data. +#' @param external_name Character string specifying the name of the external data file. +#' @param localized_name Character string specifying the name for the local symbolic link. +#' +#' @return Character string containing the path to the created symbolic link, or NULL if external_name is NULL. +#' +#' @details +#' This function creates a symbolic link from an external data entity to the targets store. +#' It validates that the external file exists and that the local link doesn't already exist. +#' +#' @examples +#' \dontrun{ +#' link_path <- reference_external_data_entity("/external/path", "data.nc", "local_data.nc") +#' } +#' +#' @export +reference_external_data_entity <- function(external_workflow_directory, external_name, localized_name){ + if (is.null(external_name)){ + return(NULL) + } + local_link_path = file.path(paste0(tar_path_store(), localized_name)) + external_link_path = file.path(paste0(external_workflow_directory, "/",external_name)) + if (!file.exists(external_link_path)){ + stop(paste("External link path", external_link_path, "does not exist")) + return(NULL) + } + if (file.exists(local_link_path)){ + warning(paste("Local link path", local_link_path, "already exists -- skipping.")) + }else{ + file.symlink(from=external_link_path, to=local_link_path) + } + return(local_link_path) +} + +#' Localize Data Resource (File or Directory) +#' +#' Copies a data file or directory from a central location to a local targets store location. +#' Automatically detects whether the resource is a file or directory and handles it appropriately. +#' +#' @param external_workflow_directory Character string specifying the directory containing the external data resource. +#' @param external_name Character string specifying the name of the external data file or directory. +#' @param localized_name Character string specifying the name for the local file or directory. +#' +#' @return Character string containing the path to the copied resource, or NULL if external_name is NULL. +#' +#' @details +#' This function automatically detects whether the external resource is a file or directory +#' and copies it to the targets store. For files, it ensures the parent directory exists. +#' For directories, it copies recursively. If the local path already exists, the function +#' will throw an error as this indicates a pipeline configuration error. 
+#' +#' @examples +#' \dontrun{ +#' # Copy a file +#' file_path <- localize_data_resource("/external/path", "data.nc", "local_data.nc") +#' # Copy a directory +#' dir_path <- localize_data_resource("/external/path", "data_dir", "local_data_dir") +#' } +#' +#' @export +localize_data_resource <- function(external_workflow_directory, external_name, localized_name) { + if (is.null(external_name)){ + return(NULL) + } + local_path = file.path(paste0(tar_path_store(), localized_name)) + external_path = file.path(paste0(external_workflow_directory, "/", external_name)) + + # Determine if resource is a file or directory + is_directory = dir.exists(external_path) + is_file = file.exists(external_path) + + if (!is_directory && !is_file){ + stop(paste("External resource path", external_path, "does not exist")) + return(NULL) + } + + # Check if local path already exists - this indicates a pipeline configuration error + if (file.exists(local_path) || dir.exists(local_path)){ + stop(paste("Local path", local_path, "already exists. This indicates a pipeline configuration error.")) + } + + # Ensure parent directory exists for the local path + local_path_parent = dirname(local_path) + if (!dir.exists(local_path_parent)){ + dir.create(local_path_parent, recursive = TRUE) + } + + # Copy the resource + if (is_directory){ + # For directories: copy to parent directory, which creates the directory with source name + # Then rename if the source name doesn't match the desired target name + copied_path = file.path(local_path_parent, basename(external_path)) + file.copy(external_path, local_path_parent, recursive = TRUE) + + # If the copied directory name doesn't match the desired name, rename it + if (copied_path != local_path){ + if (!dir.exists(copied_path)){ + stop(paste("Failed to copy directory. Expected", copied_path, "but it was not created as a directory.")) + } + file.rename(copied_path, local_path) + } + } else { + print(paste0("Copying file: ", external_path, " to: ", local_path)) + file.copy(external_path, local_path, overwrite = FALSE) + } + + return(local_path) +} + +#' Localize Data Resource Directory +#' +#' @inheritParams localize_data_resource +#' @export +localize_data_resource_directory <- function(external_workflow_directory, external_name, localized_name) { + localize_data_resource(external_workflow_directory, external_name, localized_name) +} + +#' Localize Data Resource File +#' +#' @inheritParams localize_data_resource +#' @export +localize_data_resource_file <- function(external_workflow_directory, external_name, localized_name) { + localize_data_resource(external_workflow_directory, external_name, localized_name) +} + +#' Generate Standard SLURM Batch Header +#' +#' Generates a standard SLURM batch script header with optional Apptainer module loading. +#' +#' @param apptainer Character string specifying the Apptainer container path (optional). +#' +#' @return Character string containing the SLURM batch script header. +#' +#' @details +#' This function generates a standard SLURM batch script header with default resource allocations: +#' - 1 node, 1 task per node, 1 CPU per task +#' - 1 hour runtime +#' - Standard output and error logging +#' If apptainer is provided, it adds a module load command for Apptainer. 
+#' +#' @examples +#' \dontrun{ +#' header <- sbatch_header_standard() +#' header_with_container <- sbatch_header_standard("/path/to/container.sif") +#' } +#' +#' @export +sbatch_header_standard <- function(apptainer=NULL) { + header_string <- "#!/bin/bash +#SBATCH --job-name=my_job_name # Job name +#SBATCH --output=pecan_workflow_out_%j.log # Standard output file +#SBATCH --error=pecan_workflow_err_%j.log # Standard error file +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks per node +#SBATCH --mem=32000 +#SBATCH --cpus-per-task=1 # Number of CPU cores per task +#SBATCH --time=1:00:00 # Maximum runtime (D-HH:MM:SS) + +#Load necessary modules (if needed) +" + if (!is.null(apptainer)) { + header_string = paste0(header_string, "module load apptainer\n") + } + return(header_string) +} + +pull_apptainer_container <- function(apptainer_url_base=NULL, apptainer_image_name=NULL, apptainer_disk_sif=NULL, apptainer_tag="latest") { + # TODO: handle nulls and non-passes. validate url/names, + apptainer_output_sif = paste0(apptainer_image_name,"_",apptainer_tag,".sif") + out = system2("apptainer", c(paste0("pull ", apptainer_output_sif ," ", apptainer_url_base,apptainer_image_name,":",apptainer_tag)), stdout = TRUE, stderr = TRUE) + return(apptainer_output_sif) +} + + +#' Targets Function Abstraction +#' +#' Retrieves a function by name and returns it as a targets object for remote execution. +#' +#' @param function_name Character string specifying the name of the function to retrieve. +#' +#' @return The function object retrieved by name. +#' +#' @details +#' This function retrieves an arbitrary function by its name and returns it as a target product. +#' The targets framework saves the function as an unnamed function object in the workflow store, +#' making it available to targets::tar_read() calls. Once tar_read is called into a namespace, +#' the function is available under the name it is saved into. It is incumbent on the function +#' and data author to ensure that the data passed into the function in the remote matches the signature. +#' +#' @examples +#' \dontrun{ +#' func <- targets_function_abstraction("my_function") +#' } +#' +#' @export +targets_function_abstraction <- function(function_name) { + # We need to retrieve an arbitrary function by its name, and return it as a target product + # targets will then save the function as an un-named function object in the workflow store, making it available to a targets::tar_read() call + # once tar_read is called into a namespace, that function is available under the name it is saved into + # it will be incumbent on the function and data author to ensure that the data passed into the function in the remote matches the signature. + return(get(function_name, mode="function")) +} + +#' Targets Argument Abstraction +#' +#' Returns an argument object as a targets object for remote execution. +#' +#' @param argument_object R object containing arguments to be passed to a function. +#' +#' @return The original argument_object. +#' +#' @details +#' If targets returns an R object, it can be read into a namespace via targets::tar_read(). +#' The object - as it is constructed, including its values, is then available under the variable +#' it is saved into. This allows a user on a headnode to construct an arguments object variable +#' with custom names, orders, etc., register it with targets, and on a remote, access the object +#' as it was constructed, and pass it into a function call. 
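+#'
+#' A minimal sketch of the round trip (the target names and the argument list
+#' here are illustrative, not targets defined in this repository):
+#' \preformatted{
+#' # head node, when defining the pipeline:
+#' tar_target(remote_fn, targets_function_abstraction("convert_era5_nc_to_clim"))
+#' tar_target(remote_args, targets_argument_abstraction(list(site_combinations = combos)))
+#' # compute node, inside the submitted job:
+#' do.call(targets::tar_read(remote_fn), targets::tar_read(remote_args))
+#' }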
+#' +#' @examples +#' \dontrun{ +#' args <- list(param1 = "value1", param2 = 42) +#' arg_obj <- targets_argument_abstraction(args) +#' } +#' +#' @export +targets_argument_abstraction <- function(argument_object) { + # if targets returns an R object, it can be read into a namespace via targets::tar_read() + # the object - as it is constructed, including its values - is then available under the variable it is saved into + # this allows a user on a headnode to construct an arguments object variable with custom names, orders, etc., register it with targets + # and on a remote, access the object as it was constructed, and pass it into a function call. + return(argument_object) +} + +#' Targets Abstract SLURM Batch Execution +#' +#' Executes a targets function remotely via SLURM batch job with optional containerization. +#' +#' @param pecan_settings List containing PEcAn settings including host configuration. +#' @param function_artifact Character string specifying the name of the targets function object. +#' @param args_artifact Character string specifying the name of the targets arguments object. +#' @param task_id Character string specifying the task identifier. +#' @param apptainer Character string specifying the Apptainer container path (optional). +#' @param dependencies Optional parameter for dependency tracking (unused). +#' @param conda_env Character string specifying the conda environment name (optional). +#' +#' @return Named list containing job IDs for the submitted SLURM jobs. +#' +#' @details +#' This function creates a SLURM batch script that executes a targets function remotely. +#' It supports both Apptainer containers and conda environments. The function_artifact and +#' args_artifact should be the string names of targets objects, not the objects themselves. +#' The function generates a batch script, submits it via sbatch, and returns the job IDs.
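+#'
+#' The generated batch script has roughly the shape below. The payload is
+#' emitted as a single command line (wrapped here for readability), and the
+#' container path is illustrative:
+#' \preformatted{
+#' #!/bin/bash
+#' #SBATCH --job-name=my_job_name
+#' # ...remaining #SBATCH resource lines from sbatch_header_standard()...
+#' module load apptainer
+#' apptainer run image.sif Rscript -e "library(targets)"
+#'   -e "abstract_function=targets::tar_read(<function_artifact>)"
+#'   -e "abstract_args=targets::tar_read(<args_artifact>)"
+#'   -e "do.call(abstract_function, abstract_args)"
+#' }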
+#' +#' @examples +#' \dontrun{ +#' job_ids <- targets_abstract_sbatch_exec(pecan_settings, "my_func", "my_args", "task1") +#' } +#' +#' @export +targets_abstract_sbatch_exec <- function(pecan_settings, function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL) { + if (!is.character(function_artifact) || !is.character(args_artifact)) { + print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself") + return(FALSE) + } + slurm_output_file = paste0("slurm_command_", task_id, ".sh") + file_content = sbatch_header_standard(apptainer=apptainer) + if (!is.null(conda_env)) { + file_content = paste0(file_content, ' conda run -n ', conda_env, ' ') + } + if (!is.null(apptainer)) { + file_content = paste0(file_content, ' apptainer run ', apptainer) + } + + file_content = paste0(file_content, ' Rscript -e "library(targets)" -e "abstract_function=targets::tar_read(', function_artifact, ')" -e "abstract_args=targets::tar_read(', args_artifact, ')" -e "do.call(abstract_function, abstract_args)"') + writeLines(file_content, slurm_output_file) + out = system2("sbatch", slurm_output_file, stdout = TRUE, stderr = TRUE) + print(paste0("Output from sbatch command is: ", out)) + print(paste0("System will use this pattern: ", pecan_settings$host$qsub.jobid )) + jobids = list() + # submitted_jobid = sub(pecan_settings$host$qsub.jobid, '\\1', out) + jobids[task_id] <- PEcAn.remote::qsub_get_jobid( + out = out[length(out)], + qsub.jobid = pecan_settings$host$qsub.jobid, + stop.on.error = TRUE) # fail fast if no job id can be parsed from the sbatch output + # print(paste0("System thinks the jobid is: ", submitted_jobid)) + return(jobids) +} + +targets_sbatch_exec <- function(qsub_pattern, function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL) { + if (!is.character(function_artifact) || !is.character(args_artifact)) { + print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself") + return(FALSE) + } + slurm_output_file = paste0("slurm_command_", task_id, ".sh") + file_content = sbatch_header_standard(apptainer=apptainer) + if (!is.null(conda_env)) { + file_content = paste0(file_content, ' conda run -n ', conda_env, ' ') + } + if (!is.null(apptainer)) { + file_content = paste0(file_content, ' apptainer run ', apptainer) + } + + file_content = paste0(file_content, ' Rscript -e "library(targets)" -e "abstract_function=targets::tar_read(', function_artifact, ')" -e "abstract_args=targets::tar_read(', args_artifact, ')" -e "do.call(abstract_function, abstract_args)"') + writeLines(file_content, slurm_output_file) + out = system2("sbatch", slurm_output_file, stdout = TRUE, stderr = TRUE) + print(paste0(out)) + jobids = list() + # submitted_jobid = sub(pecan_settings$host$qsub.jobid, '\\1', out) + jobids[task_id] <- PEcAn.remote::qsub_get_jobid( + out = out[length(out)], + qsub.jobid = qsub_pattern, + stop.on.error = TRUE) # fail fast if no job id can be parsed from the sbatch output + # print(paste0("System thinks the jobid is: ", submitted_jobid)) + return(jobids) +} + +#' Targets Source-based SLURM Batch Execution +#' +#' Executes a function loaded via source() remotely via SLURM batch job with optional containerization. +#' +#' @param pecan_settings List containing PEcAn settings including host configuration. +#' @param function_artifact Character string specifying the name of the function within the node's calling namespace.
+#' @param args_artifact Character string specifying the name of the targets arguments object. +#' @param task_id Character string specifying the task identifier. +#' @param apptainer Character string specifying the Apptainer container path (optional). +#' @param dependencies Optional parameter for dependency tracking (unused). +#' @param conda_env Character string specifying the conda environment name (optional). +#' @param functional_source Optional character string giving the path to a file to be loaded via source(). +#' +#' @return Named list containing job IDs for the submitted SLURM jobs. +#' +#' @details +#' This function creates a SLURM batch script that executes a function remotely. +#' It supports both Apptainer containers and conda environments. The function_artifact must be a string +#' variable and the function specified must exist in the calling namespace on the compute node. The +#' args_artifact should be the string name of a previously-returned targets object (not the object itself). +#' The function generates a batch script, submits it via sbatch, and returns the job IDs. +#' +#' @examples +#' \dontrun{ +#' job_ids <- targets_abstract_args_sbatch_exec(pecan_settings, "my_func", "my_args", "task1") +#' } +#' +#' @export +targets_abstract_args_sbatch_exec <- function(pecan_settings, function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL, functional_source=NULL) { + # the biggest difference with this method of execution (sourcing the function file) is that the function is resolved at runtime on the node + # this means that targets sees the path to the file, but not the file contents + # we can therefore reference code outside the memory space of this R process (or any R process) + # but: targets doesn't see this code. if that code changes, or if it is user code that is unstable, targets won't know about it. + # returning the function which is called via the targets framework incorporates it into targets' smart re-evaluation + # that's the benefit. This approach is a little simpler, but works fine. 
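+ # Net effect on the generated script: the function body comes from source()
+ # of `functional_source` at run time on the node, while only the arguments
+ # travel through the targets store.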
+  if (!is.character(function_artifact) || !is.character(args_artifact)) {
+    print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself")
+    return(FALSE)
+  }
+  # Construct slurm batch file
+  slurm_output_file = paste0("slurm_command_", task_id, ".sh")
+  file_content = sbatch_header_standard(apptainer=apptainer)
+  if (!is.null(conda_env)) {
+    file_content = paste0(file_content, ' conda run -n ', conda_env, ' ')
+  }
+  if (!is.null(apptainer)) {
+    file_content = paste0(file_content, ' apptainer run ', apptainer)
+  }
+
+  file_content = paste0(file_content, ' Rscript -e "library(targets)" ')
+  if(!is.null(functional_source)){
+    file_content = paste0(file_content, '-e "source(\'', functional_source, '\')" ')
+  }
+  file_content = paste0(file_content, '-e "abstract_args=targets::tar_read(', args_artifact, ')" ')
+  file_content = paste0(file_content, '-e "do.call(', function_artifact,', abstract_args)"')
+  writeLines(file_content, slurm_output_file)
+
+  # Submit slurm batch file; leverages PEcAn.remote for monitoring
+  out = system2("sbatch", slurm_output_file, stdout = TRUE, stderr = TRUE)
+  print(paste0("Output from sbatch command is: ", out))
+  # print(paste0("System will use this pattern: ", pecan_settings$host$qsub.jobid ))
+  jobids = list()
+  # fail loudly if the job id cannot be parsed from the sbatch output
+  stop_on_error = TRUE
+  # submitted_jobid = sub(pecan_settings$host$qsub.jobid, '\\1', out)
+  jobids[[task_id]] <- PEcAn.remote::qsub_get_jobid(
+    out = out[length(out)],
+    qsub.jobid = pecan_settings$host$qsub.jobid,
+    stop.on.error = stop_on_error)
+  # print(paste0("System thinks the jobid is: ", submitted_jobid))
+  return(jobids)
+}
+
+#' Targets Based Containerized Local Execution
+#'
+#' Executes a targets function locally using a shell script, optionally inside an Apptainer container.
+#'
+#' @param pecan_settings List containing PEcAn settings (currently unused).
+#' @param function_artifact Character string specifying the name of the targets function object.
+#' @param args_artifact Character string specifying the name of the targets arguments object.
+#' @param task_id Character string specifying the task identifier.
+#' @param apptainer Character string specifying the Apptainer container path (optional).
+#' @param dependencies Optional parameter for dependency tracking (unused).
+#' @param conda_env Character string specifying the conda environment name (currently unused).
+#'
+#' @return Logical TRUE when execution completes.
+#'
+#' @details
+#' This function is the local execution equivalent of targets_abstract_sbatch_exec.
+#' It creates a shell script that executes a targets function locally and runs it via bash.
+#' The function_artifact and args_artifact should be the string names of targets objects.
+#'
+#' @examples
+#' \dontrun{
+#' result <- targets_based_containerized_local_exec(pecan_settings, "my_func", "my_args", "task1")
+#' }
+#'
+#' @export
+targets_based_containerized_local_exec <- function(pecan_settings, function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL) {
+  # this function is NOT silly. It allows us to execute code on the local node, but within an apptainer!
+  if (!is.character(function_artifact) || !is.character(args_artifact)) {
+    print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself")
+    return(FALSE)
+  }
+  local_output_file = paste0("local_command_", task_id, ".sh")
+  file_content=""
+  if (!is.null(apptainer)) {
+    file_content = paste0(file_content, ' apptainer run ', apptainer)
+  }
+  file_content = paste0(file_content, ' Rscript -e "library(targets)" -e "abstract_function=targets::tar_read(', function_artifact, ')" -e "abstract_args=targets::tar_read(', args_artifact, ')" -e "do.call(abstract_function, abstract_args)"')
+  writeLines(file_content, local_output_file)
+  system(paste0("bash ", local_output_file))
+  return(TRUE)
+}
+
+targets_sourcing_test_encapsulate <- function(func_name=NULL, string_to_print=NULL, task_id, targets_code_file_obj_name=NULL, apptainer=NULL, dependencies = NULL) {
+
+  local_output_file = paste0("local_command_", task_id, ".sh")
+  file_content=""
+  if (!is.null(apptainer)) {
+    file_content = paste0(file_content, ' apptainer run ', apptainer)
+  }
+
+  file_content = paste0(file_content, ' Rscript -e "library(targets)" ')
+
+  file_content = paste0(file_content, '-e "source(\'', targets_code_file_obj_name, '\')" ')
+
+  # capture the call's return value so the get_response branch has a function_result to print
+  file_content = paste0(file_content, '-e "function_result=', func_name, '(string_to_print=\'', string_to_print,'\')" ')
+  get_response=FALSE
+  if(get_response){
+    file_content = paste0(file_content, '-e "print(function_result)" ')
+    writeLines(file_content, local_output_file)
+    outcome=system(paste0("bash ", local_output_file), intern = TRUE)
+  }else{
+    writeLines(file_content, local_output_file)
+    outcome=system(paste0("bash ", local_output_file))
+  }
+
+  return(outcome)
+}
+
+
+targets_based_sourced_containerized_local_exec <- function(function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL, functional_source=NULL) {
+  # this function is NOT silly. It allows us to execute code on the local node, but within an apptainer!
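+  # Note: with get_response = TRUE below, system(..., intern = TRUE) captures the
+  # script's stdout, so the printed function_result comes back as a character vector;
+  # with get_response = FALSE only the shell exit status is returned.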
+ if (!is.character(function_artifact) || !is.character(args_artifact)) { + print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself") + return(FALSE) + } + local_output_file = paste0("local_command_", task_id, ".sh") + file_content="" + if (!is.null(apptainer)) { + file_content = paste0(file_content, ' apptainer run ', apptainer) + } + + file_content = paste0(file_content, ' Rscript -e "library(targets)" ') + if(!is.null(functional_source)){ + file_content = paste0(file_content, '-e "source(\'', functional_source, '\')" ') + } + file_content = paste0(file_content, '-e "abstract_args=targets::tar_read(', args_artifact, ')" ') + file_content = paste0(file_content, '-e "function_result=do.call(', function_artifact,', abstract_args)" ') + get_response=TRUE + if(get_response){ + file_content = paste0(file_content, '-e "print(function_result)" ') + writeLines(file_content, local_output_file) + outcome=system(paste0("bash ", local_output_file), intern = TRUE) + }else{ + writeLines(file_content, local_output_file) + outcome=system(paste0("bash ", local_output_file)) + } + + return(outcome) +} + + +step__run_model_2a <- function(pecan_settings = NULL, container = NULL, dependencies = NULL, use_abstraction=TRUE){ + list( + tar_target_raw( + "pecan_run_model_function", + quote(targets_function_abstraction(function_name = "run_model_2a")), + deps = dependencies + ), + tar_target_raw( + "pecan_run_model_arguments", + substitute( + targets_argument_abstraction( + argument_object = list( + settings = pecan_settings_raw + ) + ), + env = list(pecan_settings_raw = pecan_settings) + ), + deps = c(dependencies, "pecan_run_model_function") + ), + # run the abstracted function on the abstracted arguments via slurm + tar_target_raw( + "pecan_run_model_2a_job_submission", + substitute( + targets_abstract_sbatch_exec( + pecan_settings=pecan_settings_raw, + function_artifact="pecan_run_model_function", + args_artifact="pecan_run_model_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference_raw, + dependencies=c() + ), + env = list(pecan_settings_raw = pecan_settings, apptainer_reference_raw = NULL) + ), + deps = c(dependencies, "pecan_run_model_arguments") + ), + tar_target_raw( + "run_model_2a_job_outcome", + quote(pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_run_model_2a_job_submission)) + ) + ) +} + +run_model_2a <- function(settings = NULL){ + library(PEcAn.settings) + library(PEcAn.workflow) + library(PEcAn.logger) + library(PEcAn.uncertainty) + # Write model specific configs + stop_on_error = TRUE + PEcAn.workflow::runModule_start_model_runs(settings, + stop.on.error = stop_on_error) + + + # Get results of model runs + # this function is arguably too chatty, so we'll suppress + # INFO-level log output for this step. 
+ loglevel <- PEcAn.logger::logger.setLevel("WARN") + + PEcAn.uncertainty::runModule.get.results(settings) + + PEcAn.logger::logger.setLevel(loglevel) + + + # Run sensitivity analysis and variance decomposition on model output + runModule.run.sensitivity.analysis(settings) + + print("---------- PEcAn Workflow Complete ----------") + +} + +step__build_pecan_xml <- function(workflow_settings = NULL, template_file = NULL, dependencies = NULL){ + list( + tar_target_raw( + "pecan_build_xml_function", + quote(targets_function_abstraction(function_name = "build_pecan_xml")) + ), + tar_target_raw( + "pecan_build_xml_arguments", + quote(targets_argument_abstraction( + argument_object = list( + orchestration_xml = workflow_settings, + template_file = pecan_template_file, + dependencies = c("site_info_file", "IC_files", "pecan_template_file") + ) + )) + ), + tar_target_raw("pecan_xml_file", quote(pecan_xml_path), format = "file"), + tar_target_raw("pecan_settings", quote(PEcAn.settings::read.settings(pecan_xml_file))), + # run the abstracted function on the abstracted arguments via slurm + tar_target_raw( + "pecan_xml_build_job_submission", + quote(targets_abstract_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_build_xml_function", + args_artifact="pecan_build_xml_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference, + dependencies=c(pecan_build_xml_arguments) + )) + ), + tar_target_raw( + "build_xml_job_outcome", + quote(pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_xml_build_job_submission)) + ), + tar_target_raw("pecan_built_xml_file", quote("./pecan_built_config.xml"), format = "file", deps=c("build_xml_job_outcome")), + tar_target_raw("pecan_built_xml", quote(PEcAn.settings::read.settings(pecan_built_xml_file)), deps=c("pecan_built_xml_file")) + ) +} + +build_pecan_xml <- function(orchestration_xml = NULL, template_file = NULL, dependencies = NULL) { + library(PEcAn.settings) + + site_info <- read.csv(orchestration_xml$site.info.file) + stopifnot( + length(unique(site_info$id)) == nrow(site_info), + all(site_info$lat > 0), # just to simplify grid naming below + all(site_info$lon < 0) + ) + site_info <- site_info |> + dplyr::mutate( + # match locations to half-degree ERA5 grid cell centers + # CAUTION: Calculation only correct when all lats are N and all lons are W! + ERA5_grid_cell = paste0( + ((lat + 0.25) %/% 0.5) * 0.5, "N_", + ((abs(lon) + 0.25) %/% 0.5) * 0.5, "W" + ) + ) + + settings <- read.settings(template_file) |> + setDates(orchestration_xml$start.date, orchestration_xml$end.date) + + settings$ensemble$size <- orchestration_xml$n.ens + settings$run$inputs$poolinitcond$ensemble <- orchestration_xml$n.ens + + # Hack: setEnsemblePaths leaves all path components other than siteid + # identical across sites. 
+ # To use site-specific grid id, I'll string-replace each siteid + id2grid <- function(s) { + # replacing in place to preserve names (easier than thinking) + for (p in seq_along(s$run$inputs$met$path)) { + s$run$inputs$met$path[[p]] <- gsub( + pattern = s$run$site$id, + replacement = s$run$site$ERA5_grid_cell, + x = s$run$inputs$met$path[[p]] + ) + } + s + } + + settings <- settings |> + createMultiSiteSettings(site_info) |> + setEnsemblePaths( + n_reps = as.numeric(orchestration_xml$n.met), + input_type = "met", + path = orchestration_xml$met.dir, + d1 = orchestration_xml$start.date, + d2 = orchestration_xml$end.date, + # TODO use caladapt when ready + # path_template = "{path}/{id}/caladapt.{id}.{n}.{d1}.{d2}.nc" + path_template = "{path}/{id}/ERA5.{n}.{d1}.{d2}.clim" + ) |> + papply(id2grid) |> + setEnsemblePaths( + n_reps = as.numeric(orchestration_xml$n.ens), + input_type = "poolinitcond", + path = orchestration_xml$ic.dir, + path_template = "{path}/{id}/IC_site_{id}_{n}.nc" + ) + + # Hack: Work around a regression in PEcAn.uncertainty 1.8.2 by specifying + # PFT outdirs explicitly (even though they go unused in this workflow) + settings$pfts <- settings$pfts |> + lapply(\(x) { + x$outdir <- file.path(settings$outdir, "pfts", x$name) + x + }) + + write.settings( + settings, + outputfile = basename(orchestration_xml$output.xml), + outputdir = dirname(orchestration_xml$output.xml) + ) + + return(settings) +} + +step__build_ic_files <- function(workflow_settings = NULL, orchestration_settings = NULL, container = NULL, dependencies = NULL){ + list( + tar_target_raw( + "pecan_build_ic_files_function", + quote(targets_function_abstraction(function_name = "build_ic_files")), + deps = c(dependencies) + ), + tar_target_raw( + "pecan_build_ic_files_arguments", + substitute(targets_argument_abstraction( + argument_object = list( + orchestration_xml = workflow_settings_raw + ) + ), + env = list(workflow_settings_raw = workflow_settings) + ), + deps = c(dependencies) + ), + + # run the abstracted function on the abstracted arguments via slurm + tar_target_raw( + "pecan_build_ic_files_job_submission", + substitute(targets_sbatch_exec( + qsub_pattern=qsub_pattern_raw, + function_artifact="pecan_build_ic_files_function", + args_artifact="pecan_build_ic_files_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=container_raw + ), + env = list(container_raw = container, qsub_pattern_raw=orchestration_settings$orchestration$distributed.compute.adapter$qsub.jobid) + ), + deps = c("pecan_build_ic_files_arguments", dependencies) + ), + tar_target_raw( + "build_ic_files_job_outcome", + substitute(monitor_cluster_job(distribution_adapter=adapter_raw, job_id_list=pecan_build_ic_files_job_submission), + env = list( + adapter_raw=orchestration_settings$orchestration$distributed.compute.adapter + ) + ), + deps = c("pecan_build_ic_files_job_submission", dependencies) + ) + ) +} + +build_ic_files <- function(orchestration_xml = NULL){ + # adapted from CB's 02_ic_build.R + set.seed(6824625) + library(tidyverse) + + # Do parallel processing in separate R processes instead of via forking + # (without this the {furrr} calls inside soilgrids_soilC_extract + # were crashing for me. 
TODO check if this is machine-specific) + op <- options(parallelly.fork.enable = FALSE) + on.exit(options(op)) + + # if (!dir.exists(args$data_dir)) dir.create(args$data_dir, recursive = TRUE) + if (!dir.exists(orchestration_xml$data.dir)) dir.create(orchestration_xml$data.dir, recursive = TRUE) + + # split up comma-separated options + params_read_from_pft <- strsplit(orchestration_xml$params.from.pft, ",")[[1]] + landtrendr_raw_files <- strsplit(orchestration_xml$landtrendr.raw.files, ",")[[1]] + additional_params <- orchestration_xml$additional.params |> + str_match_all("([^=]+)=([^,]+),?") |> + _[[1]] |> + (\(x) setNames(as.list(x[, 3]), x[, 2]))() |> + as.data.frame() |> + mutate(across(starts_with("param"), as.numeric)) + + site_info <- read.csv( + orchestration_xml$site.info.file, + colClasses = c(field_id = "character") + ) + site_info$start_date <- orchestration_xml$start.date + site_info$LAI_date <- orchestration_xml$run_LAI.date + + + PEcAn.logger::logger.info("Getting estimated soil carbon from SoilGrids 250m") + # NB this takes several minutes to run + # csv filename is hardcoded by fn + soilc_csv_path <- file.path(orchestration_xml$data.dir, "soilgrids_soilC_data.csv") + if (file.exists(soilc_csv_path)) { + PEcAn.logger::logger.info("using existing soil C file", soilc_csv_path) + soil_carbon_est <- read.csv(soilc_csv_path, check.names = FALSE) + sites_needing_soilc <- site_info |> + filter(!id %in% soil_carbon_est$Site_ID) + } else { + soil_carbon_est <- NULL + sites_needing_soilc <- site_info + } + nsoilc <- nrow(sites_needing_soilc) + if (nsoilc > 0) { + PEcAn.logger::logger.info("Retrieving soil C for", nsoilc, "sites") + new_soil_carbon <- PEcAn.data.land::soilgrids_soilC_extract( + sites_needing_soilc |> select(site_id = id, site_name = name, lat, lon), + outdir = orchestration_xml$data.dir + ) + soil_carbon_est <- bind_rows(soil_carbon_est, new_soil_carbon) |> + arrange(Site_ID) + write.csv(soil_carbon_est, soilc_csv_path, row.names = FALSE) + } + + + + PEcAn.logger::logger.info("Soil moisture") + sm_outdir <- file.path(orchestration_xml$data.dir, "soil_moisture") |> + normalizePath(mustWork = FALSE) + sm_csv_path <- file.path(orchestration_xml$data.dir, "sm.csv") # name is hardcorded by fn + if (file.exists(sm_csv_path)) { + PEcAn.logger::logger.info("using existing soil moisture file", sm_csv_path) + soil_moisture_est <- read.csv(sm_csv_path) + sites_needing_soilmoist <- site_info |> + filter(!id %in% soil_moisture_est$site.id) + } else { + soil_moisture_est <- NULL + sites_needing_soilmoist <- site_info + } + nmoist <- nrow(sites_needing_soilmoist) + if (nmoist > 0) { + PEcAn.logger::logger.info("Retrieving soil moisture for", nmoist, "sites") + if (!dir.exists(sm_outdir)) dir.create(sm_outdir) + new_soil_moisture <- PEcAn.data.land::extract_SM_CDS( + site_info = sites_needing_soilmoist |> + dplyr::select(site_id = id, lat, lon), + time.points = as.Date(site_info$start_date[[1]]), + in.path = sm_outdir, + out.path = dirname(sm_csv_path), + allow.download = TRUE + ) + soil_moisture_est <- bind_rows(soil_moisture_est, new_soil_moisture) |> + arrange(site.id) + write.csv(soil_moisture_est, sm_csv_path, row.names = FALSE) + } + + PEcAn.logger::logger.info("LAI") + # Note that this currently creates *two* CSVs: + # - "LAI.csv", with values from each available day inside the search window + # (filename is hardcoded inside MODIS_LAI_PREP()) + # - this path, aggregated to one row per site + # TODO consider cleaning this up -- eg reprocess from LAI.csv on the fly? 
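+  # (Same incremental pattern as the soil C and soil moisture steps above: reuse any
+  # existing CSV, fetch only the sites that are missing, then append and rewrite.)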
+ lai_csv_path <- file.path(orchestration_xml$data.dir, "LAI_bysite.csv") + if (file.exists(lai_csv_path)) { + PEcAn.logger::logger.info("using existing LAI file", lai_csv_path) + lai_est <- read.csv(lai_csv_path, check.names = FALSE) # TODO edit MODIS_LAI_prep to use valid colnames? + sites_needing_lai <- site_info |> + filter(!id %in% lai_est$site_id) + } else { + lai_est <- NULL + sites_needing_lai <- site_info + } + nlai <- nrow(sites_needing_lai) + if (nlai > 0) { + PEcAn.logger::logger.info("Retrieving LAI for", nlai, "sites") + lai_res <- PEcAn.data.remote::MODIS_LAI_prep( + site_info = sites_needing_lai |> dplyr::select(site_id = id, lat, lon), + time_points = as.Date(site_info$LAI_date[[1]]), + outdir = orchestration_xml$data.dir, + export_csv = TRUE, + skip_download = FALSE + ) + lai_est <- bind_rows(lai_est, lai_res$LAI_Output) |> + arrange(site_id) + write.csv(lai_est, lai_csv_path, row.names = FALSE) + } + + + PEcAn.logger::logger.info("Aboveground biomass from LandTrendr") + + landtrendr_agb_outdir <- orchestration_xml$data.dir + + landtrendr_csv_path <- file.path( + landtrendr_agb_outdir, + "aboveground_biomass_landtrendr.csv" + ) + if (file.exists(landtrendr_csv_path)) { + PEcAn.logger::logger.info( + "using existing LandTrendr AGB file", + landtrendr_csv_path + ) + agb_est <- read.csv(landtrendr_csv_path) + sites_needing_agb <- site_info |> + filter(!id %in% agb_est$site_id) + } else { + agb_est <- NULL + sites_needing_agb <- site_info + } + nagb <- nrow(sites_needing_agb) + if (nagb > 0) { + PEcAn.logger::logger.info("Retrieving aboveground biomass for", nagb, "sites") + lt_med_path <- grep("_median.tif$", landtrendr_raw_files, value = TRUE) + lt_sd_path <- grep("_stdv.tif$", landtrendr_raw_files, value = TRUE) + stopifnot( + all(file.exists(landtrendr_raw_files)), + length(lt_med_path) == 1, + length(lt_sd_path) == 1 + ) + lt_med <- terra::rast(lt_med_path) + lt_sd <- terra::rast(lt_sd_path) + field_shp <- terra::vect(orchestration_xml$field.shape.path) + + site_bnds <- field_shp[field_shp$UniqueID %in% sites_needing_agb$field_id, ] |> + terra::project(lt_med) + + # Check for unmatched sites + # TODO is stopping here too strict? Could reduce to warning if needed + stopifnot(all(sites_needing_agb$field_id %in% site_bnds$UniqueID)) + + new_agb <- lt_med |> + terra::extract(x = _, y = site_bnds, fun = mean, bind = TRUE) |> + terra::extract(x = lt_sd, y = _, fun = mean, bind = TRUE) |> + as.data.frame() |> + left_join(sites_needing_agb, by = c("UniqueID" = "field_id")) |> + dplyr::select( + site_id = id, + AGB_median_Mg_ha = ends_with("median"), + AGB_sd = ends_with("stdv") + ) |> + mutate(across(where(is.numeric), \(x) signif(x, 5))) + agb_est <- bind_rows(agb_est, new_agb) |> + arrange(site_id) + write.csv(agb_est, landtrendr_csv_path, row.names = FALSE) + } + + # --------------------------------------------------------- + # Great, we have estimates for some variables. + # Now let's make IC files! + + PEcAn.logger::logger.info("Building IC files") + + + initial_condition_estimated <- dplyr::bind_rows( + soil_organic_carbon_content = soil_carbon_est |> + dplyr::select( + site_id = Site_ID, + mean = `Total_soilC_0-30cm`, + sd = `Std_soilC_0-30cm` + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + SoilMoistFrac = soil_moisture_est |> + dplyr::select( + site_id = site.id, + mean = sm.mean, + sd = sm.uncertainty + ) |> + # Note that we pass this as a percent -- yes, Sipnet wants a fraction, + # but write.configs.SIPNET hardcodes a division by 100. 
+ # TODO consider modifying write.configs.SIPNET + # to not convert when 0 > SoilMoistFrac > 1 + dplyr::mutate( + lower_bound = 0, + upper_bound = 100 + ), + LAI = lai_est |> + dplyr::select( + site_id = site_id, + mean = ends_with("LAI"), + sd = ends_with("SD") + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + AbvGrndBiomass = agb_est |> # NB this assumes AGB ~= AGB woody + dplyr::select( + site_id = site_id, + mean = AGB_median_Mg_ha, + sd = AGB_sd + ) |> + dplyr::mutate(across( + c("mean", "sd"), + ~ PEcAn.utils::ud_convert(.x, "Mg ha-1", "kg m-2") + )) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + .id = "variable" + ) + write.csv( + initial_condition_estimated, + file.path(orchestration_xml$data.dir, "IC_means.csv"), + row.names = FALSE + ) + + + + # read params from PFTs + + sample_distn <- function(varname, distn, parama, paramb, ..., n) { + if (distn == "exp") { + samp <- rexp(n, parama) + } else { + rfn <- get(paste0("r", distn)) + samp <- rfn(n, parama, paramb) + } + + data.frame(samp) |> + setNames(varname) + } + + sample_pft <- function(path, + vars = params_read_from_pft, + n_samples = orchestration_xml$ic.ensemble.size) { + e <- new.env() + load(file.path(path, "post.distns.Rdata"), envir = e) + e$post.distns |> + tibble::rownames_to_column("varname") |> + dplyr::select(-"n") |> # this is num obs used in posterior; conflicts with n = ens size when sampling + dplyr::filter(varname %in% vars) |> + dplyr::bind_rows(additional_params) |> + purrr::pmap(sample_distn, n = n_samples) |> + purrr::list_cbind() |> + tibble::rowid_to_column("replicate") + } + + pft_var_samples <- site_info |> + mutate(pft_path = file.path(orchestration_xml$pft.dir, site.pft)) |> + nest_by(id) |> + mutate(samp = purrr::map(data$pft_path, sample_pft)) |> + unnest(samp) |> + dplyr::select(-"data") |> + dplyr::rename(site_id = id) + + + ic_sample_draws <- function(df, n = 100, ...) 
{ + stopifnot(nrow(df) == 1) + data.frame( + replicate = seq_len(n), + sample = truncnorm::rtruncnorm( + n = n, + a = df$lower_bound, + b = df$upper_bound, + mean = df$mean, + sd = df$sd + ) + ) + } + + ic_samples <- initial_condition_estimated |> + dplyr::filter(site_id %in% site_info$id) |> + dplyr::group_by(site_id, variable) |> + dplyr::group_modify(ic_sample_draws, n = as.numeric(orchestration_xml$ic.ensemble.size)) |> + tidyr::pivot_wider(names_from = variable, values_from = sample) |> + dplyr::left_join(pft_var_samples, by = c("site_id", "replicate")) |> + dplyr::mutate( + AbvGrndWood = AbvGrndBiomass * wood_carbon_fraction, + leaf_carbon_content = tidyr::replace_na(LAI, 0) / SLA * (leafC / 100), + wood_carbon_content = pmax(AbvGrndWood - leaf_carbon_content, 0) + ) + + ic_names <- colnames(ic_samples) + std_names <- c("site_id", "replicate", PEcAn.utils::standard_vars$Variable.Name) + nonstd_names <- ic_names[!ic_names %in% std_names] + if (length(nonstd_names) > 0) { + PEcAn.logger::logger.debug( + "Not writing these nonstandard variables to the IC files:", nonstd_names + ) + ic_samples <- ic_samples |> dplyr::select(-any_of(nonstd_names)) + } + + file.path(orchestration_xml$ic.outdir, site_info$id) |> + unique() |> + purrr::walk(dir.create, recursive = TRUE) + + ic_samples |> + dplyr::group_by(site_id, replicate) |> + dplyr::group_walk( + ~ PEcAn.SIPNET::veg2model.SIPNET( + outfolder = file.path(orchestration_xml$ic.outdir, .y$site_id), + poolinfo = list( + dims = list(time = 1), + vals = .x + ), + siteid = .y$site_id, + ens = .y$replicate + ) + ) + + PEcAn.logger::logger.info("IC files written to", orchestration_xml$ic.outdir) + PEcAn.logger::logger.info("Done") +} + +check_directory_exists <- function(directory_path, stop_on_nonexistent=FALSE) { + if (!dir.exists(directory_path)) { + if (stop_on_nonexistent) { + print(paste0("Directory: ", directory_path, " doesn't exist.")) + stop("This path is required to proceed. 
Exiting.") + } + return(FALSE) + } + return(TRUE) +} + + +workflow_run_directory_setup <- function(orchestration_settings = NULL, workflow_name = NULL) { + workflow_run_directory = orchestration_settings$orchestration$workflow.base.run.directory + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + run_identifier = workflow_settings$run.identifier + + if(is.null(workflow_run_directory)){ + stop("Cannot continue without a workflow run directory - check XML configuration.") + } + if (!dir.exists(workflow_run_directory)) { + dir.create(workflow_run_directory, recursive = TRUE) + } + analysis_run_id = paste0("analysis_run_", uuid::UUIDgenerate() ) + if (is.null(run_identifier)) { + print(paste("Analysis run id specified:", analysis_run_id)) + } else { + print(paste("Analysis run id specified:", run_identifier)) + analysis_run_id = run_identifier + } + analysis_run_directory = file.path(workflow_run_directory, analysis_run_id) + if (!check_directory_exists(analysis_run_directory, stop_on_nonexistent=FALSE)) { + dir.create(analysis_run_directory, recursive = TRUE) + } + return(list(run_dir=analysis_run_directory, run_id=analysis_run_id)) +} + + +parse_orchestration_xml <- function(orchestration_xml_path=NULL) { + if(is.null(orchestration_xml_path)){ + stop("must provide orchestration XML path for parsing.") + } + orchestration_xml = XML::xmlParse(orchestration_xml_path) + orchestration_xml <- XML::xmlToList(orchestration_xml) + return(orchestration_xml) +} + +check_orchestration_keys = function(orchestration_xml = NULL, key_list = NULL, required=TRUE){ + missing_values=FALSE + for(key in key_list){ + if(key %in% names(orchestration_xml)){ + # warning(paste0("Found key: ", key)) + }else{ + missing_values=TRUE + } + } + if (missing_values && required) { + stop("One or more needed keys are not present in orchestration configuration. Please see prior warnings.") + } else if (missing_values) { + return(FALSE) + } + return(TRUE) +} + +#' @title Example target factory. +#' @description Define 3 targets: +#' 1. Track the user-supplied data file. +#' 2. Read the data using `read_data()` (defined elsewhere). +#' 3. Fit a model to the data using `fit_model()` (defined elsewhere). +#' @return A list of target objects. +#' @export +#' @param file Character, data file path. +# apptainer_factory <- function(orchestration_settings, workflow_name) { +apptainer_can_download <- function(apptainer_xml = NULL) { + if(check_orchestration_keys(orchestration_xml = apptainer_xml, key_list = c("sif", "remote.url", "container.name", "tag"), required=FALSE)){ + # print("Missing required parameters in configuration to download apptainer. Required keys under apptainer: url, name, tag, sif") + return(TRUE) + }else{ + return(FALSE) + } +} + +apptainer_can_link <- function(source_directory = NULL, apptainer_xml = NULL) { + if(check_orchestration_keys(orchestration_xml = apptainer_xml, key_list = c("sif"), required=FALSE)){ + if(!is.null(source_directory) && file.exists(file.path(paste0(source_directory, "/",apptainer_xml$sif)))){ + return(TRUE) + } + } + return(FALSE) +} + +step__resolve_apptainer <- function(apptainer_source_directory=NULL, workflow_xml=NULL) { + # Strictly speaking, this argument munging is not necessary. 
+step__resolve_apptainer <- function(apptainer_source_directory=NULL, workflow_xml=NULL) {
+  # Strictly speaking, this argument munging is not necessary. The quote()'ed expressions below
+  # are returned to the calling targets pipeline as-is - unevaluated.
+  # This means the variables passed in are not used immediately; they aren't evaluated until runtime,
+  # so they aren't even bound until this step is evaluated within the calling namespace.
+  apptainer_settings = workflow_xml$apptainer
+  link = apptainer_can_link(source_directory=apptainer_source_directory, apptainer_xml=apptainer_settings)
+  download = apptainer_can_download(apptainer_xml=apptainer_settings)
+  system("module load apptainer")
+  if(link){
+    print("Attempting to link apptainer SIF.")
+    list(
+      tar_target_raw(
+        "apptainer_reference",
+        reference_external_data_entity(
+          external_workflow_directory=substitute(apptainer_source_value, env = list(apptainer_source_value = apptainer_source_directory)),
+          external_name=substitute(raw_apptainer_sif, env = list(raw_apptainer_sif = apptainer_settings$sif)),
+          localized_name=substitute(raw_apptainer_sif, env = list(raw_apptainer_sif = apptainer_settings$sif))
+        )
+      )
+    )
+  }else if(download){
+    print("Attempting to download apptainer.")
+    list(
+      tar_target_raw(
+        "apptainer_reference",
+        pull_apptainer_container(
+          apptainer_url_base=substitute(raw_apptainer_url, env = list(raw_apptainer_url = workflow_xml$apptainer$remote.url)),
+          apptainer_image_name=substitute(raw_apptainer_name, env = list(raw_apptainer_name = workflow_xml$apptainer$container.name)),
+          apptainer_tag=substitute(raw_apptainer_tag, env = list(raw_apptainer_tag = workflow_xml$apptainer$tag)),
+          apptainer_disk_sif=substitute(raw_apptainer_sif, env = list(raw_apptainer_sif = workflow_xml$apptainer$sif))
+        )
+      )
+    )
+  }else{
+    print(workflow_xml)
+    stop("Failed to resolve apptainer - could not link or download container. Please check configuration XML.")
+  }
+}
+
+step__link_data_by_name <- function(workflow_data_source_directory = NULL, target_artifact_names = c(), localized_name_list = c(), external_name_list = c()){
+  target_list = list()
+  if((length(localized_name_list) != length(target_artifact_names)) || (length(localized_name_list) != length(external_name_list))){
+    stop("Cannot link internal names to external link targets with unequal length lists")
+  }
+  for(i in seq_along(localized_name_list)){
+    target_list = append(target_list,
+                         tar_target_raw(substitute(target_name, env = list(target_name = target_artifact_names[i])),
+                                        reference_external_data_entity(
+                                          external_workflow_directory=substitute(raw_data_source, env = list(raw_data_source = workflow_data_source_directory)),
+                                          external_name=substitute(external_name, env = list(external_name = external_name_list[i])),
+                                          localized_name=substitute(localized_name, env = list(localized_name = localized_name_list[i]))
+                                        )
+                         )
+    )
+  }
+  # print(target_list)
+  target_list
+}
+
+step__resolve_data_routing <- function(workflow_data_source_directory = NULL, target_artifact_names = c(), localized_name_list = c(), external_name_list = c(), action_list = c()){
+  target_list = list()
+  if((length(localized_name_list) != length(target_artifact_names)) || (length(localized_name_list) != length(external_name_list))){
+    stop("Cannot link internal names to external link targets with unequal length lists")
+  }
+  for(i in seq_along(localized_name_list)){
+    target_list = append(target_list,
+                         tar_target_raw(substitute(target_name, env = list(target_name = target_artifact_names[i])),
+                                        resolve_data_routing(
+                                          external_workflow_directory=substitute(raw_data_source, env = list(raw_data_source = workflow_data_source_directory)),
+                                          external_name=substitute(external_name, env = list(external_name = external_name_list[i])),
+
localized_name=substitute(localized_name, env = list(localized_name = localized_name_list[i])), + action=substitute(action, env = list(action = action_list[i])) + ) + ) + ) + } + # print(target_list) + target_list +} + + +step__run_distributed_write_configs <- function(pecan_settings=NULL, container=NULL, use_abstraction=TRUE, dependencies = NULL) { + # note on substitution: when substitutions are needed inside of functions that must also be quoted, + # the solution is to expand the captured expression which has substitutions and to do all subs at once + if(use_abstraction){ + list( + tar_target_raw( + "pecan_write_configs_function", + quote(targets_function_abstraction(function_name = "pecan_write_configs")), + deps = dependencies + ), + # create the abstraction of the pecan write configs arguments + tar_target_raw( + "pecan_write_configs_arguments", + substitute( + targets_argument_abstraction(argument_object = list(pecan_settings=raw_pecan_settings, xml_file=raw_pecan_xml)), + env = list(raw_pecan_settings = pecan_settings, raw_pecan_xml = pecan_settings) + ), + deps = dependencies + ), + tar_target_raw( + "pecan_settings_job_submission", + substitute(targets_abstract_sbatch_exec( + pecan_settings=raw_pecan_settings, + function_artifact="pecan_write_configs_function", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=raw_apptainer, + dependencies=c(pecan_continue) + ), env=list(raw_pecan_settings = pecan_settings, raw_apptainer = container)), + deps = dependencies + ), + tar_target_raw( + "settings_job_outcome", + quote(pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission)) + ) + ) + }else{ + list( + tar_target_raw( + "pecan_write_configs_arguments", + quote(targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file))) + ), + tar_target_raw( + "pecan_settings_job_submission", + quote( + targets_abstract_args_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + functional_source=function_sourcefile, + apptainer=apptainer_reference, + dependencies=c(pecan_continue) + ) + ) + ), + tar_target_raw( + "settings_job_outcome", + quote(pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission)) + ) + ) + } +} + +step__create_clim_files <- function(pecan_settings=NULL, container=NULL, workflow_settings=NULL, dependencies = NULL, reference_path=NULL, data_raw=NULL, site_info=NULL) { + site_sipnet_met_path <- normalizePath(workflow_settings$site.sipnet.met.path, mustWork = FALSE) + list( + tar_target_raw( + "era5_site_combinations", + substitute( + build_era5_site_combinations_args( + site_info_file = site_info_file_raw, + start_date = start_date_raw, + end_date = end_date_raw, + reference_path = reference_era5_path_raw, + sipnet_met_path = site_sipnet_met_path_raw, + dependencies = c() + ), + env = list( + site_sipnet_met_path_raw = site_sipnet_met_path, + reference_era5_path_raw = reference_path, + site_info_file_raw = site_info, + start_date_raw = workflow_settings$start.date, + end_date_raw = workflow_settings$end.date + ) + ), + deps = substitute(raw_dependencies, env = list(raw_dependencies = dependencies)) + ), + tar_target_raw( + "era5_clim_create_args", + substitute( + targets_argument_abstraction( + argument_object = list( + site_combinations = era5_site_combinations, + site_era5_path = 
reference_era5_path_raw, + site_sipnet_met_path = site_sipnet_met_path_raw, + n_workers = 1, + dependencies=c() + ) + ), + env = list( + site_sipnet_met_path_raw = site_sipnet_met_path, + reference_era5_path_raw = reference_path + ) + ), + deps = c("era5_site_combinations", dependencies) + ), + tar_target_raw( + "era5_clim_output", + substitute( + targets_abstract_args_sbatch_exec( + pecan_settings=pecan_settings_raw, + function_artifact="convert_era5_nc_to_clim", + args_artifact="era5_clim_create_args", + task_id=uuid::UUIDgenerate(), + apptainer= apptainer_reference_raw, + dependencies = era5_clim_create_args, + functional_source = function_sourcefile + ), + env = list( + pecan_settings_raw = pecan_settings, + apptainer_reference_raw = container + ) + ), + deps = c("era5_clim_create_args", dependencies) + ), + tar_target_raw( + "era5_clim_conversion", + substitute( + pecan_monitor_cluster_job( + pecan_settings=pecan_settings_raw, + job_id_list=era5_clim_output + ), + env = list( + pecan_settings_raw = pecan_settings + ) + ), + deps = c("era5_clim_output", dependencies) + ) + ) +} + +step__run_pecan_workflow <- function() { + list( + tar_target_raw( + "ecosystem_settings", + quote(pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome))) + ), + tar_target_raw( + "model_results_settings", + quote(pecan_get_model_results(pecan_settings=ecosystem_settings)) + ), + tar_target_raw( + "ensembled_results_settings", ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel + quote(pecan_run_ensemble_analysis(pecan_settings=model_results_settings)) + ), + tar_target_raw( + "sensitivity_settings", + quote(pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings)) + ), + tar_target_raw( + "complete_settings", + quote(pecan_workflow_complete(pecan_settings=sensitivity_settings)) + ) + ) +} diff --git a/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R b/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R new file mode 100644 index 0000000..77c0fc3 --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R @@ -0,0 +1,104 @@ +library(targets) +library(tarchetypes) +library(PEcAn.settings) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + + +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("An Orchestration settings XML must be provided via --settings.") +} +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. +# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. 
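+# (For example, re-running this script against an existing run directory with an
+# unchanged pipeline should report each target as skipped rather than rebuilding it.)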
+
+workflow_name = "workflow.data.prep.1"
+
+settings_path = normalizePath(file.path(args$settings))
+settings = XML::xmlToList(XML::xmlParse(args$settings))
+
+workflow_function_source = file.path(settings$orchestration$functions.source)
+workflow_function_path = normalizePath(workflow_function_source)
+source(workflow_function_source)
+
+# hopefully we can find a more elegant way to do this
+pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path))
+
+ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name)
+
+analysis_run_directory = ret_obj$run_dir
+run_id = ret_obj$run_id
+
+message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory))
+
+setwd(analysis_run_directory)
+tar_config_set(store = "./")
+tar_script_path <- file.path("./executed_pipeline.R")
+
+#### Pipeline definition ####
+tar_script({
+  library(targets)
+  library(tarchetypes)
+  library(uuid)
+  library(XML)
+
+  function_sourcefile = "@FUNCTIONPATH@"
+  tar_source(function_sourcefile)
+
+  orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@")
+  pecan_xml_path = "@PECANXMLPATH@"
+  workflow_name = "@WORKFLOWNAME@"
+  workflow_settings = orchestration_settings$orchestration[[workflow_name]]
+  base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory
+  if (is.null(workflow_settings)) {
+    stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", workflow_name))
+  }
+
+  ccmmf_data_tarball_url = workflow_settings$ccmmf.data.s3.url
+  ccmmf_data_filename = workflow_settings$ccmmf.data.tarball.filename
+  run_identifier = workflow_settings$run.identifier
+
+  tar_option_set(
+    packages = c("readr", "dplyr"),
+    imports = c()
+  )
+  list(
+    # source data handling
+    tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)),
+    tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())),
+    tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE))
+  )
+}, ask = FALSE, script = tar_script_path)
+
+# because tar_make executes the script in a separate process based on the created workflow directory,
+# in order to parametrize the workflow script we first create placeholders, and then, below, replace them with actual values.
+# if we simply placed the variables in the script definition above, they would be evaluated at the time the script is executed by tar_make();
+# that execution takes place in a different process + memory space, in which those variables are not accessible.
+# so, we create the execution script, and then text-edit in the parameters.
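+# e.g. (illustrative): after substitution, the generated script line
+#   function_sourcefile = "@FUNCTIONPATH@"
+# becomes
+#   function_sourcefile = "<absolute path to tools/workflow_functions.R>"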
+# Read the generated script and replace placeholders with actual file paths +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) +writeLines(script_content, tar_script_path) + +#### workflow execution #### +# this changes the cwd to the designated tar store +tar_make(script = tar_script_path) diff --git a/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml b/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml new file mode 100644 index 0000000..2150d85 --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml @@ -0,0 +1,19 @@ + + + + /project/60007/hpriest/data/workflow_runs_devel + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + diff --git a/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml b/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml new file mode 100644 index 0000000..a3fac68 --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml @@ -0,0 +1,19 @@ + + + + /project/60007/hpriest/data/workflow_runs + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_latest.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + diff --git a/workflow_examples/01_simple_data_workflow/01_pecan_config_devel.xml b/workflow_examples/01_simple_data_workflow/01_pecan_config_devel.xml new file mode 100644 index 0000000..da59ce5 --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/01_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + 
data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + 
IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/01_simple_data_workflow/01_pecan_config_latest.xml b/workflow_examples/01_simple_data_workflow/01_pecan_config_latest.xml new file mode 100644 index 0000000..640a6c2 --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/01_pecan_config_latest.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + 
IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + 
IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/01_simple_data_workflow/README.md b/workflow_examples/01_simple_data_workflow/README.md new file mode 100644 index 0000000..e44d1c9 --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/README.md @@ -0,0 +1,294 @@ +# Simple Data Workflow Example + +This example demonstrates a **simple data preparation workflow** that downloads and extracts CCMMF data artifacts from S3 storage. This is the foundational workflow that subsequent workflows can reference. + +## Overview + +This workflow showcases: +1. **Configuration-driven workflows** using XML settings +2. **Data artifact management** with automatic download and extraction from S3 +3. **Reproducible execution** with unique run identifiers +4. **Smart re-evaluation** using the targets framework + +## Key Files + +- `01_data_prep_workflow.R` - Main workflow script +- `01_pecan_workflow_config_example.xml` - Configuration file + +## Workflow Script Breakdown + +### Section 1: Workflow setup & settings parsing + +```r +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() +settings <- PEcAn.settings::read.settings(args$settings) + +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. +# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. + +this_workflow_name = "workflow.data.prep.1" + +#### Primary workflow settings parsing #### +## overall run directory for common collection of workflow artifacts +workflow_run_directory = settings$orchestration$workflow.base.run.directory + +## settings and params for this workflow +workflow_settings = settings$orchestration[[this_workflow_name]] +workflow_function_source = settings$orchestration$functions.source +source(workflow_function_source) + +pecan_xml_path = workflow_settings$pecan.xml.path +ccmmf_data_tarball_url = workflow_settings$ccmmf.data.s3.url +ccmmf_data_filename = workflow_settings$ccmmf.data.tarball.filename +run_identifier = workflow_settings$run.identifier +``` + +**Purpose**: + +This set-up section brings in standard command line arguments, and extracts the orchestration settings for this workflow via the workflow name. + +The content here binds into the XML configuration file. 
The workflow name is a particularly useful field, as it can be used to easily switch to a different configuration stanza, while keeping the remainder of the workflow set-up identical. + + This section also identifies the base workflow run directory - this is a critical field, as subsequent data references look in this directory by default for data sourcing. + + It also extracts the data source configuration parameters: + - The S3 URL where the CCMMF data tarball is hosted + - The specific filename to download + - A run identifier for this workflow execution + + The workflow name (`workflow.data.prep.1`) identifies this as the foundational data preparation step that subsequent workflows will reference. + + The comment block early in this section documents the smart re-evaluation behavior of the targets framework, which will only re-run pipeline steps if inputs or code have changed. + + + --- + + ### Section 2: Path Normalization and Run Directory Setup + + ```r + # TODO: input parameter validation and defense + + #### Handle input parameters parsed from settings file #### + #### workflow prep #### + function_path = normalizePath(file.path(workflow_function_source)) + pecan_xml_path = normalizePath(file.path(pecan_xml_path)) + + if (!dir.exists(workflow_run_directory)) { + dir.create(workflow_run_directory, recursive = TRUE) + } + workflow_run_directory = normalizePath(workflow_run_directory) + + if (is.null(run_identifier)) { + run_id = uuid::UUIDgenerate() + } else { + print(paste("Run id specified:", run_identifier)) + run_id = run_identifier + } + + this_run_directory = file.path(workflow_run_directory, run_id) + if (!dir.exists(this_run_directory)) { + dir.create(this_run_directory, recursive = TRUE) + } + ``` + + **Purpose**: Sets up the workflow execution environment and run directory structure. + + The paths to the workflow functions and PEcAn XML are normalized to ensure absolute paths, which is critical for reliability across different working directories. + + The base workflow run directory is created if it doesn't exist. This directory serves as the root for all workflow runs and is where subsequent workflows will look for data artifacts. + + A run identifier is either generated (using UUID) or taken from the configuration. This identifier will be used by other workflows to reference the data produced by this workflow execution. + + Finally, a specific run directory is created for this workflow instance. This directory will contain all artifacts produced by this execution, including the downloaded and extracted data. + + + --- + + ### Section 3: Pipeline Definition and Setup + + ```r + print(paste("Starting workflow run in directory:", this_run_directory)) + setwd(this_run_directory) + tar_config_set(store = "./") + tar_script_path = file.path("./executed_pipeline.R") + + #### Pipeline definition #### + tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + + ccmmf_data_tarball_url = "@CCMMFDATAURL@" + ccmmf_data_filename = "@CCMMFDATAFILENAME@" + tar_source("@FUNCTIONPATH@") + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), + imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + ) + ``` + + **Purpose**: Sets up the initial pipeline runtime environment.
+ + - Changes working directory to the specific run directory + - Configures the targets store to be in the current directory + - Defines the path for the generated pipeline script file + + The tar_script block sets up the pipeline definition with placeholder values (marked with `@...@`) that will be replaced with actual configuration values in a later step. These placeholders are necessary because tar_make executes the script in a separate process without access to the current R environment's variables. + + The required R packages for PEcAn workflows are specified, and necessary PEcAn modules are imported. + + + --- + + ### Section 4: Pipeline Targets Definitions + + ```r + list( + # source data handling + tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), + tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), + tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)) + ) + }, ask = FALSE, script = tar_script_path) + ``` + + **Purpose**: Defines the three targets that constitute this workflow's data preparation pipeline. + + The first target, `ccmmf_data_tarball`, downloads the data tarball from S3 using the CCMMF data access function. This function uses AWS CLI to access the S3-compatible storage. The tarball is downloaded to the targets store directory. + + The second target, `workflow_data_paths`, extracts the tarball contents to the targets store. This extraction happens automatically whenever the tarball is downloaded or updated. + + The third target, `obtained_resources_untar`, lists the extracted files. This serves as verification that the extraction was successful and also provides a record of what files were extracted. + + All data is stored in the targets store directory using the `tar_path_store()` function. This ensures that all workflow artifacts are managed by the targets framework, enabling smart re-evaluation and dependency tracking. + + + --- + + ### Section 5: Script Post-Processing and Execution + + ```r + # because tar_make executes the script in a separate process based on the created workflow directory, + # in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values. + # if we simply place the variables in the script definition above, they are evaluated at the time the script is executed by tar_make() + # that execution takes place in a different process + memory space, in which those variables are not accessible. + # so, we create the execution script, and then text-edit in the parameters. + # Read the generated script and replace placeholders with actual file paths + script_content <- readLines(tar_script_path) + script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) + script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) + script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) + writeLines(script_content, tar_script_path) + ``` + + **Purpose**: + - Replaces all placeholder values with actual paths and values + - Writes the final pipeline script + + ```r + #### workflow execution #### + # this changes the cwd to the designated tar store + tar_make(script = tar_script_path) + ``` + + This line executes the pipeline script in the workflow run directory; the sketch below shows how to inspect the finished run.
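+
+Once `tar_make()` returns, the finished run can be inspected from within the run directory using standard targets accessors. A brief sketch (target names as defined in Section 4; the store is the current directory, as configured above):
+
+```r
+library(targets)
+
+# per-target status from the most recent tar_make()
+tar_progress()
+
+# the file listing recorded by the verification target
+extracted_files <- tar_read(obtained_resources_untar)
+head(extracted_files)
+
+# smart re-evaluation check: returns character(0) when nothing would re-run
+tar_outdated(script = "executed_pipeline.R")
+```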
+ +The comment block explains why the placeholder replacement approach is necessary: tar_make executes in a separate process without access to the current R environment's variables. By using string replacement on the generated script, we can inject the actual configuration values before execution. + +The final call to `tar_make()` triggers the execution of the complete workflow pipeline, which will download and extract the data, or use cached results if the pipeline has been run previously with the same inputs. + + +## Key Concepts Demonstrated + +### 1. Configuration-Driven Workflows +The XML configuration separates workflow orchestration from execution logic, enabling: +- Easy modification of data sources without code changes +- Reusable workflow templates +- Clear documentation of workflow parameters + +### 2. Data Artifact Management +- Automatic download from remote S3 storage +- Organized storage in workflow run directories +- Complete provenance tracking through the targets framework + +### 3. Reproducible Execution +- Unique run identifiers prevent conflicts +- Complete isolation of workflow runs +- Full audit trail of data origins + +### 4. Smart Re-evaluation +The targets framework ensures: +- Only changed components are re-executed +- Efficient use of disk space (shared data references) +- Automatic dependency resolution + +### 5. Foundation for Workflow Composition +This workflow provides data artifacts that can be referenced by subsequent workflows using run identifiers, enabling: +- Clear dependency chains between workflows +- Data reuse across multiple analyses +- Separation of data preparation from analysis + +## Workflow Sequence + +This workflow is the first in the sequence: + +``` +Workflow 01: Data Preparation (This workflow) + ↓ (provides data artifacts) +Workflow 02: Container Setup & Configuration + ↓ (uses data from 01) +Workflow 03: Model Execution & Analysis +``` + +## Usage + +```bash +Rscript 01_data_prep_workflow.R --settings 01_pecan_workflow_config_example.xml +``` + +## Dependencies + +- R packages: `targets`, `tarchetypes`, `PEcAn.all`, `optparse`, `uuid` +- AWS CLI configured for S3 access with CCMMF credentials +- Access to CCMMF S3 storage endpoint at `s3.garage.ccmmf.ncsa.cloud` + +## Output + +This workflow produces: +- Downloaded data tarball: `00_cccmmf_phase_1a_input_artifacts.tgz` +- Extracted data files in subdirectories: + - `data/` - Meteorological data files + - `IC_files/` - Initial condition files + - `pfts/` - Plant functional type files +- Complete workflow execution history and metadata in targets store +- Executed pipeline script: `executed_pipeline.R` + +## Next Steps + +After running this workflow successfully: +1. Note the run identifier (e.g., `data_prep_run_01`) for use in subsequent workflows +2. Examine the extracted data artifacts in the run directory +3. Use this workflow's output as input to workflow 02 (container setup) +4. Build more complex workflows that depend on this data preparation step +5. 
Iterate with smart re-evaluation by modifying data sources or workflow parameters diff --git a/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml b/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml new file mode 100644 index 0000000..13b5f50 --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml @@ -0,0 +1,30 @@ + + + + /project/60007/hpriest/data/workflow_runs_devel + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_devel.sif + + ./02_pecan_config_devel.xml + + + diff --git a/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml b/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml new file mode 100644 index 0000000..322d1db --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml @@ -0,0 +1,30 @@ + + + + /project/60007/hpriest/data/workflow_runs + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_latest.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + latest + sipnet-carb_latest.sif + + ./02_pecan_config_latest.xml + + + diff --git a/workflow_examples/02_referencing_data_workflow/02_pecan_config_devel.xml b/workflow_examples/02_referencing_data_workflow/02_pecan_config_devel.xml new file mode 100644 index 0000000..da59ce5 --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + 
IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + 
IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/02_referencing_data_workflow/02_pecan_config_latest.xml b/workflow_examples/02_referencing_data_workflow/02_pecan_config_latest.xml new file mode 100644 index 0000000..640a6c2 --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_pecan_config_latest.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + 
IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc 
+ + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R new file mode 100644 index 0000000..65cb1cf --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R @@ -0,0 +1,122 @@ +library(targets) +library(tarchetypes) +library(PEcAn.settings) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML" + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("An Orchestration settings XML must be provided via --settings.") +} + + +workflow_name = "workflow.reference.02" + +#### Primary workflow settings parsing #### + +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) + +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) +source(workflow_function_source) + +# hopefully can find a more elegant way to do this +pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) + +ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) + +analysis_run_directory = ret_obj$run_dir +run_id = ret_obj$run_id + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) + +setwd(analysis_run_directory) +tar_config_set(store = "./") +tar_script_path <- file.path("./executed_pipeline.R") + +#### Pipeline definition #### +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + + function_sourcefile = "@FUNCTIONPATH@" + workflow_name = "@WORKFLOWNAME@" + pecan_xml_path = "@PECANXMLPATH@" + tar_source(function_sourcefile) + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", workflow_name)) + } + + apptainer_url = workflow_settings$apptainer$remote.url + apptainer_name = workflow_settings$apptainer$container.name + apptainer_tag = workflow_settings$apptainer$tag + apptainer_sif = workflow_settings$apptainer$sif + + #### DATA REFERENCING #### + #### Workflow run base directory + data source ID = source of data #### + data_source_run_identifier = workflow_settings$data.source.01.reference + workflow_data_source = file.path(base_workflow_directory, data_source_run_identifier) + dir_check = check_directory_exists(workflow_data_source, stop_on_nonexistent=TRUE) + + # tar_option_set( + # packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + # ) + tar_option_set( + packages = c("PEcAn.settings", "readr", "dplyr") + ) + list( + step__resolve_data_routing( + workflow_data_source_directory = workflow_data_source,
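+            # the four vectors below are parallel: each target artifact name maps an
+            # external entity from the workflow-01 run to a localized name in this run,
+            # and the "reference" action links to it rather than copying it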
+ target_artifact_names = c("reference_IC_directory", "reference_data_entity", "reference_pft_entity"), + external_name_list = c("IC_files", "data", "pfts"), + localized_name_list = c("IC_files", "data", "pfts"), + action_list = c("reference", "reference", "reference") + ), + # how does the user either specify what vars are populated, or clarify what vars are populated by a func call + step__resolve_apptainer(apptainer_source_directory=NULL, workflow_xml=workflow_settings), + + # Prep run directory & check for continue + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), + + # submit the abstracted PEcAn write-configs call for distributed execution + # TODO: find a method which allows passing of non-quoted vars + step__run_distributed_write_configs(container=quote(apptainer_reference), pecan_settings=quote(pecan_settings_prepared), use_abstraction=TRUE) + ) +}, ask = FALSE, script = tar_script_path) + +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) + +writeLines(script_content, tar_script_path) + +tar_make(script = tar_script_path) + + + diff --git a/workflow_examples/02_referencing_data_workflow/README.md b/workflow_examples/02_referencing_data_workflow/README.md new file mode 100644 index 0000000..b79b6bf --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/README.md @@ -0,0 +1,354 @@ +# Data Referencing Workflow Example + +This example demonstrates how to **reference data from previous workflow runs** and **pull Apptainer containers** using the distributed workflows framework. This workflow builds upon the data preparation workflow and adds container management and PEcAn configuration preparation. + +## Overview + +This workflow showcases: +1. **External data referencing** using symbolic links to previous workflow runs +2. **Apptainer container management** with remote container pulling +3. **PEcAn configuration generation** using distributed execution +4. **Workflow dependency management** with proper sequencing + +## Key Files + +- `02_run_data_reference_workflow.R` - Main workflow script +- `02_pecan_workflow_config_example.xml` - Configuration file + +## Workflow Script Breakdown + +### Section 1: Workflow setup & settings parsing + +```r +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML" + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() +settings <- PEcAn.settings::read.settings(args$settings) + +#### run directory specification #### +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will treat the pipeline as already run +# if the pipeline has not changed, the pipeline will not run.
This extends to the targeted functions, their arguments, and their arguments' values. +# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. + +this_workflow_name = "workflow.reference.02" + +#### Primary workflow settings parsing #### + +## settings and params for this workflow +workflow_settings = settings$orchestration[[this_workflow_name]] +workflow_function_source = settings$orchestration$functions.source +source(workflow_function_source) + +## overall run directory for common collection of workflow artifacts +workflow_run_directory = settings$orchestration$workflow.base.run.directory +dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) +workflow_run_directory = normalizePath(workflow_run_directory) + +run_identifier = workflow_settings$run.identifier +pecan_xml_path = workflow_settings$pecan.xml.path + +data_source_run_identifier = workflow_settings$data.source.01.reference +``` + +**Purpose**: + +This set-up section parses the standard command-line arguments and extracts the orchestration settings for this workflow via the workflow name. + +The values read here bind directly to fields in the XML configuration file. The workflow name is a particularly useful field, as it can be used to easily switch to a different configuration stanza, while keeping the remainder of the workflow set-up identical. + +This section also identifies the base workflow run directory - this is a critical field, as subsequent data references look in this directory by default for data sourcing. + +This workflow specifically extracts the `data.source.01.reference` field, which identifies the run ID of workflow 01 (data preparation). This reference allows this workflow to access the data artifacts produced by that prior workflow run. + +The comment block early in this section documents the smart re-evaluation behavior of the targets framework, which will only re-run pipeline steps if inputs or code have changed. + + +--- + +### Section 2: Data Referencing Setup + +```r +# TODO: input parameter validation and defense +#### Handle input parameters parsed from settings file #### +#### workflow prep #### +function_path = normalizePath(file.path(workflow_function_source)) +pecan_xml_path = normalizePath(file.path(pecan_xml_path)) + +#### DATA REFERENCING #### +#### Workflow run base directory + data source ID = source of data ## +this_data_source_directory = file.path(workflow_run_directory, data_source_run_identifier) +dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) +``` + +**Purpose**: Sets up the reference to external data from workflow 01. + +The paths to the workflow functions and PEcAn XML are normalized to ensure absolute paths. Then, the data source directory is constructed by combining the base workflow run directory with the data source run identifier (from workflow 01). + +The `check_directory_exists()` function validates that this directory exists, stopping execution if it does not. This ensures that the prerequisite workflow (01) has completed successfully before this workflow attempts to reference its data. + +This is the key mechanism for referencing external data without copying - by constructing a path based on a run identifier, subsequent workflows can access data from prior workflow executions through symbolic links.
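+
+The referencing helper itself is defined in `tools/workflow_functions.R`. To make the mechanism concrete, a hypothetical, simplified sketch of such a helper is shown below; the real implementation may differ in naming and error handling:
+
+```r
+# hypothetical sketch: expose an artifact from a prior run in the current
+# run directory by linking to it rather than copying it
+reference_external_data_entity_sketch <- function(external_workflow_directory,
+                                                  external_name,
+                                                  localized_name) {
+  source_path <- file.path(external_workflow_directory, external_name)
+  if (!file.exists(source_path)) {
+    stop(sprintf("External data entity not found: %s", source_path))
+  }
+  # linking is idempotent: skip it if the localized name already exists
+  if (!file.exists(localized_name)) {
+    file.symlink(normalizePath(source_path), localized_name)
+  }
+  normalizePath(localized_name)
+}
+```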
+ + +--- + +### Section 3: Pipeline Definition and Launch Setup + +```r +#### THIS ANALYSIS RUN DIRECTORY SETUP #### +ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory) +analysis_run_directory = ret_obj$run_dir +analysis_run_id = ret_obj$run_id + +#### +print(paste("Starting workflow run in directory:", analysis_run_directory)) +setwd(analysis_run_directory) +tar_config_set(store = "./") +analysis_tar_script_path = file.path("./executed_pipeline.R") + +#### Pipeline definition #### +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + pecan_xml_path = "@PECANXML@" + workflow_data_source = "@WORKFLOWDATASOURCE@" + tar_source("@FUNCTIONPATH@") + apptainer_url = "@APPTAINERURL@" + apptainer_name = "@APPTAINERNAME@" + apptainer_tag = "@APPTAINERTAG@" + apptainer_sif = "@APPTAINERSIF@" + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + ) +``` + +**Purpose**: Sets up the initial pipeline runtime environment. + +Uses the `workflow_run_directory_setup()` helper function to create the analysis run directory and retrieve both the directory path and run ID. This provides a cleaner interface for directory management. + +Changes working directory to the analysis run directory, configures the targets store, and defines the path for the generated pipeline script file. + +The tar_script block sets up the pipeline definition with placeholder values (marked with `@...@`) that will be replaced with actual configuration values in a later step. These placeholders are necessary because tar_make executes the script in a separate process without access to the current R environment's variables. + +The Apptainer container configuration parameters (URL, name, tag, and SIF filename) are all set as placeholders here, and the required R packages for PEcAn workflows are specified. + + +--- + +### Section 4: Pipeline Targets Definitions + +```r + list( + # Config XML and source data handling + # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. + # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. + tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), + tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), + tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), +``` + +**Purpose**: Creates symbolic links to data from workflow 01 (data preparation). + +These three targets create symbolic links to data from the data preparation workflow. The system looks in the workflow_data_source directory (which is generated as a combination of the base workflow directory and the run identifier of workflow 01). + +From within that directory, each of the three objects is identified by its 'external_name'. Each is then linked under the provided 'localized_name', which is the path the workflow's targets access at run time. + +The comment block emphasizes an important limitation: these are symbolic links, not copies.
If the content of the source data changes after the link is created, this workflow will not detect those changes. For scenarios where data integrity checking is required, a different approach (such as copying and checksumming) would be needed. + +```r + # pull down the apptainer from remote + # we could do this in the prior step. + # doing it here in this example allows the next step to reference two different data sources + tar_target(apptainer_reference, pull_apptainer_container(apptainer_url_base=apptainer_url, apptainer_image_name=apptainer_name, apptainer_tag=apptainer_tag, apptainer_disk_sif=apptainer_sif)), +``` + +This target downloads the Apptainer container from a remote registry (e.g., Docker Hub) and saves it as a `.sif` file in the current workflow run directory. The comment notes that this could be done in the prior workflow step, but doing it here allows workflow 03 to reference both the data (from workflow 01) and the container (from workflow 02) separately. + +Downloading containers as workflow artifacts enables reproducible execution environments and version control of container images. By making containers workflow artifacts, we can track which container version was used for each analysis run. + +```r + # Prep run directory & check for continue + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), +``` + +Prepares PEcAn settings by reading the XML configuration file and creating the PEcAn run directory. The continue directive check determines whether the workflow should attempt to continue from a previous run (currently set to FALSE). + +```r + # now we get into the abstract functions. + # create the abstraction of pecan write configs. + tar_target( + pecan_write_configs_function, + targets_function_abstraction(function_name = "pecan_write_configs") + ), + # create the abstraction of the pecan write configs arguments + tar_target( + pecan_write_configs_arguments, + targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + ), +``` + +These two steps are critical to understanding the process by which distributed computing is supported in this framework. + +To ease the execution of arbitrary code, including calls to PEcAn functions, both the function and the arguments to that function are abstracted via the above steps. This causes the targets framework to register the function and its arguments as separate compressed R objects on disk within the workflow run directory. + +This allows the submission of a simple function call via sbatch to Slurm. This call creates a new R process, using the workflow run directory as its working directory. It simply loads the function from the target store's compressed R object, loads the arguments as well, and calls the function on the arguments. + +The two target steps above are the required preparation steps to enable this process. The sections below actually submit the function call to sbatch, and then monitor the process on the cluster.
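+
+To make the worker side concrete: the actual runner is part of `tools/workflow_functions.R`, but conceptually the sbatch-launched R process does something like the hypothetical sketch below (artifact names arrive as strings; the result path is illustrative):
+
+```r
+# hypothetical worker-side runner, executed inside the Apptainer container
+# with the workflow run directory (and its targets store) as working directory
+args <- commandArgs(trailingOnly = TRUE)
+function_artifact <- args[1]  # e.g. "pecan_write_configs_function"
+args_artifact     <- args[2]  # e.g. "pecan_write_configs_arguments"
+
+# load the abstracted function and its argument list from the targets store
+fun      <- targets::tar_read_raw(function_artifact, store = ".")
+fun_args <- targets::tar_read_raw(args_artifact, store = ".")
+
+# call the function on the arguments and persist the outcome for later checks
+result <- do.call(fun, fun_args)
+saveRDS(result, paste0(function_artifact, "_result.rds"))
+```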
```r + # run the abstracted function on the abstracted arguments via slurm + tar_target( + pecan_settings_job_submission, + targets_abstract_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs_function", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference, + dependencies=c(pecan_continue, apptainer_reference) + ) + ), + # block and wait until dist. job is done + tar_target( + settings_job_outcome, + pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) + ) +``` + +These two target steps submit the function call which was abstracted in the previous two steps. It is important to note that the function artifact and the argument artifact are passed as __string__ names, not variable names. + +The apptainer reference provides the container image that will encapsulate the R function call on the Slurm worker node. The 'task_id' value gives the job submission a unique identifier, ensuring it does not collide with existing files or directories. + +The final tar_target monitors the job submission and blocks until it is complete. Use this blocking behavior as needed: in some cases a distributed compute process must finish before the rest of an analysis pipeline can proceed. + + +--- + +### Section 5: Script Post-Processing and Execution + +```r + ) +}, ask = FALSE, script = analysis_tar_script_path) + +script_content <- readLines(analysis_tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) +script_content <- gsub("@APPTAINERURL@", workflow_settings$apptainer$remote.url, script_content) +script_content <- gsub("@APPTAINERNAME@", workflow_settings$apptainer$container.name, script_content) +script_content <- gsub("@APPTAINERTAG@", workflow_settings$apptainer$tag, script_content) +script_content <- gsub("@APPTAINERSIF@", workflow_settings$apptainer$sif, script_content) + +writeLines(script_content, analysis_tar_script_path) + +tar_make(script = analysis_tar_script_path) +``` + +**Purpose**: +- Replaces all placeholder values with actual paths and values +- Writes the final pipeline script +- Executes the workflow using the targets framework + +Placeholder replacement is necessary because tar_make executes in a separate process without access to the current R environment's variables. By using string replacement on the generated script, we can inject the actual configuration values before execution. + +Note that the Apptainer configuration values are accessed from `workflow_settings$apptainer` rather than individual variables, since they were not extracted into separate variables earlier in the script. + +The final call to `tar_make()` triggers the execution of the complete workflow pipeline, which will reference data from workflow 01, download the Apptainer container, generate PEcAn configurations via distributed execution, and monitor the distributed job. + + +## Key Concepts Demonstrated + +### 1. External Data Referencing +Workflows can reference data from previous runs without copying, using symbolic links that provide: +- Disk space efficiency +- Data consistency across workflows +- Clear dependency tracking + +### 2.
Container Management +Downloading containers as workflow artifacts enables: +- Reproducible execution environments +- Version control of container images +- Efficient reuse across multiple workflow runs + +### 3. Distributed Execution Abstraction +The function abstraction pattern allows: +- Remote execution without code duplication +- Flexible job scheduling +- Proper dependency management in distributed environments + +### 4. Workflow Composition +This workflow demonstrates how to compose multiple workflows: +- Data preparation (workflow 01) +- Container management and configuration (workflow 02) +- Actual analysis (workflow 03 - see next example) + +### 5. Helper Function Integration +The use of `workflow_run_directory_setup()` demonstrates: +- Code reusability +- Cleaner interfaces +- Encapsulation of common patterns + +## Workflow Sequence + +This workflow sits in the middle of the sequence: + +``` +Workflow 01: Data Preparation + ↓ (provides data artifacts) +Workflow 02: Container Setup & Configuration (This workflow) + ↓ (uses data from 01) +Workflow 03: Model Execution & Analysis +``` + +## Usage + +```bash +Rscript 02_run_data_reference_workflow.R --settings 02_pecan_workflow_config_example.xml +``` + +## Dependencies + +- Workflow 01 (data preparation) must complete first +- Access to remote container registry (e.g., Docker Hub) +- SLURM cluster for distributed execution +- Apptainer installed and available + +## Output + +This workflow produces: +- Symbolic links to data from workflow 01 +- Downloaded Apptainer container (.sif file) +- PEcAn configuration files generated via distributed execution +- Complete workflow execution history in targets store + +## Next Steps + +After running this workflow successfully: +1. Note the run identifier for use in workflow 03 +2. Verify the symbolic links to workflow 01 data are functional +3. Confirm the Apptainer container download completed +4. Check PEcAn configuration files were generated +5. 
Use this workflow's output as input to workflow 03 (model execution) diff --git a/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml b/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml new file mode 100644 index 0000000..d134d31 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml @@ -0,0 +1,39 @@ + + + + /project/60007/hpriest/data/workflow_runs_devel + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_devel.sif + + ./02_pecan_config_devel.xml + + + analysis_run_identifier_03_sourcing + ./03_pecan_config_devel.xml + data_prep_run_01 + data_reference_run_02 + + sipnet-carb_develop.sif + + + + diff --git a/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml b/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml new file mode 100644 index 0000000..793a292 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml @@ -0,0 +1,39 @@ + + + + /project/60007/hpriest/data/workflow_runs + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_latest.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + latest + sipnet-carb_latest.sif + + ./02_pecan_config_latest.xml + + + analysis_run_identifier_03_sourcing + ./03_pecan_config_latest.xml + data_prep_run_01 + data_reference_run_02 + + sipnet-carb_latest.sif + + + + diff --git a/workflow_examples/03_distributed_workflow/03_pecan_config_devel.xml b/workflow_examples/03_distributed_workflow/03_pecan_config_devel.xml new file mode 100644 index 0000000..5804d53 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + 
data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + 
IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/03_distributed_workflow/03_pecan_config_latest.xml b/workflow_examples/03_distributed_workflow/03_pecan_config_latest.xml new file mode 100644 index 0000000..72f70d3 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_pecan_config_latest.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + 
IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + 
IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R new file mode 100644 index 0000000..6828334 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R @@ -0,0 +1,126 @@ +library(targets) +library(tarchetypes) +library(XML) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML" + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("An Orchestration settings XML must be provided via --settings.") +} + +########################################################## + +workflow_name = "workflow.analysis.03" + +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) + +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) +source(workflow_function_source) + +# hopefully can find a more elegant way to do this +pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) + +ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) + +analysis_run_directory = ret_obj$run_dir +run_id = ret_obj$run_id + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) + +setwd(analysis_run_directory) +tar_config_set(store = "./") +tar_script_path <- file.path("./executed_pipeline.R") + +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + + function_sourcefile = "@FUNCTIONPATH@" + workflow_name = "@WORKFLOWNAME@" + pecan_xml_path = "@PECANXMLPATH@" + tar_source(function_sourcefile) + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", workflow_name)) + } + + #### Data Referencing #### + ## Workflow run base directory + data source ID = source of data ## + data_source_run_identifier = workflow_settings$data.source.01.reference + workflow_data_source = normalizePath(file.path(base_workflow_directory, data_source_run_identifier)) + dir_check = check_directory_exists(workflow_data_source, stop_on_nonexistent=TRUE) + + ## apptainer is referenced from a different workflow run id ## + apptainer_source_run_identifier = workflow_settings$apptainer.source.reference + apptainer_source_directory = normalizePath(file.path(base_workflow_directory, apptainer_source_run_identifier)) + dir_check = 
check_directory_exists(apptainer_source_directory, stop_on_nonexistent=TRUE) + apptainer_sif = workflow_settings$apptainer$sif + + # tar pipeline options and config + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + ) + list( + # we can reference data products in an external directory + # here, we can call this once per directory, and identify the components of that directory we want to reference + step__resolve_data_routing( + workflow_data_source_directory = workflow_data_source, + target_artifact_names = c("reference_IC_directory", "reference_data_entity", "reference_pft_entity"), + external_name_list = c("IC_files", "data", "pfts"), + localized_name_list = c("IC_files", "data", "pfts"), + action_list = c("reference", "reference", "reference") + ), + # this is still a little chunky; workflow steps referencing these target names do so invisibly at the moment. + + # If we can't link to the apptainer via apptainer_source_directory, attempt to pull it from the remote. + step__resolve_apptainer(apptainer_source_directory=apptainer_source_directory, workflow_xml=workflow_settings), + + # we can mix and match our own functions with classic tar_target imperatives + # Prep run directory & check for continue + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), + + # we write: + step__run_distributed_write_configs(container=quote(apptainer_reference), pecan_settings=quote(pecan_settings), use_abstraction=TRUE, + dependencies=c("apptainer_reference", "pecan_settings")), + + # we can do this: + step__run_pecan_workflow() + ) +}, ask = FALSE, script = tar_script_path) + +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) + +writeLines(script_content, tar_script_path) + +tar_make(script = tar_script_path) \ No newline at end of file diff --git a/workflow_examples/03_distributed_workflow/README.md b/workflow_examples/03_distributed_workflow/README.md new file mode 100644 index 0000000..b512ce0 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/README.md @@ -0,0 +1,376 @@ +# Distributed Workflow Example + +This example demonstrates **complete PEcAn model execution with distributed computing** using the distributed workflows framework. This is the most complex workflow, pulling together data referencing, container management, and distributed PEcAn ecosystem modeling. + +## Overview + +This workflow showcases: +1. **Complete PEcAn ecosystem model workflow** execution +2. **Distributed computing** via SLURM with Apptainer containers +3. **Multi-stage PEcAn analysis** including ensemble runs and sensitivity analysis +4. **Workflow composition** building upon data preparation and container setup +5. 
**Result aggregation** and workflow completion handling + +## Key Files + +- `03_run_distributed_workflow.R` - Main workflow script +- `03_pecan_workflow_config_example.xml` - Configuration file + +## Workflow Script Breakdown + +### Section 1: Workflow setup & settings parsing + +```r +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML" + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() +settings <- PEcAn.settings::read.settings(args$settings) + +this_workflow_name = "workflow.analysis.03" + +## settings and params for this workflow +workflow_settings = settings$orchestration[[this_workflow_name]] +workflow_function_source = settings$orchestration$functions.source +source(workflow_function_source) +function_path = normalizePath(file.path(workflow_function_source)) + +#### Primary workflow settings parsing #### +## overall run directory for common collection of workflow artifacts +workflow_run_directory = settings$orchestration$workflow.base.run.directory +dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) +workflow_run_directory = normalizePath(workflow_run_directory) + +run_identifier = workflow_settings$run.identifier +pecan_xml_path = normalizePath(file.path(workflow_settings$pecan.xml.path)) +``` + +**Purpose**: + +This set-up section parses the standard command-line arguments and extracts the orchestration settings for this workflow via the workflow name. + +The variables read here bind directly to fields in the XML configuration file. The workflow name is a particularly useful field: changing it switches the script to a different configuration stanza while keeping the remainder of the workflow set-up identical. + +This section also identifies the base workflow run directory - a critical field, since subsequent data references look in this directory by default when sourcing data. + +Finally, this section identifies the PEcAn XML file that will be used in any PEcAn invocations. This __can__ be the same file as the orchestration XML, and in these examples it is. However, the two can be separate XMLs - this is intended to enable swapping between PEcAn XMLs for comparison. 
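+ For orientation, below is a minimal sketch of the orchestration stanza this section reads. The element names follow the fields accessed in the script above; the values shown are illustrative placeholders, not the shipped configuration.
+ 
+ ```xml
+ <pecan>
+   <orchestration>
+     <!-- shared helper functions and the common base run directory -->
+     <functions.source>workflow_functions.R</functions.source>
+     <workflow.base.run.directory>/projects/workflow_runs</workflow.base.run.directory>
+     <workflow.analysis.03>
+       <run.identifier>analysis_03_run_001</run.identifier>
+       <pecan.xml.path>03_pecan_config_latest.xml</pecan.xml.path>
+       <!-- prior workflow runs referenced for data and the container image -->
+       <data.source.01.reference>data_prep_01_run_001</data.source.01.reference>
+       <apptainer.source.reference>container_setup_02_run_001</apptainer.source.reference>
+       <apptainer>
+         <sif>sipnet-carb_latest.sif</sif>
+       </apptainer>
+     </workflow.analysis.03>
+   </orchestration>
+ </pecan>
+ ```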
+ + +--- + +### Section 2: Data Referencing Setup + +```r +#### Data Referencing #### +## Workflow run base directory + data source ID = source of data ## +data_source_run_identifier = workflow_settings$data.source.01.reference +this_data_source_directory = normalizePath(file.path(workflow_run_directory, data_source_run_identifier)) +dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) + +## apptainer is referenced from a different workflow run id ## +apptainer_source_run_identifier = workflow_settings$apptainer.source.reference +apptainer_source_dir = normalizePath(file.path(workflow_run_directory, apptainer_source_run_identifier)) +dir_check = check_directory_exists(apptainer_source_dir, stop_on_nonexistent=TRUE) +apptainer_sif = workflow_settings$apptainer$sif +``` + +**Purpose**: Expanding on example #02, this section sets up references to external workflow artifacts: + +- Data source: references data from workflow 01 (data preparation) +- Apptainer source: references the container from workflow 02 (container setup) + +In particular, note that we are now referencing objects from two different prior workflow runs; this concept extends to an arbitrary number of prior runs or external directories. Pay careful attention to the disposition of data incorporated into workflows by reference from prior runs: referencing cleanly separates data handling and logistics from data analysis and summary. + + +--- + +### Section 3: Pipeline Definition and Launch Setup + +```r +#### Pipeline definition and launch #### +print(paste("Starting workflow run in directory:", analysis_run_directory)) +setwd(analysis_run_directory) +tar_config_set(store = "./") +analysis_tar_script_path = file.path("./executed_pipeline.R") + +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + # prep parameter receivers + pecan_xml_path = "@PECANXML@" + workflow_data_source = "@WORKFLOWDATASOURCE@" + tar_source("@FUNCTIONPATH@") + apptainer_source_directory = "@APPTAINERSOURCE@" + apptainer_sif = "@APPTAINERSIF@" + + # tar pipeline options and config + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + ) +``` + +**Purpose**: Sets up the initial pipeline runtime environment. + +- Defines the pipeline execution directory, changes the working directory, and sets the path for the target store. +- Imports the required libraries. +- Sets up the placeholder variables (e.g., `@PECANXML@`) that will later be replaced with actual values; see Section 6 for the replacement mechanism. +- Declares the R packages required for PEcAn workflows. Note that these packages are **not imported into methods called on slurm-managed nodes**; the user must load them within the function which is abstracted. + +--- + +### Section 4: Pipeline Targets Definitions + +#### External Data Referencing + +```r + list( + # Config XML and source data handling + # if we ever need to alter the content of the reference data, we will need to do more than link to it. + # nothing is copied, and content is not checked - if the source content changes, this target is unaware. 
+ tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), + tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), + tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), +``` + +Each of these three targets creates a symbolic link to data from the data preparation workflow (workflow 01). + +Data referencing within this framework begins by identifying the source directory (```workflow_data_source```) and then the specific on-disk name of the resource being referenced (e.g., ```IC_files```). In this case, these are directories containing input data for PEcAn. To facilitate referencing objects which share a name (e.g., the generic external name ```data```), each object may be labeled with a different localized name. + +Within that source directory, each of the three objects is identified by its 'external name' and is then linked under the 'localized_name' provided. The 'localized_name' is what the workflow targets, when run, are able to access. + +#### Apptainer Image Referencing + +```r + # In this case, we're not pulling the apptainer - we are referencing it from a prior run + # this means you can use the data-prep runs to iterate the apptainer version (when needed) + # and use analysis runs to leverage the apptainer (but not update it) + tar_target( + apptainer_reference, + reference_external_data_entity( + external_workflow_directory=apptainer_source_directory, + external_name=apptainer_sif, + localized_name=apptainer_sif + ) + ), +``` + +This target uses the same approach to locate the apptainer which was downloaded in step 02. The apptainer SIF exists in the workflow directory from step 02, and this target exposes it to the subsequent steps which depend on the presence of an apptainer. + +Note that the apptainer SIF name is also referenced within the PEcAn XML, and the localized name here must match that value. In the future, this reference will be parameterized to match the apptainer SIF. + +Referencing the apptainer in this way has two major benefits. First, the apptainer is not re-downloaded for each subsequent run of this workflow step. Apptainer SIFs are typically fairly large on disk, and over time this represents major savings in storage footprint. + +Second, keeping the apptainer image in a separate workflow directory means it will not be re-pulled every time this analysis is run, so multiple analyses execute under identical code-states and their outcomes can be directly compared. When an update is necessary, the apptainer workflow can be run under a new run identifier, and the differences between apptainer versions can then also be directly compared. 
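+ To make the referencing semantics concrete, a helper with the behavior described above might reduce to little more than a symbolic link. The sketch below is illustrative only - it assumes the link-only, no-copy, no-content-check semantics noted earlier, and is not the framework's actual implementation:
+ 
+ ```r
+ # Illustrative sketch: expose an artifact from a prior workflow run under a
+ # localized name in the current run directory. Nothing is copied, and the
+ # content of the source is never verified.
+ reference_external_data_entity_sketch <- function(external_workflow_directory,
+                                                   external_name,
+                                                   localized_name) {
+   source_path <- file.path(external_workflow_directory, external_name)
+   if (!file.exists(source_path)) {
+     stop(sprintf("External entity not found: %s", source_path))
+   }
+   # link under the localized name, unless something is already there
+   if (!file.exists(localized_name)) {
+     file.symlink(normalizePath(source_path), localized_name)
+   }
+   normalizePath(localized_name)
+ }
+ ```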
+ +#### PEcAn Configuration Loading + +```r + # Prep run directory & check for continue + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), +``` + +Identifies and prepares the PEcAn settings and run directory for subsequent steps. + +#### Function Abstraction in preparation for Slurm submission + +```r + # now we get into the abstract functions. + # create the abstraction of pecan write configs. + tar_target( + pecan_write_configs_function, + targets_function_abstraction(function_name = "pecan_write_configs") + ), + # create the abstraction of the pecan write configs arguments + tar_target( + pecan_write_configs_arguments, + targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + ), +``` + +These two steps are critical to understanding how distributed computing is supported in this framework. + +To ease the execution of arbitrary code, including calls to PEcAn functions, both the function and its arguments are abstracted via the steps above. This causes the targets framework to register the function and the arguments as separate compressed R objects on disk within the workflow run directory. + +This allows a simple function call to be submitted to Slurm via sbatch. The submitted job creates a new R process, using the workflow run directory as its working directory, loads the function and the arguments from the target store's compressed R objects, and calls the function on the arguments (see the sketch at the end of this section). + +The two target steps above are the required preparation for this process. The sections below actually submit the function call to sbatch and then monitor the process on the cluster. + +#### Slurm job submission of workflow methods + +```r + # run the abstracted function on the abstracted arguments via slurm + tar_target( + pecan_settings_job_submission, + targets_abstract_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs_function", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference, + dependencies=c(pecan_continue) + ) + ), + # block and wait until dist. job is done + tar_target( + settings_job_outcome, + pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) + ), ## blocks until component jobs are done +``` + +These two target steps submit the function call abstracted in the previous two steps. Note that the function artifact and the argument artifact are passed as __string__ names, not variable names. + +The apptainer reference identifies the container that will encapsulate the R function call on the Slurm worker node, and the 'task_id' value gives the job submission a unique identifier to avoid collisions with existing files or directories. + +The final tar_target here monitors the job submission and blocks until it is complete. Use blocking as needed: in some cases a distributed compute process must finish before the rest of an analysis pipeline can proceed, while in other cases several compute-heavy steps can execute simultaneously and there is no need to block on each one. 
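+ For intuition, the job submitted via sbatch might amount to something like the following R snippet, executed in a fresh R process with the workflow run directory as its working directory, inside the apptainer. This is a sketch of the mechanism described above, not the framework's actual submission code; the artifact names are the targets defined earlier.
+ 
+ ```r
+ # Illustrative worker-side invocation: load the abstracted function and its
+ # argument list from the target store, then apply one to the other.
+ library(targets)
+ 
+ fun  <- tar_read_raw("pecan_write_configs_function")   # the abstracted function
+ args <- tar_read_raw("pecan_write_configs_arguments")  # the abstracted arguments
+ 
+ result <- do.call(fun, args)
+ ```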
+ + +--- + +### Section 5: Ecosystem Model Runs + +```r + tar_target( + ecosystem_settings, + pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) + ), + tar_target( + model_results_settings, + pecan_get_model_results(pecan_settings=ecosystem_settings) + ), + tar_target( + ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel + pecan_run_ensemble_analysis(pecan_settings=model_results_settings) + ), + tar_target( + sensitivity_settings, + pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) + ), + tar_target( + complete_settings, + pecan_workflow_complete(pecan_settings=sensitivity_settings) + ) + ) +}, ask = FALSE, script = analysis_tar_script_path) +``` + +This section shows sequential execution of PEcAn functions. These functions submit work to Slurm through PEcAn's internal functionality; because they themselves submit work to Slurm, they __cannot__ be executed within an apptainer. + +Also note that each step takes a __pecan_settings__ object and returns a similar one. None of these steps mutates the object, so all of the settings objects are identical; however, passing them from one call to the next makes each step depend on the prior step and enforces their sequential evaluation. If every step were instead passed the original __pecan_settings__ variable, the steps would execute in parallel. + + +--- + +### Section 6: Script Post-Processing and Execution + +```r +script_content <- readLines(analysis_tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) +script_content <- gsub("@APPTAINERSOURCE@", apptainer_source_dir, script_content) +script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content) + +writeLines(script_content, analysis_tar_script_path) +``` + +**Purpose**: +- Replaces all placeholder values with actual paths and values +- Writes the final pipeline script + +```r +tar_make(script = analysis_tar_script_path) +``` +This line executes the generated pipeline script in the workflow run directory. + + +## Key Concepts Demonstrated + +### 1. Complete PEcAn Workflow Integration +This workflow executes the full PEcAn ecosystem modeling pipeline from configuration through ensemble and sensitivity analysis. + +### 2. Multi-Workflow Composition +References artifacts from two different previous workflows, enabling: +- Workflow reuse +- Clear dependency management +- Modular development + +### 3. Distributed Computing Pattern +The abstraction pattern enables: +- Remote execution of arbitrary R functions +- Proper job scheduling via SLURM +- Resource management on HPC clusters + +### 4. Sequential Workflow Orchestration +Dependencies ensure proper execution order while allowing parallel execution where possible. + +### 5. 
Helper Function Integration +The use of `workflow_run_directory_setup()` demonstrates: +- Code reusability +- Cleaner interfaces +- Encapsulation of common patterns + +## Workflow Sequence + +``` +Workflow 01: Data Preparation + ↓ +Workflow 02: Container Setup & Configuration + ↓ +Workflow 03: Model Execution & Analysis (This workflow) +``` + +## Usage + +```bash +Rscript 03_run_distributed_workflow.R --settings 03_pecan_workflow_config_example.xml +``` + +## Dependencies + +- Workflow 01 (data preparation) must complete first +- Workflow 02 (container and configuration setup) must complete first +- SLURM cluster access +- Apptainer available on cluster nodes +- Sufficient cluster resources for model ensemble runs + +## Output + +This workflow produces: +- PEcAn model configurations +- Ecosystem model outputs (NetCDF files) +- Ensemble summary statistics +- Sensitivity analysis results +- Completed workflow status + +## Next Steps + +After running this workflow: +1. Examine model outputs in the run directory +2. Review ensemble and sensitivity analysis results +3. Use results as inputs for downstream analysis workflows +4. Modify PEcAn XML configuration to explore different scenarios +5. Iterate with smart re-evaluation by changing model parameters