Skip to content

Commit

Permalink
r2.0.0 cicd updates (#10496)
Browse files Browse the repository at this point in the history
* [Nemo Unit Tests] Split GPU unit tests (#10380)

* Split GPU unit tests

* Make L0_Unit_Tests_GPU_Lightning optional since flaky

* Add time for GPU_Core test

* Add time for GPU_Audio test

* CICD: Attempt fix for False positive (not all tests have run) (#10436)

* [Nemo CICD] Make flaky test optional (#10438)

* Make flaky test optional

* Make flaky test optional

* [Nemo CICD] Make flaky test optional (#10442)

* Make flaky test optional

* Make flaky test optional

* Make flaky test optional

* [Nemo CICD] Make flaky test optional (#10446)

* Make flaky test optional

* Make flaky test optional

* Make flaky test optional

* Make flaky test optional

---------

Signed-off-by: Pablo Garay <[email protected]>

* Make flaky test optional (#10448)

* [Nemo CICD] Make flaky test optional (#10450)

* Make flaky test optional

* Make flaky test optional

* [Nemo CICD] Make flaky test optional (#10452)

* Make flaky test optional

* Make flaky test optional

* Make flaky test optional

* Make flaky test optional (#10456)

* Make flaky test optional (#10459)

* Optional test needs optional field set true (#10475)

* [Nemo CICD] Make flaky test optional (#10476)

* Optional test needs optional field set true

* make flaky test optional

---------

Signed-off-by: Pablo Garay <[email protected]>
  • Loading branch information
pablo-garay authored Sep 17, 2024
1 parent 6c7b953 commit a39f068
Showing 1 changed file with 201 additions and 29 deletions.
230 changes: 201 additions & 29 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,112 +122,232 @@ jobs:
'
### \'\'
# Runs pytest with no path argument (i.e. the whole suite), deselecting tests
# marked `pleasefixme`, on the self-hosted-azure runner with TIMEOUT 60.
# NOTE(review): the per-directory GPU jobs that follow run the same command
# split by path, so this looks like the pre-split job rendered as removed in
# this diff view — confirm before relying on it.
L0_Unit_Tests_GPU:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
TIMEOUT: 60
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true
# L0: GPU unit tests
# GPU unit tests for the ASR collection: calls the shared _test_template.yml
# workflow to run pytest on tests/collections/asr, deselecting tests marked
# `pleasefixme`. Runs when this job id is listed in the setup job's
# `test_to_run` output or its `all` output is 'true'.
# Passes TIMEOUT 20 and IS_OPTIONAL true to the template.
OPTIONAL_L0_Unit_Tests_GPU_ASR:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
TIMEOUT: 20
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true

# GPU unit tests for the audio collection: runs pytest on
# tests/collections/audio (tests marked `pleasefixme` deselected) through the
# shared _test_template.yml workflow, gated on `test_to_run` / `all` from
# cicd-test-container-setup. TIMEOUT is 20 and IS_OPTIONAL is true.
OPTIONAL_L0_Unit_Tests_GPU_Audio:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
TIMEOUT: 20
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true

# GPU unit tests for the common collection: runs pytest on
# tests/collections/common, deselecting tests marked `pleasefixme`.
# No TIMEOUT or IS_OPTIONAL input is passed here, so whatever defaults
# _test_template.yml defines apply (not visible in this section).
L0_Unit_Tests_GPU_Common:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads
# GPU unit tests for the LLM collection: runs pytest on tests/collections/llm
# (tests marked `pleasefixme` deselected) via the shared _test_template.yml
# workflow. Only RUNNER and SCRIPT are passed; no TIMEOUT or IS_OPTIONAL.
L0_Unit_Tests_GPU_LLM:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads
# GPU unit tests for the multimodal collection: runs pytest on
# tests/collections/multimodal, deselecting tests marked `pleasefixme`.
# Only RUNNER and SCRIPT are passed to the template.
L0_Unit_Tests_GPU_Multimodal:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --with_downloads
# GPU unit tests for the NLP collection: runs pytest on tests/collections/nlp
# (tests marked `pleasefixme` deselected). IS_OPTIONAL is true; no TIMEOUT
# input is passed, unlike the ASR/Audio/Core GPU jobs in this section.
OPTIONAL_L0_Unit_Tests_GPU_NLP:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true

# GPU unit tests for the TTS collection: runs pytest on tests/collections/tts,
# deselecting tests marked `pleasefixme`, through _test_template.yml.
# IS_OPTIONAL is true; no TIMEOUT input is passed.
OPTIONAL_L0_Unit_Tests_GPU_TTS:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true

# GPU unit tests for core: runs pytest on tests/core only, deselecting tests
# marked `pleasefixme`. TIMEOUT is 20 and IS_OPTIONAL is true.
# NOTE(review): the CPU core job in this file also runs tests/core_ptl; this
# GPU job does not — confirm that is intended.
OPTIONAL_L0_Unit_Tests_GPU_Core:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
TIMEOUT: 20
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/core -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true

# GPU unit tests for the hydra directory: runs pytest on tests/hydra,
# deselecting tests marked `pleasefixme`. Only RUNNER and SCRIPT are passed
# to the shared _test_template.yml workflow.
L0_Unit_Tests_GPU_Hydra:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads
# GPU unit tests for the lightning directory: runs pytest on tests/lightning,
# deselecting tests marked `pleasefixme`. IS_OPTIONAL is true; no TIMEOUT
# input is passed.
OPTIONAL_L0_Unit_Tests_GPU_Lightning:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true

# Catch-all GPU unit-test job: runs pytest with no path argument and uses
# --ignore to exclude every directory listed in the script, deselecting tests
# marked `pleasefixme`.
# NOTE(review): tests/core_ptl and tests/utils are ignored here, but no GPU job
# visible in this section runs them — confirm the gap is intended.
# NOTE(review): this --ignore list has to be kept in sync by hand with the
# per-directory GPU jobs; adding a new job without a matching --ignore entry
# would run that directory twice.
L0_Unit_Tests_GPU_Others:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads \
--ignore=tests/collections/asr \
--ignore=tests/collections/audio \
--ignore=tests/collections/common \
--ignore=tests/collections/llm \
--ignore=tests/collections/multimodal \
--ignore=tests/collections/nlp \
--ignore=tests/collections/tts \
--ignore=tests/core \
--ignore=tests/core_ptl \
--ignore=tests/hydra \
--ignore=tests/lightning \
--ignore=tests/utils
# L0: CPU unit tests
# CPU unit tests for the ASR collection: runs pytest on tests/collections/asr
# with CUDA_VISIBLE_DEVICES set to empty and the --cpu flag, deselecting tests
# marked `pleasefixme`, on the self-hosted-azure-cpu runner.
# TIMEOUT is 20 and IS_OPTIONAL is true.
# NOTE(review): two job ids appear back to back below; the `if` filter matches
# only the OPTIONAL_-prefixed one, so the unprefixed line looks like the
# pre-rename id left over from the diff rendering — confirm.
L0_Unit_Tests_CPU_ASR:
OPTIONAL_L0_Unit_Tests_CPU_ASR:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

# CPU unit tests for the audio collection: runs pytest on
# tests/collections/audio with CUDA_VISIBLE_DEVICES set to empty and --cpu,
# deselecting tests marked `pleasefixme`. IS_OPTIONAL is true; no TIMEOUT.
# NOTE(review): two job ids appear back to back below; the `if` filter matches
# only the OPTIONAL_-prefixed one, so the unprefixed line looks like the
# pre-rename id left over from the diff rendering — confirm.
L0_Unit_Tests_CPU_Audio:
OPTIONAL_L0_Unit_Tests_CPU_Audio:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

# CPU unit tests for the common collection: runs pytest on
# tests/collections/common with CUDA_VISIBLE_DEVICES set to empty and --cpu,
# deselecting tests marked `pleasefixme`. TIMEOUT is 20, IS_OPTIONAL is true.
# NOTE(review): two job ids appear back to back below; the `if` filter matches
# only the OPTIONAL_-prefixed one, so the unprefixed line looks like the
# pre-rename id left over from the diff rendering — confirm.
L0_Unit_Tests_CPU_Common:
OPTIONAL_L0_Unit_Tests_CPU_Common:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

# CPU unit tests for the LLM collection: runs pytest on tests/collections/llm
# with CUDA_VISIBLE_DEVICES set to empty and --cpu, deselecting tests marked
# `pleasefixme`, on the self-hosted-azure-cpu runner.
# NOTE(review): IS_OPTIONAL is true although the job id carries no OPTIONAL_
# prefix and the id is still listed (uncommented) in Nemo_CICD_Test's `needs`,
# unlike the OPTIONAL_-prefixed jobs — confirm this combination is intended.
L0_Unit_Tests_CPU_LLM:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true
# CPU unit tests for the multimodal collection: runs pytest on
# tests/collections/multimodal with CUDA_VISIBLE_DEVICES set to empty and
# --cpu, deselecting tests marked `pleasefixme`.
# NOTE(review): IS_OPTIONAL is true although the job id carries no OPTIONAL_
# prefix and the id is still listed (uncommented) in Nemo_CICD_Test's `needs`
# — confirm this combination is intended.
L0_Unit_Tests_CPU_Multimodal:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true
# CPU unit tests for the NLP collection: runs pytest on tests/collections/nlp
# with CUDA_VISIBLE_DEVICES set to empty and --cpu, deselecting tests marked
# `pleasefixme`. TIMEOUT is 20 and IS_OPTIONAL is true.
# NOTE(review): two job ids appear back to back below; the `if` filter matches
# only the OPTIONAL_-prefixed one, so the unprefixed line looks like the
# pre-rename id left over from the diff rendering — confirm.
L0_Unit_Tests_CPU_NLP:
OPTIONAL_L0_Unit_Tests_CPU_NLP:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

# CPU unit tests for the TTS collection: runs pytest on tests/collections/tts
# with CUDA_VISIBLE_DEVICES set to empty and --cpu, deselecting tests marked
# `pleasefixme`. IS_OPTIONAL is true; no TIMEOUT input is passed.
# NOTE(review): two job ids appear back to back below; the `if` filter matches
# only the OPTIONAL_-prefixed one, so the unprefixed line looks like the
# pre-rename id left over from the diff rendering — confirm.
L0_Unit_Tests_CPU_TTS:
OPTIONAL_L0_Unit_Tests_CPU_TTS:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

# CPU unit tests for core: runs pytest on both tests/core and tests/core_ptl
# with CUDA_VISIBLE_DEVICES set to empty and --cpu, deselecting tests marked
# `pleasefixme`. TIMEOUT is 20 and IS_OPTIONAL is true.
# NOTE(review): two job ids appear back to back below; the `if` filter matches
# only the OPTIONAL_-prefixed one, so the unprefixed line looks like the
# pre-rename id left over from the diff rendering — confirm.
L0_Unit_Tests_CPU_Core:
OPTIONAL_L0_Unit_Tests_CPU_Core:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

# CPU unit tests for the hydra directory: runs pytest on tests/hydra with
# CUDA_VISIBLE_DEVICES set to empty and --cpu, deselecting tests marked
# `pleasefixme`.
# NOTE(review): IS_OPTIONAL is true although the job id carries no OPTIONAL_
# prefix and the id is still listed (uncommented) in Nemo_CICD_Test's `needs`
# — confirm this combination is intended.
L0_Unit_Tests_CPU_Hydra:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true
# CPU unit tests for the lightning directory: runs pytest on tests/lightning
# with CUDA_VISIBLE_DEVICES set to empty and --cpu, deselecting tests marked
# `pleasefixme`.
# NOTE(review): IS_OPTIONAL is true although the job id carries no OPTIONAL_
# prefix and the id is still listed (uncommented) in Nemo_CICD_Test's `needs`
# — confirm this combination is intended.
L0_Unit_Tests_CPU_Lightning:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true
L0_Unit_Tests_CPU_Ohers:
L0_Unit_Tests_CPU_Others:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
Expand Down Expand Up @@ -4966,22 +5086,72 @@ jobs:
rm -rf examples/llm/gpt_pretrain_results
rm -rf examples/llm/gpt_index_mappings
# Optional L2 check for NeMo 2 SSM pretraining: runs
# megatron_ssm_pretraining.py on 1 device for 10 steps against the
# toy_ssm_dataset path, writing to the cicd_test_pretrain experiment dir.
# AFTER_SCRIPT deletes that experiment directory with `rm -rf`.
# Runs when this job id is in `test_to_run` or `all` is 'true'; IS_OPTIONAL
# is true.
OPTIONAL_L2_NeMo_2_SSM_Pretraining:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/llm/gpt/model/megatron_ssm_pretraining.py \
--devices 1 \
--experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain \
--max-steps 10 \
--data-path /home/TestData/nlp/megatron_mamba/toy_ssm_dataset/legal_pile_text_document
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain
IS_OPTIONAL: true

# Optional L2 check for NeMo 2 SSM finetuning: runs
# megatron_ssm_finetuning.py on 1 device for 10 steps, loading the model from
# model_optim_rng.pt and writing to the cicd_test_sft experiment dir.
# AFTER_SCRIPT deletes that experiment directory with `rm -rf`.
# Runs when this job id is in `test_to_run` or `all` is 'true'; IS_OPTIONAL
# is true.
OPTIONAL_L2_NeMo_2_SSM_Finetuning:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/llm/gpt/model/megatron_ssm_finetuning.py \
--devices 1 \
--experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft \
--max-steps 10 \
--model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft
IS_OPTIONAL: true

Nemo_CICD_Test:
needs:
- pre-flight
- gpu-test
- cicd-test-container-setup
- L0_Unit_Tests_GPU
- L0_Unit_Tests_CPU_ASR
- L0_Unit_Tests_CPU_Audio
- L0_Unit_Tests_CPU_Common

#- OPTIONAL_L0_Unit_Tests_GPU_ASR
#- OPTIONAL_L0_Unit_Tests_GPU_Audio
- L0_Unit_Tests_GPU_Common
- L0_Unit_Tests_GPU_LLM
- L0_Unit_Tests_GPU_Multimodal
#- OPTIONAL_L0_Unit_Tests_GPU_NLP
#- OPTIONAL_L0_Unit_Tests_GPU_TTS
#- OPTIONAL_L0_Unit_Tests_GPU_Core
- L0_Unit_Tests_GPU_Hydra
#- OPTIONAL_L0_Unit_Tests_GPU_Lightning
- L0_Unit_Tests_GPU_Others

#- OPTIONAL_L0_Unit_Tests_CPU_ASR
#- OPTIONAL_L0_Unit_Tests_CPU_Audio
#- OPTIONAL_L0_Unit_Tests_CPU_Common
- L0_Unit_Tests_CPU_LLM
- L0_Unit_Tests_CPU_Multimodal
- L0_Unit_Tests_CPU_NLP
- L0_Unit_Tests_CPU_TTS
- L0_Unit_Tests_CPU_Core
#- OPTIONAL_L0_Unit_Tests_CPU_NLP
#- OPTIONAL_L0_Unit_Tests_CPU_TTS
#- OPTIONAL_L0_Unit_Tests_CPU_Core
- L0_Unit_Tests_CPU_Hydra
- L0_Unit_Tests_CPU_Lightning
- L0_Unit_Tests_CPU_Ohers
- L0_Unit_Tests_CPU_Others

- L2_Community_LLM_Checkpoints_tests_Bert
- L2_Community_LLM_Checkpoints_tests_Mamba2
- L2_Community_LLM_Checkpoints_tests_Llama
Expand Down Expand Up @@ -5083,6 +5253,8 @@ jobs:
#- OPTIONAL_L2_Stable_Diffusion_Training
- L2_NeMo_2_GPT_Pretraining_no_transformer_engine
- L2_NeMo_2_GPT_DDP_Param_Parity_check
#- OPTIONAL_L2_NeMo_2_SSM_Pretraining
#- OPTIONAL_L2_NeMo_2_SSM_Finetuning
if: always()
runs-on: ubuntu-latest
steps:
Expand Down

0 comments on commit a39f068

Please sign in to comment.