From c1f5bb3cb146a310047f7c535d4be6432715dd28 Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Fri, 25 Apr 2025 09:36:17 -0700
Subject: [PATCH 1/4] Add requirements.txt to examples that are missing it

Signed-off-by: Dmitry Rogozhkin
---
 fast_neural_style/requirements.txt | 3 +++
 fx/requirements.txt                | 2 ++
 regression/requirements.txt        | 1 +
 super_resolution/requirements.txt  | 3 +++
 4 files changed, 9 insertions(+)
 create mode 100644 fast_neural_style/requirements.txt
 create mode 100644 fx/requirements.txt
 create mode 100644 regression/requirements.txt
 create mode 100644 super_resolution/requirements.txt

diff --git a/fast_neural_style/requirements.txt b/fast_neural_style/requirements.txt
new file mode 100644
index 0000000000..cef06d7884
--- /dev/null
+++ b/fast_neural_style/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+torch
+torchvision
diff --git a/fx/requirements.txt b/fx/requirements.txt
new file mode 100644
index 0000000000..ac988bdf84
--- /dev/null
+++ b/fx/requirements.txt
@@ -0,0 +1,2 @@
+torch
+torchvision
diff --git a/regression/requirements.txt b/regression/requirements.txt
new file mode 100644
index 0000000000..12c6d5d5ea
--- /dev/null
+++ b/regression/requirements.txt
@@ -0,0 +1 @@
+torch
diff --git a/super_resolution/requirements.txt b/super_resolution/requirements.txt
new file mode 100644
index 0000000000..877dcb484e
--- /dev/null
+++ b/super_resolution/requirements.txt
@@ -0,0 +1,3 @@
+six
+torch
+torchvision

From 6a258509d12c3184a3bc539a922b81a5966504d9 Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Fri, 25 Apr 2025 15:30:14 -0700
Subject: [PATCH 2/4] Update numpy requirement for reinforcement_learning to be <2

The current version of the example requires `numpy<2`; otherwise the
following error can be seen:

```
AttributeError: module 'numpy' has no attribute 'bool8'. Did you mean: 'bool'?
```

Signed-off-by: Dmitry Rogozhkin
---
 reinforcement_learning/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reinforcement_learning/requirements.txt b/reinforcement_learning/requirements.txt
index cd32b22fec..bd262b2733 100644
--- a/reinforcement_learning/requirements.txt
+++ b/reinforcement_learning/requirements.txt
@@ -1,4 +1,4 @@
 torch
-numpy
+numpy<2
 gym
 pygame

From 8cd601a45fdffd8d8b1c6ba9eab933e072159509 Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Fri, 25 Apr 2025 16:19:07 -0700
Subject: [PATCH 3/4] Update torch requirement for time and word examples to be <2.6

The current versions of these examples require `torch<2.6`; otherwise the
following error can be seen:

```
  File "/pytorch/examples/time_sequence_prediction/train.py", line 47, in <module>
    data = torch.load('traindata.pt')
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/pytorch/examples/time_sequence_prediction/.venv/lib/python3.12/site-packages/torch/serialization.py", line 1524, in load
    raise pickle.UnpicklingError(_get_wo_message(str(e))) from None
```

Signed-off-by: Dmitry Rogozhkin
---
 time_sequence_prediction/requirements.txt | 2 +-
 word_language_model/requirements.txt      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/time_sequence_prediction/requirements.txt b/time_sequence_prediction/requirements.txt
index 52f936ea1d..a1ba01197c 100644
--- a/time_sequence_prediction/requirements.txt
+++ b/time_sequence_prediction/requirements.txt
@@ -1,2 +1,2 @@
-torch
+torch<2.6
 matplotlib
diff --git a/word_language_model/requirements.txt b/word_language_model/requirements.txt
index 12c6d5d5ea..43dbf9ee52 100644
--- a/word_language_model/requirements.txt
+++ b/word_language_model/requirements.txt
@@ -1 +1 @@
-torch
+torch<2.6

From 1203511e59f922d4fce0a529ac09771501fe4606 Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Fri, 25 Apr 2025 09:49:12 -0700
Subject: [PATCH 4/4] Respect each example's requirements and use uv

This commit introduces a few changes to CI by modifying `run_*_examples.sh`
and the respective github workflows:
* Switched to uv
* Added setup and teardown stages for tests (`start()` and `stop()` functions
  wrapping the test bodies; these are called automatically)
* Setup (`start()`) installs example dependencies and, optionally (if
  `VIRTUAL_ENV=.venv` is passed), creates a uv virtual environment
* Teardown (`stop()`) removes the uv virtual environment if one was created
  (to save space)
* If no `VIRTUAL_ENV` is set, the scripts expect to be executed in an existing
  virtual environment (`python -m venv`, `uv venv` or `conda`). In this case
  example dependencies are installed into that environment, potentially
  reinstalling existing packages (including `torch`!)
* Dropped automated detection of the CUDA platform. Scripts now require
  `USE_CUDA=True` to be passed explicitly
* Added a `PIP_INSTALL_ARGS` environment variable which is passed to the
  `uv pip install` call for each example's dependencies. This allows adjusting
  the torch index and other pip options.

Execute all tests in the current virtual environment (might reinstall packages):
```
./run_distributed_examples.sh
```
Execute all tests, creating a separate environment for each example:
```
VIRTUAL_ENV=.venv ./run_distributed_examples.sh
```
Run with CUDA:
```
USE_CUDA=True ./run_distributed_examples.sh
```
Adjust the pip index:
```
PIP_INSTALL_ARGS="--pre -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html" \
  ./run_distributed_examples.sh
```

Signed-off-by: Dmitry Rogozhkin
---
 .github/workflows/main_distributed.yaml |  10 +-
 .github/workflows/main_python.yml       |  14 +--
 CONTRIBUTING.md                         |   4 +-
 run_distributed_examples.sh             |  46 ++++---
 run_python_examples.sh                  | 159 ++++++++++++------------
 utils.sh                                |  41 ++++--
 6 files changed, 150 insertions(+), 124 deletions(-)

diff --git a/.github/workflows/main_distributed.yaml b/.github/workflows/main_distributed.yaml
index cbd3c14e9a..b70da3617a 100644
--- a/.github/workflows/main_distributed.yaml
+++ b/.github/workflows/main_distributed.yaml
@@ -22,12 +22,14 @@ jobs:
       with:
         python-version: 3.8
     - name: Install PyTorch
-      run: |
-        python -m pip install --upgrade pip
-        pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu118/torch_nightly.html
+      uses: astral-sh/setup-uv@v6
     - name: Run Tests
+      env:
+        USE_CUDA: 'True'
+        VIRTUAL_ENV: '.venv'
+        PIP_INSTALL_ARGS: '--pre -f https://download.pytorch.org/whl/nightly/cu118/torch_nightly.html'
       run: |
-        ./run_distributed_examples.sh "run_all,clean"
+        ./run_distributed_examples.sh
     - name: Open issue on failure
       if: ${{ failure() && github.event_name == 'schedule' }}
       uses: rishabhgupta/git-action-issue@v2
diff --git a/.github/workflows/main_python.yml b/.github/workflows/main_python.yml
index ffc0c4deff..9c6a58d2d5 100644
--- a/.github/workflows/main_python.yml
+++ b/.github/workflows/main_python.yml
@@ -21,16 +21,14 @@ jobs:
       uses: actions/setup-python@v5
       with:
         python-version: '3.10'
-    - name: Install PyTorch
-      run: |
-        python -m pip install --upgrade pip
-        # Install CPU-based pytorch
-        pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
-        # Maybe use the CUDA 10.2 version instead?
-        # pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
+    - name: Install uv
+      uses: astral-sh/setup-uv@v6
     - name: Run Tests
+      env:
+        VIRTUAL_ENV: '.venv'
+        PIP_INSTALL_ARGS: '--pre -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html'
       run: |
-        ./run_python_examples.sh "install_deps,run_all,clean"
+        ./run_python_examples.sh
     - name: Open issue on failure
       if: ${{ failure() && github.event_name == 'schedule' }}
       uses: rishabhgupta/git-action-issue@v2
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e0e0e5ac90..c0382e7ead 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -40,8 +40,8 @@ If you're new, we encourage you to take a look at issues tagged with [good first
 
 1. Fork the repo and create your branch from `main`.
 2. Make sure you have a GPU-enabled machine, either locally or in the cloud. `g4dn.4xlarge` is a good starting point on AWS.
 3. Make your code change.
-4. First, install all dependencies with `./run_python_examples.sh "install_deps"`.
-5. Then, make sure that `./run_python_examples.sh` passes locally by running the script end to end.
+4. Install `uv`.
+5. Then, make sure that `VIRTUAL_ENV=.venv ./run_python_examples.sh` passes locally by running the script end to end.
 6. If you haven't already, complete the Contributor License Agreement ("CLA").
 7. Address any feedback in code review promptly.
diff --git a/run_distributed_examples.sh b/run_distributed_examples.sh
index c2260d7c78..e1f579c072 100755
--- a/run_distributed_examples.sh
+++ b/run_distributed_examples.sh
@@ -4,16 +4,30 @@
 # The purpose is just as an integration test, not to actually train models in any meaningful way.
 # For that reason, most of these set epochs = 1 and --dry-run.
 #
-# Optionally specify a comma separated list of examples to run.
-# can be run as:
-# ./run_python_examples.sh "install_deps,run_all,clean"
-# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
-# Expects pytorch, torchvision to be installed.
+# Optionally specify a comma-separated list of examples to run. Can be run as:
+# * To run all examples:
+#   ./run_distributed_examples.sh
+# * To run specific examples:
+#   ./run_distributed_examples.sh "distributed/tensor_parallelism,distributed/ddp"
+#
+# To test examples on a CUDA accelerator, run as:
+#   USE_CUDA=True ./run_distributed_examples.sh
+#
+# The script requires uv to be installed. When executed, the script installs prerequisites from
+# `requirements.txt` for each example. If run within an activated virtual environment (uv venv,
+# python -m venv, conda), this might reinstall some of the packages. To change the pip installation
+# index or to pass additional pip install options, run as:
+#   PIP_INSTALL_ARGS="--pre -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html" \
+#     ./run_distributed_examples.sh
+#
+# To force the script to create a virtual environment for each example, run as:
+#   VIRTUAL_ENV=".venv" ./run_distributed_examples.sh
+# The script will remove the environments it creates in a teardown step after executing each example.
 
 BASE_DIR="$(pwd)/$(dirname $0)"
 source $BASE_DIR/utils.sh
 
-USE_CUDA=$(python -c "import torch; print(torch.cuda.is_available())")
+USE_CUDA=${USE_CUDA:-False}
 case $USE_CUDA in
   "True")
     echo "using cuda"
@@ -30,21 +44,19 @@ case $USE_CUDA in
     ;;
 esac
 
-function distributed() {
-  start
-  bash tensor_parallelism/run_example.sh tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed"
-  bash tensor_parallelism/run_example.sh tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed"
-  bash tensor_parallelism/run_example.sh tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed"
-  python ddp/main.py || error "ddp example failed"
+function distributed_tensor_parallelism() {
+  uv run bash run_example.sh tensor_parallel_example.py || error "tensor parallel example failed"
+  uv run bash run_example.sh sequence_parallel_example.py || error "sequence parallel example failed"
+  uv run bash run_example.sh fsdp_tp_example.py || error "2D parallel example failed"
 }
 
-function clean() {
-  cd $BASE_DIR
-  echo "running clean to remove cruft"
+function distributed_ddp() {
+  uv run main.py || error "ddp example failed"
 }
 
 function run_all() {
-  distributed
+  run distributed/tensor_parallelism
+  run distributed/ddp
 }
 
 # by default, run all examples
@@ -54,7 +66,7 @@ else
   for i in $(echo $EXAMPLES | sed "s/,/ /g")
   do
     echo "Starting $i"
-    $i
+    run $i
     echo "Finished $i, status $?"
   done
 fi
diff --git a/run_python_examples.sh b/run_python_examples.sh
index 0e06e4cfc0..d12fb60021 100755
--- a/run_python_examples.sh
+++ b/run_python_examples.sh
@@ -1,19 +1,33 @@
-#!/usr/bin/env bash
+#!/bin/bash
 #
 # This script runs through the code in each of the python examples.
 # The purpose is just as an integration test, not to actually train models in any meaningful way.
 # For that reason, most of these set epochs = 1 and --dry-run.
 #
-# Optionally specify a comma separated list of examples to run.
-# can be run as:
-# ./run_python_examples.sh "install_deps,run_all,clean"
-# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
-# Expects pytorch, torchvision to be installed.
+# Optionally specify a comma-separated list of examples to run. Can be run as:
+# * To run all examples:
+#   ./run_python_examples.sh
+# * To run a few specific examples:
+#   ./run_python_examples.sh "dcgan,fast_neural_style"
+#
+# To test examples on a CUDA accelerator, run as:
+#   USE_CUDA=True ./run_python_examples.sh
+#
+# The script requires uv to be installed. When executed, the script installs prerequisites from
+# `requirements.txt` for each example. If run within an activated virtual environment (uv venv,
+# python -m venv, conda), this might reinstall some of the packages. To change the pip installation
+# index or to pass additional pip install options, run as:
+#   PIP_INSTALL_ARGS="--pre -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html" \
+#     ./run_python_examples.sh
+#
+# To force the script to create a virtual environment for each example, run as:
+#   VIRTUAL_ENV=".venv" ./run_python_examples.sh
+# The script will remove the environments it creates in a teardown step after executing each example.
 
 BASE_DIR="$(pwd)/$(dirname $0)"
 source $BASE_DIR/utils.sh
 
-USE_CUDA=$(python -c "import torchvision, torch; print(torch.cuda.is_available())")
+USE_CUDA=${USE_CUDA:-False}
 case $USE_CUDA in
   "True")
     echo "using cuda"
@@ -31,24 +45,21 @@ case $USE_CUDA in
 esac
 
 function dcgan() {
-  start
-  python main.py --dataset fake $CUDA_FLAG --mps --dry-run || error "dcgan failed"
+  uv run main.py --dataset fake $CUDA_FLAG --mps --dry-run || error "dcgan failed"
 }
 
 function fast_neural_style() {
-  start
   if [ ! -d "saved_models" ]; then
     echo "downloading saved models for fast neural style"
-    python download_saved_models.py
+    uv run download_saved_models.py
   fi
 
   test -d "saved_models" || { error "saved models not found"; return; }
   echo "running fast neural style model"
-  python neural_style/neural_style.py eval --content-image images/content-images/amber.jpg --model saved_models/candy.pth --output-image images/output-images/amber-candy.jpg --cuda $CUDA --mps || error "neural_style.py failed"
+  uv run neural_style/neural_style.py eval --content-image images/content-images/amber.jpg --model saved_models/candy.pth --output-image images/output-images/amber-candy.jpg --cuda $CUDA --mps || error "neural_style.py failed"
 }
 
 function imagenet() {
-  start
   if [[ ! -d "sample/val" || ! -d "sample/train" ]]; then
     mkdir -p sample/val/n
     mkdir -p sample/train/n
@@ -56,110 +67,94 @@ function imagenet() {
     mv Socks-clinton.jpg sample/train/n
     cp sample/train/n/* sample/val/n/
   fi
-  python main.py --epochs 1 sample/ || error "imagenet example failed"
+  uv run main.py --epochs 1 sample/ || error "imagenet example failed"
 }
 
 function language_translation() {
-  start
-  python -m spacy download en || error "couldn't download en package from spacy"
-  python -m spacy download de || error "couldn't download de package from spacy"
-  python main.py -e 1 --enc_layers 1 --dec_layers 1 --backend cpu --logging_dir output/ --dry_run || error "language translation example failed"
+  uv run -m spacy download en || error "couldn't download en package from spacy"
+  uv run -m spacy download de || error "couldn't download de package from spacy"
+  uv run main.py -e 1 --enc_layers 1 --dec_layers 1 --backend cpu --logging_dir output/ --dry_run || error "language translation example failed"
 }
 
 function mnist() {
-  start
-  python main.py --epochs 1 --dry-run || error "mnist example failed"
+  uv run main.py --epochs 1 --dry-run || error "mnist example failed"
 }
 
 function mnist_forward_forward() {
-  start
-  python main.py --epochs 1 --no_mps --no_cuda || error "mnist forward forward failed"
+  uv run main.py --epochs 1 --no_mps --no_cuda || error "mnist forward forward failed"
 }
 
 function mnist_hogwild() {
-  start
-  python main.py --epochs 1 --dry-run $CUDA_FLAG || error "mnist hogwild failed"
+  uv run main.py --epochs 1 --dry-run $CUDA_FLAG || error "mnist hogwild failed"
 }
 
 function mnist_rnn() {
-  start
-  python main.py --epochs 1 --dry-run || error "mnist rnn example failed"
+  uv run main.py --epochs 1 --dry-run || error "mnist rnn example failed"
 }
 
 function regression() {
-  start
-  python main.py --epochs 1 $CUDA_FLAG || error "regression failed"
+  uv run main.py --epochs 1 $CUDA_FLAG || error "regression failed"
 }
 
 function siamese_network() {
-  start
-  python main.py --epochs 1 --dry-run || error "siamese network example failed"
+  uv run main.py --epochs 1 --dry-run || error "siamese network example failed"
 }
 
 function reinforcement_learning() {
-  start
-  python reinforce.py || error "reinforcement learning reinforce failed"
-  python actor_critic.py || error "reinforcement learning actor_critic failed"
+  uv run reinforce.py || error "reinforcement learning reinforce failed"
+  uv run actor_critic.py || error "reinforcement learning actor_critic failed"
 }
 
 function snli() {
-  start
   echo "installing 'en' model if not installed"
-  python -m spacy download en || { error "couldn't download 'en' model needed for snli"; return; }
+  uv run -m spacy download en || { error "couldn't download 'en' model needed for snli"; return; }
   echo "training..."
-  python train.py --epochs 1 --dev_every 1 --no-bidirectional --dry-run || error "couldn't train snli"
+  uv run train.py --epochs 1 --dev_every 1 --no-bidirectional --dry-run || error "couldn't train snli"
 }
 
 function fx() {
-  start
-  # python custom_tracer.py || error "fx custom tracer has failed" UnboundLocalError: local variable 'tabulate' referenced before assignment
-  python invert.py || error "fx invert has failed"
-  python module_tracer.py || error "fx module tracer has failed"
-  python primitive_library.py || error "fx primitive library has failed"
-  python profiling_tracer.py || error "fx profiling tracer has failed"
-  python replace_op.py || error "fx replace op has failed"
-  python subgraph_rewriter_basic_use.py || error "fx subgraph has failed"
-  python wrap_output_dynamically.py || error "vmap output dynamically has failed"
+  # uv run custom_tracer.py || error "fx custom tracer has failed" UnboundLocalError: local variable 'tabulate' referenced before assignment
+  uv run invert.py || error "fx invert has failed"
+  uv run module_tracer.py || error "fx module tracer has failed"
+  uv run primitive_library.py || error "fx primitive library has failed"
+  uv run profiling_tracer.py || error "fx profiling tracer has failed"
+  uv run replace_op.py || error "fx replace op has failed"
+  uv run subgraph_rewriter_basic_use.py || error "fx subgraph has failed"
+  uv run wrap_output_dynamically.py || error "vmap output dynamically has failed"
 }
 
 function super_resolution() {
-  start
-  python main.py --upscale_factor 3 --batchSize 4 --testBatchSize 100 --nEpochs 1 --lr 0.001 --mps || error "super resolution failed"
+  uv run main.py --upscale_factor 3 --batchSize 4 --testBatchSize 100 --nEpochs 1 --lr 0.001 --mps || error "super resolution failed"
 }
 
 function time_sequence_prediction() {
-  start
-  python generate_sine_wave.py || { error "generate sine wave failed"; return; }
-  python train.py --steps 2 || error "time sequence prediction training failed"
+  uv run generate_sine_wave.py || { error "generate sine wave failed"; return; }
+  uv run train.py --steps 2 || error "time sequence prediction training failed"
 }
 
 function vae() {
-  start
-  python main.py --epochs 1 || error "vae failed"
+  uv run main.py --epochs 1 || error "vae failed"
 }
 
 function vision_transformer() {
-  start
-  python main.py --epochs 1 --dry-run || error "vision transformer example failed"
+  uv run main.py --epochs 1 --dry-run || error "vision transformer example failed"
 }
 
 function word_language_model() {
-  start
-  python main.py --epochs 1 --dry-run $CUDA_FLAG --mps || error "word_language_model failed"
+  uv run main.py --epochs 1 --dry-run $CUDA_FLAG --mps || error "word_language_model failed"
 }
 
 function gcn() {
-  start
-  python main.py --epochs 1 --dry-run || error "graph convolutional network failed"
+  uv run main.py --epochs 1 --dry-run || error "graph convolutional network failed"
 }
 
 function gat() {
-  start
-  python main.py --epochs 1 --dry-run || error "graph attention network failed"
+  uv run main.py --epochs 1 --dry-run || error "graph attention network failed"
 }
 
-function clean() {
+eval "base_$(declare -f stop)"
+
+function stop() {
   cd $BASE_DIR
-  echo "running clean to remove cruft"
   rm -rf dcgan/fake_samples_epoch_000.png \
     dcgan/netD_epoch_0.pth \
     dcgan/netG_epoch_0.pth \
@@ -170,7 +165,7 @@
     imagenet/lsun/ \
     imagenet/model_best.pth.tar \
     imagenet/sample/ \
-    language_translation/output/ \
+    language_translation/output/ \
     snli/.data/ \
     snli/.vector_cache/ \
     snli/results/ \
@@ -183,30 +178,32 @@ function clean() {
     gat/cora/ || error "couldn't clean up some files"
 
   git checkout fast_neural_style/images/output-images/amber-candy.jpg || error "couldn't clean up fast neural style image"
+
+  base_stop "$1"
 }
 
 function run_all() {
   # cpp moved to `run_cpp_examples.sh```
-  dcgan
+  run dcgan
   # distributed moved to `run_distributed_examples.sh`
-  fast_neural_style
-  imagenet
+  run fast_neural_style
+  run imagenet
   # language_translation
-  mnist
-  mnist_forward_forward
-  mnist_hogwild
-  mnist_rnn
-  regression
-  reinforcement_learning
-  siamese_network
-  super_resolution
-  time_sequence_prediction
-  vae
+  run mnist
+  run mnist_forward_forward
+  run mnist_hogwild
+  run mnist_rnn
+  run regression
+  run reinforcement_learning
+  run siamese_network
+  run super_resolution
+  run time_sequence_prediction
+  run vae
   # vision_transformer - example broken see https://github.com/pytorch/examples/issues/1184 and https://github.com/pytorch/examples/pull/1258 for more details
-  word_language_model
-  fx
-  gcn
-  gat
+  run word_language_model
+  run fx
+  run gcn
+  run gat
 }
 
 # by default, run all examples
@@ -216,7 +213,7 @@ else
   for i in $(echo $EXAMPLES | sed "s/,/ /g")
   do
     echo "Starting $i"
-    $i
+    run $i
     echo "Finished $i, status $?"
   done
 fi
diff --git a/utils.sh b/utils.sh
index b7ed613e6e..72b6e44cd8 100644
--- a/utils.sh
+++ b/utils.sh
@@ -21,18 +21,35 @@ function error() {
   fi
 }
 
-function install_deps() {
-  echo "installing requirements"
-  cat $BASE_DIR/*/requirements.txt | \
-    sort -u | \
-    # testing the installed version of torch, so don't pip install it.
-    grep -vE '^torch$' | \
-    pip install -r /dev/stdin || \
-    { error "failed to install dependencies"; exit 1; }
-}
-
 function start() {
-  EXAMPLE=${FUNCNAME[1]}
-  cd $BASE_DIR/$EXAMPLE
+  EXAMPLE=$1
+  cd $BASE_DIR/$EXAMPLE || { error "$EXAMPLE: no such example"; return 1; }
+  echo "Install dependencies for $EXAMPLE"
+  # Setting VIRTUAL_ENV=.venv externally will create a uv virtual environment
+  # for each example in start() and remove it in stop(). Note that this environment
+  # variable also forces other uv commands such as `uv pip...` and `uv run...` to
+  # use the specified environment.
+  if [ "$VIRTUAL_ENV" = ".venv" ]; then
+    uv venv || { error "$EXAMPLE: failed to create virtual environment"; return 1; }
+  fi
+  uv pip install -r requirements.txt $PIP_INSTALL_ARGS || { error "$EXAMPLE: failed to install requirements"; return 1; }
   echo "Running example: $EXAMPLE"
 }
+
+function stop() {
+  EXAMPLE=$1
+  if [ "$VIRTUAL_ENV" = ".venv" ]; then
+    cd $BASE_DIR/$EXAMPLE && rm -rf .venv
+  fi
+}
+
+function run() {
+  EXAMPLE=$1
+  if start $EXAMPLE; then
+    # drop the trailing slash (occurs due to auto-completion in bash interactive mode)
+    # replace slashes with underscores: this allows calling nested examples
+    EXAMPLE_FN=$(echo $EXAMPLE | sed "s@/\$@@" | sed 's@/@_@')
+    $EXAMPLE_FN
+  fi
+  stop $EXAMPLE
+}
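
Note: the `eval "base_$(declare -f stop)"` line added to `run_python_examples.sh` above works because `declare -f stop` prints the current definition of `stop()`; prefixing that text with `base_` and passing it to `eval` re-registers the original body under the name `base_stop()`, so the overriding `stop()` can still delegate to it. A minimal standalone sketch of this pattern (illustrative only, not part of the patch; the function bodies and messages are made up):
```
#!/bin/bash
# Generic teardown, as it might be defined in a sourced helper like utils.sh.
function stop() {
    echo "generic teardown for $1"
}

# Save the current definition of stop() under the name base_stop():
# `declare -f stop` prints "stop () { ... }"; prefixing the output with
# "base_" and eval-ing it defines base_stop() with the same body.
eval "base_$(declare -f stop)"

# Override stop() with script-specific cleanup, then delegate to the
# saved original so the generic teardown still runs.
function stop() {
    echo "extra cleanup for $1"
    base_stop "$1"
}

stop dcgan
# prints:
#   extra cleanup for dcgan
#   generic teardown for dcgan
```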