Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/actions/test-template/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,6 @@ runs:
with:
path: cache-mount
key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }}
restore-keys: |
${{ runner.os }}-uv-

- name: Restore Docker cache mounts
uses: reproducible-containers/buildkit-cache-dance@5b81f4d29dc8397a7d341dba3aeecc7ec54d6361
Expand Down
16 changes: 3 additions & 13 deletions .github/workflows/install-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,12 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install CUDA Toolkit for Ubuntu 22.04
- name: Install CUDA Toolkit and cuDNN headers for Ubuntu 22.04
run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cuda-toolkit-12-8
sudo apt-get clean

- name: Install cuDNN headers
run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cudnn-cuda-12
sudo apt-get install -y cuda-toolkit-12-8 cudnn-cuda-12 libcudnn9-cuda-12
sudo apt-get clean

- name: Install Python ${{ matrix.python-version }}
Expand All @@ -69,6 +61,7 @@ jobs:

- name: Install project
run: |
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
pip install --pre --no-cache-dir torch pybind11 wheel_stub numpy
pip install --pre --no-cache-dir --no-build-isolation .

Expand Down Expand Up @@ -246,9 +239,6 @@ jobs:

- name: Install dependencies with UV
run: |

export PATH="${UV_PROJECT_ENVIRONMENT}/bin/:$PATH"

uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages

uv sync --link-mode copy --locked --only-group build --no-cache
Expand Down
9 changes: 4 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@ dependencies = [
"tqdm>=4.67.1",
"hydra-core>1.3,<=1.3.2",
"megatron-core>=0.14.0a0,<0.15.0",
"nvidia-modelopt[torch,onnx]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
"nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
"nvidia-resiliency-ext>=0.4.0a0,<0.5.0; sys_platform != 'darwin'",
"transformer-engine[pytorch]>=2.5.0a0,<2.6.0; sys_platform != 'darwin'"
"transformer-engine[pytorch]>=2.6.0a0,<2.8.0; sys_platform != 'darwin'"
]


Expand All @@ -100,12 +100,11 @@ version = { attr = "megatron.bridge.package_info.__version__" }
no-build-isolation-package = ["transformer-engine", "transformer-engine-torch"]
prerelease = "allow"

# # uv.sources allows us to override dependencies with VCS commits.
# # Lets use this only for debugging purposes, but not for production (main).
# uv.sources allows us to override dependencies with VCS commits.
# Lets use this only for debugging purposes, but not for production (main).
# [tool.uv.sources]
# transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "5f1142e8c12172510d34709df3629be6f88dc993" } # on release_v2.6.0


[project.optional-dependencies]
recipes = [
"nemo-run>=0.5.0a0,<0.6.0",
Expand Down
10 changes: 2 additions & 8 deletions tests/unit_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,9 @@ def ensure_test_data(tmp_path_factory):

logger.info("Test data downloaded successfully.")

except ImportError as e:
logger.info(f"Failed to import download function: {e}")
except ValueError as e:
logger.error(e)
pytest.exit(f"Failed to download test data: {e}", returncode=1)
# Don't fail the tests, just warn
except Exception as e:
logger.info(f"Failed to download test data: {e}")
# Don't fail the tests, just warn
logger.error(f"Failed to download test data: {e}")
pytest.exit(f"Failed to download test data: {e}", returncode=1)
else:
logger.info(f"Test data already available at {data_path}")

Expand Down
Loading
Loading