Build zoom backend v2 #8

Open: wants to merge 8 commits into base: master
149 changes: 149 additions & 0 deletions .github/workflows/build_zoom_backend.yml
@@ -0,0 +1,149 @@
name: "Build Zoom wheel"

on:
workflow_dispatch:
inputs:
force_debug_with_tmate:
type: boolean
description: 'Run the build with tmate session'
required: false
default: false
debug_with_tmate:
type: boolean
description: 'Run the build with a tmate session ONLY in case of failure'
required: false
default: false
pull_request:
push:
branches:
- main

concurrency:
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
cancel-in-progress: true

jobs:
build:

strategy:
fail-fast: false
matrix:
include:
- name: "ubuntu-22.04"
runs-on: "nodai-amdgpu-mi250-x86-64"
# runs-on: "azure-cpubuilder-linux-scale"
# container: "rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0"

runs-on: ${{ matrix.runs-on }}

name: ${{ matrix.name }}

env:
CACHE_DIR: ${{ github.workspace }}/.container-cache
# either the PR number or `branch-N` where N always increments
CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}

defaults:
run:
shell: bash

permissions:
id-token: write
contents: write

container:
image: ${{ matrix.container }}

steps:
- name: "Check out repository"
uses: actions/[email protected]
with:
submodules: true

- name: Enable cache
uses: actions/cache/restore@v3
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}
restore-keys: linux-build-test-cpp-

# - name: "Setting up Python"
# run: |
# sudo apt update
# sudo apt install software-properties-common -y
# sudo add-apt-repository ppa:deadsnakes/ppa -y
# sudo apt install python3.11 python3-pip -y
# sudo apt-get install python3.11-dev python3.11-venv build-essential -y

- name: "Build PyTorch"
id: build
run: |

# curl -sSL https://raw.githubusercontent.com/mrodden/get-rocm/refs/heads/master/get-rocm.py -o get-rocm.py
# sudo python3.11 get-rocm.py --rocm-version 6.2.3

export CCACHE_DIR="${{ env.CACHE_DIR }}"
export CMAKE_C_COMPILER_LAUNCHER=ccache
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime,time_macros

python3.11 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
./build.sh

- name: "Audit"
id: audit
run: |

sudo apt install patchelf
source venv/bin/activate
pip install auditwheel
auditwheel repair -w dist --plat manylinux_2_39_x86_64 dist/torch*

- name: "Test"
id: test
run: |

# smoke test
python zoom_extension/examples/test.py
# device tests
PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh

cat zoom_test_errors.log
cat zoom_unimplemented_operators.log

- name: Save cache
uses: actions/cache/save@v3
if: ${{ !cancelled() }}
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}

- name: Upload artifacts
if: ${{ !cancelled() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.name }}_artifact
path: dist
if-no-files-found: warn

- name: Release current commit
if: ${{ !cancelled() }}
uses: ncipollo/[email protected]
with:
artifacts: "dist/torch*.whl"
token: "${{ secrets.GITHUB_TOKEN }}"
tag: "latest"
name: "latest"
removeArtifacts: false
allowUpdates: true
replacesArtifacts: true
makeLatest: true

- name: "Setup tmate session"
if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }}
uses: mxschmitt/[email protected]
with:
limit-access-to-actor: true
install-dependencies: ${{ startsWith(matrix.runs-on, 'macos') || startsWith(matrix.runs-on, 'windows') }}
11 changes: 9 additions & 2 deletions BUILD.bazel
@@ -9,7 +9,7 @@ load("@pytorch//tools/config:defs.bzl", "if_cuda")
load("@pytorch//:aten.bzl", "generate_aten", "intern_build_aten_ops")
load(":build.bzl", "GENERATED_AUTOGRAD_CPP", "GENERATED_AUTOGRAD_PYTHON", "define_targets")
load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "libtorch_python_cuda_sources", "libtorch_python_distributed_sources")
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources", "aten_ufunc_generated_zoom_sources")
load("//:tools/bazel.bzl", "rules")

define_targets(rules = rules)
@@ -104,6 +104,12 @@ generated_cuda_cpp = [
"aten/src/ATen/RegisterSparseCsrCUDA.cpp",
]

generated_zoom_cpp = [
"aten/src/ATen/ZoomFunctions.h",
"aten/src/ATen/ZoomFunctions_inl.h",
"aten/src/ATen/RegisterPrivateUse1.cpp",
]

generate_aten(
name = "generated_aten_cpp",
srcs = aten_generation_srcs,
@@ -112,7 +118,8 @@ generate_aten(
generated_cuda_cpp +
aten_ufunc_generated_cpu_sources("aten/src/ATen/{}") +
aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}") +
aten_ufunc_generated_cuda_sources("aten/src/ATen/{}") + [
aten_ufunc_generated_cuda_sources("aten/src/ATen/{}") +
aten_ufunc_generated_zoom_sources("aten/src/ATen/{}") + [
"aten/src/ATen/Declarations.yaml",
]
),
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -203,6 +203,7 @@ option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_ZOOM "Use ZOOM HIP Backend" OFF)
option(USE_CUDA "Use CUDA" ON)
cmake_dependent_option(
USE_XPU "Use XPU. Only available on Linux." ON
19 changes: 19 additions & 0 deletions aten/CMakeLists.txt
@@ -30,10 +30,13 @@ set(ATen_CUDA_SRCS_W_SORT_BY_KEY)
set(ATen_CUDA_TEST_SRCS)
set(ATen_CUDA_INCLUDE)
set(ATen_NVRTC_STUB_SRCS)
set(ATen_HIPRTC_STUB_SRCS)
set(ATen_HIP_SRCS)
set(ATen_ZOOM_SRCS)
set(ATen_HIP_SRCS_W_SORT_BY_KEY)
set(ATen_HIP_TEST_SRCS)
set(ATen_HIP_INCLUDE)
set(ATen_ZOOM_INCLUDE)
set(ATen_MPS_SRCS)
set(ATen_MPS_TEST_SRCS)
set(ATen_XPU_SRCS)
@@ -44,6 +47,7 @@ set(ATen_CPU_DEPENDENCY_LIBS)
set(ATen_XPU_DEPENDENCY_LIBS)
set(ATen_CUDA_DEPENDENCY_LIBS)
set(ATen_HIP_DEPENDENCY_LIBS)
set(ATen_ZOOM_DEPENDENCY_LIBS)
set(ATen_PUBLIC_CUDA_DEPENDENCY_LIBS)
set(ATen_PUBLIC_HIP_DEPENDENCY_LIBS)
set(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory")
@@ -70,6 +74,17 @@ if(USE_ROCM)
endif()
endif()

if(USE_ZOOM)
include(LoadHIP)
if(NOT PYTORCH_FOUND_HIP)
message(WARNING "Could not load HIP, setting USE_ZOOM = OFF")
set(USE_ZOOM OFF)
else()
message(STATUS "Loaded HIP, Zoom Enabled")
endif()
endif()


# Both CUDA and ROCM are enabled and found. Report an error.
if(USE_CUDA AND USE_ROCM)
message(FATAL_ERROR "Both CUDA and ROCm are enabled and found. PyTorch can only be built with either of them. Please turn one off by using either USE_CUDA=OFF or USE_ROCM=OFF.")
@@ -116,12 +131,14 @@ set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE)
set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
set(ATen_ZOOM_SRCS ${ATen_ZOOM_SRCS} PARENT_SCOPE)
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE)
set(ATen_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS} PARENT_SCOPE)
set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE)
set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)
set(ATen_HIPRTC_STUB_SRCS ${ATen_HIPRTC_STUB_SRCS} PARENT_SCOPE)
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
@@ -132,12 +149,14 @@ set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE)
set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE)
set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE)
set(ATen_ZOOM_INCLUDE ${ATen_ZOOM_INCLUDE} PARENT_SCOPE)
set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE)
set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_XPU_DEPENDENCY_LIBS ${ATen_XPU_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_ZOOM_DEPENDENCY_LIBS ${ATen_ZOOM_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
set(FLASH_ATTENTION_CUDA_SOURCES ${FLASH_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
28 changes: 18 additions & 10 deletions aten/src/ATen/AccumulateType.cpp
@@ -2,17 +2,20 @@

namespace at {

// TODO(Arham): exchange keys
c10::ScalarType toAccumulateType(c10::ScalarType type, c10::DeviceType device) {
switch (type) {
#define DEFINE_CASE(scalar_t, TypeNum) \
case ScalarType::TypeNum: \
switch (device) { \
case DeviceType::CUDA: \
return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::CUDA>>::value; \
case DeviceType::MPS: \
return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::MPS>>::value; \
default: \
return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::CPU>>::value; \
#define DEFINE_CASE(scalar_t, TypeNum) \
case ScalarType::TypeNum: \
switch (device) { \
case DeviceType::CUDA: \
return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::CUDA>>::value; \
case DeviceType::PrivateUse1: \
return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::PrivateUse1>>::value; \
case DeviceType::MPS: \
return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::MPS>>::value; \
default: \
return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::CPU>>::value; \
}

AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF_F8NZ(DEFINE_CASE)
@@ -23,7 +26,12 @@ c10::ScalarType toAccumulateType(c10::ScalarType type, c10::DeviceType device) {
}

c10::ScalarType toAccumulateType(c10::ScalarType type, bool is_cuda) {
return is_cuda ? toAccumulateType(type, c10::DeviceType::CUDA) : toAccumulateType(type, c10::DeviceType::CPU);
#ifndef USE_ZOOM
return is_cuda ? toAccumulateType(type, c10::DeviceType::CUDA) : toAccumulateType(type, c10::DeviceType::CPU);
#else
// TODO(Arham): exchange keys
return is_cuda ? toAccumulateType(type, c10::DeviceType::PrivateUse1) : toAccumulateType(type, c10::DeviceType::CPU);
#endif
}

}
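
For context, a minimal sketch of what this dispatch change means for callers (assuming a PyTorch build configured with `USE_ZOOM=ON`; the small `main` harness is illustrative and not part of this PR):

```cpp
#include <ATen/AccumulateType.h>

#include <iostream>

int main() {
  // Explicit-device query: the Zoom backend sits behind PrivateUse1,
  // and Half widens to Float for accumulation, as on CUDA.
  c10::ScalarType a =
      at::toAccumulateType(at::kHalf, c10::DeviceType::PrivateUse1);

  // Boolean overload: with USE_ZOOM defined, is_cuda=true now routes
  // through DeviceType::PrivateUse1 instead of DeviceType::CUDA.
  c10::ScalarType b = at::toAccumulateType(at::kHalf, /*is_cuda=*/true);

  std::cout << a << " " << b << '\n';  // prints "Float Float"
}
```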
31 changes: 30 additions & 1 deletion aten/src/ATen/AccumulateType.h
@@ -67,7 +67,12 @@ struct AccumulateType<T, false> {

template <typename T>
struct AccumulateType<T, true> {
using type = typename AccumulateTypeDevice<T, c10::DeviceType::CUDA>::type;
#ifndef USE_ZOOM
using type = typename AccumulateTypeDevice<T, c10::DeviceType::CUDA>::type;
#else
// TODO(Arham): exchange keys
using type = typename AccumulateTypeDevice<T, c10::DeviceType::PrivateUse1>::type;
#endif
};

template <typename T, c10::DeviceType device>
@@ -83,6 +88,8 @@ using acc_type = typename AccumulateType<T, is_cuda>::type;
};
#define MPS_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::MPS)
#define CUDA_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CUDA)
// TODO(Arham): exchange keys
#define ZOOM_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::PrivateUse1)
#define CPU_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CPU)

MPS_ACC_TYPE(BFloat16, float);
@@ -126,6 +133,28 @@ CUDA_ACC_TYPE(c10::complex<Half>, c10::complex<float>);
CUDA_ACC_TYPE(c10::complex<float>, c10::complex<float>);
CUDA_ACC_TYPE(c10::complex<double>, c10::complex<double>);

#if defined(__HIPCC__)
ZOOM_ACC_TYPE(half, float);
#endif
ZOOM_ACC_TYPE(BFloat16, float);
ZOOM_ACC_TYPE(Half, float);
ZOOM_ACC_TYPE(Float8_e5m2, float);
ZOOM_ACC_TYPE(Float8_e4m3fn, float);
ZOOM_ACC_TYPE(Float8_e5m2fnuz, float);
ZOOM_ACC_TYPE(Float8_e4m3fnuz, float);
ZOOM_ACC_TYPE(float, float);
ZOOM_ACC_TYPE(double, double);
ZOOM_ACC_TYPE(int8_t, int64_t);
ZOOM_ACC_TYPE(uint8_t, int64_t);
ZOOM_ACC_TYPE(char, int64_t);
ZOOM_ACC_TYPE(int16_t, int64_t);
ZOOM_ACC_TYPE(int32_t, int64_t);
ZOOM_ACC_TYPE(int64_t, int64_t);
ZOOM_ACC_TYPE(bool, bool);
ZOOM_ACC_TYPE(c10::complex<Half>, c10::complex<float>);
ZOOM_ACC_TYPE(c10::complex<float>, c10::complex<float>);
ZOOM_ACC_TYPE(c10::complex<double>, c10::complex<double>);

CPU_ACC_TYPE(BFloat16, float);
CPU_ACC_TYPE(Half, float);
CPU_ACC_TYPE(Float8_e5m2, float);
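
Since the `ZOOM_ACC_TYPE` table mirrors the CUDA one, the mapping can be verified at compile time. A minimal sketch (assuming a build with `USE_ZOOM=ON`, so `AccumulateType<T, true>` resolves through `DeviceType::PrivateUse1`):

```cpp
#include <ATen/AccumulateType.h>

#include <type_traits>

// These assertions exercise the new ZOOM_ACC_TYPE specializations:
// reduced-precision floats accumulate in float, integral types in int64_t.
static_assert(std::is_same_v<at::acc_type<at::Half, true>, float>,
              "Half accumulates as float on the Zoom (PrivateUse1) device");
static_assert(std::is_same_v<at::acc_type<at::BFloat16, true>, float>,
              "BFloat16 accumulates as float");
static_assert(std::is_same_v<at::acc_type<int32_t, true>, int64_t>,
              "32-bit ints accumulate as int64_t");
```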