Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions .github/workflows/_Metax-X86.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,6 @@ on:
default: 'true'


defaults:
run:
shell: bash


jobs:

check-bypass:
Expand Down Expand Up @@ -65,10 +60,10 @@ jobs:
# !!!!! SKIP IF NO METAX CHANGE !!!!
echo "=========== Checking PR Changes If METAX FULL CI Needed ==========="
change_numbers=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | wc -l)
# change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l)
change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true)
# change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l)
change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true)
change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l)
# change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true)
change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l)
# change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true)
git --no-pager diff --name-only remotes/origin/${BRANCH}

if [ $change_numbers -ne $change_backend ]; then
Expand Down
5 changes: 5 additions & 0 deletions backends/metax_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@ target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcFlashAttn.so)
target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcpti.so)

include_directories(BEFORE ${PADDLE_SOURCE_DIR})
include_directories(BEFORE ${CMAKE_SOURCE_DIR}/headers)

target_compile_definitions(
${TARGET_NAME}
Expand Down Expand Up @@ -826,8 +827,12 @@ add_custom_command(
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/python/
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/python/
COMMAND ${CMAKE_COMMAND} -E make_directory
${CMAKE_CURRENT_BINARY_DIR}/python/include/
COMMAND ${CMAKE_COMMAND} -E make_directory
${CMAKE_CURRENT_BINARY_DIR}/python/paddle_custom_device/
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/headers
${CMAKE_CURRENT_BINARY_DIR}/python/include/
COMMAND
${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.so
Expand Down
4 changes: 2 additions & 2 deletions backends/metax_gpu/change_patch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

rm -r ../../Paddle/third_party/eigen3
cd patch
unzip mcEigen_3.4.0_paddle_final.zip
mv mcEigen_3.4.0_paddle_final eigen3
unzip Eigen_3.4.0_paddle.zip
mv Eigen_3.4.0_paddle eigen3
cd ..
cp -r patch/eigen3/ ../../Paddle/third_party/eigen3
rm -r patch/eigen3
Expand Down
2 changes: 1 addition & 1 deletion backends/metax_gpu/compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ fi
echo "make_maca"
cd build
cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON
make_maca -j18
make_maca -j18 VERBOSE=1


echo "install whl"
Expand Down
148 changes: 148 additions & 0 deletions backends/metax_gpu/headers/paddle/phi/backends/dynload/cublas.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cublasXt.h>
#include <cublas_v2.h>
#include <cuda.h>
#if CUDA_VERSION >= 12030 && defined(__linux__)
#include <cublas_api.h>
#endif

#include <mutex> // NOLINT
#include <type_traits>

#include "paddle/phi/backends/dynload/dynamic_loader.h"
#include "paddle/phi/common/port.h"

namespace phi {
namespace dynload {

extern std::once_flag cublas_dso_flag;
extern void* cublas_dso_handle;

/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublas routine
* via operator overloading.
*
* note: default dynamic linked libs
*/
// Generates, for each cuBLAS symbol, a functor struct that lazily opens the
// cuBLAS shared library exactly once (std::call_once) and forwards every call
// through a dlsym lookup cached in a function-local static.
// Metax specifics: the leading "cu" prefix is rewritten to "mc", and any
// version suffix starting at the first '_' (e.g. "_v2") is stripped, so
// cublasSgemm_v2 resolves to the mcblasSgemm entry point in the MACA library.
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
  struct DynLoad__##__name {                                                 \
    template <typename... Args>                                              \
    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {  \
      using cublas_func =                                                    \
          decltype(::__name(std::declval<Args>()...)) (*)(Args...);          \
      std::call_once(cublas_dso_flag, []() {                                 \
        cublas_dso_handle = phi::dynload::GetCublasDsoHandle();              \
      });                                                                    \
      std::string replaced_name = #__name;                                   \
      replaced_name = replaced_name.replace(0, 2, "mc");                     \
      /* find() returns std::string::npos (a size_t); funnelling it       */ \
      /* through int and comparing to -1 is implementation-defined        */ \
      /* narrowing, so compare against npos directly instead.             */ \
      const std::string::size_type cut = replaced_name.find('_');            \
      if (cut != std::string::npos)                                          \
        replaced_name = replaced_name.substr(0, cut);                        \
      static void* p_##__name =                                              \
          dlsym(cublas_dso_handle, replaced_name.c_str());                   \
      return reinterpret_cast<cublas_func>(p_##__name)(args...);             \
    }                                                                        \
  };                                                                         \
  extern DynLoad__##__name __name

#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
__macro(cublasCaxpy_v2); \
__macro(cublasZaxpy_v2); \
__macro(cublasSscal_v2); \
__macro(cublasDscal_v2); \
__macro(cublasScopy_v2); \
__macro(cublasDcopy_v2); \
__macro(cublasSgemv_v2); \
__macro(cublasDgemv_v2); \
__macro(cublasCgemv_v2); \
__macro(cublasZgemv_v2); \
__macro(cublasSgemm_v2); \
__macro(cublasDgemm_v2); \
__macro(cublasCgemm_v2); \
__macro(cublasZgemm_v2); \
__macro(cublasHgemm); \
__macro(cublasSgemmEx); \
__macro(cublasSgeam); \
__macro(cublasDgeam); \
__macro(cublasStrsm_v2); \
__macro(cublasDtrsm_v2); \
__macro(cublasCtrsm_v2); \
__macro(cublasZtrsm_v2); \
__macro(cublasCreate_v2); \
__macro(cublasDestroy_v2); \
__macro(cublasSetStream_v2); \
__macro(cublasSetPointerMode_v2); \
__macro(cublasGetPointerMode_v2); \
__macro(cublasSgemmBatched); \
__macro(cublasDgemmBatched); \
__macro(cublasCgemmBatched); \
__macro(cublasZgemmBatched); \
__macro(cublasStrsmBatched); \
__macro(cublasDtrsmBatched); \
__macro(cublasCtrsmBatched); \
__macro(cublasZtrsmBatched); \
__macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \
__macro(cublasDgetriBatched); \
__macro(cublasCgetrfBatched); \
__macro(cublasCgetriBatched); \
__macro(cublasZgetrfBatched); \
__macro(cublasZgetriBatched); \
__macro(cublasSmatinvBatched); \
__macro(cublasDmatinvBatched); \
__macro(cublasCmatinvBatched); \
__macro(cublasZmatinvBatched); \
__macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched); \
__macro(cublasSdot_v2); \
__macro(cublasDdot_v2); \
__macro(cublasCdotc_v2); \
__macro(cublasZdotc_v2); \
__macro(cublasCdotu_v2); \
__macro(cublasZdotu_v2); \
__macro(cublasDotEx); \
__macro(cublasGemmEx); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); \
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode); \
__macro(cublasCgeam); \
__macro(cublasZgeam); \
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);

CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)

#if CUDA_VERSION >= 12030 && defined(__linux__)
#define CUBLAS_BLAS_ROUTINE_EACH_R5(__macro) \
__macro(cublasGemmStridedBatchedEx_64); \
__macro(cublasGemmEx_64); \
__macro(cublasSgemmEx_64);

CUBLAS_BLAS_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif

#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload
} // namespace phi
114 changes: 114 additions & 0 deletions backends/metax_gpu/headers/paddle/phi/backends/dynload/cublasLt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cublasLt.h>
#include <cuda.h>

#include <mutex> // NOLINT
#include <type_traits>

#include "paddle/phi/backends/dynload/dynamic_loader.h"
#include "paddle/phi/common/port.h"

namespace phi {
namespace dynload {

extern std::once_flag cublasLt_dso_flag;
extern void* cublasLt_dso_handle;

/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublasLt routine
* via operator overloading.
*
* note: default dynamic linked libs
*/
// Generates, for each cublasLt symbol, a functor struct that lazily opens the
// cublasLt shared library exactly once (std::call_once) and forwards every
// call through a dlsym lookup cached in a function-local static.
// Metax specifics: the leading "cu" prefix is rewritten to "mc", so e.g.
// cublasLtMatmul resolves to the mcblasLtMatmul entry point. Unlike the
// cuBLAS wrapper in cublas.h, no "_vN" version suffix is stripped here.
// (Comments must sit above the macro: a "//" comment inside a
// backslash-continued line would swallow the following spliced line.)
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \
}); \
std::string replaced_name = #__name; \
replaced_name = replaced_name.replace(0, 2, "mc"); \
static void* p_##__name = \
dlsym(cublasLt_dso_handle, replaced_name.c_str()); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
// APIs available after CUDA 11.1
#if CUDA_VERSION >= 11010 || defined(PADDLE_WITH_CUSTOM_DEVICE)
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatmulDescGetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixLayoutGetAttribute); \
__macro(cublasLtMatmulPreferenceCreate); \
__macro(cublasLtMatmulPreferenceDestroy); \
__macro(cublasLtMatmulPreferenceSetAttribute); \
__macro(cublasLtMatmulAlgoGetHeuristic); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute); \
__macro(cublasLtMatmulAlgoInit); \
__macro(cublasLtMatmulAlgoConfigSetAttribute); \
__macro(cublasLtMatmulAlgoConfigGetAttribute); \
__macro(cublasLtMatmulAlgoGetIds); \
__macro(cublasLtMatmulAlgoCapGetAttribute); \
__macro(cublasLtMatmulAlgoCheck);
// __macro(cublasLtGetCudartVersion);
#else
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatmulDescGetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixLayoutGetAttribute); \
__macro(cublasLtMatmulPreferenceCreate); \
__macro(cublasLtMatmulPreferenceDestroy); \
__macro(cublasLtMatmulPreferenceSetAttribute); \
__macro(cublasLtMatmulAlgoGetHeuristic); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
#endif

CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif

#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload
} // namespace phi
Loading
Loading