diff --git a/.github/workflows/_Metax-X86.yml b/.github/workflows/_Metax-X86.yml index d90a375d79f..3754be15a0d 100644 --- a/.github/workflows/_Metax-X86.yml +++ b/.github/workflows/_Metax-X86.yml @@ -17,11 +17,6 @@ on: default: 'true' -defaults: - run: - shell: bash - - jobs: check-bypass: @@ -65,10 +60,10 @@ jobs: # !!!!! SKIP IF NO METAX CHANGE !!!! echo "=========== Checking PR Changes If METAX FULL CI Needed ===========" change_numbers=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | wc -l) - # change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l) - change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true) - # change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l) - change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true) + change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l) + # change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true) + change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l) + # change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true) git --no-pager diff --name-only remotes/origin/${BRANCH} if [ $change_numbers -ne $change_backend ]; then diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 2e2f9aa0e1f..79941c2866a 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -788,6 +788,7 @@ target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcFlashAttn.so) target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcpti.so) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) +include_directories(BEFORE ${CMAKE_SOURCE_DIR}/headers) target_compile_definitions( ${TARGET_NAME} @@ -826,8 +827,12 @@ add_custom_command( POST_BUILD COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/python/ + COMMAND ${CMAKE_COMMAND} -E make_directory + ${CMAKE_CURRENT_BINARY_DIR}/python/include/ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/python/paddle_custom_device/ + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/headers + ${CMAKE_CURRENT_BINARY_DIR}/python/include/ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.so diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 7ac600a0da7..3fa9a647615 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -17,8 +17,8 @@ rm -r ../../Paddle/third_party/eigen3 cd patch -unzip mcEigen_3.4.0_paddle_final.zip -mv mcEigen_3.4.0_paddle_final eigen3 +unzip Eigen_3.4.0_paddle.zip +mv Eigen_3.4.0_paddle eigen3 cd .. cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 rm -r patch/eigen3 diff --git a/backends/metax_gpu/compile.sh b/backends/metax_gpu/compile.sh index bdc6269a60b..ed2b1b2374a 100644 --- a/backends/metax_gpu/compile.sh +++ b/backends/metax_gpu/compile.sh @@ -31,7 +31,7 @@ fi echo "make_maca" cd build cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON -make_maca -j18 +make_maca -j18 VERBOSE=1 echo "install whl" diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublas.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublas.h new file mode 100644 index 00000000000..959614c094e --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublas.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#if CUDA_VERSION >= 12030 && defined(__linux__) +#include +#endif + +#include // NOLINT +#include + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag cublas_dso_flag; +extern void* cublas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using cublas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(cublas_dso_flag, []() { \ + cublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ + }); \ + std::string replaced_name = #__name; \ + replaced_name = replaced_name.replace(0, 2, "mc"); \ + int index = replaced_name.find("_", 0); \ + if (index != -1) replaced_name = replaced_name.substr(0, index); \ + static void* p_##__name = \ + dlsym(cublas_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSaxpy_v2); \ + __macro(cublasDaxpy_v2); \ + __macro(cublasCaxpy_v2); \ + __macro(cublasZaxpy_v2); \ + __macro(cublasSscal_v2); \ + __macro(cublasDscal_v2); \ + __macro(cublasScopy_v2); \ + __macro(cublasDcopy_v2); \ + __macro(cublasSgemv_v2); \ + __macro(cublasDgemv_v2); \ + __macro(cublasCgemv_v2); \ + __macro(cublasZgemv_v2); \ + __macro(cublasSgemm_v2); \ + __macro(cublasDgemm_v2); \ + __macro(cublasCgemm_v2); \ + __macro(cublasZgemm_v2); \ + __macro(cublasHgemm); \ + __macro(cublasSgemmEx); \ + __macro(cublasSgeam); \ + __macro(cublasDgeam); \ + __macro(cublasStrsm_v2); \ + __macro(cublasDtrsm_v2); \ + __macro(cublasCtrsm_v2); \ + __macro(cublasZtrsm_v2); \ + __macro(cublasCreate_v2); \ + __macro(cublasDestroy_v2); \ + __macro(cublasSetStream_v2); \ + __macro(cublasSetPointerMode_v2); \ + __macro(cublasGetPointerMode_v2); \ + __macro(cublasSgemmBatched); \ + __macro(cublasDgemmBatched); \ + __macro(cublasCgemmBatched); \ + __macro(cublasZgemmBatched); \ + __macro(cublasStrsmBatched); \ + __macro(cublasDtrsmBatched); \ + __macro(cublasCtrsmBatched); \ + __macro(cublasZtrsmBatched); \ + __macro(cublasSgetrfBatched); \ + __macro(cublasSgetriBatched); \ + __macro(cublasDgetrfBatched); \ + __macro(cublasDgetriBatched); \ + __macro(cublasCgetrfBatched); \ + __macro(cublasCgetriBatched); \ + __macro(cublasZgetrfBatched); \ + __macro(cublasZgetriBatched); \ + __macro(cublasSmatinvBatched); \ + __macro(cublasDmatinvBatched); \ + __macro(cublasCmatinvBatched); \ + __macro(cublasZmatinvBatched); \ + __macro(cublasSgetrsBatched); \ + __macro(cublasDgetrsBatched); \ + __macro(cublasSdot_v2); \ + __macro(cublasDdot_v2); \ + __macro(cublasCdotc_v2); \ + __macro(cublasZdotc_v2); \ + __macro(cublasCdotu_v2); \ + __macro(cublasZdotu_v2); \ + __macro(cublasDotEx); \ + __macro(cublasGemmEx); \ + __macro(cublasSgemmStridedBatched); \ + __macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ + __macro(cublasHgemmStridedBatched); \ + __macro(cublasSetMathMode); \ + __macro(cublasGetMathMode); \ + __macro(cublasCgeam); \ + __macro(cublasZgeam); \ + __macro(cublasGemmBatchedEx); \ + __macro(cublasGemmStridedBatchedEx); + +CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + +#if CUDA_VERSION >= 12030 && defined(__linux__) +#define CUBLAS_BLAS_ROUTINE_EACH_R5(__macro) \ + __macro(cublasGemmStridedBatchedEx_64); \ + __macro(cublasGemmEx_64); \ + __macro(cublasSgemmEx_64); + +CUBLAS_BLAS_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#endif + +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP +} // namespace dynload +} // namespace phi diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublasLt.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublasLt.h new file mode 100644 index 00000000000..5bb2ae07f0b --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublasLt.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include // NOLINT +#include + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag cublasLt_dso_flag; +extern void* cublasLt_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublasLt routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using cublasLt_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(cublasLt_dso_flag, []() { \ + cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \ + }); \ + std::string replaced_name = #__name; \ + replaced_name = replaced_name.replace(0, 2, "mc"); \ + static void* p_##__name = \ + dlsym(cublasLt_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +// APIs available after CUDA 11.1 +#if CUDA_VERSION >= 11010 || defined(PADDLE_WITH_CUSTOM_DEVICE) +#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ + __macro(cublasLtDestroy); \ + __macro(cublasLtMatmul); \ + __macro(cublasLtMatmulDescCreate); \ + __macro(cublasLtMatmulDescDestroy); \ + __macro(cublasLtMatmulDescSetAttribute); \ + __macro(cublasLtMatmulDescGetAttribute); \ + __macro(cublasLtMatrixLayoutCreate); \ + __macro(cublasLtMatrixLayoutDestroy); \ + __macro(cublasLtMatrixLayoutSetAttribute); \ + __macro(cublasLtMatrixLayoutGetAttribute); \ + __macro(cublasLtMatmulPreferenceCreate); \ + __macro(cublasLtMatmulPreferenceDestroy); \ + __macro(cublasLtMatmulPreferenceSetAttribute); \ + __macro(cublasLtMatmulAlgoGetHeuristic); \ + __macro(cublasLtMatrixTransform); \ + __macro(cublasLtMatrixTransformDescCreate); \ + __macro(cublasLtMatrixTransformDescDestroy); \ + __macro(cublasLtMatrixTransformDescSetAttribute); \ + __macro(cublasLtMatmulAlgoInit); \ + __macro(cublasLtMatmulAlgoConfigSetAttribute); \ + __macro(cublasLtMatmulAlgoConfigGetAttribute); \ + __macro(cublasLtMatmulAlgoGetIds); \ + __macro(cublasLtMatmulAlgoCapGetAttribute); \ + __macro(cublasLtMatmulAlgoCheck); +// __macro(cublasLtGetCudartVersion); +#else +#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ + __macro(cublasLtDestroy); \ + __macro(cublasLtMatmul); \ + __macro(cublasLtMatmulDescCreate); \ + __macro(cublasLtMatmulDescDestroy); \ + __macro(cublasLtMatmulDescSetAttribute); \ + __macro(cublasLtMatmulDescGetAttribute); \ + __macro(cublasLtMatrixLayoutCreate); \ + __macro(cublasLtMatrixLayoutDestroy); \ + __macro(cublasLtMatrixLayoutSetAttribute); \ + __macro(cublasLtMatrixLayoutGetAttribute); \ + __macro(cublasLtMatmulPreferenceCreate); \ + __macro(cublasLtMatmulPreferenceDestroy); \ + __macro(cublasLtMatmulPreferenceSetAttribute); \ + __macro(cublasLtMatmulAlgoGetHeuristic); \ + __macro(cublasLtMatrixTransform); \ + __macro(cublasLtMatrixTransformDescCreate); \ + __macro(cublasLtMatrixTransformDescDestroy); \ + __macro(cublasLtMatrixTransformDescSetAttribute); +#endif + +CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) +// #endif + +#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP +} // namespace dynload +} // namespace phi diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cudnn.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cudnn.h new file mode 100644 index 00000000000..b3d3d582a06 --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cudnn.h @@ -0,0 +1,219 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUDA +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag cudnn_dso_flag; +extern void* cudnn_dso_handle; +extern bool HasCUDNN(); + +extern void EnforceCUDNNLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using cudnn_func = decltype(&::__name); \ + std::call_once(cudnn_dso_flag, []() { \ + cudnn_dso_handle = phi::dynload::GetCUDNNDsoHandle(); \ + }); \ + EnforceCUDNNLoaded(#__name); \ + std::string replaced_name = #__name; \ + replaced_name = replaced_name.replace(0, 2, "mc"); \ + static void* p_##__name = \ + dlsym(cudnn_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed cudnn functions in HPPL + * different cudnn version has different interfaces + **/ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetCallback); \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreateSpatialTransformerDescriptor); \ + __macro(cudnnSetSpatialTransformerNdDescriptor); \ + __macro(cudnnDestroySpatialTransformerDescriptor); \ + __macro(cudnnSpatialTfGridGeneratorForward); \ + __macro(cudnnSpatialTfGridGeneratorBackward); \ + __macro(cudnnSpatialTfSamplerForward); \ + __macro(cudnnSpatialTfSamplerBackward); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnActivationBackward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnFindConvolutionForwardAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ + __macro(cudnnGetErrorString); \ + __macro(cudnnCreateDropoutDescriptor); \ + __macro(cudnnDropoutGetStatesSize); \ + __macro(cudnnSetDropoutDescriptor); \ + __macro(cudnnRestoreDropoutDescriptor); \ + __macro(cudnnCreateRNNDescriptor); \ + __macro(cudnnDestroyDropoutDescriptor); \ + __macro(cudnnDestroyRNNDescriptor); \ + __macro(cudnnSetTensorNdDescriptorEx); \ + __macro(cudnnAddTensor); \ + __macro(cudnnConvolutionBackwardData); \ + __macro(cudnnConvolutionBackwardFilter); \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); \ + __macro(cudnnBatchNormalizationForwardTraining); \ + __macro(cudnnBatchNormalizationForwardInference); \ + __macro(cudnnBatchNormalizationBackward); \ + __macro(cudnnCreateActivationDescriptor); \ + __macro(cudnnSetActivationDescriptor); \ + __macro(cudnnGetActivationDescriptor); \ + __macro(cudnnDestroyActivationDescriptor); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); \ + __macro(cudnnSetConvolutionMathType); \ + __macro(cudnnConvolutionBiasActivationForward); \ + __macro(cudnnCreateCTCLossDescriptor); \ + __macro(cudnnDestroyCTCLossDescriptor); \ + __macro(cudnnGetCTCLossDescriptor); \ + __macro(cudnnSetCTCLossDescriptor); \ + __macro(cudnnGetCTCLossWorkspaceSize); \ + __macro(cudnnCTCLoss); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ + __macro(cudnnGetConvolutionForwardAlgorithm_v7); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); +CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ + __macro(cudnnCreateRNNDataDescriptor); \ + __macro(cudnnDestroyRNNDataDescriptor); \ + __macro(cudnnSetRNNDataDescriptor); +CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ + __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ + __macro(cudnnBatchNormalizationForwardTrainingEx); \ + __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ + __macro(cudnnBatchNormalizationBackwardEx); \ + __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); +CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) \ + __macro(cudnnSetRNNDescriptor_v8); \ + __macro(cudnnCreateFusedOpsPlan); \ + __macro(cudnnCreateFusedOpsConstParamPack); \ + __macro(cudnnCreateFusedOpsVariantParamPack); \ + __macro(cudnnDestroyFusedOpsPlan); \ + __macro(cudnnDestroyFusedOpsConstParamPack); \ + __macro(cudnnDestroyFusedOpsVariantParamPack); \ + __macro(cudnnFusedOpsExecute); \ + __macro(cudnnSetFusedOpsConstParamPackAttribute); \ + __macro(cudnnSetFusedOpsVariantParamPackAttribute); \ + __macro(cudnnMakeFusedOpsPlan); +CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#ifdef PADDLE_WITH_CUDNN_FRONTEND +#define CUDNN_DNN_ROUTINE_EACH_FRONTEND(__macro) \ + __macro(cudnnBackendCreateDescriptor); \ + __macro(cudnnBackendDestroyDescriptor); \ + __macro(cudnnBackendExecute); \ + __macro(cudnnBackendFinalize); \ + __macro(cudnnBackendGetAttribute); \ + __macro(cudnnBackendSetAttribute); \ + __macro(cudnnGetStream); \ + __macro(cudnnReorderFilterAndBias); +CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +#if CUDNN_VERSION < 90000 +#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ + __macro(cudnnGetRNNParamsSize); \ + __macro(cudnnGetRNNWorkspaceSize); \ + __macro(cudnnGetRNNTrainingReserveSize); \ + __macro(cudnnSetRNNDescriptor_v6); \ + __macro(cudnnRNNForwardInference); \ + __macro(cudnnRNNForwardTraining); \ + __macro(cudnnRNNBackwardData); \ + __macro(cudnnRNNBackwardWeights); +CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9(__macro) \ + __macro(cudnnSetRNNPaddingMode); \ + __macro(cudnnRNNForwardInferenceEx); \ + __macro(cudnnRNNForwardTrainingEx); \ + __macro(cudnnRNNBackwardDataEx); \ + __macro(cudnnRNNBackwardWeightsEx); +CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9( + DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +#if CUDNN_VERSION >= 90000 +#define CUDNN_DNN_ROUTINE_EACH_R9(__macro) \ + __macro(cudnnGetLastErrorString); \ + __macro(cudnnGetRNNWeightSpaceSize); \ + __macro(cudnnGetRNNTempSpaceSizes); \ + __macro(cudnnRNNForward); \ + __macro(cudnnRNNBackwardData_v8); \ + __macro(cudnnRNNBackwardWeights_v8); +CUDNN_DNN_ROUTINE_EACH_R9(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif +} // namespace dynload +} // namespace phi + +#endif diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cufft.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cufft.h new file mode 100644 index 00000000000..996080ae4a4 --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cufft.h @@ -0,0 +1,117 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights +// Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUDA +#include +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag cufft_dso_flag; +extern void* cufft_dso_handle; +extern bool HasCUFFT(); + +extern void EnforceCUFFTLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using cufft_func = decltype(&::__name); \ + std::call_once(cufft_dso_flag, []() { \ + cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ + }); \ + EnforceCUFFTLoaded(#__name); \ + std::string replaced_name = #__name; \ + replaced_name = replaced_name.replace(0, 2, "mc"); \ + static void* p_##__name = \ + dlsym(cufft_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed cufft functions in HPPL + * different cufft version has different interfaces + **/ +#define CUFFT_FFT_ROUTINE_EACH(__macro) \ + __macro(cufftPlan1d); \ + __macro(cufftPlan2d); \ + __macro(cufftPlan3d); \ + __macro(cufftPlanMany); \ + __macro(cufftMakePlan1d); \ + __macro(cufftMakePlan2d); \ + __macro(cufftMakePlan3d); \ + __macro(cufftMakePlanMany); \ + __macro(cufftMakePlanMany64); \ + __macro(cufftGetSizeMany64); \ + __macro(cufftEstimate1d); \ + __macro(cufftEstimate2d); \ + __macro(cufftEstimate3d); \ + __macro(cufftEstimateMany); \ + __macro(cufftCreate); \ + __macro(cufftGetSize1d); \ + __macro(cufftGetSize2d); \ + __macro(cufftGetSize3d); \ + __macro(cufftGetSizeMany); \ + __macro(cufftGetSize); \ + __macro(cufftSetWorkArea); \ + __macro(cufftSetAutoAllocation); \ + __macro(cufftExecC2C); \ + __macro(cufftExecR2C); \ + __macro(cufftExecC2R); \ + __macro(cufftExecZ2Z); \ + __macro(cufftExecD2Z); \ + __macro(cufftExecZ2D); \ + __macro(cufftSetStream); \ + __macro(cufftDestroy); \ + __macro(cufftGetVersion); \ + __macro(cufftGetProperty); \ + __macro(cufftXtSetGPUs); \ + __macro(cufftXtMalloc); \ + __macro(cufftXtMemcpy); \ + __macro(cufftXtFree); \ + __macro(cufftXtSetWorkArea); \ + __macro(cufftXtExecDescriptorC2C); \ + __macro(cufftXtExecDescriptorR2C); \ + __macro(cufftXtExecDescriptorC2R); \ + __macro(cufftXtExecDescriptorZ2Z); \ + __macro(cufftXtExecDescriptorD2Z); \ + __macro(cufftXtExecDescriptorZ2D); \ + __macro(cufftXtQueryPlan); \ + __macro(cufftXtSetCallback); \ + __macro(cufftXtClearCallback); \ + __macro(cufftXtSetCallbackSharedSize); \ + __macro(cufftXtMakePlanMany); \ + __macro(cufftXtGetSizeMany); \ + __macro(cufftXtExec); \ + __macro(cufftXtExecDescriptor); \ + __macro(cufftXtSetWorkAreaPolicy); + +CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP) + +} // namespace dynload +} // namespace phi + +#endif diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cupti.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cupti.h new file mode 100644 index 00000000000..754e7b4cb89 --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cupti.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_CUPTI + +#include +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" + +// namespace phi { +// namespace dynload { + +extern std::once_flag cupti_dso_flag; +extern void *cupti_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cupti routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_WITH_XPU +inline bool IsXPUTracingEnabled() { + static bool initialized = false; + static bool enabled = false; + + if (!initialized) { + initialized = true; + + const char *env = std::getenv("XPU_ENABLE_PROFILER_TRACING"); + enabled = (env && (std::string(env) == "1")); + } + return enabled; +} + +#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline CUptiResult CUPTIAPI operator()(Args... args) { \ + using cuptiFunc = decltype(&::__name); \ + if (!IsXPUTracingEnabled()) { \ + return CUPTI_SUCCESS; \ + } \ + std::call_once(cupti_dso_flag, []() { \ + cupti_dso_handle = phi::dynload::GetCUPTIDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(cupti_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#else // !PADDLE_WITH_XPU + +#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline CUptiResult CUPTIAPI operator()(Args... args) { \ + using cuptiFunc = decltype(&::__name); \ + std::call_once(cupti_dso_flag, []() { \ + cupti_dso_handle = phi::dynload::GetCUPTIDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(cupti_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif // PADDLE_WITH_XPU + +#define CUPTI_ROUTINE_EACH(__macro) \ + __macro(cuptiActivityEnable); \ + __macro(cuptiActivityDisable); \ + __macro(cuptiActivityRegisterCallbacks); \ + __macro(cuptiActivityGetAttribute); \ + __macro(cuptiActivitySetAttribute); \ + __macro(cuptiGetTimestamp); \ + __macro(cuptiActivityGetNextRecord); \ + __macro(cuptiGetResultString); \ + __macro(cuptiActivityGetNumDroppedRecords); \ + __macro(cuptiActivityFlushAll); \ + __macro(cuptiSubscribe); \ + __macro(cuptiUnsubscribe); \ + __macro(cuptiEnableCallback); \ + __macro(cuptiEnableDomain); \ + __macro(cudaOccMaxActiveBlocksPerMultiprocessor); + +CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP +// } // namespace dynload +// } // namespace phi + +#endif // PADDLE_WITH_CUPTI diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusolver.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusolver.h new file mode 100644 index 00000000000..c52af44a6ff --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusolver.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" + +namespace phi { +namespace dynload { +extern std::once_flag cusolver_dso_flag; +extern void* cusolver_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cusolverStatus_t operator()(Args... args) { \ + using cusolverFunc = decltype(&::__name); \ + std::call_once(cusolver_dso_flag, []() { \ + cusolver_dso_handle = phi::dynload::GetCusolverDsoHandle(); \ + }); \ + std::string replaced_name = #__name; \ + replaced_name = replaced_name.replace(0, 2, "mc"); \ + static void* p_##__name = \ + dlsym(cusolver_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define CUSOLVER_ROUTINE_EACH(__macro) \ + __macro(cusolverDnCreate); \ + __macro(cusolverDnDestroy); \ + __macro(cusolverDnSetStream); \ + __macro(cusolverDnSpotrf_bufferSize); \ + __macro(cusolverDnDpotrf_bufferSize); \ + __macro(cusolverDnXpotrf_bufferSize); \ + __macro(cusolverDnSpotrf); \ + __macro(cusolverDnDpotrf); \ + __macro(cusolverDnXpotrf); \ + __macro(cusolverDnSpotrs); \ + __macro(cusolverDnDpotrs); \ + __macro(cusolverDnCpotrs); \ + __macro(cusolverDnZpotrs); \ + __macro(cusolverDnSsyevd_bufferSize); \ + __macro(cusolverDnDsyevd_bufferSize); \ + __macro(cusolverDnCheevd_bufferSize); \ + __macro(cusolverDnZheevd_bufferSize); \ + __macro(cusolverDnSsyevd); \ + __macro(cusolverDnDsyevd); \ + __macro(cusolverDnCheevd); \ + __macro(cusolverDnZheevd); \ + __macro(cusolverDnSpotrfBatched); \ + __macro(cusolverDnDpotrfBatched); \ + __macro(cusolverDnSpotrsBatched); \ + __macro(cusolverDnDpotrsBatched); \ + __macro(cusolverDnSgetrf_bufferSize); \ + __macro(cusolverDnDgetrf_bufferSize); \ + __macro(cusolverDnCgetrf_bufferSize); \ + __macro(cusolverDnZgetrf_bufferSize); \ + __macro(cusolverDnSgeqrf_bufferSize); \ + __macro(cusolverDnDgeqrf_bufferSize); \ + __macro(cusolverDnCgeqrf_bufferSize); \ + __macro(cusolverDnZgeqrf_bufferSize); \ + __macro(cusolverDnXgeqrf_bufferSize); \ + __macro(cusolverDnSorgqr_bufferSize); \ + __macro(cusolverDnDorgqr_bufferSize); \ + __macro(cusolverDnSormqr_bufferSize); \ + __macro(cusolverDnDormqr_bufferSize); \ + __macro(cusolverDnCungqr_bufferSize); \ + __macro(cusolverDnZungqr_bufferSize); \ + __macro(cusolverDnDestroyGesvdjInfo); \ + __macro(cusolverDnCreateGesvdjInfo); \ + __macro(cusolverDnSgesvdj_bufferSize); \ + __macro(cusolverDnDgesvdj_bufferSize); \ + __macro(cusolverDnCgesvdj_bufferSize); \ + __macro(cusolverDnZgesvdj_bufferSize); \ + __macro(cusolverDnSgesvdj); \ + __macro(cusolverDnDgesvdj); \ + __macro(cusolverDnCgesvdj); \ + __macro(cusolverDnZgesvdj); \ + __macro(cusolverDnSgetrf); \ + __macro(cusolverDnSgetrs); \ + __macro(cusolverDnDgetrs); \ + __macro(cusolverDnCgetrs); \ + __macro(cusolverDnZgetrs); \ + __macro(cusolverDnDgetrf); \ + __macro(cusolverDnCgetrf); \ + __macro(cusolverDnZgetrf); \ + __macro(cusolverDnSgeqrf); \ + __macro(cusolverDnDgeqrf); \ + __macro(cusolverDnCgeqrf); \ + __macro(cusolverDnZgeqrf); \ + __macro(cusolverDnXgeqrf); \ + __macro(cusolverDnSorgqr); \ + __macro(cusolverDnDorgqr); \ + __macro(cusolverDnSormqr); \ + __macro(cusolverDnDormqr); \ + __macro(cusolverDnCungqr); \ + __macro(cusolverDnZungqr); \ + __macro(cusolverDnCreateSyevjInfo); \ + __macro(cusolverDnCreateParams); \ + __macro(cusolverDnDestroyParams); \ + __macro(cusolverDnSsyevj_bufferSize); \ + __macro(cusolverDnDsyevj_bufferSize); \ + __macro(cusolverDnCheevj_bufferSize); \ + __macro(cusolverDnZheevj_bufferSize); \ + __macro(cusolverDnSsyevj); \ + __macro(cusolverDnDsyevj); \ + __macro(cusolverDnCheevj); \ + __macro(cusolverDnZheevj); \ + __macro(cusolverDnDestroySyevjInfo); \ + __macro(cusolverDnXsyevjSetSortEig); \ + __macro(cusolverDnSsyevjBatched_bufferSize); \ + __macro(cusolverDnDsyevjBatched_bufferSize); \ + __macro(cusolverDnCheevjBatched_bufferSize); \ + __macro(cusolverDnZheevjBatched_bufferSize); \ + __macro(cusolverDnSsyevjBatched); \ + __macro(cusolverDnDsyevjBatched); \ + __macro(cusolverDnCheevjBatched); \ + __macro(cusolverDnZheevjBatched); + +CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP +} // namespace dynload +} // namespace phi diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusparse.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusparse.h new file mode 100644 index 00000000000..82c3fc91e79 --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusparse.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" + +namespace phi { +namespace dynload { +extern std::once_flag cusparse_dso_flag; +extern void* cusparse_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cusparseStatus_t operator()(Args... args) { \ + using Func = decltype(&::__name); \ + std::call_once(cusparse_dso_flag, []() { \ + cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \ + }); \ + std::string replaced_name = #__name; \ + replaced_name = replaced_name.replace(0, 2, "mc"); \ + static void* p_##__name = \ + dlsym(cusparse_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#if defined(PADDLE_WITH_CUDA) +#define CUSPARSE_ROUTINE_EACH(__macro) \ + __macro(cusparseCreate); \ + __macro(cusparseSetStream); \ + __macro(cusparseCreateMatDescr); \ + __macro(cusparseDestroy); \ + __macro(cusparseSnnz); \ + __macro(cusparseDnnz); \ + __macro(cusparseSetMatType); \ + __macro(cusparseSetMatIndexBase); \ + __macro(cusparseCreateCsr); \ + __macro(cusparseCreateCoo); \ + __macro(cusparseCreateDnMat); \ + __macro(cusparseCreateDnVec); \ + __macro(cusparseSpMM_bufferSize); \ + __macro(cusparseSpMM); \ + __macro(cusparseDestroySpMat); \ + __macro(cusparseDestroyDnMat); \ + __macro(cusparseDestroyDnVec); \ + __macro(cusparseSpMV_bufferSize); \ + __macro(cusparseSpMV); \ + __macro(cusparseSpMatGetSize); \ + __macro(cusparseCsrSetPointers); \ + __macro(cusparseSpGEMM_createDescr); \ + __macro(cusparseSpGEMM_compute); \ + __macro(cusparseSpGEMM_workEstimation); \ + __macro(cusparseSpGEMM_copy); \ + __macro(cusparseSpGEMM_destroyDescr); + +CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) + +#if CUDA_VERSION >= 11030 +#define CUSPARSE_ROUTINE_EACH_R2(__macro) \ + __macro(cusparseSpMM_preprocess); \ + __macro(cusparseSDDMM_bufferSize); \ + __macro(cusparseSDDMM_preprocess); \ + __macro(cusparseSDDMM); + +CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) +#endif + +#if CUDA_VERSION >= 11080 +#define CUSPARSE_ROUTINE_EACH_R3(__macro) \ + __macro(cusparseDnMatSetStridedBatch); \ + __macro(cusparseCooSetStridedBatch); \ + __macro(cusparseCsrSetStridedBatch); + +CUSPARSE_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) +#endif + +#endif // PADDLE_WITH_CUDA + +#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP +} // namespace dynload +} // namespace phi diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.cc b/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.cc new file mode 100644 index 00000000000..9f2ac0890f6 --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.cc @@ -0,0 +1,1131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/phi/backends/dynload/dynamic_loader.h" + +#include + +#include +#include +#include +#include + +#include "paddle/phi/common/port.h" +#include "paddle/phi/core/enforce.h" + +#if defined(_WIN32) +#include +#endif + +// TODO(wilber): The phi computing library requires a component to manage flags +// (maybe not use gflags). +#include "glog/logging.h" +#include "paddle/common/flags.h" + +COMMON_DECLARE_string(cudnn_dir); +COMMON_DECLARE_string(cuda_dir); +COMMON_DECLARE_string(cublas_dir); +COMMON_DECLARE_string(nccl_dir); +COMMON_DECLARE_string(cupti_dir); +COMMON_DECLARE_string(tensorrt_dir); +COMMON_DECLARE_string(mklml_dir); +COMMON_DECLARE_string(hml_dir); +COMMON_DECLARE_string(lapack_dir); +COMMON_DECLARE_string(mkl_dir); +COMMON_DECLARE_string(op_dir); +COMMON_DECLARE_string(cusparselt_dir); +COMMON_DECLARE_string(curand_dir); +COMMON_DECLARE_string(cusolver_dir); +COMMON_DECLARE_string(cusparse_dir); +COMMON_DECLARE_string(win_cuda_bin_dir); +#ifdef PADDLE_WITH_MAGMA +COMMON_DECLARE_string(magma_dir); +#endif + +#ifndef CUDA_LIB_NAME +#define CUDA_LIB_NAME "libcuda.so" +#endif + +#ifndef BLAS_LIB_NAME +#define BLAS_LIB_NAME "libcublas.so" +#endif + +#ifndef BLASLT_LIB_NAME +#define BLASLT_LIB_NAME "libcublasLt.so" +#endif + +#ifndef DNN_LIB_NAME +#define DNN_LIB_NAME "libcudnn.so" +#endif + +#ifndef PTI_LIB_NAME +#define PTI_LIB_NAME "libcupti.so" +#endif + +#ifndef RAND_LIB_NAME +#define RAND_LIB_NAME "libcurand.so" +#endif + +#ifndef JPEG_LIB_NAME +#define JPEG_LIB_NAME "libnvjpeg.so" +#endif + +#ifndef SOLVER_LIB_NAME +#define SOLVER_LIB_NAME "libcusolver.so" +#endif + +#ifndef SPARSE_LIB_NAME +#define SPARSE_LIB_NAME "libcusparse.so" +#endif + +#ifndef RTC_LIB_NAME +#define RTC_LIB_NAME "libnvrtc.so" +#endif + +#ifndef FLASHATTN_LIB_NAME +#define FLASHATTN_LIB_NAME "libflashattn.so" +#endif + +#ifndef FLASHATTNV3_LIB_NAME +#define FLASHATTNV3_LIB_NAME "libflashattnv3.so" +#endif + +#ifndef CCL_LIB_NAME +#define CCL_LIB_NAME "libnccl.so" +#endif + +#ifndef FFT_LIB_NAME +#define FFT_LIB_NAME "libcufft.so" +#endif + +#ifndef SPARSELT_LIB_NAME +#define SPARSELT_LIB_NAME "libcusparseLt.so" +#endif + +#ifndef CUPTI_LIB_PATH +#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@" +#endif + +#ifdef PADDLE_WITH_HIP + +PHI_DEFINE_string(miopen_dir, + "", + "Specify path for loading libMIOpen.so. For instance, " + "/opt/rocm/miopen/lib. If empty [default], dlopen " + "will search miopen from LD_LIBRARY_PATH"); + +PHI_DEFINE_string(rocm_dir, + "", + "Specify path for loading rocm library, such as librocblas, " + "libmiopen, libhipsparse. For instance, /opt/rocm/lib. " + "If default, dlopen will search rocm from LD_LIBRARY_PATH"); + +PHI_DEFINE_string(rccl_dir, + "", + "Specify path for loading rccl library, such as librccl.so. " + "For instance, /opt/rocm/rccl/lib. If default, " + "dlopen will search rccl from LD_LIBRARY_PATH"); +#endif + +#ifdef PADDLE_WITH_FLAGCX +COMMON_DECLARE_string(flagcx_dir); + +PHI_DEFINE_EXPORTED_string( + flagcx_dir, // NOLINT + "", + "Specify path for loading libflagcx.so. For instance, " + "For instance, /usr/local/flagcx/lib. If default, " + "dlopen will search flagcx from LD_LIBRARY_PATH"); +#endif + +#ifdef PADDLE_WITH_XPU +PD_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); +PD_DEFINE_string(xputx_dir, "", "Specify path for loading llibxpuToolsExt.so."); +#endif + +namespace phi::dynload { + +struct PathNode { + PathNode() = default; + std::string path = ""; +}; + +static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; // NOLINT + +// NOTE: In order to adapt to the default installation path of cuda +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin"; +#else +static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64"; // NOLINT +#endif + +static PathNode s_py_site_pkg_path; + +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll"; +static constexpr char* win_cublas_lib = + "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_curand_lib = + "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll"; +static constexpr char* win_nvjpeg_lib = + "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll"; +static constexpr char* win_cusolver_lib = + "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cusolver64_" CUDA_VERSION_MAJOR + ".dll;cusolver64_11.dll;cusolver64_10.dll"; +static constexpr char* win_cusparse_lib = + "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll"; +static constexpr char* win_cufft_lib = + "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_11.dll;cufft64_10.dll"; +#endif + +static inline std::string join(const std::string& part1, + const std::string& part2) { +// directory separator +#if defined(_WIN32) + const char sep = '\\'; +#else + const char sep = '/'; +#endif + if (!part2.empty() && part2.front() == sep) { + return part2; + } + std::string ret; + ret.reserve(part1.size() + part2.size() + 1); + ret = part1; + if (!ret.empty() && ret.back() != sep) { + ret += sep; + } + ret += part2; + return ret; +} + +static inline std::vector split( + const std::string& str, const std::string separator = " ") { + std::vector str_list; + std::string::size_type firstPos = 0; + firstPos = str.find_first_not_of(separator, 0); + std::string::size_type lastPos = 0; + lastPos = str.find_first_of(separator, firstPos); + while (std::string::npos != firstPos && std::string::npos != lastPos) { + str_list.push_back(str.substr(firstPos, lastPos - firstPos)); + firstPos = str.find_first_not_of(separator, lastPos); + lastPos = str.find_first_of(separator, firstPos); + } + if (std::string::npos == lastPos) { + str_list.push_back(str.substr(firstPos, lastPos - firstPos)); + } + return str_list; +} + +void SetPaddleLibPath(const std::string& py_site_pkg_path) { + s_py_site_pkg_path.path = py_site_pkg_path; + VLOG(6) << "Set paddle lib path : " << py_site_pkg_path; +} + +static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path, + const std::string& dso_name, + int dynload_flags) { + void* dso_handle = nullptr; + if (!spec_path.empty() || !dso_name.empty()) { + // search xxx.so from custom path + VLOG(6) << "Try to find library: " << dso_name + << " from specific path: " << spec_path; + std::string dso_path = join(spec_path, dso_name); +#if defined(_WIN32) || defined(_WIN64) + HMODULE handle = LoadLibraryA(dso_path.c_str()); + dso_handle = reinterpret_cast(handle); +#else + dso_handle = dlopen(dso_path.c_str(), dynload_flags); +#endif + } + return dso_handle; +} + +static inline std::string FindLibAbsolutePath(const std::string& directory, + const std::string& filename) { + DIR* dir = opendir(directory.c_str()); + struct dirent* ent; + + if (dir != nullptr) { + while ((ent = readdir(dir)) != nullptr) { + if (ent->d_type == DT_REG || ent->d_type == DT_LNK) { + if (filename == std::string(ent->d_name)) { + closedir(dir); + return join(directory, ent->d_name); + } + } else if (ent->d_type == DT_DIR) { + if (strcmp(ent->d_name, ".") != 0 && strcmp(ent->d_name, "..") != 0) { + std::string res = + FindLibAbsolutePath(join(directory, ent->d_name) + "/", filename); + if (!res.empty()) { + closedir(dir); + return res; + } + } + } + } + closedir(dir); + } + return ""; +} + +static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, + int dynload_flags) { +#if defined(_WIN32) || defined(_WIN64) + HMODULE hModule = LoadLibraryA(dso_path.c_str()); + return reinterpret_cast(hModule); +#else + // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + // and /usr/local/lib path + void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); + VLOG(3) << "Try to find library: " << dso_path + << " from default system path."; + +// TODO(chenweihang): This path is used to search which libs? +// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to +// bring System Integrity Projection (SIP), if dso_handle +// is null, search from default package path in Mac OS. +#if defined(__APPLE__) || defined(__OSX__) +#if defined(__arm__) || defined(__aarch64__) + if (nullptr == dso_handle) { + dso_handle = + dlopen(FindLibAbsolutePath("/opt/homebrew/Cellar/", dso_path).c_str(), + dynload_flags); + } +#else + if (nullptr == dso_handle) { + dso_handle = + dlopen(FindLibAbsolutePath("/usr/local/cuda/lib/", dso_path).c_str(), + dynload_flags); + } +#endif +#endif + + return dso_handle; +#endif +} + +/* + * We define three priorities for dynamic library search: + * + * First: Search for path specified by the user + * Second: Search the stheystem default path + * Third: Search for a special path corresponding to + * a specific library to adapt to changes and easy to expand. + */ + +static inline void* GetDsoHandleFromSearchPath( + const std::string& config_path, + const std::string& dso_name, + bool throw_on_error = true, + const std::vector& extra_paths = std::vector(), + const std::string& warning_msg = std::string()) { +#if !defined(_WIN32) + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; +#else + int dynload_flags = 0; +#endif // !_WIN32 +#if defined(_WIN32) + std::vector cuda_bin_search_path = { + L"cublas", + L"cuda_nvrtc", + L"cuda_runtime", + L"cudnn", + L"cufft", + L"curand", + L"cusolver", + L"cusparse", + L"nvjitlink", + }; + for (auto search_path : cuda_bin_search_path) { + std::wstring_convert> converter; + std::wstring win_path_wstring = + converter.from_bytes(FLAGS_win_cuda_bin_dir); + search_path = win_path_wstring + L"\\" + search_path + L"\\bin"; +#ifdef PADDLE_WITH_CUDA + AddDllDirectory(search_path.c_str()); +#endif + } +#endif + std::vector dso_names = split(dso_name, ";"); + void* dso_handle = nullptr; + for (auto const& dso : dso_names) { + // 1. search in user config path by FLAGS + dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags); + // 2. search in system default path + if (nullptr == dso_handle) { + dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags); + } + // 3. search in extra paths + if (nullptr == dso_handle) { + for (auto const& path : extra_paths) { + VLOG(3) << "extra_paths: " << path; + dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags); + } + } + if (nullptr != dso_handle) break; + } + + // 4. [If Failed for All dso_names] logging warning if exists + if (nullptr == dso_handle && !warning_msg.empty()) { + LOG(WARNING) << warning_msg; + } + + // 5. [If Failed for All dso_names] logging or throw error info + if (nullptr == dso_handle) { + auto error_msg = + "The third-party dynamic library (%s) that Paddle depends on is not " + "configured correctly. (error code is %s)\n" + " Suggestions:\n" + " 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) " + "is installed correctly and its version is matched with paddlepaddle " + "you installed.\n" + " 2. Configure third-party dynamic library environment variables as " + "follows:\n" + " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" + " - Windows: set PATH by `set PATH=XXX;%%PATH%%`\n" + " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` " + "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " + "impossible unless System Integrity Protection (SIP) is disabled.]"; +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 + if (throw_on_error) { + // NOTE: Special error report case, no need to change its format + PADDLE_THROW( + common::errors::PreconditionNotMet(error_msg, dso_name, errorno)); + } else { + LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno); + } + } + + return dso_handle; +} + +void* GetCublasDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, BLAS_LIB_NAME); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_11.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_13.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 13, paddle " + "temporarily no longer supports"); + return nullptr; + } +#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.11"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so"); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so"); +#endif + } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.13"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so"); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 14, paddle " + "temporarily no longer supports"); + return nullptr; + } +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); +#endif +} + +void* GetCublasLtDsoHandle() { +// APIs available after CUDA 10.1 +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, BLASLT_LIB_NAME); +#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.11"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); +#endif + } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.13"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 14, paddle " + "temporarily no longer supports"); + return nullptr; + } +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_11.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_12.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_13.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 14, paddle " + "temporarily no longer supports"); + return nullptr; + } +#elif !defined(__linux__) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so"); +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhipblaslt.so"); +#else + std::string warning_msg( + "Your CUDA_VERSION less 11, not support CublasLt. " + "If you want to use CublasLt, please upgrade CUDA and rebuild " + "PaddlePaddle."); + return nullptr; +#endif +} + +void* GetCUDNNDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + std::string mac_warn_meg( + "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " + "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " + "chmod a+r /usr/local/cuda/include/cudnn.h " + "/usr/local/cuda/lib/libcudnn*"); + return GetDsoHandleFromSearchPath( + FLAGS_cudnn_dir, "libcudnn.dylib", false, {}, mac_warn_meg); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath( + FLAGS_cudnn_dir, DNN_LIB_NAME, false, {cuda_lib_path}); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + std::string win_warn_meg( + "Note: [Recommend] copy cudnn into CUDA installation directory. \n " + "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from " + "NVIDIA's official website, \n" + "then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing " + "Toolkit\\CUDA\\v10.0\n" + "You should do this according to your CUDA installation directory and " + "CUDNN version."); + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12030) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); +#endif + } else if (CUDA_VERSION >= 12030) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); +#endif + } +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); +#else +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + if (CUDA_VERSION >= 12030) { + return GetDsoHandleFromSearchPath( + FLAGS_cudnn_dir, "libcudnn.so.9", false, {cuda_lib_path}); + } else { + return GetDsoHandleFromSearchPath( + FLAGS_cudnn_dir, "libcudnn.so.8", false, {cuda_lib_path}); + } +#else + return GetDsoHandleFromSearchPath( + FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path}); +#endif +#endif +} + +void* GetCUPTIDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path}); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, PTI_LIB_NAME, false, {cupti_lib_path}); +#elif defined(PADDLE_WITH_XPU) + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path}); +#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so.11.8", false, {cupti_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path}); +#endif + + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path}); +#endif + } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so.13", false, {cupti_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path}); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 14, paddle " + "temporarily no longer supports"); + return nullptr; + } +#else + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path}); +#endif +} + +void* GetCurandDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, RAND_LIB_NAME); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, "curand64_10.dll", true, {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); +#endif +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); +#else +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so.10"); +#else + return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so"); +#endif + +#endif +} + +#ifdef PADDLE_WITH_HIP +void* GetROCFFTDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhipfft.so"); +#endif +} +#endif + +void* GetNvjpegDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, JPEG_LIB_NAME); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_nvjpeg_lib, true, {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so"); +#endif +} + +void* GetCusolverDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, SOLVER_LIB_NAME); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION < 13000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, "cusolver64_11.dll", true, {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path}); +#endif + } else { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, "cusolver64_12.dll", true, {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path}); +#endif + } +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsolver.so"); +#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION < 13000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so"); +#endif + } else { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.12"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so"); +#endif + } +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11"); +#endif +} + +void* GetCusparseDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib"); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, SPARSE_LIB_NAME); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_11.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 13, paddle " + "temporarily no longer supports"); + return nullptr; + } +#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.11"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so"); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 14000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so"); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 14, paddle " + "temporarily no longer."); + return nullptr; + } +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so"); +#endif +} + +void* GetNVRTCDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, RTC_LIB_NAME); +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); +#endif +} + +void* GetCUDADsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, CUDA_LIB_NAME, false); +#elif defined(PADDLE_WITH_HIP) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); +#elif defined(_WIN32) + char system32_dir[MAX_PATH]; + GetSystemDirectory(system32_dir, MAX_PATH); + return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false); +#endif +} + +void* GetWarpCTCDsoHandle() { + std::string warpctc_dir = ""; + if (!s_py_site_pkg_path.path.empty()) { + warpctc_dir = s_py_site_pkg_path.path; + } +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll"); +#else + return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so"); +#endif +} + +void* GetWarpRNNTDsoHandle() { + std::string warprnnt_dir = ""; + if (!s_py_site_pkg_path.path.empty()) { + warprnnt_dir = s_py_site_pkg_path.path; + } +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(warprnnt_dir, "libwarprnnt.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(warprnnt_dir, "warprnnt.dll"); +#else + return GetDsoHandleFromSearchPath(warprnnt_dir, "libwarprnnt.so"); +#endif +} + +void* GetFlashAttnDsoHandle() { + std::string flashattn_dir = ""; + if (!s_py_site_pkg_path.path.empty()) { + flashattn_dir = s_py_site_pkg_path.path; + } +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattn.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(flashattn_dir, "flashattn.dll"); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(flashattn_dir, FLASHATTN_LIB_NAME); +#else + return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattn.so"); +#endif +} + +void* GetFlashAttnV3DsoHandle() { + std::string flashattn_dir = ""; + if (!s_py_site_pkg_path.path.empty()) { + flashattn_dir = s_py_site_pkg_path.path; + } +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattnv3.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(flashattn_dir, "flashattnv3.dll"); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(flashattn_dir, FLASHATTNV3_LIB_NAME); +#else + return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattnv3.so"); +#endif +} + +void* GetFlashMaskV2DsoHandle() { + std::string flashattn_dir = ""; + if (!s_py_site_pkg_path.path.empty()) { + flashattn_dir = s_py_site_pkg_path.path; + } +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(flashattn_dir, "libflashmaskv2.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(flashattn_dir, "flashmaskv2.dll"); +#else + return GetDsoHandleFromSearchPath(flashattn_dir, "libflashmaskv2.so"); +#endif +} + +void* GetAfsApiDsoHandle() { + std::string afsapi_dir = ""; + if (!s_py_site_pkg_path.path.empty()) { + afsapi_dir = s_py_site_pkg_path.path; + } +#if defined(__APPLE__) || defined(__OSX__) || defined(_WIN32) + return NULL; +#else + return GetDsoHandleFromSearchPath(afsapi_dir, "libafs-api-so.so"); +#endif +} + +void* GetNCCLDsoHandle() { +#ifdef PADDLE_WITH_HIP + std::string warning_msg( + "You may need to install 'rccl' from ROCM official website: " + "https://rocmdocs.amd.com/en/latest/Installation_Guide/" + "Installation-Guide.html before install PaddlePaddle."); +#else + std::string warning_msg( + "You may need to install 'nccl2' from NVIDIA official website: " + "https://developer.nvidia.com/nccl/nccl-download " + "before install PaddlePaddle."); +#endif + +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath( + FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg); +#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) + return GetDsoHandleFromSearchPath( + FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); +#else +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_nccl_dir, "libnccl.so;libnccl.so.2", true, {}, warning_msg); +#else +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath( + FLAGS_nccl_dir, CCL_LIB_NAME, true, {}, warning_msg); +#else + return GetDsoHandleFromSearchPath( + FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); +#endif +#endif + +#endif +} + +void* GetFLAGCXDsoHandle() { +#ifdef PADDLE_WITH_FLAGCX + return GetDsoHandleFromSearchPath(FLAGS_flagcx_dir, "libflagcx.so"); +#else + return nullptr; +#endif +} + +void* GetTensorRtDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll"); +#else + return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); +#endif +} + +void* GetMKLMLDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); +#else + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); +#endif +} + +void* GetHMLDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return nullptr; +#elif defined(_WIN32) + return nullptr; +#else + return GetDsoHandleFromSearchPath(FLAGS_hml_dir, "libhml_rt.so"); +#endif +} + +void* GetLAPACKDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) +#if defined(__arm__) || defined(__aarch64__) + return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib"); +#endif +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll"); +#else + return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3"); +#endif +} + +void* GetMAGMADsoHandle() { +#if defined(PADDLE_WITH_MAGMA) + return GetDsoHandleFromSearchPath(FLAGS_magma_dir, "libmagma.so"); +#endif + return nullptr; +} + +void* GetOpDsoHandle(const std::string& dso_name) { + return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name); +} + +void* GetNvtxDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + PADDLE_THROW(common::errors::Unimplemented("Nvtx do not support Apple.")); +#elif defined(_WIN32) + PADDLE_THROW(common::errors::Unimplemented("Nvtx do not support Windows.")); +#elif !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) + PADDLE_THROW(common::errors::Unimplemented( + "Nvtx do not support without CUDA or XPU.")); +#elif defined(PADDLE_WITH_XPU) + return GetDsoHandleFromSearchPath("FLAGS_xputx_dir", "libxpuToolsExt.so"); +#else + if (CUDA_VERSION >= 12090) { + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvtx3interop.so.1"); + } + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so"); +#endif +} + +void* GetCUFFTDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib"); +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, FFT_LIB_NAME); +#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.10"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.11"); + } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) { + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.12"); + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 14, paddle " + "temporarily no longer supports"); + return nullptr; + } +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_10.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) { +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_12.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 13, paddle " + "temporarily no longer supports"); + return nullptr; + } +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); +#endif +} + +void* GetMKLRTDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll"); +#else + return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so"); +#endif +} + +void* GetCusparseLtDsoHandle() { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + return GetDsoHandleFromSearchPath(FLAGS_cusparselt_dir, SPARSELT_LIB_NAME); +#elif defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cusparselt_dir, "libcusparseLt.so"); +#else + std::string warning_msg( + "Your CUDA_VERSION less 11.2, not support cusparseLt. " + "If you want to use cusparseLt, please upgrade CUDA and rebuild " + "PaddlePaddle."); + return nullptr; +#endif +} + +void* GetXPTIDsoHandle() { +#ifdef PADDLE_WITH_XPTI + return GetDsoHandleFromSearchPath(FLAGS_xpti_dir, "libxpti.so"); +#else + return nullptr; +#endif +} +} // namespace phi::dynload diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.h new file mode 100644 index 00000000000..208eaca1137 --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include "paddle/utils/test_macros.h" +namespace phi { +namespace dynload { + +#ifndef _WIN32 +#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) +#else +#define DECLARE_TYPE(__name, ...) decltype(auto) +#endif + +void* GetCublasDsoHandle(); +void* GetCublasLtDsoHandle(); +void* GetCUDNNDsoHandle(); +void* GetCUPTIDsoHandle(); +void* GetCurandDsoHandle(); +void* GetNvjpegDsoHandle(); +void* GetCusolverDsoHandle(); +void* GetCusparseDsoHandle(); +void* GetNVRTCDsoHandle(); +void* GetCUDADsoHandle(); +void* GetWarpCTCDsoHandle(); +void* GetWarpRNNTDsoHandle(); +void* GetFlashAttnDsoHandle(); +void* GetFlashAttnV3DsoHandle(); +void* GetFlashMaskV2DsoHandle(); +void* GetNCCLDsoHandle(); +void* GetFLAGCXDsoHandle(); +void* GetTensorRtDsoHandle(); +void* GetMKLMLDsoHandle(); +void* GetLAPACKDsoHandle(); +void* GetMAGMADsoHandle(); +void* GetOpDsoHandle(const std::string& dso_name); +void* GetNvtxDsoHandle(); +void* GetCUFFTDsoHandle(); +void* GetMKLRTDsoHandle(); +void* GetHMLDsoHandle(); +void* GetROCFFTDsoHandle(); +void* GetCusparseLtDsoHandle(); +void* GetXPTIDsoHandle(); +void* GetAfsApiDsoHandle(); + +void SetPaddleLibPath(const std::string&); + +} // namespace dynload +} // namespace phi diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/nvjpeg.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/nvjpeg.h new file mode 100644 index 00000000000..bb2dc37f895 --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/nvjpeg.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_CUDA +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" + +namespace phi { +namespace dynload { +extern std::once_flag nvjpeg_dso_flag; +extern void* nvjpeg_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + nvjpegStatus_t operator()(Args... args) { \ + using nvjpegFunc = decltype(&::__name); \ + std::call_once(nvjpeg_dso_flag, []() { \ + nvjpeg_dso_handle = phi::dynload::GetNvjpegDsoHandle(); \ + }); \ + std::string replaced_name = #__name; \ + replaced_name = replaced_name.replace(0, 2, "mc"); \ + static void* p_##__name = \ + dlsym(nvjpeg_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ + __macro(nvjpegCreateSimple); \ + __macro(nvjpegJpegStateCreate); \ + __macro(nvjpegGetImageInfo); \ + __macro(nvjpegJpegStateDestroy); \ + __macro(nvjpegDecode); + +NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); + +} // namespace dynload +} // namespace phi + +#endif diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/warpctc.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/warpctc.h new file mode 100644 index 00000000000..bea933a7e3b --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/warpctc.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" +#include "warpctc/include/ctc.h" + +namespace phi { +namespace dynload { + +extern std::once_flag warpctc_dso_flag; +extern void* warpctc_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using warpctcFunc = decltype(&::__name); \ + std::call_once(warpctc_dso_flag, []() { \ + warpctc_dso_handle = phi::dynload::GetWarpCTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + DYNAMIC_LOAD_WARPCTC_WRAP(__name) + +#define WARPCTC_ROUTINE_EACH(__macro) \ + __macro(get_warpctc_version); \ + __macro(ctcGetStatusString); \ + __macro(compute_ctc_loss); \ + __macro(compute_ctc_loss_double); \ + __macro(get_workspace_size); \ + __macro(get_workspace_size_double) + +WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} // namespace dynload +} // namespace phi diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/warprnnt.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/warprnnt.h new file mode 100644 index 00000000000..5a84efc491e --- /dev/null +++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/warprnnt.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/common/port.h" +#include "warprnnt/include/rnnt.h" + +namespace phi { +namespace dynload { + +extern std::once_flag warprnnt_dso_flag; +extern void* warprnnt_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warprnnt routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using warprnntFunc = decltype(&::__name); \ + std::call_once(warprnnt_dso_flag, []() { \ + warprnnt_dso_handle = phi::dynload::GetWarpRNNTDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(warprnnt_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ + DYNAMIC_LOAD_WARPRNNT_WRAP(__name) + +#define WARPRNNT_ROUTINE_EACH(__macro) \ + __macro(get_warprnnt_version); \ + __macro(rnntGetStatusString); \ + __macro(compute_rnnt_loss); \ + __macro(compute_rnnt_loss_fp64); \ + __macro(get_rnnt_workspace_size); + +WARPRNNT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP); + +#undef DYNAMIC_LOAD_WARPRNNT_WRAP + +} // namespace dynload +} // namespace phi diff --git a/backends/metax_gpu/patch/Eigen_3.4.0_paddle.zip b/backends/metax_gpu/patch/Eigen_3.4.0_paddle.zip new file mode 100644 index 00000000000..c82a9c81f62 Binary files /dev/null and b/backends/metax_gpu/patch/Eigen_3.4.0_paddle.zip differ diff --git a/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip b/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip deleted file mode 100644 index 69d962f1132..00000000000 Binary files a/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip and /dev/null differ diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 71bf0b8777d..ab8d77514ac 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -34,149 +34,6 @@ index 8d445b39ae..504e7b6293 100755 if(CUDA_VERSION GREATER_EQUAL 11.6) op_library(fused_gemm_epilogue_op) endif() -diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h -index bda9cbe17e..c73eba9c8a 100644 ---- a/paddle/phi/backends/dynload/cublas.h -+++ b/paddle/phi/backends/dynload/cublas.h -@@ -49,7 +49,12 @@ extern void *cublas_dso_handle; - std::call_once(cublas_dso_flag, []() { \ - cublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ - }); \ -- static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ -+ std::string replaced_name = #__name; \ -+ replaced_name = replaced_name.replace(0, 2, "mc"); \ -+ int index = replaced_name.find("_", 0); \ -+ if (index != -1) replaced_name = replaced_name.substr(0, index); \ -+ static void* p_##__name = \ -+ dlsym(cublas_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ -diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h -index 8b2e08c777..ca926df151 100644 ---- a/paddle/phi/backends/dynload/cublasLt.h -+++ b/paddle/phi/backends/dynload/cublasLt.h -@@ -46,12 +46,14 @@ extern void *cublasLt_dso_handle; - std::call_once(cublasLt_dso_flag, []() { \ - cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \ - }); \ -- static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \ -+ std::string replaced_name = #__name; \ -+ replaced_name = replaced_name.replace(0, 2, "mc"); \ -+ static void* p_##__name = \ -+ dlsym(cublasLt_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name -- - // APIs available after CUDA 11.1 - #if CUDA_VERSION >= 11010 || defined(PADDLE_WITH_CUSTOM_DEVICE) - #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ -@@ -79,8 +81,8 @@ extern void *cublasLt_dso_handle; - __macro(cublasLtMatmulAlgoConfigGetAttribute); \ - __macro(cublasLtMatmulAlgoGetIds); \ - __macro(cublasLtMatmulAlgoCapGetAttribute); \ -- __macro(cublasLtMatmulAlgoCheck); \ -- __macro(cublasLtGetCudartVersion); -+ __macro(cublasLtMatmulAlgoCheck); -+ // __macro(cublasLtGetCudartVersion); - #else - #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasLtCreate); \ -diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h -index ad2ada9dfa..9e8389e7dc 100644 ---- a/paddle/phi/backends/dynload/cudnn.h -+++ b/paddle/phi/backends/dynload/cudnn.h -@@ -38,7 +38,10 @@ extern void EnforceCUDNNLoaded(const char* fn_name); - cudnn_dso_handle = phi::dynload::GetCUDNNDsoHandle(); \ - }); \ - EnforceCUDNNLoaded(#__name); \ -- static void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ -+ std::string replaced_name = #__name; \ -+ replaced_name = replaced_name.replace(0, 2, "mc"); \ -+ static void* p_##__name = \ -+ dlsym(cudnn_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ -diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h -index 1547909d92..ef20838434 100644 ---- a/paddle/phi/backends/dynload/cufft.h -+++ b/paddle/phi/backends/dynload/cufft.h -@@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); -@@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); - cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ - }); \ - EnforceCUFFTLoaded(#__name); \ -- static void* p_##__name = dlsym(cufft_dso_handle, #__name); \ -+ std::string replaced_name = #__name; \ -+ replaced_name = replaced_name.replace(0,2,"mc"); \ -+ static void* p_##__name = dlsym(cufft_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ -diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h -index 4241a512e8..94e32b743e 100644 ---- a/paddle/phi/backends/dynload/cupti.h -+++ b/paddle/phi/backends/dynload/cupti.h -@@ -24,8 +24,8 @@ limitations under the License. */ - #include "paddle/phi/backends/dynload/dynamic_loader.h" - #include "paddle/phi/common/port.h" - --namespace phi { --namespace dynload { -+// namespace phi { -+// namespace dynload { - - extern std::once_flag cupti_dso_flag; - extern void *cupti_dso_handle; -@@ -105,7 +105,7 @@ inline bool IsXPUTracingEnabled() { - CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - - #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP --} // namespace dynload --} // namespace phi -+// } // namespace dynload -+// } // namespace phi - --#endif // PADDLE_WITH_CUPTI -+#endif // PADDLE_WITH_CUPTI -\ No newline at end of file -diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h -index 57e09bb6e4..87fb5b1797 100644 ---- a/paddle/phi/backends/dynload/cusolver.h -+++ b/paddle/phi/backends/dynload/cusolver.h -@@ -34,7 +34,9 @@ extern void *cusolver_dso_handle; - std::call_once(cusolver_dso_flag, []() { \ - cusolver_dso_handle = phi::dynload::GetCusolverDsoHandle(); \ - }); \ -- static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \ -+ std::string replaced_name = #__name; \ -+ replaced_name = replaced_name.replace(0,2,"mc"); \ -+ static void* p_##__name = dlsym(cusolver_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ -diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h -index e8cb0ac643..e8e7596d44 100644 ---- a/paddle/phi/backends/dynload/cusparse.h -+++ b/paddle/phi/backends/dynload/cusparse.h -@@ -34,7 +34,9 @@ extern void *cusparse_dso_handle; - std::call_once(cusparse_dso_flag, []() { \ - cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \ - }); \ -- static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \ -+ std::string replaced_name = #__name; \ -+ replaced_name = replaced_name.replace(0,2,"mc"); \ -+ static void* p_##__name = dlsym(cusparse_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 39f50bd95d..4d627b99b7 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc @@ -200,21 +57,6 @@ index 39f50bd95d..4d627b99b7 100644 #ifdef PADDLE_WITH_HIP PHI_DEFINE_string(miopen_dir, -diff --git a/paddle/phi/backends/dynload/nvjpeg.h b/paddle/phi/backends/dynload/nvjpeg.h -index c5309e7e11..3328571380 100644 ---- a/paddle/phi/backends/dynload/nvjpeg.h -+++ b/paddle/phi/backends/dynload/nvjpeg.h -@@ -31,7 +31,9 @@ extern void *nvjpeg_dso_handle; - std::call_once(nvjpeg_dso_flag, []() { \ - nvjpeg_dso_handle = phi::dynload::GetNvjpegDsoHandle(); \ - }); \ -- static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \ -+ std::string replaced_name = #__name; \ -+ replaced_name = replaced_name.replace(0,2,"mc"); \ -+ static void* p_##__name = dlsym(nvjpeg_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h index 092365a961..8bd3f9fcea 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h diff --git a/backends/metax_gpu/setup.py.in b/backends/metax_gpu/setup.py.in index b1600e9bb5a..41831e07b15 100644 --- a/backends/metax_gpu/setup.py.in +++ b/backends/metax_gpu/setup.py.in @@ -1,6 +1,20 @@ import os from setuptools import setup, Distribution +from setuptools.command.install import install +import os +import shutil + +class CustomInstallCommand(install): + def run(self): + install.run(self) + install_dir = os.path.join(self.install_lib, 'paddle_custom_device', 'include') + source_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'python', 'include') + + if os.path.exists(source_dir): + if not os.path.exists(install_dir): + os.makedirs(install_dir) + shutil.copytree(source_dir, install_dir, dirs_exist_ok=True) packages = [] package_data = {} @@ -108,13 +122,17 @@ def main(): ], include_package_data=True, package_data = { - '': ['*.so', '*.h', '*.py', '*.hpp'], + 'paddle_custom_device': ['*.so'], + '': ['*.h', '*.hpp'], }, package_dir = { '': 'python', }, zip_safe=False, distclass=BinaryDistribution, + cmdclass={ + 'install': CustomInstallCommand, +}, entry_points={ 'console_scripts': [ ]