Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions .github/workflows/_Metax-X86.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,6 @@ on:
default: 'true'


defaults:
run:
shell: bash


jobs:

check-bypass:
Expand Down Expand Up @@ -65,10 +60,10 @@ jobs:
# !!!!! SKIP IF NO METAX CHANGE !!!!
echo "=========== Checking PR Changes If METAX FULL CI Needed ==========="
change_numbers=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | wc -l)
# change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l)
change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true)
# change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l)
change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true)
change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l)
# change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true)
change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l)
# change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true)
git --no-pager diff --name-only remotes/origin/${BRANCH}

if [ $change_numbers -ne $change_backend ]; then
Expand Down
5 changes: 5 additions & 0 deletions backends/metax_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@ target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcFlashAttn.so)
target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcpti.so)

include_directories(BEFORE ${PADDLE_SOURCE_DIR})
include_directories(BEFORE ${CMAKE_SOURCE_DIR}/headers)

target_compile_definitions(
${TARGET_NAME}
Expand Down Expand Up @@ -826,8 +827,12 @@ add_custom_command(
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/python/
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/python/
COMMAND ${CMAKE_COMMAND} -E make_directory
${CMAKE_CURRENT_BINARY_DIR}/python/include/
COMMAND ${CMAKE_COMMAND} -E make_directory
${CMAKE_CURRENT_BINARY_DIR}/python/paddle_custom_device/
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/headers
${CMAKE_CURRENT_BINARY_DIR}/python/include/
COMMAND
${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.so
Expand Down
4 changes: 2 additions & 2 deletions backends/metax_gpu/change_patch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

rm -r ../../Paddle/third_party/eigen3
cd patch
unzip mcEigen_3.4.0_paddle_final.zip
mv mcEigen_3.4.0_paddle_final eigen3
unzip Eigen_3.4.0_paddle.zip
mv Eigen_3.4.0_paddle eigen3
cd ..
cp -r patch/eigen3/ ../../Paddle/third_party/eigen3
rm -r patch/eigen3
Expand Down
2 changes: 1 addition & 1 deletion backends/metax_gpu/compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ fi
echo "make_maca"
cd build
cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON
make_maca -j18
make_maca -j18 VERBOSE=1


echo "install whl"
Expand Down
148 changes: 148 additions & 0 deletions backends/metax_gpu/headers/paddle/phi/backends/dynload/cublas.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cublasXt.h>
#include <cublas_v2.h>
#include <cuda.h>
#if CUDA_VERSION >= 12030 && defined(__linux__)
#include <cublas_api.h>
#endif

#include <mutex> // NOLINT
#include <type_traits>

#include "paddle/phi/backends/dynload/dynamic_loader.h"
#include "paddle/phi/common/port.h"

namespace phi {
namespace dynload {

extern std::once_flag cublas_dso_flag;
extern void* cublas_dso_handle;

/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublas routine
* via operator overloading.
*
* note: default dynamic linked libs
*/
// Generates, for each cuBLAS symbol, a functor struct that lazily opens the
// cuBLAS shared library exactly once (std::call_once) and forwards every call
// through a dlsym lookup cached in a function-local static.
// Metax specifics: the leading "cu" prefix is rewritten to "mc", and any
// version suffix starting at the first '_' (e.g. "_v2") is stripped, so
// cublasSgemm_v2 resolves to the mcblasSgemm entry point in the MACA library.
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
  struct DynLoad__##__name {                                                 \
    template <typename... Args>                                              \
    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {  \
      using cublas_func =                                                    \
          decltype(::__name(std::declval<Args>()...)) (*)(Args...);          \
      std::call_once(cublas_dso_flag, []() {                                 \
        cublas_dso_handle = phi::dynload::GetCublasDsoHandle();              \
      });                                                                    \
      std::string replaced_name = #__name;                                   \
      replaced_name = replaced_name.replace(0, 2, "mc");                     \
      /* find() returns std::string::npos (a size_t); funnelling it       */ \
      /* through int and comparing to -1 is implementation-defined        */ \
      /* narrowing, so compare against npos directly instead.             */ \
      const std::string::size_type cut = replaced_name.find('_');            \
      if (cut != std::string::npos)                                          \
        replaced_name = replaced_name.substr(0, cut);                        \
      static void* p_##__name =                                              \
          dlsym(cublas_dso_handle, replaced_name.c_str());                   \
      return reinterpret_cast<cublas_func>(p_##__name)(args...);             \
    }                                                                        \
  };                                                                         \
  extern DynLoad__##__name __name

#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
__macro(cublasCaxpy_v2); \
__macro(cublasZaxpy_v2); \
__macro(cublasSscal_v2); \
__macro(cublasDscal_v2); \
__macro(cublasScopy_v2); \
__macro(cublasDcopy_v2); \
__macro(cublasSgemv_v2); \
__macro(cublasDgemv_v2); \
__macro(cublasCgemv_v2); \
__macro(cublasZgemv_v2); \
__macro(cublasSgemm_v2); \
__macro(cublasDgemm_v2); \
__macro(cublasCgemm_v2); \
__macro(cublasZgemm_v2); \
__macro(cublasHgemm); \
__macro(cublasSgemmEx); \
__macro(cublasSgeam); \
__macro(cublasDgeam); \
__macro(cublasStrsm_v2); \
__macro(cublasDtrsm_v2); \
__macro(cublasCtrsm_v2); \
__macro(cublasZtrsm_v2); \
__macro(cublasCreate_v2); \
__macro(cublasDestroy_v2); \
__macro(cublasSetStream_v2); \
__macro(cublasSetPointerMode_v2); \
__macro(cublasGetPointerMode_v2); \
__macro(cublasSgemmBatched); \
__macro(cublasDgemmBatched); \
__macro(cublasCgemmBatched); \
__macro(cublasZgemmBatched); \
__macro(cublasStrsmBatched); \
__macro(cublasDtrsmBatched); \
__macro(cublasCtrsmBatched); \
__macro(cublasZtrsmBatched); \
__macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \
__macro(cublasDgetriBatched); \
__macro(cublasCgetrfBatched); \
__macro(cublasCgetriBatched); \
__macro(cublasZgetrfBatched); \
__macro(cublasZgetriBatched); \
__macro(cublasSmatinvBatched); \
__macro(cublasDmatinvBatched); \
__macro(cublasCmatinvBatched); \
__macro(cublasZmatinvBatched); \
__macro(cublasSgetrsBatched); \
__macro(cublasDgetrsBatched); \
__macro(cublasSdot_v2); \
__macro(cublasDdot_v2); \
__macro(cublasCdotc_v2); \
__macro(cublasZdotc_v2); \
__macro(cublasCdotu_v2); \
__macro(cublasZdotu_v2); \
__macro(cublasDotEx); \
__macro(cublasGemmEx); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); \
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode); \
__macro(cublasCgeam); \
__macro(cublasZgeam); \
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);

CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)

#if CUDA_VERSION >= 12030 && defined(__linux__)
#define CUBLAS_BLAS_ROUTINE_EACH_R5(__macro) \
__macro(cublasGemmStridedBatchedEx_64); \
__macro(cublasGemmEx_64); \
__macro(cublasSgemmEx_64);

CUBLAS_BLAS_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif

#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload
} // namespace phi
114 changes: 114 additions & 0 deletions backends/metax_gpu/headers/paddle/phi/backends/dynload/cublasLt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cublasLt.h>
#include <cuda.h>

#include <mutex> // NOLINT
#include <type_traits>

#include "paddle/phi/backends/dynload/dynamic_loader.h"
#include "paddle/phi/common/port.h"

namespace phi {
namespace dynload {

extern std::once_flag cublasLt_dso_flag;
extern void* cublasLt_dso_handle;

/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublasLt routine
* via operator overloading.
*
* note: default dynamic linked libs
*/
// Generates, for each cublasLt symbol, a functor struct that lazily opens the
// cublasLt shared library exactly once (std::call_once) and forwards every
// call through a dlsym lookup cached in a function-local static.
// Metax specifics: the leading "cu" prefix is rewritten to "mc", so e.g.
// cublasLtMatmul resolves to the mcblasLtMatmul entry point. Unlike the
// cuBLAS wrapper in cublas.h, no "_vN" version suffix is stripped here.
// (Comments must sit above the macro: a "//" comment inside a
// backslash-continued line would swallow the following spliced line.)
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \
}); \
std::string replaced_name = #__name; \
replaced_name = replaced_name.replace(0, 2, "mc"); \
static void* p_##__name = \
dlsym(cublasLt_dso_handle, replaced_name.c_str()); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
// APIs available after CUDA 11.1
#if CUDA_VERSION >= 11010 || defined(PADDLE_WITH_CUSTOM_DEVICE)
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatmulDescGetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixLayoutGetAttribute); \
__macro(cublasLtMatmulPreferenceCreate); \
__macro(cublasLtMatmulPreferenceDestroy); \
__macro(cublasLtMatmulPreferenceSetAttribute); \
__macro(cublasLtMatmulAlgoGetHeuristic); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute); \
__macro(cublasLtMatmulAlgoInit); \
__macro(cublasLtMatmulAlgoConfigSetAttribute); \
__macro(cublasLtMatmulAlgoConfigGetAttribute); \
__macro(cublasLtMatmulAlgoGetIds); \
__macro(cublasLtMatmulAlgoCapGetAttribute); \
__macro(cublasLtMatmulAlgoCheck);
// __macro(cublasLtGetCudartVersion);
#else
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatmulDescGetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixLayoutGetAttribute); \
__macro(cublasLtMatmulPreferenceCreate); \
__macro(cublasLtMatmulPreferenceDestroy); \
__macro(cublasLtMatmulPreferenceSetAttribute); \
__macro(cublasLtMatmulAlgoGetHeuristic); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
#endif

CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif

#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload
} // namespace phi
Loading
Loading