diff --git a/.github/workflows/_Metax-X86.yml b/.github/workflows/_Metax-X86.yml
index d90a375d79f..3754be15a0d 100644
--- a/.github/workflows/_Metax-X86.yml
+++ b/.github/workflows/_Metax-X86.yml
@@ -17,11 +17,6 @@ on:
         default: 'true'
 
 
-defaults:
-  run:
-    shell: bash
-
-
 jobs:
 
   check-bypass:
@@ -65,10 +60,10 @@ jobs:
           # !!!!! SKIP IF NO METAX CHANGE !!!!
           echo "=========== Checking PR Changes If METAX FULL CI Needed ==========="
           change_numbers=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | wc -l)
-          # change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l)
-          change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true)
-          # change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l)
-          change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true)
+          change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/"| wc -l)
+          # change_backend=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/" || true)
+          change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep "backends/metax_gpu"| wc -l)
+          # change_metax_only=$(git --no-pager diff --name-only remotes/origin/${BRANCH} | grep -c "backends/metax_gpu" || true)
           git --no-pager diff --name-only remotes/origin/${BRANCH}
 
           if [ $change_numbers -ne $change_backend ]; then
diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index 2e2f9aa0e1f..79941c2866a 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -788,6 +788,7 @@ target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcFlashAttn.so)
 target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcpti.so)
 
 include_directories(BEFORE ${PADDLE_SOURCE_DIR})
+include_directories(BEFORE ${CMAKE_SOURCE_DIR}/headers)
 
 target_compile_definitions(
   ${TARGET_NAME}
@@ -826,8 +827,12 @@ add_custom_command(
   POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/python/
   COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/python/
+  COMMAND ${CMAKE_COMMAND} -E make_directory
+          ${CMAKE_CURRENT_BINARY_DIR}/python/include/
   COMMAND ${CMAKE_COMMAND} -E make_directory
           ${CMAKE_CURRENT_BINARY_DIR}/python/paddle_custom_device/
+  COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/headers
+          ${CMAKE_CURRENT_BINARY_DIR}/python/include/
   COMMAND
     ${CMAKE_COMMAND} -E copy_if_different
     ${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.so
diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh
index 7ac600a0da7..3fa9a647615 100644
--- a/backends/metax_gpu/change_patch.sh
+++ b/backends/metax_gpu/change_patch.sh
@@ -17,8 +17,8 @@
 
 rm -r ../../Paddle/third_party/eigen3
 cd patch
-unzip mcEigen_3.4.0_paddle_final.zip
-mv mcEigen_3.4.0_paddle_final eigen3
+unzip Eigen_3.4.0_paddle.zip
+mv Eigen_3.4.0_paddle eigen3
 cd ..
 cp -r patch/eigen3/ ../../Paddle/third_party/eigen3
 rm -r patch/eigen3
diff --git a/backends/metax_gpu/compile.sh b/backends/metax_gpu/compile.sh
index bdc6269a60b..ed2b1b2374a 100644
--- a/backends/metax_gpu/compile.sh
+++ b/backends/metax_gpu/compile.sh
@@ -31,7 +31,7 @@ fi
 echo "make_maca"
 cd build
 cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON
-make_maca -j18
+make_maca -j18 VERBOSE=1
 
 
 echo "install whl"
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublas.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublas.h
new file mode 100644
index 00000000000..959614c094e
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublas.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cublasXt.h>
+#include <cublas_v2.h>
+#include <cuda.h>
+#if CUDA_VERSION >= 12030 && defined(__linux__)
+#include <cublas_api.h>
+#endif
+
+#include <mutex>  // NOLINT
+#include <type_traits>
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+
+namespace phi {
+namespace dynload {
+
+extern std::once_flag cublas_dso_flag;
+extern void* cublas_dso_handle;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load cublas routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                            \
+  struct DynLoad__##__name {                                                \
+    template <typename... Args>                                             \
+    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using cublas_func =                                                   \
+          decltype(::__name(std::declval<Args>()...)) (*)(Args...);         \
+      std::call_once(cublas_dso_flag, []() {                                \
+        cublas_dso_handle = phi::dynload::GetCublasDsoHandle();             \
+      });                                                                   \
+      std::string replaced_name = #__name;                                  \
+      replaced_name = replaced_name.replace(0, 2, "mc");                    \
+      int index = replaced_name.find("_", 0);                               \
+      if (index != -1) replaced_name = replaced_name.substr(0, index);      \
+      static void* p_##__name =                                             \
+          dlsym(cublas_dso_handle, replaced_name.c_str());                  \
+      return reinterpret_cast<cublas_func>(p_##__name)(args...);            \
+    }                                                                       \
+  };                                                                        \
+  extern DynLoad__##__name __name
+
+#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
+  __macro(cublasSaxpy_v2);                \
+  __macro(cublasDaxpy_v2);                \
+  __macro(cublasCaxpy_v2);                \
+  __macro(cublasZaxpy_v2);                \
+  __macro(cublasSscal_v2);                \
+  __macro(cublasDscal_v2);                \
+  __macro(cublasScopy_v2);                \
+  __macro(cublasDcopy_v2);                \
+  __macro(cublasSgemv_v2);                \
+  __macro(cublasDgemv_v2);                \
+  __macro(cublasCgemv_v2);                \
+  __macro(cublasZgemv_v2);                \
+  __macro(cublasSgemm_v2);                \
+  __macro(cublasDgemm_v2);                \
+  __macro(cublasCgemm_v2);                \
+  __macro(cublasZgemm_v2);                \
+  __macro(cublasHgemm);                   \
+  __macro(cublasSgemmEx);                 \
+  __macro(cublasSgeam);                   \
+  __macro(cublasDgeam);                   \
+  __macro(cublasStrsm_v2);                \
+  __macro(cublasDtrsm_v2);                \
+  __macro(cublasCtrsm_v2);                \
+  __macro(cublasZtrsm_v2);                \
+  __macro(cublasCreate_v2);               \
+  __macro(cublasDestroy_v2);              \
+  __macro(cublasSetStream_v2);            \
+  __macro(cublasSetPointerMode_v2);       \
+  __macro(cublasGetPointerMode_v2);       \
+  __macro(cublasSgemmBatched);            \
+  __macro(cublasDgemmBatched);            \
+  __macro(cublasCgemmBatched);            \
+  __macro(cublasZgemmBatched);            \
+  __macro(cublasStrsmBatched);            \
+  __macro(cublasDtrsmBatched);            \
+  __macro(cublasCtrsmBatched);            \
+  __macro(cublasZtrsmBatched);            \
+  __macro(cublasSgetrfBatched);           \
+  __macro(cublasSgetriBatched);           \
+  __macro(cublasDgetrfBatched);           \
+  __macro(cublasDgetriBatched);           \
+  __macro(cublasCgetrfBatched);           \
+  __macro(cublasCgetriBatched);           \
+  __macro(cublasZgetrfBatched);           \
+  __macro(cublasZgetriBatched);           \
+  __macro(cublasSmatinvBatched);          \
+  __macro(cublasDmatinvBatched);          \
+  __macro(cublasCmatinvBatched);          \
+  __macro(cublasZmatinvBatched);          \
+  __macro(cublasSgetrsBatched);           \
+  __macro(cublasDgetrsBatched);           \
+  __macro(cublasSdot_v2);                 \
+  __macro(cublasDdot_v2);                 \
+  __macro(cublasCdotc_v2);                \
+  __macro(cublasZdotc_v2);                \
+  __macro(cublasCdotu_v2);                \
+  __macro(cublasZdotu_v2);                \
+  __macro(cublasDotEx);                   \
+  __macro(cublasGemmEx);                  \
+  __macro(cublasSgemmStridedBatched);     \
+  __macro(cublasDgemmStridedBatched);     \
+  __macro(cublasCgemmStridedBatched);     \
+  __macro(cublasZgemmStridedBatched);     \
+  __macro(cublasHgemmStridedBatched);     \
+  __macro(cublasSetMathMode);             \
+  __macro(cublasGetMathMode);             \
+  __macro(cublasCgeam);                   \
+  __macro(cublasZgeam);                   \
+  __macro(cublasGemmBatchedEx);           \
+  __macro(cublasGemmStridedBatchedEx);
+
+CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+
+#if CUDA_VERSION >= 12030 && defined(__linux__)
+#define CUBLAS_BLAS_ROUTINE_EACH_R5(__macro) \
+  __macro(cublasGemmStridedBatchedEx_64);    \
+  __macro(cublasGemmEx_64);                  \
+  __macro(cublasSgemmEx_64);
+
+CUBLAS_BLAS_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#endif
+
+#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
+}  // namespace dynload
+}  // namespace phi
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublasLt.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublasLt.h
new file mode 100644
index 00000000000..5bb2ae07f0b
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cublasLt.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cublasLt.h>
+#include <cuda.h>
+
+#include <mutex>  // NOLINT
+#include <type_traits>
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+
+namespace phi {
+namespace dynload {
+
+extern std::once_flag cublasLt_dso_flag;
+extern void* cublasLt_dso_handle;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load cublasLt routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name)                          \
+  struct DynLoad__##__name {                                                \
+    template <typename... Args>                                             \
+    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using cublasLt_func =                                                 \
+          decltype(::__name(std::declval<Args>()...)) (*)(Args...);         \
+      std::call_once(cublasLt_dso_flag, []() {                              \
+        cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle();         \
+      });                                                                   \
+      std::string replaced_name = #__name;                                  \
+      replaced_name = replaced_name.replace(0, 2, "mc");                    \
+      static void* p_##__name =                                             \
+          dlsym(cublasLt_dso_handle, replaced_name.c_str());                \
+      return reinterpret_cast<cublasLt_func>(p_##__name)(args...);          \
+    }                                                                       \
+  };                                                                        \
+  extern DynLoad__##__name __name
+// APIs available after CUDA 11.1
+#if CUDA_VERSION >= 11010 || defined(PADDLE_WITH_CUSTOM_DEVICE)
+#define CUBLASLT_BLAS_ROUTINE_EACH(__macro)         \
+  __macro(cublasLtCreate);                          \
+  __macro(cublasLtDestroy);                         \
+  __macro(cublasLtMatmul);                          \
+  __macro(cublasLtMatmulDescCreate);                \
+  __macro(cublasLtMatmulDescDestroy);               \
+  __macro(cublasLtMatmulDescSetAttribute);          \
+  __macro(cublasLtMatmulDescGetAttribute);          \
+  __macro(cublasLtMatrixLayoutCreate);              \
+  __macro(cublasLtMatrixLayoutDestroy);             \
+  __macro(cublasLtMatrixLayoutSetAttribute);        \
+  __macro(cublasLtMatrixLayoutGetAttribute);        \
+  __macro(cublasLtMatmulPreferenceCreate);          \
+  __macro(cublasLtMatmulPreferenceDestroy);         \
+  __macro(cublasLtMatmulPreferenceSetAttribute);    \
+  __macro(cublasLtMatmulAlgoGetHeuristic);          \
+  __macro(cublasLtMatrixTransform);                 \
+  __macro(cublasLtMatrixTransformDescCreate);       \
+  __macro(cublasLtMatrixTransformDescDestroy);      \
+  __macro(cublasLtMatrixTransformDescSetAttribute); \
+  __macro(cublasLtMatmulAlgoInit);                  \
+  __macro(cublasLtMatmulAlgoConfigSetAttribute);    \
+  __macro(cublasLtMatmulAlgoConfigGetAttribute);    \
+  __macro(cublasLtMatmulAlgoGetIds);                \
+  __macro(cublasLtMatmulAlgoCapGetAttribute);       \
+  __macro(cublasLtMatmulAlgoCheck);
+// __macro(cublasLtGetCudartVersion);
+#else
+#define CUBLASLT_BLAS_ROUTINE_EACH(__macro)      \
+  __macro(cublasLtCreate);                       \
+  __macro(cublasLtDestroy);                      \
+  __macro(cublasLtMatmul);                       \
+  __macro(cublasLtMatmulDescCreate);             \
+  __macro(cublasLtMatmulDescDestroy);            \
+  __macro(cublasLtMatmulDescSetAttribute);       \
+  __macro(cublasLtMatmulDescGetAttribute);       \
+  __macro(cublasLtMatrixLayoutCreate);           \
+  __macro(cublasLtMatrixLayoutDestroy);          \
+  __macro(cublasLtMatrixLayoutSetAttribute);     \
+  __macro(cublasLtMatrixLayoutGetAttribute);     \
+  __macro(cublasLtMatmulPreferenceCreate);       \
+  __macro(cublasLtMatmulPreferenceDestroy);      \
+  __macro(cublasLtMatmulPreferenceSetAttribute); \
+  __macro(cublasLtMatmulAlgoGetHeuristic);       \
+  __macro(cublasLtMatrixTransform);              \
+  __macro(cublasLtMatrixTransformDescCreate);    \
+  __macro(cublasLtMatrixTransformDescDestroy);   \
+  __macro(cublasLtMatrixTransformDescSetAttribute);
+#endif
+
+CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
+// #endif
+
+#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
+}  // namespace dynload
+}  // namespace phi
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cudnn.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cudnn.h
new file mode 100644
index 00000000000..b3d3d582a06
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cudnn.h
@@ -0,0 +1,219 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#ifdef PADDLE_WITH_CUDA
+#include <cudnn.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+
+namespace phi {
+namespace dynload {
+
+extern std::once_flag cudnn_dso_flag;
+extern void* cudnn_dso_handle;
+extern bool HasCUDNN();
+
+extern void EnforceCUDNNLoaded(const char* fn_name);
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                      \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using cudnn_func = decltype(&::__name);                        \
+      std::call_once(cudnn_dso_flag, []() {                          \
+        cudnn_dso_handle = phi::dynload::GetCUDNNDsoHandle();        \
+      });                                                            \
+      EnforceCUDNNLoaded(#__name);                                   \
+      std::string replaced_name = #__name;                           \
+      replaced_name = replaced_name.replace(0, 2, "mc");             \
+      static void* p_##__name =                                      \
+          dlsym(cudnn_dso_handle, replaced_name.c_str());            \
+      return reinterpret_cast<cudnn_func>(p_##__name)(args...);      \
+    }                                                                \
+  };                                                                 \
+  extern struct DynLoad__##__name __name
+
+/**
+ * include all needed cudnn functions in HPPL
+ * different cudnn version has different interfaces
+ **/
+#define CUDNN_DNN_ROUTINE_EACH(__macro)                    \
+  __macro(cudnnSetCallback);                               \
+  __macro(cudnnSetTensor4dDescriptor);                     \
+  __macro(cudnnSetTensor4dDescriptorEx);                   \
+  __macro(cudnnSetTensorNdDescriptor);                     \
+  __macro(cudnnGetTensorNdDescriptor);                     \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);          \
+  __macro(cudnnCreateTensorDescriptor);                    \
+  __macro(cudnnDestroyTensorDescriptor);                   \
+  __macro(cudnnCreateFilterDescriptor);                    \
+  __macro(cudnnSetFilter4dDescriptor);                     \
+  __macro(cudnnSetFilterNdDescriptor);                     \
+  __macro(cudnnGetFilterNdDescriptor);                     \
+  __macro(cudnnSetPooling2dDescriptor);                    \
+  __macro(cudnnSetPoolingNdDescriptor);                    \
+  __macro(cudnnGetPoolingNdDescriptor);                    \
+  __macro(cudnnDestroyFilterDescriptor);                   \
+  __macro(cudnnCreateConvolutionDescriptor);               \
+  __macro(cudnnCreatePoolingDescriptor);                   \
+  __macro(cudnnDestroyPoolingDescriptor);                  \
+  __macro(cudnnSetConvolution2dDescriptor);                \
+  __macro(cudnnDestroyConvolutionDescriptor);              \
+  __macro(cudnnSetConvolutionNdDescriptor);                \
+  __macro(cudnnGetConvolutionNdDescriptor);                \
+  __macro(cudnnDeriveBNTensorDescriptor);                  \
+  __macro(cudnnCreateSpatialTransformerDescriptor);        \
+  __macro(cudnnSetSpatialTransformerNdDescriptor);         \
+  __macro(cudnnDestroySpatialTransformerDescriptor);       \
+  __macro(cudnnSpatialTfGridGeneratorForward);             \
+  __macro(cudnnSpatialTfGridGeneratorBackward);            \
+  __macro(cudnnSpatialTfSamplerForward);                   \
+  __macro(cudnnSpatialTfSamplerBackward);                  \
+  __macro(cudnnCreate);                                    \
+  __macro(cudnnDestroy);                                   \
+  __macro(cudnnSetStream);                                 \
+  __macro(cudnnActivationForward);                         \
+  __macro(cudnnActivationBackward);                        \
+  __macro(cudnnConvolutionForward);                        \
+  __macro(cudnnConvolutionBackwardBias);                   \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize);        \
+  __macro(cudnnTransformTensor);                           \
+  __macro(cudnnPoolingForward);                            \
+  __macro(cudnnPoolingBackward);                           \
+  __macro(cudnnSoftmaxBackward);                           \
+  __macro(cudnnSoftmaxForward);                            \
+  __macro(cudnnGetVersion);                                \
+  __macro(cudnnFindConvolutionForwardAlgorithmEx);         \
+  __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx);  \
+  __macro(cudnnFindConvolutionBackwardFilterAlgorithm);    \
+  __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);    \
+  __macro(cudnnGetErrorString);                            \
+  __macro(cudnnCreateDropoutDescriptor);                   \
+  __macro(cudnnDropoutGetStatesSize);                      \
+  __macro(cudnnSetDropoutDescriptor);                      \
+  __macro(cudnnRestoreDropoutDescriptor);                  \
+  __macro(cudnnCreateRNNDescriptor);                       \
+  __macro(cudnnDestroyDropoutDescriptor);                  \
+  __macro(cudnnDestroyRNNDescriptor);                      \
+  __macro(cudnnSetTensorNdDescriptorEx);                   \
+  __macro(cudnnAddTensor);                                 \
+  __macro(cudnnConvolutionBackwardData);                   \
+  __macro(cudnnConvolutionBackwardFilter);                 \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);   \
+  __macro(cudnnBatchNormalizationForwardTraining);         \
+  __macro(cudnnBatchNormalizationForwardInference);        \
+  __macro(cudnnBatchNormalizationBackward);                \
+  __macro(cudnnCreateActivationDescriptor);                \
+  __macro(cudnnSetActivationDescriptor);                   \
+  __macro(cudnnGetActivationDescriptor);                   \
+  __macro(cudnnDestroyActivationDescriptor);
+CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#define CUDNN_DNN_ROUTINE_EACH_R7(__macro)                \
+  __macro(cudnnSetConvolutionGroupCount);                 \
+  __macro(cudnnSetConvolutionMathType);                   \
+  __macro(cudnnConvolutionBiasActivationForward);         \
+  __macro(cudnnCreateCTCLossDescriptor);                  \
+  __macro(cudnnDestroyCTCLossDescriptor);                 \
+  __macro(cudnnGetCTCLossDescriptor);                     \
+  __macro(cudnnSetCTCLossDescriptor);                     \
+  __macro(cudnnGetCTCLossWorkspaceSize);                  \
+  __macro(cudnnCTCLoss);                                  \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7);   \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
+  __macro(cudnnGetConvolutionForwardAlgorithm_v7);        \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
+CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
+  __macro(cudnnCreateRNNDataDescriptor);             \
+  __macro(cudnnDestroyRNNDataDescriptor);            \
+  __macro(cudnnSetRNNDataDescriptor);
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro)                     \
+  __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
+  __macro(cudnnBatchNormalizationForwardTrainingEx);                 \
+  __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize);        \
+  __macro(cudnnBatchNormalizationBackwardEx);                        \
+  __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#define CUDNN_DNN_ROUTINE_EACH_R8(__macro)            \
+  __macro(cudnnSetRNNDescriptor_v8);                  \
+  __macro(cudnnCreateFusedOpsPlan);                   \
+  __macro(cudnnCreateFusedOpsConstParamPack);         \
+  __macro(cudnnCreateFusedOpsVariantParamPack);       \
+  __macro(cudnnDestroyFusedOpsPlan);                  \
+  __macro(cudnnDestroyFusedOpsConstParamPack);        \
+  __macro(cudnnDestroyFusedOpsVariantParamPack);      \
+  __macro(cudnnFusedOpsExecute);                      \
+  __macro(cudnnSetFusedOpsConstParamPackAttribute);   \
+  __macro(cudnnSetFusedOpsVariantParamPackAttribute); \
+  __macro(cudnnMakeFusedOpsPlan);
+CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#ifdef PADDLE_WITH_CUDNN_FRONTEND
+#define CUDNN_DNN_ROUTINE_EACH_FRONTEND(__macro) \
+  __macro(cudnnBackendCreateDescriptor);         \
+  __macro(cudnnBackendDestroyDescriptor);        \
+  __macro(cudnnBackendExecute);                  \
+  __macro(cudnnBackendFinalize);                 \
+  __macro(cudnnBackendGetAttribute);             \
+  __macro(cudnnBackendSetAttribute);             \
+  __macro(cudnnGetStream);                       \
+  __macro(cudnnReorderFilterAndBias);
+CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+#if CUDNN_VERSION < 90000
+#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \
+  __macro(cudnnGetRNNParamsSize);                     \
+  __macro(cudnnGetRNNWorkspaceSize);                  \
+  __macro(cudnnGetRNNTrainingReserveSize);            \
+  __macro(cudnnSetRNNDescriptor_v6);                  \
+  __macro(cudnnRNNForwardInference);                  \
+  __macro(cudnnRNNForwardTraining);                   \
+  __macro(cudnnRNNBackwardData);                      \
+  __macro(cudnnRNNBackwardWeights);
+CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9(__macro) \
+  __macro(cudnnSetRNNPaddingMode);                                 \
+  __macro(cudnnRNNForwardInferenceEx);                             \
+  __macro(cudnnRNNForwardTrainingEx);                              \
+  __macro(cudnnRNNBackwardDataEx);                                 \
+  __macro(cudnnRNNBackwardWeightsEx);
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9(
+    DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+#if CUDNN_VERSION >= 90000
+#define CUDNN_DNN_ROUTINE_EACH_R9(__macro) \
+  __macro(cudnnGetLastErrorString);        \
+  __macro(cudnnGetRNNWeightSpaceSize);     \
+  __macro(cudnnGetRNNTempSpaceSizes);      \
+  __macro(cudnnRNNForward);                \
+  __macro(cudnnRNNBackwardData_v8);        \
+  __macro(cudnnRNNBackwardWeights_v8);
+CUDNN_DNN_ROUTINE_EACH_R9(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+}  // namespace dynload
+}  // namespace phi
+
+#endif
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cufft.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cufft.h
new file mode 100644
index 00000000000..996080ae4a4
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cufft.h
@@ -0,0 +1,117 @@
+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights
+// Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#ifdef PADDLE_WITH_CUDA
+#include <cufft.h>
+#include <cufftXt.h>
+#include <glog/logging.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+
+namespace phi {
+namespace dynload {
+
+extern std::once_flag cufft_dso_flag;
+extern void* cufft_dso_handle;
+extern bool HasCUFFT();
+
+extern void EnforceCUFFTLoaded(const char* fn_name);
+#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name)                      \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using cufft_func = decltype(&::__name);                        \
+      std::call_once(cufft_dso_flag, []() {                          \
+        cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle();        \
+      });                                                            \
+      EnforceCUFFTLoaded(#__name);                                   \
+      std::string replaced_name = #__name;                           \
+      replaced_name = replaced_name.replace(0, 2, "mc");             \
+      static void* p_##__name =                                      \
+          dlsym(cufft_dso_handle, replaced_name.c_str());            \
+      return reinterpret_cast<cufft_func>(p_##__name)(args...);      \
+    }                                                                \
+  };                                                                 \
+  extern struct DynLoad__##__name __name
+
+/**
+ * include all needed cufft functions in HPPL
+ * different cufft version has different interfaces
+ **/
+#define CUFFT_FFT_ROUTINE_EACH(__macro)  \
+  __macro(cufftPlan1d);                  \
+  __macro(cufftPlan2d);                  \
+  __macro(cufftPlan3d);                  \
+  __macro(cufftPlanMany);                \
+  __macro(cufftMakePlan1d);              \
+  __macro(cufftMakePlan2d);              \
+  __macro(cufftMakePlan3d);              \
+  __macro(cufftMakePlanMany);            \
+  __macro(cufftMakePlanMany64);          \
+  __macro(cufftGetSizeMany64);           \
+  __macro(cufftEstimate1d);              \
+  __macro(cufftEstimate2d);              \
+  __macro(cufftEstimate3d);              \
+  __macro(cufftEstimateMany);            \
+  __macro(cufftCreate);                  \
+  __macro(cufftGetSize1d);               \
+  __macro(cufftGetSize2d);               \
+  __macro(cufftGetSize3d);               \
+  __macro(cufftGetSizeMany);             \
+  __macro(cufftGetSize);                 \
+  __macro(cufftSetWorkArea);             \
+  __macro(cufftSetAutoAllocation);       \
+  __macro(cufftExecC2C);                 \
+  __macro(cufftExecR2C);                 \
+  __macro(cufftExecC2R);                 \
+  __macro(cufftExecZ2Z);                 \
+  __macro(cufftExecD2Z);                 \
+  __macro(cufftExecZ2D);                 \
+  __macro(cufftSetStream);               \
+  __macro(cufftDestroy);                 \
+  __macro(cufftGetVersion);              \
+  __macro(cufftGetProperty);             \
+  __macro(cufftXtSetGPUs);               \
+  __macro(cufftXtMalloc);                \
+  __macro(cufftXtMemcpy);                \
+  __macro(cufftXtFree);                  \
+  __macro(cufftXtSetWorkArea);           \
+  __macro(cufftXtExecDescriptorC2C);     \
+  __macro(cufftXtExecDescriptorR2C);     \
+  __macro(cufftXtExecDescriptorC2R);     \
+  __macro(cufftXtExecDescriptorZ2Z);     \
+  __macro(cufftXtExecDescriptorD2Z);     \
+  __macro(cufftXtExecDescriptorZ2D);     \
+  __macro(cufftXtQueryPlan);             \
+  __macro(cufftXtSetCallback);           \
+  __macro(cufftXtClearCallback);         \
+  __macro(cufftXtSetCallbackSharedSize); \
+  __macro(cufftXtMakePlanMany);          \
+  __macro(cufftXtGetSizeMany);           \
+  __macro(cufftXtExec);                  \
+  __macro(cufftXtExecDescriptor);        \
+  __macro(cufftXtSetWorkAreaPolicy);
+
+CUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUFFT_WRAP)
+
+}  // namespace dynload
+}  // namespace phi
+
+#endif
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cupti.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cupti.h
new file mode 100644
index 00000000000..754e7b4cb89
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cupti.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_CUPTI
+
+#include <cuda.h>
+#include <cuda_occupancy.h>
+#include <cupti.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+
+// namespace phi {
+// namespace dynload {
+
+extern std::once_flag cupti_dso_flag;
+extern void *cupti_dso_handle;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load cupti routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+#ifdef PADDLE_WITH_XPU
+inline bool IsXPUTracingEnabled() {
+  static bool initialized = false;
+  static bool enabled = false;
+
+  if (!initialized) {
+    initialized = true;
+
+    const char *env = std::getenv("XPU_ENABLE_PROFILER_TRACING");
+    enabled = (env && (std::string(env) == "1"));
+  }
+  return enabled;
+}
+
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                   \
+  struct DynLoad__##__name {                                      \
+    template <typename... Args>                                   \
+    inline CUptiResult CUPTIAPI operator()(Args... args) {        \
+      using cuptiFunc = decltype(&::__name);                      \
+      if (!IsXPUTracingEnabled()) {                               \
+        return CUPTI_SUCCESS;                                     \
+      }                                                           \
+      std::call_once(cupti_dso_flag, []() {                       \
+        cupti_dso_handle = phi::dynload::GetCUPTIDsoHandle();     \
+      });                                                         \
+      static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
+      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);    \
+    }                                                             \
+  };                                                              \
+  extern DynLoad__##__name __name
+
+#else  // !PADDLE_WITH_XPU
+
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                   \
+  struct DynLoad__##__name {                                      \
+    template <typename... Args>                                   \
+    inline CUptiResult CUPTIAPI operator()(Args... args) {        \
+      using cuptiFunc = decltype(&::__name);                      \
+      std::call_once(cupti_dso_flag, []() {                       \
+        cupti_dso_handle = phi::dynload::GetCUPTIDsoHandle();     \
+      });                                                         \
+      static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
+      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);    \
+    }                                                             \
+  };                                                              \
+  extern DynLoad__##__name __name
+#endif  // PADDLE_WITH_XPU
+
+#define CUPTI_ROUTINE_EACH(__macro)           \
+  __macro(cuptiActivityEnable);               \
+  __macro(cuptiActivityDisable);              \
+  __macro(cuptiActivityRegisterCallbacks);    \
+  __macro(cuptiActivityGetAttribute);         \
+  __macro(cuptiActivitySetAttribute);         \
+  __macro(cuptiGetTimestamp);                 \
+  __macro(cuptiActivityGetNextRecord);        \
+  __macro(cuptiGetResultString);              \
+  __macro(cuptiActivityGetNumDroppedRecords); \
+  __macro(cuptiActivityFlushAll);             \
+  __macro(cuptiSubscribe);                    \
+  __macro(cuptiUnsubscribe);                  \
+  __macro(cuptiEnableCallback);               \
+  __macro(cuptiEnableDomain);                 \
+  __macro(cudaOccMaxActiveBlocksPerMultiprocessor);
+
+CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
+// }  // namespace dynload
+// }  // namespace phi
+
+#endif  // PADDLE_WITH_CUPTI
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusolver.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusolver.h
new file mode 100644
index 00000000000..c52af44a6ff
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusolver.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <cuda.h>
+#include <cusolverDn.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+
+namespace phi {
+namespace dynload {
+extern std::once_flag cusolver_dso_flag;
+extern void* cusolver_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name)                  \
+  struct DynLoad__##__name {                                        \
+    template <typename... Args>                                     \
+    cusolverStatus_t operator()(Args... args) {                     \
+      using cusolverFunc = decltype(&::__name);                     \
+      std::call_once(cusolver_dso_flag, []() {                      \
+        cusolver_dso_handle = phi::dynload::GetCusolverDsoHandle(); \
+      });                                                           \
+      std::string replaced_name = #__name;                          \
+      replaced_name = replaced_name.replace(0, 2, "mc");            \
+      static void* p_##__name =                                     \
+          dlsym(cusolver_dso_handle, replaced_name.c_str());        \
+      return reinterpret_cast<cusolverFunc>(p_##__name)(args...);   \
+    }                                                               \
+  };                                                                \
+  extern DynLoad__##__name __name
+
+#define CUSOLVER_ROUTINE_EACH(__macro)         \
+  __macro(cusolverDnCreate);                   \
+  __macro(cusolverDnDestroy);                  \
+  __macro(cusolverDnSetStream);                \
+  __macro(cusolverDnSpotrf_bufferSize);        \
+  __macro(cusolverDnDpotrf_bufferSize);        \
+  __macro(cusolverDnXpotrf_bufferSize);        \
+  __macro(cusolverDnSpotrf);                   \
+  __macro(cusolverDnDpotrf);                   \
+  __macro(cusolverDnXpotrf);                   \
+  __macro(cusolverDnSpotrs);                   \
+  __macro(cusolverDnDpotrs);                   \
+  __macro(cusolverDnCpotrs);                   \
+  __macro(cusolverDnZpotrs);                   \
+  __macro(cusolverDnSsyevd_bufferSize);        \
+  __macro(cusolverDnDsyevd_bufferSize);        \
+  __macro(cusolverDnCheevd_bufferSize);        \
+  __macro(cusolverDnZheevd_bufferSize);        \
+  __macro(cusolverDnSsyevd);                   \
+  __macro(cusolverDnDsyevd);                   \
+  __macro(cusolverDnCheevd);                   \
+  __macro(cusolverDnZheevd);                   \
+  __macro(cusolverDnSpotrfBatched);            \
+  __macro(cusolverDnDpotrfBatched);            \
+  __macro(cusolverDnSpotrsBatched);            \
+  __macro(cusolverDnDpotrsBatched);            \
+  __macro(cusolverDnSgetrf_bufferSize);        \
+  __macro(cusolverDnDgetrf_bufferSize);        \
+  __macro(cusolverDnCgetrf_bufferSize);        \
+  __macro(cusolverDnZgetrf_bufferSize);        \
+  __macro(cusolverDnSgeqrf_bufferSize);        \
+  __macro(cusolverDnDgeqrf_bufferSize);        \
+  __macro(cusolverDnCgeqrf_bufferSize);        \
+  __macro(cusolverDnZgeqrf_bufferSize);        \
+  __macro(cusolverDnXgeqrf_bufferSize);        \
+  __macro(cusolverDnSorgqr_bufferSize);        \
+  __macro(cusolverDnDorgqr_bufferSize);        \
+  __macro(cusolverDnSormqr_bufferSize);        \
+  __macro(cusolverDnDormqr_bufferSize);        \
+  __macro(cusolverDnCungqr_bufferSize);        \
+  __macro(cusolverDnZungqr_bufferSize);        \
+  __macro(cusolverDnDestroyGesvdjInfo);        \
+  __macro(cusolverDnCreateGesvdjInfo);         \
+  __macro(cusolverDnSgesvdj_bufferSize);       \
+  __macro(cusolverDnDgesvdj_bufferSize);       \
+  __macro(cusolverDnCgesvdj_bufferSize);       \
+  __macro(cusolverDnZgesvdj_bufferSize);       \
+  __macro(cusolverDnSgesvdj);                  \
+  __macro(cusolverDnDgesvdj);                  \
+  __macro(cusolverDnCgesvdj);                  \
+  __macro(cusolverDnZgesvdj);                  \
+  __macro(cusolverDnSgetrf);                   \
+  __macro(cusolverDnSgetrs);                   \
+  __macro(cusolverDnDgetrs);                   \
+  __macro(cusolverDnCgetrs);                   \
+  __macro(cusolverDnZgetrs);                   \
+  __macro(cusolverDnDgetrf);                   \
+  __macro(cusolverDnCgetrf);                   \
+  __macro(cusolverDnZgetrf);                   \
+  __macro(cusolverDnSgeqrf);                   \
+  __macro(cusolverDnDgeqrf);                   \
+  __macro(cusolverDnCgeqrf);                   \
+  __macro(cusolverDnZgeqrf);                   \
+  __macro(cusolverDnXgeqrf);                   \
+  __macro(cusolverDnSorgqr);                   \
+  __macro(cusolverDnDorgqr);                   \
+  __macro(cusolverDnSormqr);                   \
+  __macro(cusolverDnDormqr);                   \
+  __macro(cusolverDnCungqr);                   \
+  __macro(cusolverDnZungqr);                   \
+  __macro(cusolverDnCreateSyevjInfo);          \
+  __macro(cusolverDnCreateParams);             \
+  __macro(cusolverDnDestroyParams);            \
+  __macro(cusolverDnSsyevj_bufferSize);        \
+  __macro(cusolverDnDsyevj_bufferSize);        \
+  __macro(cusolverDnCheevj_bufferSize);        \
+  __macro(cusolverDnZheevj_bufferSize);        \
+  __macro(cusolverDnSsyevj);                   \
+  __macro(cusolverDnDsyevj);                   \
+  __macro(cusolverDnCheevj);                   \
+  __macro(cusolverDnZheevj);                   \
+  __macro(cusolverDnDestroySyevjInfo);         \
+  __macro(cusolverDnXsyevjSetSortEig);         \
+  __macro(cusolverDnSsyevjBatched_bufferSize); \
+  __macro(cusolverDnDsyevjBatched_bufferSize); \
+  __macro(cusolverDnCheevjBatched_bufferSize); \
+  __macro(cusolverDnZheevjBatched_bufferSize); \
+  __macro(cusolverDnSsyevjBatched);            \
+  __macro(cusolverDnDsyevjBatched);            \
+  __macro(cusolverDnCheevjBatched);            \
+  __macro(cusolverDnZheevjBatched);
+
+CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP
+}  // namespace dynload
+}  // namespace phi
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusparse.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusparse.h
new file mode 100644
index 00000000000..82c3fc91e79
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/cusparse.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <cuda.h>
+#include <cusparse.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+
+namespace phi {
+namespace dynload {
+extern std::once_flag cusparse_dso_flag;
+extern void* cusparse_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name)                  \
+  struct DynLoad__##__name {                                        \
+    template <typename... Args>                                     \
+    cusparseStatus_t operator()(Args... args) {                     \
+      using Func = decltype(&::__name);                             \
+      std::call_once(cusparse_dso_flag, []() {                      \
+        cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \
+      });                                                           \
+      std::string replaced_name = #__name;                          \
+      replaced_name = replaced_name.replace(0, 2, "mc");            \
+      static void* p_##__name =                                     \
+          dlsym(cusparse_dso_handle, replaced_name.c_str());        \
+      return reinterpret_cast<Func>(p_##__name)(args...);           \
+    }                                                               \
+  };                                                                \
+  extern DynLoad__##__name __name
+
+#if defined(PADDLE_WITH_CUDA)
+#define CUSPARSE_ROUTINE_EACH(__macro)    \
+  __macro(cusparseCreate);                \
+  __macro(cusparseSetStream);             \
+  __macro(cusparseCreateMatDescr);        \
+  __macro(cusparseDestroy);               \
+  __macro(cusparseSnnz);                  \
+  __macro(cusparseDnnz);                  \
+  __macro(cusparseSetMatType);            \
+  __macro(cusparseSetMatIndexBase);       \
+  __macro(cusparseCreateCsr);             \
+  __macro(cusparseCreateCoo);             \
+  __macro(cusparseCreateDnMat);           \
+  __macro(cusparseCreateDnVec);           \
+  __macro(cusparseSpMM_bufferSize);       \
+  __macro(cusparseSpMM);                  \
+  __macro(cusparseDestroySpMat);          \
+  __macro(cusparseDestroyDnMat);          \
+  __macro(cusparseDestroyDnVec);          \
+  __macro(cusparseSpMV_bufferSize);       \
+  __macro(cusparseSpMV);                  \
+  __macro(cusparseSpMatGetSize);          \
+  __macro(cusparseCsrSetPointers);        \
+  __macro(cusparseSpGEMM_createDescr);    \
+  __macro(cusparseSpGEMM_compute);        \
+  __macro(cusparseSpGEMM_workEstimation); \
+  __macro(cusparseSpGEMM_copy);           \
+  __macro(cusparseSpGEMM_destroyDescr);
+
+CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
+
+#if CUDA_VERSION >= 11030
+#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
+  __macro(cusparseSpMM_preprocess);       \
+  __macro(cusparseSDDMM_bufferSize);      \
+  __macro(cusparseSDDMM_preprocess);      \
+  __macro(cusparseSDDMM);
+
+CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
+#endif
+
+#if CUDA_VERSION >= 11080
+#define CUSPARSE_ROUTINE_EACH_R3(__macro) \
+  __macro(cusparseDnMatSetStridedBatch);  \
+  __macro(cusparseCooSetStridedBatch);    \
+  __macro(cusparseCsrSetStridedBatch);
+
+CUSPARSE_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
+#endif
+
+#endif  // PADDLE_WITH_CUDA
+
+#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
+}  // namespace dynload
+}  // namespace phi
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.cc b/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.cc
new file mode 100644
index 00000000000..9f2ac0890f6
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -0,0 +1,1131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+
+#include <dirent.h>
+
+#include <codecvt>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+#include "paddle/phi/common/port.h"
+#include "paddle/phi/core/enforce.h"
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+// TODO(wilber): The phi computing library requires a component to manage flags
+// (maybe not use gflags).
+#include "glog/logging.h"
+#include "paddle/common/flags.h"
+
+COMMON_DECLARE_string(cudnn_dir);
+COMMON_DECLARE_string(cuda_dir);
+COMMON_DECLARE_string(cublas_dir);
+COMMON_DECLARE_string(nccl_dir);
+COMMON_DECLARE_string(cupti_dir);
+COMMON_DECLARE_string(tensorrt_dir);
+COMMON_DECLARE_string(mklml_dir);
+COMMON_DECLARE_string(hml_dir);
+COMMON_DECLARE_string(lapack_dir);
+COMMON_DECLARE_string(mkl_dir);
+COMMON_DECLARE_string(op_dir);
+COMMON_DECLARE_string(cusparselt_dir);
+COMMON_DECLARE_string(curand_dir);
+COMMON_DECLARE_string(cusolver_dir);
+COMMON_DECLARE_string(cusparse_dir);
+COMMON_DECLARE_string(win_cuda_bin_dir);
+#ifdef PADDLE_WITH_MAGMA
+COMMON_DECLARE_string(magma_dir);
+#endif
+
+#ifndef CUDA_LIB_NAME
+#define CUDA_LIB_NAME "libcuda.so"
+#endif
+
+#ifndef BLAS_LIB_NAME
+#define BLAS_LIB_NAME "libcublas.so"
+#endif
+
+#ifndef BLASLT_LIB_NAME
+#define BLASLT_LIB_NAME "libcublasLt.so"
+#endif
+
+#ifndef DNN_LIB_NAME
+#define DNN_LIB_NAME "libcudnn.so"
+#endif
+
+#ifndef PTI_LIB_NAME
+#define PTI_LIB_NAME "libcupti.so"
+#endif
+
+#ifndef RAND_LIB_NAME
+#define RAND_LIB_NAME "libcurand.so"
+#endif
+
+#ifndef JPEG_LIB_NAME
+#define JPEG_LIB_NAME "libnvjpeg.so"
+#endif
+
+#ifndef SOLVER_LIB_NAME
+#define SOLVER_LIB_NAME "libcusolver.so"
+#endif
+
+#ifndef SPARSE_LIB_NAME
+#define SPARSE_LIB_NAME "libcusparse.so"
+#endif
+
+#ifndef RTC_LIB_NAME
+#define RTC_LIB_NAME "libnvrtc.so"
+#endif
+
+#ifndef FLASHATTN_LIB_NAME
+#define FLASHATTN_LIB_NAME "libflashattn.so"
+#endif
+
+#ifndef FLASHATTNV3_LIB_NAME
+#define FLASHATTNV3_LIB_NAME "libflashattnv3.so"
+#endif
+
+#ifndef CCL_LIB_NAME
+#define CCL_LIB_NAME "libnccl.so"
+#endif
+
+#ifndef FFT_LIB_NAME
+#define FFT_LIB_NAME "libcufft.so"
+#endif
+
+#ifndef SPARSELT_LIB_NAME
+#define SPARSELT_LIB_NAME "libcusparseLt.so"
+#endif
+
+#ifndef CUPTI_LIB_PATH
+#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
+#endif
+
+#ifdef PADDLE_WITH_HIP
+
+PHI_DEFINE_string(miopen_dir,
+                  "",
+                  "Specify path for loading libMIOpen.so. For instance, "
+                  "/opt/rocm/miopen/lib. If empty [default], dlopen "
+                  "will search miopen from LD_LIBRARY_PATH");
+
+PHI_DEFINE_string(rocm_dir,
+                  "",
+                  "Specify path for loading rocm library, such as librocblas, "
+                  "libmiopen, libhipsparse. For instance, /opt/rocm/lib. "
+                  "If default, dlopen will search rocm from LD_LIBRARY_PATH");
+
+PHI_DEFINE_string(rccl_dir,
+                  "",
+                  "Specify path for loading rccl library, such as librccl.so. "
+                  "For instance, /opt/rocm/rccl/lib. If default, "
+                  "dlopen will search rccl from LD_LIBRARY_PATH");
+#endif
+
+#ifdef PADDLE_WITH_FLAGCX
+COMMON_DECLARE_string(flagcx_dir);
+
+PHI_DEFINE_EXPORTED_string(
+    flagcx_dir,  // NOLINT
+    "",
+    "Specify path for loading libflagcx.so. For instance, "
+    "For instance, /usr/local/flagcx/lib. If default, "
+    "dlopen will search flagcx from LD_LIBRARY_PATH");
+#endif
+
+#ifdef PADDLE_WITH_XPU
+PD_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so.");
+PD_DEFINE_string(xputx_dir, "", "Specify path for loading llibxpuToolsExt.so.");
+#endif
+
+namespace phi::dynload {
+
+struct PathNode {
+  PathNode() = default;
+  std::string path = "";
+};
+
+static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;  // NOLINT
+
+// NOTE: In order to adapt to the default installation path of cuda
+#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin";
+#else
+static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64";  // NOLINT
+#endif
+
+static PathNode s_py_site_pkg_path;
+
+#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll";
+static constexpr char* win_cublas_lib =
+    "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll";
+static constexpr char* win_curand_lib =
+    "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll";
+static constexpr char* win_nvjpeg_lib =
+    "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll";
+static constexpr char* win_cusolver_lib =
+    "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cusolver64_" CUDA_VERSION_MAJOR
+    ".dll;cusolver64_11.dll;cusolver64_10.dll";
+static constexpr char* win_cusparse_lib =
+    "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll";
+static constexpr char* win_cufft_lib =
+    "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_11.dll;cufft64_10.dll";
+#endif
+
+static inline std::string join(const std::string& part1,
+                               const std::string& part2) {
+// directory separator
+#if defined(_WIN32)
+  const char sep = '\\';
+#else
+  const char sep = '/';
+#endif
+  if (!part2.empty() && part2.front() == sep) {
+    return part2;
+  }
+  std::string ret;
+  ret.reserve(part1.size() + part2.size() + 1);
+  ret = part1;
+  if (!ret.empty() && ret.back() != sep) {
+    ret += sep;
+  }
+  ret += part2;
+  return ret;
+}
+
+static inline std::vector<std::string> split(
+    const std::string& str, const std::string separator = " ") {
+  std::vector<std::string> str_list;
+  std::string::size_type firstPos = 0;
+  firstPos = str.find_first_not_of(separator, 0);
+  std::string::size_type lastPos = 0;
+  lastPos = str.find_first_of(separator, firstPos);
+  while (std::string::npos != firstPos && std::string::npos != lastPos) {
+    str_list.push_back(str.substr(firstPos, lastPos - firstPos));
+    firstPos = str.find_first_not_of(separator, lastPos);
+    lastPos = str.find_first_of(separator, firstPos);
+  }
+  if (std::string::npos == lastPos) {
+    str_list.push_back(str.substr(firstPos, lastPos - firstPos));
+  }
+  return str_list;
+}
+
+void SetPaddleLibPath(const std::string& py_site_pkg_path) {
+  s_py_site_pkg_path.path = py_site_pkg_path;
+  VLOG(6) << "Set paddle lib path : " << py_site_pkg_path;
+}
+
+static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
+                                                 const std::string& dso_name,
+                                                 int dynload_flags) {
+  void* dso_handle = nullptr;
+  if (!spec_path.empty() || !dso_name.empty()) {
+    // search xxx.so from custom path
+    VLOG(6) << "Try to find library: " << dso_name
+            << " from specific path: " << spec_path;
+    std::string dso_path = join(spec_path, dso_name);
+#if defined(_WIN32) || defined(_WIN64)
+    HMODULE handle = LoadLibraryA(dso_path.c_str());
+    dso_handle = reinterpret_cast<void*>(handle);
+#else
+    dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+#endif
+  }
+  return dso_handle;
+}
+
+static inline std::string FindLibAbsolutePath(const std::string& directory,
+                                              const std::string& filename) {
+  DIR* dir = opendir(directory.c_str());
+  struct dirent* ent;
+
+  if (dir != nullptr) {
+    while ((ent = readdir(dir)) != nullptr) {
+      if (ent->d_type == DT_REG || ent->d_type == DT_LNK) {
+        if (filename == std::string(ent->d_name)) {
+          closedir(dir);
+          return join(directory, ent->d_name);
+        }
+      } else if (ent->d_type == DT_DIR) {
+        if (strcmp(ent->d_name, ".") != 0 && strcmp(ent->d_name, "..") != 0) {
+          std::string res =
+              FindLibAbsolutePath(join(directory, ent->d_name) + "/", filename);
+          if (!res.empty()) {
+            closedir(dir);
+            return res;
+          }
+        }
+      }
+    }
+    closedir(dir);
+  }
+  return "";
+}
+
+static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
+                                                int dynload_flags) {
+#if defined(_WIN32) || defined(_WIN64)
+  HMODULE hModule = LoadLibraryA(dso_path.c_str());
+  return reinterpret_cast<void*>(hModule);
+#else
+  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+  // and /usr/local/lib path
+  void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+  VLOG(3) << "Try to find library: " << dso_path
+          << " from default system path.";
+
+// TODO(chenweihang): This path is used to search which libs?
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
+// bring System Integrity Projection (SIP), if dso_handle
+// is null, search from default package path in Mac OS.
+#if defined(__APPLE__) || defined(__OSX__)
+#if defined(__arm__) || defined(__aarch64__)
+  if (nullptr == dso_handle) {
+    dso_handle =
+        dlopen(FindLibAbsolutePath("/opt/homebrew/Cellar/", dso_path).c_str(),
+               dynload_flags);
+  }
+#else
+  if (nullptr == dso_handle) {
+    dso_handle =
+        dlopen(FindLibAbsolutePath("/usr/local/cuda/lib/", dso_path).c_str(),
+               dynload_flags);
+  }
+#endif
+#endif
+
+  return dso_handle;
+#endif
+}
+
+/*
+ * We define three priorities for dynamic library search:
+ *
+ * First: Search for  path specified by the user
+ * Second: Search the stheystem default path
+ * Third: Search for a special path corresponding to
+ *        a specific library to adapt to changes and easy to expand.
+ */
+
+static inline void* GetDsoHandleFromSearchPath(
+    const std::string& config_path,
+    const std::string& dso_name,
+    bool throw_on_error = true,
+    const std::vector<std::string>& extra_paths = std::vector<std::string>(),
+    const std::string& warning_msg = std::string()) {
+#if !defined(_WIN32)
+  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+#else
+  int dynload_flags = 0;
+#endif  // !_WIN32
+#if defined(_WIN32)
+  std::vector<std::wstring> cuda_bin_search_path = {
+      L"cublas",
+      L"cuda_nvrtc",
+      L"cuda_runtime",
+      L"cudnn",
+      L"cufft",
+      L"curand",
+      L"cusolver",
+      L"cusparse",
+      L"nvjitlink",
+  };
+  for (auto search_path : cuda_bin_search_path) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    std::wstring win_path_wstring =
+        converter.from_bytes(FLAGS_win_cuda_bin_dir);
+    search_path = win_path_wstring + L"\\" + search_path + L"\\bin";
+#ifdef PADDLE_WITH_CUDA
+    AddDllDirectory(search_path.c_str());
+#endif
+  }
+#endif
+  std::vector<std::string> dso_names = split(dso_name, ";");
+  void* dso_handle = nullptr;
+  for (auto const& dso : dso_names) {
+    // 1. search in user config path by FLAGS
+    dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags);
+    // 2. search in system default path
+    if (nullptr == dso_handle) {
+      dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags);
+    }
+    // 3. search in extra paths
+    if (nullptr == dso_handle) {
+      for (auto const& path : extra_paths) {
+        VLOG(3) << "extra_paths: " << path;
+        dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags);
+      }
+    }
+    if (nullptr != dso_handle) break;
+  }
+
+  // 4. [If Failed for All dso_names] logging warning if exists
+  if (nullptr == dso_handle && !warning_msg.empty()) {
+    LOG(WARNING) << warning_msg;
+  }
+
+  // 5. [If Failed for All dso_names] logging or throw error info
+  if (nullptr == dso_handle) {
+    auto error_msg =
+        "The third-party dynamic library (%s) that Paddle depends on is not "
+        "configured correctly. (error code is %s)\n"
+        "  Suggestions:\n"
+        "  1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) "
+        "is installed correctly and its version is matched with paddlepaddle "
+        "you installed.\n"
+        "  2. Configure third-party dynamic library environment variables as "
+        "follows:\n"
+        "  - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
+        "  - Windows: set PATH by `set PATH=XXX;%%PATH%%`\n"
+        "  - Mac: set  DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
+        "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
+        "impossible unless System Integrity Protection (SIP) is disabled.]";
+#if !defined(_WIN32)
+    auto errorno = dlerror();
+#else
+    auto errorno = GetLastError();
+#endif  // !_WIN32
+    if (throw_on_error) {
+      // NOTE: Special error report case, no need to change its format
+      PADDLE_THROW(
+          common::errors::PreconditionNotMet(error_msg, dso_name, errorno));
+    } else {
+      LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno);
+    }
+  }
+
+  return dso_handle;
+}
+
+void* GetCublasDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, BLAS_LIB_NAME);
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_11.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_13.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
+#elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.11");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so");
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so");
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.13");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so");
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
+#endif
+}
+
+void* GetCublasLtDsoHandle() {
+// APIs available after CUDA 10.1
+#if defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, BLASLT_LIB_NAME);
+#elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.11");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so");
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so");
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.13");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so");
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_11.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_12.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_13.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
+#elif !defined(__linux__) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhipblaslt.so");
+#else
+  std::string warning_msg(
+      "Your CUDA_VERSION less 11, not support CublasLt. "
+      "If you want to use CublasLt, please upgrade CUDA and rebuild "
+      "PaddlePaddle.");
+  return nullptr;
+#endif
+}
+
+void* GetCUDNNDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  std::string mac_warn_meg(
+      "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
+      "For instance, sudo tar -xzf "
+      "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
+      "chmod a+r /usr/local/cuda/include/cudnn.h "
+      "/usr/local/cuda/lib/libcudnn*");
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cudnn_dir, "libcudnn.dylib", false, {}, mac_warn_meg);
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cudnn_dir, DNN_LIB_NAME, false, {cuda_lib_path});
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  std::string win_warn_meg(
+      "Note: [Recommend] copy cudnn into CUDA installation directory. \n "
+      "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from "
+      "NVIDIA's official website, \n"
+      "then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing "
+      "Toolkit\\CUDA\\v10.0\n"
+      "You should do this according to your CUDA installation directory and "
+      "CUDNN version.");
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12030) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg);
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
+#endif
+  } else if (CUDA_VERSION >= 12030) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg);
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
+#endif
+  }
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
+#else
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+  if (CUDA_VERSION >= 12030) {
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cudnn_dir, "libcudnn.so.9", false, {cuda_lib_path});
+  } else {
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cudnn_dir, "libcudnn.so.8", false, {cuda_lib_path});
+  }
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path});
+#endif
+#endif
+}
+
+void* GetCUPTIDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path});
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cupti_dir, PTI_LIB_NAME, false, {cupti_lib_path});
+#elif defined(PADDLE_WITH_XPU)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
+#elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so.11.8", false, {cupti_lib_path});
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
+#endif
+
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path});
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so.13", false, {cupti_lib_path});
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
+#endif
+}
+
+void* GetCurandDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, RAND_LIB_NAME);
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, "curand64_10.dll", true, {cuda_lib_path});
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path});
+#endif
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
+#else
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+  return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so.10");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so");
+#endif
+
+#endif
+}
+
+#ifdef PADDLE_WITH_HIP
+void* GetROCFFTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhipfft.so");
+#endif
+}
+#endif
+
+void* GetNvjpegDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, JPEG_LIB_NAME);
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, win_nvjpeg_lib, true, {cuda_lib_path});
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so");
+#endif
+}
+
+void* GetCusolverDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib");
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, SOLVER_LIB_NAME);
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, "cusolver64_11.dll", true, {cuda_lib_path});
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path});
+#endif
+  } else {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, "cusolver64_12.dll", true, {cuda_lib_path});
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path});
+#endif
+  }
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsolver.so");
+#elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
+#endif
+  } else {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.12");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
+#endif
+  }
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
+#endif
+}
+
+void* GetCusparseDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib");
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, SPARSE_LIB_NAME);
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_11.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
+#elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.11");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so");
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so");
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
+        "temporarily no longer.");
+    return nullptr;
+  }
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so");
+#endif
+}
+
+void* GetNVRTCDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, RTC_LIB_NAME);
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
+#endif
+}
+
+void* GetCUDADsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, CUDA_LIB_NAME, false);
+#elif defined(PADDLE_WITH_HIP)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
+#elif defined(_WIN32)
+  char system32_dir[MAX_PATH];
+  GetSystemDirectory(system32_dir, MAX_PATH);
+  return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
+#endif
+}
+
+void* GetWarpCTCDsoHandle() {
+  std::string warpctc_dir = "";
+  if (!s_py_site_pkg_path.path.empty()) {
+    warpctc_dir = s_py_site_pkg_path.path;
+  }
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll");
+#else
+  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so");
+#endif
+}
+
+void* GetWarpRNNTDsoHandle() {
+  std::string warprnnt_dir = "";
+  if (!s_py_site_pkg_path.path.empty()) {
+    warprnnt_dir = s_py_site_pkg_path.path;
+  }
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(warprnnt_dir, "libwarprnnt.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(warprnnt_dir, "warprnnt.dll");
+#else
+  return GetDsoHandleFromSearchPath(warprnnt_dir, "libwarprnnt.so");
+#endif
+}
+
+void* GetFlashAttnDsoHandle() {
+  std::string flashattn_dir = "";
+  if (!s_py_site_pkg_path.path.empty()) {
+    flashattn_dir = s_py_site_pkg_path.path;
+  }
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattn.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(flashattn_dir, "flashattn.dll");
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(flashattn_dir, FLASHATTN_LIB_NAME);
+#else
+  return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattn.so");
+#endif
+}
+
+void* GetFlashAttnV3DsoHandle() {
+  std::string flashattn_dir = "";
+  if (!s_py_site_pkg_path.path.empty()) {
+    flashattn_dir = s_py_site_pkg_path.path;
+  }
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattnv3.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(flashattn_dir, "flashattnv3.dll");
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(flashattn_dir, FLASHATTNV3_LIB_NAME);
+#else
+  return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattnv3.so");
+#endif
+}
+
+void* GetFlashMaskV2DsoHandle() {
+  std::string flashattn_dir = "";
+  if (!s_py_site_pkg_path.path.empty()) {
+    flashattn_dir = s_py_site_pkg_path.path;
+  }
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(flashattn_dir, "libflashmaskv2.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(flashattn_dir, "flashmaskv2.dll");
+#else
+  return GetDsoHandleFromSearchPath(flashattn_dir, "libflashmaskv2.so");
+#endif
+}
+
+void* GetAfsApiDsoHandle() {
+  std::string afsapi_dir = "";
+  if (!s_py_site_pkg_path.path.empty()) {
+    afsapi_dir = s_py_site_pkg_path.path;
+  }
+#if defined(__APPLE__) || defined(__OSX__) || defined(_WIN32)
+  return NULL;
+#else
+  return GetDsoHandleFromSearchPath(afsapi_dir, "libafs-api-so.so");
+#endif
+}
+
+void* GetNCCLDsoHandle() {
+#ifdef PADDLE_WITH_HIP
+  std::string warning_msg(
+      "You may need to install 'rccl' from ROCM official website: "
+      "https://rocmdocs.amd.com/en/latest/Installation_Guide/"
+      "Installation-Guide.html before install PaddlePaddle.");
+#else
+  std::string warning_msg(
+      "You may need to install 'nccl2' from NVIDIA official website: "
+      "https://developer.nvidia.com/nccl/nccl-download "
+      "before install PaddlePaddle.");
+#endif
+
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg);
+#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg);
+#else
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.so;libnccl.so.2", true, {}, warning_msg);
+#else
+#if defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, CCL_LIB_NAME, true, {}, warning_msg);
+#else
+  return GetDsoHandleFromSearchPath(
+      FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg);
+#endif
+#endif
+
+#endif
+}
+
+void* GetFLAGCXDsoHandle() {
+#ifdef PADDLE_WITH_FLAGCX
+  return GetDsoHandleFromSearchPath(FLAGS_flagcx_dir, "libflagcx.so");
+#else
+  return nullptr;
+#endif
+}
+
+void* GetTensorRtDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
+#endif
+}
+
+void* GetMKLMLDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
+#endif
+}
+
+void* GetHMLDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return nullptr;
+#elif defined(_WIN32)
+  return nullptr;
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_hml_dir, "libhml_rt.so");
+#endif
+}
+
+void* GetLAPACKDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+#if defined(__arm__) || defined(__aarch64__)
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dylib");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib");
+#endif
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3");
+#endif
+}
+
+void* GetMAGMADsoHandle() {
+#if defined(PADDLE_WITH_MAGMA)
+  return GetDsoHandleFromSearchPath(FLAGS_magma_dir, "libmagma.so");
+#endif
+  return nullptr;
+}
+
+void* GetOpDsoHandle(const std::string& dso_name) {
+  return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
+}
+
+void* GetNvtxDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  PADDLE_THROW(common::errors::Unimplemented("Nvtx do not support Apple."));
+#elif defined(_WIN32)
+  PADDLE_THROW(common::errors::Unimplemented("Nvtx do not support Windows."));
+#elif !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU)
+  PADDLE_THROW(common::errors::Unimplemented(
+      "Nvtx do not support without CUDA or XPU."));
+#elif defined(PADDLE_WITH_XPU)
+  return GetDsoHandleFromSearchPath("FLAGS_xputx_dir", "libxpuToolsExt.so");
+#else
+  if (CUDA_VERSION >= 12090) {
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvtx3interop.so.1");
+  }
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
+#endif
+}
+
+void* GetCUFFTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib");
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, FFT_LIB_NAME);
+#elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.10");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.11");
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.12");
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
+#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_10.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_12.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
+#endif
+}
+
+void* GetMKLRTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so");
+#endif
+}
+
+void* GetCusparseLtDsoHandle() {
+#if defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return GetDsoHandleFromSearchPath(FLAGS_cusparselt_dir, SPARSELT_LIB_NAME);
+#elif defined(PADDLE_WITH_CUDA)
+  return GetDsoHandleFromSearchPath(FLAGS_cusparselt_dir, "libcusparseLt.so");
+#else
+  std::string warning_msg(
+      "Your CUDA_VERSION less 11.2, not support cusparseLt. "
+      "If you want to use cusparseLt, please upgrade CUDA and rebuild "
+      "PaddlePaddle.");
+  return nullptr;
+#endif
+}
+
+void* GetXPTIDsoHandle() {
+#ifdef PADDLE_WITH_XPTI
+  return GetDsoHandleFromSearchPath(FLAGS_xpti_dir, "libxpti.so");
+#else
+  return nullptr;
+#endif
+}
+}  // namespace phi::dynload
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.h
new file mode 100644
index 00000000000..208eaca1137
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/dynamic_loader.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+
+#include "paddle/utils/test_macros.h"
+namespace phi {
+namespace dynload {
+
+#ifndef _WIN32
+#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
+#else
+#define DECLARE_TYPE(__name, ...) decltype(auto)
+#endif
+
+void* GetCublasDsoHandle();
+void* GetCublasLtDsoHandle();
+void* GetCUDNNDsoHandle();
+void* GetCUPTIDsoHandle();
+void* GetCurandDsoHandle();
+void* GetNvjpegDsoHandle();
+void* GetCusolverDsoHandle();
+void* GetCusparseDsoHandle();
+void* GetNVRTCDsoHandle();
+void* GetCUDADsoHandle();
+void* GetWarpCTCDsoHandle();
+void* GetWarpRNNTDsoHandle();
+void* GetFlashAttnDsoHandle();
+void* GetFlashAttnV3DsoHandle();
+void* GetFlashMaskV2DsoHandle();
+void* GetNCCLDsoHandle();
+void* GetFLAGCXDsoHandle();
+void* GetTensorRtDsoHandle();
+void* GetMKLMLDsoHandle();
+void* GetLAPACKDsoHandle();
+void* GetMAGMADsoHandle();
+void* GetOpDsoHandle(const std::string& dso_name);
+void* GetNvtxDsoHandle();
+void* GetCUFFTDsoHandle();
+void* GetMKLRTDsoHandle();
+void* GetHMLDsoHandle();
+void* GetROCFFTDsoHandle();
+void* GetCusparseLtDsoHandle();
+void* GetXPTIDsoHandle();
+void* GetAfsApiDsoHandle();
+
+void SetPaddleLibPath(const std::string&);
+
+}  // namespace dynload
+}  // namespace phi
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/nvjpeg.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/nvjpeg.h
new file mode 100644
index 00000000000..bb2dc37f895
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/nvjpeg.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_CUDA
+#include <nvjpeg.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+
+namespace phi {
+namespace dynload {
+extern std::once_flag nvjpeg_dso_flag;
+extern void* nvjpeg_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name)                \
+  struct DynLoad__##__name {                                    \
+    template <typename... Args>                                 \
+    nvjpegStatus_t operator()(Args... args) {                   \
+      using nvjpegFunc = decltype(&::__name);                   \
+      std::call_once(nvjpeg_dso_flag, []() {                    \
+        nvjpeg_dso_handle = phi::dynload::GetNvjpegDsoHandle(); \
+      });                                                       \
+      std::string replaced_name = #__name;                      \
+      replaced_name = replaced_name.replace(0, 2, "mc");        \
+      static void* p_##__name =                                 \
+          dlsym(nvjpeg_dso_handle, replaced_name.c_str());      \
+      return reinterpret_cast<nvjpegFunc>(p_##__name)(args...); \
+    }                                                           \
+  };                                                            \
+  extern DynLoad__##__name __name
+
+#define NVJPEG_RAND_ROUTINE_EACH(__macro) \
+  __macro(nvjpegCreateSimple);            \
+  __macro(nvjpegJpegStateCreate);         \
+  __macro(nvjpegGetImageInfo);            \
+  __macro(nvjpegJpegStateDestroy);        \
+  __macro(nvjpegDecode);
+
+NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP);
+
+}  // namespace dynload
+}  // namespace phi
+
+#endif
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/warpctc.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/warpctc.h
new file mode 100644
index 00000000000..bea933a7e3b
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/warpctc.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mutex>  // NOLINT
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+#include "warpctc/include/ctc.h"
+
+namespace phi {
+namespace dynload {
+
+extern std::once_flag warpctc_dso_flag;
+extern void* warpctc_dso_handle;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load warpctc routine
+ * via operator overloading.
+ */
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                            \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using warpctcFunc = decltype(&::__name);                       \
+      std::call_once(warpctc_dso_flag, []() {                        \
+        warpctc_dso_handle = phi::dynload::GetWarpCTCDsoHandle();    \
+      });                                                            \
+      static void* p_##__name = dlsym(warpctc_dso_handle, #__name);  \
+      return reinterpret_cast<warpctcFunc>(p_##__name)(args...);     \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
+  DYNAMIC_LOAD_WARPCTC_WRAP(__name)
+
+#define WARPCTC_ROUTINE_EACH(__macro) \
+  __macro(get_warpctc_version);       \
+  __macro(ctcGetStatusString);        \
+  __macro(compute_ctc_loss);          \
+  __macro(compute_ctc_loss_double);   \
+  __macro(get_workspace_size);        \
+  __macro(get_workspace_size_double)
+
+WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
+
+#undef DYNAMIC_LOAD_WARPCTC_WRAP
+
+}  // namespace dynload
+}  // namespace phi
diff --git a/backends/metax_gpu/headers/paddle/phi/backends/dynload/warprnnt.h b/backends/metax_gpu/headers/paddle/phi/backends/dynload/warprnnt.h
new file mode 100644
index 00000000000..5a84efc491e
--- /dev/null
+++ b/backends/metax_gpu/headers/paddle/phi/backends/dynload/warprnnt.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mutex>  // NOLINT
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/common/port.h"
+#include "warprnnt/include/rnnt.h"
+
+namespace phi {
+namespace dynload {
+
+extern std::once_flag warprnnt_dso_flag;
+extern void* warprnnt_dso_handle;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load warprnnt routine
+ * via operator overloading.
+ */
+#define DYNAMIC_LOAD_WARPRNNT_WRAP(__name)                           \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using warprnntFunc = decltype(&::__name);                      \
+      std::call_once(warprnnt_dso_flag, []() {                       \
+        warprnnt_dso_handle = phi::dynload::GetWarpRNNTDsoHandle();  \
+      });                                                            \
+      static void* p_##__name = dlsym(warprnnt_dso_handle, #__name); \
+      return reinterpret_cast<warprnntFunc>(p_##__name)(args...);    \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+#define DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \
+  DYNAMIC_LOAD_WARPRNNT_WRAP(__name)
+
+#define WARPRNNT_ROUTINE_EACH(__macro) \
+  __macro(get_warprnnt_version);       \
+  __macro(rnntGetStatusString);        \
+  __macro(compute_rnnt_loss);          \
+  __macro(compute_rnnt_loss_fp64);     \
+  __macro(get_rnnt_workspace_size);
+
+WARPRNNT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP);
+
+#undef DYNAMIC_LOAD_WARPRNNT_WRAP
+
+}  // namespace dynload
+}  // namespace phi
diff --git a/backends/metax_gpu/patch/Eigen_3.4.0_paddle.zip b/backends/metax_gpu/patch/Eigen_3.4.0_paddle.zip
new file mode 100644
index 00000000000..c82a9c81f62
Binary files /dev/null and b/backends/metax_gpu/patch/Eigen_3.4.0_paddle.zip differ
diff --git a/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip b/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip
deleted file mode 100644
index 69d962f1132..00000000000
Binary files a/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip and /dev/null differ
diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch
index 71bf0b8777d..ab8d77514ac 100755
--- a/backends/metax_gpu/patch/paddle.patch
+++ b/backends/metax_gpu/patch/paddle.patch
@@ -34,149 +34,6 @@ index 8d445b39ae..504e7b6293 100755
    if(CUDA_VERSION GREATER_EQUAL 11.6)
      op_library(fused_gemm_epilogue_op)
    endif()
-diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h
-index bda9cbe17e..c73eba9c8a 100644
---- a/paddle/phi/backends/dynload/cublas.h
-+++ b/paddle/phi/backends/dynload/cublas.h
-@@ -49,7 +49,12 @@ extern void *cublas_dso_handle;
-       std::call_once(cublas_dso_flag, []() {                                \
-         cublas_dso_handle = phi::dynload::GetCublasDsoHandle();             \
-       });                                                                   \
--      static void *p_##__name = dlsym(cublas_dso_handle, #__name);          \
-+      std::string replaced_name = #__name;                                  \
-+      replaced_name = replaced_name.replace(0, 2, "mc");                    \
-+      int index = replaced_name.find("_", 0);                               \
-+      if (index != -1) replaced_name = replaced_name.substr(0, index);      \
-+      static void* p_##__name =                                             \
-+          dlsym(cublas_dso_handle, replaced_name.c_str());                  \
-       return reinterpret_cast<cublas_func>(p_##__name)(args...);            \
-     }                                                                       \
-   };                                                                        \
-diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h
-index 8b2e08c777..ca926df151 100644
---- a/paddle/phi/backends/dynload/cublasLt.h
-+++ b/paddle/phi/backends/dynload/cublasLt.h
-@@ -46,12 +46,14 @@ extern void *cublasLt_dso_handle;
-       std::call_once(cublasLt_dso_flag, []() {                              \
-         cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle();         \
-       });                                                                   \
--      static void *p_##__name = dlsym(cublasLt_dso_handle, #__name);        \
-+      std::string replaced_name = #__name;                                  \
-+      replaced_name = replaced_name.replace(0, 2, "mc");                    \
-+      static void* p_##__name =                                             \
-+          dlsym(cublasLt_dso_handle, replaced_name.c_str());                \
-       return reinterpret_cast<cublasLt_func>(p_##__name)(args...);          \
-     }                                                                       \
-   };                                                                        \
-   extern DynLoad__##__name __name
--
- // APIs available after CUDA 11.1
- #if CUDA_VERSION >= 11010 || defined(PADDLE_WITH_CUSTOM_DEVICE)
- #define CUBLASLT_BLAS_ROUTINE_EACH(__macro)         \
-@@ -79,8 +81,8 @@ extern void *cublasLt_dso_handle;
-   __macro(cublasLtMatmulAlgoConfigGetAttribute);    \
-   __macro(cublasLtMatmulAlgoGetIds);                \
-   __macro(cublasLtMatmulAlgoCapGetAttribute);       \
--  __macro(cublasLtMatmulAlgoCheck);                 \
--  __macro(cublasLtGetCudartVersion);
-+  __macro(cublasLtMatmulAlgoCheck);
-+  // __macro(cublasLtGetCudartVersion);
- #else
- #define CUBLASLT_BLAS_ROUTINE_EACH(__macro)      \
-   __macro(cublasLtCreate);                       \
-diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h
-index ad2ada9dfa..9e8389e7dc 100644
---- a/paddle/phi/backends/dynload/cudnn.h
-+++ b/paddle/phi/backends/dynload/cudnn.h
-@@ -38,7 +38,10 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
-         cudnn_dso_handle = phi::dynload::GetCUDNNDsoHandle();        \
-       });                                                            \
-       EnforceCUDNNLoaded(#__name);                                   \
--      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);    \
-+      std::string replaced_name = #__name;                                  \
-+      replaced_name = replaced_name.replace(0, 2, "mc");                    \
-+      static void* p_##__name =                                             \
-+          dlsym(cudnn_dso_handle, replaced_name.c_str());                \
-       return reinterpret_cast<cudnn_func>(p_##__name)(args...);      \
-     }                                                                \
-   };                                                                 \
-diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h
-index 1547909d92..ef20838434 100644
---- a/paddle/phi/backends/dynload/cufft.h
-+++ b/paddle/phi/backends/dynload/cufft.h
-@@ -1,3 +1,4 @@
-+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
- /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
- 
- Licensed under the Apache License, Version 2.0 (the "License");
-@@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
-         cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle();        \
-       });                                                            \
-       EnforceCUFFTLoaded(#__name);                                   \
--      static void* p_##__name = dlsym(cufft_dso_handle, #__name);    \
-+      std::string replaced_name = #__name;                                  \
-+      replaced_name =  replaced_name.replace(0,2,"mc");          \
-+      static void* p_##__name = dlsym(cufft_dso_handle, replaced_name.c_str());    \
-       return reinterpret_cast<cufft_func>(p_##__name)(args...);      \
-     }                                                                \
-   };                                                                 \
-diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h
-index 4241a512e8..94e32b743e 100644
---- a/paddle/phi/backends/dynload/cupti.h
-+++ b/paddle/phi/backends/dynload/cupti.h
-@@ -24,8 +24,8 @@ limitations under the License. */
- #include "paddle/phi/backends/dynload/dynamic_loader.h"
- #include "paddle/phi/common/port.h"
- 
--namespace phi {
--namespace dynload {
-+// namespace phi {
-+// namespace dynload {
- 
- extern std::once_flag cupti_dso_flag;
- extern void *cupti_dso_handle;
-@@ -105,7 +105,7 @@ inline bool IsXPUTracingEnabled() {
- CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
- 
- #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
--}  // namespace dynload
--}  // namespace phi
-+// }  // namespace dynload
-+// }  // namespace phi
- 
--#endif  // PADDLE_WITH_CUPTI
-+#endif  // PADDLE_WITH_CUPTI
-\ No newline at end of file
-diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h
-index 57e09bb6e4..87fb5b1797 100644
---- a/paddle/phi/backends/dynload/cusolver.h
-+++ b/paddle/phi/backends/dynload/cusolver.h
-@@ -34,7 +34,9 @@ extern void *cusolver_dso_handle;
-       std::call_once(cusolver_dso_flag, []() {                       \
-         cusolver_dso_handle = phi::dynload::GetCusolverDsoHandle();  \
-       });                                                            \
--      static void *p_##__name = dlsym(cusolver_dso_handle, #__name); \
-+      std::string replaced_name = #__name;                                  \
-+      replaced_name =  replaced_name.replace(0,2,"mc");          \
-+      static void* p_##__name = dlsym(cusolver_dso_handle, replaced_name.c_str());    \
-       return reinterpret_cast<cusolverFunc>(p_##__name)(args...);    \
-     }                                                                \
-   };                                                                 \
-diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h
-index e8cb0ac643..e8e7596d44 100644
---- a/paddle/phi/backends/dynload/cusparse.h
-+++ b/paddle/phi/backends/dynload/cusparse.h
-@@ -34,7 +34,9 @@ extern void *cusparse_dso_handle;
-       std::call_once(cusparse_dso_flag, []() {                       \
-         cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle();  \
-       });                                                            \
--      static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
-+      std::string replaced_name = #__name;                                  \
-+      replaced_name =  replaced_name.replace(0,2,"mc");          \
-+      static void* p_##__name = dlsym(cusparse_dso_handle, replaced_name.c_str());    \
-       return reinterpret_cast<Func>(p_##__name)(args...);            \
-     }                                                                \
-   };                                                                 \
 diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
 index 39f50bd95d..4d627b99b7 100644
 --- a/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -200,21 +57,6 @@ index 39f50bd95d..4d627b99b7 100644
  #ifdef PADDLE_WITH_HIP
  
  PHI_DEFINE_string(miopen_dir,
-diff --git a/paddle/phi/backends/dynload/nvjpeg.h b/paddle/phi/backends/dynload/nvjpeg.h
-index c5309e7e11..3328571380 100644
---- a/paddle/phi/backends/dynload/nvjpeg.h
-+++ b/paddle/phi/backends/dynload/nvjpeg.h
-@@ -31,7 +31,9 @@ extern void *nvjpeg_dso_handle;
-       std::call_once(nvjpeg_dso_flag, []() {                       \
-         nvjpeg_dso_handle = phi::dynload::GetNvjpegDsoHandle();    \
-       });                                                          \
--      static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \
-+      std::string replaced_name = #__name;                                  \
-+      replaced_name =  replaced_name.replace(0,2,"mc");          \
-+      static void* p_##__name = dlsym(nvjpeg_dso_handle, replaced_name.c_str());    \
-       return reinterpret_cast<nvjpegFunc>(p_##__name)(args...);    \
-     }                                                              \
-   };                                                               \
 diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
 index 092365a961..8bd3f9fcea 100644
 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h
diff --git a/backends/metax_gpu/setup.py.in b/backends/metax_gpu/setup.py.in
index b1600e9bb5a..41831e07b15 100644
--- a/backends/metax_gpu/setup.py.in
+++ b/backends/metax_gpu/setup.py.in
@@ -1,6 +1,20 @@
 import os
 from setuptools import setup, Distribution
+from setuptools.command.install import install
 
+import os
+import shutil
+
+class CustomInstallCommand(install):
+    def run(self):
+        install.run(self)
+        install_dir = os.path.join(self.install_lib, 'paddle_custom_device', 'include')
+        source_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'python', 'include')
+
+        if os.path.exists(source_dir):
+            if not os.path.exists(install_dir):
+                os.makedirs(install_dir)
+            shutil.copytree(source_dir, install_dir, dirs_exist_ok=True)
 
 packages = []
 package_data = {}
@@ -108,13 +122,17 @@ def main():
     ],
     include_package_data=True,
     package_data = {
-        '': ['*.so', '*.h', '*.py', '*.hpp'],
+        'paddle_custom_device': ['*.so'],
+        '': ['*.h', '*.hpp'],
     },
     package_dir = {
         '': 'python',
     },
     zip_safe=False,
     distclass=BinaryDistribution,
+    cmdclass={
+    'install': CustomInstallCommand,
+},
     entry_points={
         'console_scripts': [
         ]