
[MLU] Fix test_multinomial_op_mlu in PIR mode #1684

Open · wants to merge 2 commits into base: develop
16 changes: 16 additions & 0 deletions .env
@@ -0,0 +1,16 @@
PYTHONPATH=/work/PaddleX/PaddleCustomDevice/python:/work/PaddleX/PaddleCustomDevice/python/tests
FLAGS_enable_pir_api=1
# FLAGS_print_ir=True
# GLOG_v=6
PADDLE_PDX_DEBUG=True
PADDLE_PDX_DISABLE_DEV_MODEL_WL=True
FLAGS_json_format_model=1
FLAGS_fast_eager_deletion_mode=0
FLAGS_use_stream_safe_cuda_allocator=false

# FLAGS_new_executor_serial_run=1
# FLAGS_call_stack_level=2


# DEBUG_WAIT_AFTER_YIELD=0
# DEBUG_WAIT_BEFORE_YIELD=0
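
Note: these variables mirror what the unit tests expect from the shell. A minimal sketch of the same setup done in-process (an assumption for illustration, not part of this PR; most FLAGS_* values are read once at import, so they must be set before paddle is imported):

import os

# Hypothetical in-process equivalent of the .env file above.
os.environ["FLAGS_enable_pir_api"] = "1"
os.environ["FLAGS_json_format_model"] = "1"
os.environ["FLAGS_fast_eager_deletion_mode"] = "0"
os.environ["FLAGS_use_stream_safe_cuda_allocator"] = "false"

import paddle  # noqa: E402 -- deliberately imported after the env setup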
3 changes: 3 additions & 0 deletions backends/mlu/kernels/funcs/mlu_baseop.cc
@@ -5985,6 +5985,9 @@ NormalizeDesc::~NormalizeDesc() {
workspace_size,
output_desc,
out));
  // Drain the cnnl queue so the asynchronous op above has completed before
  // control returns to the host.
  cnrtQueue_t queue;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQueue(handle, &queue));
  PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(queue));
}
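
The added cnnlGetQueue/cnrtQueueSync pair makes the host block until the asynchronous cnnl launch has drained. A rough Python-side analogue for chasing this kind of race (a sketch, assuming an MLU build of PaddleCustomDevice is installed) is to force a blocking device-to-host fetch:

import paddle

paddle.set_device("mlu:0")  # assumes the custom MLU backend is registered
probs = paddle.to_tensor([0.1, 0.2, 0.3, 0.4])
out = paddle.multinomial(probs, num_samples=100000, replacement=True)
# .numpy() copies to host and blocks until the device queue is drained,
# so anything inspected afterwards sees fully materialized samples.
host_out = out.numpy()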

/* static */ void MLUOP::OpYoloBox(const Context& ctx,
40 changes: 40 additions & 0 deletions backends/mlu/kernels/memcpy_kernel.cc
@@ -12,11 +12,49 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>
#include <thread>
#include <type_traits>
#include <vector>

#include "kernels/funcs/mlu_baseop.h"
#include "kernels/funcs/mlu_funcs.h"

namespace custom_kernel {
using phi::CPUPlace;
using phi::DenseTensor;
const int64_t SAMPLE_MAX = 4;
// Debug helper: prints tensor metadata and, for integer tensors, the
// empirical frequency of each category in [0, SAMPLE_MAX). Assumes the
// data already resides on the host.
template <typename T, typename Context>
void printInfo(const Context& dev_ctx,
               const DenseTensor& x,
               const std::string& name,
               bool frequency = false,
               bool should_sleep = false) {
  std::cout << "========================== START PRINT " << name
            << " ==========================" << std::endl;
  std::cout << "numel: " << x.numel() << std::endl;
  std::cout << "place: " << x.place() << std::endl;

  if constexpr (std::is_same_v<T, int> || std::is_same_v<T, int64_t>) {
    const T* data_p = static_cast<const T*>(x.data());

    if (frequency) {
      // Renamed from `frequency` to avoid shadowing the parameter.
      std::vector<int64_t> counts(SAMPLE_MAX, 0);
      for (int64_t i = 0; i < x.numel(); ++i) {
        if (data_p[i] >= 0 && data_p[i] < SAMPLE_MAX) {
          counts[data_p[i]]++;
        } else {
          std::cout << "FOUND INVALID SAMPLE!" << std::endl;
          return;
        }
      }
      std::cout << "frequency: " << std::endl;
      for (int64_t i = 0; i < SAMPLE_MAX; ++i) {
        std::cout << i << ": " << static_cast<float>(counts[i]) / x.numel()
                  << "\t";
      }
      std::cout << std::endl;
    }
  }

  std::cout << "========================== END PRINT " << name
            << " ==========================" << std::endl
            << std::endl;
  if (should_sleep) {
    std::this_thread::sleep_for(std::chrono::milliseconds(5000));
  }
}
template <typename T, typename Context>
void MemcpyKernel(const Context& dev_ctx,
const phi::DenseTensor& x,
@@ -53,7 +91,9 @@ void MemcpyD2HKernel(const Context& dev_ctx,
const phi::DenseTensor& x,
int dst_place_type,
phi::DenseTensor* out) {
std::cout << "Begin MemcpyD2HKernel" << std::endl;
TensorCopy(dev_ctx, x, false, out, phi::CPUPlace());
// printInfo<int64_t>(dev_ctx, *out, "Memcpy out", true, false);
}

template <typename T, typename Context>
63 changes: 59 additions & 4 deletions backends/mlu/kernels/multinomial_kernel.cc
@@ -12,16 +12,68 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <unistd.h>

#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>
#include <thread>
#include <type_traits>
#include <vector>

#include "kernels/funcs/mlu_funcs.h"
#include "paddle/phi/core/dense_tensor.h"

namespace custom_kernel {
using phi::CPUPlace;
using phi::DenseTensor;
const int64_t SAMPLE_MAX = 4;
// Debug helper: copies the tensor to host, then prints its metadata and,
// for integer tensors, the empirical frequency of each category in
// [0, SAMPLE_MAX).
template <typename T, typename Context>
void printInfo(const Context& dev_ctx,
               const DenseTensor& x,
               const std::string& name,
               bool frequency = false,
               bool should_sleep = false) {
  std::cout << "========================== START PRINT " << name
            << " ==========================" << std::endl;
  std::cout << "numel: " << x.numel() << std::endl;
  std::cout << "place: " << x.place() << std::endl;

  // Blocking copy to host so the buffer can be inspected safely.
  phi::DenseTensor tensor_tmp;
  phi::Copy(dev_ctx, x, CPUPlace(), true, &tensor_tmp);
  if constexpr (std::is_same_v<T, int> || std::is_same_v<T, int64_t>) {
    const T* data_p = static_cast<const T*>(tensor_tmp.data());

    if (frequency) {
      // Renamed from `frequency` to avoid shadowing the parameter.
      std::vector<int64_t> counts(SAMPLE_MAX, 0);
      for (int64_t i = 0; i < x.numel(); ++i) {
        if (data_p[i] >= 0 && data_p[i] < SAMPLE_MAX) {
          counts[data_p[i]]++;
        } else {
          std::cout << "FOUND INVALID SAMPLE!" << std::endl;
          return;
        }
      }
      std::cout << "frequency: " << std::endl;
      for (int64_t i = 0; i < SAMPLE_MAX; ++i) {
        std::cout << i << ": " << static_cast<float>(counts[i]) / x.numel()
                  << "\t";
      }
      std::cout << std::endl;
    }
  }

  std::cout << "========================== END PRINT " << name
            << " ==========================" << std::endl
            << std::endl;
  if (should_sleep) {
    std::this_thread::sleep_for(std::chrono::milliseconds(5000));
  }
}
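
This helper duplicates the one added to memcpy_kernel.cc, with an extra blocking phi::Copy to host before reading the buffer. The same frequency check is easy to reproduce on the Python side once the samples are fetched; a small sketch (hypothetical helper, not part of this PR):

import numpy as np

SAMPLE_MAX = 4  # mirrors the constant above

def sample_frequencies(samples: np.ndarray) -> np.ndarray:
    """Empirical per-category frequency, mirroring printInfo."""
    assert ((samples >= 0) & (samples < SAMPLE_MAX)).all(), "FOUND INVALID SAMPLE!"
    counts = np.bincount(samples.ravel(), minlength=SAMPLE_MAX)
    return counts / samples.size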

template <typename T, typename Context>
-void MultinomialKernel(const Context& dev_ctx,
-                       const phi::DenseTensor& x,
-                       const phi::Scalar& num,
+void MultinomialKernel(const Context &dev_ctx,
+                       const phi::DenseTensor &x,
+                       const phi::Scalar &num,
                        bool replacement,
-                       phi::DenseTensor* out) {
+                       phi::DenseTensor *out) {
// std::this_thread::sleep_for(std::chrono::milliseconds(2000));
dev_ctx.template Alloc<int64_t>(out);
MLUCnnlTensorDesc desc_x(x);
MLUCnnlTensorDesc desc_out(*out);
@@ -39,6 +91,9 @@ void MultinomialKernel(const Context& dev_ctx,
GetBasePtr(&generator_desc->get_state()),
desc_out.get(),
GetBasePtr(out));
std::cout << "End MultinomialKernel" << std::endl;
// printInfo<T, Context>(dev_ctx, x, "x");
// printInfo<int64_t, Context>(dev_ctx, *out, "out", true, false);
}

} // namespace custom_kernel
2 changes: 1 addition & 1 deletion backends/mlu/tests/CMakeLists.txt
@@ -37,7 +37,7 @@ endfunction()
add_test(
NAME test_LeNet_MNIST
COMMAND
-    ${CMAKE_COMMAND} -E env
+    ${CMAKE_COMMAND} -E env FLAGS_use_stream_safe_cuda_allocator=false
CUSTOM_DEVICE_ROOT=${CMAKE_BINARY_DIR}/python/paddle_custom_device/
PYTHONPATH=${PYTHON_SOURCE_DIR}:${PYTHON_SOURCE_DIR}/tests:$ENV{PYTHONPATH}
python test_LeNet_MNIST.py
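
The same flag can be supplied when running a single test outside of CMake; a sketch (paths and env values are assumptions matching the .env file above):

import os
import subprocess

env = dict(os.environ, FLAGS_use_stream_safe_cuda_allocator="false")
subprocess.run(["python", "test_LeNet_MNIST.py"], env=env, check=True)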
48 changes: 27 additions & 21 deletions backends/mlu/tests/unittests/test_multinomial_op_mlu.py
@@ -22,7 +22,7 @@
import numpy as np

paddle.enable_static()

paddle.seed(1000)

def sample_output_one_dimension(out, dim):
    # count numbers of different categories
@@ -164,26 +164,32 @@ def test_dygraph4(self):
paddle.enable_static()

def test_static(self):
paddle.set_device("mlu:0")
startup_program = base.Program()
train_program = base.Program()
with base.program_guard(train_program, startup_program):
x = paddle.static.data("x", shape=[4], dtype="float32")
out = paddle.multinomial(x, num_samples=100000, replacement=True)

place = base.CustomPlace("mlu", 0)
exe = base.Executor(place)

exe.run(startup_program)
x_np = np.random.rand(4).astype("float32")
out = exe.run(train_program, feed={"x": x_np}, fetch_list=[out])

sample_prob = sample_output_one_dimension(out, 4)
prob = x_np / x_np.sum(axis=-1, keepdims=True)
self.assertTrue(
np.allclose(sample_prob, prob, rtol=0, atol=0.01),
"sample_prob: " + str(sample_prob) + "\nprob: " + str(prob),
)
for _ in range(10000):
print(f"start {_}")
paddle.set_device("mlu:0")
startup_program = base.Program()
train_program = base.Program()
with base.program_guard(train_program, startup_program):
x = paddle.static.data("x", shape=[4], dtype="float32")
outs = [
paddle.multinomial(x, num_samples=100000, replacement=True)
for _ in range(10)
]
out = paddle.concat(outs, axis=0)

place = base.CustomPlace("mlu", 0)
exe = base.Executor(place)

exe.run(startup_program)
x_np = np.random.rand(4).astype("float32")
out = exe.run(train_program, feed={"x": x_np}, fetch_list=[out])

sample_prob = sample_output_one_dimension(out, 4)
prob = x_np / x_np.sum(axis=-1, keepdims=True)
self.assertTrue(
np.allclose(sample_prob, prob, rtol=0, atol=0.01),
"sample_prob: " + str(sample_prob) + "\nprob: " + str(prob),
)
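
For reference, sample_output_one_dimension is defined near the top of this file and truncated from the diff; a plausible reconstruction consistent with how test_static uses it (an assumption for illustration, not taken from this PR):

import numpy as np

def sample_output_one_dimension(out, dim):
    # count numbers of different categories
    sample_prob = np.zeros(dim).astype("float32")
    sample_index_prob = np.unique(out, return_counts=True)
    sample_prob[sample_index_prob[0]] = sample_index_prob[1]
    sample_prob /= sample_prob.sum()
    return sample_prob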


class TestMultinomialFP16Op(OpTest):
1 change: 1 addition & 0 deletions (Dockerfile, yum-based image)
@@ -41,6 +41,7 @@ ENV FLAGS_use_stride_kernel=0
ENV FLAGS_allocator_strategy=auto_growth
ENV CNCL_MEM_POOL_MULTI_CLIQUE_ENABLE=1
ENV PADDLE_XCCL_BACKEND=mlu
ENV FLAGS_use_stream_safe_cuda_allocator=false

# yum and pip clean
RUN yum clean all && \
1 change: 1 addition & 0 deletions (Dockerfile, apt-based image)
@@ -83,6 +83,7 @@ ENV FLAGS_use_stride_kernel=0
ENV FLAGS_allocator_strategy=auto_growth
ENV CNCL_MEM_POOL_MULTI_CLIQUE_ENABLE=1
ENV PADDLE_XCCL_BACKEND=mlu
ENV FLAGS_use_stream_safe_cuda_allocator=false

# Clean
RUN apt-get clean -y
2 changes: 1 addition & 1 deletion cmake/paddle.cmake
@@ -70,7 +70,7 @@ endif()

# submodule Paddle first
set(paddle_submodule $ENV{paddle_submodule})
-if(paddle_submodule)
+if(NOT paddle_submodule)
get_filename_component(REPO_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../"
ABSOLUTE)
get_filename_component(PADDLE_SOURCE_DIR "${REPO_SOURCE_DIR}/Paddle" ABSOLUTE)