
[MLU] Fix test_multinomial_op_mlu in PIR mode #1684

Open · wants to merge 2 commits into base: develop
16 changes: 16 additions & 0 deletions .env
@@ -0,0 +1,16 @@
PYTHONPATH=/work/PaddleX/PaddleCustomDevice/python:/work/PaddleX/PaddleCustomDevice/python/tests
FLAGS_enable_pir_api=1
# FLAGS_print_ir=True
# GLOG_v=6
PADDLE_PDX_DEBUG=True
PADDLE_PDX_DISABLE_DEV_MODEL_WL=True
FLAGS_json_format_model=1
FLAGS_fast_eager_deletion_mode=0
FLAGS_use_stream_safe_cuda_allocator=false

# FLAGS_new_executor_serial_run=1
# FLAGS_call_stack_level=2


# DEBUG_WAIT_AFTER_YIELD=0
# DEBUG_WAIT_BEFORE_YIELD=0
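
Note: these variables mirror what the unit tests expect from the shell. A minimal sketch of the same setup done in-process (an assumption for illustration, not part of this PR; most FLAGS_* values are read once at import, so they must be set before paddle is imported):

import os

# Hypothetical in-process equivalent of the .env file above.
os.environ["FLAGS_enable_pir_api"] = "1"
os.environ["FLAGS_json_format_model"] = "1"
os.environ["FLAGS_fast_eager_deletion_mode"] = "0"
os.environ["FLAGS_use_stream_safe_cuda_allocator"] = "false"

import paddle  # noqa: E402 -- deliberately imported after the env setup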
3 changes: 3 additions & 0 deletions backends/mlu/kernels/funcs/mlu_baseop.cc
@@ -5985,6 +5985,9 @@ NormalizeDesc::~NormalizeDesc() {
workspace_size,
output_desc,
out));
  // Drain the cnnl queue so the asynchronous op above has completed before
  // control returns to the host.
  cnrtQueue_t queue;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQueue(handle, &queue));
  PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(queue));
}
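
The added cnnlGetQueue/cnrtQueueSync pair makes the host block until the asynchronous cnnl launch has drained. A rough Python-side analogue for chasing this kind of race (a sketch, assuming an MLU build of PaddleCustomDevice is installed) is to force a blocking device-to-host fetch:

import paddle

paddle.set_device("mlu:0")  # assumes the custom MLU backend is registered
probs = paddle.to_tensor([0.1, 0.2, 0.3, 0.4])
out = paddle.multinomial(probs, num_samples=100000, replacement=True)
# .numpy() copies to host and blocks until the device queue is drained,
# so anything inspected afterwards sees fully materialized samples.
host_out = out.numpy()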

/* static */ void MLUOP::OpYoloBox(const Context& ctx,
40 changes: 40 additions & 0 deletions backends/mlu/kernels/memcpy_kernel.cc
@@ -12,11 +12,49 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>
#include <thread>
#include <type_traits>
#include <vector>

#include "kernels/funcs/mlu_baseop.h"
#include "kernels/funcs/mlu_funcs.h"

namespace custom_kernel {
using phi::CPUPlace;
using phi::DenseTensor;
const int64_t SAMPLE_MAX = 4;
// Debug helper: prints tensor metadata and, for integer tensors, the
// empirical frequency of each category in [0, SAMPLE_MAX). Assumes the
// data already resides on the host.
template <typename T, typename Context>
void printInfo(const Context& dev_ctx,
               const DenseTensor& x,
               const std::string& name,
               bool frequency = false,
               bool should_sleep = false) {
  std::cout << "========================== START PRINT " << name
            << " ==========================" << std::endl;
  std::cout << "numel: " << x.numel() << std::endl;
  std::cout << "place: " << x.place() << std::endl;

  if constexpr (std::is_same_v<T, int> || std::is_same_v<T, int64_t>) {
    const T* data_p = static_cast<const T*>(x.data());

    if (frequency) {
      // Renamed from `frequency` to avoid shadowing the parameter.
      std::vector<int64_t> counts(SAMPLE_MAX, 0);
      for (int64_t i = 0; i < x.numel(); ++i) {
        if (data_p[i] >= 0 && data_p[i] < SAMPLE_MAX) {
          counts[data_p[i]]++;
        } else {
          std::cout << "FOUND INVALID SAMPLE!" << std::endl;
          return;
        }
      }
      std::cout << "frequency: " << std::endl;
      for (int64_t i = 0; i < SAMPLE_MAX; ++i) {
        std::cout << i << ": " << static_cast<float>(counts[i]) / x.numel()
                  << "\t";
      }
      std::cout << std::endl;
    }
  }

  std::cout << "========================== END PRINT " << name
            << " ==========================" << std::endl
            << std::endl;
  if (should_sleep) {
    std::this_thread::sleep_for(std::chrono::milliseconds(5000));
  }
}
template <typename T, typename Context>
void MemcpyKernel(const Context& dev_ctx,
const phi::DenseTensor& x,
@@ -53,7 +91,9 @@ void MemcpyD2HKernel(const Context& dev_ctx,
const phi::DenseTensor& x,
int dst_place_type,
phi::DenseTensor* out) {
std::cout << "Begin MemcpyD2HKernel" << std::endl;
TensorCopy(dev_ctx, x, false, out, phi::CPUPlace());
// printInfo<int64_t>(dev_ctx, *out, "Memcpy out", true, false);
}

template <typename T, typename Context>
63 changes: 59 additions & 4 deletions backends/mlu/kernels/multinomial_kernel.cc
@@ -12,16 +12,68 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <unistd.h>

#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>
#include <thread>
#include <type_traits>
#include <vector>

#include "kernels/funcs/mlu_funcs.h"
#include "paddle/phi/core/dense_tensor.h"

namespace custom_kernel {
using phi::CPUPlace;
using phi::DenseTensor;
const int64_t SAMPLE_MAX = 4;
// Debug helper: copies the tensor to host, then prints its metadata and,
// for integer tensors, the empirical frequency of each category in
// [0, SAMPLE_MAX).
template <typename T, typename Context>
void printInfo(const Context& dev_ctx,
               const DenseTensor& x,
               const std::string& name,
               bool frequency = false,
               bool should_sleep = false) {
  std::cout << "========================== START PRINT " << name
            << " ==========================" << std::endl;
  std::cout << "numel: " << x.numel() << std::endl;
  std::cout << "place: " << x.place() << std::endl;

  // Blocking copy to host so the buffer can be inspected safely.
  phi::DenseTensor tensor_tmp;
  phi::Copy(dev_ctx, x, CPUPlace(), true, &tensor_tmp);
  if constexpr (std::is_same_v<T, int> || std::is_same_v<T, int64_t>) {
    const T* data_p = static_cast<const T*>(tensor_tmp.data());

    if (frequency) {
      // Renamed from `frequency` to avoid shadowing the parameter.
      std::vector<int64_t> counts(SAMPLE_MAX, 0);
      for (int64_t i = 0; i < x.numel(); ++i) {
        if (data_p[i] >= 0 && data_p[i] < SAMPLE_MAX) {
          counts[data_p[i]]++;
        } else {
          std::cout << "FOUND INVALID SAMPLE!" << std::endl;
          return;
        }
      }
      std::cout << "frequency: " << std::endl;
      for (int64_t i = 0; i < SAMPLE_MAX; ++i) {
        std::cout << i << ": " << static_cast<float>(counts[i]) / x.numel()
                  << "\t";
      }
      std::cout << std::endl;
    }
  }

  std::cout << "========================== END PRINT " << name
            << " ==========================" << std::endl
            << std::endl;
  if (should_sleep) {
    std::this_thread::sleep_for(std::chrono::milliseconds(5000));
  }
}
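
This helper duplicates the one added to memcpy_kernel.cc, with an extra blocking phi::Copy to host before reading the buffer. The same frequency check is easy to reproduce on the Python side once the samples are fetched; a small sketch (hypothetical helper, not part of this PR):

import numpy as np

SAMPLE_MAX = 4  # mirrors the constant above

def sample_frequencies(samples: np.ndarray) -> np.ndarray:
    """Empirical per-category frequency, mirroring printInfo."""
    assert ((samples >= 0) & (samples < SAMPLE_MAX)).all(), "FOUND INVALID SAMPLE!"
    counts = np.bincount(samples.ravel(), minlength=SAMPLE_MAX)
    return counts / samples.size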

template <typename T, typename Context>
-void MultinomialKernel(const Context& dev_ctx,
-                       const phi::DenseTensor& x,
-                       const phi::Scalar& num,
+void MultinomialKernel(const Context &dev_ctx,
+                       const phi::DenseTensor &x,
+                       const phi::Scalar &num,
                        bool replacement,
-                       phi::DenseTensor* out) {
+                       phi::DenseTensor *out) {
// std::this_thread::sleep_for(std::chrono::milliseconds(2000));
dev_ctx.template Alloc<int64_t>(out);
MLUCnnlTensorDesc desc_x(x);
MLUCnnlTensorDesc desc_out(*out);
@@ -39,6 +91,9 @@ void MultinomialKernel(const Context& dev_ctx,
GetBasePtr(&generator_desc->get_state()),
desc_out.get(),
GetBasePtr(out));
std::cout << "End MultinomialKernel" << std::endl;
// printInfo<T, Context>(dev_ctx, x, "x");
// printInfo<int64_t, Context>(dev_ctx, *out, "out", true, false);
}

} // namespace custom_kernel
2 changes: 1 addition & 1 deletion backends/mlu/tests/CMakeLists.txt
@@ -37,7 +37,7 @@ endfunction()
add_test(
NAME test_LeNet_MNIST
COMMAND
-    ${CMAKE_COMMAND} -E env
+    ${CMAKE_COMMAND} -E env FLAGS_use_stream_safe_cuda_allocator=false
CUSTOM_DEVICE_ROOT=${CMAKE_BINARY_DIR}/python/paddle_custom_device/
PYTHONPATH=${PYTHON_SOURCE_DIR}:${PYTHON_SOURCE_DIR}/tests:$ENV{PYTHONPATH}
python test_LeNet_MNIST.py
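
The same flag can be supplied when running a single test outside of CMake; a sketch (paths and env values are assumptions matching the .env file above):

import os
import subprocess

env = dict(os.environ, FLAGS_use_stream_safe_cuda_allocator="false")
subprocess.run(["python", "test_LeNet_MNIST.py"], env=env, check=True)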
48 changes: 27 additions & 21 deletions backends/mlu/tests/unittests/test_multinomial_op_mlu.py
@@ -22,7 +22,7 @@
import numpy as np

paddle.enable_static()

paddle.seed(1000)

def sample_output_one_dimension(out, dim):
    # count numbers of different categories
@@ -164,26 +164,32 @@ def test_dygraph4(self):
paddle.enable_static()

def test_static(self):
paddle.set_device("mlu:0")
startup_program = base.Program()
train_program = base.Program()
with base.program_guard(train_program, startup_program):
x = paddle.static.data("x", shape=[4], dtype="float32")
out = paddle.multinomial(x, num_samples=100000, replacement=True)

place = base.CustomPlace("mlu", 0)
exe = base.Executor(place)

exe.run(startup_program)
x_np = np.random.rand(4).astype("float32")
out = exe.run(train_program, feed={"x": x_np}, fetch_list=[out])

sample_prob = sample_output_one_dimension(out, 4)
prob = x_np / x_np.sum(axis=-1, keepdims=True)
self.assertTrue(
np.allclose(sample_prob, prob, rtol=0, atol=0.01),
"sample_prob: " + str(sample_prob) + "\nprob: " + str(prob),
)
for _ in range(10000):
print(f"start {_}")
paddle.set_device("mlu:0")
startup_program = base.Program()
train_program = base.Program()
with base.program_guard(train_program, startup_program):
x = paddle.static.data("x", shape=[4], dtype="float32")
outs = [
paddle.multinomial(x, num_samples=100000, replacement=True)
for _ in range(10)
]
out = paddle.concat(outs, axis=0)

place = base.CustomPlace("mlu", 0)
exe = base.Executor(place)

exe.run(startup_program)
x_np = np.random.rand(4).astype("float32")
out = exe.run(train_program, feed={"x": x_np}, fetch_list=[out])

sample_prob = sample_output_one_dimension(out, 4)
prob = x_np / x_np.sum(axis=-1, keepdims=True)
self.assertTrue(
np.allclose(sample_prob, prob, rtol=0, atol=0.01),
"sample_prob: " + str(sample_prob) + "\nprob: " + str(prob),
)
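
For reference, sample_output_one_dimension is defined near the top of this file and truncated from the diff; a plausible reconstruction consistent with how test_static uses it (an assumption for illustration, not taken from this PR):

import numpy as np

def sample_output_one_dimension(out, dim):
    # count numbers of different categories
    sample_prob = np.zeros(dim).astype("float32")
    sample_index_prob = np.unique(out, return_counts=True)
    sample_prob[sample_index_prob[0]] = sample_index_prob[1]
    sample_prob /= sample_prob.sum()
    return sample_prob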


class TestMultinomialFP16Op(OpTest):
1 change: 1 addition & 0 deletions (Dockerfile, yum-based image)
@@ -41,6 +41,7 @@ ENV FLAGS_use_stride_kernel=0
ENV FLAGS_allocator_strategy=auto_growth
ENV CNCL_MEM_POOL_MULTI_CLIQUE_ENABLE=1
ENV PADDLE_XCCL_BACKEND=mlu
ENV FLAGS_use_stream_safe_cuda_allocator=false

# yum and pip clean
RUN yum clean all && \
1 change: 1 addition & 0 deletions (Dockerfile, apt-based image)
@@ -83,6 +83,7 @@ ENV FLAGS_use_stride_kernel=0
ENV FLAGS_allocator_strategy=auto_growth
ENV CNCL_MEM_POOL_MULTI_CLIQUE_ENABLE=1
ENV PADDLE_XCCL_BACKEND=mlu
ENV FLAGS_use_stream_safe_cuda_allocator=false

# Clean
RUN apt-get clean -y
2 changes: 1 addition & 1 deletion cmake/paddle.cmake
@@ -70,7 +70,7 @@ endif()

# submodule Paddle first
set(paddle_submodule $ENV{paddle_submodule})
-if(paddle_submodule)
+if(NOT paddle_submodule)
get_filename_component(REPO_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../"
ABSOLUTE)
get_filename_component(PADDLE_SOURCE_DIR "${REPO_SOURCE_DIR}/Paddle" ABSOLUTE)