diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index ddadacdaf7..98d1a2179b 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -39,6 +39,9 @@ include(cutlass)
 include(dgc)
 include(warpctc)
 include(warprnnt)
+if(WITH_MAGMA)
+  include(external/magma)
+endif()
 
 set(PLUGIN_VERSION ${PADDLE_VERSION})
 
@@ -124,6 +127,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/dense_tensor_iterator.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/*.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/eigen/*.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/magma/magma_function.cc
   # kernels/gpu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_send_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_recv_kernel.cu
@@ -668,6 +672,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/binomial_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc
+  ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/magma.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -782,6 +787,7 @@ target_link_libraries(
   protobuf
   external_error_proto
   dgc
+  magma
   ${WARPCTC_LIBRARIES}
   ${WARPRNNT_LIBRARIES}
   ${PADDLE_CORE_LIB})
diff --git a/backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu
new file mode 100644
index 0000000000..942bcb20ec
--- /dev/null
+++ b/backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu
@@ -0,0 +1,123 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/eig.h"
+#include "paddle/phi/kernels/eig_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EigKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               DenseTensor* out_w,
+               DenseTensor* out_v) {
+  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_w);
+  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_v);
+
+  if (x.numel() == 0) {
+    return;
+  }
+
+  auto cpu_place = phi::CPUPlace();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
+
+  // prepare cpu tensors here, since magma requires the outputs on cpu
+  DenseTensor out_w_cpu, out_v_cpu;
+  out_w_cpu.Resize(out_w->dims());
+  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_w_cpu);
+  out_v_cpu.Resize(x.dims());
+  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_cpu);
+
+  if (!IsComplexType(x.dtype())) {
+    // outputs are still complex even though the input is real
+    int batch_count = BatchCount(x);
+    int order = static_cast<int>(x.dims()[x.dims().size() - 1]);
+
+    DenseTensor real_w_cpu, real_v_cpu;
+
+    std::vector<int> real_w_dim = common::vectorize<int>(out_w->dims());
+    real_w_dim.back() *= 2;
+    real_w_cpu.Resize(common::make_ddim(real_w_dim));
+    (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_w_cpu);
+    real_v_cpu.Resize(x.dims());
+    (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_v_cpu);
+
+    phi::ApplyEigKernelMagma<phi::dtype::Real<T>, Context>(
+        dev_ctx, x, &real_w_cpu, &real_v_cpu);
+
+    // 1. extract real part & imag part from real_w_cpu
+    DenseTensor real_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
+        (*cpu_ctx), real_w_cpu, {-1}, {0}, {order});
+    DenseTensor imag_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
+        (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2});
+
+    // 2. construct complex values
+    auto* real_part_data = real_part_cpu.data<phi::dtype::Real<T>>();
+    auto* imag_part_data = imag_part_cpu.data<phi::dtype::Real<T>>();
+    int64_t out_w_numel = static_cast<int64_t>(out_w->numel());
+
+    phi::funcs::ForRange<phi::CPUContext> for_range((*cpu_ctx), out_w_numel);
+    phi::funcs::RealImagToComplexFunctor<phi::dtype::Complex<T>> functor(
+        real_part_data,
+        imag_part_data,
+        out_w_cpu.data<phi::dtype::Complex<T>>(),
+        out_w_numel);
+    for_range(functor);
+
+    // 3. construct complex vectors
+    DenseTensor real_v_trans_cpu =
+        phi::TransposeLast2Dim<phi::dtype::Real<T>, phi::CPUContext>(
+            (*cpu_ctx), real_v_cpu);
+    DenseTensor out_v_trans_cpu;
+    out_v_trans_cpu.Resize(x.dims());
+    (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_trans_cpu);
+
+    phi::ConstructComplexVectors<phi::dtype::Real<T>,
+                                 phi::dtype::Complex<T>,
+                                 phi::CPUContext>(&out_v_trans_cpu,
+                                                  out_w_cpu,
+                                                  real_v_trans_cpu,
+                                                  (*cpu_ctx),
+                                                  batch_count,
+                                                  order);
+
+    TransposeTwoAxis<phi::dtype::Complex<T>, phi::CPUContext>(
+        out_v_trans_cpu,
+        &out_v_cpu,
+        x.dims().size() - 1,
+        x.dims().size() - 2,
+        (*cpu_ctx));
+
+  } else {
+    phi::ApplyEigKernelMagma<T, Context>(dev_ctx, x, &out_w_cpu, &out_v_cpu);
+  }
+
+  // copy results from cpu back to gpu tensors
+  phi::Copy(dev_ctx, out_w_cpu, dev_ctx.GetPlace(), false, out_w);
+  phi::Copy(dev_ctx, out_v_cpu, dev_ctx.GetPlace(), false, out_v);
+}
+
+}  // namespace phi
+
+PD_REGISTER_PLUGIN_KERNEL(
+    eig, metax_gpu, ALL_LAYOUT, phi::EigKernel, float, phi::complex64) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT32) {
+    kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
+    kernel->OutputAt(1).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
+  }
+}
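Note for reviewers: steps 1 and 2 of the real-input branch above unpack a doubled-width buffer. As the Slice offsets {0, order} and {order, 2 * order} suggest, real_w_cpu stores the eigenvalues' real parts in the first half of its last dimension and the imaginary parts in the second half (MAGMA's geev, like LAPACK's, reports them as separate real arrays), and RealImagToComplexFunctor zips the two halves into complex values. The following standalone sketch illustrates just that layout transformation for a single matrix; it uses plain C++ with no Paddle or MAGMA dependency, and all names (real_w, out_w, order) are illustrative stand-ins for the tensors in the kernel, not Paddle APIs.

// Sketch of the packed-real-to-complex eigenvalue unpacking, single batch.
#include <complex>
#include <cstdio>
#include <vector>

int main() {
  const int order = 3;  // matrix order, i.e. number of eigenvalues
  // real_w mimics real_w_cpu: last dim doubled, first half = real parts,
  // second half = imaginary parts.
  std::vector<float> real_w = {1.0f, 2.0f, 3.0f,    // real parts
                               0.5f, -0.5f, 0.0f};  // imaginary parts
  std::vector<std::complex<float>> out_w(order);
  for (int i = 0; i < order; ++i) {
    // Same pairing RealImagToComplexFunctor applies element-wise:
    // out[i] = complex(real_half[i], imag_half[i]).
    out_w[i] = {real_w[i], real_w[order + i]};
  }
  for (const auto& w : out_w) {
    std::printf("(%g, %g)\n", w.real(), w.imag());
  }
  return 0;
}

In the kernel itself the same pairing runs over the whole batched tensor via ForRange on the CPU context, and the analogous reconstruction of complex eigenvectors from the real geev output is delegated to phi::ConstructComplexVectors in step 3.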