PaddlePaddle · YqGe585 · Dec 23, 2025 · Dec 23, 2025 · Dec 23, 2025 · Dec 24, 2025
diff --git a/Paddle b/Paddle
diff --git a/backends/iluvatar_gpu/kernels/ernie_core/layer_norm_cuda_kernel.cu b/backends/iluvatar_gpu/kernels/ernie_core/layer_norm_cuda_kernel.cu
@@ -81,8 +81,16 @@ void RMSLnBwd(const Context &ctx,
 
 }  // namespace phi
 
-PD_REGISTER_PLUGIN_KERNEL(
-    fused_rms_norm_ext, iluvatar_gpu, ALL_LAYOUT, phi::RMSLnFwd, float) {}
+PD_REGISTER_PLUGIN_KERNEL(fused_rms_norm_ext,
+                          iluvatar_gpu,
+                          ALL_LAYOUT,
+                          phi::RMSLnFwd,
+                          float,
+                          phi::bfloat16) {}
 
-PD_REGISTER_PLUGIN_KERNEL(
-    fused_rms_norm_ext_grad, iluvatar_gpu, ALL_LAYOUT, phi::RMSLnBwd, float) {}
+PD_REGISTER_PLUGIN_KERNEL(fused_rms_norm_ext_grad,
+                          iluvatar_gpu,
+                          ALL_LAYOUT,
+                          phi::RMSLnBwd,
+                          float,
+                          phi::bfloat16) {}
diff --git a/backends/iluvatar_gpu/tests/disabled_test.txt b/backends/iluvatar_gpu/tests/disabled_test.txt
@@ -556,3 +556,4 @@ test_rms_norm_op.py
 test_batched_gemm.py
 test_match_matrix_tensor_op.py
 test_tensor.py
+test_rms_norm.py
+1 −3		.github/workflows/_Auto-Parallel.yml
+39 −37		.github/workflows/_Deepmd-pd-test.yml
+10 −3		cmake/external/warprnnt.cmake
+11 −4		paddle/fluid/distributed/collective/deep_ep/CMakeLists.txt
+0 −6		paddle/phi/kernels/funcs/CMakeLists.txt
+2 −118		paddle/phi/kernels/funcs/dense_tensor_iterator.cc
+4 −46		paddle/phi/kernels/funcs/dense_tensor_iterator.h
+0 −206		paddle/phi/kernels/funcs/fast_ln_v1.h
+0 −190		paddle/phi/kernels/funcs/fast_ln_v2.h
+0 −670		paddle/phi/kernels/funcs/fast_ln_v2_bwd_kernel.cu
+0 −237		paddle/phi/kernels/funcs/fast_ln_v2_common.h
+0 −439		paddle/phi/kernels/funcs/fast_ln_v2_fwd_kernel.cu
+0 −877		paddle/phi/kernels/funcs/fast_ln_v2_utils.h
+180 −2		paddle/phi/kernels/funcs/layer_norm_impl.cu.h
+0 −1,427		paddle/phi/kernels/funcs/reduce_gpu_kernel.h
+5 −88		paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
+72 −130		paddle/phi/kernels/gpu/layer_norm_kernel.cu
+27 −93		paddle/phi/kernels/gpu/reduce.h
+2 −6		paddle/phi/kernels/gpu/reduce_amin_amax_common.h
+23 −21		paddle/phi/kernels/kps/reduce_kernel.cu
+1 −2		paddle/phi/kernels/stride/reduce_grad_stride_kernel.cu
+180 −12		paddle/phi/kernels/stride/reduce_stride_kernel.cu
+1 −1		paddle/phi/kernels/xpu/top_p_sampling_kernel.cc
+35 −0		patches/warprnnt/CMakeLists.txt.cuda130.patch
+2 −0		python/paddle/distributed/auto_parallel/process_mesh.py
+5 −1		python/paddle/distributed/auto_parallel/static/cluster.py
+14 −7		python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
+17 −0		python/paddle/distributed/fsdp/__init__.py
+155 −0		python/paddle/distributed/fsdp/fully_shard.py
+16 −8		python/paddle/distributed/launch/context/device.py
+6 −2		python/paddle/distributed/parallel.py
+2 −0		python/paddle/nn/functional/__init__.py
+81 −0		python/paddle/nn/functional/norm.py
+1 −0		python/paddle/utils/cpp_extension/__init__.py
+18 −1		python/paddle/utils/cpp_extension/cpp_extension.py
+1 −0		python/setup.py.in
+1 −0		setup.py
+2 −2		test/auto_parallel/semi_auto_parallel_fsdp.py
+2 −4		test/collective/fleet/dygraph_group_fsdp.py
+1 −0		test/legacy_test/CMakeLists.txt
+0 −211		test/legacy_test/test_layer_norm_op.py
+0 −52		test/legacy_test/test_reduce_op.py
+160 −0		test/legacy_test/test_rms_norm.py
+168 −0		test/xpu/test_rms_norm_xpu.py
+1 −1		third_party/flashattn