Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Multi-input & Scalar Support for Transform UDFs #17881

Merged
merged 28 commits into from
Feb 13, 2025
Merged
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
fee37d9
added support for custom types in PTX parser
lamarrr Jan 29, 2025
96d17a3
Merge branch 'branch-25.02' into ptx-custom-types
lamarrr Jan 30, 2025
1238297
Merge branch 'branch-25.02' into ptx-custom-types
lamarrr Jan 30, 2025
2e914e4
added multi-input & scalar support for UDF transforms
lamarrr Jan 31, 2025
b24ebc9
added ternary transform + scalar test
lamarrr Jan 31, 2025
90cb21d
fixed binary_op ptx parsing
lamarrr Jan 31, 2025
4d992ed
Merge branch 'ptx-custom-types' of https://github.com/lamarrr/cudf in…
lamarrr Jan 31, 2025
a1474af
Merge branch 'ptx-custom-types' into multi-transform-udfs
lamarrr Jan 31, 2025
f9db165
Merge remote-tracking branch 'upstream/branch-25.04' into multi-trans…
lamarrr Feb 4, 2025
ef7ce8f
refactored changes
lamarrr Feb 4, 2025
d9912f9
nit: namespaces
lamarrr Feb 4, 2025
a622731
adopted code review changes
lamarrr Feb 4, 2025
e74392d
Merge branch 'ptx-custom-types' into multi-transform-udfs
lamarrr Feb 4, 2025
7859c8f
Merge remote-tracking branch 'upstream/branch-25.04' into multi-trans…
lamarrr Feb 5, 2025
0cfacb1
refactoring & changed back to using cudf::size_type in kernels
lamarrr Feb 5, 2025
00b4148
fixed copyright
lamarrr Feb 5, 2025
a4f96ed
implemented code review suggestions
lamarrr Feb 5, 2025
6e48f0e
added codegen doc for transform unit tests
lamarrr Feb 5, 2025
ed2e673
added codegen doc for transform unit tests
lamarrr Feb 5, 2025
0d33c43
added python tests for UDF transforms
lamarrr Feb 6, 2025
d38c211
Merge branch 'branch-25.04' into multi-transform-udfs
lamarrr Feb 6, 2025
93c4949
refactored tests
lamarrr Feb 10, 2025
cc5b4d2
Merge branch 'branch-25.04' into multi-transform-udfs
lamarrr Feb 10, 2025
c2e86e4
Merge branch 'branch-25.04' into multi-transform-udfs
lamarrr Feb 11, 2025
f91f47b
Merge branch 'branch-25.04' into multi-transform-udfs
lamarrr Feb 11, 2025
7e1eac9
Merge branch 'branch-25.04' into multi-transform-udfs
lamarrr Feb 12, 2025
b4d75dd
Merge branch 'branch-25.04' into multi-transform-udfs
lamarrr Feb 12, 2025
033051d
Merge branch 'branch-25.04' into multi-transform-udfs
lamarrr Feb 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
added multi-input & scalar support for UDF transforms
lamarrr committed Jan 31, 2025
commit 2e914e4265b716899bec032de99bb724d7716574
23 changes: 13 additions & 10 deletions cpp/include/cudf/transform.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -32,27 +32,30 @@ namespace CUDF_EXPORT cudf {
*/

/**
* @brief Creates a new column by applying a unary function against every
* element of an input column.
* @brief Creates a new column by applying a transform function against every
* element of the input columns.
*
* Computes:
* `out[i] = F(in[i])`
* `out[i] = F(inputs[i]...)`.
*
 * Note that for every scalar in `inputs`, `input[i] == input[0]` for all `i`
*
 * The output null mask is the same as the input null mask so if input[i] is
* null then output[i] is also null
* null then output[i] is also null. The size of the resulting column is the size of the largest
 * column. Scalar inputs are also supported and they should have a size of 1.
*
* @param input An immutable view of the input column to transform
* @param unary_udf The PTX/CUDA string of the unary function to apply
* @param inputs Immutable views of the input columns to transform
* @param transform_udf The PTX/CUDA string of the transform function to apply
* @param output_type The output type that is compatible with the output type in the UDF
* @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return The column resulting from applying the unary function to
* @return The column resulting from applying the transform function to
* every element of the input
*/
std::unique_ptr<column> transform(
column_view const& input,
std::string const& unary_udf,
std::vector<column_view> const& inputs,
std::string const& transform_udf,
data_type output_type,
bool is_ptx,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
20 changes: 16 additions & 4 deletions cpp/src/transform/jit/kernel.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -33,8 +33,20 @@ namespace cudf {
namespace transformation {
namespace jit {

template <typename TypeOut, typename TypeIn>
CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data)
/// @brief This class supports striding into columns of data as either scalars or actual
/// columns at no runtime cost. Note, however, that the kernel will be recompiled if scalar and
/// column inputs are interchanged.
template <typename T, int multiplier>
struct strided {
T data;

__device__ T const& get(int64_t index) const { return (&data)[index * multiplier]; }

__device__ T& get(int64_t index) { return (&data)[index * multiplier]; }
};

template <typename Out, typename... In>
CUDF_KERNEL void kernel(int64_t size, Out* __restrict__ out, In const* __restrict__... ins)
{
// cannot use global_thread_id utility due to a JIT build issue by including
// the `cudf/detail/utilities/cuda.cuh` header
@@ -43,7 +55,7 @@ CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data
thread_index_type const stride = block_size * gridDim.x;

for (auto i = start; i < static_cast<thread_index_type>(size); i += stride) {
GENERIC_UNARY_OP(&out_data[i], in_data[i]);
GENERIC_TRANSFORM_OP(&out->get(i), ins->get(i)...);
}
}

134 changes: 98 additions & 36 deletions cpp/src/transform/transform.cpp
Original file line number Diff line number Diff line change
@@ -34,75 +34,137 @@ namespace cudf {
namespace transformation {
namespace jit {
namespace {
void unary_operation(mutable_column_view output,
column_view input,
std::string const& udf,
data_type output_type,
bool is_ptx,
rmm::cuda_stream_view stream)

using device_data_t = void*;

std::map<unsigned int, std::string> ptx_params(mutable_column_view output,
std::vector<column_view> const& inputs

)
{
std::map<unsigned int, std::string> types;

unsigned int i = 0;
types.emplace(i, cudf::type_to_name(output.type()) + " *");
i++;

for (auto& input : inputs) {
types.emplace(i, cudf::type_to_name(input.type()));
i++;
}

return types;
}

void transform_operation(mutable_column_view output,
std::vector<column_view> const& inputs,
std::string const& udf,
data_type output_type,
bool is_ptx,
rmm::cuda_stream_view stream,
cudf::size_type base_column_size)
{
std::vector<std::string> typenames;
typenames.push_back(jitify2::reflection::Template("cudf::transformation::jit::strided")
.instantiate(cudf::type_to_name(output.type()), 1));

for (auto& input : inputs) {
bool const is_scalar = input.size() != base_column_size;
typenames.push_back(jitify2::reflection::Template("cudf::transformation::jit::strided")
.instantiate(cudf::type_to_name(input.type()), is_scalar ? 0 : 1));
}

std::string const kernel_name =
jitify2::reflection::Template("cudf::transformation::jit::kernel") //
.instantiate(cudf::type_to_name(output.type()), // list of template arguments
cudf::type_to_name(input.type()));

std::string cuda_source = is_ptx ? cudf::jit::parse_single_function_ptx(
udf, //
"GENERIC_UNARY_OP",
{
{0, "void *"}, // output argument
{1, cudf::type_to_name(input.type())} // input argument
})
: cudf::jit::parse_single_function_cuda(udf, //
"GENERIC_UNARY_OP");

cudf::jit::get_program_cache(*transform_jit_kernel_cu_jit)
.get_kernel(
kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) //
->configure_1d_max_occupancy(0, 0, nullptr, stream.value()) //
->launch(output.size(), //
cudf::jit::get_data_ptr(output),
cudf::jit::get_data_ptr(input));
.instantiate(typenames);

std::string cuda_source = is_ptx
? cudf::jit::parse_single_function_ptx(udf, //
"GENERIC_TRANSFORM_OP",
ptx_params(output, inputs))
: cudf::jit::parse_single_function_cuda(udf, //
"GENERIC_TRANSFORM_OP");

{
std::vector<device_data_t> device_data;

device_data.push_back(const_cast<device_data_t>(cudf::jit::get_data_ptr(output)));
std::transform(
inputs.begin(), inputs.end(), std::back_inserter(device_data), [](column_view view) {
return const_cast<device_data_t>(cudf::jit::get_data_ptr(view));
});

int64_t size = output.size();

std::vector<void*> args;
args.push_back(&size);
std::transform(device_data.begin(),
device_data.end(),
std::back_inserter(args),
[](device_data_t& data) -> void* { return &data; });

cudf::jit::get_program_cache(*transform_jit_kernel_cu_jit)
.get_kernel(
kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) //
->configure_1d_max_occupancy(0, 0, nullptr, stream.value()) //
->launch(args.data());
}
}
} // namespace

} // namespace jit
} // namespace transformation

namespace detail {
std::unique_ptr<column> transform(column_view const& input,
std::string const& unary_udf,
std::unique_ptr<column> transform(std::vector<column_view> const& inputs,
std::string const& transform_udf,
data_type output_type,
bool is_ptx,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(is_fixed_width(input.type()), "Unexpected non-fixed-width type.");
CUDF_EXPECTS(is_fixed_width(output_type), "Unexpected non-fixed-width type.");
std::for_each(inputs.begin(), inputs.end(), [](auto& col) {
CUDF_EXPECTS(is_fixed_width(col.type()), "Unexpected non-fixed-width type.");
});

auto base_column = std::max_element(
inputs.begin(), inputs.end(), [](auto& a, auto& b) { return a.size() < b.size(); });

std::for_each(inputs.begin(), inputs.end(), [&](column_view const& col) {
CUDF_EXPECTS((col.size() == 1) || (col.size() == base_column->size()), "");
CUDF_EXPECTS((col.null_count() == 0) || (col.null_count() == base_column->null_count()), "");
});

std::unique_ptr<column> output = make_fixed_width_column(
output_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr);
std::unique_ptr<column> output = make_fixed_width_column(output_type,
base_column->size(),
copy_bitmask(*base_column, stream, mr),
base_column->null_count(),
stream,
mr);

if (input.is_empty()) { return output; }
if (base_column->is_empty()) { return output; }

mutable_column_view const output_view = *output;

// transform
transformation::jit::unary_operation(output_view, input, unary_udf, output_type, is_ptx, stream);
transformation::jit::transform_operation(
output_view, inputs, transform_udf, output_type, is_ptx, stream, base_column->size());

return output;
}

} // namespace detail

std::unique_ptr<column> transform(column_view const& input,
std::string const& unary_udf,
std::unique_ptr<column> transform(std::vector<column_view> const& inputs,
std::string const& transform_udf,
data_type output_type,
bool is_ptx,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return detail::transform(input, unary_udf, output_type, is_ptx, stream, mr);
return detail::transform(inputs, transform_udf, output_type, is_ptx, stream, mr);
}

} // namespace cudf
9 changes: 6 additions & 3 deletions cpp/tests/streams/transform_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -32,8 +32,11 @@ void test_udf(char const* udf, Data data_init, cudf::size_type size, bool is_ptx
auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init);
cudf::test::fixed_width_column_wrapper<dtype, typename decltype(data_iter)::value_type> in(
data_iter, data_iter + size, all_valid);
cudf::transform(
in, udf, cudf::data_type(cudf::type_to_id<dtype>()), is_ptx, cudf::test::get_default_stream());
cudf::transform({in},
udf,
cudf::data_type(cudf::type_to_id<dtype>()),
is_ptx,
cudf::test::get_default_stream());
}

TEST_F(TransformTest, Transform)
4 changes: 2 additions & 2 deletions cpp/tests/transform/integration/unary_transform_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Copyright 2018-2019 BlazingDB, Inc.
* Copyright 2018 Christian Noboa Mardini <[email protected]>
@@ -39,7 +39,7 @@ void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool
data_iter, data_iter + size, all_valid);

std::unique_ptr<cudf::column> out =
cudf::transform(in, udf, cudf::data_type(cudf::type_to_id<dtype>()), is_ptx);
cudf::transform({in}, udf, cudf::data_type(cudf::type_to_id<dtype>()), is_ptx);

ASSERT_UNARY<dtype, dtype>(out->view(), in, op);
}
4 changes: 2 additions & 2 deletions java/src/main/native/src/ColumnViewJni.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1503,7 +1503,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_transform(
cudf::jni::native_jstring n_j_udf(env, j_udf);
std::string n_udf(n_j_udf.get());
return release_as_jlong(
cudf::transform(*column, n_udf, cudf::data_type(cudf::type_id::INT32), j_is_ptx));
cudf::transform({*column}, n_udf, cudf::data_type(cudf::type_id::INT32), j_is_ptx));
}
CATCH_STD(env, 0);
}
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
@@ -185,7 +185,7 @@ def __setitem__(self, key: Any, value: Any):
@acquire_spill_lock()
def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase:
plc_column = plc.transform.transform(
self.to_pylibcudf(mode="read"),
[self.to_pylibcudf(mode="read")],
compiled_op[0],
plc.column._datatype_from_dtype_desc(np_dtype.str[1:]),
True,
7 changes: 4 additions & 3 deletions python/pylibcudf/pylibcudf/libcudf/transform.pxd
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.pair cimport pair
from libcpp.string cimport string
from libcpp.vector cimport vector
from pylibcudf.exception_handler cimport libcudf_exception_handler
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
@@ -33,8 +34,8 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
) except +libcudf_exception_handler

cdef unique_ptr[column] transform(
column_view input,
string unary_udf,
const vector[column_view] & inputs,
const string & transform_udf,
data_type output_type,
bool is_ptx
) except +libcudf_exception_handler
7 changes: 5 additions & 2 deletions python/pylibcudf/pylibcudf/transform.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
from libcpp cimport bool
from pylibcudf.libcudf.types cimport bitmask_type, data_type

@@ -17,7 +17,10 @@ cpdef tuple[gpumemoryview, int] bools_to_mask(Column input)

cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit)

cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx)
cpdef Column transform(list[Column] inputs,
str transform_udf,
DataType output_type,
bool is_ptx)

cpdef tuple[Table, Column] encode(Table input)

5 changes: 4 additions & 1 deletion python/pylibcudf/pylibcudf/transform.pyi
Original file line number Diff line number Diff line change
@@ -10,7 +10,10 @@ def compute_column(input: Table, expr: Expression) -> Column: ...
def bools_to_mask(input: Column) -> tuple[gpumemoryview, int]: ...
def mask_to_bools(bitmask: int, begin_bit: int, end_bit: int) -> Column: ...
def transform(
input: Column, unary_udf: str, output_type: DataType, is_ptx: bool
inputs: list[Column],
transform_udf: str,
output_type: DataType,
is_ptx: bool,
) -> Column: ...
def encode(input: Table) -> tuple[Table, Column]: ...
def one_hot_encode(input: Column, categories: Column) -> Table: ...
29 changes: 19 additions & 10 deletions python/pylibcudf/pylibcudf/transform.pyx
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.vector cimport vector
from libcpp.utility cimport move, pair
from pylibcudf.libcudf cimport transform as cpp_transform
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.table.table cimport table
from pylibcudf.libcudf.table.table_view cimport table_view
from pylibcudf.libcudf.types cimport bitmask_type, size_type
@@ -129,16 +131,19 @@ cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit):
return Column.from_libcudf(move(c_result))


cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx):
"""Create a new column by applying a unary function against every
element of an input column.
cpdef Column transform(list[Column] inputs,
str transform_udf,
DataType output_type,
bool is_ptx):
"""Create a new column by applying a transform function against
multiple input columns.
Parameters
----------
input : Column
Column to transform.
unary_udf : str
The PTX/CUDA string of the unary function to apply.
inputs : list[Column]
Columns to transform.
transform_udf : str
The PTX/CUDA string of the transform function to apply.
output_type : DataType
    The output type that is compatible with the output type in the transform_udf.
is_ptx : bool
@@ -150,13 +155,17 @@ cpdef Column transform(Column input, str unary_udf, DataType output_type, bool i
Column
The transformed column having the UDF applied to each element.
"""
cdef vector[column_view] c_inputs
cdef unique_ptr[column] c_result
cdef string c_unary_udf = unary_udf.encode()
cdef string c_transform_udf = transform_udf.encode()
cdef bool c_is_ptx = is_ptx

for input in inputs:
c_inputs.push_back((<Column?>input).view())

with nogil:
c_result = cpp_transform.transform(
input.view(), c_unary_udf, output_type.c_obj, c_is_ptx
c_inputs, c_transform_udf, output_type.c_obj, c_is_ptx
)

return Column.from_libcudf(move(c_result))