UI-Research · wcurrangroome · Apr 23, 2026 · Apr 23, 2026
diff --git a/AGENTS.md b/AGENTS.md
diff --git a/R/crosswalk_data.R b/R/crosswalk_data.R
@@ -41,6 +41,17 @@
 #'    mean, median, percentage, and ratio variables. These will be calculated as weighted
 #'    means using the allocation factor as weights. If NULL (default), automatically
 #'    detects columns with prefixes "mean_", "median_", "percent_", or "ratio_".
+#' @param custom_interpolations A list of lists, each specifying a group of columns
+#'    and a custom interpolation function. Each element must have:
+#'    \describe{
+#'      \item{columns}{Character vector of column names}
+#'      \item{fn}{A function or formula for interpolation. Receives two arguments:
+#'        `.x` (column values) and `.w` (allocation factors). Formulas using `~`
+#'        syntax (e.g., `~sum(.x * .w, na.rm = TRUE)`) are converted to functions
+#'        via `rlang::as_function()`.}
+#'    }
+#'    Columns in `custom_interpolations` must not overlap with `count_columns` or
+#'    `non_count_columns`. Default is NULL (no custom interpolations).
 #' @param return_intermediate Logical. If TRUE and crosswalk has multiple steps,
 #'    returns a list containing both the final result and intermediate results
 #'    from each step. Default is FALSE, which returns only the final result.
@@ -151,6 +162,21 @@
 #' # Access intermediate and final
 #' result$intermediate$step_1  # After first crosswalk
 #' result$final                # Final result
+#'
+#' # Custom interpolation functions
+#' result <- crosswalk_data(
+#'   data = my_data,
+#'   crosswalk = crosswalk,
+#'   custom_interpolations = list(
+#'     list(
+#'       columns = c("count_population", "count_housing"),
+#'       fn = ~sum(.x * .w, na.rm = TRUE)
+#'     ),
+#'     list(
+#'       columns = c("pct_poverty"),
+#'       fn = ~weighted.mean(.x, .w, na.rm = TRUE)
+#'     )
+#'   ))
 #' }
 
 crosswalk_data <- function(
@@ -165,6 +191,7 @@ crosswalk_data <- function(
     geoid_column = "source_geoid",
     count_columns = NULL,
     non_count_columns = NULL,
+    custom_interpolations = NULL,
     return_intermediate = FALSE,
     show_join_quality = TRUE,
     silent = getOption("crosswalk.silent", FALSE)) {
@@ -222,14 +249,28 @@ crosswalk_data <- function(
       stringr::str_starts(data_columns, "ratio_")]
   }
 
-  if (length(count_columns) == 0 & length(non_count_columns) == 0) {
+  # Validate custom_interpolations if provided
+  if (!is.null(custom_interpolations)) {
+    custom_interpolations <- validate_custom_interpolations(
+      custom_interpolations, names(data), count_columns, non_count_columns)
+  }
+
+  has_custom <- !is.null(custom_interpolations) && length(custom_interpolations) > 0
+
+  if (length(count_columns) == 0 & length(non_count_columns) == 0 & !has_custom) {
     stop(
-      "No columns to crosswalk. Either specify `count_columns` or `non_count_columns`, ",
-      "or ensure your data has columns with prefixes: count_, mean_, median_, percent_, or ratio_.")
+      "No columns to crosswalk. Either specify `count_columns`, `non_count_columns`, ",
+      "or `custom_interpolations`, or ensure your data has columns with prefixes: ",
+      "count_, mean_, median_, percent_, or ratio_.")
   }
 
   # Check that specified columns exist in original data
-  all_value_columns <- c(count_columns, non_count_columns)
+  custom_cols <- if (has_custom) {
+    unlist(purrr::map(custom_interpolations, "columns"))
+  } else {
+    character(0)
+  }
+  all_value_columns <- c(count_columns, non_count_columns, custom_cols)
   missing_columns <- setdiff(all_value_columns, names(data))
   if (length(missing_columns) > 0) {
     stop(
@@ -261,6 +302,7 @@ crosswalk_data <- function(
       geoid_column = current_geoid_column,
       count_columns = count_columns,
       non_count_columns = non_count_columns,
+      custom_interpolations = custom_interpolations,
       step_number = i,
       total_steps = n_steps,
       show_join_quality = show_join_quality)
@@ -633,6 +675,79 @@ report_join_quality <- function(data, crosswalk, geoid_column, step_number = 1,
 }
 
 
+#' Validate Custom Interpolations Parameter
+#'
+#' Internal function that validates and normalizes the custom_interpolations parameter.
+#' Converts formula-style functions to proper functions with `.x` and `.w` parameters.
+#'
+#' @param custom_interpolations A list of lists with `columns` and `fn` elements.
+#' @param data_columns Character vector of column names in data.
+#' @param count_columns Character vector of count column names.
+#' @param non_count_columns Character vector of non-count column names.
+#' @return The validated and normalized custom_interpolations list.
+#' @keywords internal
+#' @noRd
+validate_custom_interpolations <- function(custom_interpolations, data_columns,
+                                           count_columns, non_count_columns) {
+  if (!is.list(custom_interpolations)) {
+    stop("`custom_interpolations` must be a list of lists, each with `columns` and `fn` elements.")
+  }
+
+  all_custom_cols <- character(0)
+
+  for (i in seq_along(custom_interpolations)) {
+    group <- custom_interpolations[[i]]
+
+    if (!is.list(group) || !all(c("columns", "fn") %in% names(group))) {
+      stop(
+        "`custom_interpolations[[", i, "]]` must be a list with `columns` and `fn` elements.")
+    }
+
+    if (!is.character(group$columns) || length(group$columns) == 0) {
+      stop(
+        "`custom_interpolations[[", i, "]]$columns` must be a non-empty character vector.")
+    }
+
+    # Convert formula to function with .x and .w parameters
+    if (rlang::is_formula(group$fn)) {
+      fn_body <- rlang::f_rhs(group$fn)
+      fn_env <- rlang::f_env(group$fn)
+      custom_interpolations[[i]]$fn <- rlang::new_function(
+        alist(.x = , .w = ), fn_body, fn_env)
+    } else if (!is.function(group$fn)) {
+      stop(
+        "`custom_interpolations[[", i, "]]$fn` must be a function or formula.")
+    }
+
+    # Check for overlap within custom groups
+    overlap_within <- intersect(group$columns, all_custom_cols)
+    if (length(overlap_within) > 0) {
+      stop(
+        "Column(s) appear in multiple `custom_interpolations` groups: ",
+        paste(overlap_within, collapse = ", "))
+    }
+    all_custom_cols <- c(all_custom_cols, group$columns)
+  }
+
+  # Check for overlap with count_columns and non_count_columns
+  overlap_count <- intersect(all_custom_cols, count_columns)
+  if (length(overlap_count) > 0) {
+    stop(
+      "Column(s) in `custom_interpolations` overlap with `count_columns`: ",
+      paste(overlap_count, collapse = ", "))
+  }
+
+  overlap_non_count <- intersect(all_custom_cols, non_count_columns)
+  if (length(overlap_non_count) > 0) {
+    stop(
+      "Column(s) in `custom_interpolations` overlap with `non_count_columns`: ",
+      paste(overlap_non_count, collapse = ", "))
+  }
+
+  return(custom_interpolations)
+}
+
+
 #' Apply a Single Crosswalk Step
 #'
 #' Internal function that applies one crosswalk tibble to data.
@@ -642,6 +757,7 @@ report_join_quality <- function(data, crosswalk, geoid_column, step_number = 1,
 #' @param geoid_column Column name for source geoid
 #' @param count_columns Count variable columns
 #' @param non_count_columns Non-count variable columns
+#' @param custom_interpolations Validated custom interpolation groups (or NULL)
 #' @param step_number Integer. Current step number for multi-step reporting.
 #' @param total_steps Integer. Total number of steps for multi-step reporting.
 #' @param show_join_quality Logical. Whether to report join quality diagnostics.
@@ -654,6 +770,7 @@ apply_single_crosswalk <- function(
     geoid_column,
     count_columns,
     non_count_columns,
+    custom_interpolations = NULL,
     step_number = 1,
     total_steps = 1,
     show_join_quality = TRUE) {
@@ -700,7 +817,22 @@ apply_single_crosswalk <- function(
   current_count_cols <- intersect(count_columns, names(data))
   current_non_count_cols <- intersect(non_count_columns, names(data))
 
-  # Identify "other" columns (not geoid, count, or non-count columns)
+  # Filter custom interpolation columns to those in current data
+  current_custom_interpolations <- if (!is.null(custom_interpolations)) {
+    purrr::map(custom_interpolations, function(group) {
+      list(
+        columns = intersect(group$columns, names(data)),
+        fn = group$fn)
+    }) |>
+      purrr::keep(~ length(.x$columns) > 0)
+  } else {
+    list()
+  }
+
+  current_custom_cols <- unlist(purrr::map(current_custom_interpolations, "columns"))
+  if (is.null(current_custom_cols)) current_custom_cols <- character(0)
+
+  # Identify "other" columns (not geoid, count, non-count, or custom columns)
   # These will be aggregated by taking the first non-missing value
   # Include both original crosswalk column names AND their renamed versions
   # (e.g., "geography_name" which comes from "target_geography_name" after renaming)
@@ -714,41 +846,96 @@ apply_single_crosswalk <- function(
                       "land_area_sqmi")
   other_cols <- setdiff(
     names(data),
-    c(geoid_column, current_count_cols, current_non_count_cols, crosswalk_cols)
+    c(geoid_column, current_count_cols, current_non_count_cols,
+      current_custom_cols, crosswalk_cols)
   )
 
+  # All interpolated columns (for NA validity tracking)
+  all_interpolated_cols <- c(current_count_cols, current_non_count_cols,
+                             current_custom_cols)
 
   # Join crosswalk to data
-  result <- data |>
+  joined_data <- data |>
     dplyr::mutate(
       dplyr::across(dplyr::all_of(geoid_column), as.character)) |>
     dplyr::left_join(
       crosswalk,
-      by = stats::setNames("source_geoid", geoid_column)) |>
-    tidytable::summarize(
-      .by = dplyr::all_of(group_cols),
-      ## count variables we take the sum of the weighted count variable
-      dplyr::across(
-        .cols = dplyr::all_of(current_count_cols),
-        .fns = ~ sum(.x * allocation_factor_source_to_target, na.rm = TRUE)),
-      ## non-count variables--means, medians, percentages, ratios, etc.--
-      ## we take the weighted mean of the variable, weighted by the allocation factor
-      tidytable::across(
-        .cols = tidytable::all_of(current_non_count_cols),
-        .fns = ~ stats::weighted.mean(.x, allocation_factor_source_to_target, na.rm = TRUE)),
-      ## other columns: take first non-missing value (or NA if all missing)
-      tidytable::across(
-        .cols = tidytable::all_of(other_cols),
-        .fns = ~ dplyr::first(.x, na_rm = TRUE)),
-      tidytable::across(
-        .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)),
-        .fns = ~ sum(!is.na(.x)),
-        .names = "{.col}_validx")) |>
-    tidytable::mutate(
-      tidytable::across(
-        .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)),
-        .fns = ~ tidytable::if_else(get(stringr::str_c(tidytable::cur_column(), "_validx")) > 0, .x, NA))) |>
-    dplyr::select(-dplyr::matches("_validx$")) |>
+      by = stats::setNames("source_geoid", geoid_column))
+
+  # Standard columns for main summarize (count, non-count)
+  standard_cols <- c(current_count_cols, current_non_count_cols)
+  has_standard <- length(standard_cols) > 0 || length(other_cols) > 0
+
+  # Main summarize for standard columns (count, non-count, other)
+  if (has_standard) {
+    result <- joined_data |>
+      tidytable::summarize(
+        .by = dplyr::all_of(group_cols),
+        ## count variables: sum of weighted values
+        dplyr::across(
+          .cols = dplyr::all_of(current_count_cols),
+          .fns = ~ sum(.x * allocation_factor_source_to_target, na.rm = TRUE)),
+        ## non-count variables: weighted mean
+        tidytable::across(
+          .cols = tidytable::all_of(current_non_count_cols),
+          .fns = ~ stats::weighted.mean(.x, allocation_factor_source_to_target, na.rm = TRUE)),
+        ## other columns: first non-missing value
+        tidytable::across(
+          .cols = tidytable::all_of(other_cols),
+          .fns = ~ dplyr::first(.x, na_rm = TRUE)),
+        ## validity tracking for NA handling
+        tidytable::across(
+          .cols = tidytable::all_of(standard_cols),
+          .fns = ~ sum(!is.na(.x)),
+          .names = "{.col}_validx"))
+
+    # Replace standard interpolated values with NA where all source values were NA
+    if (length(standard_cols) > 0) {
+      result <- result |>
+        tidytable::mutate(
+          tidytable::across(
+            .cols = tidytable::all_of(standard_cols),
+            .fns = ~ tidytable::if_else(
+              get(stringr::str_c(tidytable::cur_column(), "_validx")) > 0,
+              .x, NA)))
+    }
+
+    result <- result |>
+      dplyr::select(-dplyr::matches("_validx$"))
+  } else {
+    # No standard columns; start with unique target groups
+    result <- joined_data |>
+      dplyr::distinct(dplyr::across(dplyr::all_of(group_cols)))
+  }
+
+  # Compute custom interpolations in a separate pass and join back
+  if (length(current_custom_interpolations) > 0) {
+    for (group in current_custom_interpolations) {
+      group_fn <- group$fn
+      group_cols_i <- group$columns
+
+      # Compute each custom column individually and join to result
+      custom_results <- purrr::map(group_cols_i, function(col_name) {
+        joined_data |>
+          tidytable::summarize(
+            .by = dplyr::all_of(group_cols),
+            val = group_fn(!!rlang::sym(col_name),
+                           .w = allocation_factor_source_to_target),
+            validx = sum(!is.na(!!rlang::sym(col_name)))) |>
+          tidytable::mutate(
+            val = tidytable::if_else(validx > 0, val, NA)) |>
+          dplyr::select(-validx) |>
+          dplyr::rename(!!col_name := val)
+      })
+
+      for (custom_result in custom_results) {
+        result <- result |>
+          dplyr::left_join(custom_result, by = group_cols)
+      }
+    }
+  }
+
+  result <- result |>
     dplyr::rename_with(
       .cols = dplyr::everything(),
       .fn = ~ stringr::str_remove_all(.x, "target_")) |>