From 024940df71ddbc93d1442856c8f560d32ac049a6 Mon Sep 17 00:00:00 2001 From: Will Curran-Groome Date: Thu, 23 Apr 2026 10:35:26 -0400 Subject: [PATCH] supporting tribal geographies as a target --- AGENTS.md | 40 --- R/crosswalk_data.R | 251 +++++++++++++++--- R/get_crosswalk.R | 16 +- R/get_geocorr_crosswalk.R | 36 ++- R/get_nhgis_crosswalk.R | 14 + R/plan_crosswalk_chain.R | 24 ++ man/crosswalk_data.Rd | 28 ++ man/get_crosswalk.Rd | 7 +- .../testthat/test-get_available_crosswalks.R | 46 ++++ tests/testthat/test-get_geocorr_crosswalk.R | 116 ++++++++ tests/testthat/test-plan_crosswalk_chain.R | 55 ++++ 11 files changed, 554 insertions(+), 79 deletions(-) delete mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index e58b193..0000000 --- a/AGENTS.md +++ /dev/null @@ -1,40 +0,0 @@ -# Agent Instructions - -This project uses **bd** (beads) for issue tracking. Run `bd onboard` to get started. - -## Quick Reference - -```bash -bd ready # Find available work -bd show # View issue details -bd update --status in_progress # Claim work -bd close # Complete work -bd sync # Sync with git -``` - -## Landing the Plane (Session Completion) - -**When ending a work session**, you MUST complete ALL steps below. Work is NOT complete until `git push` succeeds. - -**MANDATORY WORKFLOW:** - -1. **File issues for remaining work** - Create issues for anything that needs follow-up -2. **Run quality gates** (if code changed) - Tests, linters, builds -3. **Update issue status** - Close finished work, update in-progress items -4. **PUSH TO REMOTE** - This is MANDATORY: - ```bash - git pull --rebase - bd sync - git push - git status # MUST show "up to date with origin" - ``` -5. **Clean up** - Clear stashes, prune remote branches -6. **Verify** - All changes committed AND pushed -7. **Hand off** - Provide context for next session - -**CRITICAL RULES:** -- Work is NOT complete until `git push` succeeds -- NEVER stop before pushing - that leaves work stranded locally -- NEVER say "ready to push when you are" - YOU (Claude) must push -- If push fails, resolve and retry until it succeeds - diff --git a/R/crosswalk_data.R b/R/crosswalk_data.R index 3996fcc..1e9d118 100644 --- a/R/crosswalk_data.R +++ b/R/crosswalk_data.R @@ -41,6 +41,17 @@ #' mean, median, percentage, and ratio variables. These will be calculated as weighted #' means using the allocation factor as weights. If NULL (default), automatically #' detects columns with prefixes "mean_", "median_", "percent_", or "ratio_". +#' @param custom_interpolations A list of lists, each specifying a group of columns +#' and a custom interpolation function. Each element must have: +#' \describe{ +#' \item{columns}{Character vector of column names} +#' \item{fn}{A function or formula for interpolation. Receives two arguments: +#' `.x` (column values) and `.w` (allocation factors). Formulas using `~` +#' syntax (e.g., `~sum(.x * .w, na.rm = TRUE)`) are converted to functions +#' via `rlang::as_function()`.} +#' } +#' Columns in `custom_interpolations` must not overlap with `count_columns` or +#' `non_count_columns`. Default is NULL (no custom interpolations). #' @param return_intermediate Logical. If TRUE and crosswalk has multiple steps, #' returns a list containing both the final result and intermediate results #' from each step. Default is FALSE, which returns only the final result. @@ -151,6 +162,21 @@ #' # Access intermediate and final #' result$intermediate$step_1 # After first crosswalk #' result$final # Final result +#' +#' # Custom interpolation functions +#' result <- crosswalk_data( +#' data = my_data, +#' crosswalk = crosswalk, +#' custom_interpolations = list( +#' list( +#' columns = c("count_population", "count_housing"), +#' fn = ~sum(.x * .w, na.rm = TRUE) +#' ), +#' list( +#' columns = c("pct_poverty"), +#' fn = ~weighted.mean(.x, .w, na.rm = TRUE) +#' ) +#' )) #' } crosswalk_data <- function( @@ -165,6 +191,7 @@ crosswalk_data <- function( geoid_column = "source_geoid", count_columns = NULL, non_count_columns = NULL, + custom_interpolations = NULL, return_intermediate = FALSE, show_join_quality = TRUE, silent = getOption("crosswalk.silent", FALSE)) { @@ -222,14 +249,28 @@ crosswalk_data <- function( stringr::str_starts(data_columns, "ratio_")] } - if (length(count_columns) == 0 & length(non_count_columns) == 0) { + # Validate custom_interpolations if provided + if (!is.null(custom_interpolations)) { + custom_interpolations <- validate_custom_interpolations( + custom_interpolations, names(data), count_columns, non_count_columns) + } + + has_custom <- !is.null(custom_interpolations) && length(custom_interpolations) > 0 + + if (length(count_columns) == 0 & length(non_count_columns) == 0 & !has_custom) { stop( - "No columns to crosswalk. Either specify `count_columns` or `non_count_columns`, ", - "or ensure your data has columns with prefixes: count_, mean_, median_, percent_, or ratio_.") + "No columns to crosswalk. Either specify `count_columns`, `non_count_columns`, ", + "or `custom_interpolations`, or ensure your data has columns with prefixes: ", + "count_, mean_, median_, percent_, or ratio_.") } # Check that specified columns exist in original data - all_value_columns <- c(count_columns, non_count_columns) + custom_cols <- if (has_custom) { + unlist(purrr::map(custom_interpolations, "columns")) + } else { + character(0) + } + all_value_columns <- c(count_columns, non_count_columns, custom_cols) missing_columns <- setdiff(all_value_columns, names(data)) if (length(missing_columns) > 0) { stop( @@ -261,6 +302,7 @@ crosswalk_data <- function( geoid_column = current_geoid_column, count_columns = count_columns, non_count_columns = non_count_columns, + custom_interpolations = custom_interpolations, step_number = i, total_steps = n_steps, show_join_quality = show_join_quality) @@ -633,6 +675,79 @@ report_join_quality <- function(data, crosswalk, geoid_column, step_number = 1, } +#' Validate Custom Interpolations Parameter +#' +#' Internal function that validates and normalizes the custom_interpolations parameter. +#' Converts formula-style functions to proper functions with `.x` and `.w` parameters. +#' +#' @param custom_interpolations A list of lists with `columns` and `fn` elements. +#' @param data_columns Character vector of column names in data. +#' @param count_columns Character vector of count column names. +#' @param non_count_columns Character vector of non-count column names. +#' @return The validated and normalized custom_interpolations list. +#' @keywords internal +#' @noRd +validate_custom_interpolations <- function(custom_interpolations, data_columns, + count_columns, non_count_columns) { + if (!is.list(custom_interpolations)) { + stop("`custom_interpolations` must be a list of lists, each with `columns` and `fn` elements.") + } + + all_custom_cols <- character(0) + + for (i in seq_along(custom_interpolations)) { + group <- custom_interpolations[[i]] + + if (!is.list(group) || !all(c("columns", "fn") %in% names(group))) { + stop( + "`custom_interpolations[[", i, "]]` must be a list with `columns` and `fn` elements.") + } + + if (!is.character(group$columns) || length(group$columns) == 0) { + stop( + "`custom_interpolations[[", i, "]]$columns` must be a non-empty character vector.") + } + + # Convert formula to function with .x and .w parameters + if (rlang::is_formula(group$fn)) { + fn_body <- rlang::f_rhs(group$fn) + fn_env <- rlang::f_env(group$fn) + custom_interpolations[[i]]$fn <- rlang::new_function( + alist(.x = , .w = ), fn_body, fn_env) + } else if (!is.function(group$fn)) { + stop( + "`custom_interpolations[[", i, "]]$fn` must be a function or formula.") + } + + # Check for overlap within custom groups + overlap_within <- intersect(group$columns, all_custom_cols) + if (length(overlap_within) > 0) { + stop( + "Column(s) appear in multiple `custom_interpolations` groups: ", + paste(overlap_within, collapse = ", ")) + } + all_custom_cols <- c(all_custom_cols, group$columns) + } + + # Check for overlap with count_columns and non_count_columns + overlap_count <- intersect(all_custom_cols, count_columns) + if (length(overlap_count) > 0) { + stop( + "Column(s) in `custom_interpolations` overlap with `count_columns`: ", + paste(overlap_count, collapse = ", ")) + } + + overlap_non_count <- intersect(all_custom_cols, non_count_columns) + if (length(overlap_non_count) > 0) { + stop( + "Column(s) in `custom_interpolations` overlap with `non_count_columns`: ", + paste(overlap_non_count, collapse = ", ")) + } + + return(custom_interpolations) +} + + #' Apply a Single Crosswalk Step #' #' Internal function that applies one crosswalk tibble to data. @@ -642,6 +757,7 @@ report_join_quality <- function(data, crosswalk, geoid_column, step_number = 1, #' @param geoid_column Column name for source geoid #' @param count_columns Count variable columns #' @param non_count_columns Non-count variable columns +#' @param custom_interpolations Validated custom interpolation groups (or NULL) #' @param step_number Integer. Current step number for multi-step reporting. #' @param total_steps Integer. Total number of steps for multi-step reporting. #' @param show_join_quality Logical. Whether to report join quality diagnostics. @@ -654,6 +770,7 @@ apply_single_crosswalk <- function( geoid_column, count_columns, non_count_columns, + custom_interpolations = NULL, step_number = 1, total_steps = 1, show_join_quality = TRUE) { @@ -700,7 +817,22 @@ apply_single_crosswalk <- function( current_count_cols <- intersect(count_columns, names(data)) current_non_count_cols <- intersect(non_count_columns, names(data)) - # Identify "other" columns (not geoid, count, or non-count columns) + # Filter custom interpolation columns to those in current data + current_custom_interpolations <- if (!is.null(custom_interpolations)) { + purrr::map(custom_interpolations, function(group) { + list( + columns = intersect(group$columns, names(data)), + fn = group$fn) + }) |> + purrr::keep(~ length(.x$columns) > 0) + } else { + list() + } + + current_custom_cols <- unlist(purrr::map(current_custom_interpolations, "columns")) + if (is.null(current_custom_cols)) current_custom_cols <- character(0) + + # Identify "other" columns (not geoid, count, non-count, or custom columns) # These will be aggregated by taking the first non-missing value # Include both original crosswalk column names AND their renamed versions # (e.g., "geography_name" which comes from "target_geography_name" after renaming) @@ -714,41 +846,96 @@ apply_single_crosswalk <- function( "land_area_sqmi") other_cols <- setdiff( names(data), - c(geoid_column, current_count_cols, current_non_count_cols, crosswalk_cols) + c(geoid_column, current_count_cols, current_non_count_cols, + current_custom_cols, crosswalk_cols) ) + # All interpolated columns (for NA validity tracking) + all_interpolated_cols <- c(current_count_cols, current_non_count_cols, + current_custom_cols) # Join crosswalk to data - result <- data |> + joined_data <- data |> dplyr::mutate( dplyr::across(dplyr::all_of(geoid_column), as.character)) |> dplyr::left_join( crosswalk, - by = stats::setNames("source_geoid", geoid_column)) |> - tidytable::summarize( - .by = dplyr::all_of(group_cols), - ## count variables we take the sum of the weighted count variable - dplyr::across( - .cols = dplyr::all_of(current_count_cols), - .fns = ~ sum(.x * allocation_factor_source_to_target, na.rm = TRUE)), - ## non-count variables--means, medians, percentages, ratios, etc.-- - ## we take the weighted mean of the variable, weighted by the allocation factor - tidytable::across( - .cols = tidytable::all_of(current_non_count_cols), - .fns = ~ stats::weighted.mean(.x, allocation_factor_source_to_target, na.rm = TRUE)), - ## other columns: take first non-missing value (or NA if all missing) - tidytable::across( - .cols = tidytable::all_of(other_cols), - .fns = ~ dplyr::first(.x, na_rm = TRUE)), - tidytable::across( - .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)), - .fns = ~ sum(!is.na(.x)), - .names = "{.col}_validx")) |> - tidytable::mutate( - tidytable::across( - .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)), - .fns = ~ tidytable::if_else(get(stringr::str_c(tidytable::cur_column(), "_validx")) > 0, .x, NA))) |> - dplyr::select(-dplyr::matches("_validx$")) |> + by = stats::setNames("source_geoid", geoid_column)) + + # Standard columns for main summarize (count, non-count) + standard_cols <- c(current_count_cols, current_non_count_cols) + has_standard <- length(standard_cols) > 0 || length(other_cols) > 0 + + # Main summarize for standard columns (count, non-count, other) + if (has_standard) { + result <- joined_data |> + tidytable::summarize( + .by = dplyr::all_of(group_cols), + ## count variables: sum of weighted values + dplyr::across( + .cols = dplyr::all_of(current_count_cols), + .fns = ~ sum(.x * allocation_factor_source_to_target, na.rm = TRUE)), + ## non-count variables: weighted mean + tidytable::across( + .cols = tidytable::all_of(current_non_count_cols), + .fns = ~ stats::weighted.mean(.x, allocation_factor_source_to_target, na.rm = TRUE)), + ## other columns: first non-missing value + tidytable::across( + .cols = tidytable::all_of(other_cols), + .fns = ~ dplyr::first(.x, na_rm = TRUE)), + ## validity tracking for NA handling + tidytable::across( + .cols = tidytable::all_of(standard_cols), + .fns = ~ sum(!is.na(.x)), + .names = "{.col}_validx")) + + # Replace standard interpolated values with NA where all source values were NA + if (length(standard_cols) > 0) { + result <- result |> + tidytable::mutate( + tidytable::across( + .cols = tidytable::all_of(standard_cols), + .fns = ~ tidytable::if_else( + get(stringr::str_c(tidytable::cur_column(), "_validx")) > 0, + .x, NA))) + } + + result <- result |> + dplyr::select(-dplyr::matches("_validx$")) + } else { + # No standard columns; start with unique target groups + result <- joined_data |> + dplyr::distinct(dplyr::across(dplyr::all_of(group_cols))) + } + + # Compute custom interpolations in a separate pass and join back + if (length(current_custom_interpolations) > 0) { + for (group in current_custom_interpolations) { + group_fn <- group$fn + group_cols_i <- group$columns + + # Compute each custom column individually and join to result + custom_results <- purrr::map(group_cols_i, function(col_name) { + joined_data |> + tidytable::summarize( + .by = dplyr::all_of(group_cols), + val = group_fn(!!rlang::sym(col_name), + .w = allocation_factor_source_to_target), + validx = sum(!is.na(!!rlang::sym(col_name)))) |> + tidytable::mutate( + val = tidytable::if_else(validx > 0, val, NA)) |> + dplyr::select(-validx) |> + dplyr::rename(!!col_name := val) + }) + + for (custom_result in custom_results) { + result <- result |> + dplyr::left_join(custom_result, by = group_cols) + } + } + } + + result <- result |> dplyr::rename_with( .cols = dplyr::everything(), .fn = ~ stringr::str_remove_all(.x, "target_")) |> diff --git a/R/get_crosswalk.R b/R/get_crosswalk.R index df8e55d..483700b 100644 --- a/R/get_crosswalk.R +++ b/R/get_crosswalk.R @@ -45,7 +45,8 @@ #' @param source_geography Character. Source geography name. One of c("block", #' "block group", "tract", "place", "county", "urban_area", "zcta", "puma", #' "puma12", "puma22", "cd115", "cd116", "cd118", "cd119", "urban_area", -#' "core_based_statistical_area"). +#' "core_based_statistical_area"). Note: "aiannh" (tribal areas) is currently +#' supported as a **target** geography only, not as a source. #' @param target_year Character or numeric. Year of the target geography, one of #' c(1990, 2000, 2010, 2020) for decennial crosswalks, or c(2011, 2012, 2014, #' 2015, 2022) for non-census year crosswalks (limited to block groups, tracts, @@ -53,7 +54,9 @@ #' @param target_geography Character. Target geography name. One of c("block", #' "block group", "tract", "place", "county", "urban_area", "zcta", "puma", #' "puma12", "puma22", "cd115", "cd116", "cd118", "cd119", "urban_area", -#' "core_based_statistical_area"). +#' "core_based_statistical_area", "aiannh"). Tribal areas ("aiannh", American +#' Indian / Alaska Native / Native Hawaiian areas) are available via +#' GeoCorr 2022 only (target_year >= 2020). #' @param weight Character. Weighting variable for Geocorr crosswalks. One of #' c("population", "housing", "land"). #' @param cache Directory path. Where to download the crosswalk to. If NULL (default), @@ -470,6 +473,15 @@ get_available_crosswalks <- function() { source_year = 2022L, target_year = 2022L) + # aiannh (tribal areas) is supported as a target geography only, via GeoCorr 2022 + geocorr_2022_aiannh <- tibble::tibble( + source_geography = geocorr_2022_geographies, + target_geography = "aiannh", + source_year = 2022L, + target_year = 2022L) + + geocorr_2022 <- dplyr::bind_rows(geocorr_2022, geocorr_2022_aiannh) + # 3. Geocorr 2018: all pairwise combinations of 9 canonical geographies geocorr_2018_geographies <- c( "block", "block_group", "tract", "county", "place", diff --git a/R/get_geocorr_crosswalk.R b/R/get_geocorr_crosswalk.R index 4db677c..bc760e0 100644 --- a/R/get_geocorr_crosswalk.R +++ b/R/get_geocorr_crosswalk.R @@ -32,7 +32,11 @@ get_geocorr_config <- function(version = "2022") { "puma" = "puma22", "puma22" = "puma22", "cd118" = "cd118", - "cd119" = "cd119"), + "cd119" = "cd119", + "aiannh" = "aiannh", + "tribal" = "aiannh", + "tribal_area" = "aiannh", + "american_indian_area" = "aiannh"), puma_col = "puma22", cd_pattern = "^cd11[89]", has_name_columns = TRUE, @@ -117,10 +121,12 @@ resolve_geocorr_geography <- function(geography, geocorr_version = "2022") { #' of the geographies supported by GeoCorr. #' #' For GeoCorr 2022: c("place", "county", "tract", "blockgroup", "zcta", -#' "puma22", "cd119", "cd118"). +#' "puma22", "cd119", "cd118", "aiannh"). Note that "aiannh" (tribal areas) +#' is supported as a **target** geography only, not as a source. #' #' For GeoCorr 2018: c("placefp", "county", "tract", "bg", "zcta5", -#' "puma12", "cd115", "cd116"). +#' "puma12", "cd115", "cd116"). Tribal areas (aiannh) are not available in +#' GeoCorr 2018. #' #' User-facing names like "puma", "zcta", "place", "blockgroup" are #' automatically resolved to the correct API code for the version. @@ -151,6 +157,15 @@ get_geocorr_crosswalk <- function( source_api_code <- resolve_geocorr_geography(source_geography, geocorr_version) target_api_code <- resolve_geocorr_geography(target_geography, geocorr_version) + # Tribal areas (aiannh) are currently supported as a target geography only. + # Sourcing from aiannh would require re-normalizing allocation factors across + # per-state query chunks for tribal areas that span state boundaries. + if (source_api_code == "aiannh") { + stop( + "Tribal areas ('aiannh') are currently supported as a target geography ", + "only, not as a source. Please specify 'aiannh' via `target_geography`.") + } + outpath = "no file exists here" ## identify the relevant file paths for potentially-cached crosswalks if (!is.null(cache)) { @@ -329,6 +344,11 @@ get_geocorr_crosswalk <- function( dplyr::rename_with( .cols = dplyr::matches("zip_name"), .fn = ~ .x |> stringr::str_replace("zip", "zcta")) |> + ## defensive: ensure aiannh name column is snake_cased regardless of how + ## janitor::clean_names handled the original aiannhName field + dplyr::rename_with( + .cols = dplyr::matches("^aiannhname$"), + .fn = ~ stringr::str_replace(.x, "aiannhname", "aiannh_name")) |> dplyr::rename_with( .cols = dplyr::matches(stringr::str_c(config$puma_col, "name")), .fn = ~ .x |> stringr::str_replace( @@ -482,6 +502,16 @@ get_geocorr_crosswalk <- function( weighting_factor = weight, dplyr::across(.cols = dplyr::matches("allocation"), .fns = as.numeric)) + ## When target is aiannh, GeoCorr returns rows with NA target_geoid for the + ## portion of each source geography that falls outside any tribal area. + ## These rows carry no crosswalk information (there is no tribal area to + ## allocate to) so we drop them. After this filter, allocation factors per + ## source sum to <= 1 rather than exactly 1, which correctly reflects that + ## tribal areas do not cover all land. + if (target_api_code == "aiannh") { + df2 <- df2 |> dplyr::filter(!is.na(target_geoid)) + } + if (!is.null(cache)) { ## if the file does not already exist and cache is TRUE if (!file.exists(outpath) & !is.null(cache)) { diff --git a/R/get_nhgis_crosswalk.R b/R/get_nhgis_crosswalk.R index 2dbf6af..9d792b2 100644 --- a/R/get_nhgis_crosswalk.R +++ b/R/get_nhgis_crosswalk.R @@ -864,6 +864,20 @@ variable. Get your key at https://account.ipums.org/api_keys") } names_to = "weighting_factor", values_to = "allocation_factor_source_to_target") + # Pad 1990 tract GEOIDs to standard 11 chars. The 1990 Census used 4-digit + # tract codes (without the ".00" decimal suffix), producing 9-char GEOIDs in + # NHGIS data (state=2 + county=3 + tract=4). Standard Census format uses + # 6-digit tract codes (tract=4 + suffix=2), so we right-pad with "0" to + # restore the implicit ".00" suffix. + if (source_geography_standardized == "tr") { + crosswalk_df <- crosswalk_df |> + dplyr::mutate(source_geoid = stringr::str_pad(source_geoid, 11, "right", "0")) + } + if (target_geography_standardized == "tr") { + crosswalk_df <- crosswalk_df |> + dplyr::mutate(target_geoid = stringr::str_pad(target_geoid, 11, "right", "0")) + } + ## if the file does not already exist and cache is not NULL if (!file.exists(csv_path) & !is.null(cache)) { if (!dir.exists(cache)) { diff --git a/R/plan_crosswalk_chain.R b/R/plan_crosswalk_chain.R index 5c4535a..f7b7e27 100644 --- a/R/plan_crosswalk_chain.R +++ b/R/plan_crosswalk_chain.R @@ -56,6 +56,28 @@ plan_crosswalk_chain <- function( source_geog_std <- standardize_geography_for_chain(source_geography) target_geog_std <- standardize_geography_for_chain(target_geography) + # Tribal areas (aiannh) are target-only in the current implementation. + if (isTRUE(source_geog_std == "aiannh")) { + result$error <- stringr::str_c( + "Tribal areas ('aiannh') are currently supported as a target geography ", + "only, not as a source. Please specify 'aiannh' via `target_geography`.") + return(result) + } + + # Tribal areas are only available via GeoCorr 2022 (2020 Census geography). + # Reject requests that would route to GeoCorr 2018 (target_year 2010-2019). + if (isTRUE(target_geog_std == "aiannh") && !is.null(target_year)) { + target_year_num <- suppressWarnings(as.numeric(target_year)) + if (!is.na(target_year_num) && target_year_num < 2020) { + result$error <- stringr::str_c( + "Tribal area ('aiannh') crosswalks are only available via GeoCorr 2022 ", + "(2020 Census geography). The requested target_year ", target_year, + " would route to GeoCorr 2018, which does not support aiannh. ", + "Use target_year >= 2020.") + return(result) + } + } + # Convert years to character for consistent handling # Use NA_character_ instead of NULL for tibble compatibility @@ -326,6 +348,8 @@ standardize_geography_for_chain <- function(geography) { geography %in% c("cbsa", "core based statistical area") ~ "cbsa", geography %in% c("ua", "urban area", "urban areas") ~ "urban_area", geography %in% c("cd115", "cd116", "cd118", "cd119", "congressional district") ~ geography, + geography %in% c("aiannh", "tribal", "tribal area", "american indian area", + "reservation", "aian") ~ "aiannh", TRUE ~ geography) } diff --git a/man/crosswalk_data.Rd b/man/crosswalk_data.Rd index 97d9fcc..f823343 100644 --- a/man/crosswalk_data.Rd +++ b/man/crosswalk_data.Rd @@ -16,6 +16,7 @@ crosswalk_data( geoid_column = "source_geoid", count_columns = NULL, non_count_columns = NULL, + custom_interpolations = NULL, return_intermediate = FALSE, show_join_quality = TRUE, silent = getOption("crosswalk.silent", FALSE) @@ -66,6 +67,18 @@ mean, median, percentage, and ratio variables. These will be calculated as weigh means using the allocation factor as weights. If NULL (default), automatically detects columns with prefixes "mean_", "median_", "percent_", or "ratio_".} +\item{custom_interpolations}{A list of lists, each specifying a group of columns +and a custom interpolation function. Each element must have: +\describe{ +\item{columns}{Character vector of column names} +\item{fn}{A function or formula for interpolation. Receives two arguments: +\code{.x} (column values) and \code{.w} (allocation factors). Formulas using \code{~} +syntax (e.g., \code{~sum(.x * .w, na.rm = TRUE)}) are converted to functions +via \code{rlang::as_function()}.} +} +Columns in \code{custom_interpolations} must not overlap with \code{count_columns} or +\code{non_count_columns}. Default is NULL (no custom interpolations).} + \item{return_intermediate}{Logical. If TRUE and crosswalk has multiple steps, returns a list containing both the final result and intermediate results from each step. Default is FALSE, which returns only the final result.} @@ -186,5 +199,20 @@ result <- crosswalk_data( # Access intermediate and final result$intermediate$step_1 # After first crosswalk result$final # Final result + +# Custom interpolation functions +result <- crosswalk_data( + data = my_data, + crosswalk = crosswalk, + custom_interpolations = list( + list( + columns = c("count_population", "count_housing"), + fn = ~sum(.x * .w, na.rm = TRUE) + ), + list( + columns = c("pct_poverty"), + fn = ~weighted.mean(.x, .w, na.rm = TRUE) + ) + )) } } diff --git a/man/get_crosswalk.Rd b/man/get_crosswalk.Rd index 84d0728..3e20c7e 100644 --- a/man/get_crosswalk.Rd +++ b/man/get_crosswalk.Rd @@ -18,12 +18,15 @@ get_crosswalk( \item{source_geography}{Character. Source geography name. One of c("block", "block group", "tract", "place", "county", "urban_area", "zcta", "puma", "puma12", "puma22", "cd115", "cd116", "cd118", "cd119", "urban_area", -"core_based_statistical_area").} +"core_based_statistical_area"). Note: "aiannh" (tribal areas) is currently +supported as a \strong{target} geography only, not as a source.} \item{target_geography}{Character. Target geography name. One of c("block", "block group", "tract", "place", "county", "urban_area", "zcta", "puma", "puma12", "puma22", "cd115", "cd116", "cd118", "cd119", "urban_area", -"core_based_statistical_area").} +"core_based_statistical_area", "aiannh"). Tribal areas ("aiannh", American +Indian / Alaska Native / Native Hawaiian areas) are available via +GeoCorr 2022 only (target_year >= 2020).} \item{source_year}{Character or numeric. Year of the source geography, one of c(1990, 2000, 2010, 2020).} diff --git a/tests/testthat/test-get_available_crosswalks.R b/tests/testthat/test-get_available_crosswalks.R index b409c54..60e0dc9 100644 --- a/tests/testthat/test-get_available_crosswalks.R +++ b/tests/testthat/test-get_available_crosswalks.R @@ -109,3 +109,49 @@ test_that("get_available_crosswalks contains CTData rows", { expect_equal(nrow(ctdata_match), 1) }) + +# ============================================================================== +# Content tests - aiannh (tribal area) rows +# ============================================================================== + +test_that("get_available_crosswalks contains aiannh as target from GeoCorr 2022 sources", { + result <- get_available_crosswalks() + + # tract -> aiannh (2022) + tract_aiannh <- result |> + dplyr::filter( + source_geography == "tract", + target_geography == "aiannh", + source_year == 2022L, + target_year == 2022L) + expect_equal(nrow(tract_aiannh), 1) + + # county -> aiannh (2022) + county_aiannh <- result |> + dplyr::filter( + source_geography == "county", + target_geography == "aiannh", + source_year == 2022L, + target_year == 2022L) + expect_equal(nrow(county_aiannh), 1) +}) + +test_that("get_available_crosswalks does NOT list aiannh as a source geography", { + result <- get_available_crosswalks() + + aiannh_sources <- result |> + dplyr::filter(source_geography == "aiannh") + + expect_equal(nrow(aiannh_sources), 0) +}) + +test_that("get_available_crosswalks does NOT list aiannh under GeoCorr 2018", { + result <- get_available_crosswalks() + + aiannh_2018 <- result |> + dplyr::filter( + target_geography == "aiannh", + target_year == 2018L) + + expect_equal(nrow(aiannh_2018), 0) +}) diff --git a/tests/testthat/test-get_geocorr_crosswalk.R b/tests/testthat/test-get_geocorr_crosswalk.R index 197e2bc..0905f18 100644 --- a/tests/testthat/test-get_geocorr_crosswalk.R +++ b/tests/testthat/test-get_geocorr_crosswalk.R @@ -325,3 +325,119 @@ test_that("get_crosswalk routes to Geocorr when no years specified", { metadata <- attr(result$crosswalks$step_1, "crosswalk_metadata") expect_equal(metadata$data_source, "geocorr") }) + +# ============================================================================== +# Tribal area (aiannh) tests +# ============================================================================== + +test_that("get_geocorr_crosswalk handles tract to aiannh target", { + skip_if_offline() + + result <- crosswalk:::get_geocorr_crosswalk( + source_geography = "tract", + target_geography = "aiannh", + weight = "population") + + expect_s3_class(result, "tbl_df") + expect_equal(unique(result$source_geography), "tract") + expect_equal(unique(result$target_geography), "aiannh") + + # NA target rows (tracts outside any tribal area) are filtered out + expect_false(any(is.na(result$target_geoid))) + + # aiannh GEOIDs are 4-character federal codes + expect_true(all(stringr::str_length(result$target_geoid) == 4)) + + # Tract GEOIDs are still 11 characters + expect_true(all(stringr::str_length(result$source_geoid) == 11)) + + # Standard columns present + expected_cols <- c( + "source_geoid", "target_geoid", + "source_geography_name", "target_geography_name", + "allocation_factor_source_to_target", + "weighting_factor") + expect_true(all(expected_cols %in% colnames(result))) + + # Allocation factors in valid range + expect_true(all(result$allocation_factor_source_to_target >= 0)) + expect_true(all(result$allocation_factor_source_to_target <= 1)) + + # Allocation factors per source sum to <= 1 (not exactly 1), because tribal + # areas do not cover all land — any fraction of a source tract that falls + # outside tribal areas is intentionally dropped. + source_sums <- result |> + dplyr::summarize( + total = sum(allocation_factor_source_to_target, na.rm = TRUE), + .by = "source_geoid") + expect_true(all(source_sums$total <= 1 + 0.01)) +}) + +test_that("get_geocorr_crosswalk handles county to aiannh target", { + skip_if_offline() + + result <- crosswalk:::get_geocorr_crosswalk( + source_geography = "county", + target_geography = "aiannh", + weight = "population") + + expect_s3_class(result, "tbl_df") + expect_equal(unique(result$target_geography), "aiannh") + expect_true(all(stringr::str_length(result$target_geoid) == 4)) +}) + +test_that("get_geocorr_crosswalk rejects aiannh as source geography", { + expect_error( + crosswalk:::get_geocorr_crosswalk( + source_geography = "aiannh", + target_geography = "tract", + weight = "population"), + regexp = "target geography only") +}) + +test_that("get_geocorr_crosswalk rejects aiannh on GeoCorr 2018", { + # GeoCorr 2018 does not support aiannh; should error from resolve_geocorr_geography + expect_error( + crosswalk:::get_geocorr_crosswalk( + source_geography = "tract", + target_geography = "aiannh", + weight = "population", + geocorr_version = "2018"), + regexp = "not supported by GeoCorr 2018") +}) + +test_that("get_crosswalk handles tract to aiannh via top-level entry point", { + skip_if_offline() + + result <- get_crosswalk( + source_geography = "tract", + target_geography = "aiannh", + weight = "population") + + expect_type(result, "list") + expect_true("step_1" %in% names(result$crosswalks)) + + metadata <- attr(result$crosswalks$step_1, "crosswalk_metadata") + expect_equal(metadata$data_source, "geocorr") + expect_equal(metadata$target_geography, "aiannh") +}) + +test_that("get_crosswalk rejects pre-2020 target_year for aiannh target", { + expect_error( + get_crosswalk( + source_geography = "tract", + target_geography = "aiannh", + source_year = 2015, + target_year = 2015, + weight = "population"), + regexp = "GeoCorr 2022") +}) + +test_that("get_crosswalk rejects aiannh as source via top-level entry point", { + expect_error( + get_crosswalk( + source_geography = "aiannh", + target_geography = "tract", + weight = "population"), + regexp = "target geography only") +}) diff --git a/tests/testthat/test-plan_crosswalk_chain.R b/tests/testthat/test-plan_crosswalk_chain.R index 9b06d68..0bdf966 100644 --- a/tests/testthat/test-plan_crosswalk_chain.R +++ b/tests/testthat/test-plan_crosswalk_chain.R @@ -255,6 +255,61 @@ test_that("standardize_geography_for_chain handles 2010-era geography aliases", expect_equal(crosswalk:::standardize_geography_for_chain("cd116"), "cd116") }) +test_that("standardize_geography_for_chain handles aiannh aliases", { + expect_equal(crosswalk:::standardize_geography_for_chain("aiannh"), "aiannh") + expect_equal(crosswalk:::standardize_geography_for_chain("tribal"), "aiannh") + expect_equal(crosswalk:::standardize_geography_for_chain("tribal area"), "aiannh") + expect_equal(crosswalk:::standardize_geography_for_chain("tribal_area"), "aiannh") + expect_equal(crosswalk:::standardize_geography_for_chain("american indian area"), "aiannh") + expect_equal(crosswalk:::standardize_geography_for_chain("reservation"), "aiannh") + expect_equal(crosswalk:::standardize_geography_for_chain("aian"), "aiannh") +}) + +test_that("plan_crosswalk_chain rejects aiannh as a source geography", { + plan <- plan_crosswalk_chain( + source_geography = "aiannh", + target_geography = "tract", + source_year = 2020, + target_year = 2020) + + expect_false(is.null(plan$error)) + expect_true(stringr::str_detect(plan$error, "target geography only")) +}) + +test_that("plan_crosswalk_chain rejects aiannh target with pre-2020 target_year", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "aiannh", + source_year = 2015, + target_year = 2015) + + expect_false(is.null(plan$error)) + expect_true(stringr::str_detect(plan$error, "GeoCorr 2022")) +}) + +test_that("plan_crosswalk_chain accepts aiannh target with 2020+ target_year", { + # Same-year: direct GeoCorr 2022 step + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "aiannh", + source_year = 2020, + target_year = 2020) + expect_true(is.null(plan$error)) + expect_false(plan$is_multi_step) + expect_equal(plan$steps$crosswalk_source[1], "geocorr") + + # Multi-step: NHGIS temporal + GeoCorr 2022 geography + plan_multi <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "aiannh", + source_year = 2010, + target_year = 2020) + expect_true(is.null(plan_multi$error)) + expect_true(plan_multi$is_multi_step) + expect_equal(plan_multi$steps$crosswalk_source[nrow(plan_multi$steps)], "geocorr") + expect_equal(plan_multi$steps$target_geography[nrow(plan_multi$steps)], "aiannh") +}) + test_that("determine_temporal_source returns correct source", { expect_equal(crosswalk:::determine_temporal_source("2010", "2020"), "nhgis") expect_equal(crosswalk:::determine_temporal_source("2020", "2022"), "ctdata_2020_2022")