Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
export("%>%")
export(calculate_custom_geographies)
export(compile_acs_data)
export(define_across_percent)
export(define_across_sum)
export(define_metadata)
export(define_one_minus)
export(define_percent)
export(filter_variables)
export(get_acs_codebook)
export(list_acs_variables)
Expand Down
107 changes: 85 additions & 22 deletions R/calculate_cvs.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ se_sum = function(...) {
data_long %>% dplyr::filter(is.na(estimate)),
data_long %>%
dplyr::filter(estimate == 0) %>%
dplyr::slice_max(order_by = moe, by = "observation", n = 1)) %>%
dplyr::slice_max(order_by = moe, by = "observation", n = 1, with_ties = FALSE)) %>%
dplyr::group_split(observation) %>%
purrr::map(~ .x %>% dplyr::pull(moe)) %>%
purrr::map(se_simple) %>%
Expand Down Expand Up @@ -104,6 +104,8 @@ se_proportion_ratio = function(
((1 / estimate_denominator) * sqrt( radical_term_one + radical_term_two )),
((1 / estimate_denominator) * sqrt( radical_term_one - radical_term_two )))

se = dplyr::if_else(estimate_denominator == 0, NA_real_, se)

return(se)
}

Expand Down Expand Up @@ -229,6 +231,21 @@ se_weighted_mean = function(
return(se)
}

#' @title Calculate a coefficient of variation
#' @details Return a coefficient of variation at the 90% level
#' @param estimate The estimate
#' @param se The standard error
#' @returns A coefficient of variation at the 90% level
cv = function(estimate, se) {
cv = se / estimate * 100

## when the estimate is zero, this produces an infinite value
## replacing this with an NA value
cv = dplyr::if_else(is.infinite(cv), NA, cv)

return(cv)
}

#' @title Calculate margins of error for derived variables
#' @details Calculates margins of error for all derived ACS estimates. Standard
#' errors are computed internally as an intermediate step but are not included
Expand Down Expand Up @@ -269,7 +286,7 @@ calculate_moes = function(.df) {

## all variables for which to calculate CVs
cv_variables = codebook1 %>%
dplyr::filter(!se_calculation_type %in% c("metadata", "unknown")) %>%
dplyr::filter(!se_calculation_type %in% c("metadata", "unknown", "weighted_average")) %>%
dplyr::pull(calculated_variable)

## a named list of variables, grouped by SE calculation type
Expand Down Expand Up @@ -317,6 +334,7 @@ calculate_moes = function(.df) {
.fns = function(x) {

current_column = dplyr::cur_column()
original_column = current_column

## for "one minus" variables, use the underlying variable for error calculation
if (current_column %in% se_types[["one_minus"]]) {
Expand All @@ -340,51 +358,95 @@ calculate_moes = function(.df) {
denominator_moe_variables = denominator_estimate_variables %>%
stringr::str_c("_M")

## Get pre-parsed subtract variables from codebook
## SE of a difference uses the same formula as SE of a sum:
## SE(A - B) = sqrt(SE_A^2 + SE_B^2)
numerator_subtract_estimate_variables = codebook_row %>%
dplyr::pull(numerator_subtract_vars) %>%
unlist()
numerator_subtract_moe_variables = numerator_subtract_estimate_variables %>%
stringr::str_c("_M")
denominator_subtract_estimate_variables = codebook_row %>%
dplyr::pull(denominator_subtract_vars) %>%
unlist()
denominator_subtract_moe_variables = denominator_subtract_estimate_variables %>%
stringr::str_c("_M")

## combine additive and subtractive variables for SE calculations
all_numerator_estimate_variables = c(numerator_estimate_variables, numerator_subtract_estimate_variables)
all_numerator_moe_variables = c(numerator_moe_variables, numerator_subtract_moe_variables)
all_denominator_estimate_variables = c(denominator_estimate_variables, denominator_subtract_estimate_variables)
all_denominator_moe_variables = c(denominator_moe_variables, denominator_subtract_moe_variables)

## for variables where we already have an MOE, this is simple
if (current_column %in% c(se_types$raw, se_types$sum)) {
SE = se_simple(get(dplyr::cur_column() %>% paste0("_M")))}
SE = se_simple(get(current_column %>% paste0("_M")))

## for simple percent variables with one numerator, one denominator
if (current_column %in% se_types[["simple_percent"]]) {
} else if (current_column %in% se_types[["simple_percent"]]) {
SE = se_proportion_ratio(
estimate_numerator = get(numerator_estimate_variables),
estimate_denominator = get(denominator_estimate_variables),
moe_numerator = get(numerator_moe_variables),
moe_denominator = get(denominator_moe_variables))}
moe_denominator = get(denominator_moe_variables))

## for percents with summed/subtracted numerators, one denominator
if (current_column %in% se_types[["complex_numerator"]]) {
} else if (current_column %in% se_types[["complex_numerator"]]) {
numerator_estimate = rowSums(dplyr::select(., dplyr::all_of(numerator_estimate_variables)))
if (length(numerator_subtract_estimate_variables) > 0) {
numerator_estimate = numerator_estimate -
rowSums(dplyr::select(., dplyr::all_of(numerator_subtract_estimate_variables)))
}
SE = se_proportion_ratio(
estimate_numerator = rowSums(dplyr::select(., dplyr::all_of(numerator_estimate_variables))),
estimate_numerator = numerator_estimate,
estimate_denominator = get(denominator_estimate_variables),
se_numerator = se_sum(
purrr::map(numerator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)),
purrr::map(numerator_estimate_variables, ~ df_with_sum_moes %>% dplyr::pull(.x))),
purrr::map(all_numerator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)),
purrr::map(all_numerator_estimate_variables, ~ df_with_sum_moes %>% dplyr::pull(.x))),
se_denominator = se_simple(
purrr::map(denominator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)) %>% unlist()))}
purrr::map(denominator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)) %>% unlist()))

## for percents with one numerator, summed/subtracted denominators
if (current_column %in% se_types[["complex_denominator"]]) {
} else if (current_column %in% se_types[["complex_denominator"]]) {
denominator_estimate = rowSums(dplyr::select(., dplyr::all_of(denominator_estimate_variables)))
if (length(denominator_subtract_estimate_variables) > 0) {
denominator_estimate = denominator_estimate -
rowSums(dplyr::select(., dplyr::all_of(denominator_subtract_estimate_variables)))
}
SE = se_proportion_ratio(
estimate_numerator = get(numerator_estimate_variables),
estimate_denominator = rowSums(dplyr::select(., dplyr::all_of(denominator_estimate_variables))),
estimate_denominator = denominator_estimate,
se_numerator = se_simple(
purrr::map(numerator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)) %>% unlist()),
se_denominator = se_sum(
purrr::map(denominator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)),
purrr::map(denominator_estimate_variables, ~ df_with_sum_moes %>% dplyr::pull(.x))))}
purrr::map(all_denominator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)),
purrr::map(all_denominator_estimate_variables, ~ df_with_sum_moes %>% dplyr::pull(.x))))

## for percents with summed numerators and summed denominators
if (current_column %in% se_types[["complex_both"]]) {
} else if (current_column %in% se_types[["complex_both"]]) {
numerator_estimate = rowSums(dplyr::select(., dplyr::all_of(numerator_estimate_variables)))
if (length(numerator_subtract_estimate_variables) > 0) {
numerator_estimate = numerator_estimate -
rowSums(dplyr::select(., dplyr::all_of(numerator_subtract_estimate_variables)))
}
denominator_estimate = rowSums(dplyr::select(., dplyr::all_of(denominator_estimate_variables)))
if (length(denominator_subtract_estimate_variables) > 0) {
denominator_estimate = denominator_estimate -
rowSums(dplyr::select(., dplyr::all_of(denominator_subtract_estimate_variables)))
}
SE = se_proportion_ratio(
estimate_numerator = rowSums(dplyr::select(., dplyr::all_of(numerator_estimate_variables))),
estimate_denominator = rowSums(dplyr::select(., dplyr::all_of(denominator_estimate_variables))),
estimate_numerator = numerator_estimate,
estimate_denominator = denominator_estimate,
se_numerator = se_sum(
purrr::map(numerator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)),
purrr::map(numerator_estimate_variables, ~ df_with_sum_moes %>% dplyr::pull(.x))),
purrr::map(all_numerator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)),
purrr::map(all_numerator_estimate_variables, ~ df_with_sum_moes %>% dplyr::pull(.x))),
se_denominator = se_sum(
purrr::map(denominator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)),
purrr::map(denominator_estimate_variables, ~ df_with_sum_moes %>% dplyr::pull(.x))))}
purrr::map(all_denominator_moe_variables, ~ df_with_sum_moes %>% dplyr::pull(.x)),
purrr::map(all_denominator_estimate_variables, ~ df_with_sum_moes %>% dplyr::pull(.x))))

} else {
stop(paste0("Unhandled SE calculation type for variable: ", original_column))
}

return(SE)},
.names = "{.col}_SE"))
Expand Down Expand Up @@ -417,4 +479,5 @@ calculate_moes = function(.df) {

utils::globalVariables(c(
"calculated_variable", "observation", "type", "estimate", "moe",
"se_calculation_type", "numerator_vars", "denominator_vars"))
"se_calculation_type", "numerator_vars", "numerator_subtract_vars",
"denominator_vars", "denominator_subtract_vars"))
67 changes: 59 additions & 8 deletions R/compile_acs_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ safe_divide = function(x, y) { dplyr::if_else(y == 0, 0, x / y) }
#' @description Construct measures frequently used in social sciences
#' research, leveraging \code{tidycensus::get_acs()} to acquire raw estimates from
#' the Census Bureau API.
#' @param tables A character vector of table names to include. Two formats are
#' accepted:
#' @param tables A character vector, list, or NULL specifying which data to
#' include. Three kinds of elements are accepted and can be mixed freely
#' inside a \code{list()}:
#' \itemize{
#' \item \strong{Registered table names} (e.g., \code{"race"}, \code{"snap"}).
#' These are pre-built tables with curated variable definitions. Use
Expand All @@ -28,8 +29,16 @@ safe_divide = function(x, y) { dplyr::if_else(y == 0, 0, x / y) }
#' label hierarchy is parsed, and percentages are computed automatically.
#' Use the \code{denominator} parameter to control how percentages are
#' calculated for these tables.
#' \item \strong{DSL definition objects} created with \code{\link{define_percent}},
#' \code{\link{define_across_percent}}, \code{\link{define_across_sum}},
#' \code{\link{define_one_minus}}, or \code{\link{define_metadata}}.
#' These let you compute custom derived variables from the columns
#' produced by the tables you request. User definitions are executed
#' after all registered and auto-table definitions, and their results
#' appear in the codebook and have MOEs computed automatically.
#' }
#' Both formats can be mixed freely (e.g., \code{c("snap", "B25070")}).
#' When mixing strings and definitions, wrap everything in \code{list()}
#' (e.g., \code{list("snap", define_percent(...))}).
#' If an ACS code corresponds to an already-registered table, the registered
#' version is used automatically.
#' When NULL (default), all registered tables are included (unregistered ACS
Expand Down Expand Up @@ -78,6 +87,16 @@ safe_divide = function(x, y) { dplyr::if_else(y == 0, 0, x / y) }
#' ## Use table total as denominator instead of parent subtotals
#' df = compile_acs_data(tables = "B25070", denominator = "total",
#' years = 2022, geography = "state", states = "DC")
#'
#' ## Add a custom derived variable alongside a registered table
#' df = compile_acs_data(
#' tables = list(
#' "snap",
#' define_percent("snap_not_received_percent",
#' numerator_variables = c("snap_universe", "snap_received"),
#' numerator_subtract_variables = c("snap_received"),
#' denominator_variables = c("snap_universe"))),
#' years = 2022, geography = "county", states = "DC")
#' }
#' @export
#' @importFrom magrittr %>%
Expand Down Expand Up @@ -132,10 +151,24 @@ compile_acs_data = function(
stop(paste0("`denominator` must be \"parent\", \"total\", or a valid ACS variable code (e.g., \"B25070_001\"). Got: \"", denominator, "\"."))
}

####----Partition tables into registry vs auto (raw ACS codes)----####
####----Partition tables into registry vs auto vs user definitions----####
auto_table_entries = list()
registry_tables = tables
raw_acs_codes = character(0)
user_definitions = list()
has_explicit_tables = !is.null(tables)

if (!is.null(tables)) {
## separate DSL definitions from string elements
if (is.list(tables) && !is.character(tables)) {
user_definitions = purrr::keep(tables, is_dsl_definition)
string_elements = purrr::keep(tables, function(x) is.character(x) && length(x) == 1)
tables = if (length(string_elements) > 0) as.character(string_elements) else NULL
}

## validate user definitions structurally (fail fast)
purrr::walk(user_definitions, validate_definition)
}

if (!is.null(tables)) {
construct_map = build_construct_map()
Expand Down Expand Up @@ -196,9 +229,13 @@ compile_acs_data = function(

####----Resolve tables and variables via the registry----####
## resolve which tables to include
if (is.null(tables)) {
## default: all internal table names
if (is.null(tables) && !has_explicit_tables) {
## default: all internal table names (user passed tables = NULL)
resolved_tables = names(.table_registry$tables)
} else if (is.null(tables) && has_explicit_tables) {
## user passed only definitions, no string tables — include total_population only
resolved_tables = resolve_tables(tables = NULL)
resolved_tables = "total_population"
} else {
## pass only registry tables (not raw ACS codes) to resolve_tables
registry_tables_input = if (length(registry_tables) > 0) registry_tables else NULL
Expand Down Expand Up @@ -243,6 +280,11 @@ compile_acs_data = function(
variables = c(variables, auto_variables)
}

## resolve ACS codes in user definitions to clean column names
if (length(user_definitions) > 0) {
user_definitions = resolve_definition_variables(user_definitions, variables)
}

## default values for the states argument
if (length(states) == 0) {
states = tigris::fips_codes %>%
Expand Down Expand Up @@ -434,10 +476,18 @@ this function returns.")}
}, .init = df_calculated_estimates)
}

## apply user-supplied definitions
if (length(user_definitions) > 0) {
validate_definition_variables(user_definitions, colnames(df_calculated_estimates))
check_multi_table_variables(user_definitions, resolved_tables, auto_table_entries)
df_calculated_estimates = execute_definitions(df_calculated_estimates, user_definitions)
}

####----Generate codebook----####
codebook = generate_codebook(.data = df_calculated_estimates,
resolved_tables = resolved_tables,
auto_table_entries = auto_table_entries)
auto_table_entries = auto_table_entries,
user_definitions = user_definitions)

df_calculated_estimates = df_calculated_estimates %>%
## ensure the vintage of the data and the GEOID for each observation are the first columns
Expand Down Expand Up @@ -478,4 +528,5 @@ utils::globalVariables(c(
"ALAND", "AWATER", "area_land_sq_kilometer", "area_water_sq_kilometer", "total_population_universe",
"state", "GEOID", "data_source_year", ".",
"state_code", "county_code", "county_fips", "state_name", "county",
"needs_tigris", "resolved_tables", "auto_table_entries"))
"needs_tigris", "resolved_tables", "auto_table_entries", "user_definitions",
"has_explicit_tables"))
13 changes: 12 additions & 1 deletion R/generate_codebook.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#' table registry. When NULL (default), all registered tables are used.
#' @param auto_table_entries A list of auto-generated table entries from
#' \code{build_auto_table_entry()}. Default is an empty list.
#' @param user_definitions A list of user-supplied DSL definition objects
#' (e.g., from \code{define_percent()}). Default is an empty list.
#' @returns A tibble containing the names and definitions of variables returned from
#' \code{urbnindicators::compile_acs_data()}.
#' @examples
Expand All @@ -24,7 +26,7 @@
#' @importFrom magrittr %>%
#' @keywords internal

generate_codebook = function(.data, resolved_tables = NULL, auto_table_entries = list()) {
generate_codebook = function(.data, resolved_tables = NULL, auto_table_entries = list(), user_definitions = list()) {

.data = .data %>%
sf::st_drop_geometry()
Expand Down Expand Up @@ -140,6 +142,15 @@ generate_codebook = function(.data, resolved_tables = NULL, auto_table_entries =
partial_documentation = dplyr::bind_rows(partial_documentation, auto_documentation)
}

## Expand user-supplied definitions into codebook rows
if (length(user_definitions) > 0) {
user_documentation = purrr::map(
user_definitions,
~ expand_codebook_entry(entry = .x, .data = .data, crosswalk = variable_name_crosswalk)) %>% purrr::list_rbind()

partial_documentation = dplyr::bind_rows(partial_documentation, user_documentation)
}

####----Raw Variables----####
## collect all raw variable clean names from the resolved tables
raw_variable_names = variable_name_crosswalk$clean_name %>%
Expand Down
2 changes: 1 addition & 1 deletion R/list_acs_variables.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ list_acs_variables = function(year = "2022", tables = NULL) {
#' get_acs_codebook() %>% dplyr::filter(stringr::str_detect(variable_clean, "snap"))
#' }
#' @export
get_acs_codebook = function(year = 2022, table = NULL) {
get_acs_codebook = function(year = 2024, table = NULL) {
suppressWarnings({suppressMessages({
census_variables = tidycensus::load_variables(year = year, dataset = "acs5")
})})
Expand Down
Loading
Loading