diff --git a/R/calculate_custom_geographies.R b/R/calculate_custom_geographies.R index 1690519..36cce43 100644 --- a/R/calculate_custom_geographies.R +++ b/R/calculate_custom_geographies.R @@ -433,7 +433,8 @@ calculate_custom_geographies = function( paste0(definition, " [Percentage recalculated from summed components.]"), aggregation_strategy == "weighted_average" ~ paste0(definition, " [Aggregated via population-weighted average using ", weight_variable, ".]"), - TRUE ~ definition)) + TRUE ~ definition)) %>% + dplyr::select(calculated_variable, variable_type, definition, dplyr::everything()) attr(result, "codebook") = updated_codebook diff --git a/R/compile_acs_data.R b/R/compile_acs_data.R index 1a7ab43..8953313 100644 --- a/R/compile_acs_data.R +++ b/R/compile_acs_data.R @@ -332,7 +332,8 @@ this function returns.")} })}) ## attach the codebook and resolved tables as attributes to the returned dataset - attr(df_cvs, "codebook") = codebook + attr(df_cvs, "codebook") = codebook %>% + dplyr::select(calculated_variable, variable_type, definition, dplyr::everything()) attr(df_cvs, "resolved_tables") = resolved_tables if (isTRUE(spatial)) { df_cvs = sf::st_as_sf(df_cvs) } diff --git a/README.Rmd b/README.Rmd index 43d7230..bae9d07 100644 --- a/README.Rmd +++ b/README.Rmd @@ -37,8 +37,11 @@ the American Community Survey (ACS). With a single function call, you get: -- Access to hundreds of standardized variables, such as percentages, in addition - to the raw count variables used to produce them. +- Access to hundreds of standardized variables, such as percentages and + the raw count variables used to produce them. + +- Margins of error and coefficients of variation for all + variables--those direct from the API as well as derived variables. - Meaningful, consistent variable names. @@ -50,11 +53,11 @@ With a single function call, you get: - Supplemental measures, such as population density, that aren't available from the ACS. 
-- Built-in quality checks to help ensure that calculated variables are - accurate. Plus some good, old-fashioned manual QC. +- Built-in quality checks to help ensure that calculated variables + and measures of error are accurate. Plus some good, old-fashioned manual QC. + That said--use at your own risk. We cannot and do not guarantee there aren't bugs. + -- Margins of error and coefficients of variation for all - variables--those direct from the API as well as derived variables. # Installation @@ -62,11 +65,11 @@ Install the development version of `urbnindicators` from [GitHub](https://github.com/) with: ```r -install.packages("renv") +# install.packages("renv") renv::install("UI-Research/urbnindicators") ``` -You'll need a Census API key +You'll want a Census API key ([request one here](https://api.census.gov/data/key_signup.html)). Set it once with: @@ -95,8 +98,11 @@ library(urbnindicators) ## Discover Available Data ```{r, warning = FALSE, message = FALSE} -list_tables() -list_variables() +list_tables() |> head(10) +``` + +```{r, warning = FALSE, message = FALSE} +list_variables() |> head(10) ``` ## Obtain Data @@ -112,7 +118,7 @@ df = compile_acs_data( geography = "county", states = "NJ") -glimpse(df) +glimpse(df) |> head(10) ``` ## Visualize Data @@ -136,8 +142,9 @@ plot_data = df %>% data_source_year = factor(data_source_year)) state_averages = plot_data %>% - group_by(data_source_year) %>% - summarize(mean_pct = mean(race_personofcolor_percent)) %>% + summarize( + .by = data_source_year, + mean_pct = mean(race_personofcolor_percent)) %>% arrange(data_source_year) %>% pull(mean_pct) @@ -156,9 +163,7 @@ dumbbell_data = plot_data %>% names_from = data_source_year, values_from = race_personofcolor_percent, names_prefix = "year_") -``` -```{r, warning = FALSE, message = FALSE} ggplot() + geom_segment( data = dumbbell_data, @@ -210,7 +215,7 @@ ggplot() + title = "All NJ Counties Experienced Racial Diversification from 2019 to 2024", subtitle = paste0("Share of 
population who are people of color, by county, 2019-2024 Confidence intervals are presented around each point but are extremely small"), - x = "County", + x = "", y = "Share of population who are people of color") + scale_x_discrete(expand = expansion(mult = c(.03, .04))) + scale_y_continuous( @@ -223,8 +228,7 @@ Confidence intervals are presented around each point but are extremely small"), # Learn More -A growing number of vignettes aim to support users in effectively using -this package. These vignettes include: +Check out the vignettes for additional details: - A package overview to help users [**Get Started**](articles/urbnindicators.html). @@ -241,6 +245,10 @@ this package. These vignettes include: Error**](articles/quantified-survey-error.html) can improve inference making. +- You can re-create your indicators and their measures of error + for [**Custom Geographies**](articles/custom-geographies.html). + Neighborhoods? Unincorporated counties? Start here. + # Credits This package is built on top of and enormously indebted to diff --git a/README.md b/README.md index 8c98aa5..c5d920c 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,11 @@ the American Community Survey (ACS). With a single function call, you get: -- Access to hundreds of standardized variables, such as percentages, in - addition to the raw count variables used to produce them. +- Access to hundreds of standardized variables, such as percentages and + the raw count variables used to produce them. + +- Margins of error and coefficients of variation for all variables–those + direct from the API as well as derived variables. - Meaningful, consistent variable names. @@ -34,11 +37,10 @@ With a single function call, you get: - Supplemental measures, such as population density, that aren’t available from the ACS. -- Built-in quality checks to help ensure that calculated variables are - accurate. Plus some good, old-fashioned manual QC. 
- -- Margins of error and coefficients of variation for all variables–those - direct from the API as well as derived variables. +- Built-in quality checks to help ensure that calculated variables and + measures of error are accurate. Plus some good, old-fashioned manual + QC. That said–use at your own risk. We cannot and do not guarantee + there aren’t bugs. # Installation @@ -46,11 +48,11 @@ Install the development version of `urbnindicators` from [GitHub](https://github.com/) with: ``` r -install.packages("renv") +# install.packages("renv") renv::install("UI-Research/urbnindicators") ``` -You’ll need a Census API key ([request one +You’ll want a Census API key ([request one here](https://api.census.gov/data/key_signup.html)). Set it once with: ``` r @@ -65,26 +67,16 @@ updates–check to ensure you have the most recent version installed! ## Discover Available Data ``` r -list_tables() -#> [1] "age" "computing_devices" -#> [3] "cost_burden" "disability" -#> [5] "educational_attainment" "employment" -#> [7] "gini" "health_insurance" -#> [9] "household_size" "income_quintiles" -#> [11] "internet" "language" -#> [13] "median_household_income" "median_housing_cost" -#> [15] "median_income_by_tenure" "mortgage_status" -#> [17] "nativity" "occupants_per_room" -#> [19] "population_density" "poverty" -#> [21] "public_assistance" "race" -#> [23] "school_enrollment" "sex" -#> [25] "snap" "tenure" -#> [27] "tenure_by_housing_costs" "tenure_by_units_in_structure" -#> [29] "total_population" "transportation_to_work" -#> [31] "travel_time_to_work" "units_in_structure" -#> [33] "vehicles_available" "year_structure_built" -list_variables() -#> # A tibble: 861 × 2 +list_tables() |> head(10) +#> [1] "age" "computing_devices" "cost_burden" +#> [4] "disability" "educational_attainment" "employment" +#> [7] "gini" "health_insurance" "household_size" +#> [10] "income_quintiles" +``` + +``` r +list_variables() |> head(10) +#> # A tibble: 10 × 2 #> variable table #> #> 1 
total_population_universe total_population @@ -96,8 +88,7 @@ list_variables() #> 7 snap_received_percent snap #> 8 household_income_quintile_upper_limit_1 income_quintiles #> 9 household_income_quintile_upper_limit_2 income_quintiles -#> 10 household_income_quintile_upper_limit_3 income_quintiles -#> # ℹ 851 more rows +#> 10 household_income_quintile_upper_limit_3 income_quintiles ``` ## Obtain Data @@ -113,7 +104,7 @@ df = compile_acs_data( geography = "county", states = "NJ") -glimpse(df) +glimpse(df) |> head(10) #> Rows: 21 #> Columns: 175 #> $ data_source_year 2024, 2024, 2… @@ -291,6 +282,26 @@ glimpse(df) #> $ race_hispanic_twoormore_includingotherrace_percent_M 0.0073, 0.003… #> $ race_hispanic_twoormore_excludingotherrace_percent_M 0.0017, 0.001… #> $ race_personofcolor_percent_M 0.0023, 0.001… +#> # A tibble: 10 × 175 +#> data_source_year GEOID NAME total_population_uni…¹ race_universe +#> +#> 1 2024 34001 Atlantic County,… 276270 276270 +#> 2 2024 34003 Bergen County, N… 962316 962316 +#> 3 2024 34005 Burlington Count… 467805 467805 +#> 4 2024 34007 Camden County, N… 527257 527257 +#> 5 2024 34009 Cape May County,… 94941 94941 +#> 6 2024 34011 Cumberland Count… 153305 153305 +#> 7 2024 34013 Essex County, Ne… 863002 863002 +#> 8 2024 34015 Gloucester Count… 306954 306954 +#> 9 2024 34017 Hudson County, N… 718323 718323 +#> 10 2024 34019 Hunterdon County… 130160 130160 +#> # ℹ abbreviated name: ¹​total_population_universe +#> # ℹ 170 more variables: race_nonhispanic_allraces , +#> # race_nonhispanic_white_alone , race_nonhispanic_black_alone , +#> # race_nonhispanic_aian_alone , race_nonhispanic_asian_alone , +#> # race_nonhispanic_nhpi_alone , race_nonhispanic_otherrace_alone , +#> # race_nonhispanic_twoormore , +#> # race_nonhispanic_twoormore_includingotherrace , … ``` ## Visualize Data @@ -314,8 +325,9 @@ plot_data = df %>% data_source_year = factor(data_source_year)) state_averages = plot_data %>% - group_by(data_source_year) %>% - summarize(mean_pct 
= mean(race_personofcolor_percent)) %>% + summarize( + .by = data_source_year, + mean_pct = mean(race_personofcolor_percent)) %>% arrange(data_source_year) %>% pull(mean_pct) @@ -334,9 +346,7 @@ dumbbell_data = plot_data %>% names_from = data_source_year, values_from = race_personofcolor_percent, names_prefix = "year_") -``` -``` r ggplot() + geom_segment( data = dumbbell_data, @@ -388,7 +398,7 @@ ggplot() + title = "All NJ Counties Experienced Racial Diversification from 2019 to 2024", subtitle = paste0("Share of population who are people of color, by county, 2019-2024 Confidence intervals are presented around each point but are extremely small"), - x = "County", + x = "", y = "Share of population who are people of color") + scale_x_discrete(expand = expansion(mult = c(.03, .04))) + scale_y_continuous( @@ -403,8 +413,7 @@ Confidence intervals are presented around each point but are extremely small"), # Learn More -A growing number of vignettes aim to support users in effectively using -this package. These vignettes include: +Check out the vignettes for additional details: - A package overview to help users [**Get Started**](articles/urbnindicators.html). @@ -421,6 +430,10 @@ this package. These vignettes include: Error**](articles/quantified-survey-error.html) can improve inference making. +- You can re-create your indicators and their measures of error for + [**Custom Geographies**](articles/custom-geographies.html). + Neighborhoods? Unincorporated counties? Start here. 
+ # Credits This package is built on top of and enormously indebted to diff --git a/man/figures/README-unnamed-chunk-6-1.png b/man/figures/README-unnamed-chunk-6-1.png index 551e2a6..449df0a 100644 Binary files a/man/figures/README-unnamed-chunk-6-1.png and b/man/figures/README-unnamed-chunk-6-1.png differ diff --git a/vignettes/.gitignore b/vignettes/.gitignore index 097b241..47018d6 100644 --- a/vignettes/.gitignore +++ b/vignettes/.gitignore @@ -1,2 +1,5 @@ *.html *.R + +/.quarto/ +**/*.quarto_ipynb diff --git a/vignettes/codebook.Rmd b/vignettes/codebook.Rmd index 191f167..85b931b 100644 --- a/vignettes/codebook.Rmd +++ b/vignettes/codebook.Rmd @@ -17,19 +17,21 @@ knitr::opts_chunk$set( ```{r setup, echo = FALSE} library(urbnindicators) +library(dplyr) +library(reactable) -df = compile_acs_data(year = 2023, geography = "us") +df = compile_acs_data(year = 2024, geography = "us") codebook = attr(df, "codebook") -reactable::reactable( +reactable( codebook %>% - dplyr::rename( + transmute( Variable = calculated_variable, Type = variable_type, Definition = definition), columns = list( - Variable = reactable::colDef(minWidth = 200), - Type = reactable::colDef(minWidth = 50), - Type = reactable::colDef(minWidth = 100)), + Variable = colDef(minWidth = 200), + Type = colDef(minWidth = 50), + Definition = colDef(minWidth = 100)), filterable = TRUE) ``` diff --git a/vignettes/custom-geographies.Rmd b/vignettes/custom-geographies.Rmd index 0bdcd75..ad13078 100644 --- a/vignettes/custom-geographies.Rmd +++ b/vignettes/custom-geographies.Rmd @@ -36,19 +36,20 @@ data, these imprecise estimates make it difficult to detect meaningful differences between areas---even when real differences exist. `calculate_custom_geographies()` addresses this by aggregating -tract-level data to user-defined geographies (e.g., neighborhoods, +tract-level data (or really data at any level) to user-defined geographies (e.g., neighborhoods, planning districts, or school zones). 
This aggregation increases sample sizes, reduces coefficients of variation, and enables more reliable statistical inference. -# Example: DC Tract Data +# Example: DC Quadrants We'll demonstrate using tract-level data for Washington, DC, comparing -the share of population receiving SNAP benefits across areas. +the share of population receiving SNAP benefits across areas ("quadrants"). ```{r, message = FALSE} dc_tracts = compile_acs_data( years = 2024, + tables = "snap", geography = "tract", states = "DC", spatial = TRUE) @@ -83,57 +84,40 @@ dc_quadrants = calculate_custom_geographies( spatial = TRUE) ``` -# Comparing Precision: Tracts vs. Custom Geographies +# Comparing Precision The maps below show the share of households receiving SNAP benefits. Notice how aggregating to quadrants produces more precise estimates with -smaller coefficients of variation. +smaller coefficients of variation. Indeed, the median coefficient of variation +at the tract level is greater than 30, a common upper bound for "reliable" estimates. 
```{r, fig.height = 4} # Tract-level map -map_tracts = dc_tracts %>% - ggplot() + - geom_sf(aes(fill = snap_received_percent), color = "white", linewidth = 0.1) + - scale_fill_gradientn( - colors = c("#CFE8F3", "#1696D2", "#0A4C6A"), - limits = c(0, 0.5), - labels = scales::percent, - na.value = "grey80") + - urbnthemes::theme_urbn_map() + - theme(legend.position = "bottom") + - labs( - fill = "SNAP Receipt", - title = "Tract-Level Estimates", - subtitle = paste0("Median CV: ", round(median(dc_tracts$snap_received_percent_CV, na.rm = TRUE), 1))) - -# Quadrant-level map -map_quadrants = dc_quadrants %>% + bind_rows( + dc_tracts %>% mutate(geography = "Tract"), + dc_quadrants %>% mutate(geography = "Quadrant")) %>% + mutate( + .by = geography, + median_cv = round(median(snap_received_percent_CV, na.rm = TRUE)), + label = str_c(geography, " - median CV: ", median_cv)) %>% ggplot() + - geom_sf(aes(fill = snap_received_percent), color = "white", linewidth = 0.3) + - scale_fill_gradientn( - colors = c("#CFE8F3", "#1696D2", "#0A4C6A"), - limits = c(0, 0.5), - labels = scales::percent, - na.value = "grey80") + - urbnthemes::theme_urbn_map() + - theme(legend.position = "bottom") + - labs( - fill = "SNAP Receipt", - title = "Quadrant-Level Estimates", - subtitle = paste0("Median CV: ", round(median(dc_quadrants$snap_received_percent_CV, na.rm = TRUE), 1))) - -map_tracts -map_quadrants + geom_sf(aes(fill = snap_received_percent), color = "white", linewidth = 0.1) + + scale_fill_continuous(palette = palette_urbn_cyan[c(3, 5, 7)], labels = scales::percent) + + theme_urbn_map() + + labs(fill = "SNAP Receipt (%)") + + facet_wrap(~ label) ``` The quadrant-level estimates have substantially lower CVs, indicating -more reliable estimates. This precision gain enables meaningful -statistical comparisons. +more reliable estimates. 
# Detecting Statistically Significant Differences -With more precise estimates, we can better identify where SNAP receipt -rates differ significantly from the citywide average. +By aggregating our tract observations, we can also calculate +statistically significant differences at greater geographic scales. +This enables analysis for more policy-relevant areas and helps mitigate +shortcomings associated with high measures of error for smaller-population +observations, which can lead to findings of no statistically significant differences. ```{r, fig.height = 4} # Calculate DC-wide SNAP rate for comparison @@ -181,7 +165,7 @@ map_tracts_sig = tracts_sig %>% scale_fill_manual(values = sig_colors, na.value = "grey80") + urbnthemes::theme_urbn_map() + theme(legend.position = "bottom") + - labs(fill = "", title = "Tract-Level") + labs(fill = "", subtitle = "Tract-level", title = "") map_quadrants_sig = quadrants_sig %>% ggplot() + @@ -189,20 +173,16 @@ map_quadrants_sig = quadrants_sig %>% scale_fill_manual(values = sig_colors, na.value = "grey80") + urbnthemes::theme_urbn_map() + theme(legend.position = "bottom") + - labs(fill = "", title = "Quadrant-Level") + labs(fill = "", subtitle = "Quadrant-level", title = "") gridExtra::grid.arrange( map_tracts_sig, map_quadrants_sig, ncol = 2, top = grid::textGrob( - "Aggregation Enables Detection of Statistically Significant Differences", + "Aggregation can mitigate challenges with small-population, high-error observations", gp = grid::gpar(fontsize = 12, fontface = "bold"))) ``` -At the tract level, large margins of error prevent many estimates from -reaching statistical significance. At the quadrant level, the same -underlying data produces estimates precise enough to detect significant -differences from the citywide average. # Key Takeaways @@ -213,11 +193,8 @@ differences from the citywide average. statistically significant differences that would otherwise be obscured by sampling error. -3. 
**Proper error handling**: `calculate_custom_geographies()` correctly - recalculates MOEs, SEs, and CVs for aggregated estimates following - Census Bureau methodology---simply averaging tract-level statistics - would produce incorrect results. - -4. **Flexible boundaries**: The function works with any custom geography - defined by a grouping variable, allowing analysis at neighborhood, - district, or other policy-relevant scales. +3. **More relevant units of analysis**: The ACS reports estimates at + many geographies, but there are many others that are not supported. + Think neighborhoods, wards, continuums of care, school districts, and more. + To robustly calculate errors and draw reliable inferences for these + other geographies is critical but challenging. diff --git a/vignettes/design-philosophy.Rmd b/vignettes/design-philosophy.Rmd index 07ab70c..80a5cca 100644 --- a/vignettes/design-philosophy.Rmd +++ b/vignettes/design-philosophy.Rmd @@ -63,19 +63,3 @@ or improve the accuracy of a common use-case involving a large set of variables `race_personofcolor_percent`. Variables in the codebook have their original API names included in their definitions. -- **Return a very large, wide dataset.** The underlying - `library(tidycensus)` interface to the Census Bureau API can return - a single variable or table, and often this is how users employ it. - Conversely, it's common to want dozens or perhaps even hundreds of - variables--this is the use case around which - `library(urbnindicators)` was designed. Queries at small geographies - can be slow, but the result is a dataset containing everything you - could want. If you're just looking for - one, or a few, variables, `library(tidycensus)` is probably a better - approach, and you can still use functions like - `urbnindicators::select_variables_by_name()`, - `urbnindicators::filter_variables()`, and - `urbnindicators::list_variables()` to select and sensibly name - variables returned from `library(tidycensus)`. 
In the future, we'll - add caching options so that you don't have to repeatedly make the - same queries and can instead read results in from a local directory. diff --git a/vignettes/opening-a-pull-request.Rmd b/vignettes/opening-a-pull-request.Rmd index 89cb5d4..0bbad34 100644 --- a/vignettes/opening-a-pull-request.Rmd +++ b/vignettes/opening-a-pull-request.Rmd @@ -35,10 +35,9 @@ For `library(urbnindicators)`, a common use-case for a PR is to propose a new variable or series of variables. This vignette will illustrate the complete PR process for a new series of variables. The core of the PR is a single `register_table()` call in `R/table_registry.R` that -declares the raw ACS variables and uses the `define_*` DSL to specify -how derived variables are computed. The codebook, coefficients of -variation, and variable listings are all generated automatically from -this registration. +defines the raw ACS variables and specifies how derived variables are +computed. The codebook and measures of error are all generated automatically +from the table registry. ## Overview @@ -49,9 +48,7 @@ Adding a new table requires a single code change: list that uses the `define_*` helpers to specify how derived variables (e.g., percentages) are calculated. -Everything else--the codebook, coefficients of variation, variable -listings, and the `tables`/`indicators` API--is generated automatically -from this registration. After writing the registration, the PR should +After writing the registration, the PR should include quality checks to verify that the new variables are correctly calculated and appropriately documented. @@ -60,24 +57,25 @@ The quality-check steps are: 2. **Codebook**: Verify that each new variable is documented in the codebook and that its documentation is accurate. 3. **Coefficients of variation**: Verify that CVs appear reasonable. -4. **Pretty names**: Verify that `make_pretty_names()` produces - reasonable labels. +4. 
**Pretty names**: Verify that `make_pretty_names()` consistently and + clearly renames the new variables. 5. **Integration test**: Call `compile_acs_data()` end-to-end and inspect the results. ## Our Variable Series We're going to add a series of estimates that describe "Household -Type", which are contained in table B11001. We can find the relevant +Type (White Alone)", which are contained in table B11001. We can find the relevant variables by navigating the codebook returned by `tidycensus::load_variables()`. ```{r} -codebook = load_variables(dataset = "acs5", year = 2023) +codebook = load_variables(dataset = "acs5", year = 2024) codebook %>% filter(str_detect(name, "B11001")) %>% - head() + head(2) %>% + glimpse() ``` ## Step 1: Identify the Raw Variables @@ -113,17 +111,19 @@ variable choices and denominator logic are correct: ```{r} sample_data = tidycensus::get_acs( - years = 2022, + years = 2024, geography = "county", state = "NJ", variables = select_variables_by_name("B11001_", census_codebook = codebook) %>% stats::setNames(names(.) %>% stringr::str_remove_all("including_living_alone_")), output = "wide") %>% + ## selecting only estimate ("_E") variables, not margin of error ("_M") variables select(GEOID, NAME, matches("_E")) %>% rename_with(cols = everything(), ~ str_remove_all(.x, "_E")) sample_data %>% - head() + head(2) %>% + glimpse() ``` We can test our percentage calculations on the sample data. Note the @@ -132,12 +132,13 @@ use of `urbnindicators::safe_divide()`, which returns `0` rather than ```{r} sample_data %>% - dplyr::transmute( - dplyr::across( - .cols = c(dplyr::matches("household_type"), -dplyr::matches("universe")), + transmute( + across( + .cols = c(matches("household_type"), -matches("universe")), .fns = ~ safe_divide(.x, household_type_universe), .names = "{.col}_percent")) %>% - head() %>% select(1:3) + head() %>% + glimpse() ``` ## Step 3: Write the `register_table()` Call @@ -149,9 +150,9 @@ change in the PR. 
The `definitions` list uses the `define_*` helpers to declaratively specify how each derived variable is computed. The package uses this specification to both execute the computation and auto-generate the -codebook documentation and CV calculations. +codebook documentation and error calculations. -### The `define_*` DSL +### The `define_*` Helpers | Helper | Use case | Key arguments | |---|---|---| @@ -185,10 +186,6 @@ register_table(list( )) ``` -This single block replaces what previously required separate changes -across three files (`R/list_acs_variables.R`, -`R/compile_acs_data.R`, and manual codebook verification). - ### More examples **Simple percentage** (one numerator, one denominator): @@ -212,6 +209,8 @@ definitions = list( exclude_regex = NULL, output_suffix = "_percent", denominator = "race_universe"), + ## race_personofcolor_percent is the share of all individuals who are not + ## non-Hispanic, White alone, i.e., the complement define_one_minus("race_personofcolor_percent", source_variable = "race_nonhispanic_white_alone_percent")) ``` @@ -219,8 +218,16 @@ definitions = list( **Across-sum followed by across-percent** (sum male + female counts into combined age variables, then calculate percentages): +This one is a bit tricky--the source table includes variables for each +age group, split by sex. To get age groups, we have to add the two +sex-specific estimates for the given age group. This requires us to specify +an `input_regex`, which selects, in this case, all female-specific age variables. +The `addend_function` then programmatically identifies the same-named, male-specific +variables. The `output_naming_function` simplifies the resulting combined variable, +removing the sex category and other extraneous words. 
+ ```r -## from the sex_by_age table (age construct) +## from the sex_by_age table definitions = list( define_across_sum( input_regex = "sex_by_age_female_.*years($|_over$)", @@ -242,22 +249,16 @@ The `register_table()` call should be added to `R/table_registry.R` under the appropriate comment header. For our example, this would be under `####----TABLE REGISTRATIONS: HOUSEHOLD COMPOSITION----####`. -If any new column names are created by `compute_fn` logic, add them to -the `utils::globalVariables()` call at the bottom of -`R/table_registry.R` to avoid R CMD check notes. - ## Step 4: Verify the Codebook After writing the registration, load the package with -`devtools::load_all()` and run a test call. New variables should be +`devtools::load_all()` and call `compile_acs_data()`. New variables should be automatically documented by `generate_codebook()` and included in the -codebook attribute of the dataframe returned by `compile_acs_data()`. -Verify that each new variable is documented and that its documentation +codebook. Verify that each new variable is documented and that its documentation is accurate. If the documentation is incorrect, the error we estimate for derived variables will also be incorrect. -If the documentation is incorrect, the PR should note the issue; it -should not include changes to `R/generate_codebook.R`. +If the documentation is incorrect, the PR should note the issue. ## Step 5: Verify Coefficients of Variation @@ -266,12 +267,12 @@ Like the codebook, CVs are computed automatically from the reasonable. Users should also check the magnitude of errors for all variables--raw -ACS estimates and `urbnindicators`-calculated variables alike--across +ACS estimates and derived variables alike--across smaller geographies, such as all tracts in one or more states. If CVs are large (e.g., over 50) for a large share of all tracts, this may indicate that the series of interest is not appropriate for tract-level analysis. 
Because `urbnindicators` is designed to -facilitate tract-level analysis, variables that are consistently +facilitate tract-level analysis, tables that are consistently unreliable at the tract level will not be integrated into the codebase. ## Step 6: Verify Pretty Names @@ -280,12 +281,12 @@ Ensure that the new variables have reasonable names: ```{r} sample_data %>% - dplyr::mutate( - dplyr::across( - .cols = c(dplyr::matches("household_type"), -dplyr::matches("universe")), + mutate( + across( + .cols = c(matches("household_type"), -matches("universe")), .fns = ~ .x / household_type_universe, .names = "{.col}_percent")) %>% - urbnindicators::make_pretty_names() %>% + make_pretty_names() %>% colnames() ``` @@ -295,32 +296,7 @@ the pretty-ifying process (e.g., acronyms, series of numbers, etc.). We can leave it to users to make other adjustments, such as removing the substring "Household Type ", if they want even more concise names. -## Step 7: Integration Test - -Integrate the proposed changes into the codebase (on a branch), load -the current version of `urbnindicators` (via `devtools::load_all()`), -and call `compile_acs_data()`. Verify that the new table works both in -isolation and as part of the full suite: - -```r -## test the new table in isolation -compile_acs_data( - tables = "household_type", - years = 2022, - geography = "county", - states = "NJ") - -## test the full suite -compile_acs_data( - years = 2022, - geography = "county", - states = "NJ") -``` - -Interactively explore both the data and the codebook (accessed via -`attr(result, "codebook")`). - -## Step 8: Quality Check Results +## Step 7: Quality Check Results There are a few strategies for quality-checking the results of a series of variables: @@ -334,9 +310,8 @@ series of variables: of the subject tables. (An example of one of multiple reasons that `urbnindicators` exclusively uses data from the detailed tables.) -2. Manually compute a benchmark value. 
This works well for all those - derived variables that aren't reported in any Census Bureau - product. Identify the relevant numerator and denominator variables +2. Manually compute a benchmark value. + Identify the relevant numerator and denominator variables (in the case of a derived percentage) and manually calculate the derived variable, then compare the manually-computed benchmark to the programmatically-calculated version. This seems very simple @@ -345,7 +320,7 @@ series of variables: variables--e.g., where a numerator is a summed variable itself--this is a very useful quality check. -3. Plot a histogram of the computed variable(s) (if multiple +3. Plot a histogram of the computed variable(s). If there are multiple variables in a series, use `pivot_longer()` to turn the dataframe long, then use `facet_wrap()` to plot each histogram side-by-side. Check for unexpected spikes and outlier values. @@ -355,7 +330,7 @@ series of variables: observations may be an indication that a calculation has gone awry. -## Step 9: Open the PR +## Step 8: Open the PR Once users are satisfied with the proposed code (and/or have noted any issues), they should click on their branch in the GitHub repository diff --git a/vignettes/quantified-survey-error.Rmd b/vignettes/quantified-survey-error.Rmd index d2cec24..685fd4f 100644 --- a/vignettes/quantified-survey-error.Rmd +++ b/vignettes/quantified-survey-error.Rmd @@ -15,7 +15,6 @@ knitr::opts_chunk$set( warning = FALSE, message = FALSE, collapse = TRUE, - eval = nzchar(Sys.getenv("CENSUS_API_KEY")), dpi = 600, fig.width = 7, comment = "#>") @@ -28,9 +27,6 @@ library(tidyr) library(stringr) library(urbnindicators) library(sf) -# Note: urbnthemes is an internal Urban Institute package -# (https://github.com/UrbanInstitute/urbnthemes) and is not on CRAN. -# Install with renv::install("UI-Research/urbnthemes") library(urbnthemes) library(tidycensus) library(gridExtra) @@ -119,11 +115,6 @@ smaller CVs. 
Typically, there are two strategies to reduce CVs: (1) aggregate estimates, either across geographies or across variables, or (2) use larger geographies. -We plan to add utilities to support users in aggregating estimates and -calculating adjusted measurements of error. For now, we warn that any -aggregation should be done with care, as error cannot be simply added -(or otherwise summarized) the way that estimates can. - ```{r, warning = FALSE, message = FALSE} acs_df_county = compile_acs_data( years = c(2024), diff --git a/vignettes/urbnindicators.Rmd b/vignettes/urbnindicators.Rmd index 5b29f17..9bbfede 100644 --- a/vignettes/urbnindicators.Rmd +++ b/vignettes/urbnindicators.Rmd @@ -44,14 +44,8 @@ This vignette is organized into three parts: ## A Typical Workflow [**tidycensus**](https://walker-data.com/tidycensus/index.html) provides -a suite of functions for working with ACS data in R. In fact, -`library(urbnindicators)` is built on top of `library(tidycensus)`, and we highly -encourage those who are unfamiliar to explore `library(tidycensus)` before using -`library(urbnindicators)`. - -While `tidycensus` is versatile and allows users to access many more -datasets (and variables within those datasets) than does -`library(urbnindicators)`, it can require a significant amount of knowledge and +a suite of functions for working with ACS data in R. While it's versatile +and comprehensive, it can require a significant amount of knowledge and time to support a robust analysis, leading many users to fall into common pitfalls without realizing they've made an error(s). @@ -61,10 +55,11 @@ We load the built-in codebook and search for our construct of interest (disability). This leaves us 500 variables to choose from. 
```{r} -acs_codebook = tidycensus::load_variables(dataset = "acs5", year = 2022) +acs_codebook = load_variables(dataset = "acs5", year = 2022) acs_codebook %>% - dplyr::filter(stringr::str_detect(concept, "Disability")) %>% + filter(str_detect(concept, "Disability")) %>% + select(name, label, concept) %>% ## only printing three, for brevity head(3) %>% reactable::reactable() @@ -85,11 +80,10 @@ because the other characteristics combined with disability status (e.g., health insurance coverage status) may be available only for a subset of the individuals for whom disability status is available. -Putting these challenges aside, let's imagine we select the table of -variables prefixed "B18101", for "Sex by Age by Disability". We think -that most respondents who respond about their disability status will -also have responded about their sex and age. We then pass this to -`library(tidycensus)` as: +Let's imagine we select the table of variables prefixed "B18101", +for "Sex by Age by Disability". We think that most respondents who respond +about their disability status will also have responded about their sex and +age. We then pass this to `library(tidycensus)` as: ```{r} df_disability = get_acs( @@ -99,13 +93,10 @@ df_disability = get_acs( output = "wide", survey = "acs5", table = "B18101") - -df_disability %>% dim() -df_disability %>% colnames() %>% head(5) ``` This returns us 21 observations–one for each county in NJ--along with an -intimidating 80 columns with meaningless names along the lines of +intimidating 80 columns with unintelligible names along the lines of `B18101_039E`. ### Calculating our measure of interest @@ -118,14 +109,14 @@ error-prone. For an analysis that leverages more than a single measure, and especially when measures are required from distinct tables, this workflow is burdensome and creates significant surface area for -undetected errors.
To acquire data for multiple years, or for multiple -different types of geographies, also requires repeated calls to -`tidycensus::get_acs()`. +undetected errors. At the same time, many analysts will be overwhelmed by and unsure how to -combine the margins of error that are returned by `tidycensus::get_acs()`, opting -simply to drop this critical information from their analysis as they go -about calculating "% Disabled". +combine the margins of error that are returned by `tidycensus::get_acs()` +to calculate pooled errors for the new percent-disabled variable, opting +simply to drop this critical information from their analysis. (See +[quantified-survey-error](quantified-survey-error.html) to learn more about +how `library(urbnindicators)` helps simplify this task.) ## Using urbnindicators @@ -133,7 +124,7 @@ about calculating "% Disabled". of a call to `tidycensus::get_acs()`, a call to `urbnindicators::compile_acs_data()` returns a dataset of both raw ACS measures and derived estimates (such as the share of all individuals who -are disabled). And that dataset includes a range of measures–-spanning +are disabled). And that dataset can include a range of measures–-spanning things such as health insurance, employment, housing costs, and race and ethnicity–-not just one variable or table from the ACS. @@ -143,22 +134,24 @@ It's as simple as the call below. Note that you can provide a vector of years and/or states if you want data over different time periods or geographies. -Note that selecting more geographic units--either by selecting a `geography` option -comprising more units, by selecting more states, or selecting more years--can significantly -increase the query time. A tract-level query of the entire US for all supported variables -can take 30+ minutes.
+Note that selecting more tables or more geographic units--either by selecting +a `geography` option comprising more units, by selecting more states, or selecting +more years--can significantly increase the query time. A tract-level query of the +entire US for all supported variables can take 30+ minutes. Use `list_tables()` to see which tables are available: ```{r} -list_tables() +list_tables() |> head(10) ``` -Here we request just two tables--`disability` and `transportation_to_work`: +Here we request just two tables--`disability` and `transportation_to_work`. +Alternately, you can set `tables = NULL` (the default) and get a very wide +dataset comprising every variable supported by the package. ```{r, message = FALSE, warning = FALSE} -df_urbnindicators = urbnindicators::compile_acs_data( - years = 2022, +df_urbnindicators = compile_acs_data( + years = 2024, tables = c("disability", "transportation_to_work"), geography = "county", states = "NJ", @@ -186,14 +179,8 @@ df_urbnindicators %>% There's a lot happening behind the scenes, so it's important to understand what each variable represents and how it was calculated. `library(urbnindicators)` includes a codebook as an attribute of the dataframe -returned from `urbnindicators::compile_acs_data()`. View and navigate through the [full codebook -here](articles/codebook.html). - -```{r} -df_urbnindicators %>% - attr("codebook") %>% - reactable::reactable() -``` +returned from `urbnindicators::compile_acs_data()`. View and navigate through the +[full codebook here](articles/codebook.html). The codebook specifies the variable type and provides a definition of how the variable was calculated. Most (though not all) variables that @@ -206,6 +193,7 @@ variables, where we divide two count variables. 
For example, df_urbnindicators %>% attr("codebook") %>% filter(str_detect(calculated_variable, "^disability_percent$")) %>% + select(calculated_variable, variable_type, definition) %>% reactable::reactable() ``` @@ -222,17 +210,9 @@ from home. df_urbnindicators %>% attr("codebook") %>% filter(str_detect(calculated_variable, "means.*motor_vehicle")) %>% + select(calculated_variable, variable_type, definition) %>% reactable::reactable() ``` This allows us say something along the lines of: "Of individuals who commute to work, XX% use a motor vehicle as their primary commute mode." - -## Summary - -In short, `library(urbnindicators)` trades the flexibility of -`library(tidycensus)` for speed and reliability: meaningful variable -names replace opaque codes, a codebook documents every calculation, -and margins of error and coefficients of variation are carried through -automatically so users can make informed decisions about statistical -significance and data quality.