diff --git a/DESCRIPTION b/DESCRIPTION
index 9f6f1693..9aedcc13 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.19
+Version: 1.4.1
 Authors@R: c(person(given = "Tony",
                     family = "Fujs",
@@ -70,7 +70,9 @@ Imports:
     DBI,
     duckdb,
     jsonlite,
-    digest
+    digest,
+    parallel,
+    R.utils
 Remotes: PIP-Technical-Team/wbpip@DEV
 Depends:
diff --git a/NAMESPACE b/NAMESPACE
index 207921ed..af6ba74d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,7 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
+export(add_attributes_as_columns_multi)
+export(add_attributes_as_columns_vectorized)
 export(add_dist_stats)
+export(add_dist_stats_old)
 export(assign_serializer)
+export(assign_stat)
 export(change_grouped_stats_to_csv)
 export(citation_from_version)
 export(create_etag_header)
@@ -12,11 +16,13 @@ export(get_aux_table)
 export(get_aux_table_ui)
 export(get_caller_names)
 export(get_ctr_alt_agg)
+export(get_from_pipapienv)
 export(get_grp_to_compute)
 export(get_impl_ctrs)
 export(get_md_vars)
 export(get_param_values)
 export(get_pip_version)
+export(get_pipapienv)
 export(get_user_alt_gt)
 export(get_user_x_code)
 export(get_valid_aux_long_format_tables)
@@ -24,15 +30,20 @@ export(is_empty)
 export(is_forked)
 export(load_inter_cache)
 export(pip)
+export(pip_agg)
 export(pip_grp)
 export(pip_grp_logic)
+export(pip_new_lineups)
 export(pip_old)
+export(pip_old_lineups)
 export(pipgd_lorenz_curve)
 export(return_correct_version)
 export(return_if_exists)
+export(safe_endpoint)
 export(select_off_alt_agg)
 export(select_reporting_level)
 export(select_user_aggs)
+export(set_in_pipapienv)
 export(start_api)
 export(ui_cp_charts)
 export(ui_cp_download)
@@ -44,11 +55,14 @@ export(ui_pc_regional)
 export(ui_svy_meta)
 export(unnest_dt_longer)
 export(update_master_file)
+export(use_new_lineup_version)
 export(valid_years)
 export(validate_input_grouped_stats)
 export(version_dataframe)
+export(with_req_timeout)
 export(wld_lineup_year)
 import(collapse, except = fdroplevels)
+import(data.table)
 import(data.table, except = fdroplevels)
 importFrom(glue,glue)
 importFrom(glue,glue_collapse)
diff --git a/NEWS.md b/NEWS.md
index 55099b39..4c3be5b2 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,60 @@
+# pipapi 1.4.1
+* Removed old regions aggregate from `pc_chart`.
+* Removed old regions aggregate from `get_aux_table_ui`.
+* Bring back MDG.
+* Add hash to all UI endpoints.
+
+# pipapi 1.4.0
+* Added `safe_endpoint` function for safer endpoint handling.
+* Removed `preroute` as it is now managed in `endpoints.R`.
+* Introduced a new and improved Plumber setup.
+* Refactored Plumber configuration for better maintainability.
+* Implemented a timeout wrapper for more endpoints.
+* Added "Request timed out" responses to `pip` and `pip-grp` endpoints.
+* Introduced a lightweight request ID and timing filter for better request tracking.
+* Added `tryCatch` blocks to improve error handling in several endpoints.
+* Enhanced error handling throughout the API.
+* Fixed bug by changing `req$args` to `req$argsQuery`.
+* Removed unnecessary calls to `gc()`.
+* Improved filtering of poverty lines when updating the master (intermediate cache) file.
+* Utilized the missing data file instead of recreating it.
+* Removed verbose output in join operations.
+* Ensured correct sorting for surveys with multiple `reporting_level` values.
+* Updated handling of empty data responses.
+* Enabled support for multiple `popshare` values in fill gaps operations.
+* Removed `filter_lkup` from `fg_pip()`.
+* Allowed region selection from any variable (to be revised when supporting more aggregations).
+* Added prosperity gap metric to `ui_cp_charts`.
+* Enabled multiple `popshare` values for survey years.
+* Improved the `infer_poverty_line()` function.
+* Updated data documentation for clarity and completeness.
+* Stored variables in the environment defined in `zzz`.
+* Fixed several bugs across the codebase.
+
+
+# pipapi 1.3.24
+* Incorporate lineup distributions and Countries with Missing Data (CMD) distributions.
+* New way to estimate poverty using cumulative sums.
+* Allow multiple `popshare` values.
+
+
+# pipapi 1.3.23
+* Add Venn diagram information to country profiles.
+* Add prosperity gap to country profiles chart.
+
+
+# pipapi 1.3.22
+
+* Fix issue with `popshare` in fill gaps calls.
+
+# pipapi 1.3.21
+* Fix issue with `popshare` in survey year calls.
+* Fix problem with alternative aggregates like AFW and AFE.
+* Make sure all tests pass.
+* Add logs in Docker container.
+
+# pipapi 1.3.20
+
 # pipapi 1.3.19
 
 * fix issue with comparability
diff --git a/R/aaa.R b/R/aaa.R
new file mode 100644
index 00000000..cd685993
--- /dev/null
+++ b/R/aaa.R
@@ -0,0 +1 @@
+.pipapienv <- new.env(parent = emptyenv())
diff --git a/R/add_agg_stats.R b/R/add_agg_stats.R
index 5b93067d..bbc81ed9 100644
--- a/R/add_agg_stats.R
+++ b/R/add_agg_stats.R
@@ -16,7 +16,8 @@ add_agg_stats <- function(df,
   aggregated_list <- split(aggregated,
                            interaction(
                              aggregated$country_code,
-                             aggregated$reporting_year
+                             aggregated$reporting_year,
+                             aggregated$poverty_line
                            ),
                            drop = TRUE
   )
diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
new file mode 100644
index 00000000..6c68b139
--- /dev/null
+++ b/R/compute_fgt_new.R
@@ -0,0 +1,349 @@
+# OLD APPROACH WITH MEAN --------------
+
+#' Efficient FGT calculation for a data.table and vector of poverty lines
+#'
+#' @param dt data frame with `welfare` and `weight` columns
+#' @param welfare character: welfare variable name
+#' @param weight character: weight variable name
+#' @param povlines double: vector with poverty lines
+#' @param mean_and_med logical: if TRUE, carry the survey mean, median, and
+#'   country-year identifiers into the output
+#'
+#' @return data.table with estimated poverty statistics
+#' @keywords internal
+compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) {
+  w <- dt[[welfare]]
+  wt <- dt[[weight]]
+  n <- length(w)
+  m <- length(povlines)
+
+  # Pre-allocate result matrix
+  res <- matrix(NA_real_, nrow = m, ncol = 3)
+  colnames(res) <- c("FGT0", "FGT1", "FGT2")
+  watts_vec <- numeric(m)
+
+  # Precompute log(w) for efficiency (vectorized)
+
+  pos <- w > 0
+  # logw <- log(w)
+  logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |>
+    suppressWarnings()
+  # logw <- fifelse(w > 0, log(w), NA_real_)
+
+  for (i in seq_along(povlines)) {
+    pov <- povlines[i]
+    poor <- w < pov
+    rel_dist <- 1 - (w / pov)
+    setv(rel_dist, poor, 0, invert = TRUE)
+    # rel_dist[!poor] <- 0
+    res[i, 1] <- fmean(poor, w = wt)       # FGT0
+    res[i, 2] <- fmean(rel_dist, w = wt)   # FGT1
+    res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
+
+    # Optimized Watts index calculation
+    keep <- poor & pos
+    if (any(keep, na.rm = TRUE)) {
+      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt)
+    } else {
+      watts_vec[i] <- 0
+    }
+  }
+
+  if (mean_and_med) {
+    mn  <- ffirst(dt$mean)
+    med <- ffirst(dt$median)
+    cy  <- ffirst(dt$country_code)
+    ry  <- ffirst(dt$reporting_year)
+    out <- data.table(
+      povline          = povlines,
+      headcount        = res[, 1],
+      poverty_gap      = res[, 2],
+      poverty_severity = res[, 3],
+      watts            = watts_vec,
+      mean             = mn,
+      median           = med,
+      country_code     = cy,
+      reporting_year   = ry)
+  } else {
+    out <- data.table(
+      povline          = povlines,
+      headcount        = res[, 1],
+      poverty_gap      = res[, 2],
+      poverty_severity = res[, 3],
+      watts            = watts_vec)
+  }
+
+  out
+
+}
+
+
+
+#' Efficient FGT calculation for vectors (no data.table)
+#'
+#' @param w numeric: vector of welfare values
+#' @param wt numeric: vector of weights
+#' @param povlines double: vector with poverty lines
+#'
+#' @return data.table with estimated poverty statistics
+#' @keywords internal
+compute_fgt <- function(w, wt, povlines) {
+  m <- length(povlines)
+
+  # Pre-allocate result matrix
+  res <- matrix(NA_real_, nrow = m, ncol = 3)
+  colnames(res) <- c("FGT0", "FGT1", "FGT2")
+  watts_vec <- numeric(m)
+
+  # Precompute log(w) for efficiency (vectorized)
+
+  pos <- w > 0
+  # logw <- log(w)
+  # logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |>
+  #   suppressWarnings()
+  # logw <- fifelse(w > 0, log(w), NA_real_)
+  logw <- log(w) |>
+    suppressWarnings()
+
+  tot_pop <- fsum(wt)
+
+  for (i in seq_along(povlines)) {
+    pov <- povlines[i]
+    poor <- w < pov
+    rel_dist <- 1 - (w / pov)
+    setv(rel_dist, poor, 0, invert = TRUE)
+    # rel_dist[!poor] <- 0
+    res[i, 1] <- fmean(poor, w = wt)       # FGT0
+    res[i, 2] <- fmean(rel_dist, w = wt)   # FGT1
+    res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
+
+    # Optimized Watts index calculation
+    keep <- poor & pos
+    if (any(keep, na.rm = TRUE)) {
+      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / tot_pop
+    } else {
+      watts_vec[i] <- 0
+    }
+  }
+
+
+  data.table(
+    povline          = povlines,
+    headcount        = res[, 1],
+    poverty_gap      = res[, 2],
+    poverty_severity = res[, 3],
+    watts            = watts_vec)
+
+}
+
+#' Compute FGT using indices by reporting level
+#'
+#' This function is intended to be used inside [map_fgt]
+#'
+#' @param x data.table from lt list, with welfare and weight vectors
+#' @param y list of indices for each reporting level
+#' @param nx name of data table. Usually country code and year in the form "CCC_YYYY"
+#'
+#' @rdname map_fgt
+#' @keywords internal
+DT_fgt_by_rl <- \(x, y, nx, povline) {
+  uni_rl <- names(y) |>
+    unique()
+  DT_fgt <- lapply(uni_rl, \(rl) {
+
+    idx <- y[[rl]]
+    w  <- x[idx, welfare]
+    wt <- x[idx, weight]
+    RL <- compute_fgt(w = w, wt = wt, povlines = povline)
+    RL[, reporting_level := rl]
+
+  }) |>
+    rbindlist(fill = TRUE)
+
+
+  DT_fgt[, `:=`(
+    country_code   = gsub("([^_]+)(_.+)", "\\1", nx),
+    reporting_year = gsub("(.+_)([^_]+)", "\\2", nx)
+  )]
+}
+
+
+
+#' Join reporting level and lt list into one data.table
+#'
+#' @rdname map_fgt
+lt_to_dt <- \(x, y, nx, povline) {
+  DT <- lapply(names(y), \(rl) {
+
+    idx <- y[[rl]]
+    x[idx, reporting_level := rl]
+
+  }) |>
+    rbindlist(fill = TRUE)
+
+
+  DT[, `:=`(
+    country_code   = gsub("([^_]+)(_.+)", "\\1", nx),
+    reporting_year = gsub("(.+_)([^_]+)", "\\2", nx)
+  )]
+}
+
+#' Map lt_to_dt
+#'
+#' @rdname map_fgt
+map_lt_to_dt <- \(lt, l_rl_rows, povline) {
+  Map(lt_to_dt, lt, l_rl_rows, names(lt),
+      MoreArgs = list(povline = povline)) |>
+    rbindlist(fill = TRUE)
+}
+
+
+#' Map over a list of data.tables and indices to compute FGT by reporting_level
+#'
+#' @param lt list of data.tables with welfare and weight data
+#' @param l_rl_rows list of indices
+#'
+#' @return data.table with all measures
+#' @keywords internal
+map_fgt <- \(lt, l_rl_rows, povline) {
+  Map(DT_fgt_by_rl, lt, l_rl_rows, names(lt),
+      MoreArgs = list(povline = povline)) |>
+    rbindlist(fill = TRUE)
+}
+
+process_dt <- function(dt, povline,
+                       mean_and_med = FALSE,
+                       id_var = "file") {
+  byvars <- c(id_var, "reporting_level")
+  dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med),
+     by = byvars]
+}
+
+#' Load survey year files and store them in a list
+#'
+#' @param metadata data frame from `subset_lkup()`
+#'
+#' @return list with survey years data
+#' @keywords internal
+load_data_list <- \(metadata) {
+
+  # unique values
+  mdout <- metadata[, lapply(.SD, list), by = path]
+  upaths <- mdout$path
+  urep_level <- mdout$reporting_level
+  uppp <- mdout$ppp
+  ucpi <- mdout$cpi
+
+  seq_along(upaths) |>
+    lapply(\(f) {
+      path <- upaths[f]
+      rep_level <- urep_level[f][[1]]
+      ppp <- uppp[f][[1]]
+      cpi <- ucpi[f][[1]]
+
+      # Build a data.table to merge cpi and ppp
+      fdt <- data.table(reporting_level = as.character(rep_level),
+                        ppp = ppp,
+                        cpi = cpi)
+
+      # load data and format
+      dt <- fst::read_fst(path, as.data.table = TRUE)
+
+      if (length(rep_level) == 1) {
+        if (rep_level == "national") dt[, area := "national"]
+      }
+      setnames(dt, "area", "reporting_level")
+      dt[,
+         `:=`(
+           file = basename(path),
+           reporting_level = as.character(reporting_level)
+         )
+      ]
+
+      dt <- join(dt, fdt,
+                 on = "reporting_level",
+                 validate = "m:1",
+                 how = "left",
+                 verbose = 0)
+
+      dt[, welfare := welfare/(cpi * ppp)
+         ][,
+           c("cpi", "ppp") := NULL]
+
+    })
+
+}
+
+
+
+pov_from_DT <- function(DT, povline, g, cores = 1) {
+  w <- DT$welfare
+  wt <- DT$weight
+  n_pov <- length(povline)
+
+  ng <- g$N.groups
+  grp_ids <- qDT(g$groups)
+
+  # Precompute log(w) for efficiency
+  pos <- w > 0
+  logw <- fifelse(pos, log(w), NA_real_)
+
+  # Prepare result lists
+  fgt0 <- vector("list", n_pov)
+  fgt1 <- vector("list", n_pov)
+  fgt2 <- vector("list", n_pov)
+  watts <- vector("list", n_pov)
+
+  for (i in seq_along(povline)) {
+    pov <- povline[i]
+    poor <- w < pov
+    rel_dist <- fifelse(poor, 1 - w/pov, 0)
+    keep <- poor & pos
+    watts_val <- fmean((log(pov) - logw) * keep,
+                       g = g, w = wt, nthreads = cores)
+    fgt0[[i]] <- fmean(poor, g = g, w = wt,
+                       nthreads = cores)
+    fgt1[[i]] <- fmean(rel_dist, g = g, w = wt,
+                       nthreads = cores)
+    fgt2[[i]] <- fmean(rel_dist^2, g = g, w = wt,
+                       nthreads = cores)
+    watts[[i]] <- watts_val
+  }
+
+  out <- data.table(
+    povline = rep(povline, each = ng),
+    fgt0  = unlist(fgt0),
+    fgt1  = unlist(fgt1),
+    fgt2  = unlist(fgt2),
+    watts = unlist(watts)
+  )
+  # Repeat group columns for each povline
+  grp_dt <- grp_ids[rep(seq_len(ng), times = n_pov)]
+  add_vars(out, pos = "front") <- grp_dt
+  out
+}
+
+
+
+
+
+# pov_from_DT2 <- function(DT, povline, g) {
+#   fgt0 <- numeric(length(povline))
+#   fgt1 <- numeric(length(povline))
+#   fgt2 <- numeric(length(povline))
+#   w <- DT$welfare
+#   wt <- DT$weight
+#
+#
+#   for (i in seq_along(povline)) {
+#     pov <- povline[i]
+#     poor <- w < pov
+#     rel_dist <- fifelse(poor, 1 - w/pov, 0)
+#     fgt0[i] <- fmean(poor, g = g, w = wt)
+#     fgt1[i] <- fmean(rel_dist, g = g, w = wt)
+#     fgt2[i] <- fmean(rel_dist^2, g = g, w = wt)
+#   }
+#
+#   list(fgt0 = fgt0, fgt1 = fgt1, fgt2 = fgt2)
+# }
+
+
diff --git a/R/compute_fgt_old.R b/R/compute_fgt_old.R
new file mode 100644
index 00000000..cf999b08
--- /dev/null
+++ b/R/compute_fgt_old.R
@@ -0,0 +1,112 @@
+#' OLD: Efficient FGT calculation for a data.table and vector of poverty lines
+#'
+#' @param dt data frame with `welfare` and `weight` columns
+#' @param welfare character: welfare variable name
+#' @param weight character: weight variable name
+#' @param povlines double: vector with poverty lines
+#'
+#' @return data.table with estimated poverty statistics
+#' @keywords internal
+compute_fgt_dt_old <- function(dt, welfare, weight, povlines) {
+  w <- dt[[welfare]]
+  wt <- dt[[weight]]
+  n <- length(w)
+  m <- length(povlines)
+
+  # Pre-allocate result matrix
+  res <- matrix(NA_real_, nrow = m, ncol = 3)
+  colnames(res) <- c("FGT0", "FGT1", "FGT2")
+  watts_vec <- numeric(m)
+
+  # Precompute log(w) for efficiency
+  logw <- rep(NA_real_, n)
+  pos <- w > 0
+  logw[pos] <- log(w[pos])
+
+  for (i in seq_along(povlines)) {
+    pov <- povlines[i]
+    poor <- w < pov
+    rel_dist <- 1 - (w / pov)
+    rel_dist[!poor] <- 0
+    res[i, 1] <- fmean(poor, w = wt)       # FGT0
+    res[i, 2] <- fmean(rel_dist, w = wt)   # FGT1
+    res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
+
+    # Optimized Watts index calculation
+    keep <- poor & pos
+    if (any(keep, na.rm = TRUE)) {
+      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt)
+    } else {
+      watts_vec[i] <- 0
+    }
+  }
+  data.table(
+    povline          = povlines,
+    headcount        = res[, 1],
+    poverty_gap      = res[, 2],
+    poverty_severity = res[, 3],
+    watts            = watts_vec
+  )
+}
+
+
+process_dt_old <- function(dt, povline) {
+  dt[, compute_fgt_dt_old(.SD, "welfare", "weight", povline),
+     by = .(file, reporting_level)]
+}
+
+#' OLD: load survey year files and store them in a list
+#'
+#' @param metadata data frame from `subset_lkup()`
+#'
+#' @return list with survey years data
+#' @keywords internal
+load_data_list_old <- \(metadata) {
+
+  # unique values
+  mdout <- metadata[, lapply(.SD, list), by = path]
+  upaths <- mdout$path
+  urep_level <- mdout$reporting_level
+  uppp <- mdout$ppp
+  ucpi <- mdout$cpi
+
+  seq_along(upaths) |>
+    lapply(\(f) {
+      path <- upaths[f]
+      rep_level <- urep_level[f][[1]]
+      ppp <- uppp[f][[1]]
+      cpi <- ucpi[f][[1]]
+
+      # Build a data.table to merge cpi and ppp
+      fdt <- data.table(reporting_level = as.character(rep_level),
+                        ppp = ppp,
+                        cpi = cpi)
+
+      # load data and format
+      dt <- fst::read_fst(path, as.data.table = TRUE)
+
+      if (length(rep_level) == 1) {
+        if (rep_level == "national") dt[, area := "national"]
"national"] + } + setnames(dt, "area", "reporting_level") + dt[, + `:=`( + file = basename(path), + reporting_level = as.character(reporting_level) + ) + ] + + dt <- join(dt, fdt, + on = "reporting_level", + validate = "m:1", + how = "left", + verbose = 0) + + dt[, welfare := welfare/(cpi * ppp) + ][, + c("cpi", "ppp") := NULL] + + }) + +} diff --git a/R/create_countries_vctr.R b/R/create_countries_vctr.R index 230118c4..4cb985f1 100644 --- a/R/create_countries_vctr.R +++ b/R/create_countries_vctr.R @@ -6,14 +6,15 @@ #' #' @inheritParams pip #' @param valid_years list: Valid years information provided through lkup object -#' @param aux_files list: List of auxiliary tables provided through lkup object +#' @param lkup lkup object #' #' @return a list of vectors with countries and regions code to be used in #' `pip()` and `pip_grp()` create_countries_vctr <- function(country, year, - valid_years, - aux_files) { + lkup) { + valid_years <- lkup$valid_years + aux_files <- lkup$aux_files # STEP 1: Setup ---- ## init Return list ---- @@ -53,7 +54,12 @@ create_countries_vctr <- function(country, } # STEP 2: Identify regions ---- ## All available aggregates ---- - aggs <- aux_files$regions ## all aggregates + aggs <- aux_files$regions |> ## all aggregates + # temporal filter + _[grouping_type %in% c("region", "regionpcn", "africa_split", "word")] + + + ## Official grouping type ---- off_gt <- c("region", "world") #c("region") ## Official valid region codes ---- diff --git a/R/create_lkups.R b/R/create_lkups.R index add360c4..a6086bd6 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -11,14 +11,13 @@ create_versioned_lkups <- vintage_pattern <- create_vintage_pattern_call(vintage_pattern) - data_dirs <- extract_data_dirs(data_dir = data_dir, vintage_pattern = vintage_pattern) versions <- names(data_dirs) # versions[1] <- "latest_release" - - versions_paths <- lapply(data_dirs, create_lkups, versions = versions) + versions_paths <- mapply(create_lkups, data_dirs, versions, + SIMPLIFY = FALSE, USE.NAMES = FALSE) names(versions_paths) <- versions return(list(versions = versions, @@ -36,6 +35,9 @@ extract_data_dirs <- function(data_dir, vintage_pattern ) { + + + # List data directories under data_dir data_dirs <- fs::dir_ls(data_dir, type = "directory") @@ -72,6 +74,10 @@ extract_data_dirs <- #' @return list create_lkups <- function(data_dir, versions) { + + # Use new lineup approach? ----- + use_new_lineup_version <- use_new_lineup_version(versions) + # Get survey paths ---- paths <- list.files(fs::path(data_dir, "survey_data")) paths_ids <- tools::file_path_sans_ext(paths) @@ -86,6 +92,7 @@ create_lkups <- function(data_dir, versions) { ## country_list ---- cl_lkup_path <- fs::path(data_dir, "_aux/country_list.fst") country_list <- fst::read_fst(cl_lkup_path, as.data.table = TRUE) + data.table::setnames(country_list, 'region', 'region_name') # Why is this necessary? 
## countries ---- cts_path <- fs::path(data_dir, "_aux/countries.fst") @@ -176,6 +183,244 @@ create_lkups <- function(data_dir, versions) { by = .(interpolation_id)] + + + # ZP ADD - CREATE OBJECT: refy_lkup + + # CREATE OBJECT: refy_lkup ------------- + #___________________________________________________________________________ + if (use_new_lineup_version) { + refy_lkup_path <- fs::path(data_dir, + "estimations/prod_refy_estimation.fst") + + # NOTE: THIS `prod_refy_estimation.fst` is the refy table but + # unique at the country-year level + refy_lkup <- fst::read_fst(refy_lkup_path, + as.data.table = TRUE) + + ## TEMP START: add distribution type ----------- + dt <- ref_lkup[, .(country_code, + reporting_year, + welfare_type, + reporting_level, + distribution_type)] + + + dt[, + y := as.integer(length(unique(distribution_type)) == 1), + by = .(country_code, + reporting_year, + welfare_type, + reporting_level) + ] + dt[y == 0, + distribution_type := "mixed" + ][, y := NULL] + + dt <- funique(dt) + refy_lkup <- joyn::joyn(refy_lkup, dt, + by = c("country_code", + "reporting_year", + "welfare_type", + "reporting_level"), + match_type = "1:1", + keep = "left", + update_values = TRUE, + reportvar = FALSE) + + + ## TEMP END: add distribution type ----------- + + + + # ZP ADD - CREATE OBJECT: lineup years + #______________________________________________________________ + lineup_years_path <- + fs::path(data_dir, + "estimations/lineup_years.fst") + + lineup_years <- fst::read_fst(lineup_years_path) |> + as.list() # Why Is this a list? + + + + # --- START NOTE AC> Include here the refy_lkup for CMD + ncountries <- nrow(country_list) + ly <- lineup_years$lineup_years + + cmd <- fs::path(data_dir, + "_aux/missing_data.fst") |> + fst::read_fst(as.data.table = TRUE) |> + fselect(country_code, + reporting_year = year, + welfare_type) + + + # build some variables + cmd[, + `:=`( + survey_coverage = "national", + reporting_level = "national", + distribution_type = "CMD distribution", + is_interpolated = FALSE, + is_used_for_line_up = TRUE, + is_used_for_aggregation = FALSE, + estimation_type = "CMD estimation", + display_cp = "0", + monotonic = TRUE, # ? + same_direction = TRUE, # NA ? 
+        relative_distance = 1,
+        lineup_approach = "CMD",
+        mult_factor = 1,
+        wt_code = toupper(substr(welfare_type, 1, 3))
+      )][,
+         cache_id := paste(country_code,
+                           reporting_year,
+                           paste0("NOSVY_D1_", wt_code, "_CMD"),
+                           sep = "_")
+      ][, wt_code := NULL]
+
+    # Append lineup and CMD info
+
+    refy_lkup <- rbindlist(list(refy_lkup, cmd),
+                           use.names = TRUE,
+                           fill = TRUE)
+
+
+    # Create additional variables
+    refy_lkup[,
+              path := {
+                fs::path(data_dir,
+                         "lineup_data",
+                         paste0(country_code,
+                                "_",
+                                reporting_year),
+                         ext = "fst") |>
+                  as.character()
+              }
+    ]
+    refy_lkup[,
+              interpolation_id := paste(country_code,
+                                        reporting_year,
+                                        reporting_level,
+                                        sep = "_")]
+
+    # if ("region_code" %in% names(refy_lkup)) {
+    #   refy_lkup[,
+    #             region_code := NULL]
+    # }
+
+
+    refy_lkup[,
+              data_interpolation_id := paste(cache_id,
+                                             reporting_level,
+                                             sep = "_")
+    ]
+
+    refy_lkup[,
+              data_interpolation_id := paste(unique(data_interpolation_id),
+                                             collapse = "|"),
+              by = .(interpolation_id)]
+
+
+    # Temporary fix
+    refy_lkup <- joyn::joyn(refy_lkup, country_list,
+                            by = "country_code",
+                            keep = "left",
+                            reportvar = FALSE,
+                            match_type = "m:1",
+                            update_values = TRUE)
+
+
+    ## TEMP START: fix ARG population ----
+    pw <- pivot(pop,
+                ids = c("country_code", "data_level"),
+                names = list(variable = "reporting_year",
+                             value = "reporting_pop"),
+                how = "longer") |>
+      pivot(how = "wider",
+            ids = c("country_code", "reporting_year"),
+            values = "reporting_pop",
+            names = "data_level") |>
+      setorder(country_code, reporting_year)
+
+    pw[country_code != "CHN", `:=`(
+      urban = national,
+      rural = national
+    )]
+
+    ## TEMP END: fix ARG population ------
+
+    popl <- pivot(pw,
+                  ids = c("country_code", "reporting_year"),
+                  names = list(variable = "reporting_level",
+                               value = "reporting_pop"),
+                  how = "longer") |>
+      ftransform(reporting_year  = as_integer_factor(reporting_year),
+                 reporting_level = as_character_factor(reporting_level)) |>
+      setkey(NULL)
+
+
+    refy_lkup <- joyn::joyn(refy_lkup, popl,
+                            by = c('country_code',
+                                   'reporting_year',
+                                   'reporting_level'),
+                            keep = "left",
+                            reportvar = FALSE,
+                            match_type = "1:1",
+                            update_values = TRUE)
+
+
+    # --- END inclusion of CMD data.
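+    # Minimal sketch of the pivot round-trip used above, with hypothetical
+    # toy values (not real PIP data):
+    #   pop_toy <- data.table(country_code = "AAA", data_level = "national",
+    #                         `2020` = 10, `2021` = 11)
+    #   pivot(pop_toy, ids = c("country_code", "data_level"),
+    #         names = list(variable = "reporting_year",
+    #                      value = "reporting_pop"),
+    #         how = "longer")
+    #   # -> one row per (country_code, data_level, reporting_year), which is
+    #   #    then widened by data_level so urban/rural can be filled from
+    #   #    national for every country except CHN.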
+ + refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ] + + gv(refy_lkup, + c("monotonic", + "same_direction", + "mult_factor", + "nac", + "nac_sy", + "svy_mean", + #"data_interpolation", + "relative_distance")) <- NULL + gv(ref_lkup, + c("monotonic", + "same_direction", + "nac", + "nac_sy", + "svy_mean", + #"data_interpolation", + "relative_distance")) <- NULL + + + + + # ZP ADD - CREATE OBJECT: lineup dist stats + #___________________________________________________________________________ + lineup_dist_stats <- + fs::path(data_dir, + "estimations/lineup_dist_stats.fst") + + lineup_dist_stats <- fst::read_fst(lineup_dist_stats, + as.data.table = TRUE) |> + fmutate(file = paste(country_code, + reporting_year, + sep = "_")) + gv(lineup_dist_stats, + c("min", + "max")) <- NULL + + } + + + + + + + #___________________________________________________________________________ + + # CREATE OBJECT: interpolation_list ---- # This is to facilitate interpolation computations unique_survey_files <- unique(ref_lkup$data_interpolation_id) @@ -414,6 +659,11 @@ create_lkups <- function(data_dir, versions) { # CREATE OBJECT: valid_years ---- valid_years <- valid_years(data_dir) + if (use_new_lineup_version) { + valid_years <- c(valid_years, + lineup_years) # add lineup years + + } # CREATE OBJECT: query_controls ---- # Create list of query controls @@ -435,12 +685,17 @@ create_lkups <- function(data_dir, versions) { ## `pipapi` runs onto, such as path to data files, which will not be the same ## on my laptop, and on the PROD VM. These variables therefore need to be removed ## prior to the creation of the cache_data_id - hash_svy_lkup <- svy_lkup + hash_svy_lkup <- data.table::copy(svy_lkup) hash_svy_lkup$path <- NULL - hash_ref_lkup <- ref_lkup + if (use_new_lineup_version) { + hash_ref_lkup <- data.table::copy(refy_lkup) + } else { + hash_ref_lkup <- data.table::copy(ref_lkup) + } hash_ref_lkup$path <- NULL + query_controls_hash <- query_controls query_controls_hash$version <- NULL @@ -519,22 +774,27 @@ create_lkups <- function(data_dir, versions) { # Create list of lkups lkup <- list( - svy_lkup = svy_lkup, - ref_lkup = ref_lkup, - dist_stats = dist_stats, - pop_region = pop_region, - cp_lkups = cp_lkups, - pl_lkup = pl_lkup, - censored = censored, - aux_files = aux_files, - return_cols = return_cols, - query_controls = query_controls, - data_root = data_dir, - aux_tables = aux_tables, - interpolation_list = interpolation_list, - valid_years = valid_years, - cache_data_id = cache_data_id - ) + svy_lkup = svy_lkup, + ref_lkup = ref_lkup, + dist_stats = dist_stats, + pop_region = pop_region, + cp_lkups = cp_lkups, + pl_lkup = pl_lkup, + censored = censored, + aux_files = aux_files, + return_cols = return_cols, + query_controls = query_controls, + data_root = data_dir, + aux_tables = aux_tables, + interpolation_list = interpolation_list, + valid_years = valid_years, + cache_data_id = cache_data_id, + use_new_lineup_version = use_new_lineup_version) + + if (use_new_lineup_version) { + lkup$refy_lkup <- refy_lkup + lkup$lineup_dist_stats <- lineup_dist_stats + } return(lkup) } @@ -734,3 +994,38 @@ available_versions <- function(data_dir) { test_regex = vintage_pattern$test_regex) } + + +#' Should the new lineup approach be used? 
+#'
+#' Check if the date in a string is more recent than May 2025
+#'
+#' This function extracts the first 8 characters from an input string,
+#' interprets them as a date in the format \code{YYYYMMDD}, and checks
+#' whether this date is more recent than May 1st, 2025.
+#'
+#' @param x A character vector where each element starts with an
+#'   8-digit date in the format \code{YYYYMMDD}.
+#'
+#' @return A logical vector: \code{TRUE} if the extracted date is
+#'   after May 1st, 2025, otherwise \code{FALSE}.
+#'
+#' @examples
+#' use_new_lineup_version("20250401_2021_01_02_PROD") # FALSE
+#' use_new_lineup_version("20250615_2021_01_02_PROD") # TRUE
+#'
+#' @export
+use_new_lineup_version <- function(x) {
+  # Extract YYYYMMDD
+  date_str <- substr(x, 1, 8)
+
+  # Convert to Date
+  date_val <- as.Date(date_str, format = "%Y%m%d")
+
+  # Threshold date
+  threshold <- as.Date("2025-05-01")
+
+  # Compare
+  date_val > threshold
+}
+
diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index 70779668..12b47e80 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -10,25 +10,42 @@ return_if_exists <- function(slkup,
                              fill_gaps,
                              verbose = getOption("pipapi.verbose")) {
 
+  # none selected
   if (fnrow(slkup) == 0 ) {
     return(list(data_present_in_master = NULL,
-                lkup = slkup,
-                povline = povline))
+                lkup = slkup,
+                povline = povline))
   }
-
+  # don't use cache
   if (getOption("pipapi.query_live_data")) {
     return(list(data_present_in_master = NULL,
-                lkup = slkup,
-                povline = povline))
+                lkup = slkup,
+                povline = povline))
   }
 
-  master_file <- load_inter_cache(cache_file_path = cache_file_path,
-                                  fill_gaps = fill_gaps)
+  # load cache
+  # ZP new temp code to avoid error from load_inter_cache due to dbConnect
+  master_file <- tryCatch(
+    load_inter_cache(cache_file_path = cache_file_path,
+                     fill_gaps = fill_gaps),
+    error = function(e) {
+      cli::cli_warn("Failed to load intermediate cache: {e$message}")
+      slkup[0] # zero-row data.table with same columns as slkup
+    }
+  )
+  # temp ZP just to bypass cache
+  # master_file <- slkup[0]
+
+  # ZP old code:
+  # master_file <- load_inter_cache(cache_file_path = cache_file_path,
+  #                                 fill_gaps = fill_gaps)
+
+  # if no cached files, return selected lkup
   if (fnrow(master_file) == 0) {
     return(list(data_present_in_master = NULL,
-                lkup = slkup,
-                povline = povline))
+                lkup = slkup,
+                povline = povline))
   }
 
@@ -37,7 +54,10 @@ return_if_exists <- function(slkup,
     # convert survey_comparability to NA
     # NOTE: This should not be necessary. for the new lineup distribution
     # metadata should come without this variable.
-    slkup[, survey_comparability := NA]
+
+    # ZP comment: if using refy_lkup, this should be removed because
+    # it does not include survey_comparability
+    slkup[, survey_comparability := NA_real_]
 
   } else {
 
@@ -47,9 +67,13 @@ return_if_exists <- function(slkup,
 
   # This is probably unnecessary
+  # ZP comment: in my quick checks this has no impact, meaning
+  # slkup is already unique
+  # ZP question: is this to get rid of duplicates from df_refy?
   lkup_kvars <- slkup |>
     copy() |>
     funique() # this is not big.
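+  # Toy illustration (hypothetical values) of the anti-join performed below:
+  #   join(x = data.table(id = 1:3), y = data.table(id = 2:3),
+  #        on = "id", how = "anti")
+  #   # -> the row of x with id == 1, i.e. lkup rows whose
+  #   #    (key_vars, poverty_line) combination is NOT yet in the cache.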
+  # get all vars
   slkup_vars <- setdiff(names(slkup), key_vars)
   # transform to NA when necessary
@@ -75,15 +99,15 @@ return_if_exists <- function(slkup,
   # lkup_kvars_pov <- lkup_kvars[, .(poverty_line = povline),
   #                              by = eval(names(lkup_kvars))]
   lkup_kvars_pov <- lkup_kvars[rep(seq_len(nrow(lkup_kvars)),
-                                   each = length(povline))]
+                                   each = length(povline))]
   # ZP: add povline
   lkup_kvars_pov[, poverty_line := rep(povline, times = nrow(lkup_kvars))]
 
   # Find which (key_vars, poverty_line) are present in master_file
   lk_not_ms <- join(x = lkup_kvars_pov,
-                    y = master_file,
+                    y = master_file, # ZP: remember, master_file is the full cache file
                     on = key_vars_pl,
-                    how = "anti",
+                    how = "anti", # rows in lkup not in master_file, i.e. what still needs to be computed
                     # validate = "1:1",
                     overid = 2,
                     verbose = 0,
@@ -100,8 +124,13 @@ return_if_exists <- function(slkup,
                     verbose = 0,
                     multiple = TRUE)
 
+  # now we have two dfs: lk_not_ms and data_present_in_master,
+  # which give the lkup rows not in the cache (master_file)
+  # and the lkup rows in the cache (master_file)
 
   # If no data is present in master
+  # i.e. if there are no common rows between the two
   if (fnrow(data_present_in_master) == 0) {
     return(list(data_present_in_master = NULL,
                 lkup = slkup,
                 povline = povline))
@@ -177,9 +206,35 @@ return_if_exists <- function(slkup,
 update_master_file <- function(dat,
                                cache_file_path,
                                fill_gaps,
-                               verbose = getOption("pipapi.verbose")
+                               verbose = getOption("pipapi.verbose"),
+                               decimal = 2
                                ) {
 
+  # select the right lines
+  pl <- get_from_pipapienv("pl_to_store")
+
+
+  # Keep only rows whose poverty line has at most `decimal` decimal places
+  to_keep <- get_vars(dat, "poverty_line") |>
+    reg_elem() |> # extract vectors
+    as.character() |>
+    sub("^[^.]*\\.?", "", x = _) |> # get only the decimal part
+    (\(x) which(nchar(x) <= decimal))()
+
+  dat <- dat[to_keep]
+
+  povline <- dat[, poverty_line] |>
+    unique()
+
+  # Keep only those that belong to the list
+  wpl <- povline[povline %in% round(pl, decimal)]
+
+  if (length(wpl) == 0) return(invisible(FALSE))
+
+  dat <- dat[poverty_line %in% wpl]
+
+  if (nrow(dat) == 0) return(invisible(FALSE))
+
   write_con <- connect_with_retry(cache_file_path, read_only = FALSE)
 
   if (fill_gaps) {
@@ -188,8 +243,6 @@ update_master_file <- function(dat,
     keep_vars <- c(
       "interpolation_id",
       "poverty_line",
-      "mean",
-      "median",
       "headcount",
       "poverty_gap",
       "poverty_severity",
@@ -204,8 +257,6 @@ update_master_file <- function(dat,
       "cache_id",
       "reporting_level",
       "poverty_line",
-      "mean",
-      "median",
      "headcount",
      "poverty_gap",
      "poverty_severity",
      "watts"
     )
   }
 
+  # Get column names from the DuckDB table
+  table_info <- DBI::dbGetQuery(write_con, glue("PRAGMA table_info({target_file})"))
+  col_names <- table_info$name
+  # Add mean and median if present in the table
+  if (all(c("mean", "median") %in% col_names)) {
+    keep_vars <- c(keep_vars, "mean", "median")
+  }
+
   # Select variables
   dat <- dat[, ..keep_vars]
 
@@ -232,6 +291,7 @@ update_master_file <- function(dat,
               "))
 
   duckdb::dbDisconnect(write_con)
+
   if (nr > 0 && verbose) message(glue("{target_file} is updated."))
 
   return(nr)
@@ -263,8 +323,11 @@ connect_with_retry <- function(db_path = NULL,
         message("Attempt ", attempt, " failed: ", conditionMessage(e))
       }
 
+      # if (attempt == max_attempts) {
+      #   stop("Failed to connect after ", max_attempts, " attempts.")
+      # }
       if (attempt == max_attempts) {
-        stop("Failed to connect after ", max_attempts, " attempts.")
+        stop("Failed to connect after ", max_attempts, " attempts.\nLast error: ", conditionMessage(e))
       }
       Sys.sleep(delay_sec)
       attempt <<- attempt + 1
@@
-299,6 +362,7 @@ reset_cache <- function(pass = Sys.getenv('PIP_CACHE_LOCAL_KEY'), DBI::dbExecute(write_con, "DELETE from fg_master_file") } duckdb::dbDisconnect(write_con) + } create_duckdb_file <- function(cache_file_path) { @@ -307,8 +371,7 @@ create_duckdb_file <- function(cache_file_path) { cache_id VARCHAR, reporting_level VARCHAR, poverty_line DOUBLE, - mean DOUBLE, - median DOUBLE, + headcount DOUBLE, poverty_gap DOUBLE, poverty_severity DOUBLE, @@ -318,14 +381,14 @@ create_duckdb_file <- function(cache_file_path) { DBI::dbExecute(con, "CREATE OR REPLACE table fg_master_file ( interpolation_id VARCHAR, poverty_line DOUBLE, - mean DOUBLE, - median DOUBLE, + headcount DOUBLE, poverty_gap DOUBLE, poverty_severity DOUBLE, watts DOUBLE )") DBI::dbDisconnect(con) + } @@ -360,6 +423,7 @@ load_inter_cache <- function(lkup = NULL, # connection object if it is not closed More details here # https://app.clickup.com/t/868cdpe3q duckdb::dbDisconnect(con) + setDT(master_file) } diff --git a/R/fg_pip.R b/R/fg_pip.R index 95c67653..70111a53 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -12,176 +12,187 @@ fg_pip <- function(country, welfare_type, reporting_level, ppp, - lkup) { + lkup, + pipenv = NULL) { valid_regions <- lkup$query_controls$region$values interpolation_list <- lkup$interpolation_list data_dir <- lkup$data_root - ref_lkup <- lkup$ref_lkup + refy_lkup <- lkup$refy_lkup # cleaned refy table, unique by country- + + #povline is set to NULL if popshare is given + if (!is.null(popshare)) povline <- NULL + if (is.list(povline)) povline <- unlist(povline) cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") - # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created - # when calling from `pip`. For other functions we create it here. - # if (is.null(con)) { - # cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") - # con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = cache_file_path, read_only = TRUE) - # } + # Handle interpolation metadata <- subset_lkup( country = country, year = year, welfare_type = welfare_type, reporting_level = reporting_level, - lkup = ref_lkup, + lkup = refy_lkup, # only place this is used, for 'interpolation_id' valid_regions = valid_regions, data_dir = data_dir, povline = povline, cache_file_path = cache_file_path, - fill_gaps = TRUE - ) + fill_gaps = TRUE) + data_present_in_master <- metadata$data_present_in_master povline <- metadata$povline - metadata <- metadata$lkup - # Remove aggregate distribution if popshare is specified - # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions - metadata <- filter_lkup(metadata = metadata, - popshare = popshare) - setDT(metadata) - + metadata <- metadata$lkup |> + setDT() - # Return empty dataframe if no metadata is found + # Return empty dataframe if no metadata is found (i.e. all in cache) if (nrow(metadata) == 0) { - return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master)) + #print("ZP: no metadata - i.e. 
nothing additional to estimate") + return(list(main_data = pipapi::empty_response_fg, + data_in_cache = data_present_in_master)) } - unique_survey_files <- unique(metadata$data_interpolation_id) - - # Interpolation list - interpolation_list <- interpolation_list[names(interpolation_list) %in% unique_survey_files] - - # Unique set of survey data to be read - out <- vector(mode = "list", length = length(unique_survey_files)) - - #NEW: iterate over survey files - for (svy_id in seq_along(unique_survey_files)) { - # Extract country-years for which stats will be computed from the same files - # tmp_metadata <- interpolation_list[[unique_survey_files[svy_id]]]$tmp_metadata - iteration <- interpolation_list[[unique_survey_files[svy_id]]] - svy_data <- get_svy_data(svy_id = iteration$cache_ids, - reporting_level = iteration$reporting_level, - path = iteration$paths) - - # Extract unique combinations of country-year - ctry_years <- subset_ctry_years(country = country, - year = year, - lkup = iteration$ctry_years, - valid_regions = valid_regions, - data_dir = data_dir) - - # Join because some data might be coming from cache so it might be absent in - # metadata - ctry_years <- collapse::join(ctry_years, metadata |> - collapse::fselect(intersect(names(ctry_years), - names(metadata))), - verbose = 0, - how = "inner", - overid = 2) - - results_subset <- vector(mode = "list", length = nrow(ctry_years)) - - for (ctry_year_id in seq_along(ctry_years$interpolation_id)) { - # Extract records to be used for a single country-year estimation - interp_id <- ctry_years[["interpolation_id"]][ctry_year_id] - tmp_metadata <- metadata[metadata$interpolation_id == interp_id, ] - - report_year <- ctry_years[["reporting_year"]][ctry_year_id] - - # Compute estimated statistics using the fill_gap method - tmp_stats <- wbpip:::prod_fg_compute_pip_stats( - request_year = report_year, - data = svy_data, - predicted_request_mean = tmp_metadata[["predicted_mean_ppp"]], - svy_mean_lcu = tmp_metadata[["survey_mean_lcu"]], - svy_median_lcu = tmp_metadata$survey_median_lcu, - svy_median_ppp = tmp_metadata$survey_median_ppp, - survey_year = tmp_metadata[["survey_year"]], - default_ppp = tmp_metadata[["ppp"]], - ppp = ppp, - distribution_type = tmp_metadata[["distribution_type"]], - poverty_line = povline, - popshare = popshare - ) - - # Handle multiple distribution types (for aggregated distributions) - if (length(unique(tmp_metadata$distribution_type)) > 1) { - tmp_metadata[, distribution_type := "mixed"] + # Build a dictionary for encoding (id, reporting_level) pairs as integer codes. + dict <- build_pair_dict(lkup = lkup, + fill_gaps = TRUE) + + # Create a list of file paths for all surveys to be loaded, based on the filtered metadata. + full_list <- create_full_list(metadata = metadata) + + # Load all survey data files into a named list of data.tables, each with an id column. + lfst <- load_list_refy(input_list = full_list) + # Calculate and update poverty line if popshare is passed + # YES. this is INEFFICIENT because welfare cumsum is already created in + # the data, but we don't have time... 
FIX for the next release + if (!is.null(popshare)) { + povline <- lapply(lfst, \(x) { + # wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare) + uni_rl <- funique(x$reporting_level) + if (length(uni_rl) > 1) { + assume_sorted <- FALSE + } else { + assume_sorted <- TRUE } - # - # tmp_metadata <- unique(tmp_metadata) - # Add stats columns to data frame - - # Convert Statas into Data.table - ts_DT <- as.data.table(tmp_stats) - # Add reporting year to merge - ts_DT[, reporting_year := report_year] - - # convert survey_comparability to NA - # NOTE: This should not be necessary. for the new lineup distribution - # metadata should come without this variable. - tmp_metadata[, survey_comparability := NA] - - # get all vars - meta_vars <- setdiff(names(tmp_metadata), "reporting_year") - # transform to NA when necessary - tmp_metadata[, (meta_vars) := lapply(.SD, \(x) { - if (uniqueN(x) == 1) { - x - } else { - NA - }}), - by = reporting_year, .SDcols = meta_vars] - - # Remove duplicate rows by reporting_year (keep only one row per - # reporting_year) - tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year") - - # Now join as usual - - ts_md <- join(ts_DT, - tmp_metadata_unique, - on = "reporting_year", - how = "left", - verbose = 0, - overid = 2) - - results_subset[[ctry_year_id]] <- ts_md - } - out[[svy_id]] <- results_subset + infer_poverty_line(welfare = x$welfare, + weight = x$weight, + popshare = popshare, + include = FALSE, + method = "nearest", + assume_sorted = assume_sorted) + }) + + fgt <- Map(process_dt, lfst, povline, id_var = "id") |> + rbindlist(fill = TRUE) + + fgt[, `:=`( + country_code = gsub("(.+)(_.+)", "\\1", id), + reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id)) + )][, + id := NULL] + + + } else { + # Combine all loaded surveys into a single data.table, encode group identifiers, + # and create a GRP object for efficient grouping. + LDTg <- format_lfst(lfst = lfst, + dict = dict) + + # Compute the total population (sum of weights) for each group (id_rl) in + # the combined survey data. + tpop <- get_total_pop(LDTg = LDTg) + + # Compute FGT and Watts indices for all groups and poverty lines, then decode + # integer codes back to (country_code, reporting_year, reporting_level). + fgt <- fgt_cumsum(LDTg = LDTg, + tpop = tpop, + povline = povline) |> + decode_pairs(dict = dict) + + rm(LDTg) } - out <- unlist(out, recursive = FALSE) - out <- data.table::rbindlist(out) - - # Remove median - # out[, median := NULL] + rm(lfst) + + + + # Add just mean and median + res <- get_mean_median(fgt, lkup, fill_gaps = TRUE) + + + # try metadata unique code + tmp_metadata <- copy(metadata) # I think we can avoid this inefficiency. + # Handle multiple distribution types (for aggregated distributions) + + tmp_metadata[, + y := as.integer(length(unique(distribution_type)) == 1), + by = .(country_code, + reporting_year, + welfare_type, + reporting_level) + ] + + tmp_metadata[y == 0, + distribution_type := "mixed" + ][, + y := NULL] + + # convert survey_comparability to NA + # NOTE: This should not be necessary. for the new lineup distribution + # metadata should come without this variable. + tmp_metadata[, survey_comparability := NA] + # get all vars + meta_vars <- setdiff(names(tmp_metadata), "reporting_year") + # transform to NA when necessary - i.e. 
when interpolated (two rows per reporting_year)
+  tmp_metadata[,
+               (meta_vars) := lapply(.SD, \(x) {
+                 if (uniqueN(x) == 1) {
+                   x
+                 } else {
+                   NA
+                 }}),
+               by = c("reporting_year",
+                      "country_code",
+                      "reporting_level",
+                      "welfare_type"),
+               .SDcols = meta_vars]
+
+  # Remove duplicate rows by reporting_year (keep only one row per
+  # reporting_year)
+  tmp_metadata_unique <- funique(tmp_metadata)
+
+
+  out <- join(res,
+              tmp_metadata_unique,
+              on = c("country_code",
+                     "reporting_year",
+                     "reporting_level"),
+              how = "left", # ZP: changed from full to left;
+                            # this removes nowcast years, i.e. years not
+                            # included as lineup years
+              validate = "m:1",
+              drop.dup.cols = TRUE,
+              verbose = 0,
+              overid = 2)
+
+  setnames(out, "povline", "poverty_line")
 
   # Ensure that out does not have duplicates
-  out <- fg_remove_duplicates(out)
+  out <- fg_remove_duplicates(out,
+                              use_new_lineup_version = lkup$use_new_lineup_version)
 
-  # Fix issue with rounding of poverty lines
-  out[,
-      poverty_line := round(poverty_line, digits = 3) ]
 
-  # Formatting. MUST be done in data.table tom modify by reference
+  # Formatting. MUST be done in data.table to modify by reference
   out[, path := as.character(path)]
 
   if ("max_year" %in% names(out)) {
     out[, max_year := NULL]
   }
 
-  return(list(main_data = out, data_in_cache = data_present_in_master))
+  return(list(main_data = out,
+              data_in_cache = data_present_in_master))
+
 }
 
+
 #' Remove duplicated rows created during the interpolation process
 #'
 #' @param df data.table: Table of results created in `fg_pip()`
 #' @param cols character: Columns with potential duplicate values
 #'
 #' @return data.table
 #'
-
 fg_remove_duplicates <- function(df,
                                  cols = c("comparable_spell",
                                           "cpi",
                                           "display_cp",
                                           "gd_type",
                                           # "interpolation_id",
                                           "path",
                                           "predicted_mean_ppp",
                                           "survey_acronym",
                                           "survey_comparability",
                                           "survey_coverage",
                                           "survey_id",
                                           "survey_mean_lcu",
                                           "survey_mean_ppp",
                                           "survey_median_lcu",
                                           "survey_median_ppp",
                                           "survey_time",
                                           "survey_year",
-                                          "surveyid_year")) {
+                                          "surveyid_year"),
+                                 use_new_lineup_version = FALSE) {
+
+  if (isFALSE(use_new_lineup_version)) {
+    # not all cols need to be changed
+    cols <- setdiff(cols,
+                    colnames(df))
+    # Modify cache_id
+    # * Ensures that cache_id is unique for both extrapolated and interpolated surveys
+    # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows
+    # df$cache_id <- fg_standardize_cache_id(cache_id = df$cache_id,
+    #                                        interpolation_id = df$data_interpolation_id,
+    #                                        reporting_level = df$reporting_level)
+    # Set collapse vars to NA (by type)
+    df <- fg_assign_nas_values_to_dup_cols(df = df,
+                                           cols = cols)
+
+    # Ensure that out does not have duplicates
+    df <- unique(df)
+  }
 
-  # Ensure that out does not have duplicates
-  df <- unique(df)
   return(df)
 }
 
@@ -232,7 +251,6 @@ fg_remove_duplicates <- function(df,
 #' @param reporting_level character
 #'
 #' @return character
-
 fg_standardize_cache_id <- function(cache_id,
                                     interpolation_id,
                                     reporting_level) {
@@ -252,10 +270,28 @@ fg_standardize_cache_id <- function(cache_id,
 #' @inheritParams fg_remove_duplicates
 #'
 #' @return data.table
-
 fg_assign_nas_values_to_dup_cols <- function(df,
                                              cols) {
   #Classes are maintained by default.
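+  # Setting these columns to NA by reference collapses rows that differ
+  # only in survey-level metadata, so that the unique() call in
+  # fg_remove_duplicates() can drop the duplicates created by interpolation.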
df[, (cols) := NA] return(df) } + + + + + + +#' Create full list for fg data load, not including country-years in cache +#' +#' @param metadata data table from subset_lkup()$lkup +#' @return data.table +create_full_list <- function(metadata) { + + metadata[, path] |> + funique() + +} + + + diff --git a/R/fg_pip_old.R b/R/fg_pip_old.R new file mode 100644 index 00000000..444df522 --- /dev/null +++ b/R/fg_pip_old.R @@ -0,0 +1,261 @@ +#' Compute imputed year stats +#' +#' Compute the main PIP poverty and inequality statistics for imputed years. +#' +#' @inheritParams pip +#' @return data.frame +#' @keywords internal +fg_pip_old <- function(country, + year, + povline, + popshare, + welfare_type, + reporting_level, + ppp, + lkup) { + + valid_regions <- lkup$query_controls$region$values + interpolation_list <- lkup$interpolation_list + data_dir <- lkup$data_root + ref_lkup <- lkup$ref_lkup + + if (!is.null(popshare)) povline <- NULL + + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created + # when calling from `pip`. For other functions we create it here. + # if (is.null(con)) { + # cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + # con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = cache_file_path, read_only = TRUE) + # } + # Handle interpolation + metadata <- subset_lkup( + country = country, + year = year, + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = ref_lkup, + valid_regions = valid_regions, + data_dir = data_dir, + povline = povline, + cache_file_path = cache_file_path, + fill_gaps = TRUE + ) + + data_present_in_master <- metadata$data_present_in_master + povline <- metadata$povline + metadata <- metadata$lkup + # Remove aggregate distribution if popshare is specified + # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions + metadata <- filter_lkup(metadata = metadata, + popshare = popshare) + setDT(metadata) + + + # Return empty dataframe if no metadata is found + if (nrow(metadata) == 0) { + return(list(main_data = pipapi::empty_response_fg, + data_in_cache = data_present_in_master)) + } + + unique_survey_files <- unique(metadata$data_interpolation_id) + + # Interpolation list + interpolation_list <- interpolation_list[names(interpolation_list) %in% unique_survey_files] + + # Unique set of survey data to be read + out <- vector(mode = "list", length = length(unique_survey_files)) + + #NEW: iterate over survey files + for (svy_id in seq_along(unique_survey_files)) { + # Extract country-years for which stats will be computed from the same files + # tmp_metadata <- interpolation_list[[unique_survey_files[svy_id]]]$tmp_metadata + iteration <- interpolation_list[[unique_survey_files[svy_id]]] + svy_data <- get_svy_data(svy_id = iteration$cache_ids, + reporting_level = iteration$reporting_level, + path = iteration$paths) + + # Extract unique combinations of country-year + ctry_years <- subset_ctry_years(country = country, + year = year, + lkup = iteration$ctry_years, + valid_regions = valid_regions, + data_dir = data_dir) + + # Join because some data might be coming from cache so it might be absent in + # metadata + ctry_years <- collapse::join(ctry_years, metadata |> + collapse::fselect(intersect(names(ctry_years), + names(metadata))), + verbose = 0, + how = "inner", + overid = 2) + + results_subset <- vector(mode = "list", length = nrow(ctry_years)) + + for (ctry_year_id in 
seq_along(ctry_years$interpolation_id)) {
+      # Extract records to be used for a single country-year estimation
+      interp_id <- ctry_years[["interpolation_id"]][ctry_year_id]
+      tmp_metadata <- metadata[metadata$interpolation_id == interp_id, ]
+
+      report_year <- ctry_years[["reporting_year"]][ctry_year_id]
+
+      # Compute estimated statistics using the fill_gap method
+      tmp_stats <- wbpip:::prod_fg_compute_pip_stats(
+        request_year = report_year,
+        data = svy_data,
+        predicted_request_mean = tmp_metadata[["predicted_mean_ppp"]],
+        svy_mean_lcu = tmp_metadata[["survey_mean_lcu"]],
+        svy_median_lcu = tmp_metadata$survey_median_lcu,
+        svy_median_ppp = tmp_metadata$survey_median_ppp,
+        survey_year = tmp_metadata[["survey_year"]],
+        default_ppp = tmp_metadata[["ppp"]],
+        ppp = ppp,
+        distribution_type = tmp_metadata[["distribution_type"]],
+        poverty_line = povline,
+        popshare = popshare
+      )
+
+      # Handle multiple distribution types (for aggregated distributions)
+      if (length(unique(tmp_metadata$distribution_type)) > 1) {
+        tmp_metadata[, distribution_type := "mixed"]
+      }
+      #
+      # tmp_metadata <- unique(tmp_metadata)
+      # Add stats columns to data frame
+
+      # Convert stats into a data.table
+      ts_DT <- as.data.table(tmp_stats)
+      # Add reporting year to merge
+      ts_DT[, reporting_year := report_year]
+
+      # convert survey_comparability to NA
+      # NOTE: This should not be necessary. For the new lineup distribution,
+      # metadata should come without this variable.
+      tmp_metadata[, survey_comparability := NA]
+
+      # get all vars
+      meta_vars <- setdiff(names(tmp_metadata), "reporting_year")
+      # transform to NA when necessary
+      tmp_metadata[, (meta_vars) := lapply(.SD, \(x) {
+        if (uniqueN(x) == 1) {
+          x
+        } else {
+          NA
+        }}),
+        by = reporting_year, .SDcols = meta_vars]
+
+      # Remove duplicate rows by reporting_year (keep only one row per
+      # reporting_year)
+      tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year")
+
+      # Now join as usual
+
+      ts_md <- join(ts_DT,
+                    tmp_metadata_unique,
+                    on = "reporting_year",
+                    how = "left",
+                    verbose = 0,
+                    overid = 2)
+
+      results_subset[[ctry_year_id]] <- ts_md
+    }
+    out[[svy_id]] <- results_subset
+  }
+  out <- unlist(out, recursive = FALSE)
+  out <- data.table::rbindlist(out)
+
+  # Remove median
+  # out[, median := NULL]
+
+  # Ensure that out does not have duplicates
+  out <- fg_remove_duplicates_old(out)
+
+  # Fix issue with rounding of poverty lines
+  out[,
+      poverty_line := round(poverty_line, digits = 3) ]
+
+  # Formatting. MUST be done in data.table to modify by reference
+  out[, path := as.character(path)]
+
+  if ("max_year" %in% names(out)) {
+    out[, max_year := NULL]
+  }
+
+  return(list(main_data = out, data_in_cache = data_present_in_master))
+}
+
+#' OLD: Remove duplicated rows created during the interpolation process
+#'
+#' @param df data.table: Table of results created in `fg_pip()`
+#' @param cols character: Columns with potential duplicate values
+#'
+#' @return data.table
+#'
+fg_remove_duplicates_old <- function(df,
+                                     cols = c("comparable_spell",
+                                              "cpi",
+                                              "display_cp",
+                                              "gd_type",
+                                              # "interpolation_id",
+                                              "path",
+                                              "predicted_mean_ppp",
+                                              "survey_acronym",
+                                              "survey_comparability",
+                                              "survey_coverage",
+                                              "survey_id",
+                                              "survey_mean_lcu",
+                                              "survey_mean_ppp",
+                                              "survey_median_lcu",
+                                              "survey_median_ppp",
+                                              "survey_time",
+                                              "survey_year",
+                                              "surveyid_year")) {
+  # Modify cache_id
+  # * Ensures that cache_id is unique for both extrapolated and interpolated surveys
+  # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows
+  df$cache_id <- fg_standardize_cache_id_old(cache_id = df$cache_id,
+                                             interpolation_id = df$data_interpolation_id,
+                                             reporting_level = df$reporting_level)
+  # Set collapse vars to NA (by type)
+  df <- fg_assign_nas_values_to_dup_cols_old(df = df,
+                                             cols = cols)
+
+  # Ensure that out does not have duplicates
+  df <- unique(df)
+
+  return(df)
+}
+
+#' OLD: Standardize cache_id format to avoid duplication of rows
+#'
+#' @param cache_id character
+#' @param interpolation_id character
+#' @param reporting_level character
+#'
+#' @return character
+fg_standardize_cache_id_old <- function(cache_id,
+                                        interpolation_id,
+                                        reporting_level) {
+
+  out <- ifelse(grepl("|", interpolation_id, fixed = TRUE),
+                gsub(paste0("_",
+                            unique(reporting_level),
+                            collapse = '|'),
+                     '',
+                     interpolation_id),
+                cache_id)
+  return(out)
+}
+
+#' OLD: Coerce variables causing potential duplicates to NAs
+#'
+#' @inheritParams fg_remove_duplicates_old
+#'
+#' @return data.table
+fg_assign_nas_values_to_dup_cols_old <- function(df,
+                                                 cols) {
+  # Classes are maintained by default.
+  df[, (cols) := NA]
+  return(df)
+}
diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
new file mode 100644
index 00000000..8128b64f
--- /dev/null
+++ b/R/fgt_cumsum.R
@@ -0,0 +1,354 @@
+# MAIN -------------------------------
+
+#' Format loaded survey list for grouped poverty analysis
+#'
+#' Combines a list of survey data.tables into a single data.table and encodes group identifiers using a dictionary.
+#' Returns a data.table and a GRP object for efficient grouped operations, used as a preprocessing step for FGT and population calculations.
+#'
+#' @param lfst Named list of data.tables, as returned by load_list_refy().
+#' @param dict data.table dictionary for id/reporting_level encoding (from build_pair_dict()).
+#' @return List with elements: DT (data.table of all surveys, with id_rl) and g (GRP object for grouping by id_rl).
+#' @keywords internal
+format_lfst <- \(lfst, dict) {
+
+  DT <- rbindlist(lfst, fill = TRUE)
+
+  # Ensure index is stored as integer (is it faster?)
+  if (!is.integer(DT$index)) {
+    DT[, index := as.integer(index)]
+  }
+
+  out <- encode_pairs(DT = DT,
+                      dict = dict,
+                      drop_labels = TRUE)
+
+  ## Grouping ----------
+  g <- GRP(out, ~ id_rl, sort = FALSE)
+
+  list(DT = out,
+       g = g)
+}
+
+
+
+
+#' Compute total population by survey and reporting level
+#'
+#' Sums the weights for each (id, reporting_level) group in the combined survey data.
+#' Used as a denominator for FGT and Watts index calculations. +#' +#' @param LDTg List from format_lfst() with DT and g objects. +#' @return data.table with total population by group (columns: id_rl, W). +#' @keywords internal +get_total_pop <- \(LDTg) { + list2env(LDTg, envir = environment()) + rm(LDTg) + add_vars(g[["groups"]], + get_vars(DT, c("weight")) |> + fsum(g)) |> + setnames(old = "weight", + new = "W") +} + +#' Compute FGT and Watts indices for all groups and poverty lines +#' +#' Calculates headcount, poverty gap, poverty severity, and Watts index for each group and poverty line using cumulative sums. +#' +#' @param LDTg List from format_lfst() with DT and g objects. +#' @param tpop data.table with total population by group (from get_total_pop()). +#' @param povline Numeric vector of poverty lines. +#' @param drop_vars Logical, if TRUE returns only summary columns. +#' @return data.table with FGT and Watts measures by group and poverty line. +#' @keywords internal +fgt_cumsum <- \(LDTg, tpop, povline, + drop_vars = TRUE) { + list2env(LDTg, envir = environment()) + rm(LDTg) + + # Temporal values to be added to the data.table + tz <- pmax(povline, 1e-12) + tz2 <- pmax(povline^2, 1e-16) + tlogz <- log(tz) + + # 1) Compute cutpoint index for each z, using ONLY non-zero rows for welfare + # -> findInterval(povline, welfare) returns values in 0..N (never N+1) + ID <- DT[index > 0L, + { + idx <- findInterval(povline, welfare, left.open = TRUE) + # 2) Attach z, z2, logz in-group (no replication/copies) + data.table(index = idx, + z = tz, + z2 = tz2, + logz = tlogz) + }, + by = id_rl] + + # 3) Minimal cumulative view (shallow column subset; avoids copying DT) + DT_min <- get_vars(DT, + c("id_rl", "index", "cw", "cwy", "cwy2", "cwylog")) + + # 4) join cutpoints to cumulatives (index==0 hits the already-present zero row) + CS <- join( + x = ID, + y = DT_min, + on = c("id_rl","index"), + how = "left", + validate = "m:1", # many cutpoints -> 1 cumulative row + drop.dup.cols = "y", + verbose = 0) |> + # 5) Bring total population W + join(tpop, + on = "id_rl", + how = "left", + validate = "m:1", + drop.dup.cols = "y", + verbose = 0) |> + setorder(id_rl, index) + + + # 6) Compute measures (vectorized). Small clamps for numerical safety. + CS[, `:=`( + headcount = cw / W, + poverty_gap = (z * cw - cwy) / (z * W), + poverty_severity = (z2 * cw - 2 * z * cwy + cwy2) / (z2 * W), + watts = (logz * cw - cwylog) / W + )] + + setnames(CS, "z", "povline") + if (!drop_vars) { + return(CS) + } + get_vars(CS, c("id_rl", "povline", "headcount", + "poverty_gap", "poverty_severity", "watts")) + +} + + +# --- helpers --------------------------------------------------------------- + +# ------------------------------- # +# 1) Build pair dictionary (DT) # +# ------------------------------- # + +#' Build dictionary for id/reporting_level encoding +#' +#' Creates a data.table dictionary for mapping (id, reporting_level) pairs to integer codes for fast joins and decoding. +#' Used for efficient merging and decoding in the FGT pipeline. +#' +#' @param lkup Lookup object containing refy_lkup and svy_lkup. +#' @param fill_gaps Logical, TRUE for lineup years, FALSE for survey years. +#' @return data.table with columns id, reporting_level, and code. 
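+#'
+#' @examples
+#' \dontrun{
+#' # Illustrative sketch with a hypothetical minimal lkup (only the columns
+#' # build_pair_dict() actually uses):
+#' toy <- list(refy_lkup = data.table::data.table(
+#'   country_code    = "AAA",
+#'   reporting_year  = 2019L,
+#'   reporting_level = c("rural", "urban")))
+#' build_pair_dict(toy, fill_gaps = TRUE)
+#' # -> id "AAA_2019", reporting_level rural/urban, integer codes 1:2
+#' }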
+#' @keywords internal +build_pair_dict <- function(lkup, fill_gaps = TRUE) { + + FT <- if (fill_gaps) { + lkup$refy_lkup[, .(country_code, reporting_year, reporting_level)] + } else { + lkup$svy_lkup[, .(country_code, reporting_year, reporting_level)] + } |> + funique() + + FT[, id := paste0(country_code, "_", reporting_year) + ][, c("country_code", "reporting_year") := NULL] + + cols <- c("id", "reporting_level") + dict <- unique(FT[, ..cols]) + + # deterministic code order + setorderv(dict, cols, order = 1L) # radix by default + dict[, code := as.integer(.I)] # fast in DT + setkeyv(dict, cols) # fast key lookups when needed + setindexv(dict, "code") # index on code + dict +} + + +# -------------------------------------------- # +# 2) Encode: add integer code via collapse::join +# -------------------------------------------- # +#' Encode (id, reporting_level) pairs as integer codes +#' +#' Joins a data.table with a dictionary to add an integer code column for each (id, reporting_level) pair. +#' Used for efficient grouping and decoding in the FGT pipeline. +#' +#' @param DT data.table to encode. +#' @param dict data.table from build_pair_dict(). +#' @param id_col Name of id column. +#' @param level_col Name of reporting level column. +#' @param code_col Name of code column to write. +#' @param drop_labels Logical, drop id and level columns if TRUE. +#' @param strict Logical, error if any pairs are missing from dict. +#' @param verbose Integer, verbosity level. +#' @return data.table with code column added. +#' @keywords internal +encode_pairs <- function(DT, dict, + id_col = "id", level_col = "reporting_level", + code_col = "id_rl", + drop_labels = FALSE, + strict = TRUE, + verbose = 0L) { + + stopifnot(is.data.table(DT), is.data.table(dict)) + cols <- c(id_col, level_col) + stopifnot(all(cols %in% names(DT)), all(c(cols, "code") %in% names(dict))) + + + out <- join( + x = DT, + y = dict, + on = cols, + how = "left", + drop.dup.cols = "y", + validate = "m:1", + verbose = verbose + ) + # Ensure it's a data.table (join usually preserves) + if (!is.data.table(out)) setDT(out) + + # Rename 'code' -> code_col if needed + if (code_col != "code" && "code" %in% names(out)) { + setnames(out, "code", code_col) + } + + if (strict) { + if (anyNA(out[[code_col]])) { + nas <- is.na(out[[code_col]]) + miss <- unique(out[nas, ..cols])[1:min(10L, sum(nas))] + cli::cli_abort( + c( + "encode_pairs(): {fsum(nas)} unseen (id, reporting_level) pair(s).", + "Examples:\n{paste(capture.output(print(miss)), collapse = '\n')}" + ) + ) + } + } + + if (drop_labels) out[, (cols) := NULL] + out +} + +# ------------------------------------------------ # +# 3) Decode: join labels by code via collapse::join # +# ------------------------------------------------ # +#' Decode integer code to (id, reporting_level) labels +#' +#' Joins a data.table with a dictionary to recover id and reporting_level columns from integer codes. +#' Used after FGT calculations to restore human-readable labels. +#' +#' @param DT data.table to decode. +#' @param dict data.table from build_pair_dict(). +#' @param code_col Name of code column in DT. +#' @param id_col Name of id column in dict. +#' @param level_col Name of reporting level column in dict. +#' @param keep_code Logical, keep code column if TRUE. +#' @param add_true_vars Logical, add country_code and reporting_year columns and remove id. +#' @param verbose Integer, verbosity level. +#' @return data.table with id and reporting_level columns added. 
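+#' @examples
+#' \dontrun{
+#' # Round-trip sketch, assuming `DT` was encoded with the same `dict`
+#' # through encode_pairs(), so every code maps back to one label pair:
+#' enc <- encode_pairs(DT, dict, drop_labels = TRUE)
+#' dec <- decode_pairs(enc, dict, add_true_vars = TRUE)
+#' # add_true_vars = TRUE parses country_code and reporting_year back out
+#' # of the "CCC_YYYY" id and drops the id column
+#' }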
+#' @keywords internal +decode_pairs <- function(DT, dict, + code_col = "id_rl", + id_col = "id", + level_col = "reporting_level", + keep_code = FALSE, + add_true_vars = TRUE, + verbose = 0L) { + stopifnot(exprs = { + is.data.table(DT) + is.data.table(dict) + }) + stopifnot(exprs = { + code_col %in% names(DT) + all(c("code", id_col, level_col) %in% names(dict)) + }) + + out <- join( + x = DT, + y = dict, + on = setNames("code", code_col), # map DT[code_col] to dict$code + how = "left", + drop.dup.cols = "y", + validate = "m:1", + verbose = verbose + ) |> + qDT() + + if (add_true_vars) { + out[, `:=`( + country_code = gsub("(.+)(_.+)", "\\1", id), + reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id)) + )][, + id := NULL] + } + + if (!keep_code) out[, (code_col) := NULL] + out +} + +# ----------------------------------------------------- # +# 4) Update dict with new pairs (append-only, fast DT) # +# ----------------------------------------------------- # +#' Update dictionary with new (id, reporting_level) pairs +#' +#' Appends new (id, reporting_level) pairs to the dictionary if needed, ensuring all groups are encoded. +#' Used to keep the dictionary in sync with new survey data. +#' +#' @param dict data.table dictionary from build_pair_dict(). +#' @param DT data.table with id and reporting_level columns. +#' @param id_col Name of id column. +#' @param level_col Name of reporting level column. +#' @return Updated data.table dictionary. +#' @keywords internal +update_pair_dict <- function(dict, DT, + id_col = "id", level_col = "reporting_level") { + stopifnot(is.data.table(dict), is.data.table(DT)) + cols <- c(id_col, level_col) + stopifnot(all(c(cols, "code") %in% names(dict)), all(cols %in% names(DT))) + + new_pairs <- fsetdiff(unique(DT[, ..cols]), dict[, ..cols]) + if (nrow(new_pairs)) { + new_pairs[, code := as.integer(max(dict$code) + seq_len(.N))] + setkeyv(new_pairs, cols) + setindexv(new_pairs, "code") + dict <- rbindlist(list(dict, new_pairs), use.names = TRUE) + setkeyv(dict, cols) + setindexv(dict, "code") + } + dict +} + + + +#' Load survey data from file list +#' +#' Reads a list of survey files (e.g., .fst) and returns a named list of data.tables, each with an id column. +#' Used as the first step in the pipeline after creating the file list. +#' +#' @param input_list Character vector of file paths (from create_full_list()). +#' @return Named list of data.tables, each with an id column. 
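+#' @examples
+#' \dontrun{
+#' # Sketch with hypothetical paths: each file name (minus the .fst
+#' # extension) becomes both the list name and the value of the id column.
+#' files <- c("some/dir/AGO_2000.fst", "some/dir/BEN_2003.fst")
+#' lfst <- load_list_refy(files)
+#' names(lfst) # "AGO_2000" "BEN_2003"
+#' }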
+#' @keywords internal
+load_list_refy <- \(input_list) {
+
+  id_names <- input_list |>
+    fs::path_file() |>
+    fs::path_ext_remove()
+
+  seq_flex <- if (interactive()) {
+    cli::cli_progress_along
+  } else {
+    base::seq_along
+  }
+
+  lfst <- lapply(seq_flex(input_list),
+                 \(i) {
+                   x <- input_list[i]
+                   idn <- fs::path_file(x) |>
+                     fs::path_ext_remove()
+                   fst::read_fst(x, as.data.table = TRUE) |>
+                     _[, id := idn]
+                 }) |>
+    setNames(id_names)
+
+  lfst
+}
+
diff --git a/R/get_aux_table.R b/R/get_aux_table.R
index 93604585..5e517666 100644
--- a/R/get_aux_table.R
+++ b/R/get_aux_table.R
@@ -7,7 +7,17 @@
 #' @return data.frame
 #' @export
 #'
-get_aux_table <- function(data_dir, table, long_format = FALSE) {
+get_aux_table <- function(data_dir = NULL,
+                          table, long_format = FALSE) {
+
+  if (is.null(data_dir)) {
+    if (exists("lkup", inherits = TRUE)) {
+      data_dir <- get("lkup", inherits = TRUE)$data_root
+    } else {
+      cli::cli_abort("{.code data_dir} not defined and {.field lkup} not found.")
+    }
+  }
+
   if (long_format && !table %in% get_valid_aux_long_format_tables()) {
     long_format <- FALSE
   }
@@ -35,15 +45,40 @@ get_aux_table <- function(data_dir, table, long_format = FALSE) {
 #' Helper function to the UI
 #' @param data_dir character: Data directory
 #' @param table character: Name of auxiliary table
+#' @param exclude logical: whether to exclude selected countries or regions
+#'   from the returned table. Default is TRUE.
+#' @param lkup list: A list of lkup tables (used to fetch the country list)
 #'
 #' @return data.frame
 #' @export
 #'
-get_aux_table_ui <- function(data_dir, table) {
+get_aux_table_ui <- function(data_dir,
+                             table,
+                             exclude = TRUE,
+                             lkup) {
   out <- get_aux_table(data_dir = data_dir,
                        table = table,
                        long_format = FALSE)
 
+  if (table == "regions") {
+    # TEMP START: remove old aggregations --------------
+    cl <- lkup$aux_files$country_list
+
+    regs <- cl[, .(region_code, africa_split_code)] |>
+      unlist() |> # convert to vector
+      na_omit() |>
+      unique() |>
+      unname() |>
+      c("WLD") # add the world
+    # TEMP END: remove old aggregations --------------
+
+    out <- out[region_code %in% regs]
+
+  } else if (table == "countries" && exclude == TRUE) {
+    # hardcoded
+    to_remove <- c("UKR")
+    out <- out[!(country_code %in% to_remove)]
+  }
+
+  return(out)
 }
diff --git a/R/infer_poverty_line.R b/R/infer_poverty_line.R
new file mode 100644
index 00000000..9b3a53fe
--- /dev/null
+++ b/R/infer_poverty_line.R
@@ -0,0 +1,98 @@
+#' Infer the poverty line for a given population share
+#'
+#' Computes the welfare value (poverty line) corresponding to a given population
+#' share, using either nearest or interpolated weighted quantile methods.
+#' Supports both discrete (nearest) and linear interpolation approaches, and can
+#' optionally average neighbors for ties.
+#'
+#' @param welfare Numeric vector of welfare values (e.g., income or
+#'   consumption).
+#' @param weight Numeric vector of sampling weights (must be non-negative, same
+#'   length as welfare).
+#' @param popshare Numeric vector of population shares (probabilities in [0,1]);
+#'   default is 0.5 (median).
+#' @param include Logical; if TRUE, averages neighbors for ties (only for method
+#'   = "nearest").
+#' @param method Character; either "nearest" (default, discrete quantile) or
+#'   "interp" (weighted linear interpolation).
+#' @param assume_sorted Logical; if TRUE, assumes welfare and weight are already
+#'   sorted by welfare.
+#'
+#' @return Numeric vector of poverty line(s) corresponding to the requested
+#'   population share(s).
+#' @details
+#' - If method = "nearest", returns the welfare value at the closest cumulative weight fraction to each popshare.
+#' - If method = "interp", uses collapse::fquantile for weighted linear interpolation. +#' - If include = TRUE (and method = "nearest"), averages the two closest neighbors using their weights. +#' - Returns numeric(0) if popshare is empty. +#' @keywords internal +infer_poverty_line <- function(welfare, weight, popshare = 0.5, + include = FALSE, + method = c("nearest","interp"), + assume_sorted = TRUE) { + method <- match.arg(method) + + # defenses + if (length(welfare) != length(weight)) cli::cli_abort("welfare and weight must have the same length") + if (anyNA(welfare) || anyNA(weight)) cli::cli_abort("welfare and weight cannot contain NA") + if (any(weight < 0)) cli::cli_abort("weights must be non-negative") + if (!length(popshare)) return(numeric(0)) + + # clamp probs + p <- pmin(pmax(as.numeric(popshare), 0), 1) + + # fast sort (or not) + if (!assume_sorted) { + o <- data.table:::forder(welfare) + y <- welfare[o] + w <- weight[o] + } else { + y <- welfare + w <- weight + o <- seq_along(welfare) + } + + if (method == "interp") { + # collapse::fquantile: weighted linear interpolation + # 'include' is not used here: interpolation doesn't have that discrete toggle + return(fquantile(y, + probs = p, + w = w, + o = o, + names = FALSE)) + } + + # ---- method == "nearest" (matches your function) ---- + # cumulative weight fractions + W <- fsum(w) + if (W <= 0) stop("sum(weight) must be > 0") + cw <- fcumsum(w) + prob <- cw / W + n <- length(y) + + # for each p, find the nearest cumulative location (ties -> lower index) + j <- findInterval(p, prob, left.open = FALSE) # j ∈ {0..n} + j[j < 0L] <- 0L # fit into boundries of vector + j[j > n] <- n + + prev_idx <- pmax.int(j, 1L) # 1..n + next_idx <- pmin.int(j + 1L, n) # 1..n + d_prev <- p - prob[prev_idx] + d_next <- prob[next_idx] - p + use_next <- (d_next < d_prev) & (j < n) + idx <- fifelse(use_next, next_idx, prev_idx) # final index + + if (!include) { + # take the discrete value at the nearest location + return(y[idx]) + } else { + # average the two neighbors using their weights (the original rule in wbpip) + idx2 <- pmin.int(idx + 1L, n) + wi <- w[idx] + wi2 <- w[idx2] + s <- wi + wi2 + num <- wi * y[idx] + wi2 * y[idx2] + out <- fifelse((idx == n) | (s <= 0), y[idx], num / s) + return(out) + } +} diff --git a/R/pip.R b/R/pip.R index dd62cd20..44a4d556 100644 --- a/R/pip.R +++ b/R/pip.R @@ -1,24 +1,13 @@ + + #' Compute PIP statistics #' #' Compute the main PIP poverty and inequality statistics. #' -#' @param country character: Country ISO 3 codes -#' @param year integer: Reporting year -#' @param povline numeric: Poverty line -#' @param popshare numeric: Proportion of the population living below the -#' poverty line -#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate -#' values for missing years -#' @param group_by character: Will return aggregated values for predefined -#' sub-groups -#' @param welfare_type character: Welfare type -#' @param reporting_level character: Geographical reporting level -#' @param ppp numeric: Custom Purchase Power Parity value -#' @param lkup list: A list of lkup tables -#' @param censor logical: Triggers censoring of country/year statistics -#' @param lkup_hash character: hash of pip -#' @param additional_ind logical: If TRUE add new set of indicators. Default if -#' FALSE +#' This function is a wrapper around the [pip_new_lineups] and [pip_old_lineups] +#' functions. 
+#' +#' @inheritParams pip_new_lineups #' #' @return data.table #' @examples @@ -63,219 +52,55 @@ pip <- function(country = "ALL", reporting_level = c("all", "national", "rural", "urban"), ppp = NULL, lkup, - censor = TRUE, + censor = FALSE, lkup_hash = lkup$cache_data_id$hash_pip, additional_ind = FALSE) { - - # set up ------------- - welfare_type <- match.arg(welfare_type) - reporting_level <- match.arg(reporting_level) - group_by <- match.arg(group_by) - povline <- round(povline, digits = 3) - - - - # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED - country <- toupper(country) - if (is.character(year)) { - year <- toupper(year) + # Should pip_old or pip_new be used? + #------------------------------------- + use_new <- lkup$use_new_lineup_version + + # Run correct function + #------------------------------------- + out <- if (use_new) { + pip_new_lineups(country = country, + year = year, + povline = povline, + popshare = popshare, + fill_gaps = fill_gaps, + group_by = group_by, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup, + censor = censor, + lkup_hash = lkup_hash, + additional_ind = additional_ind) + } else { + pip_old_lineups(country = country, + year = year, + povline = povline, + popshare = popshare, + fill_gaps = fill_gaps, + group_by = group_by, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup, + censor = censor, + lkup_hash = lkup_hash, + additional_ind = additional_ind) } - # If svy_lkup is not part of lkup throw an error. - if (!all(c('svy_lkup') %in% names(lkup))) - stop("You are probably passing more than one dataset as lkup argument. - Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") - - - # **** TO BE REMOVED **** REMOVAL STARTS HERE - # Once `pip-grp` has been integrated in ingestion pipeline - # Forces fill_gaps to TRUE when using group_by option - if (group_by != "none") { - fill_gaps <- TRUE - message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.") - } - # **** TO BE REMOVED **** REMOVAL ENDS HERE - - # Countries vector ------------ - lcv <- # List with countries vectors - create_countries_vctr( - country = country, - year = year, - valid_years = lkup$valid_years, - aux_files = lkup$aux_files - ) - # lcv$est_ctrs has all the country_code that we are interested in - - cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") - if (!file.exists(cache_file_path)) { - # Create an empty duckdb file - create_duckdb_file(cache_file_path) - } - # mains estimates --------------- - if (fill_gaps) { - ## lineup years----------------- - out <- fg_pip( - country = lcv$est_ctrs, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, - reporting_level = reporting_level, - ppp = ppp, - lkup = lkup - ) - } else { - ## survey years ------------------ - out <- rg_pip( - country = lcv$est_ctrs, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, - reporting_level = reporting_level, - ppp = ppp, - lkup = lkup - ) - } - - cached_data <- out$data_in_cache - main_data <- out$main_data - - if (nrow(main_data) > 0) { - out <- main_data |> - rowbind(cached_data) - - update_master_file(main_data, cache_file_path, fill_gaps) - - } else { - out <- cached_data - } - if (!data.table::is.data.table(out)) { - setDT(out) - } - # Early return for empty table--------------- - if (nrow(out) == 0) return(pipapi::empty_response) - - # aggregate 
distributions ------------------ - if (reporting_level %in% c("national", "all")) { - out <- add_agg_stats( - df = out, - return_cols = lkup$return_cols$ag_average_poverty_stats - ) - if (reporting_level == "national") { - out <- out[reporting_level == "national"] - } - } - - - - add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup) - - # **** TO BE REMOVED **** REMOVAL STARTS HERE - # Once `pip-grp` has been integrated in ingestion pipeline - # Handles grouped aggregations - if (group_by != "none") { - # Handle potential (insignificant) difference in poverty_line values that - # may mess-up the grouping - out$poverty_line <- povline - - out <- pip_aggregate_by( - df = out, - group_lkup = lkup[["pop_region"]], - return_cols = lkup$return_cols$pip_grp - ) - # Censor regional values - if (censor) { - out <- censor_rows(out, lkup[["censored"]], type = "regions") - } - - out <- out[, c("region_name", - "region_code", - "reporting_year", - "reporting_pop", - "poverty_line", - "headcount", - "poverty_gap", - "poverty_severity", - "watts", - "mean", - "pop_in_poverty")] - - return(out) - } - # **** TO BE REMOVED **** REMOVAL ENDS HERE - - - # pre-computed distributional stats --------------- - crr_names <- names(out) # current variables - names2keep <- lkup$return_cols$pip$cols # all variables - - out <- add_dist_stats( - df = out, - dist_stats = lkup[["dist_stats"]] - ) - - # Add aggregate medians ---------------- - out <- add_agg_medians( - df = out, - fill_gaps = fill_gaps, - data_dir = lkup$data_root - ) - - # format ---------------- - - - if (fill_gaps) { - - ## Inequality indicators to NA for lineup years ---- - dist_vars <- names2keep[!(names2keep %in% crr_names)] - out[, - (dist_vars) := NA_real_] - - ## estimate_var ----- - out <- estimate_type_ctr_lnp(out, lkup) - - } else { - out[, estimate_type := NA_character_] - } - ## Handle survey coverage ------------ - if (reporting_level != "all") { - keep <- out$reporting_level == reporting_level - out <- out[keep, ] - } - - # Censor country values - if (censor) { - out <- censor_rows(out, lkup[["censored"]], type = "countries") - } + # Return + #------------------------------------- + out +} - # Select columns - if (additional_ind) { - get_additional_indicators(out) - added_names <- attr(out, "new_indicators_names") - names2keep <- c(names2keep, added_names) - } - # Keep relevant variables - out <- out[, .SD, .SDcols = names2keep] - # make sure we always report the same precision in all numeric variables - doub_vars <- - names(out)[unlist(lapply(out, is.double))] |> - data.table::copy() - out[, (doub_vars) := lapply(.SD, round, digits = 12), - .SDcols = doub_vars] - # Order rows by country code and reporting year - data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type) - #} - # Make sure no duplicate remains - out <- out |> collapse::funique() - # return ------------- - return(out) -} diff --git a/R/pip_agg.R b/R/pip_agg.R new file mode 100644 index 00000000..0d5485fb --- /dev/null +++ b/R/pip_agg.R @@ -0,0 +1,52 @@ +#' Logic for computing new aggregate +#' +#' @inheritParams pip +#' @return data.table +#' @examples +#' \dontrun{ +#' # Create lkups +#' } +#' @export +pip_agg <- function(country = "ALL", + year = "ALL", + povline = 1.9, + group_by = c("wb", "none"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = FALSE, + lkup_hash = lkup$cache_data_id$hash_pip_grp, + additional_ind = FALSE) { + + # Should pip_old or pip_new be used? 
+ #------------------------------------- + use_new <- lkup$use_new_lineup_version + + # Run correct function + #------------------------------------- + out <- if (use_new) { + pip_grp_new(country = country, + year = year, + povline = povline, + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = lkup, + censor = censor, + additional_ind = additional_ind) + } else { + pip_grp_logic(country = country, + year = year, + povline = povline, + group_by = group_by, + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = lkup, + censor = censor, + lkup_hash = lkup_hash, + additional_ind = additional_ind) + } + + # Return + #------------------------------------- + out +} diff --git a/R/pip_grp.R b/R/pip_grp.R index 8fc68331..df0797c2 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -50,7 +50,7 @@ pip_grp <- function(country = "ALL", reporting_level <- "national" } - out <- fg_pip( + out <- fg_pip_old( country = country, year = year, povline = povline, @@ -86,7 +86,6 @@ pip_grp <- function(country = "ALL", out <- pip_aggregate_by( df = out, - group_lkup = lkup[["pop_region"]], country = country, return_cols = lkup$return_cols$pip_grp ) @@ -208,16 +207,14 @@ pip_aggregate <- function(df, by = NULL, return_cols) { } #' Aggregate by predefined groups -#' @param df data.frame: Response from `fg_pip()` or `rg_pip()`. -#' @param group_lkup data.frame: Group lkup table (pop_region) +#' @param df data.frame: Response from `fg_pip_old()` or `rg_pip()`. #' @param country character: Selected countries / regions #' @param return_cols list: lkup$return_cols$pip_grp object. Controls returned #' columns -#' @noRd +#' @keywords internal pip_aggregate_by <- function(df, - group_lkup, country = "ALL", - return_cols) { + return_cols = NULL) { all_cols <- return_cols$cols weighted_cols <- return_cols$weighted_average_cols @@ -228,79 +225,85 @@ pip_aggregate_by <- function(df, to_keep <- all_cols[!all_cols %in% c("pop_in_poverty", "estimate_type")] - df <- df[, .SD, .SDcols = to_keep] - - group_lkup <- group_lkup[, c("region_code", - "reporting_year", - "reporting_pop")] + # df <- df[, .SD, .SDcols = to_keep] # I think we can ommit this part # Compute stats weighted average by groups - rgn <- df[, lapply(.SD, stats::weighted.mean, - w = reporting_pop, - na.rm = TRUE), - by = .(region_name, - region_code, - reporting_year, - poverty_line), - .SDcols = weighted_cols - ] - - rgn <- group_lkup[rgn, - on = .(region_code, reporting_year), - allow.cartesian = TRUE - ] - + rgn <- df |> + fgroup_by(region_name, + region_code, + reporting_year, + poverty_line) |> + fselect(c(weighted_cols, "reporting_pop")) |> + fmean(w = reporting_pop, stub = FALSE) + + # World aggregation if (any(c("ALL", "WLD") %in% country)) { # Compute world aggregates - wld <- compute_world_aggregates(rgn = rgn, - cols = weighted_cols) + wld <- compute_world_aggregates(rgn = rgn) + } else { + wld <- NULL + } + + # Africas aggregation + if (any(c("ALL", "AFE", "AFW") %in% country)) { + rgn <- df |> + fgroup_by(africa_split, + africa_split_code, + reporting_year, + poverty_line) |> + fselect(c(weighted_cols, "reporting_pop")) |> + fmean(w = reporting_pop, stub = FALSE) |> + fsubset(!is.na(africa_split_code)) |> + frename(africa_split_code = region_code, + africa_split = region_name) |> + rowbind(rgn, fill = TRUE) + } + # Vintage aggregation + if ("ALL" %in% country & "regionpcn_code" %in% names(df)) { + rgn <- df |> + fgroup_by(regionpcn, + regionpcn_code, + reporting_year, + poverty_line) |> + fselect(c(weighted_cols, 
"reporting_pop")) |> + fmean(w = reporting_pop, stub = FALSE) |> + fsubset(!is.na(regionpcn_code)) |> + frename(regionpcn_code = region_code, + regionpcn = region_name) |> + rowbind(rgn, fill = TRUE) + } + + if (length(country) == 1) { if (country == "WLD") { # Return only world aggregate - out <- wld - } else if (country == "ALL") { - # Combine with other regional aggregates - out <- rbind(rgn, wld, fill = TRUE) - } - } else { - # Combine with other regional aggregates - out <- rbind(rgn, wld, fill = TRUE) - # Return selection only - if (!"ALL" %in% country) { - out <- out[region_code %in% country, ] + wld[, pop_in_poverty := round(headcount * reporting_pop, 0)] + return(wld) } } - } else { - # Return only selected regions - out <- rgn - } + # Combine with other regional aggregates + out <- rowbind(rgn, wld, fill = TRUE) + out[, pop_in_poverty := round(headcount * reporting_pop, 0)] - # Compute population living in poverty - out$pop_in_poverty <- round(out$headcount * out$reporting_pop, 0) + if ("ALL" %in% country) { + return(out) + } - return(out) + out[region_code %in% country, ] } -compute_world_aggregates <- function(rgn, cols) { +compute_world_aggregates <- function(rgn, cols = NULL) { # Compute stats # Grouping by poverty line as well since we now have vectorized poverty line values - wld <- rgn[, lapply(.SD, - stats::weighted.mean, - w = reporting_pop, - na.rm = TRUE), - by = .(reporting_year, poverty_line), - .SDcols = cols - ] - # Compute yearly population WLD totals - tmp <- rgn[, .(reporting_pop = sum(reporting_pop)), - by = .(reporting_year, poverty_line)] - - - wld <- wld[tmp, on = .(reporting_year = reporting_year, poverty_line = poverty_line)] - wld[["region_code"]] <- "WLD" - wld[["region_name"]] <- "World" + wld <- rgn |> + fgroup_by(reporting_year, + poverty_line) |> + num_vars() |> + fmean(w = reporting_pop, stub = FALSE) |> + ftransform(region_code = "WLD", + region_name = "World") return(wld) @@ -308,7 +311,7 @@ compute_world_aggregates <- function(rgn, cols) { #' Filter relevant rows for aggregating by predefined groups -#' @param df data.frame: Response from `fg_pip()` +#' @param df data.frame: Response from `fg_pip_old()` #' @noRd filter_for_aggregate_by <- function(df) { # This algorithm is incorrect, but should mostly work as a first iteration @@ -316,10 +319,10 @@ filter_for_aggregate_by <- function(df) { # If nationally representative survey is available, use it # Otherwise, use whatever is available - out <- df[, check := length(reporting_level), - by = c("country_code", "reporting_year", "poverty_line")] - out <- out[out$check == 1 | (out$check > 1 & reporting_level == "national"), ] - - return(out) + df[, check := length(reporting_level), + by = c("country_code", "reporting_year", "poverty_line") + ][ + check == 1 | (check > 1 & reporting_level == "national"), + ] } diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R index 9cc9bc34..cc23d253 100644 --- a/R/pip_grp_logic.R +++ b/R/pip_grp_logic.R @@ -1,6 +1,6 @@ -#' Logic for computing new aggregate +#' Old way to estimate aggregate data +#' @rdname pip_agg #' -#' @inheritParams pip #' @return data.table #' @examples #' \dontrun{ @@ -44,8 +44,7 @@ pip_grp_logic <- function(country = "ALL", create_countries_vctr( country = country, year = year, - valid_years = lkup$valid_years, - aux_files = lkup$aux_files + lkup = lkup ) # use the same names as before to avoid inconsistencies @@ -73,8 +72,8 @@ pip_grp_logic <- function(country = "ALL", ## STEP 3.2: Compute fg_pip for ALL required countries ---- ## This will 
then be re-used in various part of the function ## This is to avoid re-computing and re-loading the same data over and over - fg_pip_master <- fg_pip( - country = c(lcv$md_off_reg, lcv$user_off_reg), + fg_pip_master <- fg_pip_old( + country = c(lcv$md_off_reg, lcv$user_alt_agg, lcv$user_off_reg), year = year, povline = povline, popshare = NULL, @@ -84,11 +83,13 @@ pip_grp_logic <- function(country = "ALL", lkup = lkup ) # For now just rowbinding two dataframes, but we would need to use it more smartly in the future - fg_pip_master <- collapse::rowbind(fg_pip_master) + fg_pip_master <- rowbind(fg_pip_master, + # THis should not be necessary + fill = TRUE) |> + setDT() - if (!data.table::is.data.table(fg_pip_master)) { - setDT(fg_pip_master) - } + fg_pip_master[is.na(mean), + mean := predicted_mean_ppp] add_vars_out_of_pipeline(fg_pip_master, fill_gaps = TRUE, lkup = lkup) @@ -315,7 +316,6 @@ pip_grp_helper <- function(lcv_country, out <- pip_aggregate_by( df = out, - group_lkup = lkup[["pop_region"]], country = country, return_cols = lkup$return_cols$pip_grp ) diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R new file mode 100644 index 00000000..bf45c3e0 --- /dev/null +++ b/R/pip_grp_new.R @@ -0,0 +1,154 @@ +#' New way to estimate Aggregate data +#' @rdname pip_agg +pip_grp_new <- \(country = "ALL", + year = "ALL", + povline = 1.9, + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = TRUE, + additional_ind = FALSE, + lkup_hash = lkup$cache_data_id$hash_pip_grp) { + + welfare_type <- match.arg(welfare_type) + reporting_level <- match.arg(reporting_level) + + # Custom aggregations only supported at the national level + # subgroups aggregations only supported for "all" countries + country <- toupper(country) + year <- toupper(year) + + if (!all(country %in% c("ALL", lkup$query_controls$region$values))) { + country <- "ALL" + } + + # Select countries to estimate poverty + cts <- copy(lkup$aux_files$country_list) + country_code <- if (!"ALL" %in% country) { + get_country_code_subset(dt = cts, country = country) + } else { + "ALL" + } + + out <- fg_pip( + country = country_code, + year = year, + povline = povline, + popshare = NULL, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = NULL, + lkup = lkup) + + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + if (!file.exists(cache_file_path)) { + # Create an empty duckdb file + create_duckdb_file(cache_file_path) + } + out <- treat_cache_and_main(out, + cache_file_path = cache_file_path, + lkup = lkup, fill_gaps = TRUE) + + # return empty dataframe if no metadata is found + if (nrow(out) == 0) { + return(pipapi::empty_response_grp) + } + + # Handles aggregated distributions (like CHN and IND) + if (tolower(reporting_level) %in% c("national", "all")) { + out <- add_agg_stats(out, + return_cols = lkup$return_cols$ag_average_poverty_stats) + } + + add_vars_out_of_pipeline(out, fill_gaps = TRUE, lkup = lkup) + + # Handle potential (insignificant) difference in poverty_line values that + # may mess-up the grouping + # I don't think we need this out$poverty_line already has the correct values additionally, + # since povline is vectorized the below line does not work as expected + #out$poverty_line <- povline + + # Handle aggregations with sub-groups + + out <- pip_aggregate_by( + df = out, + country = country, + return_cols = lkup$return_cols$pip_grp + ) + + out <- estimate_type_var(out,lkup) + + # Censor regional values + if (censor) { + out <- 
censor_rows(out, lkup[["censored"]], type = "regions") + } + + out +} + +#' Subset country_code values based on matches in *_code columns and country_code +#' +#' This function searches all columns in a data.table ending with '_code' (except 'country_code'), +#' as well as 'country_code' itself, and returns a unique character vector of 'country_code' values +#' for rows where any of those columns match a value in the provided 'country' vector. If any value +#' in 'country' is not found in any *_code column or in 'country_code', an error is thrown. The input +#' data.table 'dt' should be 'lkup$aux_files$country_list', which contains country and region codes for subsetting. +#' +#' @param dt A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns. +#' @param country Character vector of country or region codes to match against *_code columns and country_code. +#' +#' @return A unique character vector of country_code values corresponding to matches in any *_code column or country_code. +#' @examples +#' \dontrun{ +#' dt <- lkup$aux_files$country_list +#' get_country_code_subset(dt, c("USA", "EAP")) +#' } +get_country_code_subset <- function(dt, country) { + code_cols <- grep("_code$", names(dt), value = TRUE) + result <- character(0) + matched <- logical(length(country)) + + for (col in code_cols) { + idx <- dt[[col]] %in% country + if (any(idx, na.rm = TRUE)) { + result <- c(result, dt[idx, country_code]) + matched <- matched | country %in% dt[[col]] + } + } + # Also check country_code itself + idx_cc <- dt$country_code %in% country + if (any(idx_cc, na.rm = TRUE)) { + result <- c(result, dt[idx_cc, country_code]) + matched <- matched | country %in% dt$country_code + } + # Error if any country not matched + if (any(!matched)) { + cli::cli_abort( + "The following values in {.arg country} were not found in any *_code column or country_code: + {country[!matched]}") + } + funique(result) +} + +#' List values in each *_code column that match the country vector +#' +#' Returns a named list where each element is the vector of unique values in each *_code column +#' that are present in the provided 'country' vector. +#' +#' @param dt A data.table, typically lkup$aux_files$country_list. +#' @param country Character vector of country or region codes to match against *_code columns. +#' +#' @return A named list of unique values for each *_code column that match 'country'. +#' @examples +#' \dontrun{ +#' dt <- lkup$aux_files$country_list +#' list_code_column_values(dt, c("USA", "EAP")) +#' } +list_code_column_values <- function(dt, country) { + code_cols <- grep("_code$", names(dt), value = TRUE) + lapply(code_cols, \(col) { + dt[get(col) %in% country, ..col] |> + funique() + }) |> + setNames(code_cols) +} diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R new file mode 100644 index 00000000..a7c51b3e --- /dev/null +++ b/R/pip_new_lineups.R @@ -0,0 +1,341 @@ + +#' Compute PIP statistics +#' +#' Compute the main PIP poverty and inequality statistics. 
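+#' This is the new-lineup implementation that [pip] dispatches to when
+#' `lkup$use_new_lineup_version` is TRUE.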
+#' +#' @param country character: Country ISO 3 codes +#' @param year integer: Reporting year +#' @param povline numeric: Poverty line +#' @param popshare numeric: Proportion of the population living below the +#' poverty line +#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate +#' values for missing years +#' @param group_by character: Will return aggregated values for predefined +#' sub-groups +#' @param welfare_type character: Welfare type +#' @param reporting_level character: Geographical reporting level +#' @param ppp numeric: Custom Purchase Power Parity value +#' @param lkup list: A list of lkup tables +#' @param censor logical: Triggers censoring of country/year statistics +#' @param lkup_hash character: hash of pip +#' @param additional_ind logical: If TRUE add new set of indicators. Default if +#' FALSE +#' +#' @return data.table +#' @examples +#' \dontrun{ +#' # Create lkups +#' lkups <- create_lkups("") +#' +#' # A single country and year +#' pip_new_lineups(country = "AGO", +#' year = 2000, +#' povline = 1.9, +#' lkup = lkups) +#' +#' # All years for a single country +#' pip_new_lineups(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' lkup = lkups) +#' +#' # Fill gaps +#' pip_new_lineups(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' fill_gaps = TRUE, +#' lkup = lkups) +#' +#' # Group by regions +#' pip_new_lineups(country = "all", +#' year = "all", +#' povline = 1.9, +#' group_by = "wb", +#' lkup = lkups) +#' } +#' @export +pip_new_lineups <- function(country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE) { + + + # set up ------------- + welfare_type <- match.arg(welfare_type) + reporting_level <- match.arg(reporting_level) + group_by <- match.arg(group_by) + povline <- round(povline, digits = 3) + + + # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED + country <- toupper(country) + if (is.character(year)) { + year <- toupper(year) + } + + # If svy_lkup is not part of lkup throw an error. + if (!all(c('svy_lkup') %in% names(lkup))) + stop("You are probably passing more than one dataset as lkup argument. 
+ Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") + + + # **** TO BE REMOVED **** REMOVAL STARTS HERE + # Once `pip-grp` has been integrated in ingestion pipeline + # Forces fill_gaps to TRUE when using group_by option + if (group_by != "none") { + fill_gaps <- TRUE + message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.") + } + # **** TO BE REMOVED **** REMOVAL ENDS HERE + + # Countries vector ------------ + validate_country_codes(country = country, lkup = lkup) + + + + # lcv$est_ctrs has all the country_code that we are interested in + + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + if (!file.exists(cache_file_path)) { + # Create an empty duckdb file + create_duckdb_file(cache_file_path) + } + # mains estimates --------------- + if (fill_gaps) { + ## lineup years----------------- + out <- fg_pip( + country = country, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup + ) + } else { + ## survey years ------------------ + out <- rg_pip( + country = country, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup + ) + } + + # Cache new data + #--------------------------------------------- + out <- treat_cache_and_main(out, + cache_file_path = cache_file_path, + lkup = lkup, fill_gaps = fill_gaps) + + # Early return for empty table--------------- + if (nrow(out) == 0) return(pipapi::empty_response) + + # aggregate distributions ------------------ + if (reporting_level %in% c("national", "all")) { + out <- add_agg_stats( + df = out, + return_cols = lkup$return_cols$ag_average_poverty_stats) + if (reporting_level == "national") { + out <- out[reporting_level == "national"] + } + } + + # Add out of pipeline variable + #--------------------------------------------- + add_vars_out_of_pipeline(out, + fill_gaps = fill_gaps, + lkup = lkup) + + # **** TO BE REMOVED **** REMOVAL STARTS HERE + # Once `pip-grp` has been integrated in ingestion pipeline + # Handles grouped aggregations + if (group_by != "none") { + # Handle potential (insignificant) difference in poverty_line values that + # may mess-up the grouping + out$poverty_line <- povline + + out <- pip_aggregate_by( + df = out, + return_cols = lkup$return_cols$pip_grp) + + # Censor regional values + if (censor) { + out <- censor_rows(out, + lkup[["censored"]], + type = "regions") + } + + out <- out[, c("region_name", + "region_code", + "reporting_year", + "reporting_pop", + "poverty_line", + "headcount", + "poverty_gap", + "poverty_severity", + "watts", + "mean", + "pop_in_poverty")] + + return(out) + } + # **** TO BE REMOVED **** REMOVAL ENDS HERE + + + # pre-computed distributional stats --------------- + crr_names <- names(out) # current variables + names2keep <- lkup$return_cols$pip$cols # all variables + + out <- add_dist_stats( + df = out, + lkup = lkup, + fill_gaps = fill_gaps) + + # Add aggregate medians ---------------- + out <- add_agg_medians( + df = out, + fill_gaps = fill_gaps, + data_dir = lkup$data_root + ) + + # format ---------------- + + + if (fill_gaps) { + # ZP temp NA lineups: + #--------------------- + # ## Inequality indicators to NA for lineup years ---- + dist_vars <- names2keep[!(names2keep %in% crr_names)] + out[, + (dist_vars) := NA_real_] + + ## estimate_var ----- + out <- estimate_type_ctr_lnp(out, lkup) + + } else { + 
out[, estimate_type := NA_character_] + } + + ## Handle survey coverage ------------ + if (reporting_level != "all") { + keep <- out$reporting_level == reporting_level + out <- out[keep, ] + } + + # Censor country values + if (censor) { + out <- censor_rows(out, lkup[["censored"]], type = "countries") + } + + + # Select columns + if (additional_ind) { + get_additional_indicators(out) + added_names <- attr(out, "new_indicators_names") + names2keep <- c(names2keep, added_names) + + } + # Keep relevant variables + out <- out[, .SD, .SDcols = names2keep] + + + # make sure we always report the same precision in all numeric variables + doub_vars <- + names(out)[unlist(lapply(out, is.double))] |> + data.table::copy() + + out[, (doub_vars) := lapply(.SD, round, digits = 12), + .SDcols = doub_vars] + + # Order rows by country code and reporting year + data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type) + #} + + # Make sure no duplicate remains + out <- out |> collapse::funique() + # return ------------- + return(out) +} + + + +treat_cache_and_main <- \(out, cache_file_path, + lkup, fill_gaps) { + + # early return of cache data if not available. + cached_data <- + if (is.null(out$data_in_cache)) { + NULL + } else if (is.data.frame(out$data_in_cache)) { + + if (fnrow(out$data_in_cache) == 0) { + NULL + } else { + ft <- qDT(out$data_in_cache) + if (fill_gaps) { + ft <- + fg_remove_duplicates(ft, + use_new_lineup_version = lkup$use_new_lineup_version) + + } + + # Add just mean and median + get_mean_median(ft, lkup, fill_gaps = fill_gaps) + + } + } else { + cli::cli_abort( + "{.code out$data_in_cache} must be NULL or data.frame not + {.field {class(out$data_in_cache)}}" + ) + } + + main_data <- qDT(out$main_data) + + if (nrow(main_data) > 0) { + if (is.null(cached_data)) { + out <- copy(main_data) + } else { + out <- main_data |> + rowbind(cached_data) + } + + update_master_file(main_data, cache_file_path, fill_gaps) + rm(main_data) + + } else { + out <- cached_data + } + + + setDT(out) +} + + + +validate_country_codes <- \(country, lkup) { + cls <- lkup$aux_files$country_list$country_code |> + unique() |> + c("ALL") + + if (any(!country %in% cls)) { + wcls <- which(!country %in% cls) + cli::cli_abort("{.field {country[wcls]}} {?is/are} not {?a/} valid country code{?s}") + } + invisible(TRUE) +} diff --git a/R/pip_old.R b/R/pip_old.R index a6ed2c2b..e0194c20 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -27,26 +27,26 @@ #' lkups <- create_lkups("") #' #' # A single country and year -#' pip(country = "AGO", +#' pip_old(country = "AGO", #' year = 2000, #' povline = 1.9, #' lkup = lkups) #' #' # All years for a single country -#' pip(country = "AGO", +#' pip_old(country = "AGO", #' year = "all", #' povline = 1.9, #' lkup = lkups) #' #' # Fill gaps -#' pip(country = "AGO", +#' pip_old(country = "AGO", #' year = "all", #' povline = 1.9, #' fill_gaps = TRUE, #' lkup = lkups) #' #' # Group by regions -#' pip(country = "all", +#' pip_old(country = "all", #' year = "all", #' povline = 1.9, #' group_by = "wb", @@ -55,18 +55,18 @@ #' @export #' pip_old <- function(country = "ALL", - year = "ALL", - povline = 1.9, - popshare = NULL, - fill_gaps = FALSE, - group_by = c("none", "wb"), - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national", "rural", "urban"), - ppp = NULL, - lkup, - censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip, - additional_ind = FALSE) { + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + 
group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE) { # set up ------------- @@ -100,8 +100,7 @@ pip_old <- function(country = "ALL", create_countries_vctr( country = country, year = year, - valid_years = lkup$valid_years, - aux_files = lkup$aux_files + lkup = lkup ) # lcv$est_ctrs has all the country_code that we are interested in @@ -113,7 +112,7 @@ pip_old <- function(country = "ALL", # mains estimates --------------- if (fill_gaps) { ## lineup years----------------- - out <- fg_pip( + out <- fg_pip_old( country = lcv$est_ctrs, year = year, povline = povline, @@ -142,12 +141,11 @@ pip_old <- function(country = "ALL", if (nrow(main_data) > 0) { out <- main_data |> - collapse::fmutate(path = as.character(path)) |> - collapse::rowbind(cached_data) + fmutate(path = as.character(path)) |> + rowbind(cached_data) # cached_data is NULL when we are querying live data in which case we don't update cache # This will be used only for development purpose and we don't have any intention to use it in production. - if(!is.null(cached_data)) { - # Update cache with data + if (!is.null(cached_data)) { update_master_file(main_data, cache_file_path, fill_gaps) } } else { @@ -184,7 +182,6 @@ pip_old <- function(country = "ALL", out <- pip_aggregate_by( df = out, - group_lkup = lkup[["pop_region"]], return_cols = lkup$return_cols$pip_grp ) # Censor regional values @@ -213,7 +210,7 @@ pip_old <- function(country = "ALL", crr_names <- names(out) # current variables names2keep <- lkup$return_cols$pip$cols # all variables - out <- add_dist_stats( + out <- add_dist_stats_old( df = out, dist_stats = lkup[["dist_stats"]] ) @@ -304,6 +301,9 @@ rg_pip_old <- function(country, valid_regions <- lkup$query_controls$region$values svy_lkup <- lkup$svy_lkup data_dir <- lkup$data_root + # povline is set to NULL if popshare is given + if (!is.null(popshare)) povline <- NULL + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R new file mode 100644 index 00000000..1bb86dea --- /dev/null +++ b/R/pip_old_lineups.R @@ -0,0 +1,279 @@ +#' Compute PIP statistics - Old lineups function +#' +#' Compute the main PIP poverty and inequality statistics. +#' +#' @param country character: Country ISO 3 codes +#' @param year integer: Reporting year +#' @param povline numeric: Poverty line +#' @param popshare numeric: Proportion of the population living below the +#' poverty line +#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate +#' values for missing years +#' @param group_by character: Will return aggregated values for predefined +#' sub-groups +#' @param welfare_type character: Welfare type +#' @param reporting_level character: Geographical reporting level +#' @param ppp numeric: Custom Purchase Power Parity value +#' @param lkup list: A list of lkup tables +#' @param censor logical: Triggers censoring of country/year statistics +#' @param lkup_hash character: hash of pip +#' @param additional_ind logical: If TRUE add new set of indicators. 
Default if +#' FALSE +#' +#' @return data.table +#' @examples +#' \dontrun{ +#' # Create lkups +#' lkups <- create_lkups("") +#' +#' # A single country and year +#' pip_old_lineups(country = "AGO", +#' year = 2000, +#' povline = 1.9, +#' lkup = lkups) +#' +#' # All years for a single country +#' pip_old_lineups(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' lkup = lkups) +#' +#' # Fill gaps +#' pip_old_lineups(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' fill_gaps = TRUE, +#' lkup = lkups) +#' +#' # Group by regions +#' pip_old_lineups(country = "all", +#' year = "all", +#' povline = 1.9, +#' group_by = "wb", +#' lkup = lkups) +#' } +#' @export +pip_old_lineups <- function(country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE) { + + + # set up ------------- + welfare_type <- match.arg(welfare_type) + reporting_level <- match.arg(reporting_level) + group_by <- match.arg(group_by) + povline <- round(povline, digits = 3) + + + + # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED + country <- toupper(country) + if (is.character(year)) { + year <- toupper(year) + } + + # If svy_lkup is not part of lkup throw an error. + if (!all(c('svy_lkup') %in% names(lkup))) + stop("You are probably passing more than one dataset as lkup argument. + Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") + + + # **** TO BE REMOVED **** REMOVAL STARTS HERE + # Once `pip-grp` has been integrated in ingestion pipeline + # Forces fill_gaps to TRUE when using group_by option + if (group_by != "none") { + fill_gaps <- TRUE + message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.") + } + # **** TO BE REMOVED **** REMOVAL ENDS HERE + + # Countries vector ------------ + lcv <- # List with countries vectors + create_countries_vctr( + country = country, + year = year, + lkup = lkup + ) + # lcv$est_ctrs has all the country_code that we are interested in + + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + if (!file.exists(cache_file_path)) { + # Create an empty duckdb file + create_duckdb_file(cache_file_path) + } + # mains estimates --------------- + if (fill_gaps) { + ## lineup years----------------- + out <- fg_pip_old( + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup + ) + } else { + ## survey years ------------------ + out <- rg_pip_old( + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup + ) + } + + cached_data <- out$data_in_cache + main_data <- out$main_data + + if (nrow(main_data) > 0) { + out <- main_data |> + rowbind(cached_data) + + update_master_file(main_data, cache_file_path, fill_gaps) + + } else { + out <- cached_data + } + if (!data.table::is.data.table(out)) { + setDT(out) + } + # Early return for empty table--------------- + if (nrow(out) == 0) return(pipapi::empty_response) + + # aggregate distributions ------------------ + if (reporting_level %in% c("national", "all")) { + out <- add_agg_stats( + df = out, + return_cols = 
lkup$return_cols$ag_average_poverty_stats + ) + if (reporting_level == "national") { + out <- out[reporting_level == "national"] + } + } + + + + add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup) + + # **** TO BE REMOVED **** REMOVAL STARTS HERE + # Once `pip-grp` has been integrated in ingestion pipeline + # Handles grouped aggregations + if (group_by != "none") { + # Handle potential (insignificant) difference in poverty_line values that + # may mess-up the grouping + out$poverty_line <- povline + + out <- pip_aggregate_by( + df = out, + return_cols = lkup$return_cols$pip_grp + ) + # Censor regional values + if (censor) { + out <- censor_rows(out, lkup[["censored"]], type = "regions") + } + + out <- out[, c("region_name", + "region_code", + "reporting_year", + "reporting_pop", + "poverty_line", + "headcount", + "poverty_gap", + "poverty_severity", + "watts", + "mean", + "pop_in_poverty")] + + return(out) + } + # **** TO BE REMOVED **** REMOVAL ENDS HERE + + + # pre-computed distributional stats --------------- + crr_names <- names(out) # current variables + names2keep <- lkup$return_cols$pip$cols # all variables + + out <- add_dist_stats_old( + df = out, + dist_stats = lkup[["dist_stats"]] + ) + + # Add aggregate medians ---------------- + out <- add_agg_medians( + df = out, + fill_gaps = fill_gaps, + data_dir = lkup$data_root + ) + + # format ---------------- + + + if (fill_gaps) { + + ## Inequality indicators to NA for lineup years ---- + dist_vars <- names2keep[!(names2keep %in% crr_names)] + out[, + (dist_vars) := NA_real_] + + ## estimate_var ----- + out <- estimate_type_ctr_lnp(out, lkup) + + } else { + out[, estimate_type := NA_character_] + } + ## Handle survey coverage ------------ + if (reporting_level != "all") { + keep <- out$reporting_level == reporting_level + out <- out[keep, ] + } + + # Censor country values + if (censor) { + out <- censor_rows(out, lkup[["censored"]], type = "countries") + } + + + # Select columns + if (additional_ind) { + get_additional_indicators(out) + added_names <- attr(out, "new_indicators_names") + names2keep <- c(names2keep, added_names) + + } + # Keep relevant variables + out <- out[, .SD, .SDcols = names2keep] + + + # make sure we always report the same precision in all numeric variables + doub_vars <- + names(out)[unlist(lapply(out, is.double))] |> + data.table::copy() + + out[, (doub_vars) := lapply(.SD, round, digits = 12), + .SDcols = doub_vars] + + # Order rows by country code and reporting year + data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type) + #} + + # Make sure no duplicate remains + out <- out |> collapse::funique() + # return ------------- + return(out) +} diff --git a/R/pipapi-env.R b/R/pipapi-env.R new file mode 100644 index 00000000..630b277f --- /dev/null +++ b/R/pipapi-env.R @@ -0,0 +1,42 @@ +# Getter function: Returns the entire .pipapienv environment +#' Get the entire .pipapienv environment +#' +#' @return The .pipapienv environment +#' @export +#' +#' @examples +#' env <- get_pipapienv() +get_pipapienv <- function() { + .pipapienv +} + +# Getter for a specific key from .pipapienv +#' Get a value from .pipapienv +#' +#' @param key A character string representing the key +#' +#' @return The value associated with the key in .pipapienv +#' @export +#' +#' @examples +#' set_in_pipapienv("example_key", 42) +#' get_from_pipapienv("example_key") # returns 42 +get_from_pipapienv <- function(key) { + rlang::env_get(.pipapienv, key, default = NULL) # Returns NULL if key doesn't 
exist +} + +# Setter function: Assign a value in .pipapienv +#' Set a value in .pipapienv +#' +#' @param key A character string representing the key +#' @param value The value to store in .pipapienv +#' +#' @return The assigned value (invisibly) +#' @export +#' +#' @examples +#' set_in_pipapienv("example_key", 42) +set_in_pipapienv <- function(key, value) { + rlang::env_poke(.pipapienv, key, value) + invisible(value) # Return value invisibly to avoid clutter in console +} diff --git a/R/pipapi-package.R b/R/pipapi-package.R index 7a52927d..c45284df 100644 --- a/R/pipapi-package.R +++ b/R/pipapi-package.R @@ -59,6 +59,17 @@ NULL #' @format Data frame with 0 rows and 53 columns NULL +#' Dataframe for fill gaps empty response with additional variables +#' +#' @docType data +#' @keywords datasets +#' @name empty_response_fg_add +#' @usage data(empty_response_fg_add) +#' @format Data frame with 0 rows and 56 columns +NULL + + + utils::globalVariables( c( ".", diff --git a/R/rg_pip.R b/R/rg_pip.R index 5ff8552b..a1cce53c 100644 --- a/R/rg_pip.R +++ b/R/rg_pip.R @@ -17,6 +17,9 @@ rg_pip <- function(country, valid_regions <- lkup$query_controls$region$values svy_lkup <- lkup$svy_lkup data_dir <- lkup$data_root + # povline is set to NULL if popshare is given + if (!is.null(popshare)) povline <- NULL + if (is.list(povline)) povline <- unlist(povline) cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") @@ -30,7 +33,8 @@ rg_pip <- function(country, data_dir = data_dir, povline = povline, cache_file_path = cache_file_path, - fill_gaps = FALSE + fill_gaps = FALSE, + popshare = popshare ) data_present_in_master <- metadata$data_present_in_master @@ -51,16 +55,33 @@ rg_pip <- function(country, # load data lt <- load_data_list(metadata) + # Calculate and update poverty line if popshare is passed + if (!is.null(popshare)) { + povline <- lapply(lt, \(x) { + # wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare) + infer_poverty_line(welfare = x$welfare, + weight = x$weight, + popshare = popshare, + include = FALSE, + method = "nearest", + assume_sorted = TRUE) + }) + } - # parallelization - # res <- get_pov_estimates(lt, povline = povline) - - # Regular lapply - res <- lapply(lt, process_dt, povline = povline) + # if popshare is not null, povline will be list + if (is.list(povline)) { + # If povline is list, it comes from infer_poverty_line when popshare + # is not null. Then, we have one set of lines per survey. 
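+    # e.g. with two surveys in `lt` and popshare = 0.5, `povline` is a
+    # two-element list (one vector of inferred lines per survey), so Map()
+    # applies each element to its own survey table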
+ res <- Map(process_dt, lt, povline) + } else { + # if povline is vector, it should be applied to all surveys in lt + res <- lapply(lt, process_dt, povline = povline) + } + rm(lt) + res <- rbindlist(res, fill = TRUE) - # clean data metadata[, file := basename(path)] @@ -85,116 +106,3 @@ rg_pip <- function(country, -# Efficient FGT calculation for a data.table and vector of poverty lines -#' Title -#' -#' @param dt data frame with `welfare` and `weight` columns -#' @param welfare character: welfare variable name -#' @param weight character: weight variable name -#' @param povlines double: vector with poveryt lines -#' -#' @return data.table with estimates poverty estimates -#' @keywords internal -compute_fgt_dt <- function(dt, welfare, weight, povlines) { - w <- dt[[welfare]] - wt <- dt[[weight]] - n <- length(w) - m <- length(povlines) - - # Pre-allocate result matrix - res <- matrix(NA_real_, nrow = m, ncol = 3) - colnames(res) <- c("FGT0", "FGT1", "FGT2") - watts_vec <- numeric(m) - - # Precompute log(w) for efficiency - logw <- rep(NA_real_, n) - pos <- w > 0 - logw[pos] <- log(w[pos]) - - for (i in seq_along(povlines)) { - pov <- povlines[i] - poor <- w < pov - rel_dist <- 1 - (w / pov) - rel_dist[!poor] <- 0 - res[i, 1] <- fmean(poor, w = wt) # FGT0 - res[i, 2] <- fmean(rel_dist, w = wt) # FGT1 - res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2 - - # Optimized Watts index calculation - keep <- poor & pos - if (any(keep, na.rm = TRUE)) { - watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt) - } else { - watts_vec[i] <- 0 - } - } - data.table( - povline = povlines, - headcount = res[, 1], - poverty_gap = res[, 2], - poverty_severity = res[, 3], - watts = watts_vec - ) -} - - -process_dt <- function(dt, povline) { - dt[, compute_fgt_dt(.SD, "welfare", "weight", povline), - by = .(file, reporting_level)] -} - -#' load survey year files and store them in a list -#' -#' @param metadata data frame from `subset_lkup()` -#' -#' @return list with survey years data -#' @keywords internal -load_data_list <- \(metadata) { - - # unique values - mdout <- metadata[, lapply(.SD, list), by = path] - upaths <- mdout$path - urep_level <- mdout$reporting_level - uppp <- mdout$ppp - ucpi <- mdout$cpi - - seq_along(upaths) |> - lapply(\(f) { - path <- upaths[f] - rep_level <- urep_level[f][[1]] - ppp <- uppp[f][[1]] - cpi <- ucpi[f][[1]] - - # Build a data.table to merge cpi and ppp - fdt <- data.table(reporting_level = as.character(rep_level), - ppp = ppp, - cpi = cpi) - - # load data and format - dt <- fst::read_fst(path, as.data.table = TRUE) - - if (length(rep_level) == 1) { - if (rep_level == "national") dt[, area := "national"] - } - setnames(dt, "area", "reporting_level") - dt[, - `:=`( - file = basename(path), - reporting_level = as.character(reporting_level) - ) - ] - - dt <- join(dt, fdt, - on = "reporting_level", - validate = "m:1", - how = "left", - verbose = 0) - - dt[, welfare := welfare/(cpi * ppp) - ][, - c("cpi", "ppp") := NULL] - - }) - -} - diff --git a/R/rg_pip_old.R b/R/rg_pip_old.R new file mode 100644 index 00000000..da17b015 --- /dev/null +++ b/R/rg_pip_old.R @@ -0,0 +1,87 @@ +#' OLD: Compute survey year stats +#' +#' Compute the main PIP poverty and inequality statistics for survey years. 
+#' +#' @inheritParams pip +#' @return data.frame +#' @keywords internal +rg_pip_old <- function(country, + year, + povline, + popshare, + welfare_type, + reporting_level, + ppp, + lkup) { + # get values from lkup + valid_regions <- lkup$query_controls$region$values + svy_lkup <- lkup$svy_lkup + data_dir <- lkup$data_root + + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + + metadata <- subset_lkup( + country = country, + year = year, + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = svy_lkup, + valid_regions = valid_regions, + data_dir = data_dir, + povline = povline, + cache_file_path = cache_file_path, + fill_gaps = FALSE + ) + + data_present_in_master <- metadata$data_present_in_master + povline <- metadata$povline + metadata <- metadata$lkup + + + # Remove aggregate distribution if popshare is specified + # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions + metadata <- filter_lkup(metadata = metadata, + popshare = popshare) + + # return empty dataframe if no metadata is found + if (nrow(metadata) == 0) { + return(list(main_data = pipapi::empty_response, + data_in_cache = data_present_in_master)) + } + + # load data + lt <- load_data_list_old(metadata) + + # parallelization + # res <- get_pov_estimates(lt, povline = povline) + + # Regular lapply + res <- lapply(lt, process_dt_old, povline = povline) + + res <- rbindlist(res, fill = TRUE) + + + # clean data + metadata[, file := basename(path)] + + out <- join(res, + metadata, + on = c("file", "reporting_level"), + how = "full", + validate = "m:1", + verbose = 0) + + out[, `:=`( + mean = survey_mean_ppp, + median = survey_median_ppp, + file = NULL + )] + + setnames(out, "povline", "poverty_line") + + + return(list(main_data = out, data_in_cache = data_present_in_master)) +} + + + diff --git a/R/ui_country_profile.R b/R/ui_country_profile.R index ad64f538..75fa6102 100644 --- a/R/ui_country_profile.R +++ b/R/ui_country_profile.R @@ -52,14 +52,6 @@ ui_cp_ki_headcount <- function(country, reporting_level = "all", lkup = lkup) - # Select max year and country - # res <- - # res_all[res_all[, - # .I[which.max(reporting_year)], - # by = country_code]$V1 - # ][ - # country_code == country - # ] res <- res_all[country_code == country ][, @@ -83,8 +75,10 @@ ui_cp_ki_headcount <- function(country, ### TEMP FIX END out <- data.table::data.table( - country_code = country, reporting_year = res$reporting_year, - poverty_line = povline, headcount = res$headcount + country_code = country, + reporting_year = res$reporting_year, + poverty_line = povline, + headcount = res$headcount ) return(out) } @@ -124,8 +118,16 @@ ui_cp_charts <- function(country = "AGO", x[country_code == country] }) - dl <- list(append(dl, dl2)) - names(dl) <- country + # Add prosperity gap + pg <- get_aux_table(data_dir = lkup$data_root, + "pg_svy") + + pg <- pg[country_code == country] + dl2[["pg"]] <- pg + + + dl <- list(append(dl, dl2)) |> + setNames(country) return(dl) } @@ -317,7 +319,7 @@ ui_cp_download <- function(country = "AGO", hc <- lapply(country, \(.) 
{ ui_cp_ki_headcount(., year, povline, lkup) }) |> - data.table::rbindlist(use.names = TRUE) + rbindlist(use.names = TRUE) df <- lkup[["cp_lkups"]]$flat$flat_cp df <- df[country_code %chin% country] diff --git a/R/ui_home_page.R b/R/ui_home_page.R index 04105c59..0cc201a6 100644 --- a/R/ui_home_page.R +++ b/R/ui_home_page.R @@ -15,20 +15,43 @@ ui_hp_stacked <- function(povline = 1.9, ref_years <- sort(unique(lkup$ref_lkup$reporting_year)) ref_years <- ref_years[!ref_years %in% c(1981:1989)] - out <- pip_grp( - country = "all", - year = ref_years, - povline = povline, - group_by = "wb", - reporting_level = "national", - censor = FALSE, - lkup = lkup - ) + use_new <- lkup$use_new_lineup_version + + # Run correct function + #------------------------------------- + out <- if (use_new) { + x <- pip_grp_new(country = "ALL", + year = ref_years, + povline = povline, + welfare_type = "all", + reporting_level = "national", + lkup = lkup, + censor = FALSE) + + regs <- lkup$aux_files$country_list[, region_code] |> + funique() |> + c("WLD") + + x |> + fsubset(region_code %in% regs) + + } else { + pip_grp( + country = "all", + year = ref_years, + povline = povline, + group_by = "wb", + reporting_level = "national", + censor = FALSE, + lkup = lkup + ) + + } + + out <- get_vars(out, + c("region_code", "reporting_year", + "poverty_line", "pop_in_poverty")) - out <- out[, c( - "region_code", "reporting_year", - "poverty_line", "pop_in_poverty" - )] return(out) } @@ -45,7 +68,9 @@ ui_hp_stacked <- function(povline = 1.9, ui_hp_countries <- function(country = c("IDN", "CIV"), povline = 1.9, pop_units = 1e6, - lkup) { + lkup, + lkup_hash = lkup$cache_data_id$hash_pip + ) { out <- pip( country = country, year = "all", @@ -57,13 +82,16 @@ ui_hp_countries <- function(country = c("IDN", "CIV"), ) # Add pop_in_poverty and scale according to pop_units - out$pop_in_poverty <- out$reporting_pop * out$headcount / pop_units - out$reporting_pop <- out$reporting_pop / pop_units + out[, + `:=`( + pop_in_poverty = reporting_pop * headcount / pop_units, + reporting_pop = reporting_pop / pop_units + )] - out <- out[, c( + out <- get_vars(out, c( "region_code", "country_code", "reporting_year", "poverty_line", "reporting_pop", "pop_in_poverty" - )] + )) return(out) } diff --git a/R/ui_poverty_indicators.R b/R/ui_poverty_indicators.R index 2588da6b..4821aa7a 100644 --- a/R/ui_poverty_indicators.R +++ b/R/ui_poverty_indicators.R @@ -14,7 +14,10 @@ ui_pc_charts <- function(country = c("AGO"), welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national", "rural", "urban"), pop_units = 1e6, - lkup) { + censor = TRUE, + lkup, + lkup_hash = lkup$cache_data_id$hash_pip_grp + ) { # Set returned columns return_cols <- lkup$return_cols$ui_pc_charts$cols inequality_indicators <- lkup$return_cols$ui_pc_charts$inequality_indicators @@ -30,12 +33,17 @@ ui_pc_charts <- function(country = c("AGO"), fill_gaps = fill_gaps, group_by = group_by, reporting_level = reporting_level, - lkup = lkup + lkup = lkup, + censor = censor ) # Add pop_in_poverty and scale according to pop_units - out$pop_in_poverty <- out$reporting_pop * out$headcount / pop_units - out$reporting_pop <- out$reporting_pop / pop_units + out[, + `:=`( + pop_in_poverty = reporting_pop * headcount / pop_units, + reporting_pop = reporting_pop / pop_units + )] + # handle different responses when fill_gaps = TRUE / FALSE # Return all columns when survey years are requested @@ -45,7 +53,7 @@ ui_pc_charts <- function(country = c("AGO"), out <- out[, .SD, .SDcols 
= return_cols]
   } else {
-    out <- out[, .SD, .SDcols = return_cols]
+    out <- get_vars(out, return_cols)
     # Set non-interpolated variables to NA if line-up years are requested
     out[, (inequality_indicators) := NA]
     out[, survey_comparability := NA] # remove manually survey_comparability
@@ -67,7 +75,9 @@ ui_pc_regional <- function(country = "ALL",
                            year = "ALL",
                            povline = 1.9,
                            pop_units = 1e6,
-                           lkup) {
+                           lkup,
+                           lkup_hash = lkup$cache_data_id$hash_pip_grp
+                           ) {
 
   # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED
   country <- toupper(country)
@@ -75,19 +85,33 @@ ui_pc_regional <- function(country = "ALL",
     year <- toupper(year)
   }
 
-  out <- pip_grp_logic(country = country,
-                       year = year,
-                       group_by = "wb",
-                       reporting_level = "national",
-                       povline = povline,
-                       lkup = lkup,
-                       censor = TRUE)
+  out <- pip_agg(country = country,
+                 year = year,
+                 group_by = "wb",
+                 reporting_level = "national",
+                 povline = povline,
+                 lkup = lkup,
+                 censor = TRUE)
 
   # Add pop_in_poverty and scale according to pop_units
-  out$pop_in_poverty <- out$reporting_pop * out$headcount / pop_units
-  out$reporting_pop <- out$reporting_pop / pop_units
+  out[,
+      `:=`(
+        pop_in_poverty = reporting_pop * headcount / pop_units,
+        reporting_pop = reporting_pop / pop_units
+      )]
+
+  # TEMP START: remove old aggregations --------------
+  cl <- lkup$aux_files$country_list
+
+  regs <- cl[, .(region_code, africa_split_code)] |>
+    unlist() |>   # convert to vector
+    na_omit() |>
+    unique() |>
+    unname() |>
+    c("WLD") # add the world
+  # TEMP END: remove old aggregations --------------
 
-  out <- out[estimate_type == "actual"]
+  out <- out[estimate_type == "actual" & region_code %in% regs]
 
   return(out)
 }
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
new file mode 100644
index 00000000..4274521c
--- /dev/null
+++ b/R/utils-pipdata.R
@@ -0,0 +1,335 @@
+
+
+
+#' transform input list
+#'
+#' @inheritParams load_list_refy
+#'
+#' @return formatted list
+#' @keywords internal
+transform_input <- function(input_list){
+  country_codes <- input_list$country_code
+  years <- input_list$year
+  if (!is.list(years)) {
+    years <- lapply(country_codes, function(x) years)
+  }
+  else {
+    if (length(years) != length(country_codes)) {
+      stop("The length of the 'year' list must match the length of the 'country_code' vector.")
+    }
+  }
+  output_list <- lapply(seq_along(country_codes), function(i) {
+    lapply(years[[i]], function(y) {
+      list(country_code = country_codes[i], year = y)
+    })
+  })
+  output_list <- unlist(output_list, recursive = FALSE)
+  return(output_list)
+}
+
+
+#' Add attributes as columns (vectorized, in-place)
+#'
+#' @description
+#' Converts survey attributes on a `data.table`—including
+#' `reporting_level_rows`, `country_code`, `reporting_year`, and `dist_stats`—
+#' into columns using a **loop-free, segment-replication** strategy. Designed
+#' for very large tables and objects loaded via `readRDS()`/`load()`:
+#' uses `setDT()` and `alloc.col()` to ensure in-place assignment.
+#'
+#' @details
+#' The function expects an attribute `reporting_level_rows`, a list with:
+#' - `reporting_level`: character vector of the level label for each segment
+#'   (e.g., `c("rural","urban","rural", ...)`).
+#' - `rows`: integer vector of **cumulative** row-ends (e.g.,
+#'   `c(100000, 200000, 300000, ...)`).
+#'
+#' Segment lengths are computed as `diff(c(0L, rows))`, and `reporting_level`
+#' is replicated with `rep.int(lev, counts)`. Constants `country_code`,
+#' `reporting_year`, and `file` (`paste0(country_code, "_", reporting_year)`)
+#' are added to all rows.
If `dist_stats$mean` / `dist_stats$median` are +#' provided (as named vectors/lists keyed by level), they are mapped by level +#' name and replicated per segment. If a level is missing from the names, +#' `NA` values may result for that segment. +#' +#' This implementation avoids loops and `findInterval()` edge cases, and +#' modifies `dt` by reference. +#' +#' @param dt A `data.table` carrying the attributes described above. +#' +#' @return The same `data.table`, modified by reference, with added columns: +#' `reporting_level`, `country_code`, `reporting_year`, `file`, and (if +#' present) `mean`, `median`. +#' +#' @section Assumptions: +#' * `length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)`. +#' * `rows` are cumulative and non-decreasing, and their segment lengths sum to `nrow(dt)`. +#' * If `dist_stats$mean` / `dist_stats$median` have multiple values, their names +#' align with the level labels. +#' +#' @note For objects loaded from disk (e.g., via `readRDS()`), `alloc.col(dt)` +#' ensures there is spare column capacity for by-reference assignment. +#' +#' @seealso [add_attributes_as_columns_multi()], [assign_stat()] +#' +#' @examples +#' \dontrun{ +#' library(data.table) +#' dt <- data.table(weight = 1:6, welfare = runif(6)) +#' attr(dt, "reporting_level_rows") <- list( +#' reporting_level = c("rural","urban","rural"), +#' rows = c(2L, 4L, 6L) +#' ) +#' attr(dt, "country_code") <- "XXY" +#' attr(dt, "reporting_year") <- 2000L +#' attr(dt, "dist_stats") <- list( +#' mean = list(rural = 2.5, urban = 5.0), +#' median = list(rural = 2.0, urban = 4.5) +#' ) +#' +#' add_attributes_as_columns_vectorized(dt) +#' head(dt) +#' } +#' +#' @import data.table +#' @export +add_attributes_as_columns_vectorized <- function(dt) { + + # Ensure proper internal state & spare column capacity (handles readRDS/load cases) + setDT(dt) # harmless if already a data.table + setalloccol(dt) # pre-allocate room for new columns... #AC, I am still not sure about this. + + rl <- attr(dt, "reporting_level_rows") + lev <- rl$reporting_level + rows <- as.integer(rl$rows) + n <- fnrow(dt) + + counts <- diff(c(0L, rows)) + if (sum(counts) != n) cli::cli_abort("Sum of 'rows' in attribute does not equal nrow(dt).") + + # reporting_level: optimized assignment by range + reporting_level_vec <- character(n) + start <- 1L + for (i in seq_along(lev)) { + end <- rows[i] + reporting_level_vec[start:end] <- lev[i] + start <- end + 1L + } + dt[, reporting_level := reporting_level_vec] + + # constants + cc <- attr(dt, "country_code") + ry <- attr(dt, "reporting_year") + dt[, `:=`( + country_code = cc, + reporting_year = ry, + file = paste0(cc, "_", ry) + )] + + # dist_stats per reporting_level (align by names, then replicate by counts) + ds <- attr(dt, "dist_stats") + + + # This block processes distribution statistics (mean, median) for each reporting level. + # If this is not required at this stage, consider removing it or deferring it to a later step. 
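+  # Worked example with hypothetical values (assuming data.table is
+  # attached): given
+  #   lev = c("rural", "urban") and
+  #   ds  = list(mean = list(rural = 2.5, urban = 5.0)),
+  # every row whose reporting_level is "rural" receives mean 2.5 and every
+  # "urban" row receives mean 5.0, which is what the loop below does:
+  # x <- data.table(reporting_level = c("rural", "rural", "urban"))
+  # x[, mean := unlist(ds$mean[reporting_level])]
+  # x$mean  # 2.5 2.5 5.0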
+ if (length(ds)) { + dstats <- c("mean", "median") + for (l in lev) { + wrl <- whichv(dt$reporting_level, l) + ld <- lapply(dstats, \(d) { + if (is.null(ds[[d]][[l]])) { + NA + } else { + ds[[d]][[l]] + } + }) + + dt[wrl, (dstats) := ld] + } + } + + dt +} + + + +#' Add attributes as columns for multi-segment reporting levels +#' +#' @description +#' Converts attributes on a survey `data.table` (e.g., `reporting_level_rows`, +#' `country_code`, `reporting_year`, and `dist_stats`) into columns, handling +#' **multiple alternating segments** (e.g., CHN rural/urban/rural/urban) or +#' single-segment cases (e.g., ZAF). +#' +#' @param dt A `data.table` with attributes: +#' - `reporting_level_rows`: list with `reporting_level` (character) and +#' `rows` (integer cumulative row ends). +#' - `country_code` (character). +#' - `reporting_year` (integer/numeric). +#' - `dist_stats` (list) optionally containing `mean` and/or `median`, each as +#' a named list/vector keyed by reporting level, or a single scalar. +#' +#' @return The same `data.table`, modified by reference, with new columns: +#' `reporting_level`, `country_code`, `reporting_year`, `file`, and +#' optionally `mean`, `median`. +#' +#' @examples +#' # chn2000_cols <- add_attributes_as_columns_multi(chn2000) +#' # zaf2000_cols <- add_attributes_as_columns_multi(zaf2000) +#' @import data.table +#' @export +add_attributes_as_columns_multi <- function(dt) { + # Ensure DT internals and spare capacity for new columns + setDT(dt) + alloc.col(dt) + + # --- Pull + validate segment metadata --- + rl <- attr(dt, "reporting_level_rows") + if (is.null(rl) || is.null(rl$reporting_level) || is.null(rl$rows)) { + cli::cli_abort("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.") + } + lev <- as.character(rl$reporting_level) + rows <- as.integer(rl$rows) + n <- nrow(dt) + + if (length(lev) != length(rows)) cli::cli_abort("'reporting_level' and 'rows' lengths differ.") + if (length(rows) == 0L) cli::cli_abort("'rows' is empty.") + if (any(diff(rows) < 0L)) cli::cli_abort("'rows' must be non-decreasing.") + if (rows[length(rows)] != n) cli::cli_abort("Last element of 'rows' must equal nrow(dt).") + + counts <- diff(c(0L, rows)) + if (any(counts <= 0L)) cli::cli_abort("Computed non-positive segment length(s).") + + # --- reporting_level: vectorized per-segment replication --- + dt[, reporting_level := rep.int(lev, counts)] + + # --- constants --- + cc <- attr(dt, "country_code") + ry <- attr(dt, "reporting_year") + dt[, `:=`( + country_code = cc, + reporting_year = ry, + file = paste0(cc, "_", ry) + )] + + # --- distribution stats --- + ds <- attr(dt, "dist_stats") + if (length(ds)) { + assign_stat(dt, lev, counts, ds$mean, "mean") + assign_stat(dt, lev, counts, ds$median, "median") + } + + dt +} + + + +#' Assign a per-level statistic to a data.table column (by reference) +#' +#' @description +#' Replicates a statistic per reporting-level segment and assigns it to a new +#' column in `dt`, **in place**. `stat` can be a scalar (broadcast), a named +#' vector, or a named list (one value per level). +#' +#' @param dt A `data.table`. Modified by reference. +#' @param lev Character vector of reporting-level labels per segment +#' (e.g., `c("rural","urban","rural", ...)`). +#' @param counts Integer vector of segment lengths matching `lev` +#' (e.g., `c(100000, 100000, 100000, ...)`). +#' @param stat A numeric scalar, named vector, or named list with one value per +#' level (names must match `lev` values). 
+#' @param colname Name of the column to create/overwrite.
+#'
+#' @return Invisibly returns `dt` (modified by reference).
+#' @examples
+#' # assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean")
+#' @import data.table
+#' @export
+assign_stat <- function(dt, lev, counts, stat, colname) {
+  if (is.null(stat)) return(invisible(dt))
+  n <- nrow(dt)
+
+  v <- if (is.list(stat)) unlist(stat, use.names = TRUE) else stat
+
+  # Single scalar: broadcast
+  if (length(v) == 1L && is.null(names(v))) {
+    dt[, (colname) := rep.int(unname(v), n)]
+    return(invisible(dt))
+  }
+
+  # Need names to map values to levels
+  if (is.null(names(v))) {
+    stop("`stat` has length > 1 but no names; cannot map to levels.")
+  }
+
+  map_idx <- match(lev, names(v))
+  if (anyNA(map_idx)) {
+    missing_levels <- unique(lev[is.na(map_idx)])
+    stop(
+      sprintf("`stat` missing value(s) for level(s): %s",
+              paste(missing_levels, collapse = ", "))
+    )
+  }
+
+  dt[, (colname) := rep.int(unname(v[map_idx]), counts)]
+  invisible(dt)
+}
+
+
+
+
+
+#' extract row indices
+#'
+#' @param a list with attributes from lt
+#'
+#' @return named list with indices for reporting level
+#' @keywords internal
+get_rl_rows_single <- function(a) {
+  rl <- a$rl_rows
+  rl_rows <- vector("list", length(rl$reporting_level))
+
+  start <- 1L
+  for (i in seq_along(rl$reporting_level)) {
+    end <- rl$rows[i]
+    rl_rows[[i]] <- start:end
+    start <- end + 1L
+  }
+  setNames(rl_rows, rl$reporting_level)
+}
+
+
+#' apply get_rl_rows_single
+#' @rdname get_rl_rows_single
+get_rl_rows <- \(lt_att) {
+  lapply(lt_att, get_rl_rows_single)
+}
+
+
+#' get data.table with distribution stats
+#'
+#' this is a loop over lt attributes
+#'
+#' @param lt_att list of attributes of lt list
+#'
+#' @return data.table
+#' @keywords internal
+get_dt_dist_stats <- \(lt_att) {
+  lapply(lt_att, \(.) {
+    .$dist_stats
+  }) |>
+    rbindlist(fill = TRUE)
+}
+
+#' Get some attributes from lt list
+#'
+#' @param lt list
+#' @keywords internal
+get_lt_attr <- function(lt) {
+  lapply(lt, \(.) {
+    list(
+      dist_stats = attr(., "dt_dist_stats"),
+      rl_rows = attr(., "reporting_level_rows")
+    )
+  })
+}
diff --git a/R/utils-plumber.R b/R/utils-plumber.R
index 215310b6..13d784e1 100644
--- a/R/utils-plumber.R
+++ b/R/utils-plumber.R
@@ -565,3 +565,134 @@ change_grouped_stats_to_csv <- function(out) {
   out$deciles <- NULL
   data.frame(out)
 }
+
+
+
+
+
+#' Wrap a Plumber endpoint with standardized error handling
+#'
+#' `safe_endpoint()` wraps an endpoint handler in a `tryCatch`, ensuring
+#' consistent error handling across the API. On success, the original
+#' handler's result is returned. On error, a structured JSON object is
+#' returned with useful metadata (status, message, request ID, endpoint),
+#' and optionally additional debug details.
+#'
+#' Debug mode can be enabled by either:
+#' \itemize{
+#'   \item Passing `debug = TRUE` explicitly, or
+#'   \item Setting the environment variable `PIPAPI_DEBUG=TRUE`.
+#' }
+#' When debug mode is active, the error payload also includes the error
+#' class, call, query parameters, and a truncated traceback.
+#'
+#' @param fun A function `(req, res)` containing the endpoint logic.
+#'   This is where you parse request arguments and call the relevant
+#'   internal functions.
+#' @param endpoint Character string giving the endpoint path
+#'   (e.g., `"/api/v1/pip"`). Used in error payloads so clients know
+#'   which endpoint failed.
+#' @param debug Logical; if `NULL` (default), inherits from the
+#'   environment variable `PIPAPI_DEBUG`.
When `TRUE`, include extended +#' diagnostic details in the error response. +#' +#' @return A function `(req, res)` suitable for use in Plumber routes. +#' On error, sets `res$status <- 500` and returns a JSON object with: +#' \describe{ +#' \item{error}{A short description ("Error in /api/v1/...")} +#' \item{message}{Either the actual error message (debug mode) or +#' `"Internal Server Error"`} +#' \item{request_id}{The Plumber request ID, if available} +#' \item{endpoint}{The endpoint string supplied} +#' \item{class}{Error class (debug mode only)} +#' \item{call}{The call that generated the error (debug mode only)} +#' \item{query}{The query parameters (debug mode only)} +#' \item{trace}{Traceback captured by `rlang::trace_back()` (debug mode only)} +#' } +#' +#' @examples +#' \dontrun{ +#' # Example: wrap a handler for /api/v1/pip +#' #* @get /api/v1/pip +#' function(req, res) { +#' safe_endpoint(function(req, res) { +#' params <- req$argsQuery +#' params$lkup <- lkups$versions_paths[[req$argsQuery$version]] +#' params$version <- NULL +#' do.call(pipapi::ui_pip, params) +#' }, endpoint = "/api/v1/pip")(req, res) +#' } +#' } +#' +#' @export +safe_endpoint <- function(fun, endpoint, debug = NULL) { + if (is.null(debug)) { + debug <- identical(Sys.getenv("PIPAPI_DEBUG"), "TRUE") + } + + function(req, res) { + tryCatch( + { + fun(req, res) + }, + error = function(e) { + res$status <- 500L + out <- list( + error = paste("Error in", endpoint), + message = if (debug) conditionMessage(e) else "Internal Server Error", + request_id = tryCatch(req$.id, error = \(.) NA), + endpoint = endpoint + ) + if (debug) { + out$class <- class(e)[[1]] + out$call <- as.character(conditionCall(e)) + out$query <- req$argsQuery + out$trace <- utils::capture.output( + rlang::trace_back(bottom = 10, simplify = "branch") + ) + } + out + } + ) + } +} + +# ---- bounded execution helper ------------------------------------------- +#' Evaluate an expression with a timeout +#' +#' Wraps [R.utils::withTimeout()] but returns a structured failure +#' object instead of stopping the whole process. This allows +#' `safe_endpoint()` to handle timeouts like normal errors without +#' killing the API process. +#' +#' @param expr Expression to evaluate. +#' @param secs Timeout in seconds (default: from env var `PLUMBER_REQ_TIMEOUT`, +#' or 150 if unset). +#' +#' @return Result of `expr` if it finishes in time; otherwise a list +#' with `ok = FALSE`, `error = "timeout"`, and `elapsed` seconds. 
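+#'
+#' @examples
+#' \dontrun{
+#' # Illustrative only: a 2-second sleep against a 1-second budget should
+#' # return the structured failure object rather than raise an error.
+#' slow <- with_req_timeout(Sys.sleep(2), secs = 1)
+#' isFALSE(slow$ok) # TRUE: timed out
+#' slow$elapsed     # roughly 1 second
+#'
+#' # A fast expression is returned unchanged.
+#' with_req_timeout(1 + 1, secs = 5) # 2
+#' }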
+#' @export +with_req_timeout <- function(expr, + secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150"))) { + if (!is.finite(secs) || secs <= 0) return(force(expr)) + + start <- proc.time()[["elapsed"]] + tryCatch( + { + R.utils::withTimeout( + expr = force(expr), + timeout = secs, + onTimeout = "error" + ) + + }, + TimeoutException = \(e) { + elapsed <- proc.time()[["elapsed"]] - start + list( + ok = FALSE, + error = sprintf("Request exceeded timeout of %s seconds", secs), + elapsed = elapsed + ) + } + ) +} diff --git a/R/utils.R b/R/utils.R index ae4fdc89..7c42de11 100644 --- a/R/utils.R +++ b/R/utils.R @@ -16,9 +16,45 @@ subset_lkup <- function(country, data_dir = NULL, povline, cache_file_path, - fill_gaps + fill_gaps, + popshare = NULL ) { + + lkup <- lkup_filter(lkup, + country, + year, + valid_regions, + reporting_level, + welfare_type, + data_dir) + # If povline is NULL, this happens when popshare is passed + # i.e popshare is not NULL + if (is.null(povline)) { + return(list( + data_present_in_master = NULL, + lkup = lkup, + povline = NULL + )) + } + # Return with grace + return_if_exists( + slkup = lkup, + povline = povline, + cache_file_path = cache_file_path, + fill_gaps = fill_gaps + ) +} + + +#' @keywords internal +lkup_filter <- function(lkup, + country, + year, + valid_regions, + reporting_level, + welfare_type, + data_dir) { # STEP 1 - Keep every row by default keep <- rep(TRUE, nrow(lkup)) # STEP 2 - Select countries @@ -31,9 +67,6 @@ subset_lkup <- function(country, data_dir = data_dir, valid_regions = valid_regions) - # # step 4. Select MRV - # keep <- select_MRV(lkup, keep, year, country, valid_regions, data_dir) - # STEP 4 - Select welfare_type if (welfare_type[1] != "all") { keep <- keep & lkup$welfare_type == welfare_type @@ -45,14 +78,10 @@ subset_lkup <- function(country, lkup <- lkup[keep, ] - - # Return with grace - return_if_exists(slkup = lkup, - povline = povline, - cache_file_path = cache_file_path, - fill_gaps = fill_gaps) + return(lkup) } + #' select_country #' Helper function for subset_lkup() #' @inheritParams subset_lkup @@ -65,7 +94,19 @@ select_country <- function(lkup, keep, country, valid_regions) { # Select regions if (any(country %in% valid_regions)) { selected_regions <- country[country %in% valid_regions] - keep_regions <- lkup$region_code %in% selected_regions + # Find all columns ending with _code + code_cols <- grep("_code$", names(lkup), value = TRUE) + code_cols <- code_cols[!code_cols %in% "wb_region_code"] # Temporary solution + # For each code column, check if any value matches selected_regions + keep_regions_list <- lapply(code_cols, \(col) { + lkup[[col]] %in% selected_regions + }) + # Combine with logical OR across all code columns + if (length(keep_regions_list) > 0) { + keep_regions <- Reduce(`|`, keep_regions_list) + } else { + keep_regions <- rep(FALSE, nrow(lkup)) + } } else { keep_regions <- rep(FALSE, length(lkup$country_code)) } @@ -289,8 +330,66 @@ get_svy_data <- function(svy_id, #' #' @return data.table #' @export +add_dist_stats <- function(df, lkup, fill_gaps) { + + if (fill_gaps) { + dist_stats <- lkup[["lineup_dist_stats"]] + } else { + dist_stats <- lkup[["dist_stats"]] + } + + if (fill_gaps) { + + df <- df |> + joyn::joyn(y = dist_stats, + by = c("country_code", + "reporting_level", + "reporting_year"), + match_type = "m:1", # multiple poverty lines + keep_common_vars = FALSE, + reportvar = FALSE, + verbose = FALSE, + keep = "left") + + } else { + # Keep only relevant columns + cols <- c( + "cache_id", + # 
"country_code", + # "reporting_year", + # "welfare_type", + "reporting_level", + "gini", + "polarization", + "mld", + sprintf("decile%s", 1:10) + ) + dist_stats <- dist_stats[, .SD, .SDcols = cols] + + # merge dist stats with main table + # data.table::setnames(dist_stats, "survey_median_ppp", "median") + + df <- dist_stats[df, + on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level), + allow.cartesian = TRUE + ] + } + + + df +} + + + +#' Add pre-computed distributional stats #' -add_dist_stats <- function(df, dist_stats) { +#' @param df data.table: Data frame of poverty statistics +#' @param dist_stats data.table: Distributional stats lookup +#' +#' @return data.table +#' @export +#' +add_dist_stats_old <- function(df, dist_stats) { # Keep only relevant columns cols <- c( "cache_id", @@ -316,6 +415,7 @@ add_dist_stats <- function(df, dist_stats) { return(df) } + #' Collapse rows #' @return data.table #' @noRd @@ -1144,8 +1244,8 @@ add_distribution_type <- function(df, lkup, fill_gaps) { dt[, # distribution type by year distribution_type := fcase(use_groupdata == 1, "group", - use_imputed == 1, "imputed", - default = "micro") + use_imputed == 1, "imputed", + default = "micro") ] dt <- dt[, # collapse by reporting_year and keep relevant variables @@ -1154,9 +1254,12 @@ add_distribution_type <- function(df, lkup, fill_gaps) { } - df[, - surveyid_year := as.numeric(surveyid_year) - ][dt, + if (!fill_gaps) { + df <- df[, + surveyid_year := as.numeric(surveyid_year) + ] + } + df[dt, on = by_vars, distribution_type := i.distribution_type ][, @@ -1345,6 +1448,7 @@ get_caller_names <- function() { #' @keywords internal #' @return data.table from pip or pip_grp functions. add_vars_out_of_pipeline <- function(out, fill_gaps, lkup) { + ## Add SPL and SPR --------------- out <- add_spl(df = out, fill_gaps = fill_gaps, @@ -1358,9 +1462,9 @@ add_vars_out_of_pipeline <- function(out, fill_gaps, lkup) { ## add distribution type ------------- # based on info in framework data, rather than welfare data - out <- add_distribution_type(df = out, - lkup = lkup, - fill_gaps = fill_gaps) + out <- add_distribution_type(df = out, + lkup = lkup, + fill_gaps = fill_gaps) invisible(out) } @@ -1399,3 +1503,44 @@ unnest_dt_longer <- function(tbl, cols) { + + + +#' merge into fgt table the mean and median from dist stats table in lkup +#' +#' @param fgt data,table with fgt measures +#' @param lkup lkup +#' @param fill_gaps logical. 
whether to use lineup estimates +#' +#' @return data.table with with fgt, mean and median +#' @keywords internal +get_mean_median <- \(fgt, lkup, fill_gaps) { + + if (isFALSE(lkup$use_new_lineup_version)) return(fgt) + + if (fill_gaps) { + dist <- get_vars(lkup$lineup_dist_stats, + c("country_code", "reporting_year", + "reporting_level", "mean", "median")) + by_var <- c('country_code', "reporting_year", "reporting_level") + } else { + dist <- get_vars(lkup$dist_stats, + c("country_code", "reporting_year", + "reporting_level", "mean", + "survey_median_ppp", "welfare_type")) + setnames(dist, "survey_median_ppp", "median") + + by_var <- c('country_code', + "reporting_year", + "reporting_level", + "welfare_type") + } + join(x = fgt, + y = dist, + on = by_var, + how = "left", + validate = "m:1", # multiple povlines + verbose = 0L) +} + + diff --git a/R/zzz.R b/R/zzz.R index e534cced..094f9dda 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -17,11 +17,13 @@ pipapi_default_options <- list( max_size = as.numeric(Sys.getenv("PIPAPI_CACHE_MAX_SIZE")), prune_rate = 50) - pip <<- memoise::memoise(pip, cache = cd, omit_args = "lkup") - ui_hp_stacked <<- memoise::memoise(ui_hp_stacked, cache = cd, omit_args = "lkup") - pip_grp_logic <<- memoise::memoise(pip_grp_logic, cache = cd, omit_args = "lkup") - pip_grp <<- memoise::memoise(pip_grp, cache = cd, omit_args = "lkup") - ui_cp_charts <<- memoise::memoise(ui_cp_charts, cache = cd, omit_args = "lkup") + pip <<- memoise::memoise(pip, cache = cd, omit_args = "lkup") + ui_hp_stacked <<- memoise::memoise(ui_hp_stacked, cache = cd, omit_args = "lkup") + pip_agg <<- memoise::memoise(pip_agg, cache = cd, omit_args = "lkup") + pip_grp_new <<- memoise::memoise(pip_grp_new, cache = cd, omit_args = "lkup") + pip_grp_logic <<- memoise::memoise(pip_grp_logic, cache = cd, omit_args = "lkup") + pip_grp <<- memoise::memoise(pip_grp, cache = cd, omit_args = "lkup") + ui_cp_charts <<- memoise::memoise(ui_cp_charts, cache = cd, omit_args = "lkup") ui_cp_download <<- memoise::memoise(ui_cp_download, cache = cd, omit_args = "lkup") ui_cp_key_indicators <<- memoise::memoise(ui_cp_key_indicators, cache = cd, omit_args = "lkup") pos = 1L @@ -33,6 +35,22 @@ pipapi_default_options <- list( toset <- !(names(pipapi_default_options) %in% names(op)) if (any(toset)) options(pipapi_default_options[toset]) + + # set multi threats + # available_cores <- parallel::detectCores() - 1 + # + # cores_to_use <- max(available_cores, 1) |> + # min(8) + # set_in_pipapienv("cores_to_use", cores_to_use) + + + # pov lines to store + pl <- c(seq(from = 0.01, to = 5, by = 0.01), + seq(from = 5.1, to = 20, by = 0.1), + seq(from = 21, to = 100, by = 1), + seq(from = 105, to = 900, by = 5)) + set_in_pipapienv("pl_to_store", pl) + invisible() } diff --git a/data-raw/data.R b/data-raw/data.R index b6d075be..2a87ea42 100644 --- a/data-raw/data.R +++ b/data-raw/data.R @@ -35,61 +35,22 @@ empty_response_grp <- pip_grp("all", year, lkup = lkup, group_by = "wb") empty_response_grp <- empty_response_grp[-c(1:nrow(empty_response_grp))] -empty_response_fg <- data.table::data.table( - country_code = character(0), - survey_id = character(0), - cache_id = character(0), - wb_region_code = character(0), - reporting_year = numeric(0), - surveyid_year = character(0), - survey_year = numeric(0), - survey_time = character(0), - survey_acronym = character(0), - survey_coverage = character(0), - survey_comparability = numeric(0), - comparable_spell = character(0), - welfare_type = character(0), - reporting_level = character(0), - 
survey_mean_lcu = numeric(0), - survey_mean_ppp = numeric(0), - survey_median_ppp = numeric(0), - survey_median_lcu = numeric(0), - predicted_mean_ppp = numeric(0), - ppp = numeric(0), - cpi = numeric(0), - reporting_pop = numeric(0), - reporting_gdp = numeric(0), - reporting_pce = numeric(0), - pop_data_level = character(0), - gdp_data_level = character(0), - pce_data_level = character(0), - cpi_data_level = character(0), - ppp_data_level = character(0), - distribution_type = character(0), - gd_type = character(0), - is_interpolated = logical(0), - is_used_for_line_up = logical(0), - is_used_for_aggregation = logical(0), - estimation_type = character(0), - interpolation_id = character(0), - display_cp = numeric(0), - country_name = character(0), - africa_split = character(0), - africa_split_code = character(0), - region_name = character(0), - region_code = character(0), - world = character(0), - world_code = character(0), - path = character(0), - data_interpolation_id = character(0), - poverty_line = numeric(0), - mean = numeric(0), - median = numeric(0), - headcount = numeric(0), - poverty_gap = numeric(0), - poverty_severity = numeric(0), - watts = numeric(0) -) +fg <- fg_pip( + ctr, + year = year, + povline = 3, + welfare_type = "all", + reporting_level = "all", + popshare = NULL, + lkup = lkup +) |> + rbindlist(fill = TRUE) +empty_response_fg <- fg[-1] + + +add_vars_out_of_pipeline(fg, fill_gaps = TRUE, lkup = lkup) + +empty_response_fg_add <- fg[-1] usethis::use_data( empty_response, @@ -97,5 +58,6 @@ usethis::use_data( reporting_level_list, empty_response_grp, empty_response_fg, + empty_response_fg_add, overwrite = TRUE ) diff --git a/data/empty_response_fg.rda b/data/empty_response_fg.rda index 6785534c..31da4426 100644 Binary files a/data/empty_response_fg.rda and b/data/empty_response_fg.rda differ diff --git a/data/empty_response_fg_add.rda b/data/empty_response_fg_add.rda new file mode 100644 index 00000000..3acc0ca3 Binary files /dev/null and b/data/empty_response_fg_add.rda differ diff --git a/inst/TMP/TMP_API_launcher.R b/inst/TMP/TMP_API_launcher.R index 72e28195..7ce361b3 100644 --- a/inst/TMP/TMP_API_launcher.R +++ b/inst/TMP/TMP_API_launcher.R @@ -4,12 +4,9 @@ library(pipapi) # devtools::load_all(".") if (Sys.info()[["user"]] == "wb384996") { - force <- FALSE - if (!"lkups" %in% ls() || isTRUE(force)) { - data_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |> - fs::path() - fs::dir_ls(data_dir, recurse = FALSE) - } + data_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |> + fs::path() + fs::dir_ls(data_dir, recurse = FALSE) latest_version <- diff --git a/inst/TMP/TMP_duckdb_cache.R b/inst/TMP/TMP_duckdb_cache.R index e37562d9..b1e7137a 100644 --- a/inst/TMP/TMP_duckdb_cache.R +++ b/inst/TMP/TMP_duckdb_cache.R @@ -1,38 +1,60 @@ +# SETUP ------------- + devtools::load_all(".") -force <- FALSE -if (!"lkups" %in% ls() || isTRUE(force)) { - data_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |> - fs::path() - fs::dir_ls(data_dir, recurse = FALSE) -} +library(fastverse) +root_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |> + fs::path() +fs::dir_ls(root_dir, recurse = FALSE) +# latest_version <- - pipapi:::available_versions(data_dir) |> + available_versions(root_dir) |> max() -latest_version <- NULL -latest_version <- "20240627_2017_01_02_PROD" -lkups <- create_versioned_lkups(data_dir, - vintage_pattern = latest_version) +lkups <- create_versioned_lkups(root_dir, + vintage_pattern = "^202509.+2017.+(PROD)$") +# lkups <- create_versioned_lkups(root_dir, +# 
vintage_pattern = latest_version)
 
-lkup <- lkups$versions_paths[[lkups$latest_release]]
+# lkup <- lkups$versions_paths[[lkups$versions[[2]]]]
+ver_to_use <- lkups$latest_release # this is important. You need this object below
+lkup <- lkups$versions_paths[[ver_to_use]]
 
-reset_cache(lkup = lkup)
+# DEBUG -------------
 
-# 1.
-pip(country = "all", year = 2000, lkup = lkup)
-# 2.
-pip(country = "AGO", year = 2000, lkup = lkup)
+options(pipapi.query_live_data = FALSE)
+getOption("pipapi.query_live_data")
 
-pip(country = "all", year = "all", lkup = lkup)
+reset_cache(lkup = lkup)
 
+tictoc::tic()
+sv <- pip(country = "ALL",
+          year = "ALL",
+          povline = lkup$pl_lkup$poverty_line,
+          lkup = lkup,
+          fill_gaps = FALSE)
+tictoc::toc()
 
-pip(country = "IND", year = 2018, lkup = lkup)
+tictoc::tic()
+fg <- pip(country = "ALL",
+          year = "ALL",
+          povline = lkup$pl_lkup$poverty_line,
+          lkup = lkup,
+          fill_gaps = TRUE)
+tictoc::toc()
 
-pip(country = "IND", year = "all", lkup = lkup)
+
+# copy cache to TFS folder
+ori_cache <- fs::path(lkup$data_root, "cache.duckdb")
+dest_cache <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_SERVER") |>
+  fs::path(ver_to_use, "cache.duckdb")
+
+if (fs::file_exists(ori_cache)) {
+  fs::file_copy(ori_cache, dest_cache, overwrite = TRUE)
+}
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 626f4b6a..7acae90b 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -4,8 +4,84 @@
 
 library(pipapi)
 
+# ---- tiny telemetry (ID + timings) --------------------------------------
+
+# Monotonic wall time for durations (in seconds)
+.now <- function() {
+  proc.time()[["elapsed"]]
+}
+
+# Generate a request id AND return a decoded structure
+# - id_raw: "milliseconds-since-epoch-random"
+# - timestamp: POSIXct in UTC
+# - random: integer
+.req_id <- function() {
+  ts_ms_num <- as.numeric(Sys.time()) * 1000
+  ts_ms_chr <- format(ts_ms_num, scientific = FALSE, trim = TRUE)
+  rnd <- sample.int(1e9, 1)
+  id_raw <- paste0(ts_ms_chr, "-", rnd)
+
+  list(
+    id_raw = id_raw,
+    timestamp = as.POSIXct(as.numeric(ts_ms_chr) / 1000,
+                           origin = "1970-01-01", tz = "UTC"),
+    random = rnd
+  )
+}
+
+`%||%` <- function(a, b) if (!is.null(a)) a else b
+
+# ---- Always-on request context (one log line per request) ----
+#* @filter ctx
+function(req, res) {
+  rid <- .req_id() # list(id_raw, timestamp, random)
+
+  req$.id <- rid$id_raw
+  res$setHeader("X-Request-ID", req$.id)
+
+  # keep decoded pieces if helpful
+  req$.id_time <- rid$timestamp
+  req$.id_rand <- rid$random
+
+  req$.start <- .now()
+  req$.path <- req$PATH_INFO %||% ""
+  req$.meth <- req$REQUEST_METHOD %||% ""
+
+  on.exit({
+    total <- .now() - req$.start
+    cat(sprintf(
+      '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n',
+      req$.id, req$.meth, req$.path, as.character(res$status %||% NA_integer_), total
+    ), file = stderr())
+  }, add = TRUE)
+
+  forward()
+}
+
+# ---- Serialization timing hooks (kept here to avoid duplication) ----
+#* @plumber
+function(pr) {
+  pr |>
+    pr_hook("preserialize", function(req, res) {
+      req$.ser0 <- .now()
+    }) |>
+    pr_hook("postserialize", function(req, res) {
+      if (!is.null(req$.ser0) && !is.na(req$.ser0)) {
+        ser <- .now() - req$.ser0
+        cat(sprintf(
+          '{"type":"serialize","id":"%s","path":"%s","dur_s":%.6f}\n',
+          req$.id %||% "", req$.path %||% "", ser
+        ), file = stderr())
+      }
+    })
+}
+
+
+
+# ========================================================================
 # API filters -------------------------------------------------------------
-## Validate
version parameter ---- +# ======================================================================== + #* Ensure that version parameter is correct #* @filter validate_version @@ -151,7 +227,7 @@ function(req, res) { # Round poverty line # This is to prevent users to abuse the API by passing too many decimals if (!is.null(req$argsQuery$povline)) { - req$argsQuery$povline <- round(req$argsQuery$povline, digits = 3) + req$argsQuery$povline <- round(req$argsQuery$povline, digits = 2) } } plumber::forward() @@ -248,15 +324,33 @@ function(req, res) { #* Default is FALSE function(req, res) { - # Process request - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[params$version]] - res$serializer <- pipapi::assign_serializer(format = params$format) - params$format <- NULL - params$version <- NULL + # Defensive error handling for pip endpoint + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[params$version]] + res$serializer <- pipapi::assign_serializer(format = params$format) + params$format <- NULL + params$version <- NULL - out <- do.call(pipapi::pip, params) - out + out <- do.call(pipapi::pip, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, # added in the ctx filter + endpoint = "/api/v1/pip" + )) + } + + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/pip", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA) + ) + }) } ### pip-grp ---------- @@ -275,15 +369,33 @@ function(req, res) { #* @param additional_ind:[bool] Additional indicators based on standard PIP output. #* Default is FALSE function(req, res) { - # Process request - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[params$version]] - res$serializer <- pipapi::assign_serializer(format = params$format) - params$format <- NULL - params$version <- NULL + # Defensive error handling for pip-grp endpoint + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[params$version]] + res$serializer <- pipapi::assign_serializer(format = params$format) + params$format <- NULL + params$version <- NULL - out <- do.call(pipapi::pip_grp_logic, params) - out + out <- do.call(pipapi::pip_agg, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, # added in the ctx filter + endpoint = "/api/v1/pip-grp" + )) + } + + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/pip-grp", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA) + ) + }) } ### aux ------------------ @@ -299,7 +411,7 @@ function(req, res) { params <- req$argsQuery res$serializer <- pipapi::assign_serializer(format = params$format) - if (is.null(req$args$table)) { + if (is.null(req$argsQuery$table)) { # return all available tables if none selected list_of_tables <- lkups$versions_paths[[params$version]]$aux_tables out <- data.frame(tables = list_of_tables) @@ -606,8 +718,9 @@ function(req) { #* @param version:[chr] Data version. Defaults to most recent version. 
See api/v1/versions #* @serializer json list(na="null") function(req) { - pipapi::get_aux_table(data_dir = lkups$versions_paths[[req$argsQuery$version]]$data_root, + out <- pipapi::get_aux_table(data_dir = lkups$versions_paths[[req$argsQuery$version]]$data_root, table = "indicators") + out } ### decomposition-vars -------------- @@ -630,13 +743,28 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - - do.call(pipapi::ui_hp_stacked, params) - +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + out <- do.call(pipapi::ui_hp_stacked, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/hp-stacked" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/hp-stacked", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } ### hp-countries ------------- @@ -647,11 +775,28 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - do.call(pipapi::ui_hp_countries, params) +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + out <- do.call(pipapi::ui_hp_countries, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/hp-countries" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/hp-countries", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } @@ -671,13 +816,29 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json list(na = "null") -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - - out <- do.call(pipapi::ui_pc_charts, params) - return(out) +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + params$censor <- TRUE + out <- do.call(pipapi::ui_pc_charts, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/pc-charts" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/pc-charts", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) 
NA)) + }) } ### pc-download ----------- @@ -699,6 +860,7 @@ function(req) { params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$pop_units <- 1 params$version <- NULL + params$censor <- TRUE do.call(pipapi::ui_pc_charts, params) @@ -714,13 +876,29 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - - do.call(pipapi::ui_pc_regional, params) - +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + out <- do.call(pipapi::ui_pc_regional, params) |> + with_req_timeout() + + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/pc-regional-aggregates" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/pc-regional-aggregates", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } @@ -735,11 +913,28 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json list(na="null") -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - do.call(pipapi::ui_cp_key_indicators, params) +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + out <- do.call(pipapi::ui_cp_key_indicators, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/cp-key-indicators" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/cp-key-indicators", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } @@ -752,13 +947,17 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json -function(req) { +cp_charts <- safe_endpoint(function(req, res) { params <- req$argsQuery params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - do.call(pipapi::ui_cp_charts, params) -} + + # wrap the heavy work in with_req_timeout + do.call(pipapi::ui_cp_charts, params) |> + with_req_timeout() + + }, + endpoint = "/api/v1/cp-charts") ### cp-download ----------- #* Return Country Profile - Downloads @@ -792,25 +991,18 @@ function(req, res) { #* @param exclude:[bool] exclude countries. 
only applies for "countries" table function(req, res) { params <- req$argsQuery - exclude <- req$argsQuery$exclude res$serializer <- pipapi::assign_serializer(format = params$format) - if (is.null(req$args$table)) { + if (is.null(req$argsQuery$table)) { # return all available tables if none selected list_of_tables <- lkups$versions_paths[[params$version]]$aux_tables out <- data.frame(tables = list_of_tables) } else { # Return only requested table params$data_dir <- lkups$versions_paths[[params$version]]$data_root + params$lkup <- lkups$versions_paths[[params$version]] params$format <- NULL params$version <- NULL - params$exclude <- NULL out <- do.call(pipapi::get_aux_table_ui, params) - - if (req$args$table == "countries" && exclude == TRUE) { - # hardcoded - to_remove <- c("MDG", "UKR") - out <- out[!(country_code %in% to_remove)] - } } out } diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R index 9df24ad4..0b36deab 100644 --- a/inst/plumber/v1/plumber.R +++ b/inst/plumber/v1/plumber.R @@ -1,38 +1,86 @@ +# ---- process-level thread caps ------------------------------------------ +# (Avoid oversubscription; helps with stability under load) +Sys.setenv( + OPENBLAS_NUM_THREADS = "1", + MKL_NUM_THREADS = "1", + OMP_NUM_THREADS = "1" +) + +ncores <- parallel::detectCores(logical = FALSE) +data.table::setDTthreads(max(1L, ncores)) +collapse::set_collapse(nthreads = max(1L, ncores)) +fst::threads_fst(max(1L, ncores)) + +# local fallbacks used only in this file (do not depend on endpoints.R) +`%||%` <- function(a, b) if (!is.null(a)) a else b +.now_p <- function() proc.time()[["elapsed"]] + +# ---- build router -------------------------------------------------------- library(plumber) endpoints_path <- system.file("plumber/v1/endpoints.R", package = "pipapi") -api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi") -# convert_empty <- pipapi:::convert_empty - -plumber::pr(endpoints_path) |> - # pre-route log - plumber::pr_hook("preroute", function() { - # log_separator() - # tictoc::tic("route") # Start timer for log info - }) |> - # post-route log - plumber::pr_hook("postroute", function(req, res) { - # end_route <- tictoc::toc(quiet = TRUE) - # log_info('route: {convert_empty(req$REMOTE_ADDR)} {convert_empty(req$REQUEST_METHOD)} {convert_empty(req$PATH_INFO)} {convert_empty(req$QUERY_STRING)} {convert_empty(res$status)} {round(end_route$toc - end_route$tic, digits = getOption("digits", 6))}') - }) |> - # pre-serialization log - plumber::pr_hook("preserialize", function() { - # tictoc::tic("serialize") - }) |> - # post-serialization log - plumber::pr_hook("postserialize", function(req) { - # end_serial <- tictoc::toc(quiet = TRUE) - # log_info('serialize: {convert_empty(req$PATH_INFO)} {round(end_serial$toc - end_serial$tic, digits = getOption("digits", 6))}') - # log_separator() - }) |> - plumber::pr_hook("exit", function() { - # log_info('Bye bye: {proc.time()[["elapsed"]]}') - }) |> - # Set API spec - plumber::pr_set_api_spec(api = function(spec) { - spec$info$version <- utils::packageVersion("pipapi") |> - as.character() - spec - }) |> - plumber::pr_set_api_spec( - yaml::read_yaml(api_spec_path)) +api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi") + +pr <- plumber::pr(endpoints_path) |> + + # ---- Post-route: log handler duration (separate from total access time) ---- +plumber::pr_hook("postroute", function(req, res) { + if (!is.null(req$.start)) { + dur <- .now_p() - req$.start + cat( + sprintf( + 
'{"type":"route","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n', + req$.id %||% "", + req$.meth %||% "", + req$.path %||% "", + as.character(res$status %||% NA_integer_), + dur + ), + file = stderr() + ) + } +}) |> + + # ---- Exit hook: when process shuts down ---- +plumber::pr_hook("exit", function() { + cat( + sprintf( + '{"type":"exit","uptime_s":%.2f}\n', + proc.time()[["elapsed"]] + ), + file = stderr() + ) +}) |> + + # ---- Global error handler (must return a serializable object) ---- +plumber::pr_set_error(function(req, res, err) { + method <- req$REQUEST_METHOD %||% "" + path <- req$PATH_INFO %||% "" + rid <- req$.id %||% "NA" + + cat( + sprintf( + '{"type":"error","id":"%s","method":"%s","path":"%s","msg":%s}\n', + rid, method, path, jsonlite::toJSON(err$message, auto_unbox = TRUE) + ), + file = stderr() + ) + + res$status <- 500 + list( + error = "Internal Server Error", + message = err$message, + path = path, + method = method, + request_id = rid + ) +}) |> + + # ---- API Spec (with dynamic version injection) ---- +plumber::pr_set_api_spec(api = function(spec) { + spec$info$version <- as.character(utils::packageVersion("pipapi")) + spec +}) |> + plumber::pr_set_api_spec(yaml::read_yaml(api_spec_path)) + +pr diff --git a/man/add_attributes_as_columns_multi.Rd b/man/add_attributes_as_columns_multi.Rd new file mode 100644 index 00000000..27a0d5a7 --- /dev/null +++ b/man/add_attributes_as_columns_multi.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-pipdata.R +\name{add_attributes_as_columns_multi} +\alias{add_attributes_as_columns_multi} +\title{Add attributes as columns for multi-segment reporting levels} +\usage{ +add_attributes_as_columns_multi(dt) +} +\arguments{ +\item{dt}{A \code{data.table} with attributes: +\itemize{ +\item \code{reporting_level_rows}: list with \code{reporting_level} (character) and +\code{rows} (integer cumulative row ends). +\item \code{country_code} (character). +\item \code{reporting_year} (integer/numeric). +\item \code{dist_stats} (list) optionally containing \code{mean} and/or \code{median}, each as +a named list/vector keyed by reporting level, or a single scalar. +}} +} +\value{ +The same \code{data.table}, modified by reference, with new columns: +\code{reporting_level}, \code{country_code}, \code{reporting_year}, \code{file}, and +optionally \code{mean}, \code{median}. +} +\description{ +Converts attributes on a survey \code{data.table} (e.g., \code{reporting_level_rows}, +\code{country_code}, \code{reporting_year}, and \code{dist_stats}) into columns, handling +\strong{multiple alternating segments} (e.g., CHN rural/urban/rural/urban) or +single-segment cases (e.g., ZAF). 
+} +\examples{ +# chn2000_cols <- add_attributes_as_columns_multi(chn2000) +# zaf2000_cols <- add_attributes_as_columns_multi(zaf2000) +} diff --git a/man/add_attributes_as_columns_vectorized.Rd b/man/add_attributes_as_columns_vectorized.Rd new file mode 100644 index 00000000..40e8008d --- /dev/null +++ b/man/add_attributes_as_columns_vectorized.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-pipdata.R +\name{add_attributes_as_columns_vectorized} +\alias{add_attributes_as_columns_vectorized} +\title{Add attributes as columns (vectorized, in-place)} +\usage{ +add_attributes_as_columns_vectorized(dt) +} +\arguments{ +\item{dt}{A \code{data.table} carrying the attributes described above.} +} +\value{ +The same \code{data.table}, modified by reference, with added columns: +\code{reporting_level}, \code{country_code}, \code{reporting_year}, \code{file}, and (if +present) \code{mean}, \code{median}. +} +\description{ +Converts survey attributes on a \code{data.table}—including +\code{reporting_level_rows}, \code{country_code}, \code{reporting_year}, and \code{dist_stats}— +into columns using a \strong{loop-free, segment-replication} strategy. Designed +for very large tables and objects loaded via \code{readRDS()}/\code{load()}: +uses \code{setDT()} and \code{alloc.col()} to ensure in-place assignment. +} +\details{ +The function expects an attribute \code{reporting_level_rows}, a list with: +\itemize{ +\item \code{reporting_level}: character vector of the level label for each segment +(e.g., \code{c("rural","urban","rural", ...)}). +\item \code{rows}: integer vector of \strong{cumulative} row-ends (e.g., +\code{c(100000, 200000, 300000, ...)}). +} + +Segment lengths are computed as \code{diff(c(0L, rows))}, and \code{reporting_level} +is replicated with \code{rep.int(lev, counts)}. Constants \code{country_code}, +\code{reporting_year}, and \code{file} (\code{paste0(country_code, "_", reporting_year)}) +are added to all rows. If \code{dist_stats$mean} / \code{dist_stats$median} are +provided (as named vectors/lists keyed by level), they are mapped by level +name and replicated per segment. If a level is missing from the names, +\code{NA} values may result for that segment. + +This implementation avoids loops and \code{findInterval()} edge cases, and +modifies \code{dt} by reference. +} +\note{ +For objects loaded from disk (e.g., via \code{readRDS()}), \code{alloc.col(dt)} +ensures there is spare column capacity for by-reference assignment. +} +\section{Assumptions}{ + +\itemize{ +\item \code{length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)}. +\item \code{rows} are cumulative and non-decreasing, and their segment lengths sum to \code{nrow(dt)}. +\item If \code{dist_stats$mean} / \code{dist_stats$median} have multiple values, their names +align with the level labels. 
+} +} + +\examples{ +\dontrun{ +library(data.table) +dt <- data.table(weight = 1:6, welfare = runif(6)) +attr(dt, "reporting_level_rows") <- list( + reporting_level = c("rural","urban","rural"), + rows = c(2L, 4L, 6L) +) +attr(dt, "country_code") <- "XXY" +attr(dt, "reporting_year") <- 2000L +attr(dt, "dist_stats") <- list( + mean = list(rural = 2.5, urban = 5.0), + median = list(rural = 2.0, urban = 4.5) +) + +add_attributes_as_columns_vectorized(dt) +head(dt) +} + +} +\seealso{ +\code{\link[=add_attributes_as_columns_multi]{add_attributes_as_columns_multi()}}, \code{\link[=assign_stat]{assign_stat()}} +} diff --git a/man/add_dist_stats.Rd b/man/add_dist_stats.Rd index 8b9ba3bf..5ccd113c 100644 --- a/man/add_dist_stats.Rd +++ b/man/add_dist_stats.Rd @@ -4,7 +4,7 @@ \alias{add_dist_stats} \title{Add pre-computed distributional stats} \usage{ -add_dist_stats(df, dist_stats) +add_dist_stats(df, lkup, fill_gaps) } \arguments{ \item{df}{data.table: Data frame of poverty statistics} diff --git a/man/add_dist_stats_old.Rd b/man/add_dist_stats_old.Rd new file mode 100644 index 00000000..e59eb531 --- /dev/null +++ b/man/add_dist_stats_old.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{add_dist_stats_old} +\alias{add_dist_stats_old} +\title{Add pre-computed distributional stats} +\usage{ +add_dist_stats_old(df, dist_stats) +} +\arguments{ +\item{df}{data.table: Data frame of poverty statistics} + +\item{dist_stats}{data.table: Distributional stats lookup} +} +\value{ +data.table +} +\description{ +Add pre-computed distributional stats +} diff --git a/man/assign_stat.Rd b/man/assign_stat.Rd new file mode 100644 index 00000000..36cb81b9 --- /dev/null +++ b/man/assign_stat.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-pipdata.R +\name{assign_stat} +\alias{assign_stat} +\title{Assign a per-level statistic to a data.table column (by reference)} +\usage{ +assign_stat(dt, lev, counts, stat, colname) +} +\arguments{ +\item{dt}{A \code{data.table}. Modified by reference.} + +\item{lev}{Character vector of reporting-level labels per segment +(e.g., \code{c("rural","urban","rural", ...)}).} + +\item{counts}{Integer vector of segment lengths matching \code{lev} +(e.g., \code{c(100000, 100000, 100000, ...)}).} + +\item{stat}{A numeric scalar, named vector, or named list with one value per +level (names must match \code{lev} values).} + +\item{colname}{Name of the column to create/overwrite.} +} +\value{ +Invisibly returns \code{dt} (modified by reference). +} +\description{ +Replicates a statistic per reporting-level segment and assigns it to a new +column in \code{dt}, \strong{in place}. \code{stat} can be a scalar (broadcast), a named +vector, or a named list (one value per level). 
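A runnable variant of the one-line example in this Rd, using toy segments; the real `lev`/`counts` pair would come from the `reporting_level_rows` attribute:

```r
library(data.table)
dt     <- data.table(welfare = runif(6))
lev    <- c("rural", "urban", "rural")
counts <- c(2L, 2L, 2L)
assign_stat(dt, lev, counts, 3.2, "mean")              # scalar is broadcast
assign_stat(dt, lev, counts,
            list(rural = 2.0, urban = 4.5), "median")  # mapped by level name
```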
+}
+\examples{
+# assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean")
+}
diff --git a/man/build_pair_dict.Rd b/man/build_pair_dict.Rd
new file mode 100644
index 00000000..c0c1d212
--- /dev/null
+++ b/man/build_pair_dict.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{build_pair_dict}
+\alias{build_pair_dict}
+\title{Build dictionary for id/reporting_level encoding}
+\usage{
+build_pair_dict(lkup, fill_gaps = TRUE)
+}
+\arguments{
+\item{lkup}{Lookup object containing refy_lkup and svy_lkup.}
+
+\item{fill_gaps}{Logical, TRUE for lineup years, FALSE for survey years.}
+}
+\value{
+data.table with columns id, reporting_level, and code.
+}
+\description{
+Creates a data.table dictionary for mapping (id, reporting_level) pairs to integer codes for fast joins and decoding.
+Used for efficient merging and decoding in the FGT pipeline.
+}
+\keyword{internal}
diff --git a/man/compute_fgt.Rd b/man/compute_fgt.Rd
new file mode 100644
index 00000000..0a5b479e
--- /dev/null
+++ b/man/compute_fgt.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_fgt_new.R
+\name{compute_fgt}
+\alias{compute_fgt}
+\title{Efficient FGT calculation for vectors (No data.table)}
+\usage{
+compute_fgt(w, wt, povlines)
+}
+\arguments{
+\item{w}{character: welfare variable name}
+
+\item{wt}{character: weight variable name}
+
+\item{povlines}{double: vector with poverty lines}
+}
+\value{
+data.table with poverty estimates
+}
+\description{
+Efficient FGT calculation for vectors (No data.table)
+}
+\keyword{internal}
diff --git a/man/compute_fgt_dt.Rd b/man/compute_fgt_dt.Rd
index 80504ccb..ac27c0f1 100644
--- a/man/compute_fgt_dt.Rd
+++ b/man/compute_fgt_dt.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/rg_pip.R
+% Please edit documentation in R/compute_fgt_new.R
 \name{compute_fgt_dt}
 \alias{compute_fgt_dt}
 \title{Title}
 \usage{
-compute_fgt_dt(dt, welfare, weight, povlines)
+compute_fgt_dt(dt, welfare, weight, povlines, mean_and_med = FALSE)
 }
 \arguments{
 \item{dt}{data frame with \code{welfare} and \code{weight} columns}
diff --git a/man/compute_fgt_dt_old.Rd b/man/compute_fgt_dt_old.Rd
new file mode 100644
index 00000000..58f12e5e
--- /dev/null
+++ b/man/compute_fgt_dt_old.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_fgt_old.R
+\name{compute_fgt_dt_old}
+\alias{compute_fgt_dt_old}
+\title{Title}
+\usage{
+compute_fgt_dt_old(dt, welfare, weight, povlines)
+}
+\arguments{
+\item{dt}{data frame with \code{welfare} and \code{weight} columns}
+
+\item{welfare}{character: welfare variable name}
+
+\item{weight}{character: weight variable name}
+
+\item{povlines}{double: vector with poverty lines}
+}
+\value{
+data.table with poverty estimates
+}
+\description{
+Title
+}
+\keyword{internal}
diff --git a/man/create_countries_vctr.Rd b/man/create_countries_vctr.Rd
index 87a18e80..c62f8276 100644
--- a/man/create_countries_vctr.Rd
+++ b/man/create_countries_vctr.Rd
@@ -4,16 +4,16 @@
 \alias{create_countries_vctr}
 \title{Create countries vectors}
 \usage{
-create_countries_vctr(country, year, valid_years, aux_files)
+create_countries_vctr(country, year, lkup)
 }
 \arguments{
 \item{country}{character: Country ISO 3 codes}
 
 \item{year}{integer: Reporting year}
 
-\item{valid_years}{list: Valid years information provided through lkup object}
+\item{lkup}{lkup
object} -\item{aux_files}{list: List of auxiliary tables provided through lkup object} +\item{valid_years}{list: Valid years information provided through lkup object} } \value{ a list of vectors with countries and regions code to be used in diff --git a/man/create_full_list.Rd b/man/create_full_list.Rd new file mode 100644 index 00000000..5fe3dbc9 --- /dev/null +++ b/man/create_full_list.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip.R +\name{create_full_list} +\alias{create_full_list} +\title{Create full list for fg data load, not including country-years in cache} +\usage{ +create_full_list(metadata) +} +\arguments{ +\item{metadata}{data table from subset_lkup()$lkup} +} +\value{ +data.table +} +\description{ +Create full list for fg data load, not including country-years in cache +} diff --git a/man/decode_pairs.Rd b/man/decode_pairs.Rd new file mode 100644 index 00000000..cefc628b --- /dev/null +++ b/man/decode_pairs.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{decode_pairs} +\alias{decode_pairs} +\title{Decode integer code to (id, reporting_level) labels} +\usage{ +decode_pairs( + DT, + dict, + code_col = "id_rl", + id_col = "id", + level_col = "reporting_level", + keep_code = FALSE, + add_true_vars = TRUE, + verbose = 0L +) +} +\arguments{ +\item{DT}{data.table to decode.} + +\item{dict}{data.table from build_pair_dict().} + +\item{code_col}{Name of code column in DT.} + +\item{id_col}{Name of id column in dict.} + +\item{level_col}{Name of reporting level column in dict.} + +\item{keep_code}{Logical, keep code column if TRUE.} + +\item{add_true_vars}{Logical, add country_code and reporting_year columns and remove id.} + +\item{verbose}{Integer, verbosity level.} +} +\value{ +data.table with id and reporting_level columns added. +} +\description{ +Joins a data.table with a dictionary to recover id and reporting_level columns from integer codes. +Used after FGT calculations to restore human-readable labels. 
+} +\keyword{internal} diff --git a/man/empty_response_fg_add.Rd b/man/empty_response_fg_add.Rd new file mode 100644 index 00000000..7bb34bb1 --- /dev/null +++ b/man/empty_response_fg_add.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pipapi-package.R +\docType{data} +\name{empty_response_fg_add} +\alias{empty_response_fg_add} +\title{Dataframe for fill gaps empty response with additional variables} +\format{ +Data frame with 0 rows and 56 columns +} +\usage{ +data(empty_response_fg_add) +} +\description{ +Dataframe for fill gaps empty response with additional variables +} +\keyword{datasets} diff --git a/man/encode_pairs.Rd b/man/encode_pairs.Rd new file mode 100644 index 00000000..01841436 --- /dev/null +++ b/man/encode_pairs.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{encode_pairs} +\alias{encode_pairs} +\title{Encode (id, reporting_level) pairs as integer codes} +\usage{ +encode_pairs( + DT, + dict, + id_col = "id", + level_col = "reporting_level", + code_col = "id_rl", + drop_labels = FALSE, + strict = TRUE, + verbose = 0L +) +} +\arguments{ +\item{DT}{data.table to encode.} + +\item{dict}{data.table from build_pair_dict().} + +\item{id_col}{Name of id column.} + +\item{level_col}{Name of reporting level column.} + +\item{code_col}{Name of code column to write.} + +\item{drop_labels}{Logical, drop id and level columns if TRUE.} + +\item{strict}{Logical, error if any pairs are missing from dict.} + +\item{verbose}{Integer, verbosity level.} +} +\value{ +data.table with code column added. +} +\description{ +Joins a data.table with a dictionary to add an integer code column for each (id, reporting_level) pair. +Used for efficient grouping and decoding in the FGT pipeline. 
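Taken together with build_pair_dict() and decode_pairs() above, the round trip looks roughly like this; the dictionary below is a toy stand-in shaped like the documented build_pair_dict() output:

```r
# Hedged round-trip sketch with a toy dictionary (columns id,
# reporting_level, code; real dictionaries come from build_pair_dict()):
library(data.table)
dict <- data.table(id              = c("CHN_2000", "CHN_2000"),
                   reporting_level = c("rural", "urban"),
                   code            = 1:2)
DT <- data.table(id = "CHN_2000", reporting_level = "urban", welfare = 1)
DT <- encode_pairs(DT, dict, drop_labels = TRUE)  # adds integer id_rl, drops labels
DT <- decode_pairs(DT, dict)                      # restores id / reporting_level
```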
+} +\keyword{internal} diff --git a/man/fg_assign_nas_values_to_dup_cols_old.Rd b/man/fg_assign_nas_values_to_dup_cols_old.Rd new file mode 100644 index 00000000..da765eea --- /dev/null +++ b/man/fg_assign_nas_values_to_dup_cols_old.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip_old.R +\name{fg_assign_nas_values_to_dup_cols_old} +\alias{fg_assign_nas_values_to_dup_cols_old} +\title{OLD: Coerce variable causing potential duplicates to NAs} +\usage{ +fg_assign_nas_values_to_dup_cols_old(df, cols) +} +\arguments{ +\item{df}{data.table: Table of results created in \code{fg_pip()}} + +\item{cols}{character: Columns with potential duplicate values} +} +\value{ +data.table +} +\description{ +OLD: Coerce variable causing potential duplicates to NAs +} diff --git a/man/fg_pip.Rd b/man/fg_pip.Rd index 9012b01b..17faee02 100644 --- a/man/fg_pip.Rd +++ b/man/fg_pip.Rd @@ -12,7 +12,8 @@ fg_pip( welfare_type, reporting_level, ppp, - lkup + lkup, + pipenv = NULL ) } \arguments{ diff --git a/man/fg_pip_old.Rd b/man/fg_pip_old.Rd new file mode 100644 index 00000000..456989ea --- /dev/null +++ b/man/fg_pip_old.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip_old.R +\name{fg_pip_old} +\alias{fg_pip_old} +\title{Compute imputed year stats} +\usage{ +fg_pip_old( + country, + year, + povline, + popshare, + welfare_type, + reporting_level, + ppp, + lkup +) +} +\arguments{ +\item{country}{character: Country ISO 3 codes} + +\item{year}{integer: Reporting year} + +\item{povline}{numeric: Poverty line} + +\item{popshare}{numeric: Proportion of the population living below the +poverty line} + +\item{welfare_type}{character: Welfare type} + +\item{reporting_level}{character: Geographical reporting level} + +\item{ppp}{numeric: Custom Purchase Power Parity value} + +\item{lkup}{list: A list of lkup tables} +} +\value{ +data.frame +} +\description{ +Compute the main PIP poverty and inequality statistics for imputed years. 
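For orientation, a hypothetical call mirroring the documented signature; this is an internal function, and the lkups object here stands in for whatever the package's lookup constructor returns:

```r
# Hypothetical call, mirroring the documented signature (illustrative only):
res <- fg_pip_old(country = "AGO", year = 2005, povline = 1.9,
                  popshare = NULL, welfare_type = "all",
                  reporting_level = "national", ppp = NULL, lkup = lkups)
```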
+} +\keyword{internal} diff --git a/man/fg_remove_duplicates.Rd b/man/fg_remove_duplicates.Rd index b9d9c87b..9b6b79dc 100644 --- a/man/fg_remove_duplicates.Rd +++ b/man/fg_remove_duplicates.Rd @@ -9,7 +9,8 @@ fg_remove_duplicates( cols = c("comparable_spell", "cpi", "display_cp", "gd_type", "path", "predicted_mean_ppp", "survey_acronym", "survey_comparability", "survey_coverage", "survey_id", "survey_mean_lcu", "survey_mean_ppp", "survey_median_lcu", - "survey_median_ppp", "survey_time", "survey_year", "surveyid_year") + "survey_median_ppp", "survey_time", "survey_year", "surveyid_year"), + use_new_lineup_version = FALSE ) } \arguments{ diff --git a/man/fg_remove_duplicates_old.Rd b/man/fg_remove_duplicates_old.Rd new file mode 100644 index 00000000..2190c4f1 --- /dev/null +++ b/man/fg_remove_duplicates_old.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip_old.R +\name{fg_remove_duplicates_old} +\alias{fg_remove_duplicates_old} +\title{OLD: Remove duplicated rows created during the interpolation process} +\usage{ +fg_remove_duplicates_old( + df, + cols = c("comparable_spell", "cpi", "display_cp", "gd_type", "path", + "predicted_mean_ppp", "survey_acronym", "survey_comparability", "survey_coverage", + "survey_id", "survey_mean_lcu", "survey_mean_ppp", "survey_median_lcu", + "survey_median_ppp", "survey_time", "survey_year", "surveyid_year") +) +} +\arguments{ +\item{df}{data.table: Table of results created in \code{fg_pip()}} + +\item{cols}{character: Columns with potential duplicate values} +} +\value{ +data.table +} +\description{ +OLD: Remove duplicated rows created during the interpolation process +} diff --git a/man/fg_standardize_cache_id_old.Rd b/man/fg_standardize_cache_id_old.Rd new file mode 100644 index 00000000..fd56a1b4 --- /dev/null +++ b/man/fg_standardize_cache_id_old.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip_old.R +\name{fg_standardize_cache_id_old} +\alias{fg_standardize_cache_id_old} +\title{OLD: Standardize cache_id format to avoid duplication of rows} +\usage{ +fg_standardize_cache_id_old(cache_id, interpolation_id, reporting_level) +} +\arguments{ +\item{cache_id}{character} + +\item{interpolation_id}{character} + +\item{reporting_level}{character} +} +\value{ +character +} +\description{ +OLD: Standardize cache_id format to avoid duplication of rows +} diff --git a/man/fgt_cumsum.Rd b/man/fgt_cumsum.Rd new file mode 100644 index 00000000..535ff3e9 --- /dev/null +++ b/man/fgt_cumsum.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{fgt_cumsum} +\alias{fgt_cumsum} +\title{Compute FGT and Watts indices for all groups and poverty lines} +\usage{ +fgt_cumsum(LDTg, tpop, povline, drop_vars = TRUE) +} +\arguments{ +\item{LDTg}{List from format_lfst() with DT and g objects.} + +\item{tpop}{data.table with total population by group (from get_total_pop()).} + +\item{povline}{Numeric vector of poverty lines.} + +\item{drop_vars}{Logical, if TRUE returns only summary columns.} +} +\value{ +data.table with FGT and Watts measures by group and poverty line. +} +\description{ +Calculates headcount, poverty gap, poverty severity, and Watts index for each group and poverty line using cumulative sums. 
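The cumulative-sum approach named in the description can be illustrated on a single group; a hedged sketch for headcount, poverty gap, and poverty severity (Watts omitted for brevity; the real fgt_cumsum() operates on the grouped DT/g pair produced by format_lfst()):

```r
# Hedged sketch of the cumulative-sum trick for a single group and many
# poverty lines at once (toy data, not the package implementation):
fgt_by_cumsum <- function(welfare, weight, povlines) {
  o    <- order(welfare)
  w    <- welfare[o]; wt <- weight[o]
  cw   <- c(0, cumsum(wt))         # running population
  cww  <- c(0, cumsum(wt * w))     # running weighted welfare
  cww2 <- c(0, cumsum(wt * w^2))   # running weighted squared welfare
  W <- cw[length(cw)]
  k <- findInterval(povlines, w) + 1L   # index into the zero-padded sums
  z <- povlines
  data.frame(
    povline          = z,
    headcount        = cw[k] / W,
    poverty_gap      = (z * cw[k] - cww[k]) / (z * W),
    poverty_severity = (cw[k] - 2 * cww[k] / z + cww2[k] / z^2) / W
  )
}
fgt_by_cumsum(rexp(1e4, 0.3), rep(1, 1e4), povlines = c(1.9, 3.2, 5.5))
```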
+}
+\keyword{internal}
diff --git a/man/format_lfst.Rd b/man/format_lfst.Rd
new file mode 100644
index 00000000..c1268268
--- /dev/null
+++ b/man/format_lfst.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{format_lfst}
+\alias{format_lfst}
+\title{Format loaded survey list for grouped poverty analysis}
+\usage{
+format_lfst(lfst, dict)
+}
+\arguments{
+\item{lfst}{Named list of data.tables, as returned by load_list_refy().}
+
+\item{dict}{data.table dictionary for id/reporting_level encoding (from build_pair_dict()).}
+}
+\value{
+List with elements: DT (data.table of all surveys, with id_rl) and g (GRP object for grouping by id_rl).
+}
+\description{
+Combines a list of survey data.tables into a single data.table and encodes group identifiers using a dictionary.
+Returns a data.table and a GRP object for efficient grouped operations, used as a preprocessing step for FGT and population calculations.
+}
+\keyword{internal}
diff --git a/man/get_aux_table.Rd b/man/get_aux_table.Rd
index 66a90941..f11a9261 100644
--- a/man/get_aux_table.Rd
+++ b/man/get_aux_table.Rd
@@ -4,7 +4,7 @@
 \alias{get_aux_table}
 \title{Return specified auxiliary data}
 \usage{
-get_aux_table(data_dir, table, long_format = FALSE)
+get_aux_table(data_dir = NULL, table, long_format = FALSE)
 }
 \arguments{
 \item{data_dir}{character: Data directory}
diff --git a/man/get_aux_table_ui.Rd b/man/get_aux_table_ui.Rd
index 8d5edc4b..21d1a485 100644
--- a/man/get_aux_table_ui.Rd
+++ b/man/get_aux_table_ui.Rd
@@ -5,12 +5,14 @@
 \title{Return specified auxiliary data in wide format
 Helper function to the UI}
 \usage{
-get_aux_table_ui(data_dir, table)
+get_aux_table_ui(data_dir, table, exclude = TRUE)
 }
 \arguments{
 \item{data_dir}{character: Data directory}
 
 \item{table}{character: Name of auxiliary table}
+
+\item{exclude}{logical: whether or not to exclude some countries or regions...}
 }
 \value{
 data.frame
diff --git a/man/get_country_code_subset.Rd b/man/get_country_code_subset.Rd
new file mode 100644
index 00000000..a1acd4dc
--- /dev/null
+++ b/man/get_country_code_subset.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip_grp_new.R
+\name{get_country_code_subset}
+\alias{get_country_code_subset}
+\title{Subset country_code values based on matches in *_code columns and country_code}
+\usage{
+get_country_code_subset(dt, country)
+}
+\arguments{
+\item{dt}{A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns.}
+
+\item{country}{Character vector of country or region codes to match against *_code columns and country_code.}
+}
+\value{
+A unique character vector of country_code values corresponding to matches in any *_code column or country_code.
+}
+\description{
+This function searches all columns in a data.table ending with '_code' (except 'country_code'),
+as well as 'country_code' itself, and returns a unique character vector of 'country_code' values
+for rows where any of those columns match a value in the provided 'country' vector. If any value
+in 'country' is not found in any *_code column or in 'country_code', an error is thrown. The input
+data.table 'dt' should be 'lkup$aux_files$country_list', which contains country and region codes for subsetting.
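One way the matching described above can be expressed; the country_list table here is a two-column hypothetical stand-in for lkup$aux_files$country_list:

```r
# Hedged sketch of the *_code matching logic (toy table, illustrative only):
library(data.table)
dt <- data.table(country_code = c("USA", "CHN", "IDN"),
                 region_code  = c("NAC", "EAP", "EAP"))
country   <- c("USA", "EAP")
code_cols <- grep("_code$", names(dt), value = TRUE)  # includes country_code
hit <- Reduce(`|`, lapply(dt[, ..code_cols], `%in%`, country))
unique(dt$country_code[hit])   # "USA" "CHN" "IDN"
```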
+}
+\examples{
+\dontrun{
+dt <- lkup$aux_files$country_list
+get_country_code_subset(dt, c("USA", "EAP"))
+}
+}
diff --git a/man/get_dt_dist_stats.Rd b/man/get_dt_dist_stats.Rd
new file mode 100644
index 00000000..f59e6b9f
--- /dev/null
+++ b/man/get_dt_dist_stats.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipdata.R
+\name{get_dt_dist_stats}
+\alias{get_dt_dist_stats}
+\title{Get data.table with distribution stats}
+\usage{
+get_dt_dist_stats(lt_att)
+}
+\arguments{
+\item{lt_att}{list of attributes of lt list}
+}
+\value{
+data.table
+}
+\description{
+This is a loop over lt attributes.
+}
+\keyword{internal}
diff --git a/man/get_from_pipapienv.Rd b/man/get_from_pipapienv.Rd
new file mode 100644
index 00000000..2876f683
--- /dev/null
+++ b/man/get_from_pipapienv.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pipapi-env.R
+\name{get_from_pipapienv}
+\alias{get_from_pipapienv}
+\title{Get a value from .pipapienv}
+\usage{
+get_from_pipapienv(key)
+}
+\arguments{
+\item{key}{A character string representing the key}
+}
+\value{
+The value associated with the key in .pipapienv
+}
+\description{
+Get a value from .pipapienv
+}
+\examples{
+set_in_pipapienv("example_key", 42)
+get_from_pipapienv("example_key") # returns 42
+}
diff --git a/man/get_lt_attr.Rd b/man/get_lt_attr.Rd
new file mode 100644
index 00000000..303808ab
--- /dev/null
+++ b/man/get_lt_attr.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipdata.R
+\name{get_lt_attr}
+\alias{get_lt_attr}
+\title{Get some attributes from lt list}
+\usage{
+get_lt_attr(lt)
+}
+\arguments{
+\item{lt}{list}
+}
+\description{
+Get some attributes from lt list
+}
+\keyword{internal}
diff --git a/man/get_mean_median.Rd b/man/get_mean_median.Rd
new file mode 100644
index 00000000..f4fed163
--- /dev/null
+++ b/man/get_mean_median.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{get_mean_median}
+\alias{get_mean_median}
+\title{Merge into fgt table the mean and median from dist stats table in lkup}
+\usage{
+get_mean_median(fgt, lkup, fill_gaps)
+}
+\arguments{
+\item{fgt}{data.table with fgt measures}
+
+\item{lkup}{lkup}
+
+\item{fill_gaps}{logical.
Whether to use lineup estimates}
+}
+\value{
+data.table with fgt, mean and median
+}
+\description{
+Merge into fgt table the mean and median from dist stats table in lkup
+}
+\keyword{internal}
diff --git a/man/get_pipapienv.Rd b/man/get_pipapienv.Rd
new file mode 100644
index 00000000..933e7044
--- /dev/null
+++ b/man/get_pipapienv.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pipapi-env.R
+\name{get_pipapienv}
+\alias{get_pipapienv}
+\title{Get the entire .pipapienv environment}
+\usage{
+get_pipapienv()
+}
+\value{
+The .pipapienv environment
+}
+\description{
+Get the entire .pipapienv environment
+}
+\examples{
+env <- get_pipapienv()
+}
diff --git a/man/get_rl_rows_single.Rd b/man/get_rl_rows_single.Rd
new file mode 100644
index 00000000..e11afce9
--- /dev/null
+++ b/man/get_rl_rows_single.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipdata.R
+\name{get_rl_rows_single}
+\alias{get_rl_rows_single}
+\alias{get_rl_rows}
+\title{Extract row indices}
+\usage{
+get_rl_rows_single(a)
+
+get_rl_rows(lt_att)
+}
+\arguments{
+\item{a}{list with attributes from lt}
+}
+\value{
+named list with indices for reporting level
+}
+\description{
+Extract row indices
+
+Apply get_rl_rows_single
+}
+\keyword{internal}
diff --git a/man/get_total_pop.Rd b/man/get_total_pop.Rd
new file mode 100644
index 00000000..97b36860
--- /dev/null
+++ b/man/get_total_pop.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{get_total_pop}
+\alias{get_total_pop}
+\title{Compute total population by survey and reporting level}
+\usage{
+get_total_pop(LDTg)
+}
+\arguments{
+\item{LDTg}{List from format_lfst() with DT and g objects.}
+}
+\value{
+data.table with total population by group (columns: id_rl, W).
+}
+\description{
+Sums the weights for each (id, reporting_level) group in the combined survey data.
+Used as a denominator for FGT and Watts index calculations.
+}
+\keyword{internal}
diff --git a/man/infer_poverty_line.Rd b/man/infer_poverty_line.Rd
new file mode 100644
index 00000000..e00f00ef
--- /dev/null
+++ b/man/infer_poverty_line.Rd
@@ -0,0 +1,53 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/infer_poverty_line.R
+\name{infer_poverty_line}
+\alias{infer_poverty_line}
+\title{Infer the poverty line for a given population share}
+\usage{
+infer_poverty_line(
+  welfare,
+  weight,
+  popshare = 0.5,
+  include = FALSE,
+  method = c("nearest", "interp"),
+  assume_sorted = TRUE
+)
+}
+\arguments{
+\item{welfare}{Numeric vector of welfare values (e.g., income or
+consumption).}
+
+\item{weight}{Numeric vector of sampling weights (must be non-negative, same
+length as welfare).}
+
+\item{popshare}{Numeric vector of population shares (probabilities in [0, 1]);
+default is 0.5 (median).}
+
+\item{include}{Logical; if TRUE, averages neighbors for ties (only for method
+= "nearest").}
+
+\item{method}{Character; either "nearest" (default, discrete quantile) or
+"interp" (weighted linear interpolation).}
+
+\item{assume_sorted}{Logical; if TRUE, assumes welfare and weight are already
+sorted by welfare.}
+}
+\value{
+Numeric vector of poverty line(s) corresponding to the requested
+population share(s).
+}
+\description{
+Computes the welfare value (poverty line) corresponding to a given population
+share, using either nearest or interpolated weighted quantile methods.
+Supports both discrete (nearest) and linear interpolation approaches, and can +optionally average neighbors for ties. +} +\details{ +\itemize{ +\item If method = "nearest", returns the welfare value at the closest cumulative weight fraction to each popshare. +\item If method = "interp", uses collapse::fquantile for weighted linear interpolation. +\item If include = TRUE (and method = "nearest"), averages the two closest neighbors using their weights. +\item Returns numeric(0) if popshare is empty. +} +} +\keyword{internal} diff --git a/man/list_code_column_values.Rd b/man/list_code_column_values.Rd new file mode 100644 index 00000000..2b285c88 --- /dev/null +++ b/man/list_code_column_values.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pip_grp_new.R +\name{list_code_column_values} +\alias{list_code_column_values} +\title{List values in each *_code column that match the country vector} +\usage{ +list_code_column_values(dt, country) +} +\arguments{ +\item{dt}{A data.table, typically lkup$aux_files$country_list.} + +\item{country}{Character vector of country or region codes to match against *_code columns.} +} +\value{ +A named list of unique values for each *_code column that match 'country'. +} +\description{ +Returns a named list where each element is the vector of unique values in each *_code column +that are present in the provided 'country' vector. +} +\examples{ +\dontrun{ +dt <- lkup$aux_files$country_list +list_code_column_values(dt, c("USA", "EAP")) +} +} diff --git a/man/load_data_list.Rd b/man/load_data_list.Rd index 26a142d2..8098daa9 100644 --- a/man/load_data_list.Rd +++ b/man/load_data_list.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rg_pip.R +% Please edit documentation in R/compute_fgt_new.R \name{load_data_list} \alias{load_data_list} \title{load survey year files and store them in a list} diff --git a/man/load_data_list_old.Rd b/man/load_data_list_old.Rd new file mode 100644 index 00000000..23dc3862 --- /dev/null +++ b/man/load_data_list_old.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute_fgt_old.R +\name{load_data_list_old} +\alias{load_data_list_old} +\title{OLD: load survey year files and store them in a list} +\usage{ +load_data_list_old(metadata) +} +\arguments{ +\item{metadata}{data frame from \code{subset_lkup()}} +} +\value{ +list with survey years data +} +\description{ +OLD: load survey year files and store them in a list +} +\keyword{internal} diff --git a/man/load_list_refy.Rd b/man/load_list_refy.Rd new file mode 100644 index 00000000..b91f4d82 --- /dev/null +++ b/man/load_list_refy.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{load_list_refy} +\alias{load_list_refy} +\title{Load survey data from file list} +\usage{ +load_list_refy(input_list) +} +\arguments{ +\item{input_list}{Character vector of file paths (from create_full_list()).} +} +\value{ +Named list of data.tables, each with an id column. +} +\description{ +Reads a list of survey files (e.g., .fst) and returns a named list of data.tables, each with an id column. +Used as the first step in the pipeline after creating the file list. 
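A minimal sketch of the documented loading step; it assumes the fst package and files whose base names can serve as ids, and the helper name here is illustrative, not the package's reader:

```r
# Hedged sketch of reading a vector of survey files into a named list of
# data.tables, each tagged with an id column (details of the real reader
# may differ):
library(data.table)
load_list_demo <- function(paths) {
  out <- lapply(paths, function(p) {
    dt <- fst::read_fst(p, as.data.table = TRUE)
    dt[, id := sub("\\.fst$", "", basename(p))]  # id column from the file name
    dt
  })
  names(out) <- sub("\\.fst$", "", basename(paths))
  out
}
```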
+}
+\keyword{internal}
diff --git a/man/map_fgt.Rd b/man/map_fgt.Rd
new file mode 100644
index 00000000..e4d1c224
--- /dev/null
+++ b/man/map_fgt.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_fgt_new.R
+\name{DT_fgt_by_rl}
+\alias{DT_fgt_by_rl}
+\alias{lt_to_dt}
+\alias{map_lt_to_dt}
+\alias{map_fgt}
+\title{Compute FGT using indices by reporting level}
+\usage{
+DT_fgt_by_rl(x, y, nx, povline)
+
+lt_to_dt(x, y, nx, povline)
+
+map_lt_to_dt(lt, l_rl_rows, povline)
+
+map_fgt(lt, l_rl_rows, povline)
+}
+\arguments{
+\item{x}{data.table from lt list, with welfare and weight vectors}
+
+\item{y}{list of indices for each reporting level}
+
+\item{nx}{name of data table. Usually country code and year in the form "CCC_YYYY"}
+
+\item{lt}{list of data.tables with welfare and weight data}
+
+\item{l_rl_rows}{list of indices}
+}
+\value{
+data.table with all measures
+}
+\description{
+This function is intended to be used inside \link{map_fgt}
+}
+\keyword{internal}
diff --git a/man/pip.Rd b/man/pip.Rd
index 3e31f031..f128449b 100644
--- a/man/pip.Rd
+++ b/man/pip.Rd
@@ -15,7 +15,7 @@ pip(
   reporting_level = c("all", "national", "rural", "urban"),
   ppp = NULL,
   lkup,
-  censor = TRUE,
+  censor = FALSE,
   lkup_hash = lkup$cache_data_id$hash_pip,
   additional_ind = FALSE
 )
@@ -57,6 +57,10 @@ data.table
 \description{
 Compute the main PIP poverty and inequality statistics.
 }
+\details{
+This function is a wrapper around the \link{pip_new_lineups} and \link{pip_old_lineups}
+functions.
+}
 \examples{
 \dontrun{
 # Create lkups
diff --git a/man/pip_grp_logic.Rd b/man/pip_agg.Rd
similarity index 59%
rename from man/pip_grp_logic.Rd
rename to man/pip_agg.Rd
index a7d861f1..f47dbba8 100644
--- a/man/pip_grp_logic.Rd
+++ b/man/pip_agg.Rd
@@ -1,9 +1,24 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pip_grp_logic.R
-\name{pip_grp_logic}
+% Please edit documentation in R/pip_agg.R, R/pip_grp_logic.R, R/pip_grp_new.R
+\name{pip_agg}
+\alias{pip_agg}
 \alias{pip_grp_logic}
+\alias{pip_grp_new}
 \title{Logic for computing new aggregate}
 \usage{
+pip_agg(
+  country = "ALL",
+  year = "ALL",
+  povline = 1.9,
+  group_by = c("wb", "none"),
+  welfare_type = c("all", "consumption", "income"),
+  reporting_level = c("all", "national"),
+  lkup,
+  censor = FALSE,
+  lkup_hash = lkup$cache_data_id$hash_pip_grp,
+  additional_ind = FALSE
+)
+
 pip_grp_logic(
   country = "ALL",
   year = "ALL",
@@ -16,6 +31,17 @@
   lkup_hash = lkup$cache_data_id$hash_pip_grp,
   additional_ind = FALSE
 )
+
+pip_grp_new(
+  country = "ALL",
+  year = "ALL",
+  povline = 1.9,
+  welfare_type = c("all", "consumption", "income"),
+  reporting_level = c("all", "national"),
+  lkup,
+  censor = TRUE,
+  additional_ind = FALSE
+)
 }
 \arguments{
 \item{country}{character: Country ISO 3 codes}
@@ -41,13 +67,22 @@ sub-groups}
 
 FALSE}
 }
 \value{
+data.table
+
 data.table
 }
 \description{
 Logic for computing new aggregate
+
+Old way to estimate aggregate data
+
+New way to estimate aggregate data
 }
 \examples{
 \dontrun{
 # Create lkups
 }
+\dontrun{
+# Create lkups
+}
 }
diff --git a/man/pip_aggregate_by.Rd b/man/pip_aggregate_by.Rd
new file mode 100644
index 00000000..642567e7
--- /dev/null
+++ b/man/pip_aggregate_by.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip_grp.R
+\name{pip_aggregate_by}
+\alias{pip_aggregate_by}
+\title{Aggregate by predefined groups}
+\usage{
+pip_aggregate_by(df, country = "ALL", return_cols = NULL)
+}
+\arguments{
+\item{df}{data.frame: Response from \code{fg_pip_old()} or \code{rg_pip()}.}
+
+\item{country}{character: Selected countries / regions}
+
+\item{return_cols}{list: lkup$return_cols$pip_grp object. Controls returned
+columns}
+}
+\description{
+Aggregate by predefined groups
+}
+\keyword{internal}
diff --git a/man/pip_new_lineups.Rd b/man/pip_new_lineups.Rd
new file mode 100644
index 00000000..db43427e
--- /dev/null
+++ b/man/pip_new_lineups.Rd
@@ -0,0 +1,91 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip_new_lineups.R
+\name{pip_new_lineups}
+\alias{pip_new_lineups}
+\title{Compute PIP statistics}
+\usage{
+pip_new_lineups(
+  country = "ALL",
+  year = "ALL",
+  povline = 1.9,
+  popshare = NULL,
+  fill_gaps = FALSE,
+  group_by = c("none", "wb"),
+  welfare_type = c("all", "consumption", "income"),
+  reporting_level = c("all", "national", "rural", "urban"),
+  ppp = NULL,
+  lkup,
+  censor = TRUE,
+  lkup_hash = lkup$cache_data_id$hash_pip,
+  additional_ind = FALSE
+)
+}
+\arguments{
+\item{country}{character: Country ISO 3 codes}
+
+\item{year}{integer: Reporting year}
+
+\item{povline}{numeric: Poverty line}
+
+\item{popshare}{numeric: Proportion of the population living below the
+poverty line}
+
+\item{fill_gaps}{logical: If set to TRUE, will interpolate / extrapolate
+values for missing years}
+
+\item{group_by}{character: Will return aggregated values for predefined
+sub-groups}
+
+\item{welfare_type}{character: Welfare type}
+
+\item{reporting_level}{character: Geographical reporting level}
+
+\item{ppp}{numeric: Custom Purchase Power Parity value}
+
+\item{lkup}{list: A list of lkup tables}
+
+\item{censor}{logical: Triggers censoring of country/year statistics}
+
+\item{lkup_hash}{character: hash of pip}
+
+\item{additional_ind}{logical: If TRUE add new set of indicators. Default is
+FALSE}
+}
+\value{
+data.table
+}
+\description{
+Compute the main PIP poverty and inequality statistics.
+}
+\examples{
+\dontrun{
+# Create lkups
+lkups <- create_lkups("")
+
+# A single country and year
+pip_new_lineups(country = "AGO",
+    year = 2000,
+    povline = 1.9,
+    lkup = lkups)
+
+# All years for a single country
+pip_new_lineups(country = "AGO",
+    year = "all",
+    povline = 1.9,
+    lkup = lkups)
+
+# Fill gaps
+pip_new_lineups(country = "AGO",
+    year = "all",
+    povline = 1.9,
+    fill_gaps = TRUE,
+    lkup = lkups)
+
+# Group by regions
+pip_new_lineups(country = "all",
+    year = "all",
+    povline = 1.9,
+    group_by = "wb",
+    lkup = lkups)
+}
+}
diff --git a/man/pip_old.Rd b/man/pip_old.Rd
index 2c1ff5a2..1a725396 100644
--- a/man/pip_old.Rd
+++ b/man/pip_old.Rd
@@ -63,26 +63,26 @@ Compute the main PIP poverty and inequality statistics.
lkups <- create_lkups("")
 
 # A single country and year
-pip(country = "AGO",
+pip_old(country = "AGO",
     year = 2000,
     povline = 1.9,
     lkup = lkups)
 
 # All years for a single country
-pip(country = "AGO",
+pip_old(country = "AGO",
     year = "all",
     povline = 1.9,
     lkup = lkups)
 
 # Fill gaps
-pip(country = "AGO",
+pip_old(country = "AGO",
     year = "all",
     povline = 1.9,
     fill_gaps = TRUE,
     lkup = lkups)
 
 # Group by regions
-pip(country = "all",
+pip_old(country = "all",
     year = "all",
     povline = 1.9,
     group_by = "wb",
diff --git a/man/pip_old_lineups.Rd b/man/pip_old_lineups.Rd
new file mode 100644
index 00000000..1a4dbb71
--- /dev/null
+++ b/man/pip_old_lineups.Rd
@@ -0,0 +1,91 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip_old_lineups.R
+\name{pip_old_lineups}
+\alias{pip_old_lineups}
+\title{Compute PIP statistics - Old lineups function}
+\usage{
+pip_old_lineups(
+  country = "ALL",
+  year = "ALL",
+  povline = 1.9,
+  popshare = NULL,
+  fill_gaps = FALSE,
+  group_by = c("none", "wb"),
+  welfare_type = c("all", "consumption", "income"),
+  reporting_level = c("all", "national", "rural", "urban"),
+  ppp = NULL,
+  lkup,
+  censor = TRUE,
+  lkup_hash = lkup$cache_data_id$hash_pip,
+  additional_ind = FALSE
+)
+}
+\arguments{
+\item{country}{character: Country ISO 3 codes}
+
+\item{year}{integer: Reporting year}
+
+\item{povline}{numeric: Poverty line}
+
+\item{popshare}{numeric: Proportion of the population living below the
+poverty line}
+
+\item{fill_gaps}{logical: If set to TRUE, will interpolate / extrapolate
+values for missing years}
+
+\item{group_by}{character: Will return aggregated values for predefined
+sub-groups}
+
+\item{welfare_type}{character: Welfare type}
+
+\item{reporting_level}{character: Geographical reporting level}
+
+\item{ppp}{numeric: Custom Purchase Power Parity value}
+
+\item{lkup}{list: A list of lkup tables}
+
+\item{censor}{logical: Triggers censoring of country/year statistics}
+
+\item{lkup_hash}{character: hash of pip}
+
+\item{additional_ind}{logical: If TRUE add new set of indicators. Default is
+FALSE}
+}
+\value{
+data.table
+}
+\description{
+Compute the main PIP poverty and inequality statistics.
+}
+\examples{
+\dontrun{
+# Create lkups
+lkups <- create_lkups("")
+
+# A single country and year
+pip_old_lineups(country = "AGO",
+    year = 2000,
+    povline = 1.9,
+    lkup = lkups)
+
+# All years for a single country
+pip_old_lineups(country = "AGO",
+    year = "all",
+    povline = 1.9,
+    lkup = lkups)
+
+# Fill gaps
+pip_old_lineups(country = "AGO",
+    year = "all",
+    povline = 1.9,
+    fill_gaps = TRUE,
+    lkup = lkups)
+
+# Group by regions
+pip_old_lineups(country = "all",
+    year = "all",
+    povline = 1.9,
+    group_by = "wb",
+    lkup = lkups)
+}
+}
diff --git a/man/rg_pip_old.Rd b/man/rg_pip_old.Rd
index bee24977..d5cea86f 100644
--- a/man/rg_pip_old.Rd
+++ b/man/rg_pip_old.Rd
@@ -1,9 +1,20 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pip_old.R
+% Please edit documentation in R/pip_old.R, R/rg_pip_old.R
 \name{rg_pip_old}
 \alias{rg_pip_old}
 \title{Compute survey year stats}
 \usage{
+rg_pip_old(
+  country,
+  year,
+  povline,
+  popshare,
+  welfare_type,
+  reporting_level,
+  ppp,
+  lkup
+)
+
 rg_pip_old(
   country,
   year,
@@ -34,9 +45,13 @@ poverty line}
 
 \item{lkup}{list: A list of lkup tables}
 }
 \value{
+data.frame
+
 data.frame
 }
 \description{
+Compute the main PIP poverty and inequality statistics for survey years.
+
 Compute the main PIP poverty and inequality statistics for survey years.
} \keyword{internal} diff --git a/man/safe_endpoint.Rd b/man/safe_endpoint.Rd new file mode 100644 index 00000000..e754bdc4 --- /dev/null +++ b/man/safe_endpoint.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-plumber.R +\name{safe_endpoint} +\alias{safe_endpoint} +\title{Wrap a Plumber endpoint with standardized error handling} +\usage{ +safe_endpoint(fun, endpoint, debug = NULL) +} +\arguments{ +\item{fun}{A function \verb{(req, res)} containing the endpoint logic. +This is where you parse request arguments and call the relevant +internal functions.} + +\item{endpoint}{Character string giving the endpoint path +(e.g., \code{"/api/v1/pip"}). Used in error payloads so clients know +which endpoint failed.} + +\item{debug}{Logical; if \code{NULL} (default), inherits from the +environment variable \code{PIPAPI_DEBUG}. When \code{TRUE}, include extended +diagnostic details in the error response.} +} +\value{ +A function \verb{(req, res)} suitable for use in Plumber routes. +On error, sets \code{res$status <- 500} and returns a JSON object with: +\describe{ +\item{error}{A short description ("Error in /api/v1/...")} +\item{message}{Either the actual error message (debug mode) or +\code{"Internal Server Error"}} +\item{request_id}{The Plumber request ID, if available} +\item{endpoint}{The endpoint string supplied} +\item{class}{Error class (debug mode only)} +\item{call}{The call that generated the error (debug mode only)} +\item{query}{The query parameters (debug mode only)} +\item{trace}{Traceback captured by \code{rlang::trace_back()} (debug mode only)} +} +} +\description{ +\code{safe_endpoint()} wraps an endpoint handler in a \code{tryCatch}, ensuring +consistent error handling across the API. On success, the original +handler's result is returned. On error, a structured JSON object is +returned with useful metadata (status, message, request ID, endpoint), +and optionally additional debug details. +} +\details{ +Debug mode can be enabled by either: +\itemize{ +\item Passing \code{debug = TRUE} explicitly, or +\item Setting the environment variable \code{PIPAPI_DEBUG=TRUE}. +} +When debug mode is active, the error payload also includes the error +class, call, query parameters, and a truncated traceback. 
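The two documented ways to enable debug mode can be sketched as follows; the failing handler body is deliberately trivial and purely illustrative:

```r
# Hedged sketch of toggling safe_endpoint() debug mode:
Sys.setenv(PIPAPI_DEBUG = "TRUE")      # process-wide, via the env var
handler <- safe_endpoint(
  function(req, res) stop("boom"),     # any failing endpoint logic
  endpoint = "/api/v1/demo",
  debug    = TRUE                      # or explicitly per endpoint
)
# Calling handler(req, res) inside Plumber would return the structured
# error payload described above instead of crashing the worker.
```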
+}
+\examples{
+\dontrun{
+# Example: wrap a handler for /api/v1/pip
+#* @get /api/v1/pip
+function(req, res) {
+  safe_endpoint(function(req, res) {
+    params <- req$argsQuery
+    params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+    params$version <- NULL
+    do.call(pipapi::ui_pip, params)
+  }, endpoint = "/api/v1/pip")(req, res)
+}
+}
+
+}
diff --git a/man/set_in_pipapienv.Rd b/man/set_in_pipapienv.Rd
new file mode 100644
index 00000000..127287cb
--- /dev/null
+++ b/man/set_in_pipapienv.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pipapi-env.R
+\name{set_in_pipapienv}
+\alias{set_in_pipapienv}
+\title{Set a value in .pipapienv}
+\usage{
+set_in_pipapienv(key, value)
+}
+\arguments{
+\item{key}{A character string representing the key}
+
+\item{value}{The value to store in .pipapienv}
+}
+\value{
+The assigned value (invisibly)
+}
+\description{
+Set a value in .pipapienv
+}
+\examples{
+set_in_pipapienv("example_key", 42)
+}
diff --git a/man/subset_lkup.Rd b/man/subset_lkup.Rd
index d09baf8a..ee5dd4cf 100644
--- a/man/subset_lkup.Rd
+++ b/man/subset_lkup.Rd
@@ -14,7 +14,8 @@ subset_lkup(
   data_dir = NULL,
   povline,
   cache_file_path,
-  fill_gaps
+  fill_gaps,
+  popshare = NULL
 )
 }
 \arguments{
@@ -39,6 +40,9 @@ for region selection}
 
 \item{fill_gaps}{logical: If set to TRUE, will interpolate / extrapolate
 values for missing years}
+
+\item{popshare}{numeric: Proportion of the population living below the
+poverty line}
 }
 \value{
 data.frame
diff --git a/man/transform_input.Rd b/man/transform_input.Rd
new file mode 100644
index 00000000..14664b98
--- /dev/null
+++ b/man/transform_input.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipdata.R
+\name{transform_input}
+\alias{transform_input}
+\title{Transform input list}
+\usage{
+transform_input(input_list)
+}
+\arguments{
+\item{input_list}{Character vector of file paths (from create_full_list()).}
+}
+\value{
+formatted list
+}
+\description{
+Transform input list
+}
+\keyword{internal}
diff --git a/man/ui_cp_poverty_charts.Rd b/man/ui_cp_poverty_charts.Rd
index 3b211f31..e2356d1e 100644
--- a/man/ui_cp_poverty_charts.Rd
+++ b/man/ui_cp_poverty_charts.Rd
@@ -4,7 +4,7 @@
 \alias{ui_cp_poverty_charts}
 \title{CP Poverty Charts}
 \usage{
-ui_cp_poverty_charts(country, povline, pop_units, lkup)
+ui_cp_poverty_charts(country, povline, pop_units = 1e+06, lkup)
 }
 \arguments{
 \item{country}{character: Country ISO 3 codes}
diff --git a/man/ui_pc_charts.Rd b/man/ui_pc_charts.Rd
index 49715b48..9801ecfe 100644
--- a/man/ui_pc_charts.Rd
+++ b/man/ui_pc_charts.Rd
@@ -13,6 +13,7 @@ ui_pc_charts(
   welfare_type = c("all", "consumption", "income"),
   reporting_level = c("all", "national", "rural", "urban"),
   pop_units = 1e+06,
+  censor = TRUE,
   lkup
 )
 }
@@ -36,6 +37,8 @@ sub-groups}
 
 \item{pop_units}{numeric: Units used to express population numbers (default
 to million)}
 
+\item{censor}{logical: Triggers censoring of country/year statistics}
+
 \item{lkup}{list: A list of lkup tables}
 }
 \value{
diff --git a/man/update_master_file.Rd b/man/update_master_file.Rd
index 39b1d3b9..968d3de4 100644
--- a/man/update_master_file.Rd
+++ b/man/update_master_file.Rd
@@ -8,7 +8,8 @@ update_master_file(
   dat,
   cache_file_path,
   fill_gaps,
-  verbose = getOption("pipapi.verbose")
+  verbose = getOption("pipapi.verbose"),
+  decimal = 2
 )
 }
 \arguments{
diff --git a/man/update_pair_dict.Rd b/man/update_pair_dict.Rd
new file mode 100644
index 00000000..4a41a16c
--- /dev/null
+++ 
b/man/update_pair_dict.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{update_pair_dict} +\alias{update_pair_dict} +\title{Update dictionary with new (id, reporting_level) pairs} +\usage{ +update_pair_dict(dict, DT, id_col = "id", level_col = "reporting_level") +} +\arguments{ +\item{dict}{data.table dictionary from build_pair_dict().} + +\item{DT}{data.table with id and reporting_level columns.} + +\item{id_col}{Name of id column.} + +\item{level_col}{Name of reporting level column.} +} +\value{ +Updated data.table dictionary. +} +\description{ +Appends new (id, reporting_level) pairs to the dictionary if needed, ensuring all groups are encoded. +Used to keep the dictionary in sync with new survey data. +} +\keyword{internal} diff --git a/man/use_new_lineup_version.Rd b/man/use_new_lineup_version.Rd new file mode 100644 index 00000000..59360d6c --- /dev/null +++ b/man/use_new_lineup_version.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_lkups.R +\name{use_new_lineup_version} +\alias{use_new_lineup_version} +\title{Should the new lineup approach be used?} +\usage{ +use_new_lineup_version(x) +} +\arguments{ +\item{x}{A character vector where each element starts with an +8-digit date in the format \code{YYYYMMDD}.} +} +\value{ +A logical vector: \code{TRUE} if the extracted date is +after May 1st, 2025, otherwise \code{FALSE}. +} +\description{ +Check if the date in a string is more recent than May 2025 +} +\details{ +This function extracts the first 8 characters from an input string, +interprets them as a date in the format \code{YYYYMMDD}, and checks +whether this date is more recent than May 1st, 2025. +} +\examples{ +use_new_lineup_version("20250401_2021_01_02_PROD") # FALSE +use_new_lineup_version("20250615_2021_01_02_PROD") # TRUE + +} diff --git a/man/with_req_timeout.Rd b/man/with_req_timeout.Rd new file mode 100644 index 00000000..e6bc2c4c --- /dev/null +++ b/man/with_req_timeout.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-plumber.R +\name{with_req_timeout} +\alias{with_req_timeout} +\title{Evaluate an expression with a timeout} +\usage{ +with_req_timeout( + expr, + secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150")) +) +} +\arguments{ +\item{expr}{Expression to evaluate.} + +\item{secs}{Timeout in seconds (default: from env var \code{PLUMBER_REQ_TIMEOUT}, +or 150 if unset).} +} +\value{ +Result of \code{expr} if it finishes in time; otherwise a list +with \code{ok = FALSE}, \code{error = "timeout"}, and \code{elapsed} seconds. +} +\description{ +Wraps \code{\link[R.utils:withTimeout]{R.utils::withTimeout()}} but returns a structured failure +object instead of stopping the whole process. This allows +\code{safe_endpoint()} to handle timeouts like normal errors without +killing the API process. 
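The documented timeout contract can be consumed like any other error; a short sketch with a toy expression:

```r
# Sketch of handling the structured failure object described above:
res <- with_req_timeout(Sys.sleep(5), secs = 1)
if (is.list(res) && identical(res$error, "timeout")) {
  # safe_endpoint() can treat this like any other error
  message(sprintf("request timed out after %.2f seconds", res$elapsed))
}
```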
+} diff --git a/tests/testthat/test-create_countries_vctr.R b/tests/testthat/test-create_countries_vctr.R index fae4b133..fb88a02e 100644 --- a/tests/testthat/test-create_countries_vctr.R +++ b/tests/testthat/test-create_countries_vctr.R @@ -457,8 +457,7 @@ test_that("create_vector_countries output the expected object", { year = "2010" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) expect_true(is.list(out)) expect_equal(sort(names(out)), sort(c("ctr_off_reg", @@ -479,8 +478,7 @@ test_that("create_vector_countries works for countries selection", { year = "ALL" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Selects all countries with survey data when country="ALL" expect_equal(sort(out$est_ctrs), sort(aux_files$countries$country_code)) @@ -492,8 +490,7 @@ test_that("create_vector_countries Returns correct results when country = ALL", year = "2010" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Correct official regions expect_equal(out$user_off_reg, c("ALL", @@ -514,8 +511,7 @@ test_that("create_vector_countries Returns correct results when country = WLD", year = "2010" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Correct official regions expect_equal(out$user_off_reg, c("ALL", @@ -534,8 +530,7 @@ test_that("create_vector_countries Returns correct results when country = altern year = "ALL" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Returns no official region expect_true(is.null(out$user_off_reg)) @@ -554,8 +549,7 @@ test_that("create_vector_countries returns correct results when country = offica year = "ALL" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Returns no official region expect_equal(out$user_off_reg, country) @@ -574,8 +568,7 @@ test_that("create_vector_countries returns correct results when country = aggreg year = "ALL" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Returns no official region expect_equal(out$user_off_reg, off_country) diff --git a/tests/testthat/test-fg_pip-local.R b/tests/testthat/test-fg_pip-local.R index 13a56c84..b73460ca 100644 --- a/tests/testthat/test-fg_pip-local.R +++ b/tests/testthat/test-fg_pip-local.R @@ -30,8 +30,7 @@ test_that("Imputation is working for extrapolated aggregated distribution", { welfare_type = "all", reporting_level = "all", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) expect_equal(nrow(tmp$main_data), 0) @@ -44,8 +43,7 @@ test_that("Imputation is working for extrapolated aggregated distribution", { welfare_type = "all", reporting_level = "national", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) expect_equal(nrow(tmp$main_data), 0) @@ -61,8 +59,7 @@ test_that("Imputation is working for interpolated mixed distribution", { welfare_type = "all", reporting_level = "all", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) expect_equal(nrow(tmp$main_data), 0) @@ -75,8 +72,7 @@ test_that("Imputation is working for interpolated mixed distribution", { welfare_type = "all", reporting_level = "national", ppp = NULL, - lkup = lkup, - 
con = con
+    lkup = lkup
   )
 
   expect_equal(nrow(tmp$main_data), 0)
@@ -91,11 +87,11 @@ test_that("Imputation is working for interpolated aggregate distribution", {
     welfare_type = "all",
     reporting_level = "all",
     ppp = NULL,
-    lkup = lkup,
-    con = con
+    lkup = lkup
   )
 
-  expect_equal(nrow(tmp$main_data), 2)
+  expect_equal(nrow(tmp$main_data), 0)
+  expect_equal(nrow(tmp$data_in_cache), 2)
 
   tmp <- fg_pip(
     country = "CHN",
@@ -105,11 +101,11 @@
     welfare_type = "all",
     reporting_level = "national",
     ppp = NULL,
-    lkup = lkup,
-    con = con
+    lkup = lkup
   )
 
-  expect_equal(nrow(tmp$main_data), 2)
+  expect_equal(nrow(tmp$main_data), 0)
+  expect_equal(nrow(tmp$data_in_cache), 2)
 
 })
 
@@ -158,8 +154,7 @@ tmp <- fg_pip(
   welfare_type = "all",
   reporting_level = "all",
   ppp = NULL,
-  lkup = lkup,
-  con = con
+  lkup = lkup
 )
 tmp <- tmp$data_in_cache |> as.data.table()
 # dt <- pip(country = "ALL",
diff --git a/tests/testthat/test-pip-local.R b/tests/testthat/test-pip-local.R
index 59e8ac89..43cf92ce 100644
--- a/tests/testthat/test-pip-local.R
+++ b/tests/testthat/test-pip-local.R
@@ -841,3 +841,13 @@ test_that("SPL is the same by reporting level", {
 
 })
 
+
+test_that("make sure the popshare bug which was reported no longer exists", {
+  out <- pip(country = "USA", year = 2022,
+             popshare = .5, lkup = lkup)
+  # Ensure poverty line is not the default one
+  expect_false(out$poverty_line %in% c(1.9, 3))
+  # Ensure headcount is close to 0.5
+  expect_equal(out$headcount, 0.5, tolerance = .05)
+})
+
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
index 7b2b53a4..da735a34 100644
--- a/tests/testthat/test-utils.R
+++ b/tests/testthat/test-utils.R
@@ -66,7 +66,8 @@ test_that("subset_lkup correctly selects all countries", {
                      reporting_level = "all",
                      lkup = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir = data_dir)
+                     data_dir = data_dir,
+                     povline = NULL)
 
   expect_equal(nrow(tmp$lkup), nrow(ref_lkup))
 })
@@ -79,7 +80,8 @@ test_that("subset_lkup correctly selects countries", {
                      reporting_level = "all",
                      lkup = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir = data_dir)
+                     data_dir = data_dir,
+                     povline = NULL)
 
   expect_equal(sort(unique(tmp$lkup$country_code)), sort(selection))
 })
@@ -92,7 +94,8 @@ test_that("subset_lkup correctly selects single regions", {
                      reporting_level = "all",
                      lkup = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir = data_dir)
+                     data_dir = data_dir,
+                     povline = NULL)
 
   expect_equal(sort(unique(tmp$lkup$region_code)), sort(selection))
 })
@@ -105,7 +108,8 @@ test_that("subset_lkup correctly selects multiple regions", {
                      reporting_level = "all",
                      lkup = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir = data_dir)
+                     data_dir = data_dir,
+                     povline = NULL)
 
   expect_equal(sort(unique(tmp$lkup$region_code)), sort(selection))
 })
@@ -122,7 +126,8 @@ test_that("subset_lkup correctly selects countries and regions", {
                      reporting_level = "all",
                      lkup = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir = data_dir)
+                     data_dir = data_dir,
+                     povline = NULL)
 
   # Regions are selected
   expect_true(all(region_selection %in% (unique(tmp$lkup$region_code))))