diff --git a/R/pip_agg.R b/R/pip_agg.R index 0d5485fb..87762307 100644 --- a/R/pip_agg.R +++ b/R/pip_agg.R @@ -7,43 +7,52 @@ #' # Create lkups #' } #' @export -pip_agg <- function(country = "ALL", - year = "ALL", - povline = 1.9, - group_by = c("wb", "none"), - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national"), - lkup, - censor = FALSE, - lkup_hash = lkup$cache_data_id$hash_pip_grp, - additional_ind = FALSE) { - +pip_agg <- function( + country = "ALL", + year = "ALL", + povline = 1.9, + group_by = "wb", + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = FALSE, + lkup_hash = lkup$cache_data_id$hash_pip_grp, + additional_ind = FALSE +) { # Should pip_old or pip_new be used? #------------------------------------- use_new <- lkup$use_new_lineup_version + # check group_by + group_by <- .check_group_by(group_by = group_by, lkup = lkup) + # Run correct function #------------------------------------- out <- if (use_new) { - pip_grp_new(country = country, - year = year, - povline = povline, - welfare_type = welfare_type, - reporting_level = reporting_level, - lkup = lkup, - censor = censor, - additional_ind = additional_ind) + pip_grp_new( + country = country, + year = year, + povline = povline, + welfare_type = welfare_type, + reporting_level = reporting_level, + group_by = group_by, + lkup = lkup, + censor = censor, + additional_ind = additional_ind + ) } else { - pip_grp_logic(country = country, - year = year, - povline = povline, - group_by = group_by, - welfare_type = welfare_type, - reporting_level = reporting_level, - lkup = lkup, - censor = censor, - lkup_hash = lkup_hash, - additional_ind = additional_ind) + pip_grp_logic( + country = country, + year = year, + povline = povline, + group_by = group_by, + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = lkup, + censor = censor, + lkup_hash = lkup_hash, + additional_ind = additional_ind + ) } # Return diff --git a/R/pip_grp.R b/R/pip_grp.R index df0797c2..d9b88b51 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -15,29 +15,31 @@ #' lkup = lkups) #' } #' @export -pip_grp <- function(country = "ALL", - year = "ALL", - povline = 1.9, - group_by = c("wb", "none"), - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national"), - lkup, - censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip_grp) { - - welfare_type <- match.arg(welfare_type) +pip_grp <- function( + country = "ALL", + year = "ALL", + povline = 1.9, + group_by = "wb", + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip_grp +) { + welfare_type <- match.arg(welfare_type) reporting_level <- match.arg(reporting_level) - group_by <- match.arg(group_by) # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED country <- toupper(country) year <- toupper(year) # If ref_lkup is not part of lkup throw an error. - if (!all(c('ref_lkup') %in% names(lkup))) - stop("You are probably passing more than one dataset as lkup argument. - Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") - + if (!all(c('ref_lkup') %in% names(lkup))) { + stop( + "You are probably passing more than one dataset as lkup argument. + Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD" + ) + } # Custom aggregations only supported at the national level # subgroups aggregations only supported for "all" countries @@ -51,14 +53,15 @@ pip_grp <- function(country = "ALL", } out <- fg_pip_old( - country = country, - year = year, - povline = povline, - popshare = NULL, - welfare_type = welfare_type, + country = country, + year = year, + povline = povline, + popshare = NULL, + welfare_type = welfare_type, reporting_level = reporting_level, - ppp = NULL, - lkup = lkup) + ppp = NULL, + lkup = lkup + ) # For now just rowbinding two dataframes, but we would need to use it more smartly in the future out <- collapse::rowbind(out, fill = TRUE) @@ -69,8 +72,10 @@ pip_grp <- function(country = "ALL", # Handles aggregated distributions (like CHN and IND) if (tolower(reporting_level) %in% c("national", "all")) { - out <- add_agg_stats(out, - return_cols = lkup$return_cols$ag_average_poverty_stats) + out <- add_agg_stats( + out, + return_cols = lkup$return_cols$ag_average_poverty_stats + ) } add_vars_out_of_pipeline(out, fill_gaps = TRUE, lkup = lkup) @@ -83,25 +88,23 @@ pip_grp <- function(country = "ALL", # Handle aggregations with sub-groups if (group_by != "none") { - out <- pip_aggregate_by( df = out, country = country, + group_by = group_by, return_cols = lkup$return_cols$pip_grp ) - out <- estimate_type_var(out,lkup) + out <- estimate_type_var(out, lkup) # Censor regional values # if (censor) { # out <- censor_rows(out, lkup[["censored"]], type = "regions") # } - } else { # Handle simple aggregation - out <- pip_aggregate(out, - return_cols = lkup$return_cols$pip_grp) - out <- estimate_type_var(out,lkup) + out <- pip_aggregate(out, return_cols = lkup$return_cols$pip_grp) + out <- estimate_type_var(out, lkup) } keep <- lkup$return_cols$pip_grp$cols @@ -123,44 +126,34 @@ pip_grp <- function(country = "ALL", #' #' @return data.table pip_aggregate <- function(df, by = NULL, return_cols) { - all_cols <- return_cols$cols weighted_cols <- return_cols$weighted_average_cols ## Assess by parameter --------- if (is.null(by)) { - by_code <- "CUSTOM" by_name <- "CUSTOM" - to_keep <- all_cols[!all_cols %in% c("pop_in_poverty", "estimate_type")] - + to_keep <- all_cols[!all_cols %in% c("pop_in_poverty", "estimate_type")] } else { - if (grepl("code$", by)) { - by_code <- by by_name <- gsub("_code", "", by) - } else { - by_code <- paste0(by, "_code") by_name <- by - } - to_keep <- all_cols[!all_cols %in% c("pop_in_poverty", - "region_code", - "region_name", - "estimate_type")] + to_keep <- all_cols[ + !all_cols %in% + c("pop_in_poverty", "region_code", "region_name", "estimate_type") + ] to_keep <- c(by_name, by_code, to_keep) by <- c(by_name, by_code) } - - # Handle simple aggregation df <- df[, .SD, .SDcols = to_keep] @@ -168,20 +161,17 @@ pip_aggregate <- function(df, by = NULL, return_cols) { byvar <- c(by, "reporting_year", "poverty_line") # Compute population totals - pop <- df[, lapply(.SD, - base::sum, - na.rm = TRUE), - by = byvar, - .SDcols = "reporting_pop" + pop <- df[, + lapply(.SD, base::sum, na.rm = TRUE), + by = byvar, + .SDcols = "reporting_pop" ] # Compute stats weighted average by groups - df <- df[, lapply(.SD, - stats::weighted.mean, - w = reporting_pop, - na.rm = TRUE), - by = byvar, - .SDcols = weighted_cols + df <- df[, + lapply(.SD, stats::weighted.mean, w = reporting_pop, na.rm = TRUE), + by = byvar, + .SDcols = weighted_cols ] # Combine results @@ -189,108 +179,118 @@ pip_aggregate <- function(df, by = NULL, return_cols) { ## Add region code and name ----- if (is.null(by)) { - df$region_code <- "CUSTOM" df$region_name <- "CUSTOM" - } else { - data.table::setnames(df, - c(by_name, by_code), - c("region_name", "region_code")) + data.table::setnames( + df, + c(by_name, by_code), + c("region_name", "region_code") + ) } # Compute population living in poverty df <- df[, pop_in_poverty := round(headcount * reporting_pop, 0)] return(df) - } #' Aggregate by predefined groups #' @param df data.frame: Response from `fg_pip_old()` or `rg_pip()`. #' @param country character: Selected countries / regions +#' @param group_by character: Grouping variable (default is "wb") #' @param return_cols list: lkup$return_cols$pip_grp object. Controls returned #' columns #' @keywords internal -pip_aggregate_by <- function(df, - country = "ALL", - return_cols = NULL) { - +pip_aggregate_by <- function( + df, + country = "ALL", + group_by = "wb", + return_cols = NULL +) { all_cols <- return_cols$cols weighted_cols <- return_cols$weighted_average_cols # Keep only rows necessary for regional aggregates df <- filter_for_aggregate_by(df) - to_keep <- all_cols[!all_cols %in% c("pop_in_poverty", - "estimate_type")] + to_keep <- all_cols[!all_cols %in% c("pop_in_poverty", "estimate_type")] # df <- df[, .SD, .SDcols = to_keep] # I think we can ommit this part - # Compute stats weighted average by groups - rgn <- df |> - fgroup_by(region_name, - region_code, - reporting_year, - poverty_line) |> - fselect(c(weighted_cols, "reporting_pop")) |> - fmean(w = reporting_pop, stub = FALSE) - - # World aggregation - if (any(c("ALL", "WLD") %in% country)) { + # Determine grouping variables based on group_by parameter + if (group_by == "wb") { + # Default case: use region_code and region_name + group_code <- "region_code" + group_name <- "region_name" + + # Compute stats weighted average by groups + rgn <- df |> + fgroup_by(region_name, region_code, reporting_year, poverty_line) |> + fselect(c(weighted_cols, "reporting_pop")) |> + fmean(w = reporting_pop, stub = FALSE) + } else { + # Custom case: construct grouping variables dynamically + group_code <- paste0(group_by, "_code") + group_name <- group_by + + + # Compute stats weighted average by groups + rgn <- df |> + fgroup_by(c(group_name, group_code, "reporting_year", "poverty_line")) |> + fselect(c(weighted_cols, "reporting_pop")) |> + fmean(w = reporting_pop, stub = FALSE) + + setnames( + rgn, + old = c(group_name, group_code), + new = c("region_name", "region_code") + ) + } + + # World aggregation (only for default "wb" grouping) + if (group_by == "wb" && any(c("ALL", "WLD") %in% country)) { # Compute world aggregates wld <- compute_world_aggregates(rgn = rgn) } else { wld <- NULL } - # Africas aggregation - if (any(c("ALL", "AFE", "AFW") %in% country)) { - rgn <- df |> - fgroup_by(africa_split, - africa_split_code, - reporting_year, - poverty_line) |> - fselect(c(weighted_cols, "reporting_pop")) |> - fmean(w = reporting_pop, stub = FALSE) |> - fsubset(!is.na(africa_split_code)) |> - frename(africa_split_code = region_code, - africa_split = region_name) |> - rowbind(rgn, fill = TRUE) - } - # Vintage aggregation - if ("ALL" %in% country & "regionpcn_code" %in% names(df)) { - rgn <- df |> - fgroup_by(regionpcn, - regionpcn_code, - reporting_year, - poverty_line) |> - fselect(c(weighted_cols, "reporting_pop")) |> - fmean(w = reporting_pop, stub = FALSE) |> - fsubset(!is.na(regionpcn_code)) |> - frename(regionpcn_code = region_code, - regionpcn = region_name) |> - rowbind(rgn, fill = TRUE) + # Africa split aggregation (only for default "wb" grouping) + if (group_by == "wb" && any(c("ALL", "AFE", "AFW") %in% country)) { + rgn <- df |> + fgroup_by( + africa_split, + africa_split_code, + reporting_year, + poverty_line + ) |> + fselect(c(weighted_cols, "reporting_pop")) |> + fmean(w = reporting_pop, stub = FALSE) |> + fsubset(!is.na(africa_split_code)) |> + frename(africa_split_code = region_code, africa_split = region_name) |> + rowbind(rgn, fill = TRUE) } + # Remove regionpcn_code aggregation logic (as per requirements) - if (length(country) == 1) { - if (country == "WLD") { - # Return only world aggregate - wld[, pop_in_poverty := round(headcount * reporting_pop, 0)] - return(wld) - } + if (length(country) == 1) { + if (country == "WLD") { + # Return only world aggregate + wld[, pop_in_poverty := round(headcount * reporting_pop, 0)] + return(wld) } + } # Combine with other regional aggregates - out <- rowbind(rgn, wld, fill = TRUE) - out[, pop_in_poverty := round(headcount * reporting_pop, 0)] + out <- rowbind(rgn, wld, fill = TRUE) + out[, pop_in_poverty := round(headcount * reporting_pop, 0)] - if ("ALL" %in% country) { - return(out) - } + if ("ALL" %in% country) { + return(out) + } - out[region_code %in% country, ] + out[region_code %in% country, ] } @@ -298,15 +298,12 @@ compute_world_aggregates <- function(rgn, cols = NULL) { # Compute stats # Grouping by poverty line as well since we now have vectorized poverty line values wld <- rgn |> - fgroup_by(reporting_year, - poverty_line) |> + fgroup_by(reporting_year, poverty_line) |> num_vars() |> fmean(w = reporting_pop, stub = FALSE) |> - ftransform(region_code = "WLD", - region_name = "World") + ftransform(region_code = "WLD", region_name = "World") return(wld) - } @@ -319,10 +316,10 @@ filter_for_aggregate_by <- function(df) { # If nationally representative survey is available, use it # Otherwise, use whatever is available - df[, check := length(reporting_level), - by = c("country_code", "reporting_year", "poverty_line") - ][ - check == 1 | (check > 1 & reporting_level == "national"), - ] - + df[, + check := length(reporting_level), + by = c("country_code", "reporting_year", "poverty_line") + ][ + check == 1 | (check > 1 & reporting_level == "national"), + ] } diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R index 5fadf427..c2ba322d 100644 --- a/R/pip_grp_logic.R +++ b/R/pip_grp_logic.R @@ -3,27 +3,28 @@ #' #' @return data.table #' @export -pip_grp_logic <- function(country = "ALL", - year = "ALL", - povline = 1.9, - group_by = c("wb", "none"), - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national"), - lkup, - censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip_grp, - additional_ind = FALSE) { +pip_grp_logic <- function( + country = "ALL", + year = "ALL", + povline = 1.9, + group_by = "wb", + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip_grp, + additional_ind = FALSE +) { # ________________________________________________________________________ # STEP 1: Set up #### - welfare_type <- match.arg(welfare_type) + welfare_type <- match.arg(welfare_type) reporting_level <- match.arg(reporting_level) - group_by <- match.arg(group_by) # Custom aggregations only supported at the national level # subgroups aggregations only supported for "all" countries country <- toupper(country) - year <- toupper(year) + year <- toupper(year) if (group_by != "none") { reporting_level <- "all" if (!all(country %in% c("ALL", lkup$query_controls$region$values))) { @@ -38,54 +39,56 @@ pip_grp_logic <- function(country = "ALL", lcv <- # List with countries vectors create_countries_vctr( - country = country, - year = year, - lkup = lkup + country = country, + year = year, + lkup = lkup ) # use the same names as before to avoid inconsistencies alt_agg <- lcv$user_alt_agg gt_code <- lcv$user_alt_gt_code - cl <- lkup$aux_files$country_list + cl <- lkup$aux_files$country_list # STEP 3: Start pip_grp_logic algorithm ---- ## STEP 3.1: Official regions only selection ---- ## This will trigger an early return as no additional imputations are needed - if (all(lcv$off_alt_agg == "off")) { # Users only request the official regions + if (all(lcv$off_alt_agg == "off")) { + # Users only request the official regions ### Early return ----------- - res <- pip_grp(country = country, - year = year, - povline = povline, - group_by = "wb", - welfare_type = welfare_type, - reporting_level = reporting_level, - lkup = lkup, - censor = censor) + res <- pip_grp( + country = country, + year = year, + povline = povline, + group_by = "wb", + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = lkup, + censor = censor + ) return(res) - } else { - ## STEP 3.2: Compute fg_pip for ALL required countries ---- ## This will then be re-used in various part of the function ## This is to avoid re-computing and re-loading the same data over and over fg_pip_master <- fg_pip_old( - country = c(lcv$md_off_reg, lcv$user_alt_agg, lcv$user_off_reg), - year = year, - povline = povline, - popshare = NULL, - welfare_type = welfare_type, + country = c(lcv$md_off_reg, lcv$user_alt_agg, lcv$user_off_reg), + year = year, + povline = povline, + popshare = NULL, + welfare_type = welfare_type, reporting_level = reporting_level, - ppp = NULL, - lkup = lkup - ) + ppp = NULL, + lkup = lkup + ) # For now just rowbinding two dataframes, but we would need to use it more smartly in the future - fg_pip_master <- rowbind(fg_pip_master, - # THis should not be necessary - fill = TRUE) |> + fg_pip_master <- rowbind( + fg_pip_master, + # THis should not be necessary + fill = TRUE + ) |> setDT() - fg_pip_master[is.na(mean), - mean := predicted_mean_ppp] + fg_pip_master[is.na(mean), mean := predicted_mean_ppp] add_vars_out_of_pipeline(fg_pip_master, fill_gaps = TRUE, lkup = lkup) @@ -100,14 +103,16 @@ pip_grp_logic <- function(country = "ALL", # reporting_level = reporting_level, # lkup = lkup, # censor = censor) - pip_grp_helper(lcv_country = lcv$ctr_off_reg, - country = country, - year = year, - povline = povline, - reporting_level = reporting_level, - censor = FALSE, - fg_pip = fg_pip_master, - lkup = lkup) + pip_grp_helper( + lcv_country = lcv$ctr_off_reg, + country = country, + year = year, + povline = povline, + reporting_level = reporting_level, + censor = FALSE, + fg_pip = fg_pip_master, + lkup = lkup + ) } else { ### STEP 3.2.2 Alternate aggregates only ---- ### Prepare necessary variables @@ -129,20 +134,20 @@ pip_grp_logic <- function(country = "ALL", ## countries, we estimate official region estimates for such countries if (lcv$grp_use %in% c("append", "not")) { - - grp <- pip_grp_helper(lcv_country = lcv$md_off_reg, - country = country, - year = lcv$md_year, - povline = povline, - reporting_level = reporting_level, - censor = FALSE, - fg_pip = fg_pip_master, - lkup = lkup) + grp <- pip_grp_helper( + lcv_country = lcv$md_off_reg, + country = country, + year = lcv$md_year, + povline = povline, + reporting_level = reporting_level, + censor = FALSE, + fg_pip = fg_pip_master, + lkup = lkup + ) if (lcv$grp_use == "append") { grp <- data.table::rbindlist(list(off_ret, grp)) } - } else { # If previous estimations are enough, we don't need to do any estimation. grp <- data.table::copy(off_ret) @@ -152,8 +157,8 @@ pip_grp_logic <- function(country = "ALL", ### Prepare grp to be merge with pop_md grp[, - c("reporting_pop", "pop_in_poverty") := NULL] - + c("reporting_pop", "pop_in_poverty") := NULL + ] ### Merge population with Missing data table --------- @@ -164,17 +169,14 @@ pip_grp_logic <- function(country = "ALL", # This merge will remove those countries for which there is no official # aggregate because of lack of coverage in the region. Eg. There is not data # for SAS in 2000, so for countries like AFG 2000 we can't input estimates - md_grp <- merge(pop_md, grp, - by = c("region_code", "reporting_year")) + md_grp <- merge(pop_md, grp, by = c("region_code", "reporting_year")) ### Merge other region codes ----------- md_grp[, - region_code := NULL] - - md_grp <- merge(md_grp, cl, - by = "country_code", - all.x = TRUE) + region_code := NULL + ] + md_grp <- merge(md_grp, cl, by = "country_code", all.x = TRUE) ## Fill gaps estimates with countries with Survey ----- fg <- fg_pip_master[fg_pip_master$country_code %chin% lcv$est_ctrs, ] @@ -183,29 +185,32 @@ pip_grp_logic <- function(country = "ALL", fg <- fg[fg[["reporting_year"]] %in% as.numeric(year), ] } - l_fg <- vector(mode = "list", length = length(gt_code)) ### Split fg estimates by grouping type ============ for (i in seq_along(gt_code)) { - gt_var <- gt_code[i] + gt_var <- gt_code[i] filter_fg <- paste0(gt_var, " %in% alt_agg") |> - {\(.) parse(text = .) }() + { + \(.) parse(text = .) + }() # Filter both datasets - fdt <- fg[eval(filter_fg)] - mdt <- md_grp[eval(filter_fg)] + fdt <- fg[eval(filter_fg)] + mdt <- md_grp[eval(filter_fg)] # Find common variables common_vars <- intersect(names(fdt), names(mdt)) - fdt <- fdt[, .SD, .SDcols = common_vars] - mdt <- mdt[, .SD, .SDcols = common_vars] + fdt <- fdt[, .SD, .SDcols = common_vars] + mdt <- mdt[, .SD, .SDcols = common_vars] ## Append with countries with missing data ----- - l_fg[[i]] <- data.table::rbindlist(list(fdt, mdt), - use.names = TRUE, - fill = TRUE) + l_fg[[i]] <- data.table::rbindlist( + list(fdt, mdt), + use.names = TRUE, + fill = TRUE + ) } # Estimate poverty for aggregates @@ -218,10 +223,11 @@ pip_grp_logic <- function(country = "ALL", x <- l_fg[[i]] y <- gt_code[i] - ld[[i]] <- pip_aggregate(df = x, - by = y, - return_cols = lkup$return_cols$pip_grp) - + ld[[i]] <- pip_aggregate( + df = x, + by = y, + return_cols = lkup$return_cols$pip_grp + ) } de <- data.table::rbindlist(ld, use.names = TRUE) rm(ld) @@ -232,15 +238,19 @@ pip_grp_logic <- function(country = "ALL", # Append official regions with Alt aggregates --------- if (!is.null(off_ret)) { - ret <- data.table::rbindlist(list(de, off_ret), - use.names = TRUE, - fill = TRUE) - + ret <- data.table::rbindlist( + list(de, off_ret), + use.names = TRUE, + fill = TRUE + ) } else { ret <- de } # add new estimate type + ret <- estimate_type_var(ret,lkup) + + # Censor regional values ----------- We are not censoring at this stage # anymore because we need to show al the years in the homre page, including # nowcast. we are now filtering at the UI and wrappers levels @@ -256,24 +266,28 @@ pip_grp_logic <- function(country = "ALL", #Order rows by country code and reporting year setorder(ret, region_code , reporting_year) + + # ____________________________________________________________________ # Return #### return(ret) } -pip_grp_helper <- function(lcv_country, - country, - year, - povline, - reporting_level, - censor, - fg_pip, - group_by = "wb", - lkup){ - +pip_grp_helper <- function( + lcv_country, + country, + year, + povline, + reporting_level, + censor, + fg_pip, + group_by = "wb", + lkup +) { # Filter countries - keep_countries <- fg_pip[["country_code"]] %chin% lcv_country | + keep_countries <- fg_pip[["country_code"]] %chin% + lcv_country | fg_pip[["wb_region_code"]] %chin% lcv_country out <- fg_pip[keep_countries, ] # Filter years @@ -287,8 +301,10 @@ pip_grp_helper <- function(lcv_country, } # Handles aggregated distributions if (reporting_level %in% c("national", "all")) { - out <- add_agg_stats(out, - return_cols = lkup$return_cols$ag_average_poverty_stats) + out <- add_agg_stats( + out, + return_cols = lkup$return_cols$ag_average_poverty_stats + ) } # Handle potential (insignificant) difference in poverty_line values that @@ -301,10 +317,10 @@ pip_grp_helper <- function(lcv_country, # Handle aggregations with sub-groups if (group_by != "none") { - out <- pip_aggregate_by( df = out, country = country, + group_by = group_by, return_cols = lkup$return_cols$pip_grp ) @@ -313,14 +329,11 @@ pip_grp_helper <- function(lcv_country, # out <- censor_rows(out, lkup[["censored"]], type = "regions") # } - out <- estimate_type_var(out,lkup) - - + out <- estimate_type_var(out, lkup) } else { # Handle simple aggregation - out <- pip_aggregate(df = out, - return_cols = lkup$return_cols$pip_grp) - out <- estimate_type_var(out,lkup) + out <- pip_aggregate(df = out, return_cols = lkup$return_cols$pip_grp) + out <- estimate_type_var(out, lkup) } keep <- lkup$return_cols$pip_grp$cols diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index bf45c3e0..2ca2f7c1 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -1,22 +1,24 @@ #' New way to estimate Aggregate data #' @rdname pip_agg -pip_grp_new <- \(country = "ALL", - year = "ALL", - povline = 1.9, - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national"), - lkup, - censor = TRUE, - additional_ind = FALSE, - lkup_hash = lkup$cache_data_id$hash_pip_grp) { - - welfare_type <- match.arg(welfare_type) +pip_grp_new <- \( + country = "ALL", + year = "ALL", + povline = 1.9, + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + group_by = "wb", + lkup, + censor = TRUE, + additional_ind = FALSE, + lkup_hash = lkup$cache_data_id$hash_pip_grp +) { + welfare_type <- match.arg(welfare_type) reporting_level <- match.arg(reporting_level) # Custom aggregations only supported at the national level # subgroups aggregations only supported for "all" countries country <- toupper(country) - year <- toupper(year) + year <- toupper(year) if (!all(country %in% c("ALL", lkup$query_controls$region$values))) { country <- "ALL" @@ -24,30 +26,34 @@ pip_grp_new <- \(country = "ALL", # Select countries to estimate poverty cts <- copy(lkup$aux_files$country_list) - country_code <- if (!"ALL" %in% country) { + country_code <- if (!"ALL" %in% country) { get_country_code_subset(dt = cts, country = country) } else { "ALL" } out <- fg_pip( - country = country_code, - year = year, - povline = povline, - popshare = NULL, - welfare_type = welfare_type, + country = country_code, + year = year, + povline = povline, + popshare = NULL, + welfare_type = welfare_type, reporting_level = reporting_level, - ppp = NULL, - lkup = lkup) + ppp = NULL, + lkup = lkup + ) cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") if (!file.exists(cache_file_path)) { # Create an empty duckdb file create_duckdb_file(cache_file_path) } - out <- treat_cache_and_main(out, - cache_file_path = cache_file_path, - lkup = lkup, fill_gaps = TRUE) + out <- treat_cache_and_main( + out, + cache_file_path = cache_file_path, + lkup = lkup, + fill_gaps = TRUE + ) # return empty dataframe if no metadata is found if (nrow(out) == 0) { @@ -56,8 +62,10 @@ pip_grp_new <- \(country = "ALL", # Handles aggregated distributions (like CHN and IND) if (tolower(reporting_level) %in% c("national", "all")) { - out <- add_agg_stats(out, - return_cols = lkup$return_cols$ag_average_poverty_stats) + out <- add_agg_stats( + out, + return_cols = lkup$return_cols$ag_average_poverty_stats + ) } add_vars_out_of_pipeline(out, fill_gaps = TRUE, lkup = lkup) @@ -70,20 +78,21 @@ pip_grp_new <- \(country = "ALL", # Handle aggregations with sub-groups - out <- pip_aggregate_by( - df = out, - country = country, - return_cols = lkup$return_cols$pip_grp - ) + out <- pip_aggregate_by( + df = out, + country = country, + group_by = group_by, + return_cols = lkup$return_cols$pip_grp + ) - out <- estimate_type_var(out,lkup) + out <- estimate_type_var(out, lkup) - # Censor regional values - if (censor) { - out <- censor_rows(out, lkup[["censored"]], type = "regions") - } + # Censor regional values + if (censor) { + out <- censor_rows(out, lkup[["censored"]], type = "regions") + } - out + out } #' Subset country_code values based on matches in *_code columns and country_code @@ -125,7 +134,8 @@ get_country_code_subset <- function(dt, country) { if (any(!matched)) { cli::cli_abort( "The following values in {.arg country} were not found in any *_code column or country_code: - {country[!matched]}") + {country[!matched]}" + ) } funique(result) } diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index a7c51b3e..14edde52 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -1,4 +1,3 @@ - #' Compute PIP statistics #' #' Compute the main PIP poverty and inequality statistics. @@ -54,27 +53,26 @@ #' lkup = lkups) #' } #' @export -pip_new_lineups <- function(country = "ALL", - year = "ALL", - povline = 1.9, - popshare = NULL, - fill_gaps = FALSE, - group_by = c("none", "wb"), - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national", "rural", "urban"), - ppp = NULL, - lkup, - censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip, - additional_ind = FALSE) { - - +pip_new_lineups <- function( + country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE +) { # set up ------------- - welfare_type <- match.arg(welfare_type) + welfare_type <- match.arg(welfare_type) reporting_level <- match.arg(reporting_level) - group_by <- match.arg(group_by) - povline <- round(povline, digits = 3) - + group_by <- match.arg(group_by) + povline <- round(povline, digits = 3) # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED country <- toupper(country) @@ -83,25 +81,27 @@ pip_new_lineups <- function(country = "ALL", } # If svy_lkup is not part of lkup throw an error. - if (!all(c('svy_lkup') %in% names(lkup))) - stop("You are probably passing more than one dataset as lkup argument. - Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") - + if (!all(c('svy_lkup') %in% names(lkup))) { + stop( + "You are probably passing more than one dataset as lkup argument. + Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD" + ) + } # **** TO BE REMOVED **** REMOVAL STARTS HERE # Once `pip-grp` has been integrated in ingestion pipeline # Forces fill_gaps to TRUE when using group_by option if (group_by != "none") { fill_gaps <- TRUE - message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.") + message( + "Info: argument group_by in pip() is deprecated; please use pip_grp() instead." + ) } # **** TO BE REMOVED **** REMOVAL ENDS HERE # Countries vector ------------ validate_country_codes(country = country, lkup = lkup) - - # lcv$est_ctrs has all the country_code that we are interested in cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") @@ -113,43 +113,49 @@ pip_new_lineups <- function(country = "ALL", if (fill_gaps) { ## lineup years----------------- out <- fg_pip( - country = country, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, - reporting_level = reporting_level, - ppp = ppp, - lkup = lkup + country = country, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup ) } else { ## survey years ------------------ out <- rg_pip( - country = country, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, + country = country, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, reporting_level = reporting_level, - ppp = ppp, - lkup = lkup + ppp = ppp, + lkup = lkup ) } # Cache new data #--------------------------------------------- - out <- treat_cache_and_main(out, - cache_file_path = cache_file_path, - lkup = lkup, fill_gaps = fill_gaps) + out <- treat_cache_and_main( + out, + cache_file_path = cache_file_path, + lkup = lkup, + fill_gaps = fill_gaps + ) # Early return for empty table--------------- - if (nrow(out) == 0) return(pipapi::empty_response) + if (nrow(out) == 0) { + return(pipapi::empty_response) + } # aggregate distributions ------------------ if (reporting_level %in% c("national", "all")) { out <- add_agg_stats( - df = out, - return_cols = lkup$return_cols$ag_average_poverty_stats) + df = out, + return_cols = lkup$return_cols$ag_average_poverty_stats + ) if (reporting_level == "national") { out <- out[reporting_level == "national"] } @@ -157,9 +163,7 @@ pip_new_lineups <- function(country = "ALL", # Add out of pipeline variable #--------------------------------------------- - add_vars_out_of_pipeline(out, - fill_gaps = fill_gaps, - lkup = lkup) + add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup) # **** TO BE REMOVED **** REMOVAL STARTS HERE # Once `pip-grp` has been integrated in ingestion pipeline @@ -170,63 +174,64 @@ pip_new_lineups <- function(country = "ALL", out$poverty_line <- povline out <- pip_aggregate_by( - df = out, - return_cols = lkup$return_cols$pip_grp) + df = out, + group_by = group_by, + return_cols = lkup$return_cols$pip_grp + ) # Censor regional values if (censor) { - out <- censor_rows(out, - lkup[["censored"]], - type = "regions") + out <- censor_rows(out, lkup[["censored"]], type = "regions") } - out <- out[, c("region_name", - "region_code", - "reporting_year", - "reporting_pop", - "poverty_line", - "headcount", - "poverty_gap", - "poverty_severity", - "watts", - "mean", - "pop_in_poverty")] + out <- out[, c( + "region_name", + "region_code", + "reporting_year", + "reporting_pop", + "poverty_line", + "headcount", + "poverty_gap", + "poverty_severity", + "watts", + "mean", + "pop_in_poverty" + )] return(out) } # **** TO BE REMOVED **** REMOVAL ENDS HERE - # pre-computed distributional stats --------------- - crr_names <- names(out) # current variables + crr_names <- names(out) # current variables names2keep <- lkup$return_cols$pip$cols # all variables out <- add_dist_stats( - df = out, - lkup = lkup, - fill_gaps = fill_gaps) + df = out, + lkup = lkup, + fill_gaps = fill_gaps + ) # Add aggregate medians ---------------- out <- add_agg_medians( - df = out, + df = out, fill_gaps = fill_gaps, - data_dir = lkup$data_root + data_dir = lkup$data_root ) # format ---------------- - if (fill_gaps) { # ZP temp NA lineups: #--------------------- # ## Inequality indicators to NA for lineup years ---- - dist_vars <- names2keep[!(names2keep %in% crr_names)] + dist_vars <- names2keep[!(names2keep %in% crr_names)] out[, - (dist_vars) := NA_real_] + (dist_vars) := NA_real_ + ] ## estimate_var ----- out <- estimate_type_ctr_lnp(out, lkup) - } else { out[, estimate_type := NA_character_] } @@ -242,28 +247,30 @@ pip_new_lineups <- function(country = "ALL", out <- censor_rows(out, lkup[["censored"]], type = "countries") } - # Select columns if (additional_ind) { get_additional_indicators(out) added_names <- attr(out, "new_indicators_names") - names2keep <- c(names2keep, added_names) - + names2keep <- c(names2keep, added_names) } # Keep relevant variables - out <- out[, .SD, .SDcols = names2keep] - + out <- out[, .SD, .SDcols = names2keep] # make sure we always report the same precision in all numeric variables doub_vars <- names(out)[unlist(lapply(out, is.double))] |> data.table::copy() - out[, (doub_vars) := lapply(.SD, round, digits = 12), - .SDcols = doub_vars] + out[, (doub_vars) := lapply(.SD, round, digits = 12), .SDcols = doub_vars] # Order rows by country code and reporting year - data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type) + data.table::setorder( + out, + country_code, + reporting_year, + reporting_level, + welfare_type + ) #} # Make sure no duplicate remains @@ -273,39 +280,35 @@ pip_new_lineups <- function(country = "ALL", } - -treat_cache_and_main <- \(out, cache_file_path, - lkup, fill_gaps) { - - # early return of cache data if not available. +treat_cache_and_main <- \(out, cache_file_path, lkup, fill_gaps) { + # early return of cache data if not available. cached_data <- if (is.null(out$data_in_cache)) { - NULL - } else if (is.data.frame(out$data_in_cache)) { - - if (fnrow(out$data_in_cache) == 0) { NULL - } else { - ft <- qDT(out$data_in_cache) - if (fill_gaps) { - ft <- - fg_remove_duplicates(ft, - use_new_lineup_version = lkup$use_new_lineup_version) - + } else if (is.data.frame(out$data_in_cache)) { + if (fnrow(out$data_in_cache) == 0) { + NULL + } else { + ft <- qDT(out$data_in_cache) + if (fill_gaps) { + ft <- + fg_remove_duplicates( + ft, + use_new_lineup_version = lkup$use_new_lineup_version + ) + } + + # Add just mean and median + get_mean_median(ft, lkup, fill_gaps = fill_gaps) } - - # Add just mean and median - get_mean_median(ft, lkup, fill_gaps = fill_gaps) - - } - } else { - cli::cli_abort( - "{.code out$data_in_cache} must be NULL or data.frame not + } else { + cli::cli_abort( + "{.code out$data_in_cache} must be NULL or data.frame not {.field {class(out$data_in_cache)}}" - ) - } + ) + } - main_data <- qDT(out$main_data) + main_data <- qDT(out$main_data) if (nrow(main_data) > 0) { if (is.null(cached_data)) { @@ -317,17 +320,14 @@ treat_cache_and_main <- \(out, cache_file_path, update_master_file(main_data, cache_file_path, fill_gaps) rm(main_data) - } else { out <- cached_data } - setDT(out) } - validate_country_codes <- \(country, lkup) { cls <- lkup$aux_files$country_list$country_code |> unique() |> @@ -335,7 +335,9 @@ validate_country_codes <- \(country, lkup) { if (any(!country %in% cls)) { wcls <- which(!country %in% cls) - cli::cli_abort("{.field {country[wcls]}} {?is/are} not {?a/} valid country code{?s}") + cli::cli_abort( + "{.field {country[wcls]}} {?is/are} not {?a/} valid country code{?s}" + ) } invisible(TRUE) } diff --git a/R/pip_old.R b/R/pip_old.R index e0194c20..a36e239e 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -54,25 +54,25 @@ #' } #' @export #' -pip_old <- function(country = "ALL", - year = "ALL", - povline = 1.9, - popshare = NULL, - fill_gaps = FALSE, - group_by = c("none", "wb"), - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national", "rural", "urban"), - ppp = NULL, - lkup, - censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip, - additional_ind = FALSE) { - - +pip_old <- function( + country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE +) { # set up ------------- - welfare_type <- match.arg(welfare_type) + welfare_type <- match.arg(welfare_type) reporting_level <- match.arg(reporting_level) - group_by <- match.arg(group_by) + group_by <- match.arg(group_by) # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED country <- toupper(country) @@ -81,26 +81,30 @@ pip_old <- function(country = "ALL", } # If svy_lkup is not part of lkup throw an error. - if (!all(c('svy_lkup') %in% names(lkup))) - stop("You are probably passing more than one dataset as lkup argument. - Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") - + if (!all(c('svy_lkup') %in% names(lkup))) { + stop( + "You are probably passing more than one dataset as lkup argument. + Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD" + ) + } # **** TO BE REMOVED **** REMOVAL STARTS HERE # Once `pip-grp` has been integrated in ingestion pipeline # Forces fill_gaps to TRUE when using group_by option if (group_by != "none") { fill_gaps <- TRUE - message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.") + message( + "Info: argument group_by in pip() is deprecated; please use pip_grp() instead." + ) } # **** TO BE REMOVED **** REMOVAL ENDS HERE # Countries vector ------------ lcv <- # List with countries vectors create_countries_vctr( - country = country, - year = year, - lkup = lkup + country = country, + year = year, + lkup = lkup ) # lcv$est_ctrs has all the country_code that we are interested in @@ -113,26 +117,26 @@ pip_old <- function(country = "ALL", if (fill_gaps) { ## lineup years----------------- out <- fg_pip_old( - country = lcv$est_ctrs, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, - reporting_level = reporting_level, - ppp = ppp, - lkup = lkup + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup ) } else { ## survey years ------------------ out <- rg_pip_old( - country = lcv$est_ctrs, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, reporting_level = reporting_level, - ppp = ppp, - lkup = lkup + ppp = ppp, + lkup = lkup ) } @@ -155,7 +159,9 @@ pip_old <- function(country = "ALL", setDT(out) } # Early return for empty table--------------- - if (nrow(out) == 0) return(pipapi::empty_response) + if (nrow(out) == 0) { + return(pipapi::empty_response) + } # aggregate distributions ------------------ if (reporting_level %in% c("national", "all")) { @@ -168,8 +174,6 @@ pip_old <- function(country = "ALL", } } - - add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup) # **** TO BE REMOVED **** REMOVAL STARTS HERE @@ -181,7 +185,8 @@ pip_old <- function(country = "ALL", out$poverty_line <- povline out <- pip_aggregate_by( - df = out, + df = out, + group_by = group_by, return_cols = lkup$return_cols$pip_grp ) # Censor regional values @@ -189,25 +194,26 @@ pip_old <- function(country = "ALL", out <- censor_rows(out, lkup[["censored"]], type = "regions") } - out <- out[, c("region_name", - "region_code", - "reporting_year", - "reporting_pop", - "poverty_line", - "headcount", - "poverty_gap", - "poverty_severity", - "watts", - "mean", - "pop_in_poverty")] + out <- out[, c( + "region_name", + "region_code", + "reporting_year", + "reporting_pop", + "poverty_line", + "headcount", + "poverty_gap", + "poverty_severity", + "watts", + "mean", + "pop_in_poverty" + )] return(out) } # **** TO BE REMOVED **** REMOVAL ENDS HERE - # pre-computed distributional stats --------------- - crr_names <- names(out) # current variables + crr_names <- names(out) # current variables names2keep <- lkup$return_cols$pip$cols # all variables out <- add_dist_stats_old( @@ -217,24 +223,22 @@ pip_old <- function(country = "ALL", # Add aggregate medians ---------------- out <- add_agg_medians( - df = out, + df = out, fill_gaps = fill_gaps, - data_dir = lkup$data_root + data_dir = lkup$data_root ) # format ---------------- - if (fill_gaps) { - ## Inequality indicators to NA for lineup years ---- - dist_vars <- names2keep[!(names2keep %in% crr_names)] + dist_vars <- names2keep[!(names2keep %in% crr_names)] out[, - (dist_vars) := NA_real_] + (dist_vars) := NA_real_ + ] ## estimate_var ----- out <- estimate_type_ctr_lnp(out, lkup) - } else { out[, estimate_type := NA_character_] } @@ -249,28 +253,30 @@ pip_old <- function(country = "ALL", out <- censor_rows(out, lkup[["censored"]], type = "countries") } - # Select columns if (additional_ind) { get_additional_indicators(out) added_names <- attr(out, "new_indicators_names") - names2keep <- c(names2keep, added_names) - + names2keep <- c(names2keep, added_names) } # Keep relevant variables - out <- out[, .SD, .SDcols = names2keep] - + out <- out[, .SD, .SDcols = names2keep] # make sure we always report the same precision in all numeric variables doub_vars <- names(out)[unlist(lapply(out, is.double))] |> data.table::copy() - out[, (doub_vars) := lapply(.SD, round, digits = 12), - .SDcols = doub_vars] + out[, (doub_vars) := lapply(.SD, round, digits = 12), .SDcols = doub_vars] # Order rows by country code and reporting year - data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type) + data.table::setorder( + out, + country_code, + reporting_year, + reporting_level, + welfare_type + ) #} # Make sure no duplicate remains @@ -280,8 +286,6 @@ pip_old <- function(country = "ALL", } - - #' Compute survey year stats #' #' Compute the main PIP poverty and inequality statistics for survey years. @@ -289,50 +293,54 @@ pip_old <- function(country = "ALL", #' @inheritParams pip #' @return data.frame #' @keywords internal -rg_pip_old <- function(country, - year, - povline, - popshare, - welfare_type, - reporting_level, - ppp, - lkup) { +rg_pip_old <- function( + country, + year, + povline, + popshare, + welfare_type, + reporting_level, + ppp, + lkup +) { # get values from lkup valid_regions <- lkup$query_controls$region$values - svy_lkup <- lkup$svy_lkup - data_dir <- lkup$data_root + svy_lkup <- lkup$svy_lkup + data_dir <- lkup$data_root # povline is set to NULL if popshare is given - if (!is.null(popshare)) povline <- NULL - + if (!is.null(popshare)) { + povline <- NULL + } cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") metadata <- subset_lkup( - country = country, - year = year, - welfare_type = welfare_type, + country = country, + year = year, + welfare_type = welfare_type, reporting_level = reporting_level, - lkup = svy_lkup, - valid_regions = valid_regions, - data_dir = data_dir, - povline = povline, + lkup = svy_lkup, + valid_regions = valid_regions, + data_dir = data_dir, + povline = povline, cache_file_path = cache_file_path, - fill_gaps = FALSE + fill_gaps = FALSE ) data_present_in_master <- metadata$data_present_in_master metadata <- metadata$lkup - povline <- metadata$povline + povline <- metadata$povline # Remove aggregate distribution if popshare is specified # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions - metadata <- filter_lkup(metadata = metadata, - popshare = popshare) + metadata <- filter_lkup(metadata = metadata, popshare = popshare) # return empty dataframe if no metadata is found if (nrow(metadata) == 0) { - return(list(main_data = empty_response, - data_in_cache = data_present_in_master)) + return(list( + main_data = empty_response, + data_in_cache = data_present_in_master + )) } out <- vector(mode = "list", length = nrow(metadata)) @@ -346,16 +354,16 @@ rg_pip_old <- function(country, path = tmp_metadata$path ) tmp_stats <- wbpip:::prod_compute_pip_stats( - welfare = svy_data$df0$welfare, - povline = povline, - popshare = popshare, - population = svy_data$df0$weight, - requested_mean = tmp_metadata$survey_mean_ppp, - svy_mean_lcu = tmp_metadata$survey_mean_lcu, - svy_median_lcu = tmp_metadata$survey_median_lcu, - svy_median_ppp = tmp_metadata$survey_median_ppp, - default_ppp = tmp_metadata$ppp, - ppp = ppp, + welfare = svy_data$df0$welfare, + povline = povline, + popshare = popshare, + population = svy_data$df0$weight, + requested_mean = tmp_metadata$survey_mean_ppp, + svy_mean_lcu = tmp_metadata$survey_mean_lcu, + svy_median_lcu = tmp_metadata$survey_median_lcu, + svy_median_ppp = tmp_metadata$survey_median_ppp, + default_ppp = tmp_metadata$ppp, + ppp = ppp, distribution_type = tmp_metadata$distribution_type ) # Add stats columns to data frame diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R index 1bb86dea..d684f74c 100644 --- a/R/pip_old_lineups.R +++ b/R/pip_old_lineups.R @@ -53,28 +53,26 @@ #' lkup = lkups) #' } #' @export -pip_old_lineups <- function(country = "ALL", - year = "ALL", - povline = 1.9, - popshare = NULL, - fill_gaps = FALSE, - group_by = c("none", "wb"), - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national", "rural", "urban"), - ppp = NULL, - lkup, - censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip, - additional_ind = FALSE) { - - +pip_old_lineups <- function( + country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE +) { # set up ------------- - welfare_type <- match.arg(welfare_type) + welfare_type <- match.arg(welfare_type) reporting_level <- match.arg(reporting_level) - group_by <- match.arg(group_by) - povline <- round(povline, digits = 3) - - + group_by <- match.arg(group_by) + povline <- round(povline, digits = 3) # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED country <- toupper(country) @@ -83,26 +81,30 @@ pip_old_lineups <- function(country = "ALL", } # If svy_lkup is not part of lkup throw an error. - if (!all(c('svy_lkup') %in% names(lkup))) - stop("You are probably passing more than one dataset as lkup argument. - Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") - + if (!all(c('svy_lkup') %in% names(lkup))) { + stop( + "You are probably passing more than one dataset as lkup argument. + Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD" + ) + } # **** TO BE REMOVED **** REMOVAL STARTS HERE # Once `pip-grp` has been integrated in ingestion pipeline # Forces fill_gaps to TRUE when using group_by option if (group_by != "none") { fill_gaps <- TRUE - message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.") + message( + "Info: argument group_by in pip() is deprecated; please use pip_grp() instead." + ) } # **** TO BE REMOVED **** REMOVAL ENDS HERE # Countries vector ------------ lcv <- # List with countries vectors create_countries_vctr( - country = country, - year = year, - lkup = lkup + country = country, + year = year, + lkup = lkup ) # lcv$est_ctrs has all the country_code that we are interested in @@ -115,26 +117,26 @@ pip_old_lineups <- function(country = "ALL", if (fill_gaps) { ## lineup years----------------- out <- fg_pip_old( - country = lcv$est_ctrs, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, - reporting_level = reporting_level, - ppp = ppp, - lkup = lkup + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup ) } else { ## survey years ------------------ out <- rg_pip_old( - country = lcv$est_ctrs, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, reporting_level = reporting_level, - ppp = ppp, - lkup = lkup + ppp = ppp, + lkup = lkup ) } @@ -145,8 +147,7 @@ pip_old_lineups <- function(country = "ALL", out <- main_data |> rowbind(cached_data) - update_master_file(main_data, cache_file_path, fill_gaps) - + update_master_file(main_data, cache_file_path, fill_gaps) } else { out <- cached_data } @@ -154,7 +155,9 @@ pip_old_lineups <- function(country = "ALL", setDT(out) } # Early return for empty table--------------- - if (nrow(out) == 0) return(pipapi::empty_response) + if (nrow(out) == 0) { + return(pipapi::empty_response) + } # aggregate distributions ------------------ if (reporting_level %in% c("national", "all")) { @@ -167,8 +170,6 @@ pip_old_lineups <- function(country = "ALL", } } - - add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup) # **** TO BE REMOVED **** REMOVAL STARTS HERE @@ -180,7 +181,8 @@ pip_old_lineups <- function(country = "ALL", out$poverty_line <- povline out <- pip_aggregate_by( - df = out, + df = out, + group_by = group_by, return_cols = lkup$return_cols$pip_grp ) # Censor regional values @@ -188,25 +190,26 @@ pip_old_lineups <- function(country = "ALL", out <- censor_rows(out, lkup[["censored"]], type = "regions") } - out <- out[, c("region_name", - "region_code", - "reporting_year", - "reporting_pop", - "poverty_line", - "headcount", - "poverty_gap", - "poverty_severity", - "watts", - "mean", - "pop_in_poverty")] + out <- out[, c( + "region_name", + "region_code", + "reporting_year", + "reporting_pop", + "poverty_line", + "headcount", + "poverty_gap", + "poverty_severity", + "watts", + "mean", + "pop_in_poverty" + )] return(out) } # **** TO BE REMOVED **** REMOVAL ENDS HERE - # pre-computed distributional stats --------------- - crr_names <- names(out) # current variables + crr_names <- names(out) # current variables names2keep <- lkup$return_cols$pip$cols # all variables out <- add_dist_stats_old( @@ -216,24 +219,22 @@ pip_old_lineups <- function(country = "ALL", # Add aggregate medians ---------------- out <- add_agg_medians( - df = out, + df = out, fill_gaps = fill_gaps, - data_dir = lkup$data_root + data_dir = lkup$data_root ) # format ---------------- - if (fill_gaps) { - ## Inequality indicators to NA for lineup years ---- - dist_vars <- names2keep[!(names2keep %in% crr_names)] + dist_vars <- names2keep[!(names2keep %in% crr_names)] out[, - (dist_vars) := NA_real_] + (dist_vars) := NA_real_ + ] ## estimate_var ----- out <- estimate_type_ctr_lnp(out, lkup) - } else { out[, estimate_type := NA_character_] } @@ -248,28 +249,30 @@ pip_old_lineups <- function(country = "ALL", out <- censor_rows(out, lkup[["censored"]], type = "countries") } - # Select columns if (additional_ind) { get_additional_indicators(out) added_names <- attr(out, "new_indicators_names") - names2keep <- c(names2keep, added_names) - + names2keep <- c(names2keep, added_names) } # Keep relevant variables - out <- out[, .SD, .SDcols = names2keep] - + out <- out[, .SD, .SDcols = names2keep] # make sure we always report the same precision in all numeric variables doub_vars <- names(out)[unlist(lapply(out, is.double))] |> data.table::copy() - out[, (doub_vars) := lapply(.SD, round, digits = 12), - .SDcols = doub_vars] + out[, (doub_vars) := lapply(.SD, round, digits = 12), .SDcols = doub_vars] # Order rows by country code and reporting year - data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type) + data.table::setorder( + out, + country_code, + reporting_year, + reporting_level, + welfare_type + ) #} # Make sure no duplicate remains diff --git a/R/utils.R b/R/utils.R index f68ce3c8..18936972 100644 --- a/R/utils.R +++ b/R/utils.R @@ -7,27 +7,28 @@ #' @param cache_file_path file path for cache #' @return data.frame #' @keywords internal -subset_lkup <- function(country, - year, - welfare_type, - reporting_level, - lkup, - valid_regions, - data_dir = NULL, - povline, - cache_file_path, - fill_gaps, - popshare = NULL - ) { - - - lkup <- lkup_filter(lkup, - country, - year, - valid_regions, - reporting_level, - welfare_type, - data_dir) +subset_lkup <- function( + country, + year, + welfare_type, + reporting_level, + lkup, + valid_regions, + data_dir = NULL, + povline, + cache_file_path, + fill_gaps, + popshare = NULL +) { + lkup <- lkup_filter( + lkup, + country, + year, + valid_regions, + reporting_level, + welfare_type, + data_dir + ) # If povline is NULL, this happens when popshare is passed # i.e popshare is not NULL if (is.null(povline)) { @@ -48,34 +49,39 @@ subset_lkup <- function(country, #' @keywords internal -lkup_filter <- function(lkup, - country, - year, - valid_regions, - reporting_level, - welfare_type, - data_dir) { +lkup_filter <- function( + lkup, + country, + year, + valid_regions, + reporting_level, + welfare_type, + data_dir +) { # STEP 1 - Keep every row by default keep <- rep(TRUE, nrow(lkup)) # STEP 2 - Select countries keep <- select_country(lkup, keep, country, valid_regions) # STEP 3 - Select years - keep <- select_years(lkup = lkup, - keep = keep, - year = year, - country = country, - data_dir = data_dir, - valid_regions = valid_regions) + keep <- select_years( + lkup = lkup, + keep = keep, + year = year, + country = country, + data_dir = data_dir, + valid_regions = valid_regions + ) # STEP 4 - Select welfare_type if (welfare_type[1] != "all") { keep <- keep & lkup$welfare_type == welfare_type } # STEP 5 - Select reporting_level - keep <- select_reporting_level(lkup = lkup, - keep = keep, - reporting_level = reporting_level[1]) - + keep <- select_reporting_level( + lkup = lkup, + keep = keep, + reporting_level = reporting_level[1] + ) lkup <- lkup[keep, ] return(lkup) @@ -120,12 +126,14 @@ select_country <- function(lkup, keep, country, valid_regions) { #' @inheritParams subset_lkup #' @param keep logical vector #' @return logical vector -select_years <- function(lkup, - keep, - year, - country, - data_dir, - valid_regions = NULL) { +select_years <- function( + lkup, + keep, + year, + country, + data_dir, + valid_regions = NULL +) { # columns i is an ID that identifies if a country has more than one # observation for reporting year. That is the case of IND with URB/RUR and ZWE # with interporaltion and microdata info @@ -134,19 +142,19 @@ select_years <- function(lkup, # by = .(country_code, reporting_year)] caller_names <- get_caller_names() - is_agg <- + is_agg <- grepl("pip_grp", caller_names) |> any() dtmp <- lkup - year <- toupper(year) - country <- toupper(country) + year <- toupper(year) + country <- toupper(country) keep_years <- rep(TRUE, nrow(dtmp)) - has_region <- FALSE + has_region <- FALSE has_country <- TRUE - has_all <- "ALL" %in% country + has_all <- "ALL" %in% country if (!is.null(valid_regions)) { if (any(country %in% valid_regions[!valid_regions %in% "ALL"])) { @@ -159,44 +167,42 @@ select_years <- function(lkup, # STEP 1 - If Most Recent Value requested if ("MRV" %in% year) { - # for MRV, countries and regions not allowed if (has_country && has_region) { - rlang::abort("country codes and region codes not allowed with MRV in year") + rlang::abort( + "country codes and region codes not allowed with MRV in year" + ) } # STEP 1.1 - If all countries selected. Select MRV for each country if (has_region || is_agg) { mr <- get_metaregion_table(data_dir) - dtmp[mr, - on = "region_code", - max_year := reporting_year == i.lineup_year] + dtmp[mr, on = "region_code", max_year := reporting_year == i.lineup_year] if (isFALSE(has_all)) { - dtmp[!region_code %in% country, - max_year := FALSE] + dtmp[!region_code %in% country, max_year := FALSE] } - } else { # STEP 1.2 - If only some countries selected. Select MRV for each selected # country if (has_all) { dtmp[, - max_year := reporting_year == max(reporting_year), - by = country_code] + max_year := reporting_year == max(reporting_year), + by = country_code + ] } else { - dtmp[country_code %in% country | region_code %in% country, - max_year := reporting_year == max(reporting_year), - by = country_code] + dtmp[ + country_code %in% country | region_code %in% country, + max_year := reporting_year == max(reporting_year), + by = country_code + ] } } # dtmp <- unique(dtmp[, .(country_code, reporting_year, max_year)]) dtmp[is.na(max_year), max_year := FALSE] - keep_years <- keep_years & as.logical(dtmp[["max_year"]]) - } # STEP 2 - If specific years are specified. Filter for these years if (!any(c("ALL", "MRV") %in% year)) { @@ -218,8 +224,7 @@ select_years <- function(lkup, #' #' @return data.frame -filter_lkup <- function(metadata, - popshare) { +filter_lkup <- function(metadata, popshare) { # popshare option not supported for aggregate distributions if (!is.null(popshare)) { return( @@ -228,7 +233,6 @@ filter_lkup <- function(metadata, } else { return(metadata) } - } #' helper function to correctly filter look up table according to requested @@ -241,23 +245,20 @@ filter_lkup <- function(metadata, #' @return data.table #' @export #' -select_reporting_level <- function(lkup, - keep, - reporting_level) { +select_reporting_level <- function(lkup, keep, reporting_level) { # To be updated: Fix the coverage variable names in aux data (reporting_coverage?) if (reporting_level == "all") { return(keep) - } else if (reporting_level == "national") { # Subnational levels necessary to compute national stats for aggregate distributions - keep <- keep & (lkup$reporting_level == reporting_level | lkup$is_used_for_aggregation) + keep <- keep & + (lkup$reporting_level == reporting_level | lkup$is_used_for_aggregation) return(keep) - } else { if ("survey_coverage" %in% names(lkup)) { keep <- keep & (lkup$survey_coverage == reporting_level | - lkup$reporting_level == reporting_level) + lkup$reporting_level == reporting_level) } else { # This condition is not triggered keep <- keep & lkup$reporting_level == reporting_level @@ -275,29 +276,31 @@ select_reporting_level <- function(lkup, #' #' @return data.frame #' @keywords internal -get_svy_data <- function(svy_id, - reporting_level, - path) { +get_svy_data <- function(svy_id, reporting_level, path) { # Each call should be made at a unique reporting_level (equivalent to reporting_data_level: national, urban, rural) # This check should be conducted at the data validation stage reporting_level <- unique(reporting_level) - assertthat::assert_that(length(reporting_level) == 1, - msg = "Problem with input data: Multiple reporting_levels" + assertthat::assert_that( + length(reporting_level) == 1, + msg = "Problem with input data: Multiple reporting_levels" ) # tictoc::tic("read_single") out <- lapply(path, function(x) { - # Not robust. Should not be hard coded here. if (reporting_level %in% c("urban", "rural")) { - tmp <- fst::read_fst(x, - columns = c("area", "welfare", "weight"), - as.data.table = TRUE) + tmp <- fst::read_fst( + x, + columns = c("area", "welfare", "weight"), + as.data.table = TRUE + ) tmp <- tmp[area == reporting_level, ] tmp[, area := NULL] } else { - tmp <- fst::read_fst(x, - columns = c("welfare", "weight"), - as.data.table = TRUE) + tmp <- fst::read_fst( + x, + columns = c("welfare", "weight"), + as.data.table = TRUE + ) } return(tmp) @@ -324,7 +327,6 @@ get_svy_data <- function(svy_id, #' @return data.table #' @export add_dist_stats <- function(df, lkup, fill_gaps) { - if (fill_gaps) { dist_stats <- lkup[["lineup_dist_stats"]] } else { @@ -332,18 +334,16 @@ add_dist_stats <- function(df, lkup, fill_gaps) { } if (fill_gaps) { - df <- df |> - joyn::joyn(y = dist_stats, - by = c("country_code", - "reporting_level", - "reporting_year"), - match_type = "m:1", # multiple poverty lines - keep_common_vars = FALSE, - reportvar = FALSE, - verbose = FALSE, - keep = "left") - + joyn::joyn( + y = dist_stats, + by = c("country_code", "reporting_level", "reporting_year"), + match_type = "m:1", # multiple poverty lines + keep_common_vars = FALSE, + reportvar = FALSE, + verbose = FALSE, + keep = "left" + ) } else { # Keep only relevant columns cols <- c( @@ -362,9 +362,10 @@ add_dist_stats <- function(df, lkup, fill_gaps) { # merge dist stats with main table # data.table::setnames(dist_stats, "survey_median_ppp", "median") - df <- dist_stats[df, - on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level), - allow.cartesian = TRUE + df <- dist_stats[ + df, + on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level), + allow.cartesian = TRUE ] } df @@ -396,9 +397,10 @@ add_dist_stats_old <- function(df, dist_stats) { # merge dist stats with main table # data.table::setnames(dist_stats, "survey_median_ppp", "median") - df <- dist_stats[df, - on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level), - allow.cartesian = TRUE + df <- dist_stats[ + df, + on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level), + allow.cartesian = TRUE ] return(df) @@ -409,11 +411,13 @@ add_dist_stats_old <- function(df, dist_stats) { #' @return data.table #' @noRd collapse_rows <- function(df, vars, na_var = NULL) { - tmp_vars <- lapply(df[, .SD, .SDcols = vars], unique, collapse = "|") - tmp_vars <- lapply(tmp_vars, paste, collapse = "|") + tmp_vars <- lapply(df[, .SD, .SDcols = vars], unique, collapse = "|") + tmp_vars <- lapply(tmp_vars, paste, collapse = "|") tmp_var_names <- names(df[, .SD, .SDcols = vars]) - if (!is.null(na_var)) df[[na_var]] <- NA_real_ + if (!is.null(na_var)) { + df[[na_var]] <- NA_real_ + } for (tmp_var in seq_along(tmp_vars)) { df[[tmp_var_names[tmp_var]]] <- tmp_vars[[tmp_var]] @@ -431,7 +435,6 @@ collapse_rows <- function(df, vars, na_var = NULL) { #' @return data.table #' @noRd censor_rows <- function(df, censored, type = c("countries", "regions")) { - type <- match.arg(type) # Return early if there are no censoring observations @@ -444,15 +447,18 @@ censor_rows <- function(df, censored, type = c("countries", "regions")) { df$tmp_id <- sprintf( "%s_%s_%s_%s_%s", - df$country_code, df$reporting_year, - df$survey_acronym, df$welfare_type, + df$country_code, + df$reporting_year, + df$survey_acronym, + df$welfare_type, df$reporting_level ) } else { df$tmp_id <- sprintf( "%s_%s", - df$region_code, df$reporting_year + df$region_code, + df$reporting_year ) } @@ -487,8 +493,11 @@ censor_stats <- function(df, censored_table) { if (nrow(censor_stats) > 0) { # Perform a non-equi join to mark relevant statistics # Commenting mult = "first" since with multiple povline values there are more than one rows - df[censor_stats, on = .(tmp_id = id), #mult = "first", - unique(censor_stats$statistic) := NA_real_] + df[ + censor_stats, + on = .(tmp_id = id), #mult = "first", + unique(censor_stats$statistic) := NA_real_ + ] } # Clean up the temporary column @@ -505,12 +514,10 @@ censor_stats <- function(df, censored_table) { #' @param lkup lkup value #' @keywords internal estimate_type_var <- function(df, lkup) { - censored_table <- lkup$censored$regions - data_dir <- lkup$data_root - - mr <- get_metaregion_table(data_dir = data_dir) + data_dir <- lkup$data_root + mr <- get_metaregion_table(data_dir = data_dir) df[, tmp_id := paste(region_code, reporting_year, sep = "_")] # Create a binary column to mark what is projections based on @@ -529,18 +536,21 @@ estimate_type_var <- function(df, lkup) { # Merge metaregion and label those obs with reporting year # higher than lineup year as "nowcast" df <- mr[df, on = "region_code"] - df[reporting_year > lineup_year, - estimate_type := "nowcast"] + df[reporting_year > lineup_year, estimate_type := "nowcast"] # This should be done in a different function... # Update specific statistics to NA where not 'all' censor_stats <- censored_table[statistic != "all"] if (nrow(censor_stats) > 0) { # Perform a non-equi join to mark relevant statistics - df[censor_stats, on = .(tmp_id = id), mult = "first", - (censor_stats$statistic) := NA_real_] + df[ + censor_stats, + on = .(tmp_id = id), + mult = "first", + (censor_stats$statistic) := NA_real_ + ] } - df[, c("tmp_id", "lineup_year") := NULL] + df[, c("tmp_id", "lineup_year") := NULL] } @@ -552,24 +562,25 @@ estimate_type_var <- function(df, lkup) { #' @return out database with `estimate_type` variable #' @keywords internal estimate_type_ctr_lnp <- function(out, lkup) { - - out[, estimate_type := fifelse(estimation_type == "survey", "actual", "projection")] - mr <- get_metaregion_table(lkup$data_root) - wld <- mr[region_code == "WLD", lineup_year] - regs <- out[, unique(region_code)] - mr <- mr[region_code %in% regs] - mr[, lineup_year := max(lineup_year, wld), - by = region_code] + out[, + estimate_type := fifelse( + estimation_type == "survey", + "actual", + "projection" + ) + ] + mr <- get_metaregion_table(lkup$data_root) + wld <- mr[region_code == "WLD", lineup_year] + regs <- out[, unique(region_code)] + mr <- mr[region_code %in% regs] + mr[, lineup_year := max(lineup_year, wld), by = region_code] # Merge metaregion and label those obs with reporting year # higher than lineup year as "nowcast" out <- mr[out, on = "region_code"] - out[reporting_year > lineup_year, - estimate_type := "nowcast"] + out[reporting_year > lineup_year, estimate_type := "nowcast"] out[, lineup_year := NULL] - - } #' Create query controls @@ -580,16 +591,18 @@ estimate_type_ctr_lnp <- function(out, lkup) { #' @param versions character: List of available data versions #' @return list #' @noRd -create_query_controls <- function(svy_lkup, - ref_lkup, - aux_files, - aux_tables, - versions) { +create_query_controls <- function( + svy_lkup, + ref_lkup, + aux_files, + aux_tables, + versions +) { # Countries and regions countries <- unique(c( - svy_lkup$country_code, - ref_lkup$country_code - )) + svy_lkup$country_code, + ref_lkup$country_code + )) regions <- unique(c( aux_files$regions$region_code @@ -600,8 +613,8 @@ create_query_controls <- function(svy_lkup, "ALL", sort(c( countries, - regions) - ) + regions + )) ), type = "character" ) @@ -613,7 +626,8 @@ create_query_controls <- function(svy_lkup, # Year year <- list( values = c( - "all", "MRV", + "all", + "MRV", sort(unique(c( svy_lkup$reporting_year, ref_lkup$reporting_year @@ -633,26 +647,23 @@ create_query_controls <- function(svy_lkup, ) # Boolean parameters - fill_gaps <- - aggregate <- - long_format <- - additional_ind <- - exclude <- - list(values = c(TRUE, FALSE), - type = "logical") - - # Group by - group_by <- list( - values = c("none", "wb"), - type = "character" - ) + fill_gaps <- + aggregate <- + long_format <- + additional_ind <- + exclude <- + list(values = c(TRUE, FALSE), type = "logical") + # Welfare type welfare_type <- list( - values = c("all", sort(unique(c( - svy_lkup$welfare_type, - ref_lkup$welfare_type - )))), + values = c( + "all", + sort(unique(c( + svy_lkup$welfare_type, + ref_lkup$welfare_type + ))) + ), type = "character" ) # Reporting level @@ -677,8 +688,7 @@ create_query_controls <- function(svy_lkup, type = "character" ) # Formats - format <- list(values = c("json", "csv", "rds", "arrow"), - type = "character") + format <- list(values = c("json", "csv", "rds", "arrow"), type = "character") # Tables table <- list(values = aux_tables, type = "character") @@ -688,12 +698,28 @@ create_query_controls <- function(svy_lkup, pass <- list(values = Sys.getenv('PIP_CACHE_SERVER_KEY'), type = "character") # parameters parameter <- - list(values = c("country", "year", "povline", - "popshare", "fill_gaps", "aggregate", - "group_by", "welfare_type", - "reporting_level", "ppp", "version", - "format", "table", "long_format", "exclude", "type", "pass"), - type = "character") + list( + values = c( + "country", + "year", + "povline", + "popshare", + "fill_gaps", + "aggregate", + "group_by", + "welfare_type", + "reporting_level", + "ppp", + "version", + "format", + "table", + "long_format", + "exclude", + "type", + "pass" + ), + type = "character" + ) # cum_welfare cum_welfare <- list( @@ -724,7 +750,7 @@ create_query_controls <- function(svy_lkup, ) # lorenz - lorenz <- list(values = c("lb", "lq"),type = "character") + lorenz <- list(values = c("lb", "lq"), type = "character") # n_bins n_bins <- list( @@ -734,44 +760,54 @@ create_query_controls <- function(svy_lkup, # Endpoint endpoint <- - list(values = c("all", - "aux", - "pip", - "pip-grp", - "pip-info", - "valid-params"), - type = "character") - - # Create list of query controls + list( + values = c("all", "aux", "pip", "pip-grp", "pip-info", "valid-params"), + type = "character" + ) + + # group_by + regs <- aux_files$country_list |> + names() |> + grep("_code$|_name$", x = _, value = TRUE, invert = TRUE) |> + c("wb", "none", "vintage", "pcn") |> + sort() + + group_by <- list( + values = regs, + type = "character" + ) + + + # Create list of query controls query_controls <- list( - country = country, - region = region, - year = year, - povline = povline, - popshare = popshare, - fill_gaps = fill_gaps, - aggregate = aggregate, - long_format = long_format, - exclude = exclude, - additional_ind = additional_ind, - group_by = group_by, - welfare_type = welfare_type, + country = country, + region = region, + year = year, + povline = povline, + popshare = popshare, + fill_gaps = fill_gaps, + aggregate = aggregate, + long_format = long_format, + exclude = exclude, + additional_ind = additional_ind, + group_by = group_by, + welfare_type = welfare_type, reporting_level = reporting_level, - ppp = ppp, - version = version, - format = format, - table = table, - parameter = parameter, - cum_welfare = cum_welfare, - cum_population = cum_population, - requested_mean = requested_mean, - mean = mean, - times_mean = times_mean, - lorenz = lorenz, - n_bins = n_bins, - endpoint = endpoint, - type = type, - pass = pass + ppp = ppp, + version = version, + format = format, + table = table, + parameter = parameter, + cum_welfare = cum_welfare, + cum_population = cum_population, + requested_mean = requested_mean, + mean = mean, + times_mean = times_mean, + lorenz = lorenz, + n_bins = n_bins, + endpoint = endpoint, + type = type, + pass = pass ) return(query_controls) @@ -792,12 +828,7 @@ convert_empty <- function(string) { #' @inheritParams subset_lkup #' @return data.frame #' @keywords internal -subset_ctry_years <- function(country, - year, - lkup, - valid_regions, - data_dir) { - +subset_ctry_years <- function(country, year, lkup, valid_regions, data_dir) { is_agg <- get_caller_names() is_agg <- grepl(pattern = "pip_grp", x = is_agg) |> any() @@ -809,8 +840,8 @@ subset_ctry_years <- function(country, if (!any(c("ALL", "WLD") %in% country)) { # Select regions if (any(country %in% valid_regions)) { - selected_regions <- country[country %in% valid_regions] - keep_regions <- lkup$region_code %in% selected_regions + selected_regions <- country[country %in% valid_regions] + keep_regions <- lkup$region_code %in% selected_regions country_or_region <- "region_code" } else { keep_regions <- rep(FALSE, length(lkup$region_code)) @@ -824,20 +855,20 @@ subset_ctry_years <- function(country, # } # Select years - if (year[1] == "MRV") { + if (year[1] == "MRV") { if (is_agg) { mr <- get_metaregion_table(data_dir) - lkup[mr, - on = "region_code", - lineup_year := i.lineup_year] + lkup[mr, on = "region_code", lineup_year := i.lineup_year] } else { lkup[, lineup_year := reporting_year] } if (country[1] != "ALL") { max_year <- - lkup[get(country_or_region) == country & reporting_year == lineup_year, - reporting_year] |> + lkup[ + get(country_or_region) == country & reporting_year == lineup_year, + reporting_year + ] |> max() } else { max_year <- @@ -863,23 +894,32 @@ subset_ctry_years <- function(country, #' @return list #' @keywords internal clear_cache <- function(cd) { - tryCatch({ - if (cd$size() > 0) { - cd$reset() - n <- cd$size() - if (n == 0) { - out <- list(status = 'success', msg = 'Cache cleared.') + tryCatch( + { + if (cd$size() > 0) { + cd$reset() + n <- cd$size() + if (n == 0) { + out <- list(status = 'success', msg = 'Cache cleared.') + } else { + out <- list( + status = 'error', + msg = sprintf('Something went wrong. %n items remain in cache.', n) + ) + } } else { - out <- list(status = 'error', msg = sprintf('Something went wrong. %n items remain in cache.', n)) + out <- list( + status = 'success', + msg = 'Cache directory is empty. Nothing to clear.' + ) } - } else { - out <- list(status = 'success', msg = 'Cache directory is empty. Nothing to clear.') + return(out) + }, + error = function(e) { + out <- list(status = 'error', msg = 'Cache directory not found.') + return(out) } - return(out) - }, error = function(e){ - out <- list(status = 'error', msg = 'Cache directory not found.') - return(out) - }) + ) } #' Test whether a vector is length zero and IS not NULL @@ -926,16 +966,13 @@ is_empty <- function(x) { #' z <- TRUE #' fillin_list(l) #' l -fillin_list <- function(l, - assign = TRUE) { - +fillin_list <- function(l, assign = TRUE) { # ____________________________________________________________ # Defenses #### - stopifnot( exprs = { + stopifnot(exprs = { is.list(l) is.data.frame(l) == FALSE - } - ) + }) # __________________________________________________________________ # Early returns #### @@ -956,16 +993,17 @@ fillin_list <- function(l, # make sure that all the objects in list are in parent frame if (!all(nm_obj %in% obj_in_parent)) { + non_in_parent <- nm_obj[!nm_obj %in% obj_in_parent] - non_in_parent <-nm_obj[!nm_obj %in% obj_in_parent] - - stop_msg <- paste("The following objects are not in calling function: \n", - paste(non_in_parent, collapse = ", ")) + stop_msg <- paste( + "The following objects are not in calling function: \n", + paste(non_in_parent, collapse = ", ") + ) stop(stop_msg) } - val_obj <- lapply(nm_obj, get, envir = parent.frame()) + val_obj <- lapply(nm_obj, get, envir = parent.frame()) names(val_obj) <- nm_obj for (i in seq_along(nm_obj)) { @@ -980,7 +1018,6 @@ fillin_list <- function(l, } return(invisible(l)) - } #' Returns all auxiliary tables that support the long_format=TRUE parameter @@ -999,27 +1036,24 @@ get_valid_aux_long_format_tables <- function() { #' #' @return data.table #' @keywords internal -get_spr_table <- function(data_dir, - table = c("spr_svy", "spr_lnp")) { - +get_spr_table <- function(data_dir, table = c("spr_svy", "spr_lnp")) { table <- match.arg(table) spr <- tryCatch( expr = { # Your code... - get_aux_table(data_dir = data_dir, - table = table) + get_aux_table(data_dir = data_dir, table = table) }, # end of expr section error = function(e) { data.table::data.table( - country_code = character(0), - reporting_year = numeric(0), - welfare_type = character(0), + country_code = character(0), + reporting_year = numeric(0), + welfare_type = character(0), reporting_level = character(0), - spl = numeric(0), - spr = numeric(0), - median = numeric(0) + spl = numeric(0), + spr = numeric(0), + median = numeric(0) ) } ) # End of trycatch @@ -1035,18 +1069,16 @@ get_spr_table <- function(data_dir, #' @return data.table #' @keywords internal get_metaregion_table <- function(data_dir) { - spr <- tryCatch( expr = { # Your code... - get_aux_table(data_dir = data_dir, - table = "metaregion") + get_aux_table(data_dir = data_dir, table = "metaregion") }, # end of expr section error = function(e) { data.table::data.table( - region_code = character(0), - lineup_year = numeric(0) + region_code = character(0), + lineup_year = numeric(0) ) } ) # End of trycatch @@ -1054,7 +1086,6 @@ get_metaregion_table <- function(data_dir) { } - #' Load prosperity gap table from aux data #' #' If there is no data available, return an empty data.frame @@ -1063,26 +1094,23 @@ get_metaregion_table <- function(data_dir) { #' #' @return data.table #' @keywords internal -get_pg_table <- function(data_dir, - table = c("pg_svy", "pg_lnp")) { - +get_pg_table <- function(data_dir, table = c("pg_svy", "pg_lnp")) { table <- match.arg(table) pg <- tryCatch( expr = { # Your code... - get_aux_table(data_dir = data_dir, - table = table) + get_aux_table(data_dir = data_dir, table = table) }, # end of expr section error = function(e) { data.table::data.table( - country_code = character(0), - reporting_level = character(0), - pg = numeric(0), - welfare_type = character(0), - reporting_year = integer(0) - ) + country_code = character(0), + reporting_level = character(0), + pg = numeric(0), + welfare_type = character(0), + reporting_year = integer(0) + ) } ) # End of trycatch return(pg) @@ -1098,24 +1126,24 @@ get_pg_table <- function(data_dir, #' @return data.table #' @keywords internal add_pg <- function(df, fill_gaps, data_dir) { - - if (fill_gaps) { + if (fill_gaps) { table <- "pg_lnp" } else { table <- "pg_svy" } - pg <- get_pg_table(data_dir = data_dir, - table = table) - - df[pg, - on = c( - "country_code", - "reporting_year", - "welfare_type", - "reporting_level"), - pg := i.pg - ] + pg <- get_pg_table(data_dir = data_dir, table = table) + + df[ + pg, + on = c( + "country_code", + "reporting_year", + "welfare_type", + "reporting_level" + ), + pg := i.pg + ] } #' Add Distribution type @@ -1127,7 +1155,6 @@ add_pg <- function(df, fill_gaps, data_dir) { #' @return data.table #' @keywords internal add_distribution_type <- function(df, lkup, fill_gaps) { - # merge reference table with framework table and get distribution type # from framework rf <- copy(lkup$ref_lkup) |> @@ -1139,11 +1166,10 @@ add_distribution_type <- function(df, lkup, fill_gaps) { reporting_year, surveyid_year )][, - surveyid_year := as.numeric(surveyid_year)] - + surveyid_year := as.numeric(surveyid_year) + ] - fw <- get_aux_table(data_dir = lkup$data_root, - "framework") |> + fw <- get_aux_table(data_dir = lkup$data_root, "framework") |> copy() |> _[, .( country_code, @@ -1155,29 +1181,27 @@ add_distribution_type <- function(df, lkup, fill_gaps) { use_groupdata )] - dt <- collapse::join( - x = rf, - y = fw, - on = c("country_code", "surveyid_year", "survey_acronym"), - how = "left", + x = rf, + y = fw, + on = c("country_code", "surveyid_year", "survey_acronym"), + how = "left", validate = "m:1", - verbose = 0 + verbose = 0 ) if (fill_gaps) { # line up years ---------- - by_vars <- c("country_code", - "reporting_year", - "welfare_type" - ) + by_vars <- c("country_code", "reporting_year", "welfare_type") dt[, - # distribution type by year - distribution_type := fcase(use_groupdata == 1, "group", - use_imputed == 1, "imputed", - default = "micro") + # distribution type by year + distribution_type := fcase( + use_groupdata == 1 , "group" , + use_imputed == 1 , "imputed" , + default = "micro" + ) ][, # find interpolation with different distribution type and # replace by "mixed" @@ -1189,9 +1213,9 @@ add_distribution_type <- function(df, lkup, fill_gaps) { ] dt <- dt[, - # collapse by reporting_year and keep relevant variables - .(distribution_type = unique(distribution_type)), - by = by_vars + # collapse by reporting_year and keep relevant variables + .(distribution_type = unique(distribution_type)), + by = by_vars ] # df[dt, @@ -1201,10 +1225,8 @@ add_distribution_type <- function(df, lkup, fill_gaps) { # # Calculate unique counts of reporting level and add new rows # unique_replevel := uniqueN(reporting_level), # by = c("country_code","reporting_year")] - - } else { - # survey years -------------- + # survey years -------------- by_vars <- c( "country_code", "surveyid_year", @@ -1213,40 +1235,42 @@ add_distribution_type <- function(df, lkup, fill_gaps) { ) dt[, - # distribution type by year - distribution_type := fcase(use_groupdata == 1, "group", - use_imputed == 1, "imputed", - default = "micro") + # distribution type by year + distribution_type := fcase( + use_groupdata == 1 , "group" , + use_imputed == 1 , "imputed" , + default = "micro" + ) ] - dt <- dt[, # collapse by reporting_year and keep relevant variables - .(distribution_type = unique(distribution_type)), - by = by_vars] - + dt <- dt[, + # collapse by reporting_year and keep relevant variables + .(distribution_type = unique(distribution_type)), + by = by_vars + ] } if (!fill_gaps) { df <- df[, - surveyid_year := as.numeric(surveyid_year) + surveyid_year := as.numeric(surveyid_year) ] } - df[dt, - on = by_vars, - distribution_type := i.distribution_type - ][, - # Calculate unique counts of reporting level and add new rows - unique_replevel := uniqueN(reporting_level), - by = by_vars] + df[dt, on = by_vars, distribution_type := i.distribution_type][, + # Calculate unique counts of reporting level and add new rows + unique_replevel := uniqueN(reporting_level), + by = by_vars + ] # distribution type for national cases when aggregate data - - df[unique_replevel == 3 & - reporting_level == "national" & - distribution_type == "group", - distribution_type := "synthetic" - ][, - unique_replevel := NULL] + df[ + unique_replevel == 3 & + reporting_level == "national" & + distribution_type == "group", + distribution_type := "synthetic" + ][, + unique_replevel := NULL + ] setorderv(df, by_vars) return(invisible(df)) @@ -1263,29 +1287,28 @@ add_distribution_type <- function(df, lkup, fill_gaps) { #' @return data.table #' @keywords internal add_spl <- function(df, fill_gaps, data_dir) { - if (fill_gaps) { table <- "spr_lnp" } else { - table <- "spr_svy" + table <- "spr_svy" } spl <- - get_spr_table(data_dir = data_dir, - table = table) - - - out <- df[spl, - on = c( - "country_code", - "reporting_year", - "welfare_type", - "reporting_level" - ), - `:=`( - spl = i.spl, - spr = i.spr - )] + get_spr_table(data_dir = data_dir, table = table) + + out <- df[ + spl, + on = c( + "country_code", + "reporting_year", + "welfare_type", + "reporting_level" + ), + `:=`( + spl = i.spl, + spr = i.spr + ) + ] return(invisible(out)) } @@ -1301,30 +1324,31 @@ add_spl <- function(df, fill_gaps, data_dir) { add_agg_medians <- function(df, fill_gaps, data_dir) { if (fill_gaps) { - table = "spr_lnp" + table = "spr_lnp" # set all lines up medians to NA. df[, median := NA_real_] } else { # if survey data, we keep the ones already calculated and add those # that are missing - table = "spr_svy" + table = "spr_svy" } med <- - get_spr_table(data_dir = data_dir, - table = table) + get_spr_table(data_dir = data_dir, table = table) # join medians to missing data --------- - df[med, - on = c( - "country_code", - "reporting_year", - "welfare_type", - "reporting_level" - ), - # prefer median in df over the one in med as long as the one in - # in df is not NA. If that is the case, select the one in med. - median := fcoalesce(median, i.median)] + df[ + med, + on = c( + "country_code", + "reporting_year", + "welfare_type", + "reporting_level" + ), + # prefer median in df over the one in med as long as the one in + # in df is not NA. If that is the case, select the one in med. + median := fcoalesce(median, i.median) + ] return(invisible(df)) } @@ -1339,15 +1363,15 @@ get_caller_names <- function() { calls <- sys.calls() lcalls <- length(calls) - caller_names <- vector("character" , length = lcalls) + caller_names <- vector("character", length = lcalls) tryCatch( expr = { i <- 1 while (i <= lcalls) { call <- calls[[i]] - call_class <- class(call[[1]]) - call_type <- typeof(call[[1]]) + call_class <- class(call[[1]]) + call_type <- typeof(call[[1]]) call_length <- length(call[[1]]) call[[1]] <- @@ -1375,20 +1399,24 @@ get_caller_names <- function() { }, # end of expr section error = function(err) { - msg <- c(paste("Error in call",i), - paste("class:", call_class), - paste("type:", call_type), - paste("length:", call_length), - paste("text:", call_text)) + msg <- c( + paste("Error in call", i), + paste("class:", call_class), + paste("type:", call_type), + paste("length:", call_length), + paste("text:", call_text) + ) rlang::abort(msg, parent = err) }, # end of error section warning = function(w) { - msg <- c(paste("Warning in call",i), - paste("class:", call_class), - paste("type:", call_type), - paste("length:", call_length), - paste("text:", call_text)) + msg <- c( + paste("Warning in call", i), + paste("class:", call_class), + paste("type:", call_type), + paste("length:", call_length), + paste("text:", call_text) + ) rlang::warn(msg, parent = w) } ) # End of trycatch @@ -1406,23 +1434,16 @@ get_caller_names <- function() { #' @keywords internal #' @return data.table from pip or pip_grp functions. add_vars_out_of_pipeline <- function(out, fill_gaps, lkup) { - ## Add SPL and SPR --------------- - out <- add_spl(df = out, - fill_gaps = fill_gaps, - data_dir = lkup$data_root) + out <- add_spl(df = out, fill_gaps = fill_gaps, data_dir = lkup$data_root) ## Add prosperity Gap ----------- - out <- add_pg(df = out, - fill_gaps = fill_gaps, - data_dir = lkup$data_root) + out <- add_pg(df = out, fill_gaps = fill_gaps, data_dir = lkup$data_root) ## add distribution type ------------- # based on info in framework data, rather than welfare data - out <- add_distribution_type(df = out, - lkup = lkup, - fill_gaps = fill_gaps) + out <- add_distribution_type(df = out, lkup = lkup, fill_gaps = fill_gaps) invisible(out) } @@ -1446,7 +1467,6 @@ add_vars_out_of_pipeline <- function(out, fill_gaps, lkup) { #' unnest_dt_longer(df, grep("^list_column", names(df), value = TRUE)) #' } unnest_dt_longer <- function(tbl, cols) { - tbl <- data.table::as.data.table(tbl) clnms <- rlang::syms(setdiff(colnames(tbl), cols)) @@ -1468,30 +1488,43 @@ unnest_dt_longer <- function(tbl, cols) { #' @return data.table with with fgt, mean and median #' @keywords internal get_mean_median <- \(fgt, lkup, fill_gaps) { - - if (isFALSE(lkup$use_new_lineup_version)) return(fgt) + if (isFALSE(lkup$use_new_lineup_version)) { + return(fgt) + } if (fill_gaps) { - dist <- get_vars(lkup$lineup_dist_stats, - c("country_code", "reporting_year", - "reporting_level", "mean", "median")) + dist <- get_vars( + lkup$lineup_dist_stats, + c("country_code", "reporting_year", "reporting_level", "mean", "median") + ) by_var <- c('country_code', "reporting_year", "reporting_level") } else { - dist <- get_vars(lkup$dist_stats, - c("country_code", "reporting_year", - "reporting_level", "mean", - "survey_median_ppp", "welfare_type")) + dist <- get_vars( + lkup$dist_stats, + c( + "country_code", + "reporting_year", + "reporting_level", + "mean", + "survey_median_ppp", + "welfare_type" + ) + ) setnames(dist, "survey_median_ppp", "median") - by_var <- c('country_code', - "reporting_year", - "reporting_level", - "welfare_type") + by_var <- c( + 'country_code', + "reporting_year", + "reporting_level", + "welfare_type" + ) } - join(x = fgt, - y = dist, - on = by_var, - how = "left", - validate = "m:1", # multiple povlines - verbose = 0L) + join( + x = fgt, + y = dist, + on = by_var, + how = "left", + validate = "m:1", # multiple povlines + verbose = 0L + ) } diff --git a/inst/TMP/TMP_API_launcher.R b/inst/TMP/TMP_API_launcher.R index 7ce361b3..a5f5f5a9 100644 --- a/inst/TMP/TMP_API_launcher.R +++ b/inst/TMP/TMP_API_launcher.R @@ -29,6 +29,3 @@ if (Sys.info()[["user"]] == "wb384996") { } - - - diff --git a/man/pip_agg.Rd b/man/pip_agg.Rd index 51417e2e..87a215a3 100644 --- a/man/pip_agg.Rd +++ b/man/pip_agg.Rd @@ -10,7 +10,7 @@ pip_agg( country = "ALL", year = "ALL", povline = 1.9, - group_by = c("wb", "none"), + group_by = "wb", welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national"), lkup, @@ -23,7 +23,7 @@ pip_grp_logic( country = "ALL", year = "ALL", povline = 1.9, - group_by = c("wb", "none"), + group_by = "wb", welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national"), lkup, @@ -38,6 +38,7 @@ pip_grp_new( povline = 1.9, welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national"), + group_by = "wb", lkup, censor = TRUE, additional_ind = FALSE, diff --git a/man/pip_aggregate_by.Rd b/man/pip_aggregate_by.Rd index 642567e7..ee318e3c 100644 --- a/man/pip_aggregate_by.Rd +++ b/man/pip_aggregate_by.Rd @@ -4,13 +4,15 @@ \alias{pip_aggregate_by} \title{Aggregate by predefined groups} \usage{ -pip_aggregate_by(df, country = "ALL", return_cols = NULL) +pip_aggregate_by(df, country = "ALL", group_by = "wb", return_cols = NULL) } \arguments{ \item{df}{data.frame: Response from \code{fg_pip_old()} or \code{rg_pip()}.} \item{country}{character: Selected countries / regions} +\item{group_by}{character: Grouping variable (default is "wb")} + \item{return_cols}{list: lkup$return_cols$pip_grp object. Controls returned columns} } diff --git a/man/pip_grp.Rd b/man/pip_grp.Rd index ad4c2423..03895a7f 100644 --- a/man/pip_grp.Rd +++ b/man/pip_grp.Rd @@ -8,7 +8,7 @@ pip_grp( country = "ALL", year = "ALL", povline = 1.9, - group_by = c("wb", "none"), + group_by = "wb", welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national"), lkup,