02.SimulateBoostrappedData.R

# Loading the required libraries
library(caret)
library(xgboost)
library(ggplot2)
library(ggplotify)
library(hrbrthemes)
library(parallel)
library(gridExtra)
library(corrplot)
library(factoextra)
library(FactoMineR)
library(fitdistrplus)
library(dplyr)
library(haven)

# Reading the STATA .dta data, filtering by Year with dfRes, and adding ADM codes&names
dfStata = read_dta("drought_adm2_merged.dta")

# Defining the columns to be kept
sIDs = c("adm0_code", "adm1_code", "adm2_code", "year")
sHHIDs = c("hhid", "adm0_code", "adm1_code", "adm2_code", "year")
sFeats = c("female", "child_out_sch", "pri_age_ch", "working", "children", "senior", "head_female", "head_education", 
           "head_age", "head_lit", "head_act", "head_mar", "head_ageg", "hhsize", "province", "region", 
           "region_urban", "surmonth", "urban", "pw", "own", "rooms", "area", "frame", 
           "car", "motorcycle", "bicycle", "radio", "cassette", "tvbw", "tvcolor", "vcr", 
           "computer", "cellphone", "freezer", "refrigerator", "freezrefrig", "gasstove", "vacuum", "washingmachine", 
           "sewingmachine", "fan", "mobile_ac", "mobile_gas_ac", "dishwasher", "neither", "pipedwater", "electricity", 
           "naturalgas", "phone", "internet", "bath", "kitchen", "cooler", "centralac", "centralheat", 
           "package", "gas_cooler", "sewage", "cookfuel", "heatfuel", "hotwater_fuel", 
           "male", "adults", "dependecy_r")
sFeatsPlus = c("scpdsi", "asi", "era5land_t2m", "era5land_tp", "gws", "rtzsm", "sfsm", 
              "spei01", "spei03", "spei06", "spei12", "spei24", "spei48", 
              "spi01", "spi03", "spi06", "spi09", "spi12")
sTgts = c("pcer", "pcer_17", "pline685")

# Keeping only the columns listed above
dfStata = dfStata[,names(dfStata) %in% c(sHHIDs, sFeats, sTgts)]
sTgts = c("pcer_pline", "pcer_17_pline")

# Dropping NAs
dfStata = na.omit(dfStata)
dfStata = dfStata[!dfStata$adm0_code=="",,drop=FALSE]
dfStata = dfStata[!dfStata$adm1_code=="",,drop=FALSE]
dfStata = dfStata[!dfStata$adm2_code=="",,drop=FALSE]

# Keeping just 2019
dfStata = dfStata[dfStata$year==2019,,drop=F]

# Loading the bootstrapped data
dfIDs = read.csv("boostrapped_data_IDs.csv")
dfBootData = readRDS(file="boostrapped_data_10000samples.RData")

# For every target variable, do...
dfResModel = list()
for (j in 1:length(sTgts)) {
  # Progress message
  sTgt = sTgts[j]
  message("Processing target variable: ", sTgt, " | ")
  
  # Reading the trained model from the disk
  xgbTree_model = readRDS(paste("xgb_model_featsSTATplus_trgtVar_",sTgt,".rds",sep=""))
  
  # For every bootstrapped combination in lRes, do...
  pb = txtProgressBar(min=1, max=dim(dfBootData)[1], initial=1, style=3)
  for (i in 1:dim(dfBootData)[1]) {
    
    # Adding the ID information
    dfDataset = as.data.frame(dfBootData[i,,])
    names(dfDataset) = sFeatsPlus
    dfDataset = cbind(dfIDs,dfDataset)
    dfDataset = na.omit(dfDataset)
    
    # Preparing the data for running the predictions
    dfStataPlus = merge(dfStata, dfDataset, 
                        by.x=c("adm0_code", "adm1_code", "adm2_code", "year"),
                        by.y=c("ADM0_CODE", "ADM1_CODE", "ADM2_CODE", "Year"),
                        all.x=TRUE)
    dfStataPlus = na.omit(dfStataPlus)
    dfDatasetIDs = dfStataPlus[,names(dfStataPlus) %in% c(sIDs, "ADM0_NAME", "ADM1_NAME", "ADM2_NAME")]
    dfDatasetStata = dfStataPlus[,names(dfStataPlus) %in% c(sIDs, "ADM0_NAME", "ADM1_NAME", "ADM2_NAME", sFeats)]
    dfDataset = dfStataPlus[,names(dfStataPlus) %in% c(sFeats, sFeatsPlus),drop=FALSE]
    dfDatasetXGB = xgb.DMatrix(data=as.matrix(dfDataset))
    
    # Predicting the new results
    dfResNewData = predict(xgbTree_model, newdata=dfDatasetXGB)
    
    # Storing the results
    if (i == 1) { dfResModel[[j]] = dfDatasetIDs }
    dfResModel[[j]] = cbind(dfResModel[[j]], dfResNewData)
    names(dfResModel[[j]])[NCOL(dfResModel[[j]])] = paste("boot_",i,sep="")
    
    # Updating the progress bar
    setTxtProgressBar(pb,i)
    
  }
  
}

# Saving the results to the disk
write.csv(dfDatasetIDs, "xgb_model_dfDatasetIDs.csv", row.names=F)
write.csv(dfDatasetStata, "xgb_model_dfDatasetStata.csv", row.names=F)
saveRDS(dfResModel, file="xgb_model_featsSTATplus_boostrapped_data_10000samples.RData")