-
Notifications
You must be signed in to change notification settings - Fork 1
/
02.SimulateBoostrappedData.R
102 lines (85 loc) · 4.19 KB
/
02.SimulateBoostrappedData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Loading the required libraries
library(caret)
library(xgboost)
library(ggplot2)
library(ggplotify)
library(hrbrthemes)
library(parallel)
library(gridExtra)
library(corrplot)
library(factoextra)
library(FactoMineR)
library(fitdistrplus)
library(dplyr)
library(haven)
# Reading the STATA .dta data, filtering by Year with dfRes, and adding ADM codes&names
dfStata = read_dta("drought_adm2_merged.dta")
# Defining the columns to be kept
sIDs = c("adm0_code", "adm1_code", "adm2_code", "year")
sHHIDs = c("hhid", "adm0_code", "adm1_code", "adm2_code", "year")
sFeats = c("female", "child_out_sch", "pri_age_ch", "working", "children", "senior", "head_female", "head_education",
"head_age", "head_lit", "head_act", "head_mar", "head_ageg", "hhsize", "province", "region",
"region_urban", "surmonth", "urban", "pw", "own", "rooms", "area", "frame",
"car", "motorcycle", "bicycle", "radio", "cassette", "tvbw", "tvcolor", "vcr",
"computer", "cellphone", "freezer", "refrigerator", "freezrefrig", "gasstove", "vacuum", "washingmachine",
"sewingmachine", "fan", "mobile_ac", "mobile_gas_ac", "dishwasher", "neither", "pipedwater", "electricity",
"naturalgas", "phone", "internet", "bath", "kitchen", "cooler", "centralac", "centralheat",
"package", "gas_cooler", "sewage", "cookfuel", "heatfuel", "hotwater_fuel",
"male", "adults", "dependecy_r")
sFeatsPlus = c("scpdsi", "asi", "era5land_t2m", "era5land_tp", "gws", "rtzsm", "sfsm",
"spei01", "spei03", "spei06", "spei12", "spei24", "spei48",
"spi01", "spi03", "spi06", "spi09", "spi12")
sTgts = c("pcer", "pcer_17", "pline685")
# Keeping only the columns listed above
dfStata = dfStata[,names(dfStata) %in% c(sHHIDs, sFeats, sTgts)]
sTgts = c("pcer_pline", "pcer_17_pline")
# Dropping NAs
dfStata = na.omit(dfStata)
dfStata = dfStata[!dfStata$adm0_code=="",,drop=FALSE]
dfStata = dfStata[!dfStata$adm1_code=="",,drop=FALSE]
dfStata = dfStata[!dfStata$adm2_code=="",,drop=FALSE]
# Keeping just 2019
dfStata = dfStata[dfStata$year==2019,,drop=F]
# Loading the bootstrapped data
dfIDs = read.csv("boostrapped_data_IDs.csv")
dfBootData = readRDS(file="boostrapped_data_10000samples.RData")
# For every target variable, do...
dfResModel = list()
for (j in 1:length(sTgts)) {
# Progress message
sTgt = sTgts[j]
message("Processing target variable: ", sTgt, " | ")
# Reading the trained model from the disk
xgbTree_model = readRDS(paste("xgb_model_featsSTATplus_trgtVar_",sTgt,".rds",sep=""))
# For every bootstrapped combination in lRes, do...
pb = txtProgressBar(min=1, max=dim(dfBootData)[1], initial=1, style=3)
for (i in 1:dim(dfBootData)[1]) {
# Adding the ID information
dfDataset = as.data.frame(dfBootData[i,,])
names(dfDataset) = sFeatsPlus
dfDataset = cbind(dfIDs,dfDataset)
dfDataset = na.omit(dfDataset)
# Preparing the data for running the predictions
dfStataPlus = merge(dfStata, dfDataset,
by.x=c("adm0_code", "adm1_code", "adm2_code", "year"),
by.y=c("ADM0_CODE", "ADM1_CODE", "ADM2_CODE", "Year"),
all.x=TRUE)
dfStataPlus = na.omit(dfStataPlus)
dfDatasetIDs = dfStataPlus[,names(dfStataPlus) %in% c(sIDs, "ADM0_NAME", "ADM1_NAME", "ADM2_NAME")]
dfDatasetStata = dfStataPlus[,names(dfStataPlus) %in% c(sIDs, "ADM0_NAME", "ADM1_NAME", "ADM2_NAME", sFeats)]
dfDataset = dfStataPlus[,names(dfStataPlus) %in% c(sFeats, sFeatsPlus),drop=FALSE]
dfDatasetXGB = xgb.DMatrix(data=as.matrix(dfDataset))
# Predicting the new results
dfResNewData = predict(xgbTree_model, newdata=dfDatasetXGB)
# Storing the results
if (i == 1) { dfResModel[[j]] = dfDatasetIDs }
dfResModel[[j]] = cbind(dfResModel[[j]], dfResNewData)
names(dfResModel[[j]])[NCOL(dfResModel[[j]])] = paste("boot_",i,sep="")
# Updating the progress bar
setTxtProgressBar(pb,i)
}
}
# Saving the results to the disk
write.csv(dfDatasetIDs, "xgb_model_dfDatasetIDs.csv", row.names=F)
write.csv(dfDatasetStata, "xgb_model_dfDatasetStata.csv", row.names=F)
saveRDS(dfResModel, file="xgb_model_featsSTATplus_boostrapped_data_10000samples.RData")