# run_nested_cv.R



# this isn't run in targets because the pipeline slowdown is just too much

source("packages.R")
source("conflicts.R")
R.utils::sourceDirectory('R')

# Reproducing the original nested CV folds hinges on the split seeds drawn
# below. Seeds are handed out by row position, so the (awkward) ordering of
# the datasets has to be left alone for each dataset to keep its original
# seeds. Note to self: tidy up the ordering if everything is ever re-run at
# once.
set.seed(329)

data_meta <-
  tibble::tribble(
    ~name, ~pkg, ~mode, ~outcome,
    # regression tasks
    "meats", "modeldata", "regression", "protein",
    "ames", "modeldata", "regression", "Sale_Price",
    "Chicago", "modeldata", "regression", "ridership",
    "biomass", "modeldata", "regression", "HHV",
    "car_prices", "modeldata", "regression", "Price",
    "chem_proc_yield", "modeldata", "regression", "yield",
    "concrete", "modeldata", "regression", "compressive_strength",
    "hotel_rates", "modeldata", "regression", "avg_price_per_room",
    "leaf_id_flavia", "modeldata", "regression", "correlation",
    "permeability_qsar", "modeldata", "regression", "permeability",
    "data_chimiometrie_2019", "modeldatatoo", "regression", "soy_oil",
    # classification tasks
    "ischemic_stroke", "modeldata", "classification", "stroke",
    "ad_data", "modeldata", "classification", "Class",
    "attrition", "modeldata", "classification", "Attrition",
    "cells", "modeldata", "classification", "class",
    "credit_data", "modeldata", "classification", "Status",
    "grants_other", "modeldata", "classification", "class",
    "lending_club", "modeldata", "classification", "Class",
    "mlc_churn", "modeldata", "classification", "churn",
    "pd_speech", "modeldata", "classification", "class",
    "stackoverflow", "modeldata", "classification", "Remote",
    "steroidogenic_toxicity", "modeldata", "classification", "class",
    "taxi", "modeldata", "classification", "tip",
    "wa_churn", "modeldata", "classification", "churn"
  ) %>%
  # five splits per dataset, each with its own unique seed
  expand_grid(iteration = seq_len(5)) %>%
  mutate(split_seed = sample(1e6, size = n(), replace = FALSE)) %>%
  # iteration was only needed to replicate the rows
  select(-iteration) %>%
  # every dataset / split pair is crossed with every learner
  expand_grid(
    model_id = c("aorsf", "ranger", "glmnet", "xgboost",
                 "kernlab", "dbarts", "earth")
  )

# Task keys (dataset, split seed, model) that already have saved results.
already_done <- read_rds('data/data_results.rds') %>%
  distinct(name, split_seed, model_id)

# data_meta <- data_meta %>%
#   mutate(slurm_id = seq(n())-1) %>%
#   anti_join(already_done)

# Keep only the tasks that have no saved result yet. The join keys are
# spelled out so the match does not depend on whichever columns the two
# tables happen to share, and so dplyr does not print its
# "Joining with `by = ...`" message on every run.
data_meta <- data_meta %>%
  anti_join(already_done, by = c("name", "split_seed", "model_id"))

# #  for debugging
# filter(data_meta,
#        name == 'grants_other',
#        model_id == 'dbarts') %>%
#   slice(1) %>%
#   as.list() %>%
#   list2env(envir = globalenv())


# Run a single nested cross-validation task: tune every preprocessing recipe
# for one model on a training split, refit the best workflow on the full
# training split, and score it on the held-out test split.
#
# Arguments (one row of `data_meta`; rslurm requires every function input to
# be a column of the params data frame):
#   name       - dataset name, passed to initialize_data()
#   pkg        - package that holds the dataset, passed to initialize_data()
#   mode       - 'regression' or 'classification'
#   outcome    - name of the outcome column
#   split_seed - seed set immediately before the train/test split and the
#                CV folds are drawn
#   model_id   - learner to tune ('aorsf', 'ranger', 'glmnet', 'xgboost',
#                'kernlab', 'dbarts' or 'earth')
#
# Returns a tibble with the task identifiers, the test-set metrics together
# with tuning and fitting times in seconds, and a list-column
# `score_internal` holding the best resampled score of each workflow.
run_nested_cv <- function(name,
                          pkg,
                          mode,
                          outcome,
                          split_seed,
                          model_id){

  # dependencies are loaded from the parent directory; presumably the job
  # runs from a sub-folder of the project -- TODO confirm.
  source("../packages.R")
  source("../conflicts.R")

  lapply(list.files("../R", full.names = TRUE), source)

  # don't make this a function argument b/c then ill have to
  # add another column to data_meta (rslurm requires all function
  # inputs be provided in the data frame).
  verbose <- FALSE

  data <- initialize_data(name, pkg, outcome)

  formula <- initialize_formula(outcome)

  metrics <- initialize_metrics(mode)

  # metric used to rank workflows and to pick tuning parameters.
  # fail right away on an unknown mode instead of carrying a NULL metric
  # name into the ranking code further down.
  metric_string <- switch(mode,
                          classification = 'roc_auc',
                          regression = 'rsq_trad',
                          stop("unsupported mode: ", mode, call. = FALSE))

  # everything random from here on (split, folds) follows from split_seed
  set.seed(split_seed)

  split <- initial_split(data = data)

  train <- training(split) %>%
    droplevels()

  test <- testing(split) %>%
    droplevels()

  resamples <- vfold_cv(train, v = 5)

  model_params <- initialize_models(data, model_id, outcome, mode)

  model_grid <- model_params$model_grid[[1]]

  # recommended order (https://recipes.tidymodels.org/articles/Ordering.html)
  # 1. Impute
  # 2. Handle factor levels
  # 3. Individual transformations for skewness and other issues
  # 4. Discretize (not used)
  # 5. Create dummy variables
  # 6. Create interactions (not used)
  # 7. Normalization steps (center, scale, range, etc)
  # 8. Multivariate transformation (e.g. PCA, spatial sign, etc)

  # consider revising ind transform to allow for spline, yeo, or nothing

  # one row per combination of preprocessing choices; each row gets its own
  # recipe plus the tuning grid that matches that recipe.
  recipe_data <- expand_grid(
    imputation = c("meanmode", "nearest_neighbor"),
    yeo_johnson = c("no", "yes"),
    pca = c("no", "yes"),
    variable_selection = c('no', 'yes')
  ) %>%
    mutate(
      # note: the function used here relies on objects defined above.
      recipe = pmap(
        .l = list(imputation, yeo_johnson, pca, variable_selection),
        .f = function(imputation, yeo_johnson, pca, variable_selection){

          preprocessor <- recipe(train, formula = formula) %>%
            # sometimes splitting creates constant cols, so drop em.
            # more nuance would be good if this analysis focused
            # on a specific dataset, but for analyzing many
            # datasets the most reasonable thing seems to be dropping
            # constant cols when the data split creates them.
            step_zv(all_predictors())

          # the tuning grid changes depending on the recipe
          .model_grid <- model_grid

          # impute ----

          if(imputation == 'meanmode'){

            preprocessor %<>%
              step_impute_mean(all_numeric_predictors()) %>%
              step_impute_mode(all_nominal_predictors())

          } else if (imputation == 'nearest_neighbor'){

            preprocessor %<>%
              step_impute_knn(all_predictors())

          }

          # handle factor levels ----

          # long table with the share of training rows in each level of
          # each factor predictor (the outcome is excluded)
          cats <- train %>%
            select(where(is.factor), -any_of(outcome)) %>%
            map_dfr(~enframe(prop.table(table(.x)),
                             name = 'category',
                             value = 'percent'),
                    .id = 'variable')

          if(nrow(cats) > 0){

            # factors with more than 10 levels, or with a level count above
            # 10% of the training rows
            high_dimensional_cats <- cats %>%
              count(variable) %>%
              filter(n > 10 | n / nrow(train) > 0.1) %>%
              pull(variable)

            # numerically encode variables with high dimension
            if(!is_empty(high_dimensional_cats)){

              preprocessor %<>%
                step_lencode_glm(all_of(high_dimensional_cats),
                                 outcome = vars(outcome))

            }

            # remaining factors that have at least one level below 5%
            sparse_cats <- cats %>%
              filter(percent < 0.05) %>%
              pull(variable) %>%
              unique() %>%
              setdiff(high_dimensional_cats)

            if(!is_empty(sparse_cats)){

              preprocessor %<>% step_other(any_of(sparse_cats),
                                           other = "TEMP_other",
                                           threshold = 0.05)

            }

          }

          # individual transformations for skewness and other issues ----

          if(yeo_johnson == 'yes'){

            preprocessor %<>%
              step_YeoJohnson(all_numeric_predictors())

          }

          # dummy coding ----

          preprocessor %<>% step_dummy(all_nominal_predictors())

          # normalization steps ----

          preprocessor %<>% step_normalize(all_numeric_predictors())

          # multivariate transformation ----

          if(pca == 'yes'){

            preprocessor %<>%
              step_pca(all_numeric_predictors(), num_comp = tune())

            # tune the number of components over 5, 10, 15, 20
            .model_grid %<>% cross_join(tibble(num_comp = seq(4) * 5))

          }

          if(variable_selection == 'yes'){

            variable_selection_engine <- model_id

            # engines not currently supported by colino
            if(variable_selection_engine %in% c('kernlab', 'dbarts')){
              variable_selection_engine <- 'ranger'
            }

            # importance type is only set for the two forest engines
            importance <- NULL

            if(variable_selection_engine == 'ranger'){
              importance <- 'permutation'
            } else if (variable_selection_engine == 'aorsf'){
              importance <- 'anova'
            }

            variable_selection_spec <- switch(
              variable_selection_engine,
              'aorsf' = rand_forest(),
              'ranger' = rand_forest(),
              'glmnet' = if(mode == 'regression') linear_reg() else logistic_reg(),
              'xgboost' = boost_tree(),
              'earth' = mars()
            ) %>%
              set_mode(mode)

            # the earth engine is set without an importance argument
            if(model_id == 'earth'){
              variable_selection_spec %<>%
                set_engine(variable_selection_engine)
            } else {
              variable_selection_spec %<>%
                set_engine(variable_selection_engine,
                           importance = importance)
            }


            # create a preprocessing recipe
            preprocessor  %<>% step_select_vip(all_predictors(),
                                               outcome = vars(outcome),
                                               model = variable_selection_spec,
                                               threshold = tune())

            # tune the selection threshold over 0.2, 0.4, 0.6, 0.8
            .model_grid %<>% cross_join(tibble(threshold = seq(4) / 5))

          }

          tibble(preproc = list(preprocessor),
                 grid = list(.model_grid))

        }
      )
    ) %>%
    unnest(recipe) %>%
    mutate(
      recipe_id = glue(
      "{imputation}..yeo_{yeo_johnson}..pca_{pca}..vs_{variable_selection}"
      )
    )

  # glmnet is only run without the variable selection step
  if(model_id == 'glmnet'){
    recipe_data %<>% filter(variable_selection == 'no')
  }

  # with under 5% missing data, only the mean/mode imputation recipes are kept
  if(miss_prop_summary(data)$df < 0.05){
    recipe_data %<>% filter(imputation == 'meanmode')
  }

  wf_set <- workflow_set(preproc = recipe_data$preproc,
                         models = model_params$model_spec) %>%
    mutate(wflow_id = glue("{model_id}_{recipe_data$recipe_id}"))

  # attach each recipe's own tuning grid to its workflow.
  # seq_len() keeps the loop from running when there are no workflows
  # (seq(0) would iterate over c(1, 0)).
  for(i in seq_len(nrow(wf_set))){

    wf_set %<>% option_add(
      id = wf_set$wflow_id[i],
      grid = recipe_data$grid[[i]]
    )

  }


  time_tune_start <- Sys.time()
  wf_res <- workflow_map(wf_set,
                         fn = 'tune_grid',
                         verbose = TRUE,
                         resamples = resamples,
                         metrics = metrics,
                         control = control_grid(verbose = verbose))
  time_tune_stop <- Sys.time()

  # best resampled score of every workflow
  score_internal <- wf_res %>%
    collect_metrics(summarize = TRUE) %>%
    # results are returned for each specific recipe, including the
    # multiple recipes used when a preprocessor parameter is tuned.
    filter(.metric == metric_string) %>%
    group_by(wflow_id) %>%
    slice_max(mean) %>%
    select(wflow_id, mean, std_err) %>%
    distinct() # sometimes there are ties within a wflow id

  # pull out most performant wflow
  wf_best_id <- wf_res %>%
    rank_results(rank_metric = metric_string) %>%
    filter(.metric == metric_string) %>%
    slice_max(mean) %>%
    dplyr::slice(1) %>% # sometimes there are ties within a wflow id
    pull(wflow_id)

  # Extract the best parameters for the workflow
  best_params <- wf_res %>%
    filter(wflow_id == wf_best_id) %>%
    pull(result) %>%
    .[[1]] %>%
    select_best(metric = metric_string)

  # Finalize the workflow with the best parameters
  final_workflow <- wf_res %>%
    extract_workflow(id = wf_best_id) %>%
    finalize_workflow(best_params)


  # TODO: use extract_time when it becomes more accessible
  time_fit_start <- Sys.time()
  fit_final <- fit(final_workflow, data = train)
  time_fit_stop <- Sys.time()

  pred_col <- infer_pred_col(data, mode, outcome)

  pred <- predict(fit_final,
                  new_data = test,
                  type = infer_pred_type(mode))

  pred_final <- tibble(.pred = pred[[pred_col]],
                       .outcome = test[[outcome]])

  # test-set metrics, with wall-clock tuning and fitting times in seconds
  score_external <- metrics(pred_final, truth = .outcome, .pred) %>%
    select(-.estimator) %>%
    mutate(
      time_tune = difftime(time_tune_stop,
                           time_tune_start,
                           units = 's'),
      time_fit = difftime(time_fit_stop,
                          time_fit_start,
                          units = 's'))

  # task identifiers + external scores, with internal scores nested
  tibble(name = name,
         pkg = pkg,
         mode = mode,
         outcome = outcome,
         split_seed = split_seed,
         model_id = model_id) %>%
    bind_cols(score_external) %>%
    mutate(score_internal = list(score_internal))

}


# Submit the remaining tasks to SLURM using rslurm, one node per row of
# data_meta. When every task already has a saved result, data_meta is empty
# and `nodes` would be 0, so the submission is skipped instead of asking
# rslurm for a zero-node job.
if (nrow(data_meta) == 0) {

  message("All nested CV tasks already have results; nothing to submit.")
  sjob <- NULL

} else {

  sjob <- rslurm::slurm_apply(
    run_nested_cv,                  # Function to apply
    params = data_meta,             # Data to pass to the function (each row is a task)
    jobname = "run_nested_cv_job",  # Job name for SLURM
    nodes = nrow(data_meta),        # Number of nodes to use
    cpus_per_node = 1,              # Number of CPUs per node
    slurm_options = list(time = "168:00:00",
                         partition = "general",
                         "mem-per-cpu" = "24G")  # SLURM options
  )

}