diff --git a/3_rowcrop/01_ERA5_nc_to_clim.R b/3_rowcrop/01_ERA5_nc_to_clim.R new file mode 100755 index 0000000..0fc5e51 --- /dev/null +++ b/3_rowcrop/01_ERA5_nc_to_clim.R @@ -0,0 +1,93 @@ +#!/usr/bin/env Rscript + +# Converts ERA5 meteorology data from PEcAn's standard netCDF format +# to Sipnet `clim` driver files. + +# This is basically a thin wrapper around `met2model.SIPNET()`. +# Only the filenames are specific to ERA5 by assuming each file is named +# "ERA5..nc" with ens_id between 1 and 10. + +## --------- runtime values: change for your system and simulation --------- + +options <- list( + optparse::make_option("--site_era5_path", + default = "data_raw/ERA5_nc", + help = paste( + "Path to your existing ERA5 data in PEcAn CF format, organized as", + "single-site, single-year netcdfs in subdirectories per ensemble member.", + "Files should be named", + "'/ERA5__/ERA5...nc'" + ) + ), + optparse::make_option("--site_sipnet_met_path", + default = "data/ERA5_SIPNET", + help = paste( + "Output path:", + "single-site, multi-year Sipnet clim files, one per ensemble member.", + "Files will be named", + "//ERA5....clim" + ) + ), + optparse::make_option("--site_info_file", + default = "site_info.csv", + help = "CSV file with one row per location. 
Only the `id` column is used", + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = "Date to begin clim file", + ), + optparse::make_option("--end_date", + default = "2023-12-31", + help = "Date to end clim file", + ), + optparse::make_option("--n_cores", + default = 1L, + help = "number of CPUs to use in parallel", + ), + optparse::make_option("--parallel_strategy", + default = "multisession", + help = "Strategy for parallel conversion, passed to future::plan()", + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + +# ----------- end system-specific --------------------------------- + + +future::plan(args$parallel_strategy, workers = args$n_cores) + +site_info <- read.csv(args$site_info_file) +site_info$start_date <- args$start_date +site_info$end_date <- args$end_date + + +file_info <- site_info |> + dplyr::rename(site_id = id) |> + dplyr::cross_join(data.frame(ens_id = 1:10)) + +if (!dir.exists(args$site_sipnet_met_path)) { + dir.create(args$site_sipnet_met_path, recursive = TRUE) +} +furrr::future_pwalk( + file_info, + function(site_id, start_date, end_date, ens_id, ...) { + PEcAn.SIPNET::met2model.SIPNET( + in.path = file.path( + args$site_era5_path, + paste("ERA5", site_id, ens_id, sep = "_") + ), + start_date = args$start_date, + end_date = args$end_date, + in.prefix = paste0("ERA5.", ens_id), + outfolder = file.path(args$site_sipnet_met_path, site_id) + ) + } +) diff --git a/3_rowcrop/02_ic_build.R b/3_rowcrop/02_ic_build.R new file mode 100755 index 0000000..9c07737 --- /dev/null +++ b/3_rowcrop/02_ic_build.R @@ -0,0 +1,451 @@ +#!/usr/bin/env Rscript + +# Creates initial condition (IC) files for each location in site_info, +# using values from data_dir if they are already present, +# or looking them up and writing them to data_dir if they are not. 
+ +options <- list( + optparse::make_option("--site_info_path", + default = "site_info.csv", + help = "CSV giving ids, locations, and PFTs for sites of interest" + ), + optparse::make_option("--field_shape_path", + default = "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb", + help = "file containing site geometries, used for extraction from rasters" + ), + optparse::make_option("--ic_ensemble_size", + default = 100, + help = "number of files to generate for each site" + ), + optparse::make_option("--run_start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "For now, start date must be same for all sites,", + "and some download/extraction functions rely on this.", + "Workaround: Call this script separately for sites whose dates differ" + ) + ), + optparse::make_option("--run_LAI_date", + default = "2016-07-01", + help = "Date to look near (up to 30 days each direction) for initial LAI" + ), + optparse::make_option("--ic_outdir", + default = "IC_files", + help = "Directory to write completed initial conditions as nc files" + ), + optparse::make_option("--data_dir", + default = "data/IC_prep", + help = "Directory to store data retrieved/computed in the IC build process" + ), + optparse::make_option("--pft_dir", + default = "pfts", + help = paste( + "path to parameter distributions used for PFT-specific conversions", + "from LAI to estimated leaf carbon.", + "Must be path to a dir whose child subdirectory names match the", + "`site.pft` column of site_info and that contain a file", + "`post.distns.Rdata`" + ) + ), + optparse::make_option("--params_read_from_pft", + default = "SLA,leafC", # SLA units are m2/kg, leafC units are % + help = "Parameters to read from the PFT file, comma separated" + ), + optparse::make_option("--landtrendr_raw_files", + default = paste0( + "data_raw/ca_biomassfiaald_2016_median.tif,", + "data_raw/ca_biomassfiaald_2016_stdv.tif" + ), + help = paste( + "Paths to two geotiffs, with a comma between them.", + "These 
should contain means and standard deviations of aboveground", + "biomass on the start date.", + "We used Landtrendr-based values from the Kennedy group at Oregon State,", + "which require manual download.", + "Medians are available by anonymous FTP at islay.ceoas.oregonstate.edu", + "and by web (but possibly this is a different version?) from", + "https://emapr.ceoas.oregonstate.edu/pages/data/viz/index.html", + "The uncertainty layer was formerly distributed by FTP but I cannot find", + "it on the ceoas server at the moment.", + "TODO find out whether this is available from a supported source.", + "", + "Demo used a subset (year 2016 clipped to the CA state boundaries)", + "of the 30-m CONUS median and stdev maps that are stored on the Dietze", + "lab server" + ) + ), + optparse::make_option("--additional_params", + # Wood C fraction isn't in these PFTs, so just using my estimate. + # TODO update from a citeable source, + # and consider adding to PFT when calibrating + default = + "varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005", + help = paste( + "Further params not available from site or PFT data,", + "as a comma-separated named list with names `varname`, `distn`,", + "`parama`, and `paramb`. Currently used only for `wood_carbon_fraction`" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + +## --------------------------------------------------------- +# Remainder of this script should work with no edits +# for any CA location(s) in site_info + +set.seed(6824625) +library(tidyverse) + +# Do parallel processing in separate R processes instead of via forking +# (without this the {furrr} calls inside soilgrids_soilC_extract +# were crashing for me. 
TODO check if this is machine-specific) +op <- options(parallelly.fork.enable = FALSE) +on.exit(options(op)) + +if (!dir.exists(args$data_dir)) dir.create(args$data_dir, recursive = TRUE) + +# split up comma-separated options +params_read_from_pft <- strsplit(args$params_read_from_pft, ",")[[1]] +landtrendr_raw_files <- strsplit(args$landtrendr_raw_files, ",")[[1]] +additional_params <- args$additional_params |> + str_match_all("([^=]+)=([^,]+),?") |> + _[[1]] |> + (\(x) setNames(as.list(x[, 3]), x[, 2]))() |> + as.data.frame() |> + mutate(across(starts_with("param"), as.numeric)) + +site_info <- read.csv( + args$site_info_path, + colClasses = c(field_id = "character") +) +site_info$start_date <- args$run_start_date +site_info$LAI_date <- args$run_LAI_date + + +PEcAn.logger::logger.info("Getting estimated soil carbon from SoilGrids 250m") +# NB this takes several minutes to run +# csv filename is hardcoded by fn +soilc_csv_path <- file.path(args$data_dir, "soilgrids_soilC_data.csv") +if (file.exists(soilc_csv_path)) { + PEcAn.logger::logger.info("using existing soil C file", soilc_csv_path) + soil_carbon_est <- read.csv(soilc_csv_path, check.names = FALSE) + sites_needing_soilc <- site_info |> + filter(!id %in% soil_carbon_est$Site_ID) +} else { + soil_carbon_est <- NULL + sites_needing_soilc <- site_info +} +nsoilc <- nrow(sites_needing_soilc) +if (nsoilc > 0) { + PEcAn.logger::logger.info("Retrieving soil C for", nsoilc, "sites") + # soilgrids fn expects a site name col but does not use it; OK if empty + sites_needing_soilc$name <- sites_needing_soilc$name %||% "" + new_soil_carbon <- PEcAn.data.land::soilgrids_soilC_extract( + sites_needing_soilc |> + select(site_id = id, site_name = name, lat, lon), + outdir = args$data_dir + ) + soil_carbon_est <- bind_rows(soil_carbon_est, new_soil_carbon) |> + arrange(Site_ID) + write.csv(soil_carbon_est, soilc_csv_path, row.names = FALSE) +} + + + +PEcAn.logger::logger.info("Soil moisture") +sm_outdir <- 
file.path(args$data_dir, "soil_moisture") |> + normalizePath(mustWork = FALSE) +sm_csv_path <- file.path(args$data_dir, "sm.csv") # name is hardcorded by fn +if (file.exists(sm_csv_path)) { + PEcAn.logger::logger.info("using existing soil moisture file", sm_csv_path) + soil_moisture_est <- read.csv(sm_csv_path) + sites_needing_soilmoist <- site_info |> + filter(!id %in% soil_moisture_est$site.id) +} else { + soil_moisture_est <- NULL + sites_needing_soilmoist <- site_info +} +nmoist <- nrow(sites_needing_soilmoist) +if (nmoist > 0) { + PEcAn.logger::logger.info("Retrieving soil moisture for", nmoist, "sites") + if (!dir.exists(sm_outdir)) dir.create(sm_outdir) + new_soil_moisture <- PEcAn.data.land::extract_SM_CDS( + site_info = sites_needing_soilmoist |> + dplyr::select(site_id = id, lat, lon), + time.points = as.Date(site_info$start_date[[1]]), + in.path = sm_outdir, + out.path = dirname(sm_csv_path), + allow.download = TRUE + ) + soil_moisture_est <- bind_rows(soil_moisture_est, new_soil_moisture) |> + arrange(site.id) + write.csv(soil_moisture_est, sm_csv_path, row.names = FALSE) +} + +PEcAn.logger::logger.info("LAI") +# Note that this currently creates *two* CSVs: +# - "LAI.csv", with values from each available day inside the search window +# (filename is hardcoded inside MODIS_LAI_PREP()) +# - this path, aggregated to one row per site +# TODO consider cleaning this up -- eg reprocess from LAI.csv on the fly? +lai_csv_path <- file.path(args$data_dir, "LAI_bysite.csv") +if (file.exists(lai_csv_path)) { + PEcAn.logger::logger.info("using existing LAI file", lai_csv_path) + lai_est <- read.csv(lai_csv_path, check.names = FALSE) # TODO edit MODIS_LAI_prep to use valid colnames? 
+ sites_needing_lai <- site_info |> + filter(!id %in% lai_est$site_id) +} else { + lai_est <- NULL + sites_needing_lai <- site_info +} +nlai <- nrow(sites_needing_lai) +if (nlai > 0) { + PEcAn.logger::logger.info("Retrieving LAI for", nlai, "sites") + lai_res <- PEcAn.data.remote::MODIS_LAI_prep( + site_info = sites_needing_lai |> dplyr::select(site_id = id, lat, lon), + time_points = as.Date(site_info$LAI_date[[1]]), + outdir = args$data_dir, + export_csv = TRUE, + skip_download = FALSE + ) + lai_est <- bind_rows(lai_est, lai_res$LAI_Output) |> + arrange(site_id) + write.csv(lai_est, lai_csv_path, row.names = FALSE) +} + + +PEcAn.logger::logger.info("Aboveground biomass from LandTrendr") + +landtrendr_agb_outdir <- args$data_dir + +landtrendr_csv_path <- file.path( + landtrendr_agb_outdir, + "aboveground_biomass_landtrendr.csv" +) +if (file.exists(landtrendr_csv_path)) { + PEcAn.logger::logger.info( + "using existing LandTrendr AGB file", + landtrendr_csv_path + ) + agb_est <- read.csv(landtrendr_csv_path) + sites_needing_agb <- site_info |> + filter(!id %in% agb_est$site_id) +} else { + agb_est <- NULL + sites_needing_agb <- site_info +} +nagb <- nrow(sites_needing_agb) +if (nagb > 0) { + PEcAn.logger::logger.info("Retrieving aboveground biomass for", nagb, "sites") + lt_med_path <- grep("_median.tif$", landtrendr_raw_files, value = TRUE) + lt_sd_path <- grep("_stdv.tif$", landtrendr_raw_files, value = TRUE) + stopifnot( + all(file.exists(landtrendr_raw_files)), + length(lt_med_path) == 1, + length(lt_sd_path) == 1 + ) + lt_med <- terra::rast(lt_med_path) + lt_sd <- terra::rast(lt_sd_path) + field_shp <- terra::vect(args$field_shape_path) + + site_bnds <- field_shp[field_shp$UniqueID %in% sites_needing_agb$field_id, ] |> + terra::project(lt_med) + + # Check for unmatched sites + # TODO is stopping here too strict? 
Could reduce to warning if needed + stopifnot(all(sites_needing_agb$field_id %in% site_bnds$UniqueID)) + + new_agb <- lt_med |> + terra::extract(x = _, y = site_bnds, fun = mean, bind = TRUE) |> + terra::extract(x = lt_sd, y = _, fun = mean, bind = TRUE) |> + as.data.frame() |> + left_join(sites_needing_agb, by = c("UniqueID" = "field_id")) |> + dplyr::select( + site_id = id, + AGB_median_Mg_ha = ends_with("median"), + AGB_sd = ends_with("stdv") + ) |> + mutate(across(where(is.numeric), \(x) signif(x, 5))) + agb_est <- bind_rows(agb_est, new_agb) |> + arrange(site_id) + write.csv(agb_est, landtrendr_csv_path, row.names = FALSE) +} + + + + + + + + +# --------------------------------------------------------- +# Great, we have estimates for some variables. +# Now let's make IC files! + +PEcAn.logger::logger.info("Building IC files") + + +initial_condition_estimated <- dplyr::bind_rows( + soil_organic_carbon_content = soil_carbon_est |> + dplyr::select( + site_id = Site_ID, + mean = `Total_soilC_0-30cm`, + sd = `Std_soilC_0-30cm` + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + SoilMoistFrac = soil_moisture_est |> + dplyr::select( + site_id = site.id, + mean = sm.mean, + sd = sm.uncertainty + ) |> + # Note that we pass this as a percent -- yes, Sipnet wants a fraction, + # but write.configs.SIPNET hardcodes a division by 100. 
+ # TODO consider modifying write.configs.SIPNET + # to not convert when 0 > SoilMoistFrac > 1 + dplyr::mutate( + lower_bound = 0, + upper_bound = 100 + ), + LAI = lai_est |> + dplyr::select( + site_id = site_id, + mean = ends_with("LAI"), + sd = ends_with("SD") + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + AbvGrndBiomass = agb_est |> # NB this assumes AGB ~= AGB woody + dplyr::select( + site_id = site_id, + mean = AGB_median_Mg_ha, + sd = AGB_sd + ) |> + dplyr::mutate(across( + c("mean", "sd"), + ~ PEcAn.utils::ud_convert(.x, "Mg ha-1", "kg m-2") + )) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + .id = "variable" +) +write.csv( + initial_condition_estimated, + file.path(args$data_dir, "IC_means.csv"), + row.names = FALSE +) + + + +# read params from PFTs + +sample_distn <- function(varname, distn, parama, paramb, ..., n) { + if (distn == "exp") { + samp <- rexp(n, parama) + } else { + rfn <- get(paste0("r", distn)) + samp <- rfn(n, parama, paramb) + } + + data.frame(samp) |> + setNames(varname) +} + +sample_pft <- function(path, + vars = params_read_from_pft, + n_samples = args$ic_ensemble_size) { + e <- new.env() + load(file.path(path, "post.distns.Rdata"), envir = e) + e$post.distns |> + tibble::rownames_to_column("varname") |> + dplyr::select(-"n") |> # this is num obs used in posterior; conflicts with n = ens size when sampling + dplyr::filter(varname %in% vars) |> + dplyr::bind_rows(additional_params) |> + purrr::pmap(sample_distn, n = n_samples) |> + purrr::list_cbind() |> + tibble::rowid_to_column("replicate") +} + +pft_var_samples <- site_info |> + mutate(pft_path = file.path(args$pft_dir, site.pft)) |> + nest_by(id) |> + mutate(samp = purrr::map(data$pft_path, sample_pft)) |> + unnest(samp) |> + dplyr::select(-"data") |> + dplyr::rename(site_id = id) + + + + +ic_sample_draws <- function(df, n = 100, ...) 
{ + stopifnot(nrow(df) == 1) + + data.frame( + replicate = seq_len(n), + sample = truncnorm::rtruncnorm( + n = n, + a = df$lower_bound, + b = df$upper_bound, + mean = df$mean, + sd = df$sd + ) + ) +} + +ic_samples <- initial_condition_estimated |> + dplyr::filter(site_id %in% site_info$id) |> + dplyr::group_by(site_id, variable) |> + dplyr::group_modify(ic_sample_draws, n = args$ic_ensemble_size) |> + tidyr::pivot_wider(names_from = variable, values_from = sample) |> + dplyr::left_join(pft_var_samples, by = c("site_id", "replicate")) |> + dplyr::mutate( + AbvGrndWood = AbvGrndBiomass * wood_carbon_fraction, + leaf_carbon_content = tidyr::replace_na(LAI, 0) / SLA * (leafC / 100), + wood_carbon_content = pmax(AbvGrndWood - leaf_carbon_content, 0) + ) + +ic_names <- colnames(ic_samples) +std_names <- c("site_id", "replicate", PEcAn.utils::standard_vars$Variable.Name) +nonstd_names <- ic_names[!ic_names %in% std_names] +if (length(nonstd_names) > 0) { + PEcAn.logger::logger.debug( + "Not writing these nonstandard variables to the IC files:", nonstd_names + ) + ic_samples <- ic_samples |> dplyr::select(-any_of(nonstd_names)) +} + +file.path(args$ic_outdir, site_info$id) |> + unique() |> + purrr::walk(dir.create, recursive = TRUE) + +ic_samples |> + dplyr::group_by(site_id, replicate) |> + dplyr::group_walk( + ~ PEcAn.SIPNET::veg2model.SIPNET( + outfolder = file.path(args$ic_outdir, .y$site_id), + poolinfo = list( + dims = list(time = 1), + vals = .x + ), + siteid = .y$site_id, + ens = .y$replicate + ) + ) + +PEcAn.logger::logger.info("IC files written to", args$ic_outdir) +PEcAn.logger::logger.info("Done") diff --git a/3_rowcrop/03_xml_build.R b/3_rowcrop/03_xml_build.R new file mode 100755 index 0000000..eeb96a0 --- /dev/null +++ b/3_rowcrop/03_xml_build.R @@ -0,0 +1,184 @@ +#!/usr/bin/env Rscript + +library(PEcAn.settings) + +# Construct one multisite PEcAn XML file for statewide simulations + +## Config section -- edit for your project +options <- list( + 
optparse::make_option("--n_ens", + default = 20, + help = "number of ensemble simulations per site" + ), + optparse::make_option("--n_met", + default = 10, + help = "number of met files available (ensemble will sample from all)" + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "Ensure your IC files are valid for this date" + ) + ), + optparse::make_option("--end_date", + default = "2024-12-31", + help = "Date to end simulations" + ), + optparse::make_option("--ic_dir", + default = "IC_files", + help = paste( + "Directory containing initial conditions.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--met_dir", + default = "data/ERA5_CA_SIPNET", + help = paste( + "Directory containing climate data.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--event_dir", + default = "data/events", + help = paste( + "Directory containing Sipnet `events.in` files.", + "Should contain subdirs named by site id" + ) + ), + + optparse::make_option("--site_file", + default = "site_info.csv", + help = paste( + "CSV file containing one row for each site to be simulated.", + "Must contain at least columns `id`, `lat`, `lon`, and `site.pft`" + ) + ), + optparse::make_option("--template_file", + default = "template.xml", + help = paste( + "XML file containing whole-run settings,", + "Will be expanded to contain all sites at requested ensemble size" + ) + ), + optparse::make_option("--output_file", + default = "settings.xml", + help = "path to write output XML" + ), + optparse::make_option("--output_dir_name", + default = "output", + help = paste( + "Path the settings should declare as output directory.", + "This will be inserted replacing [out] in all of the following places:", + "`outdir` = [out] ; `modeloutdir` = [out]/out; `rundir` = [out]/run;", + "`host$outdir`: [out]/out; `host$rundir`: [out]/run." 
+ ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + +## End config section +## Whew, that was a lot of lines to define a few defaults! + + + +site_info <- read.csv(args$site_file) +stopifnot( + length(unique(site_info$id)) == nrow(site_info), + all(site_info$lat > 0), # just to simplify grid naming below + all(site_info$lon < 0) +) +site_info <- site_info |> + dplyr::mutate( + # match locations to half-degree ERA5 grid cell centers + # CAUTION: Calculation only correct when all lats are N and all lons are W! + ERA5_grid_cell = paste0( + ((lat + 0.25) %/% 0.5) * 0.5, "N_", + ((abs(lon) + 0.25) %/% 0.5) * 0.5, "W" + ) + ) + +settings <- read.settings(args$template_file) |> + setDates(args$start_date, args$end_date) + +settings$ensemble$size <- args$n_ens +settings$run$inputs$poolinitcond$ensemble <- args$n_ens +# TODO do we need to set settings$run$inputs$events$ensemble too? + +# Hack: setEnsemblePaths leaves all path components other than siteid +# identical across sites. 
+# To use site-specific grid id, I'll string-replace each siteid +id2grid <- function(s) { + # replacing in place to preserve names (easier than thinking) + for (p in seq_along(s$run$inputs$met$path)) { + s$run$inputs$met$path[[p]] <- gsub( + pattern = s$run$site$id, + replacement = s$run$site$ERA5_grid_cell, + x = s$run$inputs$met$path[[p]] + ) + } + s +} + +add_soil_pft <- function(s) { + s$run$site$site.pft <- list(veg = s$run$site$site.pft, soil = "soil") + s +} + +settings <- settings |> + createMultiSiteSettings(site_info) |> + setEnsemblePaths( + n_reps = args$n_met, + input_type = "met", + path = args$met_dir, + d1 = args$start_date, + d2 = args$end_date, + # TODO use caladapt when ready + # path_template = "{path}/{id}/caladapt.{id}.{n}.{d1}.{d2}.nc" + path_template = "{path}/{id}/ERA5.{n}.{d1}.{d2}.clim" + ) |> + papply(id2grid) |> + setEnsemblePaths( + n_reps = args$n_ens, + input_type = "poolinitcond", + path = args$ic_dir, + path_template = "{path}/{id}/IC_site_{id}_{n}.nc" + ) |> + setEnsemblePaths( + n_reps = args$n_ens, + input_type = "events", + path = args$event_dir, + path_template = "{path}/events-{id}.in" + ) |> + papply(add_soil_pft) + +# Update just the first component of the output directory, +# in all four places it's used. +# Note: It feels a bit odd to directly replace the word "output" +# rather than fill a blank or use a @placeholder@, but since existing template +# already passes @placeholder@'s on to be processed in PEcAn I didn't want +# to introduce confusion by making some be replaced at a different stage. 
+settings$outdir <- sub("^output", args$output_dir_name, + settings$outdir) +settings$modeloutdir <- sub("^output", args$output_dir_name, + settings$modeloutdir) +settings$rundir <- sub("^output", args$output_dir_name, + settings$rundir) +settings$host$outdir <- sub("^output", args$output_dir_name, + settings$host$outdir) +settings$host$rundir <- sub("^output", args$output_dir_name, + settings$host$rundir) + +write.settings( + settings, + outputfile = basename(args$output_file), + outputdir = dirname(args$output_file) +) diff --git a/3_rowcrop/04_set_up_runs.R b/3_rowcrop/04_set_up_runs.R new file mode 100755 index 0000000..3921bf9 --- /dev/null +++ b/3_rowcrop/04_set_up_runs.R @@ -0,0 +1,77 @@ +#!/usr/bin/env Rscript + +# -------------------------------------------------- +# Run-time parameters + +options <- list( + optparse::make_option(c("-s", "--settings"), + default = "settings.xml", + help = paste( + "path to the XML settings file you want to use for this run.", + "Be aware all paths inside the file are interpreted relative to the", + "working directory of the process that invokes run_model.R,", + "not relative to the settings file path" + ) + ), + optparse::make_option(c("-c", "--continue"), + default = FALSE, + help = paste( + "Attempt to pick up in the middle of a previously interrupted workflow?", + "Does not work reliably. 
Use at your own risk" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + + +# make sure always to call status.end +options(warn = 1) +options(error = quote({ + try(PEcAn.utils::status.end("ERROR")) + try(PEcAn.remote::kill.tunnel(settings)) + if (!interactive()) { + q(status = 1) + } +})) + +# ---------------------------------------------------------------------- +# PEcAn Workflow +# ---------------------------------------------------------------------- + +library("PEcAn.all") + + +# Report package versions for provenance +PEcAn.all::pecan_version() + +# Open and read in settings file for PEcAn run. +settings <- PEcAn.settings::read.settings(args$settings) + +if (!dir.exists(settings$outdir)) { + dir.create(settings$outdir, recursive = TRUE) +} + + +# start from scratch if no continue is passed in +status_file <- file.path(settings$outdir, "STATUS") +if (args$continue && file.exists(status_file)) { + file.remove(status_file) +} + + +# Write model specific configs +if (PEcAn.utils::status.check("CONFIG") == 0) { + PEcAn.utils::status.start("CONFIG") + settings <- PEcAn.workflow::runModule.run.write.configs(settings) + PEcAn.settings::write.settings(settings, outputfile = "pecan.CONFIGS.xml") + PEcAn.utils::status.end() +} diff --git a/3_rowcrop/05_run_model.R b/3_rowcrop/05_run_model.R new file mode 100755 index 0000000..ea0a257 --- /dev/null +++ b/3_rowcrop/05_run_model.R @@ -0,0 +1,109 @@ +#!/usr/bin/env Rscript + +# -------------------------------------------------- +# Run-time parameters + +options <- list( + optparse::make_option(c("-s", "--settings"), + default = "output/pecan.CONFIGS.xml", + help = paste( + "path to the XML settings file you want to use for this run.", + "Be aware all paths inside the file are interpreted relative to the", + "working directory of the process that invokes 
run_model.R,", + "not relative to the settings file path" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + + +# make sure always to call status.end +options(warn = 1) +options(error = quote({ + try(PEcAn.utils::status.end("ERROR")) + try(PEcAn.remote::kill.tunnel(settings)) + if (!interactive()) { + q(status = 1) + } +})) + +# ---------------------------------------------------------------------- +# PEcAn Workflow +# ---------------------------------------------------------------------- + +library("PEcAn.all") + + +# Report package versions for provenance +PEcAn.all::pecan_version() + +# Open and read in settings file for PEcAn run. +settings <- PEcAn.settings::read.settings(args$settings) + +# Start ecosystem model runs +if (PEcAn.utils::status.check("MODEL") == 0) { + PEcAn.utils::status.start("MODEL") + stop_on_error <- as.logical(settings[[c("run", "stop_on_error")]]) + if (length(stop_on_error) == 0) { + # If we're doing an ensemble run, don't stop. If only a single run, we + # should be stopping. + if (is.null(settings[["ensemble"]]) || + as.numeric(settings[[c("ensemble", "size")]]) == 1) { + stop_on_error <- TRUE + } else { + stop_on_error <- FALSE + } + } + PEcAn.workflow::runModule_start_model_runs(settings, + stop.on.error = stop_on_error) + PEcAn.utils::status.end() +} + + +# Get results of model runs +# this function is arguably too chatty, so we'll suppress +# INFO-level log output for this step. +loglevel <- PEcAn.logger::logger.setLevel("WARN") +if (PEcAn.utils::status.check("OUTPUT") == 0) { + PEcAn.utils::status.start("OUTPUT") + runModule.get.results(settings) + PEcAn.utils::status.end() +} +PEcAn.logger::logger.setLevel(loglevel) + + +# Run ensemble analysis on model output. 
+# if ("ensemble" %in% names(settings) +# && PEcAn.utils::status.check("ENSEMBLE") == 0) { +# PEcAn.utils::status.start("ENSEMBLE") +# runModule.run.ensemble.analysis(settings, TRUE) +# PEcAn.utils::status.end() +# } + + +# Run sensitivity analysis and variance decomposition on model output +if ("sensitivity.analysis" %in% names(settings) && + PEcAn.utils::status.check("SENSITIVITY") == 0) { + PEcAn.utils::status.start("SENSITIVITY") + runModule.run.sensitivity.analysis(settings) + PEcAn.utils::status.end() +} + +# Pecan workflow complete +if (PEcAn.utils::status.check("FINISHED") == 0) { + PEcAn.utils::status.start("FINISHED") + PEcAn.remote::kill.tunnel(settings) + + PEcAn.utils::status.end() +} + +print("---------- PEcAn Workflow Complete ----------") diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md new file mode 100644 index 0000000..2fca928 --- /dev/null +++ b/3_rowcrop/README.md @@ -0,0 +1,152 @@ +# Simulating row crop management: MAGiC phase 3 + +With full support for agronomic events now implemented in the Sipnet model, +this set of simulations demonstrates incorporating such events into the PEcAn +framework and evaluating their effect on predicted carbon dynamics in a +cropping landscape that can now be resolved into three plant functional types: + +* Woody perennials such as orchards or vineyards (Fer et al 2015)[1], +* Nonwoody perennials such as hay, haylage, grazing land, etc + (Dookohaki et al 2022)[2], +* Annually planted, actively managed row crops. These are initially represented + as a single "nonwoody annual" plant functional type with parameters derived + from the nonwoody perennial PFT by turning off internal phenology so that + greenup and browndown are controlled by the externally prescribed planting + and harvest dates. 
+ +Representing all row crops as one single PFT is a major simplification, so one +key goal of this phase is to prepare the simulation framework for a detailed +uncertainty analysis, which can then be used to inform decisions about further +dividing crop types as data become available to calibrate them. + +Statewide runs continue to use the 198 sites evaluated in phase 2. +We also introduce focused validation runs using the subset of sites where +direct observations of soil carbon and/or biomass are available during the +simulation period. + + +## Caveats + +TODO UPDATE when no longer true + +This simulation is under active development and all the notes below are subject +to change as we update the code. +For now, instructions assume a local run on MacOS and will be updated for a +Linux + Slurm + Apptainer HPC environment as we finish testing and deployment. + +Aspirationally, any command prefixed with `[host_args]` is one that ought to +work on HPC by "just" adding a system-specific prefix, e.g. +`./01_ERA4_nc_to_clim.R --start_date=2016-01-01` on my machine becomes +`sbatch -n16 --mem=12G --mail-type=ALL --uid=jdoe \ + ./01_ERA4_nc_to_clim.R --start_date=2016-01-01` on yours. + + +## Running the workflow + +### 0. Copy prebuilt artifacts and set up validation data + +TODO. Should include: + - PFT definitions including new row crop PFT + - ERA5 data as nc (clim conversion runs locally) + - ca_half_degree_grid.csv too + - DWR map? + - initial conditions + - Decide: deliver full files, site-level mean/sd, or other? + - site_info.csv + - validation info. Key constraint: datapoints are private, + probably need a "drop into this directory with this format" + step. do NOT include validation_site_info.csv + +#### Validation data + +To set up validation runs, you need access to the cropland soil carbon data +files `Harmonized_SiteMngmt_Croplands.csv` and `Harmonized_Data_Croplands.csv`. 
+ +These were shared for this project by CARB and CDFA, who in turn obtained them +from stakeholders (primarily Healthy Soils Program grant recipients) who +consented to use of their data for internal research purposes but explicitly +did not consent to public distribution of the data. +Contact chelsea.carey@arb.ca.gov for more information about the dataset. + +Once obtained, place them in `data_raw/private/HSP` and run +```{sh} +../tools/build_validation_siteinfo.R +``` +to create `validation_site_info.csv`. + + +### 1. Convert climate driver files + +TODO 1: current development version of PEcAn.sipnet still writes 13-col + clim files with constants for grid index and soil water. Document which version writes correctly. + +TODO 2: show how to pass n_cores from host_args +(NSLOTS? SLURM_CPUS_PER_TASK?) + +```{sh} +[host_args] ./01_ERA5_nc_to_clim.R \ + --site_era5_path=data_raw/ERA5_CA_nc \ + --site_sipnet_met_path=data/ERA5_CA_SIPNET \ + --site_info_file=data_raw/ca_half_degree_grid.csv \ + --start_date=2016-01-01 \ + --end_date=2025-12-31 \ + --n_cores=7 +``` + +### 2. Generate initial site conditions + +We'll run this twice, once for validation sites and once for statewide. +It would also be fine to put both together in the same input and run it once. + +```{sh} +[host_args] ./02_ic_build.R \ + --site_info_path=validation_site_info.csv \ + --pft_dir=data_raw/pfts \ + --ic_outdir=data/IC_files +#[host_args] ./02_ic_build.R \ +# --site_info_path=site_info.csv \ +# --pft_dir=data_raw/pfts \ +# --ic_outdir=data/IC_files +``` + +### 3. generate settings file + +```{sh} +[host_args] ./03_xml_build.R \ + --ic_dir=data/IC_files \ + --site_file=validation_site_info.csv \ + --output_file=validation_settings.xml \ + --output_dir_name=val_out +``` + +### 4. Set up model run directories + +TODO: Yes, it's unintuitive that we can't rename the output dir at this +stage instead of in xml_build. 
+ +```{sh} +[host_args] ./04_set_up_runs.R --settings=validation_settings.xml +``` + +### 5. Run model + +```{sh} +export NCPUS=8 +ln -s [your/path/to]/sipnet/sipnet sipnet.git +[host_args] ./05_run_model.R --settings=val_out/pecan.CONFIGS.xml + + +### 6. Validate + +```{sh} +[host_args] ./validate.R \ + --model_dir=val_out \ + --output_dir=validation_results_$(date'+%s') +``` + + +## References + +[1] Fer I, R Kelly, P Moorcroft, AD Richardson, E Cowdery, MC Dietze. 2018. Linking big models to big data: efficient ecosystem model calibration through Bayesian model emulation. Biogeosciences 15, 5801–5830, 2018 https://doi.org/10.5194/bg-15-5801-2018 + +[2] Dokoohaki H, BD Morrison, A Raiho, SP Serbin, K Zarada, L Dramko, MC Dietze. 2022. Development of an open-source regional data assimilation system in PEcAn v. 1.7.2: application to carbon cycle reanalysis across the contiguous US using SIPNET. Geoscientific Model Development 15, 3233–3252. https://doi.org/10.5194/gmd-15-3233-2022 diff --git a/3_rowcrop/template.xml b/3_rowcrop/template.xml new file mode 100644 index 0000000..2272c60 --- /dev/null +++ b/3_rowcrop/template.xml @@ -0,0 +1,93 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + data_raw/pfts/temperate.deciduous/post.distns.Rdata + data_raw/pfts/temperate.deciduous/ + + + grass + data_raw/pfts/grass/post.distns.Rdata + data_raw/pfts/grass + + + soil + data_raw/pfts/soil/ + + + + + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + sampling + + + + + + + + 99000000003 + SIPNET + git + TRUE + sipnet.git + + + + + + + + + + RS_veg + poolinitcond + + + + + + + + + + + + + localhost + output/out + output/run + + + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: + + 1000 + + + diff --git a/3_rowcrop/validate.R b/3_rowcrop/validate.R new file mode 100755 index 0000000..9c54442 --- /dev/null +++ b/3_rowcrop/validate.R @@ -0,0 +1,202 
@@ +#!/usr/bin/env Rscript + +options <- list( + optparse::make_option("--val_data_path", + default = "data_raw/private/HSP/Harmonized_Data_Croplands.csv", + help = "CSV containing nonpublic soil C data shared by HSP program" + ), + optparse::make_option("--model_dir", + default = "output", + help = "directory containing PEcAn output to validate" + ), + optparse::make_option("--output_dir", + default = "validation_output", + help = "directory in which to save plots and summary stats" + ) + # TODO lots of assumptions still hardcoded below +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + +library(tidyverse) + + +## Read validation data, identify target years from each site + +soc_obs <- read.csv(args$val_data_path) |> + # recreate hashes used in site_info + # TODO can we save steps here? + # One advantage of recreating: private IDs never leave the source file + mutate( + BaseID = gsub("\\s+", "", BaseID), + site = paste(ProjectName, BaseID, Latitude, Longitude) |> + purrr::map_chr(rlang::hash), + obs_SOC = PEcAn.utils::ud_convert(SOC_stock_0_30, "tonne/ha", "kg/m2") + ) |> + select(site, BaseID, year = Year, obs_SOC) + +obs_sites_yrs <- soc_obs |> + distinct(site, year) + +## Read model output, summarize to end-of-year values +## (SOC doesn't change very fast) + +sim_files_wanted <- file.path(args$model_dir, "out") |> + list.files( + pattern = "\\d\\d\\d\\d.nc", + full.names = TRUE, + recursive = TRUE + ) |> + data.frame(path = _) |> + separate_wider_regex( + cols = path, + patterns = c( + ".*/ENS-", + ens_num = "\\d+", + "-", + site = ".*?", + "/", + year = "\\d+", + ".nc" + ), + cols_remove = FALSE + ) |> + mutate(across(c("ens_num", "year"), as.numeric)) |> + inner_join(obs_sites_yrs) + +# read.output is FAR too chatty; suppress info-level messages +logger_level <- 
PEcAn.logger::logger.setLevel("WARN") + +soc_sim <- sim_files_wanted |> + nest_by(ens_num, site, year, .key = "path") |> + mutate( + contents = map( + path, + ~ PEcAn.utils::read.output( + ncfiles = .x, + dataframe = TRUE, + variables = "TotSoilCarb", + print_summary = FALSE, + verbose = FALSE + ) |> + select(-year) # already present outside nested cols + ) + ) |> + unnest(contents) |> + group_by(ens_num, site, year) |> + slice_max(posix) + +## Combine and align obs + sim + +soc_compare <- soc_sim |> + left_join(soc_obs) |> + # TODO these filters need refinement -- + # eg Are NAs actually expected or should they trigger complaints? + drop_na(obs_SOC) |> + # TODO excluded as surprisingly high + # May want to re-include after inspecting data for individual sites + filter(obs_SOC < 20) + # TODO will eventually want to have PFTs labeled here + +if (!dir.exists(args$output_dir)) dir.create(args$output_dir, recursive = TRUE) + +## lm fit + CIs + +soc_fits <- soc_compare |> + ungroup() |> + nest_by(ens_num) |> + mutate( + fit = list(lm(TotSoilCarb ~ obs_SOC, data = data)), + r2 = summary(fit)$adj.r.squared, + nse = 1 - ( + sum((data$obs_SOC - data$TotSoilCarb)^2) / + sum((data$obs_SOC - mean(data$obs_SOC))^2) + ), + rmse = sqrt(mean((data$obs_SOC - data$TotSoilCarb)^2)), + bias = mean(data$TotSoilCarb - data$obs_SOC) + ) + +soc_fits |> + select(-data, -fit) |> + mutate(across(everything(), # NB excludes group vars! 
ens_num not mutated here + \(x) signif(x, digits = 4))) |> + write.csv( + file = file.path(args$output_dir, "SOC_model_fit.csv"), + row.names = FALSE + ) + +soc_ci <- soc_fits |> + mutate( + predx = list(seq(min(data$obs_SOC), max(data$obs_SOC), by = 0.1)), + pred = list(predict(fit, data.frame(obs_SOC = predx))) + ) |> + unnest(c(predx, pred)) |> + ungroup() |> + group_by(predx) |> + summarize( + pred_q5 = quantile(pred, 0.05), + pred_q95 = quantile(pred, 0.95), + pred_mean = mean(pred), + ) + +## Scatterplot + +soc_lm_plot <- ggplot(soc_compare) + + aes(obs_SOC, TotSoilCarb) + + geom_point() + + geom_abline(lty = "dotted") + + geom_ribbon( + data = soc_ci, + mapping = aes( + x = predx, + ymin = pred_q5, + ymax = pred_q95, + y = NULL + ), + alpha = 0.4 + ) + + geom_line( + data = soc_ci, + mapping = aes(predx, pred_mean), + col = "blue" + ) + + xlab("Measured 0-30 cm soil C stock (kg C / m2)") + + ylab("Simulated 0-30 cm soil C stock (kg C / m2)") + + theme_bw() +ggsave( + file.path(args$output_dir, "SOC_scatter.png"), + plot = soc_lm_plot, + height = 8, + width = 8 +) + +ggsave( + file.path(args$output_dir, "SOC_scatter_by_ens.png"), + plot = soc_lm_plot + facet_wrap(~ens_num), + height = 8, + width = 8 +) + + +soc_fits |> + ungroup() |> + summarize(across(r2:bias, c(mean = mean, sd = sd))) |> + pivot_longer(everything(), names_to = c("stat", ".value"), names_sep = "_") |> + mutate(across(where(is.numeric), + \(x) signif(x, digits = 4))) |> + write.csv( + file.path(args$output_dir, "SOC_fit_summary.csv"), + row.names = FALSE + ) + +pdf(file.path(args$output_dir, "SOC_fit_diagnostic_plots.pdf")) +soc_fits |> pwalk(\(fit, ens_num, ...) 
plot(fit, which = 1:6, main = ens_num)) +dev.off() diff --git a/one_off_analyses/kanee_agu_2025/README.md b/one_off_analyses/kanee_agu_2025/README.md new file mode 100644 index 0000000..3f6489e --- /dev/null +++ b/one_off_analyses/kanee_agu_2025/README.md @@ -0,0 +1,369 @@ +# Sipnet model runs for Sarah Kanee 2025 AGU talk + +Design: 17 anchor sites, run with and without events for tillage, planting, harvest, irrigation, and phenology from the monitoring pipeline. + +Goal: Compare model predictions of LAI, SoilMoist, GPP, SoilResp, AGB, and NEE modeled with and without events. + + +## Create workflow directory + +I'll do this inside the existing `workflows` repo and symlink in the files I need; may make sense later to archive this separately, but this is easy and avoids duplicating files too early. + +Files reused here: +* Weather data: ERA5 0.5-degree grid, 2016-2024 is already prepared in Sipnet `.clim` format for the whole state (~5.6 GB). +* Initial condition files (~23.5 MB, takes an hour or so to generate): Aboveground biomass from LandTrendr 2016, SOC from SoilGrids, soil moisture from ECMWF multi-satellite data, LAI from MODIS; plus leaf and wood carbon content from combining AGB, LAI, and PFT-specific SLA estimates. + - Note that these do not yet incorporate any of the remote-sensed values from the monitoring pipeline -- we'll add those later. 
+* Run scripts 03_xml_build.R, 04_set_up_runs.R, 05_run_model.R +* For the unmanaged run, irrigation via static events.in file + +```{sh} +cd workflows +export EXISTING_WORKFLOW=$(realpath 3_rowcrop) +mkdir one_off_analyses/kanee_agu_2025 && cd one_off_analyses/kanee_agu_2025 +ln -s $(realpath ../../data_raw) data_raw +mkdir data +ln -s "$EXISTING_WORKFLOW"/data/ERA5_CA_SIPNET data/ERA5_CA_SIPNET +ln -s "$EXISTING_WORKFLOW"/data/IC_files data/IC_files +ln -s "$EXISTING_WORKFLOW"/sipnet.git sipnet.git +``` + + +## Fetch event files from BU server + +Note: The filenames are a bit confusing because of some rapid updates while iterating. As of 2025-12-11, _most_ event types in these files start in 2018. `anchors_combinedEvents_2018-2023.csv` has irrigation starting in 2016 and other events still starting in 2018, while `anchors_irrigation_events_2018-2023.csv` is also still 2018-2023 as labeled. + +Why do most events start in 2018 instead of 2016? To avoid fighting format differences between the 2016 and 2018 DWR crop maps. Sarah plans to extend all event types to 2016 soon and we will edit this pipeline when that's complete. 
+ +```{sh} +scp -r \ + cblack1@geo.bu.edu:/projectnb/dietzelab/ccmmf/management/event_files/ \ + data_raw/kanee_anchor_event_files/ +``` + +Recording the versions used on 2025-12-11: + +```{sh} +mv anchors_harvestEvents_2018-2023 anchors_harvestEvents_2018-2023.csv +shasum data_raw/kanee_anchor_event_files/* +``` + +Which produces: + +``` +cb911f85a9847738b0ff0b1bd5b4f38edbfbfde4 data_raw/kanee_anchor_event_files/anchors_combinedEvents_2018-2023.csv +07dd633b71bbc5588d5a85050f44accd5f8efb07 data_raw/kanee_anchor_event_files/anchors_combinedEvents_2018-2023.json +bc189100778219743dc92f738739589a394c4bb7 data_raw/kanee_anchor_event_files/anchors_event_overlap_2018-2023.csv +e226c31f7cf6bd20963868fb34427291f6946d01 data_raw/kanee_anchor_event_files/anchors_harvestEvents_2018-2023.csv +501876a53a16b44e46591b8a342a1aca0932bb92 data_raw/kanee_anchor_event_files/anchors_harvestEvents_2018-2023.json +82680206b2bfb6cd21c0c6272a2084bd39119388 data_raw/kanee_anchor_event_files/anchors_irrigation_events_2018-2023.csv +abe53309470576582d87d137189ca85fd32aed53 data_raw/kanee_anchor_event_files/anchors_irrigation_events_2018-2023.json +62a758fba1946ba1bf9d40ded0f3d1db6e84d4a0 data_raw/kanee_anchor_event_files/anchors_phenoParams_2018-2023.csv +f95878325d53f37838e7d3ae8086e5bcca571222 data_raw/kanee_anchor_event_files/anchors_phenoParams_2018-2023.json +5c70b67312f7be594d44bd8622e939ba9a683257 data_raw/kanee_anchor_event_files/anchors_plantingEvents_2018-2023.csv +50f31594d27b1e2cfb4099c301b7af043816899e data_raw/kanee_anchor_event_files/anchors_plantingEvents_2018-2023.json +451f69136e0d022db41b5d1c961da0878c7c2c4b data_raw/kanee_anchor_event_files/anchors_tillageEvents_2018-2023.csv +b3bea17a301001038fc7fb0997b56071df91fa24 data_raw/kanee_anchor_event_files/anchors_tillageEvents_2018-2023.json +``` + + +## Convert phenology files + +All expected columns are present already, just need to lowercase `leafOnDay` and `leafOffDay` columns. 
+ +The result will be a single CSV containing all years of data from each site that had information available, and we'll pass the same path into every site of settings.xml. + +When processing each site, write.configs.SIPNET will try first to use that site's record for the starting year if available, else the average of all available years from that site, else the DOY 144/285 hardcoded in PEcAn's default parameter template. + +Note 1: Since we'll run Sipnet from 2016-2024 and all phenology starts in 2018, the net effect will be that PEcAn uses the mean of the dates reported for each site. That seems reasonable for this run. + +Note 2: Even though all sites see the same file, no cross-site info is used here. Sites with no data will _not_ take their leaf-on and leaf-off dates from other sites in the same file. In the context of a California-specific pipeline we might want to provide state-specific values in the template or include gapfilling in the monitoring workflow (filling missing phenology from nearby sites with the same crop type) rather than let the model use DOY 144/285 (which were probably chosen for Niwot Ridge, Colorado). + +TODO: As far as I can tell the phenology file is the only place we use fully lowercase names for these parameters. Should write.config.SIPNET be updated to accept files that call them `leafOnDay`/`leafOffDay` as well/instead? + +```{sh} +sed -e 's/leafOnDay/leafonday/g' \ + -e 's/leafOffDay/leafoffday/g' \ + data_raw/kanee_anchor_event_files/anchors_phenoParams_2018-2023.csv \ + > data/phenology.csv +``` + + +## Set up management event files + +As noted above, these files contain only irrigation in 2016 or 2017 -- planting/harvest/tillage start in 2018. Since all sites are treated as woody this won't make a huge difference, but we should revisit it if not adding events for these years soon. 
+ +The currently available JSON files do not follow the PEcAn events schema (they are lists of events each with its own site_id; the events schema calls for lists of sites each with its own block of events), so first convert to the PEcan events standard. TODO: upstream tools ought to generate this in the first place. + +```{R} +read.csv("data_raw/kanee_anchor_event_files/anchors_combinedEvents_2018-2023.csv") |> + dplyr::filter( + event_type != "phenology", + !(event_type == "irrigation" & amount_mm == 0) + ) |> + dplyr::mutate(pecan_events_version = "0.1.0") |> + dplyr::nest_by(site_id, pecan_events_version, .key="events") |> + jsonlite::write_json("data/events/combined_events.json") +``` + +Now we can convert from JSON to Sipnet events format. +```{R} +PEcAn.SIPNET::write.events.SIPNET( + "data/events/combined_events.json", + "data/events/" +) +``` + +For the unmanaged comparison, we'll use a fixed events file that contains 1520 mm irrigation each year, +as used in statewide runs to date. To keep the managed and unmanaged pipelines parallel, let's copy it into (identical!) files named for each site: + +```{sh} +mkdir data/events_fixedirri +cd data/events +find . -name 'events-*.in' -exec cp "$EXISTING_WORKFLOW"/data/events.in ../events_fixedirri/{} \; +cd - +``` + + +## Set up site info + +```{R} +anchor_sites <- read.csv( + file.path(Sys.getenv("EXISTING_WORKFLOW"), "site_info.csv"), + colClasses = "character" +) +sites_wanted <- read.csv( + "data_raw/kanee_anchor_event_files/anchors_combinedEvents_2018-2023.csv", + colClasses = "character" +)$site_id +write.csv( + anchor_sites[anchor_sites$id %in% sites_wanted,], + file = "site_info.csv", + row.names = FALSE +) +``` + + +## Set up template.xml + +This step is done manually rather than rerunnable by clicking through the notebook -- We _could_ do this as a horrible set of sed commands, but they'd be hard to read and the payoff seems minimal. + +1. Set output variables for the ensemble analysis. 
Note: These are a convenience but not essential -- variables not listed here can always be retrieved from the full output files for posthoc analysis. + +2. For the managed run only, set path to the phenology file. + +Started by copying `"$EXISTING_WORKFLOW"/template.xml`, then hand-edited: + +``` +--- /Users/chrisb/cur/ccmmf/workflows/3_rowcrop/template.xml 2025-12-08 20:50:39 ++++ template_unmanaged.xml 2025-12-09 10:03:56 +@@ -26,11 +26,12 @@ + + + +- NPP +- TotSoilCarb +- AbvGrndWood +- Qle +- SoilMoistFrac ++ LAI ++ SoilMoist ++ GPP ++ SoilResp ++ AGB ++ NEE + + + uniform +``` + +``` +--- template_unmanaged.xml 2025-12-09 10:03:56 ++++ template_managed.xml 2025-12-08 23:46:53 +@@ -74,6 +74,9 @@ + + + ++ ++ data/phenology.csv ++ + + + +``` + +TODO: It's a little inelegant that phenology gets set and unset in the template file while events get set and unset at the settings build stage. Consider picking one approach for both (which might involve adding yet another configuration option to xml_build.R). 
+ + +## Generate settings files, set up rundirs + +```{sh} +"$EXISTING_WORKFLOW"/03_xml_build.R \ + --ic_dir=data/IC_files \ + --site_file=site_info.csv \ + --template_file=template_managed.xml \ + --output_file=settings_managed.xml \ + --output_dir_name=output_managed \ + --event_dir=data/events +"$EXISTING_WORKFLOW"/04_set_up_runs.R --settings=settings_managed.xml +``` + +```{sh} +"$EXISTING_WORKFLOW"/03_xml_build.R \ + --ic_dir=data/IC_files \ + --site_file=site_info.csv \ + --template_file=template_unmanaged.xml \ + --output_file=settings_unmanaged.xml \ + --output_dir_name=output_unmanaged \ + --event_dir=data/events_fixedirri +"$EXISTING_WORKFLOW"/04_set_up_runs.R --settings=settings_unmanaged.xml +``` + + +## Run model + +```{sh} +export NCPUS=8 +"$EXISTING_WORKFLOW"/05_run_model.R --settings=output_managed/pecan.CONFIGS.xml +"$EXISTING_WORKFLOW"/05_run_model.R --settings=output_unmanaged/pecan.CONFIGS.xml +``` + + +## Extract results + +The output directory already contains single-site timeseries and histogram plots for each variable of interest, but they're named with opaque hashes that are hard to compare between treatments. Let's get a single table of results instead. + +Caution: 9 years of half-hourly model output is over 150k rows, so beware that loading all 20 replicates from 17 sites in both management conditions will produce `2*20*17*9*365*48` = more than 100M rows. + +For a first pass, here are daily means of each variable (aggregated as we read each file) to get it down to 2.2M rows; note that some of these variables (GPP, NEE, soilResp) would probably be more intuitive as sums instead. 
+ +```{R} +library(tidyverse) + +list_output_files <- function(modeldir) { + file.path(modeldir, "out") |> + list.files( + pattern = "\\d\\d\\d\\d.nc", + full.names = TRUE, + recursive = TRUE + ) |> + data.frame(path = _) |> + separate_wider_regex( + cols = path, + patterns = c( + ".*/ENS-", + ens_num = "\\d+", + "-", + site = ".*?", + "/", + year = "\\d+", + ".nc" + ), + cols_remove = FALSE + ) |> + mutate(across(c("ens_num", "year"), as.numeric)) +} + +read_daymeans <- function(ncfile, variables) { + PEcAn.utils::read.output( + ncfiles = ncfile, + dataframe = TRUE, + variables = variables, + print_summary = FALSE, + verbose = FALSE + ) |> + mutate(doy = yday(posix)) |> + summarize(across(everything(), mean), .by = doy) |> + select(-year) # already present outside nested cols +} + +output_files <- bind_rows( + managed = list_output_files("output_managed"), + unmanaged = list_output_files("output_unmanaged"), + .id = "condition" +) +vars <- c("LAI", "SoilMoist", "GPP", "SoilResp", "AGB", "NEE") +results <- output_files |> + nest_by(condition, ens_num, site, year, .key = "path") |> + mutate(contents = map(path, \(x) read_daymeans(x, vars))) |> + select(-path) |> + unnest(contents) + +save(results, file="results_daily_means.Rdata") +results |> + mutate(across(c(where(is.double), -posix), zapsmall)) |> + write.csv(file = "results_daily_means.csv", row.names = FALSE) + +result_ci <- results |> + group_by(condition, site, year, doy, posix) |> + summarize(across(-ens_num, + c(q5 = ~quantile(., 0.05), + mean = mean, + q95 = ~quantile(., 0.95)))) +save(result_ci, file="results_intervals.Rdata") +result_ci |> + mutate(across(c(where(is.double), -posix), zapsmall)) |> + write.csv(file = "results_intervals.csv", row.names = FALSE) + +by_var <- result_ci |> + pivot_longer( + cols = ends_with(c("q5", "mean", "q95")), + names_to = c("variable", ".value"), + names_pattern = "(.*)_([^_]+)") + + +save_gg <- function(plt, title) { + name <- paste0(gsub(" ", "_", title), ".png") + 
ggsave( + file = file.path("plots", name), + plot = plt, + width = 12, + height = 8 + ) +} + +plot_by_var <- function(df, title) { + plt <- ggplot(df) + + aes(x = posix, y = mean, color = condition) + + facet_wrap(~variable, scales = "free_y") + + geom_line() + + geom_line(aes(y=q5), lty = "dashed") + + geom_line(aes(y=q95), lty = "dashed") + + theme_bw() + + xlab(NULL) + + ggtitle(title) + save_gg(plt, title) +} + +plot_by_site <- function(df, title) { + plt <- ggplot(df) + + aes(x = posix, y = mean, color = condition) + + facet_wrap(~site, scales = "free_y") + + geom_line() + + geom_line(aes(y=q5), lty = "dashed") + + geom_line(aes(y=q95), lty = "dashed") + + theme_bw() + + xlab(NULL) + + ggtitle(title) + save_gg(plt, title) +} + +dir.create("plots") + +by_var |> + group_by(site) |> + group_walk(~plot_by_var(df = .x, title = paste0("Site", .y$site))) +by_var |> + group_by(variable) |> + group_walk(~plot_by_site(df = .x, title = .y$variable)) + +lai_doy <- result_ci |> + ggplot() + + aes(doy, LAI_mean, color = condition, group = paste(condition, year)) + + geom_ribbon(aes(ymin=LAI_q5, ymax = LAI_q95, fill = condition), alpha = 0.01, lty = "dotted") + + geom_line() + + facet_wrap(~site) + + theme_bw() + + ylab("LAI") + + xlab("DOY") + +save_gg(lai_doy, "LAI by day") +``` diff --git a/one_off_analyses/kanee_agu_2025/template_managed.xml b/one_off_analyses/kanee_agu_2025/template_managed.xml new file mode 100644 index 0000000..d695809 --- /dev/null +++ b/one_off_analyses/kanee_agu_2025/template_managed.xml @@ -0,0 +1,97 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + data_raw/pfts/temperate.deciduous/post.distns.Rdata + data_raw/pfts/temperate.deciduous/ + + + grass + data_raw/pfts/grass/post.distns.Rdata + data_raw/pfts/grass + + + soil + data_raw/pfts/soil/ + + + + + LAI + SoilMoist + GPP + SoilResp + AGB + NEE + + + uniform + + + sampling + + + sampling + + + sampling + + + + + + + + 99000000003 + SIPNET + git + TRUE + sipnet.git 
+ + + + + + + + + + RS_veg + poolinitcond + + + + + + + + + data/phenology.csv + + + + + + + localhost + output/out + output/run + + + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: + + 1000 + + + diff --git a/one_off_analyses/kanee_agu_2025/template_unmanaged.xml b/one_off_analyses/kanee_agu_2025/template_unmanaged.xml new file mode 100644 index 0000000..dcb571e --- /dev/null +++ b/one_off_analyses/kanee_agu_2025/template_unmanaged.xml @@ -0,0 +1,94 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + data_raw/pfts/temperate.deciduous/post.distns.Rdata + data_raw/pfts/temperate.deciduous/ + + + grass + data_raw/pfts/grass/post.distns.Rdata + data_raw/pfts/grass + + + soil + data_raw/pfts/soil/ + + + + + LAI + SoilMoist + GPP + SoilResp + AGB + NEE + + + uniform + + + sampling + + + sampling + + + sampling + + + + + + + + 99000000003 + SIPNET + git + TRUE + sipnet.git + + + + + + + + + + RS_veg + poolinitcond + + + + + + + + + + + + + localhost + output/out + output/run + + + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: + + 1000 + + + diff --git a/one_off_analyses/lebauer_agu_2025/README.md b/one_off_analyses/lebauer_agu_2025/README.md new file mode 100644 index 0000000..2014fa4 --- /dev/null +++ b/one_off_analyses/lebauer_agu_2025/README.md @@ -0,0 +1,386 @@ +# Sipnet runs for David LeBauer 2025 AGU talk + +Design: 100 row-crop anchor sites, run with event files corresponding to 6 hypothetical management scenarios `baseline`, `compost`, `reduced_till`, `zero_till`, `reduced_irrig_drip`, and `stacked`. + +Goal: Compare statewide predictions of [variables TK] under differing managements. + + +## Create workflow directory + +Many of the files needed are already set up for the MAGiC phase 3 workflow. I'll symlink those rather than duplicate them. 
 + + +```{sh} +# cd path/to/ccmmf +export SCENARIO_REPO=$(realpath scenarios) +cd workflows +export EXISTING_WORKFLOW=$(realpath 3_rowcrop) +mkdir one_off_analyses/lebauer_agu_2025 && cd one_off_analyses/lebauer_agu_2025 +ln -s $(realpath ../../data_raw) data_raw +mkdir data +ln -s "$EXISTING_WORKFLOW"/data/ERA5_CA_SIPNET data/ERA5_CA_SIPNET +ln -s "$EXISTING_WORKFLOW"/data/IC_files data/IC_files +ln -s "$EXISTING_WORKFLOW"/sipnet.git sipnet.git +``` + + +## Set up site info + +```{R} +read.csv( + file.path(Sys.getenv("EXISTING_WORKFLOW"), "site_info.csv"), + colClasses = "character" +) |> + dplyr::filter(site.pft == "grass") |> + dplyr::mutate(site.pft = "annual_crop") |> + write.csv( + file = "site_info.csv", + row.names = FALSE + ) +``` + + +## Add event files + +The json versions of these are maintained in a separate repo. We'll read them from the external location and write the sipnet (`*.in`) versions as local artifacts. + +Note that this analysis uses a single event file per scenario and the JSON files use the same dummy sitename (`herb_site_1`) for all of them, while write.events.SIPNET and steps downstream of it expect there to be a directory with one events file per site, named `events-.in`. We'll make whole directories full of duplicates for each scenario. 
+ +```{sh} + mkdir data/events + export SCENARIOS=( + baseline + compost + reduced_till + zero_till + reduced_irrig_drip + stacked + ) + # Caution: assumes site id is 2nd column + export SITES=( + $(tail -n+2 site_info.csv | cut -d, -f2 | uniq | tr -d \") + ) + for s in ${SCENARIOS[@]}; do + s_dir=data/events/"$s" + mkdir -p "$s_dir" + json_file="$SCENARIO_REPO"/data/events_"$s".json + Rscript \ + -e 'PEcAn.SIPNET::write.events.SIPNET(' \ + -e ' events_json = "'"$json_file"'",' \ + -e ' outdir = "data/events"' \ + -e ')' + event_files=("${SITES[@]/#/$s_dir/events-}") + event_files=("${event_files[@]/%/.in}") + for f in ${event_files[@]}; do + cp data/events/events-herb_site_1.in $f + done + rm data/events/events-herb_site_1.in + done +``` + + +## Create annual PFT + +Runs for cropland have until now used a PFT calibrated for perennial semiarid grasslands, with phenology controlled by growing degree day. To avoid unwanted interactions between the GDD model and the annual crop cycle, we zero out the biomass change at the computed leaf-on and leaf-off date so that these have no effect. + +Note that this simplification means leaf-off is assumed to only happen at harvest, which isn't technically true -- some crops such as cotton, soy, and rice are often fully deleafed before harvest. However, since the MAGiC system's estimates of harvest date will be primarily derived from remote-sensed NDVI, estimated "harvest" will always be tied to end of greenness anyway (though potentially with crop-specific offsets that can be tuned to account for relative timing of harvest and leaf-off). 
+off happen only at planting and harvest + +```{R} +pd <- new.env() +load("data_raw/pfts/grass/post.distns.Rdata", envir = pd) +post.distns <- pd$post.distns +pheno_rows <- rownames(post.distns) %in% c("fracLeafFall", "leafGrowth") +post.distns$parama[pheno_rows] <- 0.0 +post.distns$paramb[pheno_rows] <- 0.0 +dir.create("data_raw/pfts/annual_crop") +save(post.distns, file = "data_raw/pfts/annual_crop/post.distns.Rdata") +``` + + +## Set up template.xml + +These edits were done by hand -- decided it wasn't worth the trouble of doing them programmatically. + +* `cp "$EXISTING_WORKFLOW"/template.xml template.xml` +* Removed all variables but TotSoilCarb from the ensemble block -- we won't be looking site-by-site at the PDF output from running ensemble analysis, so why spend time extracting them. Left TotSoilCarb only because get.results throws a logger.severe if there are zero variables to extract. +* Updated PFT paths to point to new annnual_crop distns. + +Diff between existing workflow template and the result: + +``` +--- /Users/chrisb/cur/ccmmf/workflows/3_rowcrop/template.xml 2025-12-08 20:50:39 ++++ template.xml 2025-12-10 00:52:16 +@@ -10,27 +10,17 @@ + output/run + + +- temperate.deciduous +- data_raw/pfts/temperate.deciduous/post.distns.Rdata +- data_raw/pfts/temperate.deciduous/ ++ annual_crop ++ data_raw/pfts/annual_crop/post.distns.Rdata ++ data_raw/pfts/annual_crop/ + + +- grass +- data_raw/pfts/grass/post.distns.Rdata +- data_raw/pfts/grass +- +- + soil + data_raw/pfts/soil/ + + + + +- NPP + TotSoilCarb +- AbvGrndWood +- Qle +- SoilMoistFrac + + + uniform +``` + + +## Generate settings files, set up rundirs + + +```{sh} +for s in ${SCENARIOS[@]}; do + "$EXISTING_WORKFLOW"/03_xml_build.R \ + --n_ens=20 \ + --ic_dir=data/IC_files \ + --site_file=site_info.csv \ + --template_file=template.xml \ + --output_file=settings_"$s".xml \ + --output_dir_name=output_"$s" \ + --event_dir=data/events/"$s" + "$EXISTING_WORKFLOW"/04_set_up_runs.R 
--settings=settings_"$s".xml +done +``` + + +## Run model + +```{sh} +export NCPUS=8 +for s in ${SCENARIOS[@]}; do + "$EXISTING_WORKFLOW"/05_run_model.R --settings=output_"$s"/pecan.CONFIGS.xml +done +``` + + + +## Extract results + +This output was really intended for the downscaling workflow, but I'll tke a quick look: let's look only at differences between treatments in soil C on monthly time steps. Since soil C changes slowly, I'll simply take the value at the end of each month rather than averaging. + +First some function definitions: + +```{R} +library(tidyverse) +logger_level <- PEcAn.logger::logger.setLevel("WARN") + +list_output_files <- function(modeldir) { + file.path(modeldir, "out") |> + list.files( + pattern = "\\d\\d\\d\\d.nc", + full.names = TRUE, + recursive = TRUE + ) |> + data.frame(path = _) |> + separate_wider_regex( + cols = path, + patterns = c( + ".*/ENS-", + ens_num = "\\d+", + "-", + site = ".*?", + "/", + year = "\\d+", + ".nc" + ), + cols_remove = FALSE + ) |> + mutate(across(c("ens_num", "year"), as.numeric)) +} + +read_monthly <- function(ncfile, variables) { + PEcAn.utils::read.output( + ncfiles = ncfile, + dataframe = TRUE, + variables = variables, + print_summary = FALSE, + verbose = FALSE + ) |> + mutate(month = month(posix)) |> + group_by(year, month) |> + slice_max(posix) |> + ungroup() |> + select(-year) # already present outside this fn call +} + +make_long_quantiles <- function(grouped_df, cols) { + grouped_df |> + summarize( + across({{cols}}, + c(q5 = ~quantile(., 0.05), + mean = mean, + q95 = ~quantile(., 0.95)) + )) |> + pivot_longer( + cols = ends_with(c("q5", "mean", "q95")), + names_to = c("variable", ".value"), + names_pattern = "(.*)_([^_]+)" + ) +} + +save_gg <- function(plt, title) { + name <- paste0(gsub(" ", "_", title), ".png") + ggsave( + file = file.path("plots", name), + plot = plt, + width = 12, + height = 8 + ) +} +``` + +Now read files (this took at least half an hour on my machine) + +```{R} +output_files 
<- c( + "baseline", "compost", "reduced_till", + "zero_till", "reduced_irrig_drip", "stacked" + ) |> + setNames(nm = _) |> + map_chr(~paste0("output_", .)) |> + map(list_output_files) |> + bind_rows(.id = "scenario") + +# soil C from all sites*reps*months +results <- output_files |> + nest_by(scenario, ens_num, site, year, .key = "path") |> + mutate(contents = map(path, \(x) read_monthly(x, "TotSoilCarb"))) |> + select(-path) |> + unnest(contents) +save(results, file="results_monthly_TotSoilCarb.Rdata") +results |> + mutate(across(c(where(is.double), -posix), zapsmall)) |> + write.csv(file = "results_monthly_TotSoilCarb.csv", row.names = FALSE) +``` + +...and now aggregate... + +```{R} +# Treatment effects for each site*rep*month, +# first as raw diff (treatment - baseline), then as relative difference ((treat - base)/base) +result_diff <- results |> + pivot_wider(names_from = "scenario", values_from = "TotSoilCarb") |> + mutate( + d_compost = compost - baseline, + d_reduced_irrig_drip = reduced_irrig_drip - baseline, + d_reduced_till = reduced_till - baseline, + d_stacked = stacked - baseline, + d_zero_till = zero_till - baseline + ) +result_relative_diff <- results |> + pivot_wider(names_from = "scenario", values_from = "TotSoilCarb") |> + mutate( + d_compost = (compost - baseline)/baseline, + d_reduced_irrig_drip = (reduced_irrig_drip - baseline)/baseline, + d_reduced_till = (reduced_till - baseline)/baseline, + d_stacked = (stacked - baseline)/baseline, + d_zero_till = (zero_till - baseline)/baseline + ) + +# Ensemble averages+quantiles within site*month +diff_ci <- result_diff |> + group_by(site, year, month, posix) |> + make_long_quantiles(-ens_num) +diff_relative_ci <- result_relative_diff |> + group_by(site, year, month, posix) |> + make_long_quantiles(-ens_num) + +# Cross-site averages+quantiles for each timepoint +# Note I'm only calculating relative; +# not sure averaging raw diffs across sites is meaningful +# +# TODO this collapses ensemble and site 
both at once -- +# should it be rolled up stepwise instead? +mean_relative_diff_ci <- result_relative_diff |> + group_by(year, month, posix) |> + make_long_quantiles(c(-ens_num, -site)) +``` + +Now plotting. + +These site-by-site visualizations obviously won't scale to more than a handful of sites. +I'll choose 8 sites at random; might be better to choose top and bottom most/least responsive sites? + + +```{R} +dir.create("plots") + +selected_sites <- output_files |> + distinct(site) |> + slice_sample(n = 8) |> + pull(site) + +diff_plt <- diff_ci |> + filter( + grepl("^d_", variable), + site %in% selected_sites + ) |> + ggplot() + + aes(x=posix, y=mean) + + facet_grid(variable~site, scales = "free_y") + + geom_line() + + geom_line(aes(y=q5), lty = "dashed") + + geom_line(aes(y=q95), lty = "dashed") + + theme_bw() + + xlab(NULL) + + ylab("Change in soil C (kg C m-2) from baseline scenario") + + geom_hline(yintercept = 0, color="lightgreen") + +rel_plt <- diff_relative_ci |> + filter( + grepl("^d_", variable), + site %in% selected_sites + ) |> + ggplot() + + aes(x=posix, y=mean*100) + + facet_grid(variable~site, scales = "free_y") + + geom_line() + + geom_line(aes(y=q5*100), lty = "dashed") + + geom_line(aes(y=q95*100), lty = "dashed") + + theme_bw() + + xlab(NULL) + + ylab("Percent change in soil C from baseline scenario") + + geom_hline(yintercept = 0, color="lightgreen") + + +rel_mean_plt <- mean_relative_diff_ci |> + filter(grepl("^d_", variable)) |> + ggplot() + + aes(x=posix, y=mean*100) + + facet_wrap(~variable, scales = "free_y") + + geom_line() + + geom_line(aes(y=q5*100), lty = "dashed") + + geom_line(aes(y=q95*100), lty = "dashed") + + theme_bw() + + xlab(NULL) + + ylab("Percent change in soil C from baseline scenario") + + ggtitle("cross-site means") + + geom_hline(yintercept = 0, color="lightgreen") + + + +save_gg(diff_plt, "scenario_diffs") +save_gg(rel_plt, "scenario_pct_change") +save_gg(rel_mean_plt, "scenario_mean_pct_change") +``` + diff --git 
a/one_off_analyses/lebauer_agu_2025/template.xml b/one_off_analyses/lebauer_agu_2025/template.xml new file mode 100644 index 0000000..7fe17bc --- /dev/null +++ b/one_off_analyses/lebauer_agu_2025/template.xml @@ -0,0 +1,84 @@ + + + + + -1 + + + output + output/out + output/run + + + annual_crop + data_raw/pfts/annual_crop/post.distns.Rdata + data_raw/pfts/annual_crop/ + + + soil + data_raw/pfts/soil/ + + + + + TotSoilCarb + + + uniform + + + sampling + + + sampling + + + sampling + + + + + + + + 99000000003 + SIPNET + git + TRUE + sipnet.git + + + + + + + + + + RS_veg + poolinitcond + + + + + + + + + + + + + localhost + output/out + output/run + + + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: + + 1000 + + + diff --git a/tools/build_validation_siteinfo.R b/tools/build_validation_siteinfo.R new file mode 100755 index 0000000..14cde4c --- /dev/null +++ b/tools/build_validation_siteinfo.R @@ -0,0 +1,74 @@ +#!/usr/bin/env Rscript + +library(terra) +library(tidyverse) + +set.seed(20251029) + +# Validation data is primarily from Healthy Soil Program demonstration sites, +# and is used with site owner permission granted to CARB and CDFA for nonpublic +# research purposes only. Do not release data or publish in any form that makes +# datapoints identifiable. +# Contact chelsea.carey@arb.ca.gov for more information. +data_dir <- "data_raw/private/HSP/" +soc_file <- "Harmonized_Data_Croplands.csv" +mgmt_file <- "Harmonized_SiteMngmt_Croplands.csv" + +# can't use datapoints from before our simulations start +min_yr <- 2016 + +# TODO using 2018 for compatibility with field ids used in IC script; +# still need to harmonize IDs with those in data_raw/crop_map/ca_fields.gpkg +field_map <- "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb" + +# For first pass, selecting only the control/no-treatment plots. +# TODO: revisit this as we build more management into the workflow. 
# Control / untreated plots only, identified by project + plot ID.
site_ids <- read.csv(file.path(data_dir, mgmt_file)) |>
  filter(Treatment_Control %in% c("Control", "None")) |>
  distinct(ProjectName, BaseID)

# Unique sampling locations with SOC observations inside the simulation period.
site_locs <- read.csv(file.path(data_dir, soc_file)) |>
  filter(Year >= min_yr) |>
  # some IDs in site_locs contain spaces stripped in site_id; let's match
  mutate(BaseID = gsub("\\s+", "", BaseID)) |>
  distinct(ProjectName, BaseID, Latitude, Longitude) |>
  rename(lat = Latitude, lon = Longitude) |>
  # Explicit keys: a natural join is fragile if either table gains
  # columns later, and this also silences the "Joining with..." message.
  inner_join(site_ids, by = c("ProjectName", "BaseID"))

# Match each sampling location to its nearest DWR crop-map field polygon,
# carrying along the field ID and crop class for PFT assignment below.
dwr_fields <- terra::vect(field_map)
site_dwr_ids <- site_locs |>
  terra::vect(crs = "epsg:4326") |>
  terra::project(dwr_fields) |>
  terra::nearest(dwr_fields) |>
  as.data.frame() |>
  mutate(
    ProjectName = site_locs$ProjectName[from_id],
    BaseID = site_locs$BaseID[from_id],
    field_id = dwr_fields$UniqueID[to_id],
    crop_class = dwr_fields$SYMB_CLASS[to_id]
  )
# terra::nearest() is expected to return exactly one match per input point;
# fail loudly if that assumption breaks.
stopifnot(nrow(site_dwr_ids) == nrow(site_locs))

site_locs |>
  left_join(site_dwr_ids, by = c("ProjectName", "BaseID")) |>
  mutate(
    # Anonymized, stable site id: hash of project, plot, and coordinates
    # (validation data is nonpublic -- see header note).
    id = paste(ProjectName, BaseID, lat, lon) |>
      purrr::map_chr(rlang::hash),
    # Map DWR crop classes onto the two PFTs currently supported.
    pft = dplyr::case_when(
      crop_class %in% c("G", "F", "P", "T") ~ "grass",
      crop_class %in% c("D", "C", "V", "YP") ~ "temperate.deciduous",
      # TODO later: R = rice, T19 & T28 = woody berries
      TRUE ~ NA_character_
    )
  ) |>
  # TODO is this desirable?
+ # In production, may be better to complain if no PFT match + drop_na(pft) |> + # Temporary hack: + # Where multiple treatments share a location, + # use only one of them + group_by(lat, lon, pft) |> + slice_sample(n = 1) |> + ungroup() |> + select(id, field_id, lat, lon, site.pft = pft) |> + write.csv("validation_site_info.csv", row.names = FALSE) diff --git a/tools/run_CA_grid_ERA5_nc_extraction.R b/tools/run_CA_grid_ERA5_nc_extraction.R new file mode 100644 index 0000000..c0bf935 --- /dev/null +++ b/tools/run_CA_grid_ERA5_nc_extraction.R @@ -0,0 +1,42 @@ +#!/usr/bin/env Rscript + +# Extract 0.5-degree-gridded ERA5 met for all of California +# +# Ran on BU cluster as: +# qsub -pe omp 28 -l mem_per_core=4G -o prep_getERA5_CAgrid_met.log -j y \ +# run_CA_grid_ERA5_nc_extraction.R + +library(terra) + +print(paste("start at", Sys.time())) + + +ca_shp <- vect("~/cur/ccmmf/workflows/data_raw/ca_outline_shp/") + +ca_bboxgrid <- expand.grid( + lon = seq(from = -124.5, to = -114, by = 0.5), + lat = seq(from = 32.5, to = 42, by = 0.5) +) |> + mutate(id = paste0(lat, "N_", abs(lon), "W")) +ca_gridcell_ids <- ca_bboxgrid |> + vect(crs = "epsg:4326") |> + project(ca_shp) |> + buffer(27778) |> # 1/4 degree in meters + intersect(ca_shp) |> + _$id +ca_grid <- ca_bboxgrid |> + filter(id %in% ca_gridcell_ids) + +PEcAn.data.atmosphere::extract.nc.ERA5( + slat = ca_grid$lat, + slon = ca_grid$lon, + in.path = "/projectnb/dietzelab/dongchen/anchorSites/ERA5", + start_date = "2016-01-01", + end_date = "2024-12-31", + outfolder = "/projectnb/dietzelab/chrisb/ERA5_nc_CA_grid", + in.prefix = "ERA5_", + newsite = ca_grid$id, + ncores = 27 +) + +print(paste("done at", Sys.time()))