From ef3e913b9618c69a6de8d7b55c49ceb445a55607 Mon Sep 17 00:00:00 2001
From: Carl Boettiger
Date: Fri, 9 Aug 2024 20:15:56 +0000
Subject: [PATCH] fresh rewrite as hf

---
Review notes (commentary placed between the "---" line and the diffstat):
- Reflowed from a copy in which every line break and all indentation had
  been collapsed. Indentation, and the position of the blank context lines
  in the import_db.R hunks, are reconstructed and should be checked against
  the tree before applying.
- The two added files differ from the original commit; their `index` lines
  still carry the original blob ids.
- huggingface.R: available_releases() now takes `branch` (it was an undefined
  name) and drops entries without a version; ".parquet" is matched literally
  at the end of the file name; fb_table() rejects unknown table names.
- write-prov.R: the unused `urls <- paste0(..., tables, ".parquet")` line is
  dropped (`tables` is not defined in that file); inconsistent release
  metadata is flagged with NOTE(review) comments, values unchanged.

 data-raw/huggingface.R | 122 +++++++++++++++++++++++++++++++++++++++++
 data-raw/import_db.R   |  60 --------------------
 data-raw/write-prov.R  |  76 ++++++++++++++++++++++++++
 3 files changed, 198 insertions(+), 60 deletions(-)
 create mode 100644 data-raw/huggingface.R
 create mode 100644 data-raw/write-prov.R

diff --git a/data-raw/huggingface.R b/data-raw/huggingface.R
new file mode 100644
index 0000000..ccc3f89
--- /dev/null
+++ b/data-raw/huggingface.R
@@ -0,0 +1,122 @@
+# Helpers for locating the FishBase / SeaLifeBase parquet snapshots hosted on
+# Hugging Face and for reading individual tables from them.
+
+# Base URL of the Hugging Face hub, interpolated into the URL templates below.
+hf <- "https://huggingface.co"
+
+
+# List the files under `path` in a Hugging Face repository.
+#
+# Reads the JSON served at `{hf}/api/{repo}/tree/{branch}/{path}`, takes the
+# `path` field of every entry and returns one `{hf}/{repo}/resolve/{branch}/...`
+# URL per entry.
+#
+# path:   directory inside the repository.
+# repo:   repository id, including the `datasets/` prefix.
+# branch: branch of the repository to list.
+hf_urls <- function(path = "data/fb/v24.07/parquet",
+                    repo = "datasets/cboettig/fishbase",
+                    branch = "main") {
+  paths <-
+    glue::glue("{hf}/api/{repo}/tree/{branch}/{path}") |>
+    jsonlite::read_json() |>
+    purrr::map_chr("path")
+
+  glue::glue("{hf}/{repo}/resolve/{branch}/{path}", path = paths)
+}
+
+
+# Map a server name onto the short code used in the repository layout
+# ("fishbase" -> "fb", "sealifebase" -> "slb"). `match.arg()` picks "fishbase"
+# when the default vector is passed through unchanged.
+server_code <- function(server = c("fishbase", "sealifebase")) {
+  server <- match.arg(server)
+  switch(server,
+    "fishbase" = "fb",
+    "sealifebase" = "slb"
+  )
+}
+
+# Release versions available for `server`, e.g. "23.05".
+#
+# Lists the entries under `data/{fb|slb}` and extracts the `NN.NN` version
+# that follows "/v" in each entry's path. Entries without such a component
+# are dropped rather than returned as NA.
+#
+# `branch` has to be an argument: the URL template interpolates it and this
+# file defines no global of that name. It defaults to "main", as in hf_urls().
+available_releases <- function(server = c("fishbase", "sealifebase"),
+                               branch = "main") {
+  sv <- server_code(server)
+  repo <- "datasets/cboettig/fishbase"
+  path <- glue::glue("data/{sv}")
+
+  versions <-
+    glue::glue("{hf}/api/{repo}/tree/{branch}/{path}") |>
+    jsonlite::read_json() |>
+    purrr::map_chr("path") |>
+    stringr::str_extract("\\/v(\\d{2}\\.\\d{2})", 1)
+
+  versions[!is.na(versions)]
+}
+# "23.05" "23.01" "21.06" "19.04"
+#url |> duckdbfs::open_dataset()
+
+# URLs of the parquet files for one release of `server`.
+#
+# version: a release such as "24.07", or "latest" for the highest version
+#          returned by available_releases().
+fb_urls <- function(server = c("fishbase", "sealifebase"),
+                    version = "latest") {
+  if (version == "latest") {
+    releases <- available_releases(server)
+    if (length(releases) == 0) {
+      stop("No releases found for the requested server.", call. = FALSE)
+    }
+    # Versions are fixed-width digit strings, so the string maximum is the
+    # numerically highest one.
+    version <- max(releases)
+  }
+
+  sv <- server_code(server)
+
+  path <- glue::glue("data/{sv}/v{version}/parquet")
+  hf_urls(path)
+}
+
+# Table names in a release: the file name of every parquet URL with its
+# `.parquet` extension removed.
+fb_tables <- function(server = c("fishbase", "sealifebase"),
+                      version = "latest") {
+  fb_urls(server, version) |>
+    basename() |>
+    stringr::str_remove("\\.parquet$")
+}
+
+# Open a table from a release.
+#
+# tbl:     table name, as listed by fb_tables().
+# collect: when TRUE (the default) the opened dataset is passed through
+#          dplyr::collect() before being returned.
+fb_table <- function(tbl,
+                     server = c("fishbase", "sealifebase"),
+                     version = "latest",
+                     collect = TRUE) {
+  urls <- fb_urls(server, version)
+  names(urls) <- urls |> basename() |> stringr::str_remove("\\.parquet$")
+
+  # Indexing by a name that is not present yields NA; stop with a clear
+  # message instead of handing that to open_dataset().
+  unknown <- setdiff(tbl, names(urls))
+  if (length(unknown) > 0) {
+    stop("Unknown table(s): ", paste(unknown, collapse = ", "), call. = FALSE)
+  }
+  url <- urls[tbl]
+
+  out <- duckdbfs::open_dataset(url)
+  if (collect) {
+    out <- dplyr::collect(out)
+  }
+
+  out
+}
diff --git a/data-raw/import_db.R b/data-raw/import_db.R
index 13a0770..e6ac6d6 100644
--- a/data-raw/import_db.R
+++ b/data-raw/import_db.R
@@ -21,32 +21,6 @@ for(table in tables){
   #arrow::write_parquet(df, paste0(table,".parquet"))
 }
 
-fb.prov <- "inst/prov/fb.prov"
-
-prov::write_prov(
-  data_out = paste0("https://github.com/cboettig/rfishbase_board/raw/main/fb_parquet_2023-05/",
-                    basename(fs::dir_ls("../rfishbase_board/fb_parquet_2023-05/"))),
-  title = "FishBase Snapshots v23.05",
-  description = "Parquet formatted Snapshots of FishBase Tables, distributed by rOpenSci",
-  license = "https://creativecommons.org/licenses/by-nc/3.0/",
-  creator = list("type" = "Organization", name = "FishBase.org", "@id" = "https://fishbase.org"),
-  version = "23.05",
-  issued = "2023-02-01",
-  prov=fb.prov,
-  schema="http://schema.org",
-  append=TRUE)
-
-fs::file_copy(fb.prov, "fb_prov.json")
-fs::file_copy("fb_prov.json", fb.prov, overwrite = TRUE)
-
-jsonld::jsonld_frame(fb.prov,
-'{
-  "@context": "http://schema.org/",
-  "@type": "Dataset"
-}') |>
-  readr::write_lines(fb.prov)
-
-
 
 
 
@@ -64,37 +38,3 @@ for(table in tables){
 }
 
 
-urls <- paste0("https://github.com/cboettig/rfishbase_board/raw/main/slb_parquet_2023-01/", tables, ".parquet")
-
-
-slb.prov <- "inst/prov/slb.prov"
-
-prov::write_prov(
-  data_out = paste0("https://github.com/cboettig/rfishbase_board/raw/main/slb_parquet_2023-05/",
-                    basename(fs::dir_ls("../rfishbase_board/slb_parquet_2023-05/"))),
-  title = "SeaLifeBase Snapshots v23.05",
-  description = "Parquet formatted Snapshots of FishBase Tables, distributed by rOpenSci",
-  license = "https://creativecommons.org/licenses/by-nc/3.0/",
-  creator = list("type" = "Organization", name = "FishBase.org", "@id" = "https://fishbase.org"),
-  version = "23.05",
-  issued = "2023-02-01",
-  prov=slb.prov,
-  schema="http://schema.org",
-  append=TRUE)
-
-fs::file_copy(slb.prov, "slb_prov.json")
-#fs::file_copy("slb_prov.json", slb.prov, overwrite = TRUE)
-
-jsonld::jsonld_frame(slb.prov,
-                     '{
-  "@context": "http://schema.org/",
-  "@type": "Dataset"
-}') |>
-  readr::write_lines(slb.prov)
-
-
-
-#mc("cp -r /home/cboettig/cboettig/rfishbase_board/fb_parquet_2023-01 thelio/shared-data/fishbase/")
-#mc("cp -r slb_parquet_2023-01 thelio/shared-data/fishbase/")
-
-
diff --git a/data-raw/write-prov.R b/data-raw/write-prov.R
new file mode 100644
index 0000000..97061ee
--- /dev/null
+++ b/data-raw/write-prov.R
@@ -0,0 +1,76 @@
+# Write provenance records for the FishBase and SeaLifeBase parquet snapshots.
+# For each database, prov::write_prov() is called with append = TRUE on the
+# file under inst/prov/; that file is then passed through
+# jsonld::jsonld_frame() with a schema.org "Dataset" frame and written back.
+
+fb.prov <- "inst/prov/fb.prov"
+
+# NOTE(review): the values below disagree with one another -- the URL prefix
+# says 2023-05, the local directory 2023-07, the title v24.07 and `version`
+# "23.05". Confirm which release this record describes before running.
+prov::write_prov(
+  data_out = paste0(
+    "https://github.com/cboettig/rfishbase_board/raw/main/fb_parquet_2023-05/",
+    basename(fs::dir_ls("../rfishbase_board/fb_parquet_2023-07/"))
+  ),
+  title = "FishBase Snapshots v24.07",
+  description = "Parquet formatted Snapshots of FishBase Tables, distributed by rOpenSci",
+  license = "https://creativecommons.org/licenses/by-nc/3.0/",
+  creator = list("type" = "Organization", name = "FishBase.org", "@id" = "https://fishbase.org"),
+  version = "23.05",
+  issued = "2023-02-01",
+  prov = fb.prov,
+  schema = "http://schema.org",
+  append = TRUE
+)
+
+# Keep a copy of the record in the working directory; the second call copies
+# that file back over the original.
+fs::file_copy(fb.prov, "fb_prov.json")
+fs::file_copy("fb_prov.json", fb.prov, overwrite = TRUE)
+
+jsonld::jsonld_frame(
+  fb.prov,
+  '{
+  "@context": "http://schema.org/",
+  "@type": "Dataset"
+}'
+) |>
+  readr::write_lines(fb.prov)
+
+
+slb.prov <- "inst/prov/slb.prov"
+
+# NOTE(review): `description` and `creator` name FishBase in this SeaLifeBase
+# record -- confirm that is intended.
+prov::write_prov(
+  data_out = paste0(
+    "https://github.com/cboettig/rfishbase_board/raw/main/slb_parquet_2023-05/",
+    basename(fs::dir_ls("../rfishbase_board/slb_parquet_2023-05/"))
+  ),
+  title = "SeaLifeBase Snapshots v23.05",
+  description = "Parquet formatted Snapshots of FishBase Tables, distributed by rOpenSci",
+  license = "https://creativecommons.org/licenses/by-nc/3.0/",
+  creator = list("type" = "Organization", name = "FishBase.org", "@id" = "https://fishbase.org"),
+  version = "23.05",
+  issued = "2023-02-01",
+  prov = slb.prov,
+  schema = "http://schema.org",
+  append = TRUE
+)
+
+fs::file_copy(slb.prov, "slb_prov.json")
+#fs::file_copy("slb_prov.json", slb.prov, overwrite = TRUE)
+
+jsonld::jsonld_frame(
+  slb.prov,
+  '{
+  "@context": "http://schema.org/",
+  "@type": "Dataset"
+}'
+) |>
+  readr::write_lines(slb.prov)
+
+
+#mc("cp -r /home/cboettig/cboettig/rfishbase_board/fb_parquet_2023-01 thelio/shared-data/fishbase/")
+#mc("cp -r slb_parquet_2023-01 thelio/shared-data/fishbase/")