temporary fix for corrupted eea_cell_codes

#46 linked to trias-project/indicators#123
inbo · Dec 5, 2023 · 1c06bfa · 1c06bfa
1 parent f31d742
commit 1c06bfa
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 23 deletions.
diff --git a/data/interim/corrupt_bl_eea_cell_codes.csv b/data/interim/corrupt_bl_eea_cell_codes.csv
@@ -0,0 +1 @@
+year,eea_cell_code,classKey,cobs,cellcode_length,x,y,natura2000
diff --git a/src/05_occurrence_indicators_preprocessing.Rmd b/src/05_occurrence_indicators_preprocessing.Rmd
@@ -127,8 +127,8 @@ data_file <- here::here(
 )
 taxa_df <-
   read_tsv(data_file,
-    na = "",
-    guess_max = 5000
+           na = "",
+           guess_max = 5000
   )
 taxa_df <-
   taxa_df %>%
@@ -309,10 +309,10 @@ df_xy <-
   bind_cols(
     tibble(
       x = unlist(str_extract_all(unique(df$eea_cell_code),
-        pattern = "(?<=E)\\d+"
+                                 pattern = "(?<=E)\\d+"
       )),
       y = unlist(str_extract_all(unique(df$eea_cell_code),
-        pattern = "(?<=N)\\d+"
+                                 pattern = "(?<=N)\\d+"
       ))
     ) %>%
       mutate_all(as.integer)
@@ -327,17 +327,31 @@ df_xy %>% head()
 
 Do the same for baseline data:
 
+```{r remove corrupted eea_cell_code}
+# Only 13-character cell codes are kept; other lengths or NA are set aside
+df_bl <- df_bl %>%
+  mutate(cellcode_length = nchar(eea_cell_code))
+table(df_bl$cellcode_length, useNA = "ifany")
+# Log rejected rows; is.na() is needed because filter() drops NA comparisons
+corrupt_bl_eea_cell_codes <- df_bl %>%
+  filter(is.na(cellcode_length) | cellcode_length != 13) %>%
+  write_csv(here::here("data", "interim", "corrupt_bl_eea_cell_codes.csv"))
+# Continue with the rows whose cell code has the expected length
+df_bl <- df_bl %>%
+  filter(cellcode_length == 13)
+```
+
 ```{r extract_x_y_baseline}
 df_bl_xy <-
   df_bl %>%
   distinct(eea_cell_code) %>%
   bind_cols(
     tibble(
       x = unlist(str_extract_all(unique(df_bl$eea_cell_code),
-        pattern = "(?<=E)\\d+"
+                                 pattern = "(?<=E)\\d+"
       )),
       y = unlist(str_extract_all(unique(df_bl$eea_cell_code),
-        pattern = "(?<=N)\\d+"
+                                 pattern = "(?<=N)\\d+"
       ))
     ) %>%
       mutate_all(as.integer)
@@ -363,10 +377,10 @@ recent_alien_species <-
   filter(first_observed >= year_of_introduction | is.na(first_observed)) %>%
   # remove duplicates due to other columns as pathways
   distinct(nubKey, first_observed, last_observed) %>%
-
+  
   # get classkey info and filter out taxa without data
   inner_join(spec_names, by = c("nubKey" = "taxonKey")) %>%
-
+  
   distinct(
     taxonKey = nubKey,
     .data$canonicalName,
@@ -392,8 +406,8 @@ old_introductions_taxa <-
   taxa_df %>%
   filter(first_observed < year_of_introduction) %>%
   anti_join(recent_alien_species %>%
-    select(taxonKey),
-  by = c("nubKey" = "taxonKey")
+              select(taxonKey),
+            by = c("nubKey" = "taxonKey")
   ) %>%
   distinct(
     nubKey,
@@ -418,16 +432,16 @@ We save them in file `taxa_introduced_in_BE_before_1950.tsv` in  `data/output`:
 
 ```{r save_old_introduced_taxa_as_tsv_file}
 write_tsv(old_introductions_taxa,
-  here::here(
-    "data",
-    "output",
-    paste0(
-      "taxa_introduced_in_BE_before_",
-      year_of_introduction,
-      ".tsv"
-    )
-  ),
-  na = ""
+          here::here(
+            "data",
+            "output",
+            paste0(
+              "taxa_introduced_in_BE_before_",
+              year_of_introduction,
+              ".tsv"
+            )
+          ),
+          na = ""
 )
 ```
 
@@ -477,7 +491,7 @@ We add whether the grid cell intersects any of the Natura2000 Belgian protected
 df <-
   df %>%
   left_join(df_prot_areas,
-    by = c("eea_cell_code" = "CELLCODE")
+            by = c("eea_cell_code" = "CELLCODE")
   )
 ```
 
@@ -493,7 +507,7 @@ We do the same for baseline data:
 df_bl <-
   df_bl %>%
   left_join(df_prot_areas,
-    by = c("eea_cell_code" = "CELLCODE")
+            by = c("eea_cell_code" = "CELLCODE")
   )
 ```
 
@@ -546,7 +560,7 @@ make_time_series <- function(eea_cell_code, taxonKey, begin_year, last_year ) {
   expand_grid(eea_cell_code = eea_cell_code,
               taxonKey = taxonKey,
               year = seq(from = begin_year, to = last_year))
-
+  
 }
 
 # create timeseries slots
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		year,eea_cell_code,classKey,cobs,cellcode_length,x,y,natura2000