Skip to content

Commit

Permalink
temporary fix for corrupted eea_cell_codes
Browse files Browse the repository at this point in the history
  • Loading branch information
SanderDevisscher committed Dec 5, 2023
1 parent f31d742 commit 1c06bfa
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 23 deletions.
1 change: 1 addition & 0 deletions data/interim/corrupt_bl_eea_cell_codes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
year,eea_cell_code,classKey,cobs,cellcode_length,x,y,natura2000
60 changes: 37 additions & 23 deletions src/05_occurrence_indicators_preprocessing.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ data_file <- here::here(
)
taxa_df <-
read_tsv(data_file,
na = "",
guess_max = 5000
na = "",
guess_max = 5000
)
taxa_df <-
taxa_df %>%
Expand Down Expand Up @@ -309,10 +309,10 @@ df_xy <-
bind_cols(
tibble(
x = unlist(str_extract_all(unique(df$eea_cell_code),
pattern = "(?<=E)\\d+"
pattern = "(?<=E)\\d+"
)),
y = unlist(str_extract_all(unique(df$eea_cell_code),
pattern = "(?<=N)\\d+"
pattern = "(?<=N)\\d+"
))
) %>%
mutate_all(as.integer)
Expand All @@ -327,17 +327,31 @@ df_xy %>% head()

Do the same for baseline data:

```{r remove corrupted eea_cell_code}
# Temporary fix: baseline records whose `eea_cell_code` is not exactly 13
# characters long are treated as corrupted. They are saved to
# `data/interim/corrupt_bl_eea_cell_codes.csv` for inspection and then removed
# from `df_bl`. The helper column `cellcode_length` stays in `df_bl`.
df_bl <- df_bl %>%
  mutate(cellcode_length = nchar(eea_cell_code))

# Overview of the number of records per code length (missing lengths included)
table(df_bl$cellcode_length, useNA = "ifany")

# `filter()` drops rows for which the condition evaluates to NA, so records
# with a missing cell code (NA length) are selected explicitly: otherwise they
# would vanish from `df_bl` below without being reported in the corrupt file.
corrupt_bl_eea_cell_codes <- df_bl %>%
  filter(is.na(cellcode_length) | cellcode_length != 13)

# Build the output path from the project root with `here::here()`, as done for
# the other input/output files in this document, so the location does not
# depend on the working directory in use when the chunk is run or knitted.
write_csv(
  corrupt_bl_eea_cell_codes,
  here::here("data", "interim", "corrupt_bl_eea_cell_codes.csv")
)

# Keep only records with a well-formed (13 characters long) cell code
df_bl <- df_bl %>%
  filter(!is.na(cellcode_length), cellcode_length == 13)
```

```{r extract_x_y_baseline}
df_bl_xy <-
df_bl %>%
distinct(eea_cell_code) %>%
bind_cols(
tibble(
x = unlist(str_extract_all(unique(df_bl$eea_cell_code),
pattern = "(?<=E)\\d+"
pattern = "(?<=E)\\d+"
)),
y = unlist(str_extract_all(unique(df_bl$eea_cell_code),
pattern = "(?<=N)\\d+"
pattern = "(?<=N)\\d+"
))
) %>%
mutate_all(as.integer)
Expand All @@ -363,10 +377,10 @@ recent_alien_species <-
filter(first_observed >= year_of_introduction | is.na(first_observed)) %>%
# remove duplicates due to other columns as pathways
distinct(nubKey, first_observed, last_observed) %>%
# get classkey info and filter out taxa without data
inner_join(spec_names, by = c("nubKey" = "taxonKey")) %>%
distinct(
taxonKey = nubKey,
.data$canonicalName,
Expand All @@ -392,8 +406,8 @@ old_introductions_taxa <-
taxa_df %>%
filter(first_observed < year_of_introduction) %>%
anti_join(recent_alien_species %>%
select(taxonKey),
by = c("nubKey" = "taxonKey")
select(taxonKey),
by = c("nubKey" = "taxonKey")
) %>%
distinct(
nubKey,
Expand All @@ -418,16 +432,16 @@ We save them in file `taxa_introduced_in_BE_before_1950.tsv` in `data/output`:

```{r save_old_introduced_taxa_as_tsv_file}
write_tsv(old_introductions_taxa,
here::here(
"data",
"output",
paste0(
"taxa_introduced_in_BE_before_",
year_of_introduction,
".tsv"
)
),
na = ""
here::here(
"data",
"output",
paste0(
"taxa_introduced_in_BE_before_",
year_of_introduction,
".tsv"
)
),
na = ""
)
```

Expand Down Expand Up @@ -477,7 +491,7 @@ We add whether the grid cell intersects any of the Natura2000 Belgian protected
df <-
df %>%
left_join(df_prot_areas,
by = c("eea_cell_code" = "CELLCODE")
by = c("eea_cell_code" = "CELLCODE")
)
```

Expand All @@ -493,7 +507,7 @@ We do the same for baseline data:
df_bl <-
df_bl %>%
left_join(df_prot_areas,
by = c("eea_cell_code" = "CELLCODE")
by = c("eea_cell_code" = "CELLCODE")
)
```

Expand Down Expand Up @@ -546,7 +560,7 @@ make_time_series <- function(eea_cell_code, taxonKey, begin_year, last_year ) {
expand_grid(eea_cell_code = eea_cell_code,
taxonKey = taxonKey,
year = seq(from = begin_year, to = last_year))
}
# create timeseries slots
Expand Down

0 comments on commit 1c06bfa

Please sign in to comment.