Skip to content

Commit

Permalink
fix: Keep value_as_concept_id in the summary stats table (#122)
Browse files Browse the repository at this point in the history
* Use `test_data/internal` to store monthly test data

* Use `test_data/internal` as input for dev data

* Regenerate dev data

* RStudio project now has a `ProjectId`

* Keep the `value_as_concept_id` column in the summary stats

Helps avoid duplicate rows when there is no matching concept for
the `value_as_concept_id`

* Fix typo

* Update test snapshot
  • Loading branch information
milanmlft authored Jan 22, 2025
1 parent 5b0a2c0 commit ba46c7d
Show file tree
Hide file tree
Showing 15 changed files with 65 additions and 61 deletions.
42 changes: 22 additions & 20 deletions app/inst/dev_data/omopcat_monthly_counts.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
concept_id,concept_name,date_year,date_quarter,record_count,person_count,records_per_person
3003573,C Ag [Presence] on Red Blood Cells,2020,2,2.5,2.5,2.5
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,2019,3,2.5,2.5,2.5
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,2020,3,2.5,2.5,2.5
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,2021,1,2.5,2.5,2.5
concept_id,concept_name,date_year,date_month,record_count,person_count,records_per_person
3003573,C Ag [Presence] on Red Blood Cells,2020,5,2.5,2.5,2.5
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,2019,8,2.5,2.5,2.5
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,2020,8,2.5,2.5,2.5
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,2021,2,2.5,2.5,2.5
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,2021,4,2.5,2.5,2.5
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,2020,4,2.5,2.5,2.5
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,2021,3,2.5,2.5,2.5
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,2021,4,2.5,2.5,2.5
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,2023,2,2.5,2.5,2.5
4092281,Ex-cigarette smoker,2022,2,2.5,2.5,2.5
4276526,Cigarette smoker,2021,2,2.5,2.5,2.5
4276526,Cigarette smoker,2022,4,2.5,2.5,2.5
4298794,Smoker,2019,1,2.5,2.5,2.5
4298794,Smoker,2021,3,2.5,2.5,2.5
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,2019,3,2.5,2.5,2.5
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,2019,2,2.5,2.5,2.5
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,2019,4,2.5,2.5,2.5
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,2020,2,2.5,2.5,2.5
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,2021,4,2.5,2.5,2.5
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,2021,6,2.5,2.5,2.5
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,2021,11,2.5,2.5,2.5
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,2020,11,2.5,2.5,2.5
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,2020,12,2.5,2.5,2.5
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,2021,9,2.5,2.5,2.5
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,2021,10,2.5,2.5,2.5
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,2021,12,2.5,2.5,2.5
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,2023,4,2.5,2.5,2.5
4092281,Ex-cigarette smoker,2022,6,2.5,2.5,2.5
4276526,Cigarette smoker,2021,5,2.5,2.5,2.5
4276526,Cigarette smoker,2022,12,2.5,2.5,2.5
4298794,Smoker,2019,2,2.5,2.5,2.5
4298794,Smoker,2021,7,2.5,2.5,2.5
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,2019,9,2.5,2.5,2.5
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,2019,5,2.5,2.5,2.5
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,2019,10,2.5,2.5,2.5
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,2020,6,2.5,2.5,2.5
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,2021,10,2.5,2.5,2.5
34 changes: 17 additions & 17 deletions app/inst/dev_data/omopcat_summary_stats.csv
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
concept_id,concept_name,summary_attribute,value_as_number,value_as_string
3003573,C Ag [Presence] on Red Blood Cells,frequency,2.5,Not present
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,mean,123.66666666666667,NA
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,sd,9.993331109628395,NA
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,frequency,2.5,NA
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,frequency,2.5,NA
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,mean,128.83333333333334,NA
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,sd,23.65938855225694,NA
4092281,Ex-cigarette smoker,frequency,2.5,Well nourished
4276526,Cigarette smoker,mean,12.799999999999999,NA
4276526,Cigarette smoker,sd,3.469870314579495,NA
4298794,Smoker,mean,5.666666666666667,NA
4298794,Smoker,sd,1.1547005383792517,NA
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,mean,0.6666666666666666,NA
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,sd,0,NA
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,mean,132.8,NA
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,sd,21.25323504786977,NA
concept_id,concept_name,summary_attribute,value_as_number,value_as_concept_id,value_as_string
3003573,C Ag [Presence] on Red Blood Cells,frequency,2.5,45878588,Not present
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,mean,123.66666666666667,NA,NA
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,sd,9.993331109628395,NA,NA
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,frequency,2.5,1633781,NA
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,frequency,2.5,1635564,NA
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,mean,128.83333333333334,NA,NA
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,sd,23.65938855225694,NA,NA
4092281,Ex-cigarette smoker,frequency,2.5,4086518,Well nourished
4276526,Cigarette smoker,mean,12.799999999999999,NA,NA
4276526,Cigarette smoker,sd,3.469870314579495,NA,NA
4298794,Smoker,mean,5.666666666666667,NA,NA
4298794,Smoker,sd,1.1547005383792517,NA,NA
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,mean,0.6666666666666666,NA,NA
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,sd,0,NA,NA
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,mean,132.8,NA,NA
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,sd,21.25323504786977,NA,NA
1 change: 1 addition & 0 deletions app/omopcat.Rproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Version: 1.0
ProjectId: 2afffd4f-205d-4d1b-9fea-8d996c5e2b91

RestoreWorkspace: No
SaveWorkspace: No
Expand Down
34 changes: 17 additions & 17 deletions app/tests/testthat/_snaps/utils_get_data/summary_stats.csv
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
concept_id,concept_name,summary_attribute,value_as_number,value_as_string
3003573,C Ag [Presence] on Red Blood Cells,frequency,2.5,Not present
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,mean,123.66666666666669,NA
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,sd,9.993331109628397,NA
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,frequency,2.5,NA
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,frequency,2.5,NA
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,mean,128.83333333333334,NA
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,sd,23.65938855225694,NA
4092281,Ex-cigarette smoker,frequency,2.5,Well nourished
4276526,Cigarette smoker,mean,12.8,NA
4276526,Cigarette smoker,sd,3.469870314579495,NA
4298794,Smoker,mean,5.666666666666667,NA
4298794,Smoker,sd,1.1547005383792517,NA
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,mean,0.6666666666666666,NA
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,sd,0,NA
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,mean,132.8,NA
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,sd,21.25323504786977,NA
concept_id,concept_name,summary_attribute,value_as_number,value_as_concept_id,value_as_string
3003573,C Ag [Presence] on Red Blood Cells,frequency,2.5,45878588,Not present
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,mean,123.66666666666669,NA,NA
3005673,Hemoglobin A1c/Hemoglobin.total in Blood by HPLC,sd,9.993331109628397,NA,NA
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,frequency,2.5,1633781,NA
3021714,Do sup(a) Ag [Presence] on Red Blood Cells,frequency,2.5,1635564,NA
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,mean,128.83333333333334,NA,NA
3022250,Lactate dehydrogenase [Enzymatic activity/volume] in Serum or Plasma by Lactate to pyruvate reaction,sd,23.65938855225694,NA,NA
4092281,Ex-cigarette smoker,frequency,2.5,4086518,Well nourished
4276526,Cigarette smoker,mean,12.8,NA,NA
4276526,Cigarette smoker,sd,3.469870314579495,NA,NA
4298794,Smoker,mean,5.666666666666667,NA,NA
4298794,Smoker,sd,1.1547005383792517,NA,NA
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,mean,0.6666666666666666,NA,NA
40760007,HIV 1+2 Ab+HIV1 p24 Ag [Presence] in Serum or Plasma by Immunoassay,sd,0,NA,NA
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,mean,132.8,NA,NA
40762352,Hemoglobin A1c/Hemoglobin.total in Blood by IFCC protocol,sd,21.25323504786977,NA,NA
Binary file modified data/test_data/internal/omopcat_concepts.parquet
Binary file not shown.
Binary file modified data/test_data/internal/omopcat_monthly_counts.parquet
Binary file not shown.
Binary file modified data/test_data/internal/omopcat_summary_stats.parquet
Binary file not shown.
Binary file removed data/test_data/omopcat_concepts.parquet
Binary file not shown.
Binary file removed data/test_data/omopcat_monthly_counts.parquet
Binary file not shown.
Binary file removed data/test_data/omopcat_summary_stats.parquet
Binary file not shown.
2 changes: 1 addition & 1 deletion preprocessing/R/summary_stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ generate_summary_stats <- function(cdm, threshold, replacement) {
dplyr::rename(value_as_string = "concept_name") |>
# Then we get the names for the main concept_ids
dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |>
dplyr::select("concept_id", "concept_name", !"value_as_concept_id")
dplyr::select("concept_id", "concept_name", dplyr::everything())
}

#' Calculate summary statistics for an OMOP table
Expand Down
1 change: 1 addition & 0 deletions preprocessing/omopcat.preprocessing.Rproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Version: 1.0
ProjectId: 5a758118-3fd4-4083-bcab-ef118645ab42

RestoreWorkspace: No
SaveWorkspace: No
Expand Down
6 changes: 3 additions & 3 deletions preprocessing/tests/testthat/test-summary_stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ test_that("generate_summary_stats works on a CDM object", {
summary_stats <- generate_summary_stats(mock_cdm, threshold = 0, replacement = 0)
expect_s3_class(summary_stats, "data.frame")
expect_named(summary_stats, c(
"concept_id", "concept_name", "summary_attribute", "value_as_number", "value_as_string"
"concept_id", "concept_name",
"summary_attribute", "value_as_number",
"value_as_concept_id", "value_as_string"
))
})

Expand All @@ -22,7 +24,6 @@ test_that("calculate_summary_stats produces the expected results", {
threshold = 0, replacement = 0
)
expect_s3_class(res, "data.frame")
expect_named(res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id"))
expect_equal(nrow(res), 5)
mean <- res[res$summary_attribute == "mean", ][["value_as_number"]]
sd <- res[res$summary_attribute == "sd", ][["value_as_number"]]
Expand Down Expand Up @@ -68,7 +69,6 @@ test_that("calculate_summary_stats works with a database-stored table", {
)

expect_s3_class(db_res, "data.frame")
expect_named(db_res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id"))
expect_identical(db_res, ref)
expect_type(db_res$value_as_number, "double")
})
2 changes: 1 addition & 1 deletion scripts/create_dev_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ suppressPackageStartupMessages({
library(dplyr)
})

data_path <- here::here("data/test_data")
data_path <- here::here("data/test_data/internal")
stopifnot(dir.exists(data_path))
out_path <- here::here("app/inst/dev_data")
stopifnot(dir.exists(out_path))
Expand Down
4 changes: 2 additions & 2 deletions scripts/create_test_data.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generates the parquet files in data/test_data/ by running the preprocessing pipeilne
# Generates the parquet files in data/test_data/internal/ by running the preprocessing pipeline
# on the test database located at data-raw/test_db/eunomia
withr::local_envvar(
ENV = "test",
Expand All @@ -9,7 +9,7 @@ withr::local_envvar(
LOW_FREQUENCY_REPLACEMENT = 2.5
)

out_path <- here::here("data/test_data")
out_path <- here::here("data/test_data/internal")
omopcat.preprocessing::preprocess(out_path)

cli::cli_alert_success("Test data written to {out_path}")

0 comments on commit ba46c7d

Please sign in to comment.