The CDMConnector package provides tools for working OMOP Common Data Model (CDM) tables in a pipe friendly syntax. CDM table references are stored in a single compound object along with CDM specific metadata.
The main function provided by the package is
cdm_from_con
which creates a CDM connection object that can
be used with dplyr verbs. In the examples that we will use a duckdb database which is embedded in the
CDMConnector package.
library(CDMConnector)
<- DBI::dbConnect(duckdb::duckdb(), dbdir = eunomia_dir())
con <- cdm_from_con(con, cdm_schema = "main")
cdm
cdm#> # OMOP CDM reference (tbl_duckdb_connection)
#>
#> Tables: person, observation_period, visit_occurrence, visit_detail, condition_occurrence, drug_exposure, procedure_occurrence, measurement, observation, death, location, care_site, provider, drug_era, dose_era, condition_era, cdm_source, concept, vocabulary, concept_relationship, concept_synonym, concept_ancestor, drug_strength
Individual CDM table references can be accessed using $
and piped to dplyr verbs.
library(dplyr, warn.conflicts = FALSE)
$person %>%
cdmcount()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB 0.6.1 [root@Darwin 21.6.0:R 4.2.2//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpnQ23cR/dwtudgsr]
#> n
#> <dbl>
#> 1 2694
$drug_exposure %>%
cdmleft_join(cdm$concept, by = c("drug_concept_id" = "concept_id")) %>%
count(drug = concept_name, sort = TRUE)
#> # Source: SQL [?? x 2]
#> # Database: DuckDB 0.6.1 [root@Darwin 21.6.0:R 4.2.2//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpnQ23cR/dwtudgsr]
#> # Ordered by: desc(n)
#> drug n
#> <chr> <dbl>
#> 1 Acetaminophen 325 MG Oral Tablet 9365
#> 2 poliovirus vaccine, inactivated 7977
#> 3 tetanus and diphtheria toxoids, adsorbed, preservative free, for adult… 7430
#> 4 Aspirin 81 MG Oral Tablet 4380
#> 5 Amoxicillin 250 MG / Clavulanate 125 MG Oral Tablet 3851
#> 6 hepatitis A vaccine, adult dosage 3211
#> 7 Acetaminophen 160 MG Oral Tablet 2158
#> 8 zoster vaccine, live 2125
#> 9 Acetaminophen 21.7 MG/ML / Dextromethorphan Hydrobromide 1 MG/ML / dox… 1993
#> 10 hepatitis B vaccine, adult dosage 1916
#> # … with more rows
To send SQL to the CDM use the connection object. The cdm reference
also contains the connection as an attribute that can be accessed with
the dbcon
function.
::dbGetQuery(con, "select count(*) as person_count from main.person")
DBI#> person_count
#> 1 2694
# get the connection from a cdm object using the function `remote_con` from dbplyr
::dbGetQuery(dbplyr::remote_con(cdm$person), "select count(*) as person_count from main.person")
DBI#> person_count
#> 1 2694
To create SQL that can be executed across multiple database platforms the SqlRender package can be used.
<- SqlRender::translate("select count(*) as person_count from main.person",
sql targetDialect = dbms(con))
::dbGetQuery(con, sql) DBI
If you do not need references to all tables you can easily select
only a subset of tables to include in the cdm reference. The
select
argument of cdm_from_con
supports the
tidyselect
selection language and provides a new selection helper:
tbl_group
.
cdm_from_con(con, cdm_tables = starts_with("concept")) # tables that start with 'concept'
#> # OMOP CDM reference (tbl_duckdb_connection)
#>
#> Tables: concept, concept_class, concept_relationship, concept_synonym, concept_ancestor
cdm_from_con(con, cdm_tables = contains("era")) # tables that contain the substring 'era'
#> # OMOP CDM reference (tbl_duckdb_connection)
#>
#> Tables: drug_era, dose_era, condition_era
cdm_from_con(con, cdm_tables = tbl_group("vocab")) # pre-defined groups
#> # OMOP CDM reference (tbl_duckdb_connection)
#>
#> Tables: concept, vocabulary, domain, concept_class, concept_relationship, relationship, concept_synonym, concept_ancestor, source_to_concept_map, drug_strength
cdm_from_con(con, cdm_tables = c("person", "observation_period")) # character vector
#> # OMOP CDM reference (tbl_duckdb_connection)
#>
#> Tables: person, observation_period
cdm_from_con(con, cdm_tables = c(person, observation_period)) # bare names
#> # OMOP CDM reference (tbl_duckdb_connection)
#>
#> Tables: person, observation_period
cdm_from_con(con, cdm_tables = matches("person|period")) # regular expression
#> # OMOP CDM reference (tbl_duckdb_connection)
#>
#> Tables: person, observation_period, payer_plan_period
# If you are using a variable to hold selections then use the `all_of` selection helper
<- c("person", "observation_period")
tables_to_include cdm_from_con(con, cdm_tables = all_of(tables_to_include))
#> # OMOP CDM reference (tbl_duckdb_connection)
#>
#> Tables: person, observation_period
tbl_group
supports several subsets of the CDM: “all”,
“clinical”, “vocab”, “derived”, and “default”.
The default set of CDM tables included in a CDM object is:
tbl_group("default")
#> [1] "person" "observation_period" "visit_occurrence"
#> [4] "visit_detail" "condition_occurrence" "drug_exposure"
#> [7] "procedure_occurrence" "measurement" "observation"
#> [10] "death" "location" "care_site"
#> [13] "provider" "drug_era" "dose_era"
#> [16] "condition_era" "cdm_source" "concept"
#> [19] "vocabulary" "concept_relationship" "concept_synonym"
#> [22] "concept_ancestor" "drug_strength"
It is common to use one or more cohort tables along with the CDM. A cohort table has the following structure and can be created by CDMConnector or SQL or another package.
Creation of cohort tables is outside of the scope of the CDMConnector package. Cohort tables need to be a separate schema from the CDM tables where the user has both read and write access. Once the cohort table is created in the database it can be added to the cdm object as follows.
listTables(con, schema = "write_schema")
#> [1] "cohort"
<- cdm_from_con(con,
cdm cdm_tables = c("person", "observation_period"),
write_schema = "write_schema",
cohort_tables = "cohort")
$cohort
cdm#> # Source: table<write_schema.cohort> [2 x 4]
#> # Database: DuckDB 0.6.1 [root@Darwin 21.6.0:R 4.2.2//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpnQ23cR/dwtudgsr]
#> cohort_id subject_id cohort_start_date cohort_end_date
#> <int> <int> <date> <date>
#> 1 1 1 2023-01-23 2023-01-23
#> 2 1 2 2020-02-03 2020-11-04
There are two ways to extract subsets of the CDM.
collect
pulls data into R
stow
saves the cdm subset to a set of files on disk
in either parquet, feather, or csv format
<- cdm %>%
local_cdm collect()
# The cdm tables are now dataframes
$person[1:4, 1:4]
local_cdm#> # A tibble: 4 × 4
#> person_id gender_concept_id year_of_birth month_of_birth
#> <dbl> <dbl> <dbl> <dbl>
#> 1 6 8532 1963 12
#> 2 123 8507 1950 4
#> 3 129 8507 1974 10
#> 4 16 8532 1971 10
<- file.path(tempdir(), "tmp")
save_path dir.create(save_path)
%>%
cdm stow(path = save_path)
list.files(save_path)
#> [1] "cohort.parquet" "observation_period.parquet"
#> [3] "person.parquet"
stow
saves the cdm object as a set of files.
cdm_from_files
read the files back into R as a
cdm_reference object. The tables can be stored as R dataframes or Arrow
Tables. In both cases cdm tables can be manipulated with
dplyr
verbs.
<- cdm_from_files(save_path)
cdm
class(cdm$cohort)
#> [1] "tbl_df" "tbl" "data.frame"
$cohort %>%
cdmtally() %>%
pull(n)
#> [1] 2
<- cdm_from_files(save_path,
cdm as_data_frame = FALSE)
class(cdm$cohort)
#> [1] "Table" "ArrowTabular" "ArrowObject" "R6"
$cohort %>%
cdmnrow()
#> [1] 2
Close the database connection with dbDisconnect
. After
the connection is closed the cdm object can no longer be used.
::dbDisconnect(con, shutdown = TRUE) DBI
Sometimes you may need to delay the creation of a cdm connection or
create and close many connections during execution. CDMConnector
provides the ability to store connection information that can be passed
to dbConnect
to create a connection. The typical use case
is to create a new connection inside a function and then close the
connection before the function exits.
<- dbConnectDetails(duckdb::duckdb(), dbdir = eunomia_dir())
connection_details
<- function(connection_details) {
self_contained_query # create a new connection
<- DBI::dbConnect(connection_details)
con # close the connection before exiting
on.exit(DBI::dbDisconnect(con, shutdown = TRUE))
# use the connection
::dbGetQuery(con, "select count(*) as n from main.person")
DBI
}
self_contained_query(connection_details)
#> n
#> 1 2694
Since cdm object can include any subset of cdm tables it is important
for functions that take cdm objects as input to check that the expected
tables exists. The assert_tables
function provides a
checkmate style function that tests if the expected tables exist, have
the correct columns, and (optionally) are not empty. It can also be used
with checkmate assert collections.
library(checkmate)
<- DBI::dbConnect(duckdb::duckdb(), dbdir = eunomia_dir())
con
assertTables(cdm_from_con(con, cdm_tables = "drug_era"), tables = c("person"))
#> Error in `assertTables()`:
#> ! - person table not found in cdm object
# add missing table error to collection
<- checkmate::makeAssertCollection()
err assertTables(cdm_from_con(con, cdm_tables = "drug_era"), tables = c("person"), add = err)
$getMessages()
err#> [1] "- person table not found in cdm object"
assert_tables
can be used in functions that accept
cdm_reference
objects as a parameter.
<- function(cdm) {
countDrugsByGender assertTables(cdm, tables = c("person", "drug_era"), empty.ok = FALSE)
$person %>%
cdm::inner_join(cdm$drug_era, by = "person_id") %>%
dplyr::count(.data$gender_concept_id, .data$drug_concept_id) %>%
dplyr::collect()
dplyr
}
countDrugsByGender(cdm_from_con(con, cdm_tables = "person"))
#> Error in `assertTables()`:
#> ! - drug_era table not found in cdm object
::dbExecute(con, "delete from drug_era")
DBI#> [1] 52508
countDrugsByGender(cdm_from_con(con))
#> Error in `assertTables()`:
#> ! - drug_era cdm table is empty
::dbDisconnect(con, shutdown = TRUE) DBI
Camel case aliases are provided for many functions to fit both
snake_case
and camelCase
programming
styles.