Getting Started

The CDMConnector package provides tools for working OMOP Common Data Model (CDM) tables in a pipe friendly syntax. CDM table references are stored in a single compound object along with CDM specific metadata.

The main function provided by the package is cdm_from_con which creates a CDM connection object that can be used with dplyr verbs. In the examples that we will use a duckdb database which is embedded in the CDMConnector package.

library(CDMConnector)

con <- DBI::dbConnect(duckdb::duckdb(), dbdir = eunomia_dir())
cdm <- cdm_from_con(con, cdm_schema = "main")
cdm
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: person, observation_period, visit_occurrence, visit_detail, condition_occurrence, drug_exposure, procedure_occurrence, measurement, observation, death, location, care_site, provider, drug_era, dose_era, condition_era, cdm_source, concept, vocabulary, concept_relationship, concept_synonym, concept_ancestor, drug_strength

Individual CDM table references can be accessed using $ and piped to dplyr verbs.

library(dplyr, warn.conflicts = FALSE)
cdm$person %>% 
  count()
#> # Source:   SQL [1 x 1]
#> # Database: DuckDB 0.6.1 [root@Darwin 21.6.0:R 4.2.2//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpnQ23cR/dwtudgsr]
#>       n
#>   <dbl>
#> 1  2694

cdm$drug_exposure %>% 
  left_join(cdm$concept, by = c("drug_concept_id" = "concept_id")) %>% 
  count(drug = concept_name, sort = TRUE)
#> # Source:     SQL [?? x 2]
#> # Database:   DuckDB 0.6.1 [root@Darwin 21.6.0:R 4.2.2//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpnQ23cR/dwtudgsr]
#> # Ordered by: desc(n)
#>    drug                                                                        n
#>    <chr>                                                                   <dbl>
#>  1 Acetaminophen 325 MG Oral Tablet                                         9365
#>  2 poliovirus vaccine, inactivated                                          7977
#>  3 tetanus and diphtheria toxoids, adsorbed, preservative free, for adult…  7430
#>  4 Aspirin 81 MG Oral Tablet                                                4380
#>  5 Amoxicillin 250 MG / Clavulanate 125 MG Oral Tablet                      3851
#>  6 hepatitis A vaccine, adult dosage                                        3211
#>  7 Acetaminophen 160 MG Oral Tablet                                         2158
#>  8 zoster vaccine, live                                                     2125
#>  9 Acetaminophen 21.7 MG/ML / Dextromethorphan Hydrobromide 1 MG/ML / dox…  1993
#> 10 hepatitis B vaccine, adult dosage                                        1916
#> # … with more rows

To send SQL to the CDM use the connection object. The cdm reference also contains the connection as an attribute that can be accessed with the dbcon function.

DBI::dbGetQuery(con, "select count(*) as person_count from main.person")
#>   person_count
#> 1         2694

# get the connection from a cdm object using the function `remote_con` from dbplyr
DBI::dbGetQuery(dbplyr::remote_con(cdm$person), "select count(*) as person_count from main.person")
#>   person_count
#> 1         2694

To create SQL that can be executed across multiple database platforms the SqlRender package can be used.

sql <- SqlRender::translate("select count(*) as person_count from main.person",
                            targetDialect = dbms(con))
DBI::dbGetQuery(con, sql)

Select a subset of CDM tables

If you do not need references to all tables you can easily select only a subset of tables to include in the cdm reference. The select argument of cdm_from_con supports the tidyselect selection language and provides a new selection helper: tbl_group.

cdm_from_con(con, cdm_tables = starts_with("concept")) # tables that start with 'concept'
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: concept, concept_class, concept_relationship, concept_synonym, concept_ancestor
cdm_from_con(con, cdm_tables = contains("era")) # tables that contain the substring 'era'
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: drug_era, dose_era, condition_era
cdm_from_con(con, cdm_tables = tbl_group("vocab")) # pre-defined groups
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: concept, vocabulary, domain, concept_class, concept_relationship, relationship, concept_synonym, concept_ancestor, source_to_concept_map, drug_strength
cdm_from_con(con, cdm_tables = c("person", "observation_period")) # character vector
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: person, observation_period
cdm_from_con(con, cdm_tables = c(person, observation_period)) # bare names
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: person, observation_period
cdm_from_con(con, cdm_tables = matches("person|period")) # regular expression
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: person, observation_period, payer_plan_period

# If you are using a variable to hold selections then use the `all_of` selection helper
tables_to_include <- c("person", "observation_period")
cdm_from_con(con, cdm_tables = all_of(tables_to_include))
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: person, observation_period

tbl_group supports several subsets of the CDM: “all”, “clinical”, “vocab”, “derived”, and “default”.

The default set of CDM tables included in a CDM object is:

tbl_group("default")
#>  [1] "person"               "observation_period"   "visit_occurrence"    
#>  [4] "visit_detail"         "condition_occurrence" "drug_exposure"       
#>  [7] "procedure_occurrence" "measurement"          "observation"         
#> [10] "death"                "location"             "care_site"           
#> [13] "provider"             "drug_era"             "dose_era"            
#> [16] "condition_era"        "cdm_source"           "concept"             
#> [19] "vocabulary"           "concept_relationship" "concept_synonym"     
#> [22] "concept_ancestor"     "drug_strength"

Include cohort tables

It is common to use one or more cohort tables along with the CDM. A cohort table has the following structure and can be created by CDMConnector or SQL or another package.

Creation of cohort tables is outside of the scope of the CDMConnector package. Cohort tables need to be a separate schema from the CDM tables where the user has both read and write access. Once the cohort table is created in the database it can be added to the cdm object as follows.

listTables(con, schema = "write_schema")
#> [1] "cohort"

cdm <- cdm_from_con(con, 
                    cdm_tables = c("person", "observation_period"), 
                    write_schema = "write_schema",
                    cohort_tables = "cohort") 

cdm$cohort
#> # Source:   table<write_schema.cohort> [2 x 4]
#> # Database: DuckDB 0.6.1 [root@Darwin 21.6.0:R 4.2.2//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpnQ23cR/dwtudgsr]
#>   cohort_id subject_id cohort_start_date cohort_end_date
#>       <int>      <int> <date>            <date>         
#> 1         1          1 2023-01-23        2023-01-23     
#> 2         1          2 2020-02-03        2020-11-04

Extracting data

There are two ways to extract subsets of the CDM.

local_cdm <- cdm %>% 
  collect()

# The cdm tables are now dataframes
local_cdm$person[1:4, 1:4] 
#> # A tibble: 4 × 4
#>   person_id gender_concept_id year_of_birth month_of_birth
#>       <dbl>             <dbl>         <dbl>          <dbl>
#> 1         6              8532          1963             12
#> 2       123              8507          1950              4
#> 3       129              8507          1974             10
#> 4        16              8532          1971             10
save_path <- file.path(tempdir(), "tmp")
dir.create(save_path)

cdm %>% 
  stow(path = save_path)

list.files(save_path)
#> [1] "cohort.parquet"             "observation_period.parquet"
#> [3] "person.parquet"

Create a CDM reference from files

stow saves the cdm object as a set of files. cdm_from_files read the files back into R as a cdm_reference object. The tables can be stored as R dataframes or Arrow Tables. In both cases cdm tables can be manipulated with dplyr verbs.

cdm <- cdm_from_files(save_path)

class(cdm$cohort)
#> [1] "tbl_df"     "tbl"        "data.frame"

cdm$cohort %>% 
  tally() %>% 
  pull(n)
#> [1] 2

cdm <- cdm_from_files(save_path, 
                      as_data_frame = FALSE)

class(cdm$cohort)
#> [1] "Table"        "ArrowTabular" "ArrowObject"  "R6"

cdm$cohort %>% 
  nrow()
#> [1] 2

Closing connections

Close the database connection with dbDisconnect. After the connection is closed the cdm object can no longer be used.

DBI::dbDisconnect(con, shutdown = TRUE)

Delaying connections

Sometimes you may need to delay the creation of a cdm connection or create and close many connections during execution. CDMConnector provides the ability to store connection information that can be passed to dbConnect to create a connection. The typical use case is to create a new connection inside a function and then close the connection before the function exits.

connection_details <- dbConnectDetails(duckdb::duckdb(), dbdir = eunomia_dir())

self_contained_query <- function(connection_details) {
  # create a new connection
  con <- DBI::dbConnect(connection_details)
  # close the connection before exiting 
  on.exit(DBI::dbDisconnect(con, shutdown = TRUE))
  # use the connection
  DBI::dbGetQuery(con, "select count(*) as n from main.person")
}

self_contained_query(connection_details)
#>      n
#> 1 2694

Programming with cdm objects

Since cdm object can include any subset of cdm tables it is important for functions that take cdm objects as input to check that the expected tables exists. The assert_tables function provides a checkmate style function that tests if the expected tables exist, have the correct columns, and (optionally) are not empty. It can also be used with checkmate assert collections.


library(checkmate)

con <- DBI::dbConnect(duckdb::duckdb(), dbdir = eunomia_dir())


assertTables(cdm_from_con(con, cdm_tables = "drug_era"), tables = c("person"))
#> Error in `assertTables()`:
#> ! - person table not found in cdm object

# add missing table error to collection
err <- checkmate::makeAssertCollection()
assertTables(cdm_from_con(con, cdm_tables = "drug_era"), tables = c("person"), add = err)
err$getMessages()
#> [1] "- person table not found in cdm object"

assert_tables can be used in functions that accept cdm_reference objects as a parameter.

countDrugsByGender <- function(cdm) {
  assertTables(cdm, tables = c("person", "drug_era"), empty.ok = FALSE)

  cdm$person %>%
    dplyr::inner_join(cdm$drug_era, by = "person_id") %>%
    dplyr::count(.data$gender_concept_id, .data$drug_concept_id) %>%
    dplyr::collect()
}

countDrugsByGender(cdm_from_con(con, cdm_tables = "person"))
#> Error in `assertTables()`:
#> ! - drug_era table not found in cdm object

DBI::dbExecute(con, "delete from drug_era")
#> [1] 52508
countDrugsByGender(cdm_from_con(con))
#> Error in `assertTables()`:
#> ! - drug_era cdm table is empty

DBI::dbDisconnect(con, shutdown = TRUE)

Code style

Camel case aliases are provided for many functions to fit both snake_case and camelCase programming styles.

Supported DBI drivers

The cdm_from_con function should work with any DBI driver backend implementation. However the package is tested using the following driver backends.