mOMOP to OMOP Vocabularies • amphora

library(amphora)
library(mOMOP)
library(tidyverse)

In lieu of using Protégé, the mCODE value set classification developed in the mOMOP package is converted to the OMOP Vocabulary structure, specifically the class hierarchy represented in the Concept Relationship table. This is achieved by creating:
1. Forward “Subsumes” relationships between the class and concept.
1. Inverse “Is a” relationships between the concept and class.
1. Concepts for each class in a Concept table.

# Print the source mapping: one row per (class, concept) pair, as shown in the
# tibble output below.
MCODE_CLASS_HIERARCHY
#> # A tibble: 215,271 x 2
#>    class             concept                                                    
#>    <chr>             <chr>                                                      
#>  1 CANCER_BODY_LOCA… [V] [S] 4048384 Body structure [SNOMED 123037004] [Spec An…
#>  2 CANCER_BODY_LOCA… [V] [S] 4002852 Buccal embrasure [SNOMED 110326006] [Spec …
#>  3 CANCER_BODY_LOCA… [V] [S] 36717763 Skin structure of left lower eyelid [SNOM…
#>  4 CANCER_BODY_LOCA… [V] [S] 4230944 Adenofibrosis [SNOMED 89115006] [Observati…
#>  5 CANCER_BODY_LOCA… [V] [S] 42605189 Intervertebral foramen of eighteenth thor…
#>  6 CANCER_BODY_LOCA… [V] [S] 4097829 Transitional cell carcinoma [SNOMED 270900…
#>  7 CANCER_BODY_LOCA… [V] [S] 4265989 Structure of medial nuclei of hypothalamus…
#>  8 CANCER_BODY_LOCA… [V] [S] 4143459 Structure of subarachnoid space of brain […
#>  9 CANCER_BODY_LOCA… [V] [S] 4093877 Entire body of second thoracic vertebra [S…
#> 10 CANCER_BODY_LOCA… [V] [S] 4300995 Structure of suprarenal vein [SNOMED 77905…
#> # … with 215,261 more rows

The class field is converted into a human-readable format.

# Make the class labels human readable: underscores become spaces, then the
# result is title-cased (e.g. "CANCER_DISORDER" -> "Cancer Disorder").
MCODE_CLASS_HIERARCHY2 <-
  MCODE_CLASS_HIERARCHY %>%
  mutate(
    class = stringr::str_to_title(stringr::str_replace_all(class, "_", " "))
  )
# List the distinct class labels in order of first appearance.
MCODE_CLASS_HIERARCHY2 %>%
  distinct(class) %>%
  pull(class)
#>  [1] "Cancer Body Location"            "Cancer Disease Status Evidence "
#>  [3] "Cancer Disorder"                 "Cancer Related Surgical Procedu"
#>  [5] "Cancer Staging"                  "Cancer Staging System"          
#>  [7] "Comorbid Condition"              "Condition Status Trend"         
#>  [9] "Genomics"                        "Histology Morphology Behavior"  
#> [11] "Laterality"                      "Primary Or Uncertain Behavior C"
#> [13] "Radiation Procedure"             "Radiation Target Body Site"     
#> [15] "Secondary Cancer Disorder"       "Specimen"                       
#> [17] "Treatment Intent"                "Treatment Termination Reason"   
#> [19] "Tumor Marker Measurement"        "Units Of Measurement"           
#> [21] "Yes No Unknown"

# Build one Concept row per mCODE class.
#
# Each distinct class label becomes a classification concept ("C") in the
# "mOMOP" vocabulary. `concept_id` is a freshly generated UUID (character, not
# integer), so it differs on every run. `concept_code` is a sequential code
# unique to each class; previously every row received the same value (the
# total number of classes), which made the codes useless as identifiers.
CONCEPT_A <-
  MCODE_CLASS_HIERARCHY2 %>%
  distinct(class) %>%
  mutate(
    # One UUID per distinct class; n() is the row count after distinct().
    concept_id = uuid::UUIDgenerate(n = n()),
    # Sequential integer code, 1..n, in order of first appearance.
    concept_code = row_number()
  ) %>%
  transmute(
    concept_id,
    concept_name = class,
    domain_id = "Cancer Modifier",
    vocabulary_id = "mOMOP",
    concept_class_id = "Class",
    standard_concept = "C",
    concept_code,
    valid_start_date = Sys.Date(),
    # No end date is assigned; left as a typed NA so the column stays a Date.
    valid_end_date = as.Date(NA_character_),
    invalid_reason = NA_character_
  )
rMarkedDown::print_dt(CONCEPT_A)

# Forward ("Subsumes") relationships from each class concept to its members.
#
# The class concepts are joined back to the hierarchy on the class label, the
# merged `concept` strip is unpacked into its component columns with
# chariot::unmerge_strip(), and the member's concept_id becomes concept_id_2.
# When a strip could not be unmerged (see the warning emitted by
# unmerge_strip), the raw `concept` string is used as a fallback.
CONCEPT_RELATIONSHIP <-
  CONCEPT_A %>%
  select(concept_id_1 = concept_id, class = concept_name) %>%
  inner_join(MCODE_CLASS_HIERARCHY2, by = "class") %>%
  chariot::unmerge_strip(strip_col = concept, remove = FALSE) %>%
  mutate(
    concept_id_2 = coalesce(concept_id, concept),
    relationship_id = "Subsumes",
    valid_start_date = Sys.Date(),
    valid_end_date = as.Date(NA_character_),
    invalid_reason = NA_character_
  ) %>%
  select(
    concept_id_1,
    relationship_id,
    concept_id_2,
    valid_start_date,
    valid_end_date,
    invalid_reason
  )
#> Warning in chariot::unmerge_strip(., strip_col = concept, remove = FALSE): Not
#> all concepts unmerged: 24. See flagUnmergeStrip object.

# Inverse ("Is a") relationships: each member concept points back to its class.
#
# The two id columns are swapped relative to CONCEPT_RELATIONSHIP. The swap is
# done under temporary names and renamed afterwards because transmute()
# evaluates its arguments sequentially: writing
# `concept_id_1 = concept_id_2, concept_id_2 = concept_id_1` directly would
# make the second expression read the already-overwritten concept_id_1, leaving
# both columns equal to the original concept_id_2.
CONCEPT_RELATIONSHIP_INVERSE <-
  CONCEPT_RELATIONSHIP %>%
  transmute(
    inverse_id_1 = concept_id_2,
    relationship_id = "Is a",
    inverse_id_2 = concept_id_1,
    valid_start_date,
    valid_end_date,
    invalid_reason
  ) %>%
  rename(
    concept_id_1 = inverse_id_1,
    concept_id_2 = inverse_id_2
  )

# Look up the full Concept rows for every member concept (concept_id_2).
#
# concept_id_2 is cast to integer before the lookup; values that are not
# numeric (the strips that could not be unmerged) become NA, which is the
# source of the coercion warnings in the log below.
# NOTE(review): per the logged SQL this is a LEFT JOIN, so rows with an NA
# concept_id_2 presumably come back with empty concept columns, and a concept
# belonging to several classes would appear more than once — confirm whether
# CONCEPT_B should be filtered/de-duplicated.
conn <- chariot::connectAthena()
#> Connecting using PostgreSQL driver
CONCEPT_B <-
  CONCEPT_RELATIONSHIP %>%
  mutate(concept_id_2 = as.integer(concept_id_2)) %>%
  chariot::join_on_concept_id(
    data = .,
    column = "concept_id_2",
    conn = conn
  ) %>%
  # Keep only the columns contributed by the Concept table.
  select(concept_id:last_col())
#> [2021-01-08 22:37:57]    Target column: concept_id_2
#> [2021-01-08 22:37:57]    Dropping patelm9.V20210108223757...
#> [2021-01-08 22:37:57]    
#> ✓ Open connection
#> [2021-01-08 22:37:57]    
#> ✓ JDBC connection
#> [2021-01-08 22:37:57]    SQL: DROP TABLE IF EXISTS patelm9.V20210108223757;
#> [2021-01-08 22:37:57]    Sending...
#> [2021-01-08 22:37:57]    Sending...complete
#> [2021-01-08 22:37:57]    Dropping patelm9.V20210108223757...complete
#> Warning: Problem with `mutate()` input `concept_id_2`.
#> ℹ NAs introduced by coercion
#> ℹ Input `concept_id_2` is `.Primitive("as.integer")(concept_id_2)`.
#> Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
#> [2021-01-08 22:37:57]    
#> ✓ Data 'data' has more than 0 rows
#> [2021-01-08 22:37:57]    
#> ✓ Table name 'V20210108223757' is not a reserved word
#> [2021-01-08 22:37:57]    
#> ✓ Field name 'concept_id_1' is not a reserved word
#> [2021-01-08 22:37:57]    
#> ✓ Field name 'relationship_id' is not a reserved word
#> [2021-01-08 22:37:57]    
#> ✓ Field name 'concept_id_2' is not a reserved word
#> [2021-01-08 22:37:57]    
#> ✓ Field name 'valid_start_date' is not a reserved word
#> [2021-01-08 22:37:57]    
#> ✓ Field name 'valid_end_date' is not a reserved word
#> [2021-01-08 22:37:57]    
#> ✓ Field name 'invalid_reason' is not a reserved word
#> [2021-01-08 22:37:57]    Writing patelm9.V20210108223757...
#> Warning in max(nchar(as.character(obj)), na.rm = TRUE): no non-missing arguments
#> to max; returning -Inf
#> [2021-01-08 22:38:03]    Writing patelm9.V20210108223757...complete
#> [2021-01-08 22:38:03]    
#> ✓ Open connection
#> [2021-01-08 22:38:03]    
#> ✓ JDBC connection
#> [2021-01-08 22:38:03]    SQL: SELECT a.*, b.* FROM patelm9.V20210108223757 a LEFT JOIN omop_vocabulary.concept b ON a.concept_id_2 = b.concept_id WHERE b.invalid_reason IS NULL
#> [2021-01-08 22:38:03]    Querying...
#> [2021-01-08 22:38:11]    Querying...complete
#> [2021-01-08 22:38:11]    
#> ✓ Returned data has more than 0 rows
#> [2021-01-08 22:38:11]    Dropping patelm9.V20210108223757...
#> [2021-01-08 22:38:11]    
#> ✓ Open connection
#> [2021-01-08 22:38:11]    
#> ✓ JDBC connection
#> [2021-01-08 22:38:11]    SQL: DROP TABLE IF EXISTS patelm9.V20210108223757;
#> [2021-01-08 22:38:11]    Sending...
#> [2021-01-08 22:38:11]    Sending...complete
#> [2021-01-08 22:38:11]    Dropping patelm9.V20210108223757...complete
# Disconnect from Athena now that the lookup is done (the logged output below
# reports the connection was already closed at this point).
chariot::dcAthena()
#> [2021-01-08 22:38:11]    Postgres connection was already closed

Both subsets of the Concept data are converted to character, including concept_id, because the UUID identifiers created are not integers.

# Stack the class concepts and the member concepts into one Concept table.
# Every column is cast to character first so the UUID-based concept_id values
# in CONCEPT_A can be combined with the values in CONCEPT_B.
CONCEPT <- bind_rows(
  mutate(CONCEPT_A, across(everything(), as.character)),
  mutate(CONCEPT_B, across(everything(), as.character))
)

# Append the inverse ("Is a") rows beneath the forward ("Subsumes") rows.
# Note that this overwrites CONCEPT_RELATIONSHIP, so re-running only this
# statement would append the inverse rows again.
CONCEPT_RELATIONSHIP <-
  CONCEPT_RELATIONSHIP %>%
  bind_rows(CONCEPT_RELATIONSHIP_INVERSE)

The dataset is written to data-raw/ if it does not already exist.

# Bundle both tables into a named list.
MOMOP_TABLES <- list(
  CONCEPT = CONCEPT,
  CONCEPT_RELATIONSHIP = CONCEPT_RELATIONSHIP
)

# Write the workbook only on the first run; an existing file is never
# overwritten.
file <- file.path("data-raw", "MOMOP_TABLES.xlsx")
if (!file.exists(file)) {
  broca::write_full_excel(
    x = MOMOP_TABLES,
    file = file
  )
}