From dataset To RDF

Our datasets are defined in a way that their dimensions can be easily and unambiguously reduced to triples for RDF applications; they can be easily serialized to, or synchronized with, semantic web applications using the rdflib package¹.

Because our datasets conform to the tidy data concept, they can be reduced into long-form triples.


RDF	subject	predicate	object
JSON	object	property	value
spreadsheet	row id	column name	cell
data.frame	key	variable	measurement
data.frame	key	attribute	value

Dimension reductions of the dataset

Our datasets are tidy.

# Load the wide-format example sheet that ships with the dataset package.
example_df <- readxl::read_excel(
  path = system.file("extdata", "rdf_example.xlsx", package = "dataset"),
  sheet = "dataset-wide"
)

# Declare which columns act as dimensions, measures and attributes.
example_dataset <- dataset(
  x = example_df,
  Dimensions = c("time", "geo", "sex"),
  Measures = "value",
  Attributes = c("freq", "status")
)

# Record "rowid" as the local_id attribute of the dataset.
attr(example_dataset, "local_id") <- "rowid"

You can start to reduce the dimensions, for example, by uniting the dimensions. In this case, the row identifier increasingly becomes a uniform resource identifier (URI) that uniquely identifies the observation.

Eventually you can reduce the entire dataset into triples. The URI uniquely identifies the observation, the component preserves the main structural element of the W3C/SDMX data cube model, and the value field holds the value of the dimension, measurement or attribute.

# Add a URI column to example_dataset with dataset_uri(), then print the
# result; the original columns are kept next to the new URI column.
example_ds <- dataset_uri(ds = example_dataset) 
example_ds
#>                                                            URI rowid time geo
#> 1 https:://example.org/my_data/status=A_geo=NL_sex=F_time=2021     1 2021  NL
#> 2 https:://example.org/my_data/status=E_geo=BE_sex=F_time=2021     2 2021  BE
#> 3 https:://example.org/my_data/status=A_geo=NL_sex=M_time=2021     3 2021  NL
#> 4 https:://example.org/my_data/status=A_geo=BE_sex=M_time=2021     4 2021  BE
#> 5 https:://example.org/my_data/status=A_geo=NL_sex=F_time=2022     5 2022  NL
#> 6 https:://example.org/my_data/status=A_geo=BE_sex=F_time=2022     6 2022  BE
#> 7 https:://example.org/my_data/status=O_geo=NL_sex=M_time=2022     7 2022  NL
#> 8 https:://example.org/my_data/status=A_geo=BE_sex=M_time=2022     8 2022  BE
#>   sex value unit freq status
#> 1   F     9   NR    A      A
#> 2   F     8   NR    A      E
#> 3   M    10   NR    A      A
#> 4   M     7   NR    A      A
#> 5   F    10   NR    A      A
#> 6   F    11   NR    A      A
#> 7   M    NA   NR    A      O
#> 8   M    10   NR    A      A

# Keep only the observation URI and the measured value.
subset(example_ds, select = c("URI", "value"))
#>                                                            URI value
#> 1 https:://example.org/my_data/status=A_geo=NL_sex=F_time=2021     9
#> 2 https:://example.org/my_data/status=E_geo=BE_sex=F_time=2021     8
#> 3 https:://example.org/my_data/status=A_geo=NL_sex=M_time=2021    10
#> 4 https:://example.org/my_data/status=A_geo=BE_sex=M_time=2021     7
#> 5 https:://example.org/my_data/status=A_geo=NL_sex=F_time=2022    10
#> 6 https:://example.org/my_data/status=A_geo=BE_sex=F_time=2022    11
#> 7 https:://example.org/my_data/status=O_geo=NL_sex=M_time=2022    NA
#> 8 https:://example.org/my_data/status=A_geo=BE_sex=M_time=2022    10

library(rdflib)

# Temporary file (with an .nq extension) that receives the serialized triples.
nq_file <- file.path(tempdir(), "triple_file.nq")

# Start from an empty RDF store.
rdf <- rdf()

# Add one triple per observation: an empty subject, the observation URI as
# predicate, and the measured value as object. rdf_add() is called for its
# side effect on `rdf`; its return value is discarded.
for (i in seq_len(nrow(example_ds))) {
  rdf_add(
    rdf,
    subject = "",
    predicate = example_ds$URI[i],
    object = example_ds$value[i]
  )
}

# Write the store to disk ...
rdf_serialize(rdf, doc = nq_file)

# ... and read it back to show the triples that were stored.
rdf_parse(doc = nq_file)
#> Total of 8 triples, stored in hashes
#> -------------------------------
#> _:r1671048845r22774r5 <https:://example.org/my_data/status=A_geo=NL_sex=F_time=2022> "10"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r3 <https:://example.org/my_data/status=A_geo=NL_sex=M_time=2021> "10"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r9 <https:://example.org/my_data/status=A_geo=BE_sex=M_time=2022> "10"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r1 <https:://example.org/my_data/status=A_geo=NL_sex=F_time=2021> "9"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r2 <https:://example.org/my_data/status=E_geo=BE_sex=F_time=2021> "8"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r6 <https:://example.org/my_data/status=A_geo=BE_sex=F_time=2022> "11"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r4 <https:://example.org/my_data/status=A_geo=BE_sex=M_time=2021> "7"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r7 <https:://example.org/my_data/status=O_geo=NL_sex=M_time=2022> _:r1671048845r22774r8 .

library(dplyr)

# Reshape the dataset into long-form (URI, predicate, object) triples.
#
# Two long tables are stacked:
#   1. every dimension/attribute column of each row, and
#   2. the measured value of each row.
# All columns are converted to character first so that the heterogeneous
# columns can share a single `object` column.
#
# Columns are selected with strings / all_of() rather than `.data$col`:
# using `.data` inside select() and rename() is deprecated since
# tidyselect 1.2.0 and raised warnings here.
long_dataset <- bind_rows(
  # 1. dimensions and attributes
  example_dataset %>%
    select(-all_of("value")) %>%
    mutate(across(everything(), as.character)) %>%
    tidyr::pivot_longer(
      cols = any_of(c("geo", "sex", "time", "unit", "freq", "status")),
      names_to = "predicate",
      values_to = "object"
    ),
  # 2. the measurement itself
  example_dataset %>%
    select(all_of(c("rowid", "value"))) %>%
    mutate(across(everything(), as.character)) %>%
    tidyr::pivot_longer(
      cols = any_of("value"),
      names_to = "predicate",
      values_to = "object"
    )
) %>%
  # The row identifier becomes the subject URI of each triple.
  rename(URI = "rowid") %>%
  # NOTE(review): the "https:://" prefix (doubled colon) is kept unchanged so
  # these URIs stay consistent with the ones printed by dataset_uri() earlier
  # in this document -- confirm whether "https://" was intended.
  mutate(URI = paste0("https:://example.org/my_data/", .data$URI))

# Preview the first six triples.
head(long_dataset)
#> # A tibble: 6 × 3
#>   URI                            predicate object
#>   <chr>                          <chr>     <chr> 
#> 1 https:://example.org/my_data/1 geo       NL    
#> 2 https:://example.org/my_data/1 sex       F     
#> 3 https:://example.org/my_data/1 time      2021  
#> 4 https:://example.org/my_data/1 unit      NR    
#> 5 https:://example.org/my_data/1 freq      A     
#> 6 https:://example.org/my_data/1 status    A

# A second, empty RDF store for the long-form triples.
rdf2 <- rdf()

# Add every row of long_dataset as one subject-predicate-object triple.
# rdf_add() is called for its side effect on `rdf2`; its return value is
# discarded.
for (i in seq_len(nrow(long_dataset))) {
  rdf_add(
    rdf2,
    subject = long_dataset$URI[i],
    predicate = long_dataset$predicate[i],
    object = long_dataset$object[i]
  )
}

# Print the store to inspect the triples.
rdf2
#> Total of 56 triples, stored in hashes
#> -------------------------------
#> <https:://example.org/my_data/5> <status> "A" .
#> <https:://example.org/my_data/1> <status> "A" .
#> <https:://example.org/my_data/8> <geo> "BE" .
#> <https:://example.org/my_data/3> <unit> "NR" .
#> <https:://example.org/my_data/6> <unit> "NR" .
#> <https:://example.org/my_data/7> <sex> "M" .
#> <https:://example.org/my_data/3> <freq> "A" .
#> <https:://example.org/my_data/4> <status> "A" .
#> <https:://example.org/my_data/6> <time> "2022" .
#> <https:://example.org/my_data/2> <status> "E" .
#> 
#> ... with 46 more triples

From dataset To RDF

Dimension reductions of the dataset

The benefit of standard codelists