library(dataset)
Our datasets are defined in such a way that their dimensions can be easily and unambiguously reduced to triples for RDF applications; they can be easily serialized to, or synchronized with, semantic web applications with the rdflib package (see footnote 1).
Read more about how we adapted the datacube model of SDMX and the RDF Data Cube Vocabulary for datasets as R objects in the vignette article "The dataset S3 Class".
Because our datasets conform to the tidy data concept, they can be reduced into long-form triples.
RDF | subject | predicate | object |
JSON | object | property | value |
spreadsheet | row id | column name | cell |
data.frame | key | variable | measurement |
data.frame | key | attribute | value |
Table source: rdflib
Our datasets are tidy.
# Read the wide-format example table shipped with the dataset package
# (sheet "dataset-wide" of extdata/rdf_example.xlsx).
example_df <- readxl::read_excel(
  system.file("extdata", "rdf_example.xlsx", package = "dataset"),
  sheet = "dataset-wide"
)
# Build the dataset object, declaring which columns act as dimensions,
# measures and attributes.
example_dataset <- dataset(
  x = example_df,
  Dimensions = c("time", "geo", "sex"),
  Measures = "value",
  Attributes = c("freq", "status")
)
# Store "rowid" in the "local_id" attribute of the dataset.
attr(example_dataset, "local_id") <- "rowid"
You can start to reduce the dimensions, for example, by uniting them. In this case, the row identifier becomes more and more a unique resource identifier, i.e. a URI.
Eventually you can reduce the entire dataset into triples. The `uri` uniquely defines the observations, the `component` maintains the W3C/SDMX datacube model's main structural element, and the `value` field holds the value of the dimension, measurement or attribute.
# Derive the URI-keyed version of the dataset and print it; the printed
# result carries a URI column next to the original columns.
example_ds <- dataset_uri(ds = example_dataset)
example_ds
#> URI rowid time geo
#> 1 https:://example.org/my_data/status=A_geo=NL_sex=F_time=2021 1 2021 NL
#> 2 https:://example.org/my_data/status=E_geo=BE_sex=F_time=2021 2 2021 BE
#> 3 https:://example.org/my_data/status=A_geo=NL_sex=M_time=2021 3 2021 NL
#> 4 https:://example.org/my_data/status=A_geo=BE_sex=M_time=2021 4 2021 BE
#> 5 https:://example.org/my_data/status=A_geo=NL_sex=F_time=2022 5 2022 NL
#> 6 https:://example.org/my_data/status=A_geo=BE_sex=F_time=2022 6 2022 BE
#> 7 https:://example.org/my_data/status=O_geo=NL_sex=M_time=2022 7 2022 NL
#> 8 https:://example.org/my_data/status=A_geo=BE_sex=M_time=2022 8 2022 BE
#> sex value unit freq status
#> 1 F 9 NR A A
#> 2 F 8 NR A E
#> 3 M 10 NR A A
#> 4 M 7 NR A A
#> 5 F 10 NR A A
#> 6 F 11 NR A A
#> 7 M NA NR A O
#> 8 M 10 NR A A
# Keep only the URI and value columns of the URI-keyed dataset.
subset( example_ds, select = c("URI", "value"))
#> URI value
#> 1 https:://example.org/my_data/status=A_geo=NL_sex=F_time=2021 9
#> 2 https:://example.org/my_data/status=E_geo=BE_sex=F_time=2021 8
#> 3 https:://example.org/my_data/status=A_geo=NL_sex=M_time=2021 10
#> 4 https:://example.org/my_data/status=A_geo=BE_sex=M_time=2021 7
#> 5 https:://example.org/my_data/status=A_geo=NL_sex=F_time=2022 10
#> 6 https:://example.org/my_data/status=A_geo=BE_sex=F_time=2022 11
#> 7 https:://example.org/my_data/status=O_geo=NL_sex=M_time=2022 NA
#> 8 https:://example.org/my_data/status=A_geo=BE_sex=M_time=2022 10
# Temporary file that will receive the serialized triples.
nq_file <- file.path(tempdir(), "triple_file.nq")
library(rdflib)
# Initialise an empty rdf object.
rdf <- rdf()

# Add one triple per observation: the subject is left empty (""), the
# predicate is the observation's URI and the object is the measured value.
# rdf_add() is called directly (rdf as its first argument) so this chunk
# does not depend on `%>%` being attached at this point.
for (i in seq_len(nrow(example_ds))) {
  rdf_add(
    rdf,
    subject = "",
    predicate = example_ds$URI[i],
    object = example_ds$value[i]
  )
}

# Write the triples to the temporary file and read them back.
rdf_serialize(rdf, doc = nq_file)
rdf_parse(nq_file)
#> Total of 8 triples, stored in hashes
#> -------------------------------
#> _:r1671048845r22774r5 <https:://example.org/my_data/status=A_geo=NL_sex=F_time=2022> "10"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r3 <https:://example.org/my_data/status=A_geo=NL_sex=M_time=2021> "10"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r9 <https:://example.org/my_data/status=A_geo=BE_sex=M_time=2022> "10"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r1 <https:://example.org/my_data/status=A_geo=NL_sex=F_time=2021> "9"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r2 <https:://example.org/my_data/status=E_geo=BE_sex=F_time=2021> "8"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r6 <https:://example.org/my_data/status=A_geo=BE_sex=F_time=2022> "11"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r4 <https:://example.org/my_data/status=A_geo=BE_sex=M_time=2021> "7"^^<http://www.w3.org/2001/XMLSchema#decimal> .
#> _:r1671048845r22774r7 <https:://example.org/my_data/status=O_geo=NL_sex=M_time=2022> _:r1671048845r22774r8 .
library(dplyr)
# Reshape the dataset into long form with one (URI, predicate, object) row
# per cell: the dimension/attribute columns are pivoted first, then the
# measured value is pivoted separately and appended.
# NOTE(review): `.data$` inside select()/rename() is what triggers the
# tidyselect deprecation warnings printed after this chunk; it is kept so
# the code still matches that output. The "https:://" prefix (doubled
# colon) is likewise kept unchanged to match the printed URIs.
long_dataset <- example_dataset %>%
  select(-.data$value) %>%
  mutate_all(as.character) %>%
  tidyr::pivot_longer(
    cols = any_of(c("geo", "sex", "time", "unit", "freq", "status")),
    names_to = "predicate",
    values_to = "object"
  ) %>%
  bind_rows(
    example_dataset %>%
      select(all_of(c("rowid", "value"))) %>%
      mutate_all(as.character) %>%
      tidyr::pivot_longer(
        cols = any_of(c("value")),
        names_to = "predicate",
        values_to = "object"
      ) %>%
      mutate(object = as.character(object))
  ) %>%
  rename(URI = .data$rowid) %>%
  mutate(URI = paste0("https:://example.org/my_data/", .data$URI))
#> Warning: Use of .data in tidyselect expressions was deprecated in tidyselect 1.2.0.
#> ℹ Please use `"value"` instead of `.data$value`
#> Warning: Use of .data in tidyselect expressions was deprecated in tidyselect 1.2.0.
#> ℹ Please use `"rowid"` instead of `.data$rowid`
# Preview the first rows of the long-form table.
long_dataset %>% head()
#> # A tibble: 6 × 3
#> URI predicate object
#> <chr> <chr> <chr>
#> 1 https:://example.org/my_data/1 geo NL
#> 2 https:://example.org/my_data/1 sex F
#> 3 https:://example.org/my_data/1 time 2021
#> 4 https:://example.org/my_data/1 unit NR
#> 5 https:://example.org/my_data/1 freq A
#> 6 https:://example.org/my_data/1 status A
# Initialise a second, empty rdf object.
rdf2 <- rdf()

# Add every row of the long-form table as one subject/predicate/object
# triple. rdf_add() is called directly with rdf2 as its first argument.
for (i in seq_len(nrow(long_dataset))) {
  rdf_add(
    rdf2,
    subject = long_dataset$URI[i],
    predicate = long_dataset$predicate[i],
    object = long_dataset$object[i]
  )
}

rdf2
#> Total of 56 triples, stored in hashes
#> -------------------------------
#> <https:://example.org/my_data/5> <status> "A" .
#> <https:://example.org/my_data/1> <status> "A" .
#> <https:://example.org/my_data/8> <geo> "BE" .
#> <https:://example.org/my_data/3> <unit> "NR" .
#> <https:://example.org/my_data/6> <unit> "NR" .
#> <https:://example.org/my_data/7> <sex> "M" .
#> <https:://example.org/my_data/3> <freq> "A" .
#> <https:://example.org/my_data/4> <status> "A" .
#> <https:://example.org/my_data/6> <time> "2022" .
#> <https:://example.org/my_data/2> <status> "E" .
#>
#> ... with 46 more triples
In this example, except for the measurement of the observation, we used only variable names and codes that conform to SDMX attributes. The advantage of this approach is that it is very easy to increase the dimensions of the dataset, and to add human-readable labels, potentially in many natural languages.
# Fix the RNG seed so sample_frac() draws the same rows on every run.
set.seed(2022)
library(statcodelists)

# Join the statcodelists code lists (CL_SEX, CL_FREQ, CL_OBS_STATUS) onto
# the "sex", "freq" and "status" rows by code value, stack the three
# results, then show a random 30% of each component as a formatted table.
# NOTE(review): `example_long` is not defined in the visible part of this
# document; presumably it is a long-form table with `component` and `value`
# columns. Confirm where it is created.
example_long %>%
  filter(.data$component == "sex") %>%
  left_join(
    statcodelists::CL_SEX %>%
      rename(value = .data$id),
    by = "value"
  ) %>%
  bind_rows(
    example_long %>%
      filter(.data$component == "freq") %>%
      left_join(
        statcodelists::CL_FREQ %>%
          dplyr::rename(value = .data$id),
        by = "value"
      )
  ) %>%
  bind_rows(
    example_long %>%
      filter(.data$component == "status") %>%
      left_join(
        statcodelists::CL_OBS_STATUS %>%
          dplyr::rename(value = .data$id),
        by = "value"
      )
  ) %>%
  group_by(.data$component) %>%
  sample_frac(size = 0.3) %>%
  kableExtra::kbl() %>%
  kableExtra::kable_paper()
1. Carl Boettiger: A tidyverse lover's intro to RDF