Introduction
In this vignette we will explain how some functions of the package
are used in order to estimate a contingency table. We will work on the
eusilc
dataset of the laeken
package. All the
functions presented in the following are explained in the proposed
manuscript by Raphaël Jauslin and Yves Tillé (2021)
<arXiv:2105.08379>.
Contingency table
We will estimate the contingency table when the factor variable which
represents the economic status pl030
is crossed with a
discretized version of the equivalized household income
eqIncome
. In order to discretize the equivalized income, we
calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable
and define the category as intervals between the values.
library(laeken)
library(sampling)
library(StratifiedSampling)
#> Le chargement a nécessité le package : Matrix
data("eusilc")

# remove units with missing values; N is the population size
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)
# Xm are the matching variables and id are the identities of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]

# dummy-code (disjunctive form) the categorical columns db040, rb090, pb220a
# and keep the numeric columns hsize and age as they are
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030
# categorical income obtained by splitting eqIncome at its percentiles
c_income <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which(eusilc$eqIncome > q[7])] <- "(90,100]"
# variables of interest
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)

# put same rownames so units can be matched by identity
rownames(Xm) <- rownames(Y) <- rownames(Z) <- id

# population contingency table of economic status by income category
YZ <- table(cbind(Y,Z))
addmargins(YZ)
#> c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 409 616 722 807 935 1025 648 5162
#> 2 189 181 205 184 165 154 82 1160
#> 3 137 90 72 75 59 52 33 518
#> 4 210 159 103 95 74 49 46 736
#> 5 470 462 492 477 459 435 351 3146
#> 6 57 25 28 30 17 11 10 178
#> 7 344 283 194 149 106 91 40 1207
#> Sum 1816 1816 1816 1817 1815 1817 1210 12107
Sampling schemes
Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The samples are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.
# sizes of the two samples
n1 <- 1000
n2 <- 500

# samples selected by simple random sampling without replacement
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)

# extract matching variables for each sample
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]

# extract variables of interest (Y observed in sample 1, Z in sample 2)
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)

# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]

# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2

# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)

# disjunctive (dummy-coded) form of the variables of interest
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))

Y1_dis <- Y_dis[s1 == 1,]
Z2_dis <- Z_dis[s2 == 1,]
Harmonization
Then the harmonization step must be performed. The
harmonize
function returns the harmonized weights. If by
chance the true population totals are known, it is possible to use these
instead of the estimate made within the function.
re <- harmonize(X1,d1,id1,X2,d2,id2)

# if we want to use the population totals to harmonize we can use
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))

w1 <- re$w1
w2 <- re$w2

# the harmonized weighted totals below coincide with the population totals
colSums(Xm)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w1*X1)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w2*X2)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
Optimal transport matching
The statistical matching is done by using the otmatch
function. The estimation of the contingency table is calculated by
extracting the id1
units (respectively id2
units) and by using the function tapply
with the correct
weights.
# Optimal transport matching between the two samples
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#> id1 id2 weight
#> 401 401 15202 6.136490
#> 401.1 401 242002 2.971381
#> 1 402 402 11.317454
#> 1702 1702 340402 10.847577
#> 2201 2201 57601 2.322798
#> 2201.1 2201 209801 3.892207
# attach the observed Y (resp. Z) value to each matched pair via id1 (resp. id2)
Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])

# estimated contingency table: sum the OT weights within each (y, z) cell
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)

# transform NA into 0 (empty cells)
YZ_ot[is.na(YZ_ot)] <- 0
# result
round(addmargins(YZ_ot),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 495.191 583.829 842.043 767.611 861.806 1002.948 422.448 4975.876
#> 2 105.242 183.647 227.369 163.413 205.471 117.297 151.151 1153.589
#> 3 106.657 62.830 80.567 75.092 44.895 84.938 48.424 503.404
#> 4 108.838 137.109 128.167 114.884 73.292 92.180 50.054 704.525
#> 5 473.056 534.986 655.753 257.049 511.679 495.770 320.423 3248.718
#> 6 62.570 58.126 12.043 23.961 19.634 21.324 18.577 216.235
#> 7 198.620 194.825 305.879 169.669 210.271 144.219 81.170 1304.654
#> Sum 1550.175 1755.352 2251.821 1571.680 1927.048 1958.676 1092.248 12107.000
Balanced sampling
As you can see from the previous section, the optimal transport
results generally do not have a one-to-one match, meaning that for every
unit in sample 1, we have more than one unit with weights not equal to 0
in sample 2. The bsmatch
function creates a one-to-one
match by selecting a balanced stratified sampling to obtain a data.frame
where each unit in sample 1 has only one imputed unit from sample 2.
# Balanced sampling: one-to-one match selected from the OT solution
BS <- bsmatch(object)
head(BS$object[,1:3])
#> id1 id2 weight
#> 401.1 401 242002 2.971381
#> 1 402 402 11.317454
#> 1702 1702 340402 10.847577
#> 2201 2201 57601 2.322798
#> 4001 4001 467701 8.061287
#> 4601 4601 593101 12.982194
# contingency table from the one-to-one match; weights are divided by the
# selection probabilities q returned by bsmatch
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 498.419 606.443 846.744 741.573 878.150 970.039 434.508 4975.876
#> 2 125.431 150.789 236.924 135.058 212.339 125.672 167.376 1153.589
#> 3 123.581 55.102 78.184 76.929 54.011 68.045 47.551 503.404
#> 4 98.483 122.698 120.994 127.346 69.124 105.372 60.507 704.525
#> 5 491.618 534.867 657.895 262.888 487.836 498.823 314.791 3248.718
#> 6 74.817 58.126 12.043 29.866 20.032 11.918 9.433 216.235
#> 7 160.873 213.759 313.441 172.285 234.037 163.451 46.808 1304.654
#> Sum 1573.221 1741.783 2266.225 1545.946 1955.531 1943.320 1080.974 12107.000
# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)

Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 489.842 620.543 834.466 767.894 831.532 998.271 433.327 4975.876
#> 2 101.669 174.548 212.352 185.152 211.122 112.894 155.852 1153.589
#> 3 134.490 55.102 78.184 76.929 54.011 57.136 47.551 503.404
#> 4 98.863 128.603 134.098 119.351 69.314 93.468 60.829 704.525
#> 5 445.615 534.071 659.128 252.378 555.277 489.505 312.745 3248.718
#> 6 62.154 58.126 12.043 20.249 20.032 34.198 9.433 216.235
#> 7 220.469 182.894 324.804 141.407 196.164 163.451 75.464 1304.654
#> Sum 1553.101 1753.887 2255.074 1563.360 1937.452 1948.924 1095.201 12107.000
Prediction
# split the weights by id1
q_l <- split(object$weight,f = object$id1)
# normalize within each id1 so the weights of a unit sum to 1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))

# predicted category probabilities of Z for each unit of sample 1
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]
#> [1,] 0.3262432 0.6737568 0 0.000000 0.0000000 0.000000 0
#> [2,] 0.0000000 0.0000000 1 0.000000 0.0000000 0.000000 0
#> [3,] 1.0000000 0.0000000 0 0.000000 0.0000000 0.000000 0
#> [4,] 0.0000000 0.0000000 0 0.264496 0.2922999 0.443204 0
#> [5,] 0.0000000 0.0000000 0 0.000000 1.0000000 0.000000 0
#> [6,] 0.0000000 0.0000000 0 0.000000 1.0000000 0.000000 0