A small package for joint calibration of totals and quantiles (see the Beręsewicz and Szymkowiak (2023) working paper for details). The package combines approaches from the survey calibration literature that allow weights to be calibrated jointly to known (or estimated) totals and quantiles. As a backend for calibration, the sampling (sampling::calib), laeken (laeken::calibWeights), survey (survey::grake) or ebal (ebal::eb) package can be used. One can also apply empirical likelihood using code from Wu (2005) with the support of stats::constrOptim, as used in Zhang, Han and Wu (2022).
backend | method | function called |
---|---|---|
sampling | c("raking", "linear", "logit", "truncated") | sampling::calib |
laeken | c("raking", "linear", "logit") | laeken::calibWeights |
survey | c("raking", "linear", "logit", "sinh") | survey::grake |
ebal | eb | ebal::eb |
base | el | R code and stats::constrOptim |
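For instance, switching between backends and methods only requires changing the method and backend arguments. A minimal sketch (df_resp, quants_known and totals_known are constructed in the examples below; raking with the sampling backend is one of the pairs from the table):

# sketch only: calibrate weights jointly to the total and deciles of x,
# using the raking distance with the sampling backend (see the table above)
result <- joint_calib(formula_totals = ~x,
                      formula_quantiles = ~x,
                      data = df_resp,
                      dweights = df_resp$d,
                      N = N,
                      pop_quantiles = quants_known,
                      pop_totals = totals_known,
                      method = "raking",
                      backend = "sampling")
summary(result$g)  # g-weights; final calibrated weights are result$g * df_resp$d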
Currently supported methods and backends are summarised in the table above. Further plans include generalized calibration via sampling::gencalib.

Work on this package is supported by the National Science Centre, OPUS 22 grant no. 2020/39/B/HS4/00941.
You can install the development version of jointCalib
from GitHub with:
# install.packages("remotes")
remotes::install_github("ncn-foreigners/jointCalib")
Load packages
library(jointCalib)
library(survey)
library(laeken)
library(ebal)
The simulated data are based on Haziza, D., and Lesage, É. (2016). A discussion of weighting procedures for unit nonresponse. Journal of Official Statistics, 32(1), 129-145.
set.seed(20230817)
N <- 1000
x <- runif(N, 0, 80)
#y <- 1000+10*x+rnorm(N, 0, 300)
#y <- exp(-0.1 + 0.1*x) + rnorm(N, 0, 300)
#y <- rbinom(N, 1, prob = 1/(exp(-0.5*(x-55))+1))
y <- 1300-(x-40)^2 + rnorm(N, 0, 300)
#p <- rbinom(N, 1, prob = 0.2+0.6*(1 + exp(-5 + x/8))^-1)
p <- rbinom(N, 1, prob = 0.07+0.45*(x/40-1)^2+0.0025*x)
#p <- rbinom(N, 1, prob = (1.2+0.024*x)^-1)
#p <- rbinom(N, 1, prob = exp(-0.2 - 0.014*x))
probs <- seq(0.1, 0.9, 0.1)
quants_known <- list(x = quantile(x, probs))
totals_known <- c(x = sum(x))
df <- data.frame(x, y, p)
df_resp <- df[df$p == 1, ]
df_resp$d <- N/nrow(df_resp)
y_quant_true <- quantile(y, probs)
head(df_resp)
#> x y p d
#> 6 12.35687 444.9053 1 3.134796
#> 7 61.90319 403.9473 1 3.134796
#> 13 60.96079 923.4975 1 3.134796
#> 14 76.85300 124.4110 1 3.134796
#> 18 71.52828 422.0934 1 3.134796
#> 19 65.32808 740.4801 1 3.134796
jointCalib package

example 1a: calibrate only quantiles (deciles)
result1 <- joint_calib(formula_quantiles = ~x,
                       data = df_resp,
                       dweights = df_resp$d,
                       N = N,
                       pop_quantiles = quants_known,
                       method = "linear",
                       backend = "sampling")
result1
#> Weights calibrated using: linear with sampling backend.
#> Summary statistics for g-weights:
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4562 0.6773 0.8180 1.0000 1.4500 2.4538
#> Totals and precision (abs diff: 4.574665e-10)
#> totals precision
#> [1,] 1e+03 4.557705e-10
#> [2,] 1e-01 5.020984e-14
#> [3,] 2e-01 1.055545e-13
#> [4,] 3e-01 1.324496e-13
#> [5,] 4e-01 1.580958e-13
#> [6,] 5e-01 1.765255e-13
#> [7,] 6e-01 1.991740e-13
#> [8,] 7e-01 2.304823e-13
#> [9,] 8e-01 2.882139e-13
#> [10,] 9e-01 3.552714e-13
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result1$g*df_resp$d, probs))
#> true est
#> 10% 7.078067 7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 23.287657
#> 40% 31.641911 31.802986
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.062629
#> 90% 71.565441 71.567274
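The first row of the totals and precision output corresponds to the population size N, so the calibrated weights should also sum to N. A quick check:

# sanity check: calibrated weights reproduce the population size N
sum(result1$g * df_resp$d)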
example 1b: calibrate quantiles (deciles) and totals
result2 <- joint_calib(formula_totals = ~x,
                       formula_quantiles = ~x,
                       data = df_resp,
                       dweights = df_resp$d,
                       N = N,
                       pop_quantiles = quants_known,
                       pop_totals = totals_known,
                       method = "linear",
                       backend = "sampling")
summary(result2$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.3563 0.6208 0.8199 1.0000 1.4368 2.5384
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs))
#> true est
#> 10% 7.078067 7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 23.287657
#> 40% 31.641911 31.802986
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.062629
#> 90% 71.565441 71.567274
We can restrict the weights to a specific range using the logit method.
result3 <- joint_calib(formula_totals = ~x,
                       formula_quantiles = ~x,
                       data = df_resp,
                       dweights = df_resp$d,
                       N = N,
                       pop_quantiles = quants_known,
                       pop_totals = totals_known,
                       method = "logit",
                       backend = "sampling",
                       maxit = 500,
                       bounds = c(0, 3))
summary(result3$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.3911 0.6254 0.8140 1.0000 1.4325 2.5186
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result3$g*df_resp$d, probs))
#> true est
#> 10% 7.078067 7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 22.872078
#> 40% 31.641911 31.297922
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.529683
#> 90% 71.565441 71.567274
The empirical likelihood method can be applied using the following code:
result4a <- joint_calib(formula_quantiles = ~ x,
                        data = df_resp,
                        dweights = df_resp$d,
                        N = N,
                        pop_quantiles = quants_known,
                        method = "el",
                        backend = "base")
summary(result4a$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4563 0.6773 0.8180 1.0000 1.4500 2.4538
result4b <- joint_calib(formula_totals = ~ x,
                        formula_quantiles = ~ x,
                        data = df_resp,
                        dweights = df_resp$d,
                        N = N,
                        pop_quantiles = quants_known,
                        pop_totals = totals_known,
                        method = "el",
                        backend = "base")
summary(result4b$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4414 0.6584 0.8109 1.0000 1.4256 2.8513
result4c <- calib_el(X = result4b$Xs[, c(1, 11)],
                     d = df_resp$d,
                     totals = c(N, totals_known),
                     maxit = 50,
                     tol = 1e-8)
summary(result4c)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.7438 0.7910 0.8848 1.0000 1.2455 1.4923
data.frame(true = quants_known$x,
est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs),
est_el0 = weightedQuantile(df_resp$x, result4c*df_resp$d, probs),
est_el1 = weightedQuantile(df_resp$x, result4a$g*df_resp$d, probs),
est_el2 = weightedQuantile(df_resp$x, result4b$g*df_resp$d, probs))
#> true est est_el0 est_el1 est_el2
#> 10% 7.078067 7.085003 3.640246 7.085003 7.085003
#> 20% 14.831221 14.824424 8.024948 14.824424 14.824424
#> 30% 23.146180 23.287657 14.499018 23.287657 23.287657
#> 40% 31.641911 31.802986 24.010501 31.802986 31.802986
#> 50% 39.033812 39.154276 39.154276 38.892342 39.154276
#> 60% 47.527168 48.252065 54.685438 48.252065 48.252065
#> 70% 54.984229 55.311953 63.084405 54.954054 54.954054
#> 80% 64.073167 64.062629 70.050593 64.062629 64.062629
#> 90% 71.565441 71.567274 75.663078 71.567274 71.567274
The entropy balancing method can be applied using the following code:
result5 <- joint_calib(formula_quantiles = ~ x,
                       data = df_resp,
                       dweights = df_resp$d,
                       N = N,
                       pop_quantiles = quants_known,
                       method = "eb")
summary(result5$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4576 0.6775 0.8180 1.0003 1.4500 2.4538
Finally, we compare all methods with the true distribution of Y, where:

- est_y1 – weights calibrated to quantiles only,
- est_y2 – weights calibrated to quantiles and totals,
- est_y3 – weights calibrated to quantiles and totals using logit distance and range limitations,
- est_y4 – weights calibrated to means only using el,
- est_y5 – weights calibrated to quantiles only using el,
- est_y6 – weights calibrated to quantiles and totals using el,
- est_y7 – weights calibrated to quantiles and totals using eb.

results_y <- data.frame(true_y = y_quant_true,
                        est_y1 = weightedQuantile(df_resp$y, result1$g * df_resp$d, probs),
                        est_y2 = weightedQuantile(df_resp$y, result2$g * df_resp$d, probs),
                        est_y3 = weightedQuantile(df_resp$y, result3$g * df_resp$d, probs),
                        est_y4 = weightedQuantile(df_resp$y, result4c * df_resp$d, probs),
                        est_y5 = weightedQuantile(df_resp$y, result4a$g * df_resp$d, probs),
                        est_y6 = weightedQuantile(df_resp$y, result4b$g * df_resp$d, probs),
                        est_y7 = weightedQuantile(df_resp$y, result5$g * df_resp$d, probs))
results_y
#> true_y est_y1 est_y2 est_y3 est_y4 est_y5
#> 10% -56.97982 -36.18473 -36.18473 -36.18473 -149.00996 -36.18473
#> 20% 210.58301 228.75429 228.75429 228.75429 17.27107 228.75429
#> 30% 444.81652 413.22517 417.60155 413.22517 203.58302 413.22517
#> 40% 646.06099 622.82192 622.82192 622.82192 381.99560 622.82192
#> 50% 803.50842 789.89936 789.89936 789.89936 503.35849 789.89936
#> 60% 975.31803 925.84730 925.84730 925.84730 679.04530 925.84730
#> 70% 1123.44643 1023.75230 1023.75230 1023.75230 811.84389 1023.75230
#> 80% 1245.62645 1162.06504 1162.06504 1162.06504 1009.46703 1162.06504
#> 90% 1449.23819 1471.33301 1471.33301 1471.33301 1242.60357 1471.33301
#> est_y6 est_y7
#> 10% -36.18473 -36.18473
#> 20% 228.75429 228.75429
#> 30% 411.47088 413.22517
#> 40% 622.82192 622.82192
#> 50% 789.89936 789.89936
#> 60% 925.84730 925.84730
#> 70% 1023.75230 1023.75230
#> 80% 1162.06504 1162.06504
#> 90% 1471.33301 1471.33301
Here is the sum of squared differences from the true quantiles; calibration to both quantiles and totals (est_y2) performs best.
apply(results_y[, -1], 2, FUN = function(x) sum((x-results_y[,1])^2))
#> est_y1 est_y2 est_y3 est_y4 est_y5 est_y6 est_y7
#> 22342.87 22085.51 22342.87 547195.99 22342.87 22456.79 22342.87
survey package

example 1a: calibrate only quantiles (deciles)
A <- joint_calib_create_matrix(X_q = model.matrix(~x+0, df_resp), N = N, pop_quantiles = quants_known)
colnames(A) <- paste0("quant_", gsub("\\D", "", names(quants_known$x)))
A <- as.data.frame(A)
df_resp <- cbind(df_resp, A)
head(df_resp)
#> x y p d quant_10 quant_20 quant_30 quant_40 quant_50
#> 6 12.35687 444.9053 1 3.134796 0 0.001 0.001 0.001 0.001
#> 7 61.90319 403.9473 1 3.134796 0 0.000 0.000 0.000 0.000
#> 13 60.96079 923.4975 1 3.134796 0 0.000 0.000 0.000 0.000
#> 14 76.85300 124.4110 1 3.134796 0 0.000 0.000 0.000 0.000
#> 18 71.52828 422.0934 1 3.134796 0 0.000 0.000 0.000 0.000
#> 19 65.32808 740.4801 1 3.134796 0 0.000 0.000 0.000 0.000
#> quant_60 quant_70 quant_80 quant_90
#> 6 0.001 0.001 0.001 0.001
#> 7 0.000 0.000 0.001 0.001
#> 13 0.000 0.000 0.001 0.001
#> 14 0.000 0.000 0.000 0.000
#> 18 0.000 0.000 0.000 0.001
#> 19 0.000 0.000 0.000 0.001
m1 <- svydesign(ids = ~1, data = df_resp, weights = ~d)
quants_formula <- as.formula(paste("~", paste(colnames(A), collapse = "+")))
quants_formula
#> ~quant_10 + quant_20 + quant_30 + quant_40 + quant_50 + quant_60 +
#> quant_70 + quant_80 + quant_90
svytotal(quants_formula, m1)
#> total SE
#> quant_10 0.10972 0.0175
#> quant_20 0.23197 0.0237
#> quant_30 0.29154 0.0255
#> quant_40 0.34796 0.0267
#> quant_50 0.38871 0.0273
#> quant_60 0.43887 0.0278
#> quant_70 0.50784 0.0280
#> quant_80 0.63323 0.0270
#> quant_90 0.78100 0.0232
Calibration using calibrate
pop_totals <- c(N, probs)
names(pop_totals) <- c("(Intercept)", colnames(A))
m1_cal <- calibrate(m1, quants_formula, pop_totals)
svytotal(quants_formula, m1_cal)
#> total SE
#> quant_10 0.1 0
#> quant_20 0.2 0
#> quant_30 0.3 0
#> quant_40 0.4 0
#> quant_50 0.5 0
#> quant_60 0.6 0
#> quant_70 0.7 0
#> quant_80 0.8 0
#> quant_90 0.9 0
Calibration using grake (low level)
g1 <- grake(mm = as.matrix(cbind(1, A)),
            ww = df_resp$d,
            calfun = cal.linear,
            population = pop_totals,
            bounds = list(lower = -Inf, upper = Inf),
            epsilon = 1e-7,
            verbose = FALSE,
            maxit = 50,
            variance = NULL)
summary(as.numeric(g1))
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4562 0.6773 0.8180 1.0000 1.4500 2.4538
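Since this low-level call solves the same linear calibration problem as joint_calib with method = "linear" and backend = "sampling" in example 1a, the resulting g-weights should agree up to numerical tolerance. A quick check (assuming result1 is still in the workspace):

# grake and joint_calib (example 1a) target the same constraints with the
# same linear distance, so the g-weights should match
all.equal(as.numeric(g1), as.numeric(result1$g))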