ympes provides a collection of functions for working with age intervals with underlying implementations that have been optimised for performance.
breaks_to_interval()
breaks_to_interval()
provides a categorisation based on specified breaks which represent left-hand interval limits. The resultant groupings span from the minimum break through to Inf
and will always be closed on the left and open on the right. Ages below the minimum break will be returned as NA. As an example, if breaks = c(0, 1, 10, 30)
the interval categories would be [0, 1), [1, 10), [10, 30) and [30, Inf). Intervals are returned as a data frame with three columns: a factor with a character representation of the interval, and two numeric columns giving the corresponding lower (closed) and upper (open) bounds.
library(ympes)
breaks_to_interval(breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#> interval lower_bound upper_bound
#> 1 [0, 1) 0 1
#> 2 [1, 5) 1 5
#> 3 [5, 15) 5 15
#> 4 [15, 25) 15 25
#> 5 [25, 45) 25 45
#> 6 [45, 65) 45 65
#> 7 [65, Inf) 65 Inf
cut_ages()
cut_ages()
provides categorisation of ages based on specified breaks which represent the left-hand interval limits. Categorisation is based on the breaks and follows the approach of breaks_to_interval()
.
cut_ages(ages = 0:9, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#> interval lower_bound upper_bound
#> 1 [0, 1) 0 1
#> 2 [1, 5) 1 5
#> 3 [1, 5) 1 5
#> 4 [1, 5) 1 5
#> 5 [1, 5) 1 5
#> 6 [5, 15) 5 15
#> 7 [5, 15) 5 15
#> 8 [5, 15) 5 15
#> 9 [5, 15) 5 15
#> 10 [5, 15) 5 15
cut_ages(1:10, breaks = 6L)
#> interval lower_bound upper_bound
#> 1 <NA> NA NA
#> 2 <NA> NA NA
#> 3 <NA> NA NA
#> 4 <NA> NA NA
#> 5 <NA> NA NA
#> 6 [6, Inf) 6 Inf
#> 7 [6, Inf) 6 Inf
#> 8 [6, Inf) 6 Inf
#> 9 [6, Inf) 6 Inf
#> 10 [6, Inf) 6 Inf
x <- cut_ages(1:100, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
str(x)
#> 'data.frame': 100 obs. of 3 variables:
#> $ interval : Ord.factor w/ 7 levels "[0, 1)"<"[1, 5)"<..: 2 2 2 2 3 3 3 3 3 3 ...
#> $ lower_bound: num 1 1 1 1 5 5 5 5 5 5 ...
#> $ upper_bound: num 5 5 5 5 15 15 15 15 15 15 ...
head(x$interval)
#> [1] [1, 5) [1, 5) [1, 5) [1, 5) [5, 15) [5, 15)
#> 7 Levels: [0, 1) < [1, 5) < [5, 15) < [15, 25) < [25, 45) < ... < [65, Inf)
split_interval_counts()
split_interval_counts()
splits counts within an age interval into counts for individual years based on a given weighting. Age intervals are specified by their lower (closed) and upper (open) bounds, i.e. intervals of the form [lower, upper).
# by default counts are split equally across ages within intervals
split_interval_counts(
lower_bounds = c(0L, 5L, 10L),
upper_bounds = c(5L, 10L, 20L),
counts = c(5L, 10L, 30L)
)
#> age count
#> 1 0 1
#> 2 1 1
#> 3 2 1
#> 4 3 1
#> 5 4 1
#> 6 5 2
#> 7 6 2
#> 8 7 2
#> 9 8 2
#> 10 9 2
#> 11 10 3
#> 12 11 3
#> 13 12 3
#> 14 13 3
#> 15 14 3
#> 16 15 3
#> 17 16 3
#> 18 17 3
#> 19 18 3
#> 20 19 3
# Population weightings to apply for individual years can be specified by
# the weights argument. If these are specified, they must be of length
# `max_upper` and represent weights in the range 0:(max_upper - 1).
max_upper <- 20L
weights <- integer(max_upper)
weights[c(TRUE, FALSE)] <- 1L
split_interval_counts(
  lower_bounds = c(0L, 5L, 10L),
  upper_bounds = c(5L, 10L, 20L),
  counts = c(5L, 10L, 30L),
  max_upper = max_upper,
  weights = weights
)
#> age count
#> 1 0 1.666667
#> 2 1 0.000000
#> 3 2 1.666667
#> 4 3 0.000000
#> 5 4 1.666667
#> 6 5 0.000000
#> 7 6 5.000000
#> 8 7 0.000000
#> 9 8 5.000000
#> 10 9 0.000000
#> 11 10 6.000000
#> 12 11 0.000000
#> 13 12 6.000000
#> 14 13 0.000000
#> 15 14 6.000000
#> 16 15 0.000000
#> 17 16 6.000000
#> 18 17 0.000000
#> 19 18 6.000000
#> 20 19 0.000000
aggregate_age_counts()
aggregate_age_counts()
provides aggregation of counts across ages (in years). It is similar to a cut()
and tapply()
pattern but optimised for speed over flexibility. Groupings are the same as in cut_ages()
and counts will be provided across all natural numbers as well as for missing values.
# default ages generated as 0:(length(counts) - 1L) if only counts provided.
aggregate_age_counts(counts = 1:65, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#> interval lower_bound upper_bound count
#> 1 [0, 1) 0 1 1
#> 2 [1, 5) 1 5 14
#> 3 [5, 15) 5 15 105
#> 4 [15, 25) 15 25 205
#> 5 [25, 45) 25 45 710
#> 6 [45, 65) 45 65 1110
#> 7 [65, Inf) 65 Inf 0
#> 8 <NA> NA NA 0
# Values below the minimum break are counted as NA
aggregate_age_counts(counts = 1:65, breaks = 50L)
#> interval lower_bound upper_bound count
#> 1 [50, Inf) 50 Inf 870
#> 2 <NA> NA NA 1275
# NA ages are also handled with their own grouping
ages <- 1:65
ages[1:44] <- NA
aggregate_age_counts(
  counts = 1:65,
  ages = ages,
  breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L)
)
#> interval lower_bound upper_bound count
#> 1 [0, 1) 0 1 0
#> 2 [1, 5) 1 5 0
#> 3 [5, 15) 5 15 0
#> 4 [15, 25) 15 25 0
#> 5 [25, 45) 25 45 0
#> 6 [45, 65) 45 65 1090
#> 7 [65, Inf) 65 Inf 65
#> 8 <NA> NA NA 990
reaggregate_interval_counts()
reaggregate_interval_counts()
is equivalent to, but more efficient than, a call to split_interval_counts()
followed by aggregate_age_counts()
.
The example below shows how it can be used to redistribute counts across a desired set of age intervals. We use data included in the package that has been obtained from the 2021 census and modify this based on our desired interval limits.
# census data
data(pop_dat)
pop_dat
#> area_code area_name age_category value
#> 1 K04000001 England and Wales [0, 5) 3232100
#> 2 K04000001 England and Wales [5, 10) 3524600
#> 3 K04000001 England and Wales [10, 15) 3595900
#> 4 K04000001 England and Wales [15, 20) 3394700
#> 5 K04000001 England and Wales [20, 25) 3602100
#> 6 K04000001 England and Wales [25, 30) 3901800
#> 7 K04000001 England and Wales [30, 35) 4148800
#> 8 K04000001 England and Wales [35, 40) 3981600
#> 9 K04000001 England and Wales [40, 45) 3755700
#> 10 K04000001 England and Wales [45, 50) 3788700
#> 11 K04000001 England and Wales [50, 55) 4123400
#> 12 K04000001 England and Wales [55, 60) 4029000
#> 13 K04000001 England and Wales [60, 65) 3455700
#> 14 K04000001 England and Wales [65, 70) 2945100
#> 15 K04000001 England and Wales [70, 75) 2978000
#> 16 K04000001 England and Wales [75, 80) 2170300
#> 17 K04000001 England and Wales [80, 85) 1517000
#> 18 K04000001 England and Wales [85, 90) 925100
#> 19 K04000001 England and Wales [90, Inf) 527900
# each row is for the same region so discard for the moment
dat <- subset(pop_dat, select = c(age_category, value))

# extract upper and lower bounds
dat <- transform(
  dat,
  lower_bound = as.numeric(sub("\\[([0-9]+), .+)", "\\1", age_category)),
  upper_bound = as.numeric(sub(".+, (.+))", "\\1", age_category))
)
head(dat, n=10)
#> age_category value lower_bound upper_bound
#> 1 [0, 5) 3232100 0 5
#> 2 [5, 10) 3524600 5 10
#> 3 [10, 15) 3595900 10 15
#> 4 [15, 20) 3394700 15 20
#> 5 [20, 25) 3602100 20 25
#> 6 [25, 30) 3901800 25 30
#> 7 [30, 35) 4148800 30 35
#> 8 [35, 40) 3981600 35 40
#> 9 [40, 45) 3755700 40 45
#> 10 [45, 50) 3788700 45 50
# recategorise based on ages
with(
  dat,
  reaggregate_interval_counts(
    lower_bounds = lower_bound,
    upper_bounds = upper_bound,
    counts = value,
    breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L),
    max_upper = 100L,
    weights = NULL
  )
)
#> interval lower_bound upper_bound count
#> 1 [0, 1) 0 1 646420
#> 2 [1, 5) 1 5 2585680
#> 3 [5, 15) 5 15 7120500
#> 4 [15, 25) 15 25 6996800
#> 5 [25, 45) 25 45 15787900
#> 6 [45, 65) 45 65 15396800
#> 7 [65, Inf) 65 Inf 11063400
#> 8 <NA> NA NA 0