Utilities for Working with Age Categories

ympes provides a collection of functions for working with age intervals with underlying implementations that have been optimised for performance.

`breaks_to_interval()`

breaks_to_interval() provides a categorisation based on specified breaks which represent left-hand interval limits. The resultant groupings span from the minimum break through to Inf and will always be closed on the left and open on the right. Ages below the minimum break will be returned as NA. As an example, if breaks = c(0, 1, 10, 30) the interval categories would be [0, 1), [1, 10), [10, 30) and [30, Inf). Intervals are returned as a data frame with 3 columns: a factor with a character representation of the interval, and two numeric columns giving the corresponding lower (closed) and upper (open) bounds.

library(ympes)
breaks_to_interval(breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#>    interval lower_bound upper_bound
#> 1    [0, 1)           0           1
#> 2    [1, 5)           1           5
#> 3   [5, 15)           5          15
#> 4  [15, 25)          15          25
#> 5  [25, 45)          25          45
#> 6  [45, 65)          45          65
#> 7 [65, Inf)          65         Inf

`cut_ages()`

cut_ages() provides categorisation of ages based on specified breaks which represent the left-hand interval limits. Categorisation follows the approach of breaks_to_interval(): intervals are closed on the left and open on the right, and ages below the minimum break are returned as NA.

cut_ages(ages = 0:9, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#>    interval lower_bound upper_bound
#> 1    [0, 1)           0           1
#> 2    [1, 5)           1           5
#> 3    [1, 5)           1           5
#> 4    [1, 5)           1           5
#> 5    [1, 5)           1           5
#> 6   [5, 15)           5          15
#> 7   [5, 15)           5          15
#> 8   [5, 15)           5          15
#> 9   [5, 15)           5          15
#> 10  [5, 15)           5          15
cut_ages(1:10, breaks = 6L)
#>    interval lower_bound upper_bound
#> 1      <NA>          NA          NA
#> 2      <NA>          NA          NA
#> 3      <NA>          NA          NA
#> 4      <NA>          NA          NA
#> 5      <NA>          NA          NA
#> 6  [6, Inf)           6         Inf
#> 7  [6, Inf)           6         Inf
#> 8  [6, Inf)           6         Inf
#> 9  [6, Inf)           6         Inf
#> 10 [6, Inf)           6         Inf
x <- cut_ages(1:100, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
str(x)
#> 'data.frame':    100 obs. of  3 variables:
#>  $ interval   : Ord.factor w/ 7 levels "[0, 1)"<"[1, 5)"<..: 2 2 2 2 3 3 3 3 3 3 ...
#>  $ lower_bound: num  1 1 1 1 5 5 5 5 5 5 ...
#>  $ upper_bound: num  5 5 5 5 15 15 15 15 15 15 ...
head(x$interval)
#> [1] [1, 5)  [1, 5)  [1, 5)  [1, 5)  [5, 15) [5, 15)
#> 7 Levels: [0, 1) < [1, 5) < [5, 15) < [15, 25) < [25, 45) < ... < [65, Inf)

`split_interval_counts()`

split_interval_counts() splits counts within an age interval into counts for individual years based on a given weighting. Age intervals are specified by their lower (closed) and upper (open) bounds, i.e. intervals of the form [lower, upper).

# by default counts are split equally across ages within intervals
split_interval_counts(
    lower_bounds = c(0L, 5L, 10L),
    upper_bounds = c(5L, 10L, 20L),
    counts = c(5L, 10L, 30L)
)
#>    age count
#> 1    0     1
#> 2    1     1
#> 3    2     1
#> 4    3     1
#> 5    4     1
#> 6    5     2
#> 7    6     2
#> 8    7     2
#> 9    8     2
#> 10   9     2
#> 11  10     3
#> 12  11     3
#> 13  12     3
#> 14  13     3
#> 15  14     3
#> 16  15     3
#> 17  16     3
#> 18  17     3
#> 19  18     3
#> 20  19     3

# Population weightings to apply for individual years can be specified by
# the weights argument. If these are specified, they must be of length
# `max_upper` and represent weights in the range 0:(max_upper - 1).
max_upper <- 20L
weights <- integer(max_upper)
weights[c(TRUE, FALSE)] <- 1L
split_interval_counts(
    lower_bounds = c(0L, 5L, 10L),
    upper_bounds = c(5L, 10L, 20L),
    counts = c(5L, 10L, 30L),
    max_upper = max_upper,
    weights = weights
)
#>    age    count
#> 1    0 1.666667
#> 2    1 0.000000
#> 3    2 1.666667
#> 4    3 0.000000
#> 5    4 1.666667
#> 6    5 0.000000
#> 7    6 5.000000
#> 8    7 0.000000
#> 9    8 5.000000
#> 10   9 0.000000
#> 11  10 6.000000
#> 12  11 0.000000
#> 13  12 6.000000
#> 14  13 0.000000
#> 15  14 6.000000
#> 16  15 0.000000
#> 17  16 6.000000
#> 18  17 0.000000
#> 19  18 6.000000
#> 20  19 0.000000

`aggregate_age_counts()`

aggregate_age_counts() provides aggregation of counts across ages (in years). It is similar to a cut() and tapply() pattern but optimised for speed over flexibility. Groupings are the same as in cut_ages() and counts will be provided across all natural numbers as well as for missing values.

# default ages generated as 0:(length(counts) - 1L) if only counts provided.
aggregate_age_counts(counts = 1:65, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#>    interval lower_bound upper_bound count
#> 1    [0, 1)           0           1     1
#> 2    [1, 5)           1           5    14
#> 3   [5, 15)           5          15   105
#> 4  [15, 25)          15          25   205
#> 5  [25, 45)          25          45   710
#> 6  [45, 65)          45          65  1110
#> 7 [65, Inf)          65         Inf     0
#> 8      <NA>          NA          NA     0

# Values below the minimum break are counted as NA
aggregate_age_counts(counts = 1:65, breaks = 50L)
#>    interval lower_bound upper_bound count
#> 1 [50, Inf)          50         Inf   870
#> 2      <NA>          NA          NA  1275

# NA ages are also handled with their own grouping
ages <- 1:65
ages[1:44] <- NA
aggregate_age_counts(
    counts = 1:65,
    ages = ages,
    breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L)
)
#>    interval lower_bound upper_bound count
#> 1    [0, 1)           0           1     0
#> 2    [1, 5)           1           5     0
#> 3   [5, 15)           5          15     0
#> 4  [15, 25)          15          25     0
#> 5  [25, 45)          25          45     0
#> 6  [45, 65)          45          65  1090
#> 7 [65, Inf)          65         Inf    65
#> 8      <NA>          NA          NA   990

`reaggregate_interval_counts()`

reaggregate_interval_counts() is equivalent to, but more efficient than, a call to split_interval_counts() followed by aggregate_age_counts().

The example below shows how it can be used to redistribute counts across a desired set of age intervals. We use data included in the package that has been obtained from the 2021 census and modify this based on our desired interval limits.

# census data
data(pop_dat)
pop_dat
#>    area_code         area_name age_category   value
#> 1  K04000001 England and Wales       [0, 5) 3232100
#> 2  K04000001 England and Wales      [5, 10) 3524600
#> 3  K04000001 England and Wales     [10, 15) 3595900
#> 4  K04000001 England and Wales     [15, 20) 3394700
#> 5  K04000001 England and Wales     [20, 25) 3602100
#> 6  K04000001 England and Wales     [25, 30) 3901800
#> 7  K04000001 England and Wales     [30, 35) 4148800
#> 8  K04000001 England and Wales     [35, 40) 3981600
#> 9  K04000001 England and Wales     [40, 45) 3755700
#> 10 K04000001 England and Wales     [45, 50) 3788700
#> 11 K04000001 England and Wales     [50, 55) 4123400
#> 12 K04000001 England and Wales     [55, 60) 4029000
#> 13 K04000001 England and Wales     [60, 65) 3455700
#> 14 K04000001 England and Wales     [65, 70) 2945100
#> 15 K04000001 England and Wales     [70, 75) 2978000
#> 16 K04000001 England and Wales     [75, 80) 2170300
#> 17 K04000001 England and Wales     [80, 85) 1517000
#> 18 K04000001 England and Wales     [85, 90)  925100
#> 19 K04000001 England and Wales    [90, Inf)  527900

# each row is for the same region so discard the area columns for the moment
dat <- subset(pop_dat, select = c(age_category, value))

# extract upper and lower bounds
dat <- transform(
    dat,
    lower_bound = as.numeric(sub("\\[([0-9]+), .+)", "\\1", age_category)),
    upper_bound = as.numeric(sub(".+, (.+))", "\\1", age_category))
)

head(dat, n=10)
#>    age_category   value lower_bound upper_bound
#> 1        [0, 5) 3232100           0           5
#> 2       [5, 10) 3524600           5          10
#> 3      [10, 15) 3595900          10          15
#> 4      [15, 20) 3394700          15          20
#> 5      [20, 25) 3602100          20          25
#> 6      [25, 30) 3901800          25          30
#> 7      [30, 35) 4148800          30          35
#> 8      [35, 40) 3981600          35          40
#> 9      [40, 45) 3755700          40          45
#> 10     [45, 50) 3788700          45          50

# recategorise based on ages
with(
    dat,
    reaggregate_interval_counts(
        lower_bounds = lower_bound,
        upper_bounds = upper_bound,
        counts = value,
        breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L),
        max_upper = 100L,
        weights = NULL
    )
)
#>    interval lower_bound upper_bound    count
#> 1    [0, 1)           0           1   646420
#> 2    [1, 5)           1           5  2585680
#> 3   [5, 15)           5          15  7120500
#> 4  [15, 25)          15          25  6996800
#> 5  [25, 45)          25          45 15787900
#> 6  [45, 65)          45          65 15396800
#> 7 [65, Inf)          65         Inf 11063400
#> 8      <NA>          NA          NA        0