BranchGLM Vignette

1 Fitting GLMs

1.1 Optimization methods

1.2 Initial values

1.3 Parallel computation

2 Families

2.1 Gaussian

# Loading in BranchGLM
library(BranchGLM)

# Fitting gaussian regression models for mtcars dataset
cars <- mtcars

## Identity link
BranchGLM(mpg ~ ., data = cars, family = "gaussian", link = "identity")
#> Results from gaussian regression with identity link function 
#> Using the formula mpg ~ .
#> 
#>             Estimate       SE       t p.values  
#> (Intercept) 12.30000 18.72000  0.6573  0.51810  
#> cyl         -0.11140  1.04500 -0.1066  0.91610  
#> disp         0.01334  0.01786  0.7468  0.46350  
#> hp          -0.02148  0.02177 -0.9868  0.33500  
#> drat         0.78710  1.63500  0.4813  0.63530  
#> wt          -3.71500  1.89400 -1.9610  0.06325 .
#> qsec         0.82100  0.73080  1.1230  0.27390  
#> vs           0.31780  2.10500  0.1510  0.88140  
#> am           2.52000  2.05700  1.2250  0.23400  
#> gear         0.65540  1.49300  0.4389  0.66520  
#> carb        -0.19940  0.82880 -0.2406  0.81220  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Dispersion parameter taken to be 7.0235
#> 32 observations used to fit model
#> (0 observations removed due to missingness)
#> 
#> Residual Deviance: 147.49 on 21 degrees of freedom
#> AIC: 166.19

2.2 Gamma

# Fitting gamma regression models for mtcars dataset
## Inverse link
GammaFit <- BranchGLM(mpg ~ ., data = cars, family = "gamma", link = "inverse")
GammaFit
#> Results from gamma regression with inverse link function 
#> Using the formula mpg ~ .
#> 
#>               Estimate         SE       t p.values  
#> (Intercept) -6.794e-02  3.085e-02 -2.2020  0.03898 *
#> cyl          1.760e-03  1.946e-03  0.9048  0.37590  
#> disp        -7.947e-06  3.515e-05 -0.2261  0.82330  
#> hp          -6.724e-05  4.050e-05 -1.6600  0.11170  
#> drat        -4.283e-04  2.728e-03 -0.1570  0.87670  
#> wt          -9.224e-03  3.360e-03 -2.7450  0.01214 *
#> qsec         1.739e-03  1.165e-03  1.4930  0.15040  
#> vs          -3.086e-04  3.322e-03 -0.0929  0.92690  
#> am           6.305e-04  3.441e-03  0.1832  0.85640  
#> gear         4.882e-03  2.577e-03  1.8940  0.07203 .
#> carb        -1.025e-03  1.481e-03 -0.6926  0.49610  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Dispersion parameter taken to be 0.0087
#> 32 observations used to fit model
#> (0 observations removed due to missingness)
#> 
#> Residual Deviance: 0.28 on 21 degrees of freedom
#> AIC: 152.3
#> Algorithm converged in 3 iterations using Fisher's scoring

## Log link
GammaFit <- BranchGLM(mpg ~ ., data = cars, family = "gamma", link = "log")
GammaFit
#> Results from gamma regression with log link function 
#> Using the formula mpg ~ .
#> 
#>               Estimate         SE       t  p.values    
#> (Intercept)  2.754e+00  6.934e-01  3.9720 0.0006942 ***
#> cyl          7.509e-03  3.871e-02  0.1940 0.8481000    
#> disp         6.521e-05  6.615e-04  0.0986 0.9224000    
#> hp          -8.898e-04  8.064e-04 -1.1030 0.2823000    
#> drat         2.366e-02  6.058e-02  0.3906 0.7000000    
#> wt          -1.655e-01  7.017e-02 -2.3590 0.0281100 *  
#> qsec         3.055e-02  2.707e-02  1.1290 0.2718000    
#> vs          -1.141e-03  7.796e-02 -0.0146 0.9885000    
#> am           5.421e-02  7.618e-02  0.7115 0.4846000    
#> gear         6.014e-02  5.531e-02  1.0870 0.2893000    
#> carb        -2.277e-02  3.070e-02 -0.7418 0.4664000    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Dispersion parameter taken to be 0.0096
#> 32 observations used to fit model
#> (0 observations removed due to missingness)
#> 
#> Residual Deviance: 0.31 on 21 degrees of freedom
#> AIC: 155.65
#> Algorithm converged in 2 iterations using Fisher's scoring

2.3 Poisson

# Fitting poisson regression models for warpbreaks dataset
warp <- warpbreaks

## Log link
BranchGLM(breaks ~ ., data = warp, family = "poisson", link = "log")
#> Results from poisson regression with log link function 
#> Using the formula breaks ~ .
#> 
#>             Estimate       SE      z  p.values    
#> (Intercept)  3.69200  0.04541 81.300 < 2.2e-16 ***
#> woolB       -0.20600  0.05157 -3.994 6.490e-05 ***
#> tensionM    -0.32130  0.06027 -5.332 9.729e-08 ***
#> tensionH    -0.51850  0.06396 -8.107 5.209e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Dispersion parameter taken to be 1
#> 54 observations used to fit model
#> (0 observations removed due to missingness)
#> 
#> Residual Deviance: 210.39 on 50 degrees of freedom
#> AIC: 493.06
#> Algorithm converged in 6 iterations using Fisher's scoring

2.4 Binomial

# Fitting binomial regression models for toothgrowth dataset
Data <- ToothGrowth

## Logit link
BranchGLM(supp ~ ., data = Data, family = "binomial", link = "logit")
#> Results from binomial regression with logit link function 
#> Using the formula supp ~ .
#> 
#>             Estimate      SE      z p.values   
#> (Intercept)   1.5380  0.7860  1.956 0.050440 . 
#> len          -0.2127  0.0728 -2.921 0.003484 **
#> dose          2.0890  0.8497  2.458 0.013970 * 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Dispersion parameter taken to be 1
#> 60 observations used to fit model
#> (0 observations removed due to missingness)
#> 
#> Residual Deviance: 72.33 on 57 degrees of freedom
#> AIC: 78.33
#> Algorithm converged in 4 iterations using Fisher's scoring

## Probit link
BranchGLM(supp ~ ., data = Data, family = "binomial", link = "probit")
#> Results from binomial regression with probit link function 
#> Using the formula supp ~ .
#> 
#>             Estimate       SE      z p.values   
#> (Intercept)  0.94020  0.46900  2.005 0.045000 * 
#> len         -0.13180  0.04195 -3.142 0.001678 **
#> dose         1.30800  0.49710  2.631 0.008502 **
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Dispersion parameter taken to be 1
#> 60 observations used to fit model
#> (0 observations removed due to missingness)
#> 
#> Residual Deviance: 72.19 on 57 degrees of freedom
#> AIC: 78.19
#> Algorithm converged in 5 iterations using Fisher's scoring

2.4.1 Functions for binomial GLMs

  • BranchGLM has some utility functions for binomial GLMs
    • Table() creates a confusion matrix based on the predicted classes and observed classes
    • ROC() creates an ROC curve which can be plotted with plot()
    • AUC() and Cindex() calculate the area under the ROC curve
    • MultipleROCCurves() allows for the plotting of multiple ROC curves on the same plot

2.4.1.1 Table

# Fitting logistic regression model for toothgrowth dataset
catFit <- BranchGLM(supp ~ ., data = Data, family = "binomial", link = "logit")

Table(catFit)
#> Confusion matrix:
#> ----------------------
#>             Predicted
#>              OJ   VC
#> 
#>          OJ  17   13
#> Observed
#>          VC  7    23
#> 
#> ----------------------
#> Measures:
#> ----------------------
#> Accuracy:  0.6667 
#> Sensitivity:  0.7667 
#> Specificity:  0.5667 
#> PPV:  0.6389

2.4.1.2 ROC

# Creating ROC curve
catROC <- ROC(catFit)

plot(catROC, main = "ROC Curve", col = "indianred")

2.4.1.3 Cindex/AUC

# Getting Cindex/AUC
Cindex(catFit)
#> [1] 0.7127778

AUC(catFit)
#> [1] 0.7127778

2.4.1.4 MultipleROCPlots

# Showing ROC plots for logit, probit, and cloglog
probitFit <- BranchGLM(supp ~ . ,data = Data, family = "binomial", 
                       link = "probit")

cloglogFit <- BranchGLM(supp ~ . ,data = Data, family = "binomial", 
                       link = "cloglog")

MultipleROCCurves(catROC, ROC(probitFit), ROC(cloglogFit), 
                  names = c("Logistic ROC", "Probit ROC", "Cloglog ROC"))

2.4.1.5 Using predictions

  • For each of the methods used in this section predicted probabilities and observed classes can also be supplied instead of the BranchGLM object.

preds <- predict(catFit)

Table(preds, Data$supp)
#> Confusion matrix:
#> ----------------------
#>             Predicted
#>              OJ   VC
#> 
#>          OJ  17   13
#> Observed
#>          VC  7    23
#> 
#> ----------------------
#> Measures:
#> ----------------------
#> Accuracy:  0.6667 
#> Sensitivity:  0.7667 
#> Specificity:  0.5667 
#> PPV:  0.6389

AUC(preds, Data$supp)
#> [1] 0.7127778

ROC(preds, Data$supp) |> plot(main = "ROC Curve", col = "deepskyblue")

3 Useful functions

# Predict method
predict(GammaFit)
#>  [1] 21.69333 21.15566 25.92103 20.27496 17.15124 19.83386 14.55730 22.26169
#>  [9] 23.89767 18.75674 19.10374 15.10160 16.07372 16.13726 12.20297 11.70443
#> [17] 11.60706 27.90334 30.14896 30.14451 23.41000 17.02230 17.63782 13.90043
#> [25] 16.06923 28.65170 26.61054 28.59460 17.89013 19.53099 14.11845 23.30350

# Accessing coefficients matrix
GammaFit$coefficients
#>                  Estimate           SE           t   p.values
#> (Intercept)  2.754211e+00 0.6933540659  3.97230023 0.00069416
#> cyl          7.509180e-03 0.0387101010  0.19398501 0.84805180
#> disp         6.521183e-05 0.0006614834  0.09858423 0.92240336
#> hp          -8.897889e-04 0.0008063589 -1.10346503 0.28231007
#> drat         2.366270e-02 0.0605780301  0.39061524 0.70001605
#> wt          -1.655137e-01 0.0701735211 -2.35863431 0.02811042
#> qsec         3.055119e-02 0.0270721947  1.12850802 0.27183333
#> vs          -1.140739e-03 0.0779559038 -0.01463313 0.98846300
#> am           5.420526e-02 0.0761831300  0.71151268 0.48459603
#> gear         6.013892e-02 0.0553138294  1.08723110 0.28925667
#> carb        -2.277255e-02 0.0306989241 -0.74180288 0.46642281