Multivariate Covariance Generalized Linear Models

Example 1 - Count data
Example 2 - Continuous data with offset
Example 3 - Continuous positive data
Example 4 - Binomial data

To install the stable version of mcglm, use devtools::install_git(). For more information, visit mcglm/README.

library(devtools)
install_git("http://git.leg.ufpr.br/wbonat/mcglm.git")

library(mcglm)
packageVersion("mcglm")

## [1] '0.0.2'

Abstract

The mcglm package implements the multivariate covariance generalized linear models (McGLMs) proposed by Bonat and J\(\o\)rgensen (2016). In this introductory vignette we employed the mcglm package for fitting a set of generalized linear models and compare our results with the ones obtained by ordinary R functions like lm and glm.

Example 1 - Count data

Consider the count data obtained in Dobson (1990).

## Dobson (1990) Page 93: Randomized Controlled Trial :
counts <- c(18,17,15,20,10,20,25,13,12)
outcome <- gl(3,1,9)
treatment <- gl(3,3)
print(d.AD <- data.frame(treatment, outcome, counts))

##   treatment outcome counts
## 1         1       1     18
## 2         1       2     17
## 3         1       3     15
## 4         2       1     20
## 5         2       2     10
## 6         2       3     20
## 7         3       1     25
## 8         3       2     13
## 9         3       3     12

Ordinary analysis using Poisson model.

fit.glm <- glm(counts ~ outcome + treatment, family = poisson)

The orthodox quasi-Poisson model is obtained specifying the variance function as tweedie and fix the power parameter at \(1\). Since, we are dealing with independent data, the matrix linear predictor is composed of a diagonal matrix.

require(Matrix)
# Matrix linear predictor
Z0 <- Diagonal(dim(d.AD)[1],1)
fit.qglm <- mcglm(linear_pred = c(counts ~ outcome + treatment),
                  matrix_pred = list("resp1" = list(Z0)),
                  link = "log", variance = "tweedie", data = d.AD,
                  control_algorithm = list("verbose" = FALSE,
                                           "method" = "chaser",
                                           "tunning" = 0.8))

## Automatic initial values selected.

Comparing regression parameter estimates.

cbind("GLM" = coef(fit.glm),
      "McGLM" = coef(fit.qglm, type = "beta")$Estimates)

##                       GLM         McGLM
## (Intercept)  3.044522e+00  3.044522e+00
## outcome2    -4.542553e-01 -4.542553e-01
## outcome3    -2.929871e-01 -2.929871e-01
## treatment2   1.337909e-15  2.179525e-16
## treatment3   1.421085e-15  3.611095e-17

Example 2 - Continuous data with offset

Consider the example from Venables & Ripley (2002, p.189). The response variable is continuous for which we can assume the Gaussian distribution. In this example, we exemplify the use of the offset argument.

# Loading the data set
utils::data(anorexia, package = "MASS")

# GLM fit
anorex.1 <- glm(Postwt ~ Prewt + Treat + offset(Prewt),
               family = gaussian, data = anorexia)

# McGLM fit
Z0 <- Diagonal(dim(anorexia)[1],1)
fit.anorexia <- mcglm(linear_pred = c(Postwt ~ Prewt + Treat),
                      matrix_pred = list(list(Z0)),
                      link = "identity", variance = "constant",
                      offset = list(anorexia$Prewt),
                      power_fixed = TRUE, data = anorexia,
                      control_algorithm = list("correct" = FALSE))

## Automatic initial values selected.

Comparing parameter estimates and standard errors.

# Estimates
cbind("McGLM" = round(coef(fit.anorexia, type = "beta")$Estimates,5),
      "GLM" = round(coef(anorex.1),5))

##                McGLM      GLM
## (Intercept) 49.77111 49.77111
## Prewt       -0.56554 -0.56554
## TreatCont   -4.09707 -4.09707
## TreatFT      4.56306  4.56306

# Standard errors
cbind("McGLM" = sqrt(diag(vcov(fit.anorexia))),
      "GLM" = c(sqrt(diag(vcov(anorex.1))),NA))

##                  McGLM        GLM
## (Intercept) 13.0136721 13.3909581
## Prewt        0.1566411  0.1611824
## TreatCont    1.8401441  1.8934926
## TreatFT      2.0732299  2.1333359
##              6.4645109         NA

Example 3 - Continuous positive data

Consider the following data set from McCullagh & Nelder (1989, pp.300-2). It is an example of Gamma regression model.

clotting <- data.frame(
  u = c(5,10,15,20,30,40,60,80,100),
  lot1 = c(118,58,42,35,27,25,21,19,18),
  lot2 = c(69,35,26,21,18,16,13,12,12))

Analysis using generalized linear models glm function in R.

fit.lot1 <- glm(lot1 ~ log(u), data = clotting,
                family = Gamma(link = "inverse"))
fit.lot2 <- glm(lot2 ~ log(u), data = clotting,
                family = Gamma(link = "inverse"))

The code below exemplify how to use the control_initial argument.

list_initial = list()
list_initial$regression <- list(coef(fit.lot1))
list_initial$power <- list(c(2))
list_initial$tau <- list(summary(fit.lot1)$dispersion)
list_initial$rho = 0

The control_initial argument should be a named list with initial values for all parameters involved in the model. Note that, in this example we used the parameter estimates from the glm fit as initial values for the regression and dispersion parameters. The power parameter was fixed at \(p = 2\), since we want to fit Gamma regression models. In that case, we have only one response variable, but the initial value for correlation parameter \(\rho\) should be specified.

Z0 <- Diagonal(dim(clotting)[1], 1)
fit.lot1.mcglm <- mcglm(linear_pred = c(lot1 ~ log(u)),
                        matrix_pred = list(list(Z0)),
                        link = "inverse", variance = "tweedie",
                        data = clotting,
                        control_initial = list_initial)

Comparing parameter estimates and standard errors.

# Estimates
cbind("mcglm" = round(coef(fit.lot1.mcglm, type = "beta")$Estimates,5),
      "glm" = round(coef(fit.lot1),5))

##                mcglm      glm
## (Intercept) -0.01655 -0.01655
## log(u)       0.01534  0.01534

# Standard errors
cbind("mcglm" = sqrt(diag(vcov(fit.lot1.mcglm))),
      "glm" = c(sqrt(diag(vcov(fit.lot1))),NA))

##                    mcglm          glm
## (Intercept) 0.0007376046 0.0009275466
## log(u)      0.0003299837 0.0004149596
##             0.0008271059           NA

Initial values for the response variable lot2

list_initial$regression <- list("resp1" = coef(fit.lot2))
list_initial$tau <- list("resp1" = c(var(1/clotting$lot2)))

Note that, since the list_initial object already have all components required, we just modify the entries regression and tau.

fit.lot2.mcglm <- mcglm(linear_pred = c(lot2 ~ log(u)),
                        matrix_pred = list(list(Z0)),
                        link = "inverse", variance = "tweedie",
                        data = clotting,
                        control_initial = list_initial)

Comparing parameter estimates and standard errors.

# Estimates
cbind("mcglm" = round(coef(fit.lot2.mcglm, type = "beta")$Estimates,5),
      "glm" = round(coef(fit.lot2),5))

##                mcglm      glm
## (Intercept) -0.02391 -0.02391
## log(u)       0.02360  0.02360

# Standard errors
cbind("mcglm" = sqrt(diag(vcov(fit.lot2.mcglm))),
      "glm" = c(sqrt(diag(vcov(fit.lot2))),NA))

##                    mcglm          glm
## (Intercept) 0.0010609254 0.0013264571
## log(u)      0.0004613227 0.0005767841
##             0.0005297868           NA

The main contribution of the mcglmpackage is that it easily fit multivariate regression models. For example, for the clotting data a bivariate Gamma model is a suitable choice.

# Initial values
list_initial = list()
list_initial$regression <- list(coef(fit.lot1), coef(fit.lot2))
list_initial$power <- list(c(2),c(2))
list_initial$tau <- list(c(0.00149), c(0.001276))
list_initial$rho = 0.80

# Matrix linear predictor
Z0 <- Diagonal(dim(clotting)[1],1)

# Fit bivariate Gamma model
fit.joint.mcglm <- mcglm(linear_pred = c(lot1 ~ log(u), lot2 ~ log(u)),
                         matrix_pred = list(list(Z0), list(Z0)),
                         link = c("inverse", "inverse"),
                         variance = c("tweedie", "tweedie"),
                         data = clotting,
                         control_initial = list_initial,
                         control_algorithm = list("correct" = TRUE,
                                                 "method" = "rc",
                                                 "tunning" = 0.001,
                                                 "max_iter" = 100))
summary(fit.joint.mcglm)

## Call: lot1 ~ log(u)
## 
## Link function: inverse
## Variance function: tweedie
## Covariance function: identity
## Regression:
##               Estimates    Std.error   Z value
## (Intercept) -0.01655962 0.0007395932 -22.39017
## log(u)       0.01534506 0.0003309394  46.36818
## 
## Dispersion:
##     Estimates    Std.error  Z value
## 1 0.001556584 0.0008243904 1.888164
## 
## Call: lot2 ~ log(u)
## 
## Link function: inverse
## Variance function: tweedie
## Covariance function: identity
## Regression:
##               Estimates    Std.error   Z value
## (Intercept) -0.02385489 0.0010587888 -22.53036
## log(u)       0.02357960 0.0004601744  51.24057
## 
## Dispersion:
##     Estimates   Std.error  Z value
## 1 0.001154365 0.000537687 2.146909
## 
## Correlation matrix:
##   Parameters Estimates Std.error  Z value
## 5      rho12 0.7252082 0.1787199 4.057793
## 
## Algorithm: rc
## Correction: TRUE
## Number iterations: 7

We can also easily change the link function. The code below fit a bivariate Gamma model using the log link function.

# Initial values
list_initial = list()
list_initial$regression <- list(c(log(mean(clotting$lot1)),0),
                                c(log(mean(clotting$lot2)),0))
list_initial$power <- list(c(2), c(2))
list_initial$tau <- list(c(0.023), c(0.024))
list_initial$rho = 0

# Fit bivariate Gamma model
fit.joint.log <- mcglm(linear_pred = c(lot1 ~ log(u), lot2 ~ log(u)),
                       matrix_pred = list(list(Z0),list(Z0)),
                       link = c("log", "log"),
                       variance = c("tweedie", "tweedie"),
                       data = clotting,
                       control_initial = list_initial)
summary(fit.joint.log)

## Call: lot1 ~ log(u)
## 
## Link function: log
## Variance function: tweedie
## Covariance function: identity
## Regression:
##              Estimates  Std.error   Z value
## (Intercept)  5.5032302 0.15180744  36.25139
## log(u)      -0.6019177 0.04412031 -13.64264
## 
## Dispersion:
##    Estimates   Std.error Z value
## 1 0.01549821 0.007775851 1.99312
## 
## Call: lot2 ~ log(u)
## 
## Link function: log
## Variance function: tweedie
## Covariance function: identity
## Regression:
##              Estimates  Std.error   Z value
## (Intercept)  4.9187575 0.14801016  33.23257
## log(u)      -0.5674356 0.04301669 -13.19106
## 
## Dispersion:
##    Estimates   Std.error  Z value
## 1 0.01473257 0.007720715 1.908187
## 
## Correlation matrix:
##   Parameters Estimates Std.error  Z value
## 5      rho12 0.9807098 0.1409252 6.959081
## 
## Algorithm: chaser
## Correction: TRUE
## Number iterations: 11

Example 4 - Binomial data

Consider the example menarche from the MASS R package.

require(MASS)
data(menarche)
data <- data.frame("resp" = menarche$Menarche/menarche$Total,
                   "Ntrial" = menarche$Total,
                   "Age" = menarche$Age)

Logistic regression model.

glm.out = glm(cbind(Menarche, Total-Menarche) ~ Age,
              family=binomial(logit), data=menarche)

The same fitted by mcglm function.

# Matrix linear predictor
Z0 <- Diagonal(dim(data)[1],1)
fit.logit <- mcglm(linear_pred = c(resp ~ Age),
                   matrix_pred = list(list(Z0)),
                   link = "logit", variance = "binomialP",
                   Ntrial = list(data$Ntrial), data = data)

## Automatic initial values selected.

Comparing parameter estimates and standard errors.

# Estimates
cbind("GLM" = coef(glm.out),
      "McGLM" = coef(fit.logit, type = "beta")$Estimates)

##                    GLM      McGLM
## (Intercept) -21.226395 -21.226395
## Age           1.631968   1.631968

# Standard error
cbind("GLM" = c(sqrt(diag(vcov(glm.out))),NA),
      "McGLM" =  sqrt(diag(vcov(fit.logit))))

##                    GLM      McGLM
## (Intercept) 0.77068466 0.69361807
## Age         0.05895308 0.05305792
##                     NA 0.13634349

We can estimate a more flexible model using the extended binomial variance function.

fit.logit.power <- mcglm(linear_pred = c(resp ~ Age),
                         matrix_pred = list(list(Z0)),
                         link = "logit", variance = "binomialP",
                         Ntrial = list(data$Ntrial),
                         power_fixed = FALSE, data = data)

## Automatic initial values selected.

summary(fit.logit.power)

## Call: resp ~ Age
## 
## Link function: logit
## Variance function: binomialP
## Covariance function: identity
## Regression:
##             Estimates Std.error   Z value
## (Intercept) -21.30965 0.6960901 -30.61334
## Age           1.63802 0.0532248  30.77550
## 
## Power:
##   Estimates Std.error  Z value
## 1  1.040691 0.1003321 10.37246
## 
## Dispersion:
##   Estimates Std.error  Z value
## 1 0.9222993 0.3394829 2.716777
## 
## Algorithm: chaser
## Correction: TRUE
## Number iterations: 8

Multivariate Covariance Generalized Linear Models

Wagner Hugo Bonat

mcglm 0.0.2 2016-01-10

Abstract

Example 1 - Count data

Example 2 - Continuous data with offset

Example 3 - Continuous positive data

Example 4 - Binomial data