# Lecture 8 - Oslo particles - multiple logistic regression
library(tidyverse); theme_set(theme_bw() + theme(text = element_text(size = 16)))
library(readxl)
library(car)

Lecture 8 - Multiple logistic regression
Example 1 - PM10 particles in Oslo
1 Packages
- Update to a newer version (preferably) or
- make sure that the package
MASS is installed. It probably is, but look for it on the Packages tab in RStudio; it should be under the System Library heading. You do not have to activate it with library(MASS).
2 Multiple logistic model
2.1 Categorical tempdiff with suitable reference category
oslo <- read_excel("Data/oslo.xlsx")
glimpse(oslo)Rows: 500
Columns: 7
$ cars <dbl> 2308, 3084, 110, 1854, 2351, 2662, 2478, 2387, 984, 2159, 30…
$ windspeed <dbl> 4.2, 4.8, 4.3, 3.0, 5.6, 2.3, 1.9, 8.9, 2.0, 5.2, 3.4, 4.4, …
$ temp2m <dbl> -4.4, -5.7, -13.5, 1.4, 4.1, 5.8, 2.7, 7.1, 4.1, 1.1, -9.0, …
$ tempdiff <chr> "zero", "zero", "zero", "zero", "pos", "zero", "zero", "zero…
$ zerodiff <chr> "zero", "zero", "zero", "zero", "nonzero", "zero", "zero", "…
$ windquad <chr> "NE", "NE", "NE", "SE", "NW", "SW", "SW", "SW", "SW", "NE", …
$ highpm10 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, …
count(oslo, tempdiff)# A tibble: 3 × 2
tempdiff n
<chr> <int>
1 neg 35
2 pos 79
3 zero 386
count(oslo, zerodiff)# A tibble: 2 × 2
zerodiff n
<chr> <int>
1 nonzero 114
2 zero 386
The tempdiff variable is a textstring with values neg, pos, zero. Turn it into a factor variable using the texts as labels with as.factor(). This will make the first name in alphabetical order (neg) the reference. Change this to zero using relevel().
The zerodiff variable should be made into a factor variable with zero as reference category:
oslo <- oslo |> mutate(
tempdiff = relevel(as.factor(tempdiff), "zero"),
zerodiff = relevel(as.factor(zerodiff), "zero"))
glimpse(oslo)Rows: 500
Columns: 7
$ cars <dbl> 2308, 3084, 110, 1854, 2351, 2662, 2478, 2387, 984, 2159, 30…
$ windspeed <dbl> 4.2, 4.8, 4.3, 3.0, 5.6, 2.3, 1.9, 8.9, 2.0, 5.2, 3.4, 4.4, …
$ temp2m <dbl> -4.4, -5.7, -13.5, 1.4, 4.1, 5.8, 2.7, 7.1, 4.1, 1.1, -9.0, …
$ tempdiff <fct> zero, zero, zero, zero, pos, zero, zero, zero, zero, zero, z…
$ zerodiff <fct> zero, zero, zero, zero, nonzero, zero, zero, zero, zero, zer…
$ windquad <chr> "NE", "NE", "NE", "SE", "NW", "SW", "SW", "SW", "SW", "NE", …
$ highpm10 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, …
count(oslo, tempdiff)# A tibble: 3 × 2
tempdiff n
<fct> <int>
1 zero 386
2 neg 35
3 pos 79
count(oslo, zerodiff)# A tibble: 2 × 2
zerodiff n
<fct> <int>
1 zero 386
2 nonzero 114
2.2 Check for multicollinearity
vif(glm(highpm10 ~ I(cars/1000) + windspeed + tempdiff,
family = "binomial",
data = oslo)) GVIF Df GVIF^(1/(2*Df))
I(cars/1000) 1.155561 1 1.07497
windspeed 1.171610 1 1.08241
tempdiff 1.219773 2 1.05092
Values close to 1 so not a problem.
2.3 Fit the full model
model_oslo_glm <- glm(highpm10 ~ I(cars/1000)*windspeed + tempdiff,
family = "binomial",
data = oslo)
model_oslo_sum <- summary(model_oslo_glm)
model_oslo_sum$coefficients Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.3193760 0.50212490 -2.6275854 8.599327e-03
I(cars/1000) 0.1827145 0.20492171 0.8916309 3.725908e-01
windspeed -0.4859905 0.17160092 -2.8320974 4.624376e-03
tempdiffneg 1.7843040 0.38442597 4.6414762 3.459288e-06
tempdiffpos 1.0175374 0.31094326 3.2724214 1.066305e-03
I(cars/1000):windspeed 0.1396820 0.06470855 2.1586332 3.087863e-02
Get the confidence intervals and the \(e^\beta\)-values, with confidence intervals:
beta <- model_oslo_glm$coefficients
ci.beta <- confint(model_oslo_glm)
cbind(b = beta, ci.beta, exp_b = exp(beta), exp(ci.beta)) |> round(digits = 2) b 2.5 % 97.5 % exp_b 2.5 % 97.5 %
(Intercept) -1.32 -2.33 -0.35 0.27 0.10 0.70
I(cars/1000) 0.18 -0.22 0.59 1.20 0.81 1.80
windspeed -0.49 -0.83 -0.16 0.62 0.44 0.85
tempdiffneg 1.78 1.04 2.56 5.96 2.83 12.89
tempdiffpos 1.02 0.41 1.63 2.77 1.50 5.10
I(cars/1000):windspeed 0.14 0.01 0.27 1.15 1.01 1.31
2.4 Null deviance and Deviance
The summary output of a glm model includes the null deviance and the residual deviance for the model:
model_oslo_sum
Call:
glm(formula = highpm10 ~ I(cars/1000) * windspeed + tempdiff,
family = "binomial", data = oslo)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.31938 0.50212 -2.628 0.00860 **
I(cars/1000) 0.18271 0.20492 0.892 0.37259
windspeed -0.48599 0.17160 -2.832 0.00462 **
tempdiffneg 1.78430 0.38443 4.641 3.46e-06 ***
tempdiffpos 1.01754 0.31094 3.272 0.00107 **
I(cars/1000):windspeed 0.13968 0.06471 2.159 0.03088 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 536.85 on 499 degrees of freedom
Residual deviance: 463.40 on 494 degrees of freedom
AIC: 475.4
Number of Fisher Scoring iterations: 5
The deviances and their respective degrees of freedom are saved in summary:
glimpse(model_oslo_sum)List of 17
$ call : language glm(formula = highpm10 ~ I(cars/1000) * windspeed + tempdiff, family = "binomial", data = oslo)
$ terms :Classes 'terms', 'formula' language highpm10 ~ I(cars/1000) * windspeed + tempdiff
.. ..- attr(*, "variables")= language list(highpm10, I(cars/1000), windspeed, tempdiff)
.. ..- attr(*, "factors")= int [1:4, 1:4] 0 1 0 0 0 0 1 0 0 0 ...
.. .. ..- attr(*, "dimnames")=List of 2
.. ..- attr(*, "term.labels")= chr [1:4] "I(cars/1000)" "windspeed" "tempdiff" "I(cars/1000):windspeed"
.. ..- attr(*, "order")= int [1:4] 1 1 1 2
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(highpm10, I(cars/1000), windspeed, tempdiff)
.. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "factor"
.. .. ..- attr(*, "names")= chr [1:4] "highpm10" "I(cars/1000)" "windspeed" "tempdiff"
$ family :List of 13
..$ family : chr "binomial"
..$ link : chr "logit"
..$ linkfun :function (mu)
..$ linkinv :function (eta)
..$ variance :function (mu)
..$ dev.resids:function (y, mu, wt)
..$ aic :function (y, n, mu, wt, dev)
..$ mu.eta :function (eta)
..$ initialize: language { if (NCOL(y) == 1) { ...
..$ validmu :function (mu)
..$ valideta :function (eta)
..$ simulate :function (object, nsim)
..$ dispersion: num 1
..- attr(*, "class")= chr "family"
$ deviance : num 463
$ aic : num 475
$ contrasts :List of 1
..$ tempdiff: chr "contr.treatment"
$ df.residual : int 494
$ null.deviance : num 537
$ df.null : int 499
$ iter : int 5
$ deviance.resid: Named num [1:500] -0.611 -0.784 -0.266 -0.59 1.51 ...
..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
$ coefficients : num [1:6, 1:4] -1.319 0.183 -0.486 1.784 1.018 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:6] "(Intercept)" "I(cars/1000)" "windspeed" "tempdiffneg" ...
.. ..$ : chr [1:4] "Estimate" "Std. Error" "z value" "Pr(>|z|)"
$ aliased : Named logi [1:6] FALSE FALSE FALSE FALSE FALSE FALSE
..- attr(*, "names")= chr [1:6] "(Intercept)" "I(cars/1000)" "windspeed" "tempdiffneg" ...
$ dispersion : num 1
$ df : int [1:3] 6 494 6
$ cov.unscaled : num [1:6, 1:6] 0.2521 -0.0876 -0.0723 -0.0105 -0.0777 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:6] "(Intercept)" "I(cars/1000)" "windspeed" "tempdiffneg" ...
.. ..$ : chr [1:6] "(Intercept)" "I(cars/1000)" "windspeed" "tempdiffneg" ...
$ cov.scaled : num [1:6, 1:6] 0.2521 -0.0876 -0.0723 -0.0105 -0.0777 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:6] "(Intercept)" "I(cars/1000)" "windspeed" "tempdiffneg" ...
.. ..$ : chr [1:6] "(Intercept)" "I(cars/1000)" "windspeed" "tempdiffneg" ...
- attr(*, "class")= chr "summary.glm"
model_oslo_sum$null.deviance[1] 536.8484
model_oslo_sum$df.null[1] 499
model_oslo_sum$deviance[1] 463.4022
model_oslo_sum$df.residual[1] 494
Just double checking that the null deviance is correct:
phat <- mean(oslo$highpm10)
phat[1] 0.228
-2*nobs(model_oslo_glm)*(phat*log(phat) + (1 - phat)*log(1 - phat))[1] 536.8484
Yes!
2.5 LR-test against the null model
We want to test H\(_0\): \(\beta_1 = \beta_2 = \beta_3 = \beta_4 = \beta_5 = 0\). Compare the oslo-model with the null model by calculating \(D_0 - D\) and the difference in the number of parameters, i.e., the difference in the degrees of freedom. Calculate the P-value using the \(\chi^2\)-distribution.
D_diff <- model_oslo_sum$null.deviance - model_oslo_sum$deviance
df_diff <- model_oslo_sum$df.null - model_oslo_sum$df.residual
chi2_alpha <- qchisq(p = 1 - 0.05, df = df_diff)
Pvalue <- pchisq(q = D_diff, df = df_diff, lower.tail = FALSE)
cbind(D_diff, df_diff, chi2_alpha, Pvalue) D_diff df_diff chi2_alpha Pvalue
[1,] 73.44625 5 11.0705 1.962176e-14
Since the P-value is smaller than 0.05, we should reject H\(_0\).
2.6 LR-test against the reduced model with only tempdiff
We want to test H\(_0\): \(\beta_1 = \beta_2 = \beta_3 = 0\).
Fit the reduced model:
model_red_glm <- glm(highpm10 ~ tempdiff,
family = "binomial",
data = oslo)
model_red_sum <- summary(model_red_glm)
model_red_sum
Call:
glm(formula = highpm10 ~ tempdiff, family = "binomial", data = oslo)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.5971 0.1360 -11.742 < 2e-16 ***
tempdiffneg 2.0025 0.3709 5.399 6.68e-08 ***
tempdiffpos 0.9974 0.2717 3.671 0.000242 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 536.85 on 499 degrees of freedom
Residual deviance: 499.80 on 497 degrees of freedom
AIC: 505.8
Number of Fisher Scoring iterations: 4
2.6.1 Alt 1. Get the \(D_{red} - D_{full}\) and \(k\) from the summary outputs
D_diff <- model_red_sum$deviance - model_oslo_sum$deviance
df_diff <- model_red_sum$df.residual - model_oslo_sum$df.residual
cbind(D_diff, df_diff) D_diff df_diff
[1,] 36.40266 3
Calculate the \(\chi^2\)-quantile and/or the P-value:
chi2_alpha <- qchisq(1 - 0.05, df_diff)
Pvalue <- pchisq(D_diff, df_diff, lower.tail = FALSE)
cbind(D_diff, df_diff, chi2_alpha, Pvalue) D_diff df_diff chi2_alpha Pvalue
[1,] 36.40266 3 7.814728 6.155208e-08
We should reject H\(_0\).
2.6.2 Alt 2. use the anova output
- The
anova()function will perform a partial F-test if the models arelm()models. - The
anova()function will perform an LR-test if the models areglm()models.
anova_red_oslo <- anova(model_red_glm, model_oslo_glm)
anova_red_osloAnalysis of Deviance Table
Model 1: highpm10 ~ tempdiff
Model 2: highpm10 ~ I(cars/1000) * windspeed + tempdiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 497 499.8
2 494 463.4 3 36.403 6.155e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
3 Leverage
Save the influence measures from the model. It contains both hat values and residuals.
model_oslo_infl <- influence(model_oslo_glm)
glimpse(model_oslo_infl)List of 5
$ hat : Named num [1:500] 0.00377 0.00814 0.00546 0.00306 0.03021 ...
..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
$ coefficients: num [1:500, 1:6] 0.00204 -0.00308 0.00276 -0.00289 -0.07541 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:500] "1" "2" "3" "4" ...
.. ..$ : chr [1:6] "(Intercept)" "I(cars/1000)" "windspeed" "tempdiffneg" ...
$ sigma : Named num [1:500] 0.969 0.969 0.969 0.969 0.967 ...
..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
$ dev.res : Named num [1:500] -0.611 -0.784 -0.266 -0.59 1.51 ...
..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
$ pear.res : Named num [1:500] -0.453 -0.6 -0.19 -0.436 1.459 ...
..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
Get the linear predictor and the hat values for the oslo model:
oslo_pred <- oslo |> mutate(
xbeta = predict(model_oslo_glm),
v = model_oslo_infl$hat)
glimpse(oslo_pred)Rows: 500
Columns: 9
$ cars <dbl> 2308, 3084, 110, 1854, 2351, 2662, 2478, 2387, 984, 2159, 30…
$ windspeed <dbl> 4.2, 4.8, 4.3, 3.0, 5.6, 2.3, 1.9, 8.9, 2.0, 5.2, 3.4, 4.4, …
$ temp2m <dbl> -4.4, -5.7, -13.5, 1.4, 4.1, 5.8, 2.7, 7.1, 4.1, 1.1, -9.0, …
$ tempdiff <fct> zero, zero, zero, zero, pos, zero, zero, zero, zero, zero, z…
$ zerodiff <fct> zero, zero, zero, zero, nonzero, zero, zero, zero, zero, zer…
$ windquad <chr> "NE", "NE", "NE", "SE", "NW", "SW", "SW", "SW", "SW", "NE", …
$ highpm10 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, …
$ xbeta <dbl> -1.5848094, -1.0208979, -3.3229671, -1.6616834, -0.7548260, …
$ v <dbl> 0.003771006, 0.008143667, 0.005456267, 0.003061162, 0.030210…
Plot leverage against the linear predictor with horizontal line at \(2(p+1)/n\), separately for \(Y=0\) and \(Y=1\).
pplus1_oslo <- length(model_oslo_glm$coefficients)
n <- nobs(model_oslo_glm)
ggplot(oslo_pred, aes(x = xbeta, y = v)) +
geom_point() +
geom_hline(yintercept = c(2*pplus1_oslo/n)) +
facet_wrap(~ highpm10)Find the highest leverages:
oslo_pred |> slice_max(v, n = 8)# A tibble: 8 × 9
cars windspeed temp2m tempdiff zerodiff windquad highpm10 xbeta v
<dbl> <dbl> <dbl> <fct> <fct> <chr> <dbl> <dbl> <dbl>
1 3727 9.4 9.1 zero zero NE 1 -0.313 0.123
2 4053 7.3 9.3 zero zero SW 0 0.00619 0.0833
3 3506 8.6 0.4 zero zero NE 0 -0.647 0.0711
4 3104 8.3 12.9 neg nonzero NW 1 0.597 0.0631
5 2223 8 7.8 neg nonzero NE 1 -0.533 0.0536
6 3236 7.2 5.2 neg nonzero SW 1 0.812 0.0491
7 2320 7.3 15 neg nonzero SW 0 -0.293 0.0476
8 3543 1.1 -9.6 pos nonzero SW 1 0.355 0.0414
Highlight the observations with the largest leverages, \(v > 0.045\):
ggplot(oslo_pred, aes(x = xbeta, y = v)) +
geom_point() +
geom_point(data = filter(oslo_pred, v > 0.045),
aes(color = "v > 0.045"), size = 3) +
geom_hline(yintercept = c(2*pplus1_oslo/n)) +
facet_wrap(~ highpm10) +
labs(color = "Highlight",
title = "Leverage vs linear predictor by Y=0/Y=1") +
theme(legend.position = "top")ggplot(oslo_pred, aes(x = cars, y = v)) +
geom_point() +
geom_point(data = filter(oslo_pred, v > 0.045),
aes(color = "v > 0.045"), size = 3) +
geom_hline(yintercept = c(2*pplus1_oslo/n)) +
facet_wrap(~ tempdiff) +
labs(color = "Highlight",
title = "Leverage vs cars by tempdiff") +
theme(legend.position = "top")ggplot(oslo_pred, aes(x = windspeed, y = v)) +
geom_point() +
geom_point(data = filter(oslo_pred, v > 0.045),
aes(color = "v > 0.045"), size = 3) +
geom_hline(yintercept = c(2*pplus1_oslo/n)) +
facet_wrap(~ tempdiff) +
labs(color = "Highlight",
title = "Leverage vs wind speed by tempdiff") +
theme(legend.position = "top")Use facet_grid() in order to plot in a grid defined by two categorical variables:
ggplot(oslo_pred, aes(x = cars, y = windspeed)) +
geom_point() +
geom_point(data = filter(oslo_pred, v > 0.045),
aes(color = "v > 0.045"), size = 3) +
geom_hline(yintercept = c(2*pplus1_oslo/n)) +
facet_grid(rows = vars(highpm10), cols = vars(tempdiff)) +
labs(color = "Highlight",
title = "Wind speed vs cars by tempdiff and Y=0/1") +
theme(legend.position = "top")4 Residuals
4.1 Pearson residuals
Extract the Pearson residuals from influence() output and standardize them:
oslo_pred <- oslo_pred |>
mutate(pearson = model_oslo_infl$pear.res,
stdpearson = pearson/sqrt(1 - v))
glimpse(oslo_pred)Rows: 500
Columns: 11
$ cars <dbl> 2308, 3084, 110, 1854, 2351, 2662, 2478, 2387, 984, 2159, 3…
$ windspeed <dbl> 4.2, 4.8, 4.3, 3.0, 5.6, 2.3, 1.9, 8.9, 2.0, 5.2, 3.4, 4.4,…
$ temp2m <dbl> -4.4, -5.7, -13.5, 1.4, 4.1, 5.8, 2.7, 7.1, 4.1, 1.1, -9.0,…
$ tempdiff <fct> zero, zero, zero, zero, pos, zero, zero, zero, zero, zero, …
$ zerodiff <fct> zero, zero, zero, zero, nonzero, zero, zero, zero, zero, ze…
$ windquad <chr> "NE", "NE", "NE", "SE", "NW", "SW", "SW", "SW", "SW", "NE",…
$ highpm10 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,…
$ xbeta <dbl> -1.5848094, -1.0208979, -3.3229671, -1.6616834, -0.7548260,…
$ v <dbl> 0.003771006, 0.008143667, 0.005456267, 0.003061162, 0.03021…
$ pearson <dbl> -0.4527547, -0.6002260, -0.1898571, -0.4356824, 1.4585065, …
$ stdpearson <dbl> -0.4536108, -0.6026851, -0.1903772, -0.4363508, 1.4810497, …
QQ-plot of the standardized Pearson residuals:
ggplot(oslo_pred, aes(sample = stdpearson)) +
geom_qq() + geom_qq_line() +
labs(title = "Q-Q-plot standardised residuals",
x = "theoretical", y = "sample")Not at all normally distributed.
Plot the residuals against the linear predictor colour coded by \(Y=0/1\). The as.factor(highpm10) prevents ggplot from using a colour spectrum and instead use default color number 1 and 2.
ggplot(oslo_pred, aes(x = xbeta,
y = stdpearson,
color = as.factor(highpm10))) +
geom_point() +
geom_hline(yintercept = c(-3, -2, 0, 2, 3),
linetype = c("dotted", "dashed", "solid", "dashed", "dotted"),
linewidth = 1) +
labs(title = "standardised residuals vs linear predictor",
x = "xb", y = "stdres",
color = "Y")Plot the squared residuals against the linear predictor colour coded by \(Y=0/1\).
ggplot(oslo_pred, aes(x = xbeta,
y = stdpearson^2,
color = as.factor(highpm10))) +
geom_point() +
geom_hline(yintercept = c(0, 2^2, 3^2),
linetype = c("solid", "dashed", "dotted"),
linewidth = 1) +
labs(title = "Squared standardised residuals vs linear predictor",
x = "xb", y = "stdres^2",
color = "Y")4.2 Deviance residuals
Extract the deviance residuals from influence() output and standardize them:
oslo_pred <- oslo_pred |>
mutate(devresid = model_oslo_infl$dev.res,
stddevresid = devresid/sqrt(1 - v))
glimpse(oslo_pred)Rows: 500
Columns: 13
$ cars <dbl> 2308, 3084, 110, 1854, 2351, 2662, 2478, 2387, 984, 2159, …
$ windspeed <dbl> 4.2, 4.8, 4.3, 3.0, 5.6, 2.3, 1.9, 8.9, 2.0, 5.2, 3.4, 4.4…
$ temp2m <dbl> -4.4, -5.7, -13.5, 1.4, 4.1, 5.8, 2.7, 7.1, 4.1, 1.1, -9.0…
$ tempdiff <fct> zero, zero, zero, zero, pos, zero, zero, zero, zero, zero,…
$ zerodiff <fct> zero, zero, zero, zero, nonzero, zero, zero, zero, zero, z…
$ windquad <chr> "NE", "NE", "NE", "SE", "NW", "SW", "SW", "SW", "SW", "NE"…
$ highpm10 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1…
$ xbeta <dbl> -1.5848094, -1.0208979, -3.3229671, -1.6616834, -0.7548260…
$ v <dbl> 0.003771006, 0.008143667, 0.005456267, 0.003061162, 0.0302…
$ pearson <dbl> -0.4527547, -0.6002260, -0.1898571, -0.4356824, 1.4585065,…
$ stdpearson <dbl> -0.4536108, -0.6026851, -0.1903772, -0.4363508, 1.4810497,…
$ devresid <dbl> -0.6106859, -0.7844542, -0.2661251, -0.5895784, 1.5100671,…
$ stddevresid <dbl> -0.6118407, -0.7876680, -0.2668541, -0.5904829, 1.5334072,…
QQ-plot of the standardized deviance residuals:
ggplot(oslo_pred, aes(sample = stddevresid)) +
geom_qq() + geom_qq_line() +
labs(title = "QQ-plot Standardized deviance residuals")Not at all normally distributed.
Plot the standardized deviance residuals against the linear predictor colour coded by \(Y=0/1\).
ggplot(oslo_pred, aes(x = xbeta,
y = stddevresid,
color = as.factor(highpm10))) +
geom_point() +
geom_hline(yintercept = c(-3, -2, 0, 2, 3),
linetype = c("dotted", "dashed", "solid", "dashed", "dotted"),
linewidth = 1) +
labs(title = "Standardised deviance residuals vs linear predictor",
x = "xb", y = "devstd",
color = "Y")These are less extreme than the Pearson residuals.
Plot the standardized deviance residuals vs cars and against wind speed, divided by tempdiff to find systematic problems:
ggplot(oslo_pred, aes(x = cars, y = stddevresid,
color = as.factor(highpm10))) +
geom_point() +
geom_hline(yintercept = c(-3, -2, 0, 2, 3),
linetype = rep(c("dotted", "dashed", "solid", "dashed", "dotted"), 3),
linewidth = 1) +
labs(title = "Standardised deviance residuals vs cars by temp diff",
y = "devstd", color = "Y") +
facet_wrap(~tempdiff)ggplot(oslo_pred, aes(x = windspeed, y = stddevresid,
color = as.factor(highpm10))) +
geom_point() +
geom_hline(yintercept = c(-3, -2, 0, 2, 3),
linetype = rep(c("dotted", "dashed", "solid", "dashed", "dotted"), 3),
linewidth = 1) +
labs(title = "Standardised deviance residuals vs windspeed by temp diff",
x = "windspeed", y = "devstd",
color = "Y") +
facet_wrap(~tempdiff)5 Influential observations
5.1 Cook’s distance
Save the Cook’s distances:
oslo_pred <- oslo_pred |> mutate(Dcook = cooks.distance(model_oslo_glm))Since none of the Cook’s distance values are even close to 1, we only add the horizontal line at \(4/n\) to the plot.
ggplot(oslo_pred, aes(x = xbeta, y = Dcook)) +
geom_point() +
geom_point(data = filter(oslo_pred, v > 0.045), aes(color = "v > 0.045"),
size = 3) +
geom_hline(yintercept = 4/n, linewidth = 1) +
facet_grid(rows = vars(highpm10), cols = vars(tempdiff)) +
labs(title = "Cook's distance vs linear predictor by temp diff",
x = "xb",
color = "Highlight") +
theme(legend.position = "top")6 Model comparisons
6.1 Fit some more models to compare with
Since tempdiff = pos and neg are small groups, both with \(\beta\)-parameters larger than zero, we might want to put them together into one group. This is done in the variable zerodiff:
table(oslo$tempdiff, oslo$zerodiff)
zero nonzero
zero 386 0
neg 0 35
pos 0 79
We will try using this version in some models as it might reduce the influence problem.
Fit some models. Note that Model 5 is the result of a backwards elimination on Model oslo using BIC as criterion.
model_0_glm <- glm(highpm10 ~ 1, family = "binomial", data = oslo)
model_1_glm <- glm(highpm10 ~ I(cars/1000), family = "binomial", data = oslo)
model_2_glm <- glm(highpm10 ~ I(cars/1000) + windspeed, family = "binomial", data = oslo)
model_3_glm <- glm(highpm10 ~ I(cars/1000) + zerodiff, family = "binomial", data = oslo)
model_4_glm <- glm(highpm10 ~ I(cars/1000)*windspeed, family = "binomial", data = oslo)
model_5_glm <- step(model_oslo_glm, k = log(nobs(model_oslo_glm)))Start: AIC=500.69
highpm10 ~ I(cars/1000) * windspeed + tempdiff
Df Deviance AIC
- I(cars/1000):windspeed 1 468.13 499.20
<none> 463.40 500.69
- tempdiff 2 492.65 517.50
Step: AIC=499.2
highpm10 ~ I(cars/1000) + windspeed + tempdiff
Df Deviance AIC
- windspeed 1 473.43 498.29
<none> 468.13 499.20
- tempdiff 2 501.62 520.26
- I(cars/1000) 1 497.67 522.52
Step: AIC=498.29
highpm10 ~ I(cars/1000) + tempdiff
Df Deviance AIC
<none> 473.43 498.29
- I(cars/1000) 1 499.80 518.45
- tempdiff 2 511.23 523.66
model_6_glm <- glm(highpm10 ~ I(cars/1000)*windspeed + zerodiff, family = "binomial", data = oslo)6.2 AIC and BIC
Save the AIC and BIC values for all the models:
aic <- AIC(model_0_glm, model_1_glm, model_2_glm, model_red_glm,
model_3_glm, model_4_glm, model_5_glm, model_6_glm,
model_oslo_glm)
bic <- BIC(model_0_glm, model_1_glm, model_2_glm, model_red_glm,
model_3_glm, model_4_glm, model_5_glm, model_6_glm,
model_oslo_glm)
collect.AICetc <- data.frame(aic, bic)
collect.AICetc df AIC df.1 BIC
model_0_glm 1 538.8484 1 543.0630
model_1_glm 2 515.2302 2 523.6594
model_2_glm 3 507.6211 3 520.2650
model_red_glm 3 505.8048 3 518.4486
model_3_glm 3 480.2511 3 492.8949
model_4_glm 4 500.6461 4 517.5045
model_5_glm 4 481.4339 4 498.2924
model_6_glm 5 476.2115 5 497.2845
model_oslo_glm 6 475.4022 6 500.6898
Get rid of the extra df.1 column:
collect.AICetc <- collect.AICetc |> mutate(df.1 = NULL)
collect.AICetc df AIC BIC
model_0_glm 1 538.8484 543.0630
model_1_glm 2 515.2302 523.6594
model_2_glm 3 507.6211 520.2650
model_red_glm 3 505.8048 518.4486
model_3_glm 3 480.2511 492.8949
model_4_glm 4 500.6461 517.5045
model_5_glm 4 481.4339 498.2924
model_6_glm 5 476.2115 497.2845
model_oslo_glm 6 475.4022 500.6898
Model oslo with cars*windspeed and tempdiff is the best (AIC).
Model 3 with cars and zerodiff is the best (BIC).
6.3 Pseudo R2
Save the log likelihood for the null model separately with the logLik() function. We will need to pick out the value so that we get rid of the text surrounding it.
logLik(model_0_glm)'log Lik.' -268.4242 (df=1)
lnL0 <- logLik(model_0_glm)[1]
lnL0[1] -268.4242
Add the log-likelihoods of all the models to the AIC-collection:
collect.AICetc <- collect.AICetc |> mutate(
loglik = c(logLik(model_0_glm)[1],
logLik(model_1_glm)[1],
logLik(model_2_glm)[1],
logLik(model_red_glm)[1],
logLik(model_3_glm)[1],
logLik(model_4_glm)[1],
logLik(model_5_glm)[1],
logLik(model_6_glm)[1],
logLik(model_oslo_glm)[1]))
collect.AICetc df AIC BIC loglik
model_0_glm 1 538.8484 543.0630 -268.4242
model_1_glm 2 515.2302 523.6594 -255.6151
model_2_glm 3 507.6211 520.2650 -250.8106
model_red_glm 3 505.8048 518.4486 -249.9024
model_3_glm 3 480.2511 492.8949 -237.1256
model_4_glm 4 500.6461 517.5045 -246.3230
model_5_glm 4 481.4339 498.2924 -236.7170
model_6_glm 5 476.2115 497.2845 -233.1057
model_oslo_glm 6 475.4022 500.6898 -231.7011
6.3.1 McFadden
Calculate McFadden’s pseudo \(R^2\) and adjusted pseudo \(R^2\) by dividing by the log-likelihood for the null model, lnL0. Note that df is the same as \(p+1\) so we can use it when calculating the adjusted version.
collect.AICetc <- collect.AICetc |> mutate(
R2McF = 1 - loglik/lnL0,
R2McF.adj = 1 - (loglik - (df - 1)/2)/lnL0)
collect.AICetc df AIC BIC loglik R2McF R2McF.adj
model_0_glm 1 538.8484 543.0630 -268.4242 0.00000000 0.00000000
model_1_glm 2 515.2302 523.6594 -255.6151 0.04771969 0.04585697
model_2_glm 3 507.6211 520.2650 -250.8106 0.06561863 0.06189318
model_red_glm 3 505.8048 518.4486 -249.9024 0.06900196 0.06527651
model_3_glm 3 480.2511 492.8949 -237.1256 0.11660143 0.11287598
model_4_glm 4 500.6461 517.5045 -246.3230 0.08233674 0.07674857
model_5_glm 4 481.4339 498.2924 -236.7170 0.11812363 0.11253546
model_6_glm 5 476.2115 497.2845 -233.1057 0.13157709 0.12412619
model_oslo_glm 6 475.4022 500.6898 -231.7011 0.13681004 0.12749642
6.3.2 Cox-Snell and Nagelkerke
Save the maximal Cox-Snell value (remember that we saved n previously):
R2CS.max <- 1 - (exp(lnL0))^(2/n)
R2CS.max[1] 0.6582572
Calculate Cox-Snell and Nagelkerke pseudo \(R^2\):
collect.AICetc <- collect.AICetc |> mutate(
R2CS = 1 - (exp(lnL0 - loglik))^(2/n),
R2N = R2CS/R2CS.max)
collect.AICetc df AIC BIC loglik R2McF R2McF.adj R2CS
model_0_glm 1 538.8484 543.0630 -268.4242 0.00000000 0.00000000 0.00000000
model_1_glm 2 515.2302 523.6594 -255.6151 0.04771969 0.04585697 0.04994602
model_2_glm 3 507.6211 520.2650 -250.8106 0.06561863 0.06189318 0.06802987
model_red_glm 3 505.8048 518.4486 -249.9024 0.06900196 0.06527651 0.07140926
model_3_glm 3 480.2511 492.8949 -237.1256 0.11660143 0.11287598 0.11767480
model_4_glm 4 500.6461 517.5045 -246.3230 0.08233674 0.07674857 0.08460965
model_5_glm 4 481.4339 498.2924 -236.7170 0.11812363 0.11253546 0.11911567
model_6_glm 5 476.2115 497.2845 -233.1057 0.13157709 0.12412619 0.13174853
model_oslo_glm 6 475.4022 500.6898 -231.7011 0.13681004 0.12749642 0.13661321
R2N
model_0_glm 0.00000000
model_1_glm 0.07587616
model_2_glm 0.10334846
model_red_glm 0.10848232
model_3_glm 0.17876721
model_4_glm 0.12853585
model_5_glm 0.18095613
model_6_glm 0.20014751
model_oslo_glm 0.20753775
The function starts_with() can be used in a select to allow us to select all variables that starts with a specific text string. There is also ends_with() and contains().
Select only df and the \(R^2\) values.
collect.AICetc |> select(df, starts_with("R2")) |> round(digits = 3) df R2McF R2McF.adj R2CS R2N
model_0_glm 1 0.000 0.000 0.000 0.000
model_1_glm 2 0.048 0.046 0.050 0.076
model_2_glm 3 0.066 0.062 0.068 0.103
model_red_glm 3 0.069 0.065 0.071 0.108
model_3_glm 3 0.117 0.113 0.118 0.179
model_4_glm 4 0.082 0.077 0.085 0.129
model_5_glm 4 0.118 0.113 0.119 0.181
model_6_glm 5 0.132 0.124 0.132 0.200
model_oslo_glm 6 0.137 0.127 0.137 0.208