# Lecture 9 - Oslo particles - Goodness-of-fit
library(tidyverse); theme_set(theme_bw() + theme(text = element_text(size = 16)))
library(readxl)
library(caret)
library(pROC)
Lecture 9 - Goodness-of-fit
Example 1 - PM10 particles in Oslo
1 Packages
We will need two new packages:
caretfor calculating confusion matrices and associated measures,pROCfor producing ROC-curves and AUC.
Install them first (Tools > Install Packages...).
2 Refit all models
Read the data and turn categorical x-variables into factor variables. We will also need a factor version, highpm10_cat, of the y-variable, highpm10. Refit all models from Lecture 8:
# Read the Oslo PM10 data and prepare the categorical variables.
oslo <- read_excel("Data/oslo.xlsx") |>
  mutate(
    # Factor version of the 0/1 outcome, needed by caret::confusionMatrix().
    highpm10_cat = factor(highpm10, levels = c(0, 1),
                          labels = c("Low", "High")),
    # Use "zero" as the reference category for both difference variables.
    tempdiff = relevel(factor(tempdiff), ref = "zero"),
    zerodiff = relevel(factor(zerodiff), ref = "zero")
  )
# Refit all the logistic regression models from Lecture 8.
# cars enters in thousands of vehicles via I(cars/1000) throughout.
model_oslo_glm <- glm(highpm10 ~ I(cars/1000)*windspeed + tempdiff,
                      family = binomial, data = oslo)
model_red_glm <- glm(highpm10 ~ tempdiff, family = binomial, data = oslo)
# Null model (intercept only) and the six candidate models:
model_0_glm <- glm(highpm10 ~ 1, family = binomial, data = oslo)
model_1_glm <- glm(highpm10 ~ I(cars/1000), family = binomial, data = oslo)
model_2_glm <- glm(highpm10 ~ I(cars/1000) + windspeed, family = binomial, data = oslo)
model_3_glm <- glm(highpm10 ~ I(cars/1000) + zerodiff, family = binomial, data = oslo)
model_4_glm <- glm(highpm10 ~ I(cars/1000)*windspeed, family = binomial, data = oslo)
model_5_glm <- glm(highpm10 ~ I(cars/1000) + tempdiff, family = binomial, data = oslo)
model_6_glm <- glm(highpm10 ~ I(cars/1000)*windspeed + zerodiff, family = binomial, data = oslo)
2.1 Calculate the expected probabilities using all the models
# Add each model's fitted probabilities P(highpm10 = 1 | x) as new columns;
# type = "response" returns probabilities rather than log-odds.
# NOTE: the column order here matches the glimpse() output shown below.
pred_phat <- oslo |> mutate(
p_0 = predict(model_0_glm, type = "response"),
p_1 = predict(model_1_glm, type = "response"),
p_2 = predict(model_2_glm, type = "response"),
p_3 = predict(model_3_glm, type = "response"),
p_red = predict(model_red_glm, type = "response"),
p_4 = predict(model_4_glm, type = "response"),
p_5 = predict(model_5_glm, type = "response"),
p_6 = predict(model_6_glm, type = "response"),
p_oslo = predict(model_oslo_glm, type = "response"))
glimpse(pred_phat)
Rows: 500
Columns: 17
$ cars <dbl> 2308, 3084, 110, 1854, 2351, 2662, 2478, 2387, 984, 2159,…
$ windspeed <dbl> 4.2, 4.8, 4.3, 3.0, 5.6, 2.3, 1.9, 8.9, 2.0, 5.2, 3.4, 4.…
$ temp2m <dbl> -4.4, -5.7, -13.5, 1.4, 4.1, 5.8, 2.7, 7.1, 4.1, 1.1, -9.…
$ tempdiff <fct> zero, zero, zero, zero, pos, zero, zero, zero, zero, zero…
$ zerodiff <fct> zero, zero, zero, zero, nonzero, zero, zero, zero, zero, …
$ windquad <chr> "NE", "NE", "NE", "SE", "NW", "SW", "SW", "SW", "SW", "NE…
$ highpm10 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
$ highpm10_cat <fct> Low, Low, Low, Low, High, Low, Low, Low, Low, Low, Low, L…
$ p_0 <dbl> 0.228, 0.228, 0.228, 0.228, 0.228, 0.228, 0.228, 0.228, 0…
$ p_1 <dbl> 0.2685731, 0.3469301, 0.1142603, 0.2282989, 0.2726122, 0.…
$ p_2 <dbl> 0.23462056, 0.29339186, 0.08407689, 0.23209207, 0.1933119…
$ p_3 <dbl> 0.19754609, 0.27459274, 0.06783712, 0.16064648, 0.5326425…
$ p_red <dbl> 0.1683938, 0.1683938, 0.1683938, 0.1683938, 0.3544304, 0.…
$ p_4 <dbl> 0.21873189, 0.33564065, 0.04371516, 0.21221119, 0.1837597…
$ p_5 <dbl> 0.19699171, 0.26999879, 0.07121687, 0.16173017, 0.4954221…
$ p_6 <dbl> 0.17562057, 0.27664805, 0.03521523, 0.15832748, 0.4055599…
$ p_oslo <dbl> 0.17011543, 0.26485253, 0.03479163, 0.15953615, 0.3197706…
3 Confusion matrix
Construct the confusion matrix for the Oslo model.
First calculate the predicted categories, \(\hat{Y}_i\), using \(\hat{Y}_i = 1\) if \(p_i > 0.5\) and 0 otherwise and turn it into a factor variable with the same labels as highpm10_cat:
# Classify with threshold 0.5 and label the result like highpm10_cat.
pred_phat <- pred_phat |>
  mutate(
    yhat_oslo = factor(ifelse(p_oslo > 0.5, "High", "Low"),
                       levels = c("Low", "High"))
  )
Calculate the confusion matrix for the oslo model, using confusionMatrix() from the caret package. You have to use the factor version of \(Y\) and define which category is the “Positive” outcome:
# Confusion matrix from caret; declaring "High" as the positive class makes
# sensitivity = P(predict High | truly High).
cm_oslo <- confusionMatrix(data = pred_phat$yhat_oslo,
                           reference = pred_phat$highpm10_cat,
                           positive = "High")
cm_oslo
Confusion Matrix and Statistics
Reference
Prediction Low High
Low 374 90
High 12 24
Accuracy : 0.796
95% CI : (0.758, 0.8305)
No Information Rate : 0.772
P-Value [Acc > NIR] : 0.1091
Kappa : 0.2364
Mcnemar's Test P-Value : 2.457e-14
Sensitivity : 0.2105
Specificity : 0.9689
Pos Pred Value : 0.6667
Neg Pred Value : 0.8060
Prevalence : 0.2280
Detection Rate : 0.0480
Detection Prevalence : 0.0720
Balanced Accuracy : 0.5897
'Positive' Class : High
glimpse(cm_oslo)
List of 6
$ positive: chr "High"
$ table : 'table' int [1:2, 1:2] 374 12 90 24
..- attr(*, "dimnames")=List of 2
.. ..$ Prediction: chr [1:2] "Low" "High"
.. ..$ Reference : chr [1:2] "Low" "High"
$ overall : Named num [1:7] 0.796 0.236 0.758 0.83 0.772 ...
..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
$ byClass : Named num [1:11] 0.211 0.969 0.667 0.806 0.667 ...
..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
$ mode : chr "sens_spec"
$ dots : list()
- attr(*, "class")= chr "confusionMatrix"
cm_oslo$table
          Reference
Prediction Low High
Low 374 90
High 12 24
cm_oslo$overall
     Accuracy         Kappa AccuracyLower AccuracyUpper  AccuracyNull
7.960000e-01 2.364355e-01 7.579738e-01 8.304830e-01 7.720000e-01
AccuracyPValue McnemarPValue
1.091351e-01 2.456741e-14
cm_oslo$byClass
         Sensitivity          Specificity       Pos Pred Value
0.2105263 0.9689119 0.6666667
Neg Pred Value Precision Recall
0.8060345 0.6666667 0.2105263
F1 Prevalence Detection Rate
0.3200000 0.2280000 0.0480000
Detection Prevalence Balanced Accuracy
0.0720000 0.5897191
4 ROC-curves
The function roc() from the pROC package calculates the ROC-curve.
4.1 The null model
# ROC-curve for the null model: the predicted probability is constant
# (p_0 = 0.228 for everyone), so the curve is the diagonal and AUC = 0.5.
roc_0 <- roc(highpm10 ~ p_0, data = pred_phat)
roc_0
Call:
roc.formula(formula = highpm10 ~ p_0, data = pred_phat)
Data: p_0 in 386 controls (highpm10 0) < 114 cases (highpm10 1).
Area under the curve: 0.5
There are a number of things saved in the output:
glimpse(roc_0)
List of 17
$ percent : logi FALSE
$ sensitivities : num [1:2] 1 0
$ specificities : num [1:2] 0 1
$ thresholds : num [1:2] -Inf Inf
$ direction : chr "<"
$ cases : Named num [1:114] 0.228 0.228 0.228 0.228 0.228 ...
..- attr(*, "names")= chr [1:114] "5" "14" "15" "20" ...
$ controls : Named num [1:386] 0.228 0.228 0.228 0.228 0.228 ...
..- attr(*, "names")= chr [1:386] "1" "2" "3" "4" ...
$ fun.sesp :function (...)
$ auc : 'auc' num 0.5
..- attr(*, "partial.auc")= logi FALSE
..- attr(*, "percent")= logi FALSE
..- attr(*, "roc")=List of 15
.. ..$ percent : logi FALSE
.. ..$ sensitivities : num [1:2] 1 0
.. ..$ specificities : num [1:2] 0 1
.. ..$ thresholds : num [1:2] -Inf Inf
.. ..$ direction : chr "<"
.. ..$ cases : Named num [1:114] 0.228 0.228 0.228 0.228 0.228 ...
.. .. ..- attr(*, "names")= chr [1:114] "5" "14" "15" "20" ...
.. ..$ controls : Named num [1:386] 0.228 0.228 0.228 0.228 0.228 ...
.. .. ..- attr(*, "names")= chr [1:386] "1" "2" "3" "4" ...
.. ..$ fun.sesp :function (...)
.. ..$ auc : 'auc' num 0.5
.. .. ..- attr(*, "partial.auc")= logi FALSE
.. .. ..- attr(*, "percent")= logi FALSE
.. .. ..- attr(*, "roc")=List of 8
.. .. .. ..- attr(*, "class")= chr "roc"
.. ..$ call : language roc.default(response = response, predictor = predictors[, 1])
.. ..$ original.predictor: Named num [1:500] 0.228 0.228 0.228 0.228 0.228 ...
.. .. ..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
.. ..$ original.response : Named num [1:500] 0 0 0 0 1 0 0 0 0 0 ...
.. .. ..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
.. ..$ predictor : Named num [1:500] 0.228 0.228 0.228 0.228 0.228 ...
.. .. ..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
.. ..$ response : Named num [1:500] 0 0 0 0 1 0 0 0 0 0 ...
.. .. ..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
.. ..$ levels : chr [1:2] "0" "1"
.. ..- attr(*, "class")= chr "roc"
$ call : language roc.formula(formula = highpm10 ~ p_0, data = pred_phat)
$ original.predictor: Named num [1:500] 0.228 0.228 0.228 0.228 0.228 ...
..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
$ original.response : Named num [1:500] 0 0 0 0 1 0 0 0 0 0 ...
..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
$ predictor : Named num [1:500] 0.228 0.228 0.228 0.228 0.228 ...
..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
$ response : Named num [1:500] 0 0 0 0 1 0 0 0 0 0 ...
..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
$ levels : chr [1:2] "0" "1"
$ predictor.name : chr "p_0"
$ response.name : chr "highpm10"
- attr(*, "class")= chr "roc"
4.2 The oslo model:
# ROC-curve for the full oslo model, built from its fitted probabilities.
roc_oslo <- roc(highpm10 ~ p_oslo, data = pred_phat)
roc_oslo
Call:
roc.formula(formula = highpm10 ~ p_oslo, data = pred_phat)
Data: p_oslo in 386 controls (highpm10 0) < 114 cases (highpm10 1).
Area under the curve: 0.7547
4.3 Plot ROC-curve
You can extract the coordinates of the curve by the coords() function.
coords(roc_oslo) |> head()
   threshold specificity sensitivity
1 -Inf 0.000000000 1
2 0.01279326 0.002590674 1
3 0.01650654 0.005181347 1
4 0.01714387 0.007772021 1
5 0.01771694 0.010362694 1
6 0.01904887 0.012953368 1
You can use these coordinates for ploting, but the pROC package contains a function ggroc() that can do this for you. Since it uses ggplot() you can also add ggplot() elements to the plot. For instance, since both axes are 0-1, we specify a square plot area with + coord_fixed() and add a suitable title:
# Plot the ROC-curve; coord_fixed() makes the 0-1 by 0-1 panel square.
ggroc(roc_oslo) +
  labs(title = "ROC-curve for model oslo") +
  coord_fixed()
Note that the x-axis is reversed!
The ggroc() function can plot several ROC-curves if given a list and the names to use as labels. Let’s add the ROC-curve for the null model, i.e., the diagonal:
# Plot both curves at once; the list names become the legend labels.
ggroc(list(null = roc_0, oslo = roc_oslo)) +
  labs(title = "ROC-curves for model oslo and the null model") +
  coord_fixed()
4.4 Find best threshold value
The default Youden method maximizing the distance to the diagonal:
# Youden's J: the threshold maximizing sensitivity + specificity - 1,
# i.e. the vertical distance from the curve to the diagonal.
youden <- coords(roc_oslo, x = "best")
youden
  threshold specificity sensitivity
1 0.2448035 0.738342 0.6403509
Using the “closest.topleft” method minimizing the distance to the top-left ideal corner:
# Alternative: the threshold minimizing the distance to the ideal
# top-left corner (specificity = 1, sensitivity = 1).
topleft <- coords(roc_oslo, x = "best", best.method = "closest.topleft")
topleft
  threshold specificity sensitivity
1 0.2330427 0.6917098 0.6842105
The topleft method results in more equal sensitivity and specificity so let’s use that. Add variable name = "methodname" so that we automatically get colours and labels:
# Tag each optimum so ggroc's colour-by-name aesthetic colours and labels
# the added points automatically.
youden[["name"]] <- "youden"
topleft[["name"]] <- "topleft"
ggroc(list(null = roc_0, oslo = roc_oslo), linewidth = 1) +
  geom_point(data = topleft, aes(x = specificity, y = sensitivity), size = 3) +
  geom_point(data = youden, aes(x = specificity, y = sensitivity), size = 3) +
  coord_fixed() +
  labs(title = "ROC-curve for model oslo",
       subtitle = "with optimal thresholds")
4.5 ROC-curves for all the models
# ROC-curves for the remaining models.
roc_1 <- roc(highpm10 ~ p_1, data = pred_phat)
roc_2 <- roc(highpm10 ~ p_2, data = pred_phat)
roc_3 <- roc(highpm10 ~ p_3, data = pred_phat)
roc_4 <- roc(highpm10 ~ p_4, data = pred_phat)
roc_5 <- roc(highpm10 ~ p_5, data = pred_phat)
roc_6 <- roc(highpm10 ~ p_6, data = pred_phat)
roc_red <- roc(highpm10 ~ p_red, data = pred_phat)
# Plot all nine curves together; the list names label the legend.
ggroc(list(`0` = roc_0, `1` = roc_1, `2` = roc_2, `3` = roc_3,
           `4` = roc_4, `5` = roc_5, `6` = roc_6, red = roc_red,
           oslo = roc_oslo),
      linewidth = 1) +
  coord_fixed() +
  labs(title = "ROC-curves for all the models")
5 Area Under the Curve
5.1 AUC for model oslo
You can pick out the AUC from the ROC-curve using the function auc():
auc(roc_oslo)
Area under the curve: 0.7547
and calculate its confidence interval using DeLong’s method with the function ci():
# DeLong 95% confidence interval for the AUC of the oslo model.
ci_oslo <- ci(roc_oslo)
ci_oslo
95% CI: 0.705-0.8045 (DeLong)
Note that the result consists of three numerical values even if only two of them are printed.
# lower limit:
ci_oslo[1]
[1] 0.70503
# AUC:
ci_oslo[2]
[1] 0.7547496
# upper limit:
ci_oslo[3]
[1] 0.8044691
5.2 AUC for all the models
# AUC with DeLong 95% confidence limits for every model.
# ci() returns c(lower, auc, upper), so one call per model gives all three
# columns. The original mixed ci(roc)[1] for lwr with ci(auc(roc))[3] for
# upr — equivalent but inconsistent — and recomputed each interval several
# times; here every interval is computed exactly once.
roc_list <- list("0" = roc_0, "1" = roc_1, "2" = roc_2, red = roc_red,
                 "3" = roc_3, "4" = roc_4, "5" = roc_5, "6" = roc_6,
                 oslo = roc_oslo)
# One row per model: columns = lower, auc, upper.
ci_mat <- t(vapply(roc_list, function(r) as.numeric(ci(r)), numeric(3)))
aucs <- data.frame(model = names(roc_list),
                   auc = ci_mat[, 2],
                   lwr = ci_mat[, 1],
                   upr = ci_mat[, 3],
                   row.names = NULL)
aucs
  model       auc       lwr       upr
1 0 0.5000000 0.5000000 0.5000000
2 1 0.6482138 0.5934717 0.7029559
3 2 0.6861876 0.6350702 0.7373050
4 red 0.6384306 0.5878164 0.6890447
5 3 0.7279793 0.6752745 0.7806840
6 4 0.6946187 0.6432921 0.7459453
7 5 0.7297064 0.6769386 0.7824741
8 6 0.7486365 0.6988793 0.7983936
9 oslo 0.7547496 0.7050300 0.8044691
5.3 Compare AUC against model oslo
The function roc.test() can be used to test whether the AUC values for two models are significantly different:
# DeLong test: is the oslo AUC significantly different from the null model's 0.5?
roc.test(roc_0, roc_oslo)
DeLong's test for two correlated ROC curves
data: roc_0 and roc_oslo
Z = -10.042, p-value < 2.2e-16
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
-0.3044691 -0.2050300
sample estimates:
AUC of roc1 AUC of roc2
0.5000000 0.7547496
# Compare model 1 (cars only) against the oslo model.
roc.test(roc_1, roc_oslo)
DeLong's test for two correlated ROC curves
data: roc_1 and roc_oslo
Z = -3.873, p-value = 0.0001075
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
-0.16044852 -0.05262302
sample estimates:
AUC of roc1 AUC of roc2
0.6482138 0.7547496
# Compare model 2 (cars + windspeed) against the oslo model.
roc.test(roc_2, roc_oslo)
DeLong's test for two correlated ROC curves
data: roc_2 and roc_oslo
Z = -3.1416, p-value = 0.00168
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
-0.11133589 -0.02578801
sample estimates:
AUC of roc1 AUC of roc2
0.6861876 0.7547496
# Compare the reduced model (tempdiff only) against the oslo model.
roc.test(roc_red, roc_oslo)
DeLong's test for two correlated ROC curves
data: roc_red and roc_oslo
Z = -5.3592, p-value = 8.361e-08
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
-0.1588593 -0.0737786
sample estimates:
AUC of roc1 AUC of roc2
0.6384306 0.7547496
# Compare model 3 (cars + zerodiff) against the oslo model.
roc.test(roc_3, roc_oslo)
DeLong's test for two correlated ROC curves
data: roc_3 and roc_oslo
Z = -2.129, p-value = 0.03326
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
-0.051415262 -0.002125325
sample estimates:
AUC of roc1 AUC of roc2
0.7279793 0.7547496
# Compare model 4 (cars*windspeed interaction) against the oslo model.
roc.test(roc_4, roc_oslo)
DeLong's test for two correlated ROC curves
data: roc_4 and roc_oslo
Z = -3.0023, p-value = 0.00268
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
-0.09938587 -0.02087593
sample estimates:
AUC of roc1 AUC of roc2
0.6946187 0.7547496
# Compare model 5 (cars + tempdiff) against the oslo model.
roc.test(roc_5, roc_oslo)
DeLong's test for two correlated ROC curves
data: roc_5 and roc_oslo
Z = -1.9942, p-value = 0.04613
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
-0.0496562234 -0.0004301324
sample estimates:
AUC of roc1 AUC of roc2
0.7297064 0.7547496
# Compare model 6 (cars*windspeed + zerodiff) against the oslo model;
# this is the only comparison that is not significant at the 5% level.
roc.test(roc_6, roc_oslo)
DeLong's test for two correlated ROC curves
data: roc_6 and roc_oslo
Z = -1.5386, p-value = 0.1239
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
-0.013900550 0.001674389
sample estimates:
AUC of roc1 AUC of roc2
0.7486365 0.7547496