library("tidyverse"); theme_set(theme_bw())
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library("Sleuth3")
library("ggResidpanel")
library("emmeans")
Let \(Y_i\) be a Bernoulli response variable. Assume \[ Y_i \stackrel{ind}{\sim} Ber(\theta_i), \quad \eta_i = \mbox{logit}(\theta_i) = \log\left(\frac{\theta_i}{1-\theta_i} \right) = \beta_0 + \beta_1X_{i,1} + \cdots + \beta_{p-1} X_{i,p-1} \] where \(\frac{\theta_i}{1-\theta_i}\) is the odds.
Sometimes observations can be grouped according to common explanatory varaible values. When this is the case, the model can be written using a binomial distribution: \[ Y_i \stackrel{ind}{\sim} Bin(n_i,\theta_i), \quad \mbox{logit}(\theta_i) = \log\left(\frac{\theta_i}{1-\theta_i} \right) = \beta_0 + \beta_1X_{i,1} + \cdots + \beta_{p-1} X_{i,p-1} \] Remember that a binomial is simply the sum of a set of Bernoulli random variables that and independent and have the same probability of success. Thus these two models are equivalent.
The inverse of the logitic, i.e. logit, function will be used later:
expit <- function(eta) 1/(1+exp(-eta)) # inverse logitistic
In R, you can fit these models both ways. The following data set is recorded in a grouped fashion.
case1802
## Treatment Cold NoCold
## 1 Placebo 335 76
## 2 VitC 302 105
Thus, it is natural to fit binomial model
# NoCold is success here
m_binomial <- glm(cbind(NoCold, Cold) ~ Treatment,
data = case1802,
family = "binomial")
Let’s wrangle the data into the other format.
case1802_individual <- case1802 %>%
tidyr::pivot_longer(-Treatment, names_to = "Result", values_to = "n") %>%
tidyr::uncount(n) %>%
mutate(Result = factor(Result))
table(case1802_individual)
## Result
## Treatment Cold NoCold
## Placebo 335 76
## VitC 302 105
Now we can analyze this data set
m_bernoulli <- glm(Result ~ Treatment,
data = case1802_individual,
family = "binomial")
The maximum likelihood estimator for this model is not available in closed form and thus we use an iterative algorithm to find the answers. But we don’t have to worry about these details.
coef(m_bernoulli)
## (Intercept) TreatmentVitC
## -1.4833972 0.4269305
This MLE can be used to determine asymptotic confidence intervals.
confint(m_bernoulli)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.73931608 -1.2405398
## TreatmentVitC 0.09477341 0.7628385
The fact that the interval for the coefficient for the dummy variable for vitamin C does not include 0 provides evidence that, if the remainder of the model is correct, that it is unlikely the coefficient is 0.
We can also compute p-values for a single parameter being zero or a collection of parameters being zero.
summary(m_bernoulli)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4833972 0.1270550 -11.675239 1.705863e-31
## TreatmentVitC 0.4269305 0.1702293 2.507973 1.214259e-02
A small p-value here indicates evidence against the model that has that coefficient equal to 0. But remember that other aspects of the model could be problematic, e.g. independence, constant probability (within a group), etc.
The p-value above is based on a Wald test. An alternative is based on a likelihood ratio test.
anova(m_bernoulli, test = "LRT")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Result
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 817 864.65
## Treatment 1 6.3573 816 858.29 0.01169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Unlike with the linear regression (which is based on the normal distribution) the p-values here don’t match exactly.
When computing predictions, we need to be explicit about whether we
want predictions on the linear part, i.e. \(\eta_i\), or the response, i.e. \(\theta_i = E[Y_i]\). By default, the
predict()
function provides predictions on the linear
part.
nd <- data.frame(Treatment = unique(case1802_individual$Treatment))
p <- nd %>%
mutate(prediction = predict(m_bernoulli, newdata = .))
p
## Treatment prediction
## 1 Placebo -1.483397
## 2 VitC -1.056467
We can manually compute predictions on the response.
p %>% mutate(probability = expit(prediction))
## Treatment prediction probability
## 1 Placebo -1.483397 0.1849148
## 2 VitC -1.056467 0.2579853
Alternatively, we can use predict()
to compute these
probabilities directly.
p %>% mutate(probability = predict(m_bernoulli, newdata = ., type = "response"))
## Treatment prediction probability
## 1 Placebo -1.483397 0.1849148
## 2 VitC -1.056467 0.2579853
You can also obtain standard errors which can be used to construct asymptotic confidence intervals.
cbind(nd, predict(m_bernoulli, newdata = nd, se.fit = TRUE)) %>%
mutate(
lcl = fit - qnorm(.975)*se.fit,
ucl = fit + qnorm(.975)*se.fit,
# convert to probability
prob = expit(fit),
p.lcl = expit(lcl),
p.ucl = expit(ucl)
) %>%
select(-residual.scale)
## Treatment fit se.fit lcl ucl prob p.lcl
## 1 Placebo -1.483397 0.1270550 -1.732420 -1.2343740 0.1849148 0.1502782
## 2 VitC -1.056467 0.1132919 -1.278515 -0.8344187 0.2579853 0.2178032
## p.ucl
## 1 0.2254168
## 2 0.3027116
Another alternative is to use the emmeans::emmeans()
function.
emmeans(m_bernoulli, ~ Treatment, type = "response")
## Treatment prob SE df asymp.LCL asymp.UCL
## Placebo 0.185 0.0191 Inf 0.150 0.225
## VitC 0.258 0.0217 Inf 0.218 0.303
##
## Confidence level used: 0.95
## Intervals are back-transformed from the logit scale
This can also be used to perform the comparisons, e.g.
emmeans(m_bernoulli, pairwise ~ Treatment, type = "response")
## $emmeans
## Treatment prob SE df asymp.LCL asymp.UCL
## Placebo 0.185 0.0191 Inf 0.150 0.225
## VitC 0.258 0.0217 Inf 0.218 0.303
##
## Confidence level used: 0.95
## Intervals are back-transformed from the logit scale
##
## $contrasts
## contrast odds.ratio SE df null z.ratio p.value
## Placebo / VitC 0.653 0.111 Inf 1 -2.508 0.0121
##
## Tests are performed on the log odds ratio scale
The odds ratio is the simplest interpretation of a logistic regression model.
It is much more difficult to assess model assumptions in logistic regression models.
resid_panel(m_binomial)
resid_panel(m_bernoulli)
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
summary(ex2011)
## Temperature Failure
## Min. :53.00 No :17
## 1st Qu.:67.00 Yes: 7
## Median :70.00
## Mean :69.92
## 3rd Qu.:75.25
## Max. :81.00
m <- glm(Failure ~ Temperature,
data = ex2011,
family = "binomial")
summary(m)
##
## Call:
## glm(formula = Failure ~ Temperature, family = "binomial", data = ex2011)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2125 -0.8253 -0.4706 0.5907 2.0512
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 10.87535 5.70291 1.907 0.0565 .
## Temperature -0.17132 0.08344 -2.053 0.0400 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28.975 on 23 degrees of freedom
## Residual deviance: 23.030 on 22 degrees of freedom
## AIC: 27.03
##
## Number of Fisher Scoring iterations: 4
p <- bind_cols(ex2011,
predict(m, newdata = ex2011, se.fit = TRUE)) %>%
mutate(
lcl = fit - qnorm(.975)*se.fit,
ucl = fit + qnorm(.975)*se.fit
)
ggplot(p, aes(x = Temperature, y = fit, ymin = lcl, ymax = ucl)) +
geom_ribbon(alpha = 0.5) +
geom_line()
This prediction is on the linear predictor scale, i.e. \(\eta\). But this is not very interpretable. Thus, we should convert to the probability scale.
p <- p %>%
mutate(
fit = expit(fit),
lcl = expit(lcl),
ucl = expit(ucl)
)
ggplot(p, aes(x = Temperature, y = fit, ymin = lcl, ymax = ucl)) +
geom_ribbon(alpha = 0.5, fill = "gray") +
geom_line(color = "blue") +
labs(
y = "Probability",
title = "Probability of O-ring Failure vs Temperature"
) +
geom_jitter(aes(y = 1*(Failure == "Yes")),
height = 0)
This analysis intends to understand the relationship between bird keeping and lung cancer. But since we know smoking is related to lung cancer, we need to adjust (or control for) smoking to understand the effect of bird keeping.
The actual data here are from a case-control study where cases are found that (in some sense) are a match to the controls.
summary(case2002)
## LC FM SS BK AG
## LungCancer:49 Female: 36 High: 45 Bird :67 Min. :37.00
## NoCancer :98 Male :111 Low :102 NoBird:80 1st Qu.:52.00
## Median :59.00
## Mean :56.97
## 3rd Qu.:63.00
## Max. :67.00
## YR CD
## Min. : 0.00 Min. : 0.00
## 1st Qu.:20.00 1st Qu.:10.00
## Median :30.00 Median :15.00
## Mean :27.85 Mean :15.75
## 3rd Qu.:39.00 3rd Qu.:20.00
## Max. :50.00 Max. :45.00
In logistic regression models, it is often helpful to construct a binary variable so that it is clear what “success” means.
d <- case2002 %>%
mutate(LungCancer = LC == "LungCancer",
BirdKeeping = ifelse(BK == "Bird", "Yes", "No"),
BirdKeeping = factor(BirdKeeping, levels = c("Yes","No")),
SmokingYears = YR) %>%
select(LungCancer, BirdKeeping, SmokingYears)
summary(d)
## LungCancer BirdKeeping SmokingYears
## Mode :logical Yes:67 Min. : 0.00
## FALSE:98 No :80 1st Qu.:20.00
## TRUE :49 Median :30.00
## Mean :27.85
## 3rd Qu.:39.00
## Max. :50.00
m <- glm(LungCancer ~ BirdKeeping * SmokingYears,
data = d,
family = "binomial")
Attempt to assess assumptions
resid_panel(m)
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
## Warning in helper_plotly_label(model): NAs introduced by coercion
Model selection
summary(m)
##
## Call:
## glm(formula = LungCancer ~ BirdKeeping * SmokingYears, family = "binomial",
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6378 -0.8497 -0.5438 0.9342 2.0604
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.819393 0.712871 -2.552 0.01070 *
## BirdKeepingNo -1.179470 1.146896 -1.028 0.30376
## SmokingYears 0.062120 0.022294 2.786 0.00533 **
## BirdKeepingNo:SmokingYears -0.009297 0.033980 -0.274 0.78440
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 187.14 on 146 degrees of freedom
## Residual deviance: 158.04 on 143 degrees of freedom
## AIC: 166.04
##
## Number of Fisher Scoring iterations: 5
anova(m, test = "Chi")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: LungCancer
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 146 187.13
## BirdKeeping 1 14.2040 145 172.93 0.0001640 ***
## SmokingYears 1 14.8168 144 158.11 0.0001185 ***
## BirdKeeping:SmokingYears 1 0.0744 143 158.04 0.7850979
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Remove interaction
m <- glm(LungCancer ~ BirdKeeping + SmokingYears,
data = d,
family = "binomial")
summary(m)
##
## Call:
## glm(formula = LungCancer ~ BirdKeeping + SmokingYears, family = "binomial",
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6093 -0.8644 -0.5283 0.9479 2.0937
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.70460 0.56267 -3.030 0.002450 **
## BirdKeepingNo -1.47555 0.39588 -3.727 0.000194 ***
## SmokingYears 0.05825 0.01685 3.458 0.000544 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 187.14 on 146 degrees of freedom
## Residual deviance: 158.11 on 144 degrees of freedom
## AIC: 164.11
##
## Number of Fisher Scoring iterations: 4
em <- emmeans(m, pairwise ~ BirdKeeping, type = "response")
confint(em)
## $emmeans
## BirdKeeping prob SE df asymp.LCL asymp.UCL
## Yes 0.479 0.0655 Inf 0.355 0.606
## No 0.174 0.0440 Inf 0.104 0.277
##
## Confidence level used: 0.95
## Intervals are back-transformed from the logit scale
##
## $contrasts
## contrast odds.ratio SE df asymp.LCL asymp.UCL
## Yes / No 4.37 1.73 Inf 2.01 9.5
##
## Confidence level used: 0.95
## Intervals are back-transformed from the log odds ratio scale
Construct a plot to help understand the relationship between bird keeping and years of smoking on the probability of lung cancer.
nd <- expand.grid(
BirdKeeping = unique(d$BirdKeeping),
SmokingYears = seq(min(d$SmokingYears), max(d$SmokingYears), length = 1001)
)
p <- bind_cols(nd, predict(m, newdata = nd, se.fit = TRUE)) %>%
mutate(
lcl = fit - qnorm(.975)*se.fit,
ucl = fit + qnorm(.975)*se.fit,
# Convert to probability scale
prob = expit(fit),
lcl = expit(lcl),
ucl = expit(ucl)
)
ggplot(p, aes(x = SmokingYears,
y = prob, ymin = lcl, ymax = ucl,
fill = BirdKeeping, color = BirdKeeping)) +
geom_ribbon(alpha = 0.5) +
geom_line() +
ylim(0,1) +
labs(y = "Probability of lung lancer",
x = "Number of years of smoking",
title = "Effect of Bird Keeping on Lung Cancer",
subtitle = "Controlling for Years of Smoking",
color = "Bird keeping",
fill = "Bird keeping") +
theme(legend.position = "bottom")
This is not necessarily the end of the analysis as there are a number of other variables in this data set that could be considered. A quick analysis suggests that maybe we have considered the most important variables.
m <- glm(LC ~ ., data = case2002, family = "binomial")
drop1(m, test = "LRT")
## Single term deletions
##
## Model:
## LC ~ FM + SS + BK + AG + YR + CD
## Df Deviance AIC LRT Pr(>Chi)
## <none> 154.20 168.20
## FM 1 155.32 167.32 1.1184 0.2902721
## SS 1 154.25 166.25 0.0505 0.8221952
## BK 1 165.87 177.87 11.6700 0.0006352 ***
## AG 1 155.49 167.49 1.2961 0.2549329
## YR 1 163.93 175.93 9.7307 0.0018121 **
## CD 1 155.24 167.24 1.0420 0.3073673
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m <- glm(Cause ~ Make*Passengers*VehicleAge, data = ex2018, family = "binomial")
anova(m, test = "LRT")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Cause
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 2320 294.20
## Make 1 42.964 2319 251.23 5.577e-11 ***
## Passengers 1 28.788 2318 222.44 8.075e-08 ***
## VehicleAge 1 26.840 2317 195.60 2.210e-07 ***
## Make:Passengers 1 4.190 2316 191.41 0.04066 *
## Make:VehicleAge 1 0.713 2315 190.70 0.39841
## Passengers:VehicleAge 1 0.936 2314 189.77 0.33343
## Make:Passengers:VehicleAge 1 0.152 2313 189.61 0.69709
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m <- glm(Cause ~ Make * Passengers + VehicleAge,
data = ex2018,
family = "binomial")
nd <- expand.grid(
Make = unique(ex2018$Make),
Passengers = seq(min(ex2018$Passengers),
max(ex2018$Passengers)),
VehicleAge = seq(min(ex2018$VehicleAge),
max(ex2018$VehicleAge))
)
p <- bind_cols(nd,
predict(m, newdata = nd, se.fit = TRUE)) %>%
mutate(
lcl = fit - qnorm(.975)*se.fit,
ucl = fit + qnorm(.975)*se.fit,
fit = expit(fit),
lcl = expit(lcl),
ucl = expit(ucl)
)
ggplot(p, aes(x = Passengers,
y = fit,
ymin = lcl,
ymax = ucl,
fill = Make,
color = Make,
linetype = Make)) +
geom_ribbon(alpha = 0.5) +
geom_line() +
facet_wrap(~VehicleAge,
labeller = label_both) +
theme(legend.position = "bottom") +
labs(
y = "Probability",
title = "Probability a Car Accident is Caused by Tire Failure"
)