
library("tidyverse"); theme_set(theme_bw())
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library("Sleuth3")
library("ggResidpanel")
library("emmeans")

Background

As a general approach, regression allows the mean (or expectation) of the response variable to depend on categorical and continuous explanatory variables in complex patterns.

Energy expenditure

Scientific question: How does echolocation affect energy expenditure?

Two-group analysis

d <- case1002 %>%
  filter(grepl("bats", Type)) %>%
  mutate(echolocating = Type == "echolocating bats")

d %>% 
  group_by(echolocating) %>%
  summarize(
    n = n(),
    mean = mean(Energy),
    sd = sd(Energy))
## # A tibble: 2 × 4
##   echolocating     n  mean    sd
##   <lgl>        <int> <dbl> <dbl>
## 1 FALSE            4 31.0  10.1 
## 2 TRUE             4  3.08  3.84
t.test(Energy ~ echolocating, data = d, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Energy by echolocating
## t = 5.1562, df = 3.84, p-value = 0.007496
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  12.65831 43.28169
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##               31.05                3.08
# Maybe should use logarithms
d %>% 
  group_by(echolocating) %>%
  summarize(
    n = n(),
    mean = mean(log(Energy)),
    sd = sd(log(Energy)))
## # A tibble: 2 × 4
##   echolocating     n  mean    sd
##   <lgl>        <int> <dbl> <dbl>
## 1 FALSE            4 3.40  0.323
## 2 TRUE             4 0.653 1.02
t.test(log(Energy) ~ echolocating, data = d, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  log(Energy) by echolocating
## t = 5.1122, df = 3.591, p-value = 0.009205
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  1.184026 4.302518
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##           3.3961200           0.6528477

Three-group analysis

case1002 %>% 
  group_by(Type) %>%
  summarize(
    n = n(),
    mean = mean(log(Energy)),
    sd = sd(log(Energy)))
## # A tibble: 3 × 4
##   Type                       n  mean    sd
##   <fct>                  <int> <dbl> <dbl>
## 1 echolocating bats          4 0.653 1.02 
## 2 non-echolocating bats      4 3.40  0.323
## 3 non-echolocating birds    12 2.79  0.888
m <- lm(log(Energy) ~ Type, data = case1002)
anova(m)
## Analysis of Variance Table
## 
## Response: log(Energy)
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## Type       2 17.845  8.9222  12.504 0.0004576 ***
## Residuals 17 12.130  0.7135                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m)
## 
## Call:
## lm(formula = log(Energy) ~ Type, data = case1002)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88718 -0.39944  0.02359  0.49323  1.52531 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  0.6528     0.4224   1.546 0.140585    
## Typenon-echolocating bats    2.7433     0.5973   4.593 0.000259 ***
## Typenon-echolocating birds   2.1345     0.4877   4.377 0.000411 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8447 on 17 degrees of freedom
## Multiple R-squared:  0.5953, Adjusted R-squared:  0.5477 
## F-statistic:  12.5 on 2 and 17 DF,  p-value: 0.0004576

Why does there seem to be a difference? Is it due to echolocation or something else, e.g. Mass?

Regression

ggplot(case1002, aes(x = Mass, y = Energy, color = Type, shape = Type)) +
  geom_point() + 
  scale_y_log10() + 
  scale_x_log10()

m <- lm(log(Energy) ~ log(Mass) + Type, data = case1002)
anova(m)
## Analysis of Variance Table
## 
## Response: log(Energy)
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## log(Mass)  1 29.3919 29.3919 849.9108 2.691e-15 ***
## Type       2  0.0296  0.0148   0.4276    0.6593    
## Residuals 16  0.5533  0.0346                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

After we control/adjust for Mass, Type does not appear to improve the model. Thus, after controlling for Mass, echolocation does not appear to increase energy expenditure.

Of course, now we might like to know why echolocation occurs primarily in low-mass species.

Regression Model

The general structure of a regression model is \[ Y_i \stackrel{ind}{\sim} N(\beta_0 + \beta_1X_{i,1} + \cdots + \beta_{p-1} X_{i,p-1}, \sigma^2) \] for \(i=1,\ldots,n\) or, equivalently, \[ Y_i = \beta_0 + \beta_1X_{i,1} + \cdots + \beta_{p-1} X_{i,p-1} + \epsilon_i, \qquad \epsilon_i \stackrel{ind}{\sim} N(0, \sigma^2). \]

where, for observation \(i\),

  • \(Y_i\) is the value of the response variable,
  • \(X_{i,j}\) is the value of the \(j\)th explanatory variable, and
  • \(\epsilon_i\) is the error.

You have a lot of flexibility in your choice for the response and explanatory variables.
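
To make this concrete, here is a minimal simulation from this model with hypothetical coefficients; lm() approximately recovers them.

set.seed(20230101)
n  <- 100
X1 <- runif(n)
X2 <- rnorm(n)
beta  <- c(1, 2, -0.5) # hypothetical (beta_0, beta_1, beta_2)
sigma <- 0.3
Y <- beta[1] + beta[2] * X1 + beta[3] * X2 + rnorm(n, 0, sigma)
coef(lm(Y ~ X1 + X2)) # estimates should be close to beta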

Matrix notation

An alternative way to represent the model uses matrix notation and the multivariate normal distribution. \[ Y = X\beta + \epsilon, \qquad \epsilon \sim N_n(0,\sigma^2 \mathrm{I}) \] where

  • \(Y = (Y_1,\ldots,Y_n)^\top\) is an \(n\times 1\) response variable vector
  • \(X\) is an \(n\times p\) design matrix
    • each row \(X_{r,\cdot}\) contains the explanatory variable values for one observation
    • each column \(X_{\cdot, c}\) contains the values of one explanatory variable for all observations
  • \(\beta = (\beta_0,\beta_1,\ldots,\beta_{p-1})^\top\) is a \(p\times 1\) coefficient parameter vector
  • \(\epsilon = (\epsilon_1,\ldots,\epsilon_n)^\top\) is an \(n\times 1\) error vector
  • \(N_n(\mu,\Sigma)\) is a multivariate normal distribution with
    • mean \(\mu\) and
    • covariance matrix \(\Sigma\) where
      • \(\Sigma_{i,j}\) is the covariance between \(\epsilon_i\) and \(\epsilon_j\)
  • \(\mathrm{I}\) is the \(n\times n\) identity matrix

If you are interested in learning more about the multivariate normal distribution and its uses, look for a course in multivariate data analysis, e.g. STAT 475.
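
To see a design matrix concretely, model.matrix() returns \(X\) for a fitted model; for the three-group model, the columns are an intercept and two indicator variables.

m_type <- lm(log(Energy) ~ Type, data = case1002)
head(model.matrix(m_type)) # intercept plus indicators for two of the three Types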

Parameter estimates

Use of matrix notation and linear algebra eases parameter estimation:

  • estimated coefficients \(\hat\beta = (X^\top X)^{-1} X^\top y\)
  • fitted/predicted values \(\hat{y} = X\hat\beta\)
  • residuals \(\hat{\epsilon} = y - \hat{y}\)
  • estimated error variance \(\hat\sigma^2 = \frac{1}{n-p}\hat{\epsilon}^\top \hat{\epsilon}\)
  • coefficient of determination \(R^2 = 1-\hat{\epsilon}^\top \hat{\epsilon}/(y-\overline{y})^\top (y-\overline{y})\)
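
As a sanity check, these formulas can be computed directly using model.matrix() and a little linear algebra; this minimal sketch uses the additive model (any model would do) and compares against lm().

fit <- lm(log(Energy) ~ log(Mass) + Type, data = case1002)
X <- model.matrix(fit)                           # n x p design matrix
y <- log(case1002$Energy)
beta_hat <- solve(crossprod(X), crossprod(X, y)) # (X'X)^{-1} X'y
e_hat <- y - X %*% beta_hat                      # residuals
sigma2_hat <- sum(e_hat^2) / (nrow(X) - ncol(X)) # estimated error variance
c(sqrt(sigma2_hat), summary(fit)$sigma)          # the two should agree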
m <- lm(log(Energy) ~ log(Mass) * Type, data = case1002)
s <- summary(m)

# Coefficient estimates and confidence/credible intervals
coef(m)            # or m$coefficients
##                          (Intercept)                            log(Mass) 
##                          -1.47051526                           0.80465705 
##            Typenon-echolocating bats           Typenon-echolocating birds 
##                           1.26806769                          -0.11032250 
##  log(Mass):Typenon-echolocating bats log(Mass):Typenon-echolocating birds 
##                          -0.21487499                           0.03071328
confint(m)
##                                           2.5 %     97.5 %
## (Intercept)                          -2.0017153 -0.9393152
## log(Mass)                             0.6187372  0.9905769
## Typenon-echolocating bats            -1.4888841  4.0250195
## Typenon-echolocating birds           -0.9355124  0.7148674
## log(Mass):Typenon-echolocating bats  -0.6944979  0.2647479
## log(Mass):Typenon-echolocating birds -0.1898417  0.2512682
# SD estimate
s$sigma
## [1] 0.1899
# Coefficient of determination
s$r.squared
## [1] 0.9831569
# Fitted and residuals
head(fitted(m))    # or head(m$fitted.values)
##        1        2        3        4        5        6 
## 3.724328 3.597247 3.072588 3.190317 1.084392 1.389195
head(residuals(m)) # or head(m$residuals)
##           1           2           3           4           5           6 
##  0.05302022 -0.04762963  0.07586501 -0.08125559 -0.18423016 -0.02055509
# A reasonable display of most of this
summary(m)
## 
## Call:
## lm(formula = log(Energy) ~ log(Mass) * Type, data = case1002)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25152 -0.12643 -0.00954  0.08124  0.32840 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -1.47052    0.24767  -5.937 3.63e-05 ***
## log(Mass)                             0.80466    0.08668   9.283 2.33e-07 ***
## Typenon-echolocating bats             1.26807    1.28542   0.987    0.341    
## Typenon-echolocating birds           -0.11032    0.38474  -0.287    0.779    
## log(Mass):Typenon-echolocating bats  -0.21487    0.22362  -0.961    0.353    
## log(Mass):Typenon-echolocating birds  0.03071    0.10283   0.299    0.770    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1899 on 14 degrees of freedom
## Multiple R-squared:  0.9832, Adjusted R-squared:  0.9771 
## F-statistic: 163.4 on 5 and 14 DF,  p-value: 6.696e-12

Assumptions

The assumptions in every regression model are

  • errors are independent,
  • errors are normally distributed,
  • errors have constant variance,

and

  • the expected response, \(E[Y_i]\), depends on the explanatory variables according to a linear function (of the parameters).

We generally use graphical techniques to assess these assumptions. In particular, we utilize

  • residuals v fitted
  • qqplot of residuals
  • residuals v explanatory
  • residuals v index

base R graphics

opar <- par(mfrow = c(2,3))
plot(m, 1:6, ask = FALSE)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

par(opar)
d <- case1002 %>%
  mutate(residuals = residuals(m))

plot(d$residuals) # residuals v index

plot(residuals ~ Type, data = d, type = "p")

plot(residuals ~ log(Mass), data = d)

ggResidpanel

This package was created by ISU Statistics PhD alumni and utilizes ggplot2.

resid_panel(m, plots = "all")

resid_panel(m, plots = "all", 
            qqbands = TRUE, smoother = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

resid_panel(m, plots = c("resid","index","qq","cookd"), 
            qqbands = TRUE, smoother = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

resid_xpanel(m, smoother = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Interpretation

In a linear regression model there are \(p\) coefficients \(\beta_0,\ldots,\beta_{p-1}\) and one variance \(\sigma^2\). Generally, we are most interested in understanding the effect of explanatory variables by interpreting (a transformation of) the coefficients.

The general interpretation is

a one-unit increase in the explanatory variable is associated with a \(\beta\)
increase in the mean of the response, holding all other variables constant
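
For example, in the additive log-log model fit earlier, the log(Mass) coefficient is the increase in mean log(Energy) associated with a one-unit increase in log(Mass); exponentiating converts this to a multiplicative statement about (median) Energy.

fit <- lm(log(Energy) ~ log(Mass) + Type, data = case1002)
coef(fit)["log(Mass)"]      # change in mean log(Energy) per unit log(Mass)
1.10^coef(fit)["log(Mass)"] # multiplicative change in median Energy for a 10% increase in Mass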

There are several issues with this interpretation:

  • is the one-unit increase relevant?
  • is the increase in mean relevant?
  • can you hold the other variables constant?

In this class, we will focus on graphical interpretations.

Tests

T-tests for regression coefficients

If a regression coefficient is exactly zero, that explanatory variable has no effect on the mean of the response. Thus constructing a hypothesis test with the null value being zero is reasonable. If the p-value for this test is small, it only tells us there is something wrong with the model that includes this coefficient being zero; the small p-value could arise from a violation of any (or several) of the model assumptions. Thus, the analyst needs to evaluate all model assumptions.

Recall that each regression coefficient can be tested using the following t statistic \[ T = \frac{\hat\beta - b_0}{SE(\hat\beta)} \] where \(b_0\) is the hypothesized value. Then this statistic is compared to a t-distribution with \(n-p\) degrees of freedom.
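
As a check, the log(Mass) row of the table below can be reproduced by hand from m and s (the interaction model and its summary from above) with \(b_0 = 0\):

est <- coef(s)["log(Mass)", "Estimate"]
se  <- coef(s)["log(Mass)", "Std. Error"]
t_stat <- (est - 0) / se
c(t_stat, 2 * pt(-abs(t_stat), df = m$df.residual)) # t value and two-sided p-value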

coef(s) # or s$coefficients
##                                         Estimate Std. Error    t value
## (Intercept)                          -1.47051526 0.24767033 -5.9373897
## log(Mass)                             0.80465705 0.08668453  9.2825915
## Typenon-echolocating bats             1.26806769 1.28542004  0.9865006
## Typenon-echolocating birds           -0.11032250 0.38474216 -0.2867440
## log(Mass):Typenon-echolocating bats  -0.21487499 0.22362264 -0.9608821
## log(Mass):Typenon-echolocating birds  0.03071328 0.10283304  0.2986713
##                                          Pr(>|t|)
## (Intercept)                          3.625681e-05
## log(Mass)                            2.329921e-07
## Typenon-echolocating bats            3.406309e-01
## Typenon-echolocating birds           7.785083e-01
## log(Mass):Typenon-echolocating bats  3.529143e-01
## log(Mass):Typenon-echolocating birds 7.695781e-01

This table provides \(\hat\beta\), \(SE(\hat\beta)\), the t statistic with \(b_0 = 0\), and the two-sided p-value.

F-tests

Two models are nested when setting some parameters in one of the models to specific values (usually 0) produces the other model. The second model is called the simpler model.

When we would like to compare two nested models we can use an F-test. The null model for this comparison is the simpler model. A small p-value provides evidence against the simpler model.

In the standard regression output, R provides an F-test to compare the intercept only model to the full model.

summary(m)
## 
## Call:
## lm(formula = log(Energy) ~ log(Mass) * Type, data = case1002)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25152 -0.12643 -0.00954  0.08124  0.32840 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -1.47052    0.24767  -5.937 3.63e-05 ***
## log(Mass)                             0.80466    0.08668   9.283 2.33e-07 ***
## Typenon-echolocating bats             1.26807    1.28542   0.987    0.341    
## Typenon-echolocating birds           -0.11032    0.38474  -0.287    0.779    
## log(Mass):Typenon-echolocating bats  -0.21487    0.22362  -0.961    0.353    
## log(Mass):Typenon-echolocating birds  0.03071    0.10283   0.299    0.770    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1899 on 14 degrees of freedom
## Multiple R-squared:  0.9832, Adjusted R-squared:  0.9771 
## F-statistic: 163.4 on 5 and 14 DF,  p-value: 6.696e-12

We can recreate this F-test using the anova() function.

m  <- lm(log(Energy) ~ log(Mass) * Type, data = case1002)
m0 <- lm(log(Energy) ~ 1, data = case1002)
anova(m0, m)
## Analysis of Variance Table
## 
## Model 1: log(Energy) ~ 1
## Model 2: log(Energy) ~ log(Mass) * Type
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     19 29.9748                                  
## 2     14  0.5049  5     29.47 163.44 6.696e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Generally, though, this F-test is not very interesting. It may be more informative to consider adding terms to the model sequentially.

anova(m)
## Analysis of Variance Table
## 
## Response: log(Energy)
##                Df  Sum Sq Mean Sq  F value    Pr(>F)    
## log(Mass)       1 29.3919 29.3919 815.0382 8.265e-14 ***
## Type            2  0.0296  0.0148   0.4100    0.6713    
## log(Mass):Type  2  0.0484  0.0242   0.6718    0.5265    
## Residuals      14  0.5049  0.0361                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This is a sequential comparison, i.e. 

  1. comparing log(Mass) to intercept-only model
  2. comparing Type + log(Mass) to log(Mass)
  3. comparing Type + log(Mass) + Type:log(Mass) to Type + log(Mass)
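
For example, the third comparison can be reproduced directly as a nested-model F-test by fitting the additive model and comparing it to the interaction model m:

m_additive <- lm(log(Energy) ~ log(Mass) + Type, data = case1002)
anova(m_additive, m) # same F and p-value as the log(Mass):Type row above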

If we just want to look at the highest-order terms, we can use drop1().

drop1(m, test = "F")
## Single term deletions
## 
## Model:
## log(Energy) ~ log(Mass) * Type
##                Df Sum of Sq     RSS     AIC F value Pr(>F)
## <none>                      0.50487 -61.584               
## log(Mass):Type  2   0.04845 0.55332 -63.751  0.6718 0.5265

Contrasts

Contrasts are linear combinations of means whose coefficients sum to zero. They are useful for comparing the means of groups (or collections of groups) to each other. Extensions of these ideas can be used to compare group means at fixed values of other explanatory variables.

First, imagine that we wanted to perform all pairwise comparisons.

m <- lm(log(Energy) ~ Type, data = case1002)
e <- emmeans(m, pairwise~Type)
e
## $emmeans
##  Type                   emmean    SE df lower.CL upper.CL
##  echolocating bats       0.653 0.422 17   -0.238     1.54
##  non-echolocating bats   3.396 0.422 17    2.505     4.29
##  non-echolocating birds  2.787 0.244 17    2.273     3.30
## 
## Results are given on the log (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                                           estimate    SE df t.ratio
##  echolocating bats - (non-echolocating bats)          -2.743 0.597 17  -4.593
##  echolocating bats - (non-echolocating birds)         -2.134 0.488 17  -4.377
##  (non-echolocating bats) - (non-echolocating birds)    0.609 0.488 17   1.248
##  p.value
##   0.0007
##   0.0011
##   0.4423
## 
## Results are given on the log (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 3 estimates

Suppose we wanted to compare echolocating bats vs (non-echolocating bats AND non-echolocating birds). One way to pose this question is to compare the mean of the echolocating bats to the average of the means for the non-echolocating bats and non-echolocating birds.
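
This contrast can be checked by hand from the group means of log(Energy); the coefficients c(-1, 0.5, 0.5) sum to zero and reproduce the emmeans estimate below.

means <- tapply(log(case1002$Energy), case1002$Type, mean)
sum(c(-1, 0.5, 0.5) * means) # matches the emmeans estimate of the contrast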

contrast(e, list("echo v non-echo" = c(-1,.5,.5)))
## I bet you wanted to call this with just object[[1]] - use '[[]]' or which' if I'm wrong.
## See '? emm_list' for more information
##  contrast        estimate    SE df t.ratio p.value
##  echo v non-echo     2.44 0.488 17   5.001  0.0001
## 
## Results are given on the log (not the response) scale.

Suppose instead you wanted to consider the model that has a different slope for each type and to compare the slopes among the different types.

m <- lm(log(Energy) ~ log(Mass)*Type, data = case1002)
emtrends(m, pairwise ~ Type, var = "log(Mass)")
## $emtrends
##  Type                   log(Mass).trend     SE df lower.CL upper.CL
##  echolocating bats                0.805 0.0867 14    0.619    0.991
##  non-echolocating bats            0.590 0.2061 14    0.148    1.032
##  non-echolocating birds           0.835 0.0553 14    0.717    0.954
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                                           estimate    SE df t.ratio
##  echolocating bats - (non-echolocating bats)          0.2149 0.224 14   0.961
##  echolocating bats - (non-echolocating birds)        -0.0307 0.103 14  -0.299
##  (non-echolocating bats) - (non-echolocating birds)  -0.2456 0.213 14  -1.151
##  p.value
##   0.6124
##   0.9522
##   0.5003
## 
## P value adjustment: tukey method for comparing a family of 3 estimates

Differences between groups at a particular value of a continuous variable

ggplot(case1002, aes(x = Mass, y = Energy, 
       color = Type, linetype = Type, shape = Type)) +
  geom_point() + 
  geom_smooth(method = "lm") +
  scale_x_log10() + 
  scale_y_log10() +
  geom_vline(xintercept = 400)
## `geom_smooth()` using formula = 'y ~ x'

m <- lm(log(Energy) ~ log(Mass)*Type, data = case1002)
em <- emmeans(m, pairwise ~ Type, at = list(Mass = 400))
## NOTE: Results may be misleading due to involvement in interactions
em
## $emmeans
##  Type                   emmean     SE df lower.CL upper.CL
##  echolocating bats        3.35 0.3057 14     2.69     4.01
##  non-echolocating bats    3.33 0.0976 14     3.12     3.54
##  non-echolocating birds   3.42 0.0692 14     3.28     3.57
## 
## Results are given on the log (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                                           estimate    SE df t.ratio
##  echolocating bats - (non-echolocating bats)          0.0193 0.321 14   0.060
##  echolocating bats - (non-echolocating birds)        -0.0737 0.313 14  -0.235
##  (non-echolocating bats) - (non-echolocating birds)  -0.0930 0.120 14  -0.778
##  p.value
##   0.9980
##   0.9701
##   0.7223
## 
## Results are given on the log (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 3 estimates
co <- contrast(em, type = "response")
## I bet you wanted to call this with just object[[1]] - use '[[]]' or which' if I'm wrong.
## See '? emm_list' for more information
confint(co)
##  contrast                        ratio    SE df lower.CL upper.CL
##  echolocating bats effect        0.982 0.204 14    0.558     1.73
##  (non-echolocating bats) effect  0.963 0.119 14    0.689     1.35
##  (non-echolocating birds) effect 1.057 0.123 14    0.770     1.45
## 
## Confidence level used: 0.95 
## Conf-level adjustment: bonferroni method for 3 estimates 
## Intervals are back-transformed from the log scale

The tools provided in the emmeans package are tremendously helpful for real-world statistical analysis. They allow you to fit a single model and ask many scientific questions.

Model construction

There are various ways to construct regression models in R.

Multiple explanatory variables

If we want to include multiple explanatory variables, we simply use the + symbol.

lm(Energy ~ Mass + Type, data = case1002)
## 
## Call:
## lm(formula = Energy ~ Mass + Type, data = case1002)
## 
## Coefficients:
##                (Intercept)                        Mass  
##                     1.4213                      0.0575  
##  Typenon-echolocating bats  Typenon-echolocating birds  
##                     1.1685                      4.6007

Transformations

Transformations of the response and explanatory variables can generally be included directly in the call.

lm(log(Energy) ~ sqrt(Mass) + Type, data = case1002)
## 
## Call:
## lm(formula = log(Energy) ~ sqrt(Mass) + Type, data = case1002)
## 
## Coefficients:
##                (Intercept)                  sqrt(Mass)  
##                    0.05805                     0.13340  
##  Typenon-echolocating bats  Typenon-echolocating birds  
##                    0.44399                     0.70902

The most common transformation is log().

I()

If we want to include a polynomial term, e.g. quadratic, we need to encapsulate it in I().

lm(log(Energy) ~ Mass + I(Mass^2) + Type, data = case1002)
## 
## Call:
## lm(formula = log(Energy) ~ Mass + I(Mass^2) + Type, data = case1002)
## 
## Coefficients:
##                (Intercept)                        Mass  
##                  4.006e-01                   9.380e-03  
##                  I(Mass^2)   Typenon-echolocating bats  
##                 -8.335e-06                   7.842e-01  
## Typenon-echolocating birds  
##                  7.039e-01

This can also be used to “shift the intercept” to provide an interpretable intercept.

meanMass <- mean(case1002$Mass)
lm(Energy ~ I(Mass - meanMass), data = case1002)
## 
## Call:
## lm(formula = Energy ~ I(Mass - meanMass), data = case1002)
## 
## Coefficients:
##        (Intercept)  I(Mass - meanMass)  
##            19.5180              0.0587

Interactions

Interactions between variables can be included by using :.

lm(log(Energy) ~ log(Mass) + Type + log(Mass):Type, data = case1002)
## 
## Call:
## lm(formula = log(Energy) ~ log(Mass) + Type + log(Mass):Type, 
##     data = case1002)
## 
## Coefficients:
##                          (Intercept)                             log(Mass)  
##                             -1.47052                               0.80466  
##            Typenon-echolocating bats            Typenon-echolocating birds  
##                              1.26807                              -0.11032  
##  log(Mass):Typenon-echolocating bats  log(Mass):Typenon-echolocating birds  
##                             -0.21487                               0.03071

But a convenient shorthand is to use * which will include the main effects as well as the interaction.

lm(log(Energy) ~ log(Mass) * Type, data = case1002)
## 
## Call:
## lm(formula = log(Energy) ~ log(Mass) * Type, data = case1002)
## 
## Coefficients:
##                          (Intercept)                             log(Mass)  
##                             -1.47052                               0.80466  
##            Typenon-echolocating bats            Typenon-echolocating birds  
##                              1.26807                              -0.11032  
##  log(Mass):Typenon-echolocating bats  log(Mass):Typenon-echolocating birds  
##                             -0.21487                               0.03071

Include everything

We can include everything in the data.frame using the . symbol.

lm(log(Energy) ~ ., data = case1002)
## 
## Call:
## lm(formula = log(Energy) ~ ., data = case1002)
## 
## Coefficients:
##                (Intercept)                        Mass  
##                   0.546622                    0.003682  
##  Typenon-echolocating bats  Typenon-echolocating birds  
##                   1.026910                    1.271714

Interactions (and main effects) can be included using .^2.

lm(log(Energy) ~ .^2, data = case1002)
## 
## Call:
## lm(formula = log(Energy) ~ .^2, data = case1002)
## 
## Coefficients:
##                     (Intercept)                             Mass  
##                        -0.03342                          0.02379  
##       Typenon-echolocating bats       Typenon-echolocating birds  
##                         2.79646                          1.55466  
##  Mass:Typenon-echolocating bats  Mass:Typenon-echolocating birds  
##                        -0.02251                         -0.01898

If there are more explanatory variables, this can be used to include higher order interactions.
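
For example, with a hypothetical data frame containing three explanatory variables, .^3 would include all main effects plus all two- and three-way interactions.

d3 <- data.frame(y = rnorm(20), x1 = rnorm(20), x2 = rnorm(20), x3 = gl(2, 10)) # hypothetical data
lm(y ~ .^3, data = d3) # main effects, two-way, and three-way interactions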

Remove terms

Terms can be removed using the - symbol.

lm(log(Energy) ~ log(Mass) + Type - 1, data = case1002) # remove intercept
## 
## Call:
## lm(formula = log(Energy) ~ log(Mass) + Type - 1, data = case1002)
## 
## Coefficients:
##                  log(Mass)       Typeecholocating bats  
##                      0.815                      -1.498  
##  Typenon-echolocating bats  Typenon-echolocating birds  
##                     -1.576                      -1.474
lm(log(Energy) ~ log(Mass) * Type - Type, data = case1002)
## 
## Call:
## lm(formula = log(Energy) ~ log(Mass) * Type - Type, data = case1002)
## 
## Coefficients:
##                          (Intercept)                             log(Mass)  
##                             -1.48722                               0.81006  
##  log(Mass):Typenon-echolocating bats  log(Mass):Typenon-echolocating birds  
##                             -0.01090                               0.00803

Examples

Simple linear regression

ggplot(case0701, aes(x = Velocity, y = Distance)) +
  geom_point() + 
  geom_smooth(method = "lm", formula = y ~ x)

An alternative, more general approach utilizes the predict() function to create data for the lines.

m <- lm(Distance ~ Velocity, data = case0701)

d <- data.frame(Velocity = range(case0701$Velocity)) %>%
  mutate(prediction = predict(m, newdata = .))

ggplot(case0701, aes(x = Velocity)) +
  geom_point(aes(y = Distance)) + 
  geom_line(
    data = d,
    aes(y = prediction), color = "blue")

We can obtain the equation for the line using the summary() output.

summary(m)
## 
## Call:
## lm(formula = Distance ~ Velocity, data = case0701)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.76717 -0.23517 -0.01083  0.21081  0.91463 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.3991704  0.1186662   3.364   0.0028 ** 
## Velocity    0.0013724  0.0002278   6.024 4.61e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4056 on 22 degrees of freedom
## Multiple R-squared:  0.6226, Adjusted R-squared:  0.6054 
## F-statistic: 36.29 on 1 and 22 DF,  p-value: 4.608e-06

Take a look at the diagnostics.

resid_panel(m, plots = c("resid","index","qq","cookd"))

resid_xpanel(m)

One categorical variable

ggplot(case0501, aes(x = Diet, y = Lifetime)) + 
  geom_jitter(width = 0.1)

m <- lm(Lifetime ~ Diet, data = case0501)
resid_panel(m, plots = c("resid","index","qq","cookd"))

resid_xpanel(m)

Visual representation

d <- data.frame(Diet = unique(case0501$Diet)) 
p <- d %>% mutate(prediction = predict(m, d))

ggplot() + 
  geom_jitter(
    data = case0501,
    aes(x = Diet, y = Lifetime),
    width = 0.1) + 
  geom_point(
    data = p,
    aes(x = Diet, y = prediction),
    color = "red",
    size = 3)

Polynomial

ggplot(Sleuth3::case1001, aes(Height, Distance)) +
  geom_point() +
  theme_bw()

m1 <- lm(Distance ~ Height,                             case1001)
m2 <- lm(Distance ~ Height + I(Height^2),               case1001)
m3 <- lm(Distance ~ Height + I(Height^2) + I(Height^3), case1001)
g <- ggplot(case1001, aes(x=Height, y=Distance)) + geom_point(size=3)

g 

g + stat_smooth(method="lm", formula = y ~ x)                                   

g + stat_smooth(method="lm", formula = y ~ x + I(x^2))          

g + stat_smooth(method="lm", formula = y ~ x + I(x^2) + I(x^3)) 

Recall that the residual sum of squares will ALWAYS decrease as you add more explanatory variables. So a closer in-sample fit (reduced uncertainty in the line) does not ensure a better out-of-sample fit.
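
A quick check using the polynomial models fit above:

sapply(list(m1, m2, m3), function(mod) sum(residuals(mod)^2)) # RSS shrinks as terms are added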

m <- lm(Distance ~ Height + I(Height^2) + I(Height^3) + I(Height^4), case1001)
anova(m)
## Analysis of Variance Table
## 
## Response: Distance
##             Df Sum Sq Mean Sq    F value    Pr(>F)    
## Height       1  71351   71351 11208.1079 8.921e-05 ***
## I(Height^2)  1   4927    4927   773.9758  0.001290 ** 
## I(Height^3)  1    696     696   109.3033  0.009025 ** 
## I(Height^4)  1     36      36     5.5799  0.142011    
## Residuals    2     13       6                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This suggests that the model with a quadratic term is insufficient, while the model with a cubic term does not raise any red flags. But you should still check model assumptions.

Transformations

m <- lm(log(Energy) ~ log(Mass) + Type, data = case1002)

d <- expand.grid(
  Mass = seq(min(case1002$Mass), max(case1002$Mass), length = 1001),
  Type = unique(case1002$Type)
)

p <- d %>%
  mutate(Energy = exp(predict(m, newdata = d))) # note the exp()

g <- ggplot() + 
  geom_point(
    data = case1002, aes(x = Mass, y = Energy, color = Type, shape    = Type)) + 
  geom_line(
    data = p,        aes(x = Mass, y = Energy, color = Type, linetype = Type)) 

g

These lines will look straight (and parallel) on log-log axes.

g + 
  scale_x_log10() + 
  scale_y_log10()

Interactions

m <- lm(log(Energy) ~ log(Mass) * Type, data = case1002) # note the *

d <- expand.grid(
  Mass = seq(min(case1002$Mass), max(case1002$Mass), length = 1001),
  Type = unique(case1002$Type)
)

p <- d %>%
  mutate(Energy = exp(predict(m, newdata = d))) # note the exp()

g <- ggplot() + 
  geom_point(
    data = case1002, aes(x = Mass, y = Energy, color = Type, shape    = Type)) + 
  geom_line(
    data = p,        aes(x = Mass, y = Energy, color = Type, linetype = Type)) 

g

These lines will look straight on log-log axes, but no longer necessarily parallel, since the interaction allows a different slope for each Type.

g + 
  scale_x_log10() + 
  scale_y_log10()