library("tidyverse"); theme_set(theme_bw())
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library("Sleuth3")
library("ggResidpanel")
library("emmeans")
As a general approach, regression allows the response variable mean (or expectation) to depend on categorical and continuous explanatory variables in complex patterns.
Scientific question: How does echolocation affect energy expenditure?
d <- case1002 %>%
filter(grepl("bats", Type)) %>%
mutate(echolocating = Type == "echolocating bats")
d %>%
group_by(echolocating) %>%
summarize(
n = n(),
mean = mean(Energy),
sd = sd(Energy))
## # A tibble: 2 × 4
## echolocating n mean sd
## <lgl> <int> <dbl> <dbl>
## 1 FALSE 4 31.0 10.1
## 2 TRUE 4 3.08 3.84
t.test(Energy ~ echolocating, data = d, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: Energy by echolocating
## t = 5.1562, df = 3.84, p-value = 0.007496
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## 12.65831 43.28169
## sample estimates:
## mean in group FALSE mean in group TRUE
## 31.05 3.08
# Maybe should use logarithms
d %>%
group_by(echolocating) %>%
summarize(
n = n(),
mean = mean(log(Energy)),
sd = sd(log(Energy)))
## # A tibble: 2 × 4
## echolocating n mean sd
## <lgl> <int> <dbl> <dbl>
## 1 FALSE 4 3.40 0.323
## 2 TRUE 4 0.653 1.02
t.test(log(Energy) ~ echolocating, data = d, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: log(Energy) by echolocating
## t = 5.1122, df = 3.591, p-value = 0.009205
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## 1.184026 4.302518
## sample estimates:
## mean in group FALSE mean in group TRUE
## 3.3961200 0.6528477
case1002 %>%
group_by(Type) %>%
summarize(
n = n(),
mean = mean(log(Energy)),
sd = sd(log(Energy)))
## # A tibble: 3 × 4
## Type n mean sd
## <fct> <int> <dbl> <dbl>
## 1 echolocating bats 4 0.653 1.02
## 2 non-echolocating bats 4 3.40 0.323
## 3 non-echolocating birds 12 2.79 0.888
m <- lm(log(Energy) ~ Type, data = case1002)
anova(m)
## Analysis of Variance Table
##
## Response: log(Energy)
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 2 17.845 8.9222 12.504 0.0004576 ***
## Residuals 17 12.130 0.7135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m)
##
## Call:
## lm(formula = log(Energy) ~ Type, data = case1002)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88718 -0.39944 0.02359 0.49323 1.52531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6528 0.4224 1.546 0.140585
## Typenon-echolocating bats 2.7433 0.5973 4.593 0.000259 ***
## Typenon-echolocating birds 2.1345 0.4877 4.377 0.000411 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8447 on 17 degrees of freedom
## Multiple R-squared: 0.5953, Adjusted R-squared: 0.5477
## F-statistic: 12.5 on 2 and 17 DF, p-value: 0.0004576
Why does there seem to be a difference? Is it due to echolocation or something else, e.g. Mass?
ggplot(case1002, aes(x = Mass, y = Energy, color = Type, shape = Type)) +
geom_point() +
scale_y_log10() +
scale_x_log10()
m <- lm(log(Energy) ~ log(Mass) + Type, data = case1002)
anova(m)
## Analysis of Variance Table
##
## Response: log(Energy)
## Df Sum Sq Mean Sq F value Pr(>F)
## log(Mass) 1 29.3919 29.3919 849.9108 2.691e-15 ***
## Type 2 0.0296 0.0148 0.4276 0.6593
## Residuals 16 0.5533 0.0346
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
After we control/adjust for Mass, Type does not appear to improve the model. Thus, after controlling for Mass, echolocation does not appear to increase energy expenditure.
Of course, now we might like to know why echolocation is primarily in low mass species.
The general structure of a regression model is \[ Y_i \stackrel{ind}{\sim} N(\beta_0 + \beta_1X_{i,1} + \cdots + \beta_{p-1} X_{i,p-1}, \sigma^2) \] for \(i=1,\ldots,n\) or, equivalently, \[ Y_i = \beta_0 + \beta_1X_{i,1} + \cdots + \beta_{p-1} X_{i,p-1}, \qquad \epsilon_i \stackrel{ind}{\sim} N(0, \sigma^2). \]
where, for observation \(i\),
You have a lot of flexibility in your choice for the response and explanatory variables.
An alternative way to represent the model uses matrix notation and the multivariate normal distribution. \[ Y = X\beta + \epsilon, \qquad \epsilon \sim N_n(0,\sigma^2 \mathrm{I}) \] where
If you are interested in learning more about the multivariate normal distribution and its uses, look for a course in multivariate data analyses, e.g. STAT 475.
Use of matrix notation and linear algebra eases parameter estimation:
m <- lm(log(Energy) ~ log(Mass) * Type, data = case1002)
s <- summary(m)
# Coefficient estimates and confidence/credible intervals
coef(m) # or m$coefficients
## (Intercept) log(Mass)
## -1.47051526 0.80465705
## Typenon-echolocating bats Typenon-echolocating birds
## 1.26806769 -0.11032250
## log(Mass):Typenon-echolocating bats log(Mass):Typenon-echolocating birds
## -0.21487499 0.03071328
confint(m)
## 2.5 % 97.5 %
## (Intercept) -2.0017153 -0.9393152
## log(Mass) 0.6187372 0.9905769
## Typenon-echolocating bats -1.4888841 4.0250195
## Typenon-echolocating birds -0.9355124 0.7148674
## log(Mass):Typenon-echolocating bats -0.6944979 0.2647479
## log(Mass):Typenon-echolocating birds -0.1898417 0.2512682
# SD estimate
s$sigma
## [1] 0.1899
# Coefficint of determination
s$r.squared
## [1] 0.9831569
# Fitted and residuals
head(fitted(m)) # or head(m$fitted.values)
## 1 2 3 4 5 6
## 3.724328 3.597247 3.072588 3.190317 1.084392 1.389195
head(residuals(m)) # or head(m$residuals)
## 1 2 3 4 5 6
## 0.05302022 -0.04762963 0.07586501 -0.08125559 -0.18423016 -0.02055509
# A reasonable display of most of this
summary(m)
##
## Call:
## lm(formula = log(Energy) ~ log(Mass) * Type, data = case1002)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25152 -0.12643 -0.00954 0.08124 0.32840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.47052 0.24767 -5.937 3.63e-05 ***
## log(Mass) 0.80466 0.08668 9.283 2.33e-07 ***
## Typenon-echolocating bats 1.26807 1.28542 0.987 0.341
## Typenon-echolocating birds -0.11032 0.38474 -0.287 0.779
## log(Mass):Typenon-echolocating bats -0.21487 0.22362 -0.961 0.353
## log(Mass):Typenon-echolocating birds 0.03071 0.10283 0.299 0.770
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1899 on 14 degrees of freedom
## Multiple R-squared: 0.9832, Adjusted R-squared: 0.9771
## F-statistic: 163.4 on 5 and 14 DF, p-value: 6.696e-12
The assumptions in every regression model are
and
We generally use graphical techniques to assess these assumptions. In particular, we utilize
opar = par(mfrow = c(2,3))
plot(m, 1:6, ask = FALSE)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
par(opar)
d <- case1002 %>%
mutate(residuals = residuals(m))
plot(d$residuals) # residuals v index
plot(residuals ~ Type, data = d, type = "p")
plot(residuals ~ log(Mass), data = d)
This package is created by ISU Statistics PhD Alumni and utilizes ggplot2.
resid_panel(m, plots = "all")
resid_panel(m, plots = "all",
qqbands = TRUE, smoother = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
resid_panel(m, plots = c("resid","index","qq","cookd"),
qqbands = TRUE, smoother = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
resid_xpanel(m, smoother = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
In a linear regression model there are \(p\) coefficients \(\beta_0,\ldots,\beta_{p-1}\) and one variance \(\sigma^2\). Generally, we are most interested in understanding the effect of explanatory variables by interpreting (a transformation of) the coefficients.
The general interpretation is
a one unit increase in the explanatory variable is associated with a $\beta$
increase in the mean of the response when holding all other variables constant
There are several issues with this interpretation
In this class, we will focus on graphical interpretations.
If a regression coefficient is exactly zero, this means that explanatory variable has no effect on the mean of the response. Thus constructing a hypothesis test with the null value being zero is reasonable. If the p-value for this test is small, it only tells us there is something wrong with the model that includes this coefficient being zero. But this small p-value could arise due to any (or multiple) model assumptions. Thus, the analyst needs to evaluate all model assumptions.
Recall that each regression coefficient can be tested using the following t statistic \[ T = \frac{\hat\beta - b_0}{SE(\hat\beta)} \] where \(b_0\) is the hypothesized value. Then this statistic is compared to a t-distribution with \(n-p\) degrees of freedom.
coef(s) # or s$coefficients
## Estimate Std. Error t value
## (Intercept) -1.47051526 0.24767033 -5.9373897
## log(Mass) 0.80465705 0.08668453 9.2825915
## Typenon-echolocating bats 1.26806769 1.28542004 0.9865006
## Typenon-echolocating birds -0.11032250 0.38474216 -0.2867440
## log(Mass):Typenon-echolocating bats -0.21487499 0.22362264 -0.9608821
## log(Mass):Typenon-echolocating birds 0.03071328 0.10283304 0.2986713
## Pr(>|t|)
## (Intercept) 3.625681e-05
## log(Mass) 2.329921e-07
## Typenon-echolocating bats 3.406309e-01
## Typenon-echolocating birds 7.785083e-01
## log(Mass):Typenon-echolocating bats 3.529143e-01
## log(Mass):Typenon-echolocating birds 7.695781e-01
This table provides \(\hat\beta\), \(SE(\hat\beta)\), the t statistic with $b_0, and the two-sided p-value.
Two models are nested when setting some parameters in one of the models to specific values (usually 0) produces the other model. The second model is called the simpler model.
When we would like to compare two nested models we can use an F-test. The null model for this comparison is the simpler model. A small p-value provides evidence against the simpler model.
In the standard regression output, R provides an F-test to compare the intercept only model to the full model.
summary(m)
##
## Call:
## lm(formula = log(Energy) ~ log(Mass) * Type, data = case1002)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25152 -0.12643 -0.00954 0.08124 0.32840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.47052 0.24767 -5.937 3.63e-05 ***
## log(Mass) 0.80466 0.08668 9.283 2.33e-07 ***
## Typenon-echolocating bats 1.26807 1.28542 0.987 0.341
## Typenon-echolocating birds -0.11032 0.38474 -0.287 0.779
## log(Mass):Typenon-echolocating bats -0.21487 0.22362 -0.961 0.353
## log(Mass):Typenon-echolocating birds 0.03071 0.10283 0.299 0.770
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1899 on 14 degrees of freedom
## Multiple R-squared: 0.9832, Adjusted R-squared: 0.9771
## F-statistic: 163.4 on 5 and 14 DF, p-value: 6.696e-12
We can recreate this F-test using the ANOVA function.
m <- lm(log(Energy) ~ log(Mass) * Type, data = case1002)
m0 <- lm(log(Energy) ~ 1, data = case1002)
anova(m0, m)
## Analysis of Variance Table
##
## Model 1: log(Energy) ~ 1
## Model 2: log(Energy) ~ log(Mass) * Type
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 19 29.9748
## 2 14 0.5049 5 29.47 163.44 6.696e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
But, generally this F-test is not very interesting. It may help us to consider adding terms to the model sequentially.
anova(m)
## Analysis of Variance Table
##
## Response: log(Energy)
## Df Sum Sq Mean Sq F value Pr(>F)
## log(Mass) 1 29.3919 29.3919 815.0382 8.265e-14 ***
## Type 2 0.0296 0.0148 0.4100 0.6713
## log(Mass):Type 2 0.0484 0.0242 0.6718 0.5265
## Residuals 14 0.5049 0.0361
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This is a sequential comparison, i.e.
If we just want to look at the highest order terms, we can use
drop()
.
drop1(m, test = "F")
## Single term deletions
##
## Model:
## log(Energy) ~ log(Mass) * Type
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 0.50487 -61.584
## log(Mass):Type 2 0.04845 0.55332 -63.751 0.6718 0.5265
Contrasts are linear combinations of means who coefficients sum to zero. They are useful in comparing the means of groups of groups to each other. Extensions of these ideas can be used to compare group means at fixed values of other explanatory variables.
First, imagine that we wanted to perform all pairwise comparisons.
m <- lm(log(Energy) ~ Type, data = case1002)
e <- emmeans(m, pairwise~Type)
e
## $emmeans
## Type emmean SE df lower.CL upper.CL
## echolocating bats 0.653 0.422 17 -0.238 1.54
## non-echolocating bats 3.396 0.422 17 2.505 4.29
## non-echolocating birds 2.787 0.244 17 2.273 3.30
##
## Results are given on the log (not the response) scale.
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio
## echolocating bats - (non-echolocating bats) -2.743 0.597 17 -4.593
## echolocating bats - (non-echolocating birds) -2.134 0.488 17 -4.377
## (non-echolocating bats) - (non-echolocating birds) 0.609 0.488 17 1.248
## p.value
## 0.0007
## 0.0011
## 0.4423
##
## Results are given on the log (not the response) scale.
## P value adjustment: tukey method for comparing a family of 3 estimates
Suppose we wanted to compare echolocating bats vs (non-echolocating bats AND non-echolocating birds). One way to pose this question is to compare the mean of the echolocating bats to the average of the means for the non-echolocating bats and non-echolocating birds.
contrast(e, list("echo v non-echo" = c(-1,.5,.5)))
## I bet you wanted to call this with just object[[1]] - use '[[]]' or which' if I'm wrong.
## See '? emm_list' for more information
## contrast estimate SE df t.ratio p.value
## echo v non-echo 2.44 0.488 17 5.001 0.0001
##
## Results are given on the log (not the response) scale.
Suppose instead you wanted to consider the model that has a different slope for each type and you wanted to compare the slopes amongst the different types.
m <- lm(log(Energy) ~ log(Mass)*Type, data = case1002)
emtrends(m, pairwise ~ Type, var = "log(Mass)")
## $emtrends
## Type log(Mass).trend SE df lower.CL upper.CL
## echolocating bats 0.805 0.0867 14 0.619 0.991
## non-echolocating bats 0.590 0.2061 14 0.148 1.032
## non-echolocating birds 0.835 0.0553 14 0.717 0.954
##
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio
## echolocating bats - (non-echolocating bats) 0.2149 0.224 14 0.961
## echolocating bats - (non-echolocating birds) -0.0307 0.103 14 -0.299
## (non-echolocating bats) - (non-echolocating birds) -0.2456 0.213 14 -1.151
## p.value
## 0.6124
## 0.9522
## 0.5003
##
## P value adjustment: tukey method for comparing a family of 3 estimates
Differences between groups at a particular value of a continuous variable
ggplot(case1002, aes(x = Mass, y = Energy,
color = Type, linetype = Type, shape = Type)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10() +
geom_vline(xintercept = 400)
## `geom_smooth()` using formula = 'y ~ x'
m <- lm(log(Energy) ~ log(Mass)*Type, data = case1002)
em <- emmeans(m, pairwise ~ Type, at = list(Mass = 400))
## NOTE: Results may be misleading due to involvement in interactions
em
## $emmeans
## Type emmean SE df lower.CL upper.CL
## echolocating bats 3.35 0.3057 14 2.69 4.01
## non-echolocating bats 3.33 0.0976 14 3.12 3.54
## non-echolocating birds 3.42 0.0692 14 3.28 3.57
##
## Results are given on the log (not the response) scale.
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio
## echolocating bats - (non-echolocating bats) 0.0193 0.321 14 0.060
## echolocating bats - (non-echolocating birds) -0.0737 0.313 14 -0.235
## (non-echolocating bats) - (non-echolocating birds) -0.0930 0.120 14 -0.778
## p.value
## 0.9980
## 0.9701
## 0.7223
##
## Results are given on the log (not the response) scale.
## P value adjustment: tukey method for comparing a family of 3 estimates
co <- contrast(em, type = "response")
## I bet you wanted to call this with just object[[1]] - use '[[]]' or which' if I'm wrong.
## See '? emm_list' for more information
confint(co)
## contrast ratio SE df lower.CL upper.CL
## echolocating bats effect 0.982 0.204 14 0.558 1.73
## (non-echolocating bats) effect 0.963 0.119 14 0.689 1.35
## (non-echolocating birds) effect 1.057 0.123 14 0.770 1.45
##
## Confidence level used: 0.95
## Conf-level adjustment: bonferroni method for 3 estimates
## Intervals are back-transformed from the log scale
The tools provided in the emmeans
package are
tremendously helpful to real-world statistical analysis. They allow you
to fit a single model and ask many scientific questions.
There are various ways to construct regression models in R.
If we want to include multiple explanatory variables, we simply use
the +
symbol.
lm(Energy ~ Mass + Type, data = case1002)
##
## Call:
## lm(formula = Energy ~ Mass + Type, data = case1002)
##
## Coefficients:
## (Intercept) Mass
## 1.4213 0.0575
## Typenon-echolocating bats Typenon-echolocating birds
## 1.1685 4.6007
Transformations of the response and explanatory variables can generally be included directly in the call.
lm(log(Energy) ~ sqrt(Mass) + Type, data = case1002)
##
## Call:
## lm(formula = log(Energy) ~ sqrt(Mass) + Type, data = case1002)
##
## Coefficients:
## (Intercept) sqrt(Mass)
## 0.05805 0.13340
## Typenon-echolocating bats Typenon-echolocating birds
## 0.44399 0.70902
The most common transformation is the log()
.
If we want to include a polynomial term, e.g. quadratic, we need to
encapsulate it in I()
.
lm(log(Energy) ~ Mass + I(Mass^2) + Type, data = case1002)
##
## Call:
## lm(formula = log(Energy) ~ Mass + I(Mass^2) + Type, data = case1002)
##
## Coefficients:
## (Intercept) Mass
## 4.006e-01 9.380e-03
## I(Mass^2) Typenon-echolocating bats
## -8.335e-06 7.842e-01
## Typenon-echolocating birds
## 7.039e-01
This can also be used to “Shift the intercept” to provide an interpretable intercept.
meanMass <- mean(case1002$Mass)
lm(Energy ~ I(Mass - meanMass), data = case1002)
##
## Call:
## lm(formula = Energy ~ I(Mass - meanMass), data = case1002)
##
## Coefficients:
## (Intercept) I(Mass - meanMass)
## 19.5180 0.0587
Interactions between variables can be included by using
:
.
lm(log(Energy) ~ log(Mass) + Type + log(Mass):Type, data = case1002)
##
## Call:
## lm(formula = log(Energy) ~ log(Mass) + Type + log(Mass):Type,
## data = case1002)
##
## Coefficients:
## (Intercept) log(Mass)
## -1.47052 0.80466
## Typenon-echolocating bats Typenon-echolocating birds
## 1.26807 -0.11032
## log(Mass):Typenon-echolocating bats log(Mass):Typenon-echolocating birds
## -0.21487 0.03071
But a convenient shorthand is to use *
which will
include the main effects as well as the interaction.
lm(log(Energy) ~ log(Mass) * Type, data = case1002)
##
## Call:
## lm(formula = log(Energy) ~ log(Mass) * Type, data = case1002)
##
## Coefficients:
## (Intercept) log(Mass)
## -1.47052 0.80466
## Typenon-echolocating bats Typenon-echolocating birds
## 1.26807 -0.11032
## log(Mass):Typenon-echolocating bats log(Mass):Typenon-echolocating birds
## -0.21487 0.03071
We can include everything in the data.frame using the
.
.
lm(log(Energy) ~ ., data = case1002)
##
## Call:
## lm(formula = log(Energy) ~ ., data = case1002)
##
## Coefficients:
## (Intercept) Mass
## 0.546622 0.003682
## Typenon-echolocating bats Typenon-echolocating birds
## 1.026910 1.271714
Interactions (and main effects) can be included using
.^2
.
lm(log(Energy) ~ .^2, data = case1002)
##
## Call:
## lm(formula = log(Energy) ~ .^2, data = case1002)
##
## Coefficients:
## (Intercept) Mass
## -0.03342 0.02379
## Typenon-echolocating bats Typenon-echolocating birds
## 2.79646 1.55466
## Mass:Typenon-echolocating bats Mass:Typenon-echolocating birds
## -0.02251 -0.01898
If there are more explanatory variables, this can be used to include higher order interactions.
lm(log(Energy) ~ log(Mass) + Type - 1, data = case1002) # remove intercept
##
## Call:
## lm(formula = log(Energy) ~ log(Mass) + Type - 1, data = case1002)
##
## Coefficients:
## log(Mass) Typeecholocating bats
## 0.815 -1.498
## Typenon-echolocating bats Typenon-echolocating birds
## -1.576 -1.474
lm(log(Energy) ~ log(Mass) * Type - Type, data = case1002)
##
## Call:
## lm(formula = log(Energy) ~ log(Mass) * Type - Type, data = case1002)
##
## Coefficients:
## (Intercept) log(Mass)
## -1.48722 0.81006
## log(Mass):Typenon-echolocating bats log(Mass):Typenon-echolocating birds
## -0.01090 0.00803
ggplot(case0701, aes(x = Velocity, y = Distance)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x)
An alternative, more general approach, utilizes the
predict()
function to create data for the lines.
m <- lm(Distance ~ Velocity, data = case0701)
d <- data.frame(Velocity = range(case0701$Velocity)) %>%
mutate(prediction = predict(m, newdata = .))
ggplot(case0701, aes(x = Velocity)) +
geom_point(aes(y = Distance)) +
geom_line(
data = d,
aes(y = prediction), color = "blue")
We can obtain the equation for the line using the
summary()
output.
summary(m)
##
## Call:
## lm(formula = Distance ~ Velocity, data = case0701)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.76717 -0.23517 -0.01083 0.21081 0.91463
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3991704 0.1186662 3.364 0.0028 **
## Velocity 0.0013724 0.0002278 6.024 4.61e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4056 on 22 degrees of freedom
## Multiple R-squared: 0.6226, Adjusted R-squared: 0.6054
## F-statistic: 36.29 on 1 and 22 DF, p-value: 4.608e-06
Take a look at diagnostics
resid_panel(m, plots = c("resid","index","qq","cookd"))
resid_xpanel(m)
ggplot(case0501, aes(x = Diet, y = Lifetime)) +
geom_jitter(width = 0.1)
m <- lm(Lifetime ~ Diet, data = case0501)
resid_panel(m, plots = c("resid","index","qq","cookd"))
resid_xpanel(m)
Visual representation
d <- data.frame(Diet = unique(case0501$Diet))
p <- d %>% mutate(prediction = predict(m, d))
ggplot() +
geom_jitter(
data = case0501,
aes(x = Diet, y = Lifetime),
width = 0.1) +
geom_point(
data = p,
aes(x = Diet, y = prediction),
color = "red",
size = 3)
ggplot(Sleuth3::case1001, aes(Height, Distance)) +
geom_point() +
theme_bw()
m1 <- lm(Distance ~ Height, case1001)
m2 <- lm(Distance ~ Height + I(Height^2), case1001)
m3 <- lm(Distance ~ Height + I(Height^2) + I(Height^3), case1001)
g <- ggplot(case1001, aes(x=Height, y=Distance)) + geom_point(size=3)
g
g + stat_smooth(method="lm", formula = y ~ x)
g + stat_smooth(method="lm", formula = y ~ x + I(x^2))
g + stat_smooth(method="lm", formula = y ~ x + I(x^2) + I(x^3))
Recall that error variance will ALWAYS decrease as you add more explanatory variables. So reduced uncertainty in the line does not ensure a better out-of-sample fit.
m <- lm(Distance ~ Height + I(Height^2) + I(Height^3) + I(Height^4), case1001)
anova(m)
## Analysis of Variance Table
##
## Response: Distance
## Df Sum Sq Mean Sq F value Pr(>F)
## Height 1 71351 71351 11208.1079 8.921e-05 ***
## I(Height^2) 1 4927 4927 773.9758 0.001290 **
## I(Height^3) 1 696 696 109.3033 0.009025 **
## I(Height^4) 1 36 36 5.5799 0.142011
## Residuals 2 13 6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This suggests that that model with a quadratic term is insufficient while the model with a cubic does not raise any red flags. But, you should still check model assumptions.
m <- lm(log(Energy) ~ log(Mass) + Type, data = case1002)
d <- expand.grid(
Mass = seq(min(case1002$Mass), max(case1002$Mass), length = 1001),
Type = unique(case1002$Type)
)
p <- d %>%
mutate(Energy = exp(predict(m, newdata = d))) # note the exp()
g <- ggplot() +
geom_point(
data = case1002, aes(x = Mass, y = Energy, color = Type, shape = Type)) +
geom_line(
data = p, aes(x = Mass, y = Energy, color = Type, linetype = Type))
g
These lines will look straight (and parallel) on log-log axes
g +
scale_x_log10() +
scale_y_log10()
m <- lm(log(Energy) ~ log(Mass) * Type, data = case1002) # note the *
d <- expand.grid(
Mass = seq(min(case1002$Mass), max(case1002$Mass), length = 1001),
Type = unique(case1002$Type)
)
p <- d %>%
mutate(Energy = exp(predict(m, newdata = d))) # note the exp()
g <- ggplot() +
geom_point(
data = case1002, aes(x = Mass, y = Energy, color = Type, shape = Type)) +
geom_line(
data = p, aes(x = Mass, y = Energy, color = Type, linetype = Type))
g
These lines will look straight (and parallel) on log-log axes
g +
scale_x_log10() +
scale_y_log10()