# Toy data: x is the sequence 1..10 and y is a two-level step
# (five 1s followed by five 2s).
x <- 1:10
y <- rep(c(1, 2), each = 5)
# Fit a simple linear regression of y on x and keep its summary object.
m <- lm(y ~ x)
s <- summary(m)
Now, look at the result of each line:
# Evaluating an object's name at top level prints it; the lines starting
# with ## reproduce the printed output.
# x: the integer sequence 1 to 10.
x
## [1] 1 2 3 4 5 6 7 8 9 10
# y: five 1s followed by five 2s.
y
## [1] 1 1 1 1 1 2 2 2 2 2
# m: the fitted lm object prints its call and the estimated coefficients.
m
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 0.6667 0.1515
# s: the summary adds residual quantiles, standard errors, t tests,
# and overall fit statistics.
s
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4242 -0.1667 0.0000 0.1667 0.4242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6667 0.1880 3.546 0.00756 **
## x 0.1515 0.0303 5.000 0.00105 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2752 on 8 degrees of freedom
## Multiple R-squared: 0.7576, Adjusted R-squared: 0.7273
## F-statistic: 25 on 1 and 8 DF, p-value: 0.001053
# Individual components of the summary can be extracted with $,
# here the multiple R-squared at full precision.
s$r.squared
## [1] 0.7575758
For those who are familiar with linear regression, this may look familiar.
Calculate the probability that the individual has the disease, given that the test is positive, when:
# Bayes' rule: probability of disease given a positive test, computed from
# the three quantities below.
specificity <- 0.95
sensitivity <- 0.99
prevalence <- 0.001

# Numerator: positive test AND disease.
true_positive <- sensitivity * prevalence
# Positive test AND no disease.
false_positive <- (1 - specificity) * (1 - prevalence)

probability <- true_positive / (true_positive + false_positive)
probability
## [1] 0.01943463
Yes, it is only about 2%!
Read in the fluTrends.csv file.
# Read in the csv file.
# With the default check.names = TRUE, read.csv() converts column names to
# syntactically valid R names, which is why multi-word names appear with
# dots below (e.g. "District.of.Columbia").
fluTrends <- read.csv("fluTrends.csv")
names(fluTrends)
## [1] "Date" "Alaska" "Alabama"
## [4] "Arkansas" "Arizona" "California"
## [7] "Colorado" "Connecticut" "District.of.Columbia"
## [10] "Delaware" "Florida" "Georgia"
## [13] "Hawaii" "Iowa" "Idaho"
## [16] "Illinois" "Indiana" "Kansas"
## [19] "Kentucky" "Louisiana" "Massachusetts"
## [22] "Maryland" "Maine" "Michigan"
## [25] "Minnesota" "Missouri" "Mississippi"
## [28] "Montana" "North.Carolina" "North.Dakota"
## [31] "Nebraska" "New.Hampshire" "New.Jersey"
## [34] "New.Mexico" "Nevada" "New.York"
## [37] "Ohio" "Oklahoma" "Oregon"
## [40] "Pennsylvania" "Rhode.Island" "South.Carolina"
## [43] "South.Dakota" "Tennessee" "Texas"
## [46] "Utah" "Virginia" "Vermont"
## [49] "Washington" "Wisconsin" "West.Virginia"
## [52] "Wyoming"
# To maintain pretty column names, use check.names = FALSE so the header
# strings are kept exactly as they appear in the file (spaces included).
fluTrends <- read.csv("fluTrends.csv", check.names = FALSE)
names(fluTrends)
## [1] "Date" "Alaska" "Alabama"
## [4] "Arkansas" "Arizona" "California"
## [7] "Colorado" "Connecticut" "District of Columbia"
## [10] "Delaware" "Florida" "Georgia"
## [13] "Hawaii" "Iowa" "Idaho"
## [16] "Illinois" "Indiana" "Kansas"
## [19] "Kentucky" "Louisiana" "Massachusetts"
## [22] "Maryland" "Maine" "Michigan"
## [25] "Minnesota" "Missouri" "Mississippi"
## [28] "Montana" "North Carolina" "North Dakota"
## [31] "Nebraska" "New Hampshire" "New Jersey"
## [34] "New Mexico" "Nevada" "New York"
## [37] "Ohio" "Oklahoma" "Oregon"
## [40] "Pennsylvania" "Rhode Island" "South Carolina"
## [43] "South Dakota" "Tennessee" "Texas"
## [46] "Utah" "Virginia" "Vermont"
## [49] "Washington" "Wisconsin" "West Virginia"
## [52] "Wyoming"
# unfortunately these names won't work with the
# fluTrends$colname syntax, but you can use back-ticks
# NOTE(review): the names(fluTrends) listing contains no "United States"
# column, so the extraction below yields NULL and summary() reports
# Length 0 instead of a numeric summary. To see the back-tick syntax on
# real data, use a name that is present, e.g. fluTrends$`New York`.
summary(fluTrends$`United States`)
## Length Class Mode
## 0 NULL NULL
# Age statistics (minimum, maximum, mean, median) for the rows of GI
# whose zipcode is 20032.
GI_20032 <- GI %>%
  filter(zipcode == 20032)
min(GI_20032$age)
## [1] 0
max(GI_20032$age)
## [1] 93
mean(GI_20032$age)
## [1] 28.47843
median(GI_20032$age)
## [1] 26.5
Alternatively, get all of these at once with summary():
# A single summary() call reports the minimum, quartiles, median, mean,
# and maximum of age for the zipcode-20032 subset.
summary(GI_20032$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 26.50 28.48 41.00 93.00
Construct a histogram and boxplot for age at facility 37.
# Keep only the rows of GI recorded at facility 37, then draw a
# histogram of their ages.
GI_37 <- GI %>%
  filter(facility == 37)
hist(GI_37$age)
# The same ages, shown as a boxplot.
boxplot(GI_37$age)
Construct a bar chart for the zipcode at facility 37.
# Bar chart of how many facility-37 rows fall in each zipcode:
# tabulate the zipcodes first, then plot the counts.
zipcode_counts <- table(GI_37$zipcode)
barplot(zipcode_counts)
Perhaps this plot isn’t so useful. Maybe it would be better to just use the first 3 zipcode digits.
# Bar chart of facility-37 row counts grouped by the leading digits of the
# zipcode: integer division by 100 drops the last two digits.
# (Assumes zipcode is stored as a non-negative five-digit number; confirm.)
barplot(table(GI_37$zipcode %/% 100))