Introduction

# Toy data: ten x values, and a response that steps from 1 to 2 halfway.
x <- seq_len(10)
y <- c(rep(1, 5), rep(2, 5))

# Fit a simple linear regression and keep its summary for inspection.
m <- lm(y ~ x)
s <- summary(m)

Now look at the result of each line:

x
##  [1]  1  2  3  4  5  6  7  8  9 10
y
##  [1] 1 1 1 1 1 2 2 2 2 2
m
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      0.6667       0.1515
s
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4242 -0.1667  0.0000  0.1667  0.4242 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   0.6667     0.1880   3.546  0.00756 **
## x             0.1515     0.0303   5.000  0.00105 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2752 on 8 degrees of freedom
## Multiple R-squared:  0.7576, Adjusted R-squared:  0.7273 
## F-statistic:    25 on 1 and 8 DF,  p-value: 0.001053
s$r.squared
## [1] 0.7575758

If you have used linear regression before, this output should look familiar.

Calculator

Calculate the probability that an individual has the disease given a positive test result, when the test and the disease have the following characteristics:

# Test characteristics and disease prevalence.
specificity <- 0.95
sensitivity <- 0.99
prevalence <- 0.001

# Bayes' rule: P(disease | positive) =
#   P(positive, disease) / (P(positive, disease) + P(positive, no disease))
true_positive <- sensitivity * prevalence
false_positive <- (1 - specificity) * (1 - prevalence)
probability <- true_positive / (true_positive + false_positive)
probability
## [1] 0.01943463

Yes, it is only about 2%!

Read csv file

Read in the fluTrends.csv file.

# Load the flu trends data with read.csv's defaults and list its columns.
fluTrends <- read.csv(file = "fluTrends.csv")
names(fluTrends)
##  [1] "Date"                 "Alaska"               "Alabama"             
##  [4] "Arkansas"             "Arizona"              "California"          
##  [7] "Colorado"             "Connecticut"          "District.of.Columbia"
## [10] "Delaware"             "Florida"              "Georgia"             
## [13] "Hawaii"               "Iowa"                 "Idaho"               
## [16] "Illinois"             "Indiana"              "Kansas"              
## [19] "Kentucky"             "Louisiana"            "Massachusetts"       
## [22] "Maryland"             "Maine"                "Michigan"            
## [25] "Minnesota"            "Missouri"             "Mississippi"         
## [28] "Montana"              "North.Carolina"       "North.Dakota"        
## [31] "Nebraska"             "New.Hampshire"        "New.Jersey"          
## [34] "New.Mexico"           "Nevada"               "New.York"            
## [37] "Ohio"                 "Oklahoma"             "Oregon"              
## [40] "Pennsylvania"         "Rhode.Island"         "South.Carolina"      
## [43] "South.Dakota"         "Tennessee"            "Texas"               
## [46] "Utah"                 "Virginia"             "Vermont"             
## [49] "Washington"           "Wisconsin"            "West.Virginia"       
## [52] "Wyoming"
# Pass check.names = FALSE to keep the header text verbatim
# (spaces in column names are not replaced by dots).
fluTrends <- read.csv(file = "fluTrends.csv", check.names = FALSE)
names(fluTrends)
##  [1] "Date"                 "Alaska"               "Alabama"             
##  [4] "Arkansas"             "Arizona"              "California"          
##  [7] "Colorado"             "Connecticut"          "District of Columbia"
## [10] "Delaware"             "Florida"              "Georgia"             
## [13] "Hawaii"               "Iowa"                 "Idaho"               
## [16] "Illinois"             "Indiana"              "Kansas"              
## [19] "Kentucky"             "Louisiana"            "Massachusetts"       
## [22] "Maryland"             "Maine"                "Michigan"            
## [25] "Minnesota"            "Missouri"             "Mississippi"         
## [28] "Montana"              "North Carolina"       "North Dakota"        
## [31] "Nebraska"             "New Hampshire"        "New Jersey"          
## [34] "New Mexico"           "Nevada"               "New York"            
## [37] "Ohio"                 "Oklahoma"             "Oregon"              
## [40] "Pennsylvania"         "Rhode Island"         "South Carolina"      
## [43] "South Dakota"         "Tennessee"            "Texas"               
## [46] "Utah"                 "Virginia"             "Vermont"             
## [49] "Washington"           "Wisconsin"            "West Virginia"       
## [52] "Wyoming"
# Unfortunately, names that contain spaces won't work with the plain
# fluTrends$colname syntax, but you can wrap the name in back-ticks.
# NOTE(review): "United States" is not among the 52 column names printed
# above, so this `$` lookup yields NULL and summary() reports
# Length 0 / Class NULL / Mode NULL instead of a numeric summary.
# Use a column that exists (e.g. `New York`) to see real statistics.
summary(fluTrends$`United States`)
## Length  Class   Mode 
##      0   NULL   NULL

Descriptive statistics

# Age extremes and central tendency for the records in zipcode 20032.
GI_20032 <- filter(GI, zipcode == 20032)

min(GI_20032$age)
## [1] 0
max(GI_20032$age)
## [1] 93
mean(GI_20032$age)
## [1] 28.47843
median(GI_20032$age)
## [1] 26.5

Alternatively

summary(GI_20032$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   26.50   28.48   41.00   93.00

Graphical statistics

Construct a histogram and boxplot for age at facility 37.

# Keep only the records from facility 37; both plots below use this subset.
GI_37 <- filter(GI, facility == 37)

# Histogram of age at facility 37.
hist(GI_37$age)

# Boxplot of age at facility 37.
boxplot(GI_37$age)

Construct a bar chart for the zipcode at facility 37.

# Bar chart of how many facility 37 records fall in each zipcode.
barplot(
  table(GI_37$zipcode)
)

Perhaps this plot isn’t so useful. It may be better to use just the first three digits of the zipcode.

# Bar chart of the first three zipcode digits at facility 37.
# Dividing by 100 and truncating drops the last two digits.
barplot(
  table(
    trunc(GI_37$zipcode / 100)
  )
)