Intermediate R

If you shut down R, you will need to change your working directory and run the following commands to load libraries,

library("tidyverse")
library("MWBDSSworkshop")

Read in the data files and create additional variables

# Read csv files and create additional variables
icd9df = read.csv("icd9.csv")
GI     = read.csv("GI.csv") %>%
  mutate(
    date      = as.Date(date),
    weekC     = cut(date, breaks="weeks"),
    week      = as.numeric(weekC),
    weekD     = as.Date(weekC),
    facility  = as.factor(facility),
    icd9class = factor(cut(icd9, 
                           breaks = icd9df$code_cutpoint, 
                           labels = icd9df$classification[-nrow(icd9df)], 
                           right  = TRUE)),
    ageC      = cut(age, 
                    breaks = c(-Inf, 5, 18, 45 ,60, Inf)),
    zip3      = trunc(zipcode/100))

Exporting tables

There are many ways to export tables (for use in Word). Here I will cover two basic approachess that are simple and work well.

cut-and-paste
create HTML table and cut-and-paste to Word

There are more sophisticated approaches that I will not cover:

Cut-and-paste

You can cut-and-paste directly from R.

ga_l <- GI %>%
  group_by(gender, ageC) %>%
  summarize(count = n())

ga_w <- ga_l %>%
  spread(ageC, count) %>%
  print(row.names = FALSE)

## # A tibble: 2 x 6
## # Groups:   gender [2]
##   gender `(-Inf,5]` `(5,18]` `(18,45]` `(45,60]` `(60, Inf]`
##   <fct>       <int>    <int>     <int>     <int>       <int>
## 1 Female       2161     1942      3710      1590        1250
## 2 Male         2163     1860      3766      1543        1259

Cut-and-pasting this table is done in ASCII format. This looks good in a plain text document, e.g. Notepad, TextEdit, and some email, but will not look good in other formats, e.g. docx.

Create HTML table

library('xtable')
tab = xtable(ga_w,
             caption = "Total GI cases by Sex and Age Category",
             label   = "myHTMLanchor",
             align   = "ll|rrrrr") # rownames gets a column

Save the table to a file

print(tab, file="table.html", type="html", include.rownames=FALSE)

Output for this HTML table

The HTML code looks like

print(tab, type="html", include.rownames=FALSE)

## <!-- html table generated in R 3.5.2 by xtable 1.8-3 package -->
## <!-- Mon May 20 17:14:00 2019 -->
## <table border=1>
## <caption align="bottom"> Total GI cases by Sex and Age Category </caption>
## <tr> <th> gender </th> <th> (-Inf,5] </th> <th> (5,18] </th> <th> (18,45] </th> <th> (45,60] </th> <th> (60, Inf] </th>  </tr>
##   <tr> <td> Female </td> <td align="right"> 2161 </td> <td align="right"> 1942 </td> <td align="right"> 3710 </td> <td align="right"> 1590 </td> <td align="right"> 1250 </td> </tr>
##   <tr> <td> Male </td> <td align="right"> 2163 </td> <td align="right"> 1860 </td> <td align="right"> 3766 </td> <td align="right"> 1543 </td> <td align="right"> 1259 </td> </tr>
##    <a name=myHTMLanchor></a>
## </table>

and the resulting table looks like

print(tab, type="html", include.rownames=FALSE)

Total GI cases by Sex and Age Category
gender	(-Inf,5]	(5,18]	(18,45]	(45,60]	(60, Inf]
Female	2161	1942	3710	1590	1250
Male	2163	1860	3766	1543	1259

Copy-and-paste table to Word

Now you can

Open the file (table.html)
Copy-and-paste this table to Word.

Activity - exporting tables

Create a Word table for the number of cases by facility and age category.

# Summarize data by facility and age category

# Reshape data from long to wide format

# Create HTML table

# Save HTML to file

# Copy-and-paste table into Word

When you have completed the activity, compare your results to the solutions.

Maps

The packages ggplot2 and maps can be used together to make maps.

Map of the continental US

library('maps')
states = map_data("state")
ggplot(states, aes(x = long, y = lat, group = group)) + 
  geom_polygon(color="white")

Map of the counties in the continental US

counties = map_data("county")
ggplot(counties, aes(x = long, y = lat, group = group)) + 
  geom_polygon(color = "white")

Map of the counties in Iowa

ggplot(counties %>% filter(region=="iowa"), 
       aes(x = long, y = lat, group = group)) + 
  geom_polygon(fill = "gray", color = "black")

Construct an appropriate data set

To make an informative map, we need to add data.

fluTrends = read.csv("fluTrends.csv", check.names=FALSE)

For simplicity, only keep the most recent 12 weeks on states.

flu_w = fluTrends %>%
  tail(n = 12) 
dim(flu_w)

## [1] 12 52

Reshape to long format

flu_l <- flu_w %>%
  gather(region, index, -Date)

Merge fluTrends data with map_data

The region names in map_data are lower case, so use tolower() to convert all the region names in flu_l to lowercase. Then the map_data and fluTrends data are merged using merge().

head(unique(states$region))

## [1] "alabama"     "arizona"     "arkansas"    "california"  "colorado"   
## [6] "connecticut"

flu_l$region = tolower(flu_l$region)
states_merged = states %>%
  merge(flu_l, sort = FALSE, by = 'region')

Make the plots

Some Google Flu Trend data.

states_merged$Date = as.Date(states_merged$Date)
mx_date = max(states_merged$Date)
g = ggplot(states_merged %>% filter(Date == mx_date), 
       aes(x = long, y = lat, group = region, fill = index)) + 
  geom_polygon() + 
  labs(title=paste('Google Flu Trends on', mx_date), x='', y='') +
  theme_minimal() + 
  theme(legend.title = element_blank()) +
  coord_map("cylindrical")

if (require('RColorBrewer'))
  g = g + scale_fill_gradientn(colours=RColorBrewer::brewer.pal(9,"Reds")) # Thanks Annie Chen

## Loading required package: RColorBrewer

Last 12 weeks.

ggplot(states_merged, 
       aes(x=long, y=lat, group=group, fill=index)) + 
  geom_polygon() + 
  labs(title='Google Flu Trends', x='', y='') +
  theme_minimal() + 
  theme(legend.title = element_blank()) +
  facet_wrap(~Date) + 
  coord_map("cylindrical") + 
  scale_fill_gradientn(colours=brewer.pal(9,"Reds"))

Activity

Modify the code to determine what elements of the map are affected.

Advanced: Download the data directly from http://www.google.org/flutrends/about/data/flu/historic/us-historic-v2.txt using `read.csv’ and then construct a map of the most recent Google Flu Trends data.

# Construct Google Flu Trends map

When you have completed the activity, compare your results to the solutions.

Packages

A package provides additional functionality besides what base installation of R provides. You have already used a number of additional packages:

ggplot2
MWBDSSworkshop
maps
dplyr
tidyr
xtable

The Comprehensive R Archive Network (CRAN) has over 9,594 packages. These packages can be installed using the install.packages() function, e.g.

install.packages("ggplot2")

For many reasons, packages may not be on CRAN but may be available from other sources:

Github
Bioconductor
Source file (tar.gz)

Github

To install a package from github, you can use the install_github() function from the devtools package. For example,

# install.packages("devtools")
library('devtools')
install_github('nandadorea/vetsyn')

Bioconductor

Bioconductor provides tools for high-throughput genomic data. There are over 1,296 packages available from bioconductor. To install bioconductor, use

source("http://bioconductor.org/biocLite.R")
biocLite()

Then to install a package from bioconductor use

biocLite("edgeR")

The bioconductor repository may be useful in the future for public health biosurveillance.

Source

All packages can be installed from their source. If a package is not available in a repository, then this is one way to install the package. To install from source download the source file (.tar.gz) and then use the install.packages() function with arguments repos=NULL and type="source". For example, from a source version of this workshop, you would use the following command:

install.packages("MWBDSSworkshop_0.4.tar.gz", 
                 repos = NULL, 
                 type  = "source")

Functions

Packages are typically a collection of functions. But you can write your own. For example,

add = function(a,b) {
  return(a+b)
}
add(1,2)

## [1] 3

add_vector = function(v) {
  sum = 0
  for (i in 1:length(v)) sum = sum + v[i]
  return(sum)
}

A simple outbreak detection function

Here is a simple outbreak detection function

alert = function(y,threshold=100) {
  # y is the time series 
  # an alert is issue for any y above the threshold (default is 100)
  factor(ifelse(y>threshold, "Yes", "No"))
}

Run this on our weekly GI data.

GI_w <- GI %>%
  group_by(weekD) %>%
  summarize(count = n())

GI_w$alert = alert(y = GI_w$count, threshold = 150)

ggplot(GI_w, aes(x = weekD, y = count, color = alert)) + 
  geom_point() + 
  scale_color_manual(values = c("black","red"))

R in batch

For routine analysis, it can be helpful to run R in batch mode rather than interactively. To do this, create a script and save the script with a .R extension, e.g. script.R:

# Read in the data perhaps as a csv file

# Create some table and save them to html files

# Create some figures and save them to jpegs

# Run some outbreak detection algorithms and produce figures

Now, from the command line run

Rscript script.R

Or, from within R run

source("script.R")

Now each week you can update the csv file and then just run the script which will create all new tables and figures.

Shiny apps

R can be used to create HTML applications through the shiny package. The code is written entirely in R, although you can use cascading style sheets (CSS) if you want to update how it looks. Here is an example that allows visualization of CDC data.

These apps can also be run locally, e.g.

install.packages('shiny')
library('shiny')
runGitHub('NLMichaud/WeeklyCDCPlot')

Regression

Regression is a set of techniques for relating the mean/quantile of a response variable to a set of explanatory variables.

Simple linear regression

Simple linear regression looks at the relationship between a single explanatory variable and a continuous response. The mean of the response is a straight line function of the explanatory variable.

Let’s look at the relationship between age and number of GI visits. First, we need to construct the appropriate data.frame.

visits_by_age = GI %>%
  filter(age > 18, age < 80) %>%
  group_by(age) %>%
  summarize(count = n())

Plot the data.

ggplot(visits_by_age, aes(x = age, y = count)) + 
  geom_point() + 
  scale_y_log10()

Let’s use this to run a regression of log(count) on age.

m = lm(log(count) ~ age, data = visits_by_age)
summary(m)

## 
## Call:
## lm(formula = log(count) ~ age, data = visits_by_age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49175 -0.14048  0.01362  0.15456  0.38425 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.440937   0.083010   77.59   <2e-16 ***
## age         -0.025101   0.001594  -15.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2192 on 59 degrees of freedom
## Multiple R-squared:  0.8077, Adjusted R-squared:  0.8045 
## F-statistic: 247.9 on 1 and 59 DF,  p-value: < 2.2e-16

To visualize this use

ggplot(visits_by_age, aes(x = age, y = count)) + 
  geom_point() + 
  scale_y_log10() +
  stat_smooth(method = "lm") # use `se=FALSE` if you do not want to see the uncertainty

Default diagnostics.

opar = par(mfrow=c(2,3)) # Create a 2x3 grid of plots
plot(m, 1:6, ask=FALSE)  # 6 residual plots

par(opar)                # Revert to original graphics settings

Multiple regression

When you have more than one explanatory variable and a continuous response, you can use multiple regression.

m = lm(log(count) ~ age + I(age^2), data = visits_by_age)
summary(m)

## 
## Call:
## lm(formula = log(count) ~ age + I(age^2), data = visits_by_age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47710 -0.10722  0.02647  0.11838  0.29932 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.415e+00  1.780e-01  30.420  < 2e-16 ***
## age          2.299e-02  7.867e-03   2.923  0.00494 ** 
## I(age^2)    -4.908e-04  7.926e-05  -6.192 6.55e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1716 on 58 degrees of freedom
## Multiple R-squared:  0.8843, Adjusted R-squared:  0.8803 
## F-statistic: 221.6 on 2 and 58 DF,  p-value: < 2.2e-16

To visualize this use

ggplot(visits_by_age, aes(x = age, y = count)) + 
  geom_point() + 
  scale_y_log10() +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2))

Regression smoothers

This fit is still not very good. You could use a loess smoother.

m = loess(log(count) ~ age, data = visits_by_age)
summary(m)

## Call:
## loess(formula = log(count) ~ age, data = visits_by_age)
## 
## Number of Observations: 61 
## Equivalent Number of Parameters: 4.42 
## Residual Standard Error: 0.1346 
## Trace of smoother matrix: 4.83  (exact)
## 
## Control settings:
##   span     :  0.75 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE

ggplot(visits_by_age, aes(x = age, y = count)) + 
  geom_point() + 
  scale_y_log10() +
  stat_smooth()  # default is loess

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Poisson regression

A better model for these data is the Poisson regression model.

m = glm(count ~ age + I(age^2), data = visits_by_age, family = "poisson")
summary(m)

## 
## Call:
## glm(formula = count ~ age + I(age^2), family = "poisson", data = visits_by_age)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2118  -1.3417  -0.0224   1.5299   3.8260  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.283e+00  7.069e-02  74.741   <2e-16 ***
## age          2.979e-02  3.327e-03   8.955   <2e-16 ***
## I(age^2)    -5.611e-04  3.579e-05 -15.679   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2288.76  on 60  degrees of freedom
## Residual deviance:  265.44  on 58  degrees of freedom
## AIC: 701.48
## 
## Number of Fisher Scoring iterations: 4

To visualize

ggplot(visits_by_age, aes(x = age, y = count)) + 
  geom_point() + 
  stat_smooth(method = "glm", formula = y ~ x + I(x^2), 
              method.args = list(family = "poisson"))

Mixed effect models

Random effects allow you to model correlation in your data. For example, we might expect counts within a facility to be more similar than counts at different facilities. Thus, we might improve our model by introducing a random effect for facility.

First, construct the appropriate data set.

visits_by_age_and_facility = GI %>% 
  filter(age > 18, age < 80) %>%
  filter(facility != 259) %>%       # only 1 observation in this facility
  group_by(facility, age) %>%
  summarize(count = n())

Plot the data

ggplot(visits_by_age_and_facility, aes(x = age, y = count)) + 
  facet_wrap(~facility) +
  scale_y_log10() +
  geom_point()

Now, use the lme4 or the nlme package to fit the model.

First, we’ll try a

library("lme4")

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

m = lmer(log(count) ~ age + I(age^2) + (age+I(age^2)|facility), 
         data = visits_by_age_and_facility)

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## boundary (singular) fit: see ?isSingular

Let’s go with the suggetion and rescale age.

visits_by_age_and_facility = visits_by_age_and_facility %>%
  mutate(age = scale(age))

m = lmer(log(count) ~ age + I(age^2) + (age+I(age^2)|facility), 
         data = visits_by_age_and_facility)
summary(m)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log(count) ~ age + I(age^2) + (age + I(age^2) | facility)
##    Data: visits_by_age_and_facility
## 
## REML criterion at convergence: 1152
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9654 -0.4502  0.0805  0.5443  3.8423 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr       
##  facility (Intercept) 1.27198  1.1278              
##           age         0.05220  0.2285   -0.19      
##           I(age^2)    0.05565  0.2359   -0.24 -0.15
##  Residual             0.23930  0.4892              
## Number of obs: 708, groups:  facility, 15
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  2.17616    0.29254   7.439
## age         -0.35755    0.06230  -5.739
## I(age^2)    -0.22424    0.06428  -3.488
## 
## Correlation of Fixed Effects:
##          (Intr) age   
## age      -0.173       
## I(age^2) -0.251 -0.148

Random forests

Let’s suppose we are interested in predicting the number of GI visits by age, facility, and gender. We could use a (mixed effects) regression model and it might do quite well. But there are a variety of techniques that have been introduced that typically do a better job at prediction. These techniques typically have a reduced ability to interpret the results.

First, construct the appropriate data set

visits = GI %>%
  filter(age > 18, age < 80) %>%
  filter(facility != 259) %>%       # only 1 observation in this facility
  group_by(facility, age, gender) %>%
  summarize(count = n())

Let’s take a look at the data

ggplot(visits, aes(x = age, y = count, color = gender, shape = gender)) + 
  facet_wrap(~facility) +
  scale_y_log10() +
  geom_point()

Now let’s construct a random forest.

library("randomForest")

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

m = randomForest(log(count) ~ age + gender + facility, data = visits)
m

## 
## Call:
##  randomForest(formula = log(count) ~ age + gender + facility,      data = visits) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.3902249
##                     % Var explained: 66.87

importance(m)

##          IncNodePurity
## age          92.714518
## gender        3.474141
## facility    725.240815

This is a pretty small data set to be used for a random forest.

dim(visits)

## [1] 1295    4