library("tidyverse"); theme_set(theme_bw())

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library("scales") # for breaks_pretty()

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

The distributions built into base R can be found using

?Distributions

Most distributions have 4 functions:

d: probability mass/density function
p: cumulative distribution function
q: quantile function (inverse of the cdf)
r: random draws

Discrete distributions

Let’s start with discrete distributions

Binomial

Probability mass function

Recall the pmf for a binomial is \[ P(Y=y) = {n\choose y} p^y (1-p)^{n-y} \] for integer \(n>0\), \(0<p<1\), and \(y=0,1,\ldots,n\).

We can plot this pmf using dbinom.

n <- 10
p <- 0.3
d <- data.frame(
  y = 0:n
) %>%
  mutate(pmf = dbinom(y, size = n, prob = p))

ggplot(d, aes(x = y, y = pmf, yend = 0, xend = y)) +
  geom_point() + 
  geom_segment()

Cumulative distribution function

We can plot this cdf using pbinom.

d <- d %>%
  mutate(cdf = pbinom(y, size = n, prob = p))

ggplot(d, aes(x = y, y = cdf, yend = 0, xend = y)) +
  geom_step()

More often, we use the cdf to calculate probabilities. For example, suppose \(Y \sim Bin(30, 0.4)\) and we want to calculate \(P(10<Y<14)\). Recall that \[ P(10<Y<14) = P(Y\le 13) - P(Y \le 10) \] we can do this using

n <- 30
p <- 0.4
pbinom(13, size = n, prob = p) - 
  pbinom(1, size = n, prob = p)

## [1] 0.7144998

Random draws

To obtain random draws from a binomial use the rbinom function.

draws <- rbinom(1000, size = n, prob = p)

ggplot(mapping = aes(x = draws)) +
  geom_histogram(binwidth = 1)

Let’s compare our empirical probabilities to the true probabilities.

mean(10 < draws & draws < 14) # vs

## [1] 0.425

pbinom(13, size = n, prob = p) - 
  pbinom(10, size = n, prob = p)

## [1] 0.4230325

Poisson

Probability mass function

A random variable has a Poisson distribution it its probability mass function is \[ P(Y=y) = \frac{e^{-r}r^y}{y!} \] for some rate \(r>0\) and integer \(y\ge 0\).

We use dpois for the Poisson pmf.

r <- 3
d <- data.frame(y = 0:(3*r)) %>%
  mutate(pmf = dpois(y, lambda = r),
         cdf = ppois(y, lambda = r))

ggplot(d, aes(x = y, y = pmf, xend = y, yend = 0)) + 
  geom_point() + 
  geom_segment() +
  scale_x_continuous(breaks = breaks_pretty())

Cumulative distribution function

We use ppois for the Poisson pmf.

ggplot(d, aes(x = y, y = cdf)) + 
  geom_step() +
  scale_x_continuous(breaks = breaks_pretty())

Random draws

To obtain random draws from a Poisson distribution use rpois.

draws <- rpois(1000, lambda = r)

ggplot(mapping = aes(x = draws)) + 
  geom_histogram(binwidth = 1, color = "white") + 
  scale_x_continuous(breaks = breaks_pretty())

Continuous distributions

Normal (Gaussian)

Probability density function

m <- 0
s <- 1 # standard deviation

d <- data.frame(x = seq(m - 3*s, m + 3*s, length = 1001)) %>%
  mutate(pdf = dnorm(x, m, s),
         cdf = pnorm(x, m, s))

ggplot(d, aes(x = x, y = pdf)) +
  geom_line()

Cumulative distribution function

ggplot(d, aes(x = x, y = cdf)) +
  geom_line()

Critical values

The qnorm function provides the inverse of the normal CDF. We can plot this function

ggplot(data.frame(p = 0:1)) +
  stat_function(fun = qnorm, n = 1001, args = list(mean = m, sd = s))

The most common use of this function is to obtain z critical values for us in constructing confidence intervals.

qnorm(.975)

## [1] 1.959964

Random draws

We can use the rnorm function to sample random draws from the normal distribution.

draws <- rnorm(20)

ggplot() +
  geom_histogram(mapping = aes(x = draws), bins = 10)

ggplot() +
  geom_histogram(
    mapping = aes(
      x = draws, 
      y = after_stat(density)), 
    bins = 10) +
  stat_function(fun = dnorm, 
                n = 1001, 
                args = list(
                  mean = mean(draws), 
                  sd = sd(draws)),
                color = 'red') +
  xlim(min(draws)-1, max(draws) + 1)

## Warning: Removed 2 rows containing missing values (geom_bar).

If we have a much larger sample size, the histogram will look approximately normal.

draws <- rnorm(1e4)

ggplot() +
  geom_histogram(
    mapping = aes(
      x = draws, 
      y = after_stat(density)), 
    bins = 30) +
  stat_function(fun = dnorm, 
                n = 1001, 
                args = list(
                  mean = mean(draws), 
                  sd = sd(draws)),
                color = 'red') +
  xlim(min(draws)-1, max(draws) + 1)

## Warning: Removed 2 rows containing missing values (geom_bar).

Student’s t

Probability density function

df <- 10

d <- data.frame(x = seq(-4, 4, length = 1001)) %>%
  mutate(pdf = dt(x, df = df),
         cdf = pt(x, df = df))

ggplot(d, aes(x = x, y = pdf)) + 
  geom_line() +
  stat_function(fun = dnorm, color = 'red')

Calculative p-values

Suppose you have the following data

y <- round(rnorm(10), 2)
y

##  [1] -1.32  0.46 -1.66 -1.10 -0.43  0.37  1.88 -0.65  0.87 -0.52

Which has the following summary statistics

n <- length(y); n

## [1] 10

m <- mean(y); m

## [1] -0.21

s <- sd(y); s

## [1] 1.094664

If you are interested in the following hypothesis test: \[ H_0: \mu=1 \quad \mbox{vs} \quad H_A: \mu \ne 1 \] then the test statistic is

t <- (m-1)/(s/sqrt(n)); t

## [1] -3.495462

Under the null hypothesis, this statistic has a \(t\) distribution with \(n-1\) degrees of freedom. We can then compute the p-value using

pvalue <- 2*pt(-abs(t), df = n-1); pvalue

## [1] 0.006771688

Of course, we could use the built-in functions in R

t.test(y, mu = 1)$p.value

## [1] 0.006771688

Critical values

The main use of the quantile function (or inverse CDF) is to compute t critical values. For example,

tcrit <- qt(.95, df = n-1); tcrit

## [1] 1.833113

So a 90% CI for the mean is

m + c(-1,1)*tcrit*s/sqrt(n)

## [1] -0.844556  0.424556

Of course, we could just use built-in functions

t.test(y, conf.level = 0.9)$conf.int

## [1] -0.844556  0.424556
## attr(,"conf.level")
## [1] 0.9

STAT 486/586

Jarad Niemi

2023-02-22

Discrete distributions

Binomial

Probability mass function

Cumulative distribution function

Random draws

Poisson

Probability mass function

Cumulative distribution function

Random draws

Continuous distributions

Normal (Gaussian)

Probability density function

Cumulative distribution function

Critical values

Random draws

Student’s t

Probability density function

Calculative p-values

Critical values