library("tidyverse"); theme_set(theme_bw())
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
If we have \(X_i\) for \(i=1,\ldots,n\) that are independent and identically distributed with \(E[X_i] = \mu\) and \(Var[X_i] = \sigma^2 < \infty\), then \[ \frac{\overline{X}-\mu}{s/\sqrt{n}} \stackrel{d}{\to} N(0,1) \quad \mbox{as } n\to \infty. \] Thus, for large enough n, \[ \overline{X} \stackrel{\cdot}{\sim} N(\mu, s^2/n) \] where \(\stackrel{\cdot}{\sim}\) means “approximately distributed.”
This result is the basis of a common asymptotic confidence interval formula for \(\mu\): \[ \overline{x} \pm z s/\sqrt{n} \] where \(z\) is a critical value that depends on the confidence level of the confidence interval.
Let’s first make a function to construct the data.frame we’ll need in order to visualize the running estimate.
# Running-mean summary for a numeric sample.
#
# For each sample size n = 1, ..., length(x), computes the sample mean of
# x[1:n], the sample standard deviation of x[1:n], and the asymptotic
# (CLT-based) 95% confidence interval xbar +/- z * s / sqrt(n).
#
# Args:
#   x: numeric vector of iid draws.
# Returns:
#   data.frame with columns n, xbar, s (sample sd; NaN at n = 1), lcl, ucl.
#
# Fix: the original computed s as cumsum((x - xbar)^2) / (n - 1), which is
# neither the running sum of squared deviations (xbar changes with n) nor a
# standard deviation (no square root), so the CI half-width was on the wrong
# scale. Here s uses the identity sum((x - xbar)^2) = sum(x^2) - n * xbar^2.
create_mean_dataframe <- function(x) {
  n <- seq_along(x)
  xbar <- cumsum(x) / n
  s <- sqrt((cumsum(x^2) - n * xbar^2) / (n - 1))
  z <- qnorm(0.975)
  data.frame(
    n = n,
    xbar = xbar,
    s = s,
    lcl = xbar - z * s / sqrt(n),
    ucl = xbar + z * s / sqrt(n)
  )
}
As long as we’re at it, let’s make a function to create the plot
# Plot the running mean estimate produced by create_mean_dataframe():
# blue line = running mean (xbar), blue ribbon = running 95% CI (lcl, ucl).
# If `truth` is supplied, it is overlaid as a dashed red reference line.
plot_mean <- function(d, truth = NULL) {
  ci_layers <- list(
    geom_ribbon(fill = "blue", alpha = 0.3),
    geom_line(color = "blue")
  )
  # Adding NULL to a ggplot is a no-op, so truth = NULL leaves the plot as-is.
  truth_layer <- if (!is.null(truth)) {
    geom_hline(yintercept = truth, color = "red", linetype = 2)
  }
  ggplot(d, aes(x = n, y = xbar, ymin = lcl, ymax = ucl)) +
    ci_layers +
    truth_layer
}
Let \(X_i \stackrel{ind}{\sim} N(\mu, \sigma^2)\). If we sample enough \(X_i\), then we can learn about \(\mu\).
# Standard normal draws: the running mean should converge to the known truth.
mu <- 0
sigma <- 1
x <- rnorm(1e3, mean = mu, sd = sigma)
d <- create_mean_dataframe(x)
plot_mean(d, truth = mu)
Suppose our random variables have a normal distribution but are truncated between two values \(a\) and \(b\). We might write this as \(N(\mu,\sigma^2)[a,b]\).
Let’s first construct a function to simulate this random variable. (There are better ways to do this.)
# Sample from a normal N(mean, sd^2) truncated to [a, b] by rejection:
# draw normals and redraw any values that fall outside [a, b].
#
# Args:
#   n: number of draws.
#   mean, sd: parameters of the underlying normal.
#   a, b: truncation bounds (requires a < b).
# Returns:
#   numeric vector of length n with every value in [a, b].
#
# Improvement over the scalar loop: all out-of-range values are resampled in
# vectorized batches instead of one rnorm(1) call at a time, and a degenerate
# interval (a >= b) fails fast instead of looping forever.
rtnorm <- function(n, mean, sd, a, b) {
  stopifnot(a < b)
  x <- rnorm(n, mean, sd)
  outside <- x < a | x > b
  while (any(outside)) {
    x[outside] <- rnorm(sum(outside), mean, sd)
    outside <- x < a | x > b
  }
  x
}
We also need the density for a truncated normal:
# Density of N(mean, sd^2) truncated to the open interval (a, b):
# the normal density renormalized by the probability mass inside (a, b),
# and exactly zero outside it. Vectorized in x.
dtnorm <- function(x, mean, sd, a, b) {
  mass <- pnorm(b, mean, sd) - pnorm(a, mean, sd)
  inside <- a < x & x < b
  (dnorm(x, mean, sd) / mass) * inside
}
Simulate and plot:
# Truncated-normal setup: underlying N(m, s^2) restricted to [a, b].
# Note the interval [10, 20] is far into the upper tail of N(3, 8^2).
m <- 3
s <- 8
a <- 10
b <- 20
Density plot
# Exact truncated-normal density over [a-1, b+1]; zero outside [a, b].
curve(dtnorm(x, m, s, a, b), a-1, b+1, n = 1001)
Monte Carlo
# Draw 1000 truncated-normal samples and plot the running mean with its CI.
x <- rtnorm(1e3, m, s, a, b)
d <- create_mean_dataframe(x)
plot_mean(d)
The mean actually can be calculated for a truncated normal
# Standardized truncation bounds.
za <- (a - m) / s
zb <- (b - m) / s
# Closed-form mean of N(m, s^2) truncated to [a, b]. Stored under a fresh
# name rather than `mean`, which would shadow base::mean.
tn_mean <- m + s * (dnorm(za) - dnorm(zb)) / (pnorm(zb) - pnorm(za))
plot_mean(d, truth = tn_mean)
Watch out!
You need to pay attention to whether your random variables have finite variance. A Cauchy distribution has no finite moments, so its mean and variance are both undefined. This occurs because the Cauchy distribution has extremely heavy tails.
Let’s use Monte Carlo to try to estimate a Cauchy “mean”
# Cauchy location and scale -- deliberately NOT called mean/sd,
# since the Cauchy distribution has neither.
loc <- 0
scl <- 1
x <- rcauchy(1e3, loc, scl)
d <- create_mean_dataframe(x)
plot_mean(d)
# Zoom in: the running "mean" never settles down.
plot_mean(d) + xlim(25, 100) # will likely need to adjust the limits
## Warning: Removed 924 rows containing missing values (`geom_line()`).
The CLT is not relevant here because the variance of the random variables is not finite.
If we have \(Y \sim Bin(n,\theta)\) with \(\theta \in (0,1)\) and \(\hat\theta = Y/n\), then \[ \frac{\hat\theta-\theta}{\sqrt{\frac{\hat\theta(1-\hat\theta)}{n}}} \stackrel{d}{\to} N(0,1) \quad \mbox{as } n\to \infty. \] Thus, for large enough n, \[ \hat\theta \stackrel{\cdot}{\sim} N\left(\theta, \frac{\hat\theta(1-\hat\theta)}{n}\right) \] where \(\stackrel{\cdot}{\sim}\) means “approximately distributed.”
This result is the basis of a common asymptotic confidence interval formula for \(\theta\): \[ \hat\theta \pm z \sqrt{\frac{\hat\theta(1-\hat\theta)}{n}} \] where \(z\) is a critical value that depends on the confidence level of the confidence interval.
Function to create proportion data frame. The data will be binary (0/1) or logical (TRUE/FALSE).
# Running-proportion summary for binary data.
#
# For each sample size n = 1, ..., length(x), computes the running proportion
# theta_hat = mean(x[1:n]), its Wald standard error
# sqrt(theta_hat * (1 - theta_hat) / n), and the asymptotic 95% confidence
# interval theta_hat +/- z * se.
#
# Args:
#   x: binary (0/1) or logical (TRUE/FALSE) vector.
# Returns:
#   data.frame with columns n, theta_hat, se, lcl, ucl.
#
# Fix: uses seq_along(x) instead of 1:length(x), so an empty input yields a
# zero-row data frame rather than the bogus index vector c(1, 0).
create_proportion_dataframe <- function(x) {
  n <- seq_along(x)
  theta_hat <- cumsum(x) / n
  se <- sqrt(theta_hat * (1 - theta_hat) / n)
  z <- qnorm(0.975)
  data.frame(
    n = n,
    theta_hat = theta_hat,
    se = se,
    lcl = theta_hat - z * se,
    ucl = theta_hat + z * se
  )
}
and the plot function
# Plot the running proportion estimate from create_proportion_dataframe():
# blue line = theta_hat, blue ribbon = the 95% Wald interval (lcl, ucl).
# If `truth` is supplied, it is drawn as a dashed red reference line.
plot_proportion <- function(d, truth = NULL) {
  p <- ggplot(d, aes(x = n, y = theta_hat, ymin = lcl, ymax = ucl))
  p <- p + geom_ribbon(fill = "blue", alpha = 0.3)
  p <- p + geom_line(color = "blue")
  if (!is.null(truth)) {
    p <- p + geom_hline(yintercept = truth, color = "red", linetype = 2)
  }
  p
}
Let’s generate some Bernoulli data that, when summed, will be binomial.
# Bernoulli(theta) draws: the running proportion should converge to theta.
theta <- 0.3
x <- rbinom(1e3, size = 1, prob = theta)
d <- create_proportion_dataframe(x)
plot_proportion(d, truth = theta)
How about the probability a normal random variable is between two end points? Of course, we can calculate the truth here.
# Estimate P(lo < Y < hi) for Y ~ N(mu, sigma^2); here the truth is known.
mu <- 0
sigma <- 1
lo <- 2
hi <- 4
y <- rnorm(1e3, mu, sigma)
x <- lo < y & y < hi
d <- create_proportion_dataframe(x)
plot_proportion(
  d,
  truth = pnorm(hi, mean = mu, sd = sigma) - pnorm(lo, mean = mu, sd = sigma)
)
How about the probability that the absolute value of a normal random variable is less than an exponential random variable?
# Estimate P(|Y| < E) for independent Y ~ N(0, 1) and E ~ Exp(1);
# no closed-form truth is supplied, so only the running estimate is shown.
n_sims <- 1e3
y <- rnorm(n_sims)
e <- rexp(n_sims)
x <- abs(y) < e
d <- create_proportion_dataframe(x)
plot_proportion(d)
tail(d, 1) # final estimate with its 95% interval
## n theta_hat se lcl ucl
## 1000 1000 0.495 0.0158106 0.4640118 0.5259882