######################################
# Statistical methods for data science: R
# A.Y. 2018/19
# http://didawiki.di.unipi.it/doku.php/mds/smd/
######################################

# [CTRL-Enter] to execute a line

###### Script R13: Confidence intervals ##########

# from script R8: mean, sigma and n are parameters
central.limit = function(xns, mu=0, sigma=1, n=1, ...) {
  h <- hist(xns, plot=FALSE, breaks="Scott", ...)
  ylim <- range(0, h$density, dnorm(mu, mu, sigma/sqrt(n)))
  hist(xns, freq=FALSE, breaks="Scott", ylim=ylim, xlab=deparse(substitute(xns)), ...)
  curve(dnorm(x, mu, sigma/sqrt(n)), add=TRUE, col="red")
}

# from script R8: z transformation
z.transf = function(xn, mu, sigma, n) (sqrt(n)*(xn-mu)/sigma)

n = 100 # sample size
# normal distribution
mu = 5; s = 5
xns = replicate(10000, mean(rnorm(n, mu, s)) )
zns = z.transf(xns, mu, s, n)
central.limit(zns)

# Confidence Interval: Part 1
# Normal data, known variance
ci.z = function(data, sigma, conf.level=0.95, mu=NA) {
  n = length(data)
  xn = mean(data)
  alpha = 1 - conf.level
  # P(Z > cv) = 1 - P(Z <= cv) = 1 - (1-alpha/2) = alpha/2
  cv = qnorm(1-alpha/2) # right-critical value
  if(!is.na(mu)) { # if the true mu is provided
    # standard normal density
    rx = seq(-4, 4, 0.1)
    plot(rx, dnorm(rx), type="l")
    scaledmu = (mu-xn)/sigma # (scaled) position of the true mu
    segments(scaledmu, 0, scaledmu, dnorm(scaledmu), col="blue")
    segments(-cv, 0, -cv, dnorm(cv), col="red")
    segments(cv, 0, cv, dnorm(cv), col="red")
  }
  delta = cv*sigma/sqrt(n)
  cat("\nmean ", xn, "delta ", delta)
  cat(paste0("\n",conf.level*100, "% CI ="), c(xn-delta, xn+delta) )
  # one-sided interval, unbounded on the right
  # P(Z > cv) = 1 - P(Z <= cv) = alpha
  cv1 = qnorm(1-alpha)
  delta1 = cv1*sigma/sqrt(n)
  cat(paste0("\n",conf.level*100, "% one-sided CI ="), xn-delta1, "Inf" )
}

# standard normal data N(0, 1) - try different n and conf.level
n = 5; data=rnorm(n); ci.z(data, 1, conf.level=0.95, mu=0)

# general normal data N(mu, s)
n=100; mu=3; s=5
data=rnorm(n, mu, s)
ci.z(data, s)
ci.z(data, s, mu=mu) # for visualization only

# a smaller sample leads to a larger delta
ci.z(data[1:30], s)

# library with the z test and confidence intervals built-in
library(BSDA)
z.test(data[1:30], sigma.x=s)
z.test(data[1:30], sigma.x=s, alternative="greater")

### but, how to check whether the dataset is normal?
qqnorm(data) # visually
qqline(data)
data=rexp(n, rate=2)
qqnorm(data) # visually
# tests of normality in the next classes
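# (sketch) interpretation of the confidence level for the known-variance interval of Part 1:
# over repeated samples, about conf.level of the intervals xn +/- qnorm(1-alpha/2)*sigma/sqrt(n)
# should contain the true mu; cover.z is a helper introduced here only for illustration
cover.z = function(n, mu, sigma, conf.level=0.95) {
  data = rnorm(n, mu, sigma)
  cv = qnorm(1-(1-conf.level)/2)  # same critical value as in ci.z
  delta = cv*sigma/sqrt(n)
  xn = mean(data)
  return(xn-delta <= mu & mu <= xn+delta)
}
sum(replicate(1e3, cover.z(30, 3, 5)))/1e3 # empirical coverage, expected close to 0.95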
# Confidence Interval: Part 2
# Normal data, unknown variance

# t-distribution
curve(dnorm(x), xlim=c(-5,5))
curve(dt(x, 1), add=T, col="red")
curve(dt(x, 2), add=T, col="blue")
curve(dt(x, 10), add=T, col="green")
legend("topright", c("norm", "m=1", "m=2", "m=10"),
       col=c("black", "red", "blue", "green"), lwd=3)

# central limit theorem for unknown variance
central.limit.t = function(xns, n=1, ...) {
  h <- hist(xns, plot=FALSE, breaks="Scott", ...)
  ylim <- range(0, h$density, dt(0, n-1))
  hist(xns, freq=FALSE, breaks="Scott", ylim=ylim, xlab=deparse(substitute(xns)), ...)
  curve(dt(x, n-1), add=T, col="blue")
  curve(dnorm(x), add=T, col="red") # to see the normal distribution as well
}

# return the pair (mean, standard deviation)
mean.sd = function(data) c(mean(data), sd(data))

n = 100 # sample size, also try n = 5
# normal distribution
mu = 5; s = 5
xns = replicate(10000, mean.sd(rnorm(n, mu, s)) )
zns = z.transf(xns[1,], mu, xns[2,], n)
central.limit.t(zns, n)

# Confidence Interval: normal data, unknown variance
ci.t = function(data, conf.level=0.95) {
  n = length(data)
  xn = mean(data)
  alpha = 1 - conf.level
  # P(T > cv) = 1 - P(T <= cv) = 1 - (1-alpha/2) = alpha/2
  cv = qt(1-alpha/2, n-1)
  delta = cv*sd(data)/sqrt(n)
  cat("\nmean ", xn, "delta ", delta)
  cat(paste0("\n",conf.level*100, "% CI ="), c(xn-delta, xn+delta) )
  # one-sided interval, unbounded on the right
  # P(T > cv) = 1 - P(T <= cv) = alpha
  cv1 = qt(1-alpha, n-1) # right-critical value
  delta1 = cv1*sd(data)/sqrt(n)
  cat(paste0("\n",conf.level*100, "% one-sided CI ="), xn-delta1, "Inf" )
}

# standard normal data
n = 5; data=rnorm(n); ci.t(data)

# general normal data
n=100; mu=3; s=5
data=rnorm(n, mu, s)
ci.t(data)
ci.z(data, s)
t.test(data) # R built-in

# Confidence Interval: Part 3
# Large sample method (any data, unknown variance)
# use ci.z with sigma = sd(data)
n = 1000; data=rnorm(n); ci.z(data, sigma=sd(data))
# compare to ci.z with the known sigma
ci.z(data, sigma=1)

# simulation: does the large sample interval contain the true mean?
f = function(n, conf.level=0.95, df=rnorm, dfpar=1, truemu=1) {
  data = df(n, dfpar)
  ci = z.test(data, sigma.x=sd(data), conf.level=conf.level)$conf.int # large sample
  return(ci[1] <= truemu & truemu <= ci[2])
}

# simulations (see [T] Table 23.3)
n = 20 # also try with n = 100 and n = 1000
mu=1
norm90 = sum(replicate(1e3, f(n, 0.9, rnorm, mu, mu)))/1e3
norm95 = sum(replicate(1e3, f(n, 0.95, rnorm, mu, mu)))/1e3
lambda=1; mu = 1/lambda
exp90 = sum(replicate(1e3, f(n, 0.9, rexp, lambda, mu)))/1e3
exp95 = sum(replicate(1e3, f(n, 0.95, rexp, lambda, mu)))/1e3
alpha=2.1; mu = alpha/(alpha-1)
library(VGAM)
# wrapper needed because shape is not the second positional parameter of rpareto
fpareto = function(n, alpha) rpareto(n, shape=alpha)
par90 = sum(replicate(1e3, f(n, 0.9, fpareto, alpha, mu)))/1e3
par95 = sum(replicate(1e3, f(n, 0.95, fpareto, alpha, mu)))/1e3
c(n, norm90, norm95, exp90, exp95, par90, par95)
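# (sketch) small-sample comparison, for illustration only: with normal data and n = 10,
# the exact t interval of Part 2 keeps its nominal coverage, while the large sample
# interval (sd(data) in place of sigma) tends to undercover; f.t is an illustrative helper
f.t = function(n, conf.level=0.95, truemu=1) {
  data = rnorm(n, truemu)
  ci = t.test(data, conf.level=conf.level)$conf.int
  return(ci[1] <= truemu & truemu <= ci[2])
}
n = 10; mu = 1
sum(replicate(1e3, f.t(n, 0.95, mu)))/1e3          # t interval: close to 0.95
sum(replicate(1e3, f(n, 0.95, rnorm, mu, mu)))/1e3 # large sample interval: typically below 0.95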