###################################### # Statistics for data science: R # # http://didawiki.di.unipi.it/doku.php/mds/sds/ ###################################### # [CTRL-Enter] to execute a line ###### Script Lesson 21: Regression methods ####### par(mar=c(4,4,1,1)) library(SemiPar) # load dataset data(janka) x = janka$dens y = janka$hardness # scatter plot: pch is type of points, cex is their size plot(x, y, xlab='dens', ylab='hadness', xlim=c(20, 80), ylim=c(0,3500), pch=16, cex=0.8) # simple linear model fit = lm(y~x) fit abline(fit, col='red', lw=2) # regression line # summary info summary(fit) # fitted values fit_ys = fitted(fit) res = resid(fit) # same as y - fit_ys # standard errors n = length(x) s.hat = sqrt(sum(res^2)/(n-2)) xbar = mean(x) SXX = sum((x-xbar)^2) # or var(x)*(n-1) # prediction on new data +- errs x_test = data.frame(x = seq(-500, 500, 20)) # predict requires a dataframe in input y_pred = predict(fit, x_test) plot(x_test$x, y_pred) errs = s.hat*sqrt(1/n+ (mean(x)-x_test$x)^2/SXX) segments(x_test$x, y_pred-errs, x_test$x, y_pred+errs, col=2, lwd=2) lines(x_test$x, y_pred-errs, col=3, lwd=1) lines(x_test$x, y_pred+errs, col=3, lwd=1) # standard errors from predict y_pred_se = predict(fit, x_test, se.fit=TRUE) plot(x_test$x, y_pred_se$fit) segments(x_test$x, y_pred_se$fit-y_pred_se$se.fit, x_test$x, y_pred_se$fit+y_pred_se$se.fit, col=2, lwd=2) lines(x_test$x, y_pred_se$fit-y_pred_se$se.fit, col=3, lwd=1) lines(x_test$x, y_pred_se$fit+y_pred_se$se.fit, col=3, lwd=1) # Weighted Least Square library(GLMsData) data(gestation) # Age in weeks of gestation # Weight: weight at birth # Births: count of births - i.e, weighting of cases View(gestation) plot(Weight~Age, data=gestation) fit = lm(Weight~Age, data=gestation) summary(fit) abline(fit, col=2, lwd=2) # replicate by number of births x = rep(gestation$Age, gestation$Births) y = rep(gestation$Weight, gestation$Births) # plot(x, y) # the same as before fit = lm(y~x) abline(fit, col=3, lwd=2) summary(fit) # or weighted form of linear regression fit = lm(Weight~Age, data=gestation, weights=Births) summary(fit) # polynomial regression # generate dataset set.seed(0) x = seq(from=0, to=20, by=0.1) f = function(x) 500 + 0.4 * (x-5)^3 y = f(x) plot(x, y) noise = rnorm(length(x), mean=0, sd=40) noisy.y = y + noise plot(x, noisy.y, col='blue', xlab='x') lines(x, y, col='red', lwd=2) # polynomial regression fit = lm( noisy.y ~ poly(x,3) ) summary(fit) # fitted vs residuals plot(fitted(fit), resid(fit), xlab="fitted", ylab="residuals") # fiited vs residuals # visual fit plot(x, noisy.y, col='blue', xlab='x') lines(x, y, col='red', lwd=2) lines(x, fitted(fit), col='green', lwd=2) legend("topleft",c("true","predicted"), col=c("red","green"), lwd=3) # non-linear regression # generate Zipfian population alpha = 2.5 C = 100000 # sample x = sample(1:1000, 200) y = C*x^{-alpha} plot(x, y, log="xy", col='blue', xlab='x') # and multiply by noise noise = rnorm(length(x), 0, 0.8) noisy.y = y*exp(noise) plot(x, noisy.y, log="xy", col='blue', xlab='x') curve(C*x^{-alpha}, col=2, lwd=2, add=T) # fitting using linear regression over log-scaled data fit1 = lm( log(noisy.y) ~ log(x)) # cannot use abline! lines(x, exp(fitted(fit1)), col='green', lwd=2) summary(fit1) intercept = exp(coef(fit1)[1]) slope = coef(fit1)[2] cat('estimate C', intercept) cat('estimate -alpha', slope) # fitting using non-linear regression (favors fitting top ranks) # initial guess required: start=list(c=intercept, b=coefficient) fit2 = nls(noisy.y ~ c*x^b, start=list(c=intercept, b=slope)) summary(fit2) cat('estimate C', coef(fit2)[1]) cat('estimate -alpha', coef(fit2)[2]) lines(x, fitted(fit2), col='black', lwd=2) legend("topright",c("true","fit1-lm", "fit2-nls"), col=c("red","green","black"), lwd=3) # fitted vs residuals plot(fitted(fit1), resid(fit1), xlab="fitted", ylab="residuals") # nls assume additive noise plot(fitted(fit2), resid(fit2), xlab="fitted", log="xy", ylab="residuals") # multiple (variable) linear regression library(ISLR) data(Carseats) View(Carseats) fit = lm(Sales~Advertising+Price, data=Carseats) summary(fit) # dummy encoding for factors (categorial) fit2 = lm(Sales~Advertising+Price+ShelveLoc, data=Carseats) summary(fit2) # Multivariate regression ami_data = read.table("http://static.lib.virginia.edu/statlab/materials/data/ami_data.DAT") names(ami_data) = c("TOT","AMI","GEN","AMT","PR","DIAP","QRS") # independent variables # GEN, gender (male = 0, female = 1) # AMT, amount of drug taken at time of overdose # PR, PR wave measurement # DIAP, diastolic blood pressure # QRS, QRS wave measurement # dependent variables # TOT is total TCAD plasma level # AMI is the amount of amitriptyline present in the TCAD plasma level. View(ami_data) fit = lm(cbind(TOT, AMI) ~ GEN + AMT + PR + DIAP + QRS, data = ami_data) summary(fit) # same coefficients as for individual regressions fit_tot = lm(TOT ~ GEN + AMT + PR + DIAP + QRS, data = ami_data) summary(fit_tot) # same coefficients as for individual regressions # but coefficients covary! vcov(fit)