#FIXED X'S X = seq(10,20,0.25) Xt2=X*X Xt.5=sqrt(X) n = length(X) #SIMULATED REGRESSION MODEL alpha = 1 beta = .5 sigma = .75 Y = alpha + beta*X + rnorm(n, 0, sigma) plot(X,Y) abline(1,.5) #DATA STORAGE my.data = data.frame(Y,X,Xt2,Xt.5) names(my.data) #REGRESSION MODEL FITTING lm.fit = lm(Y~X,data=my.data) lm.fit summary(lm.fit) #REGRESSION FITTING OUTPUT names(lm.fit) lm.fit$coefficients lm.fit$residuals lm.fit$fitted.values #RESULT plot(X,Y) abline(1,.5,col='red') abline(lm.fit$coefficients[1],lm.fit$coefficients[2]) #DIAGNOSTICS plot(X,Y) abline(lm.fit$coefficients[1],lm.fit$coefficients[2]) plot(lm.fit$fitted.values,Y) plot(X,lm.fit$fitted.values) plot(lm.fit$fitted.values,lm.fit$residuals) plot(X,lm.fit$residuals) abline(0,0) sum(lm.fit$residuals) sqrt(sum((lm.fit$residuals)^2)/(41-2)) hist(lm.fit$residuals) hist(rnorm(41, 0,sigma)) #PREDICTION #For E[Y|X] predict(lm.fit,newdata=data.frame(X=c(15,20)), interval = "confidence",level=.95) #For Y|X predict(lm.fit,newdata=data.frame(X=c(15,20)), interval = "prediction") #REGRESSION MODEL #Y = alpha + X*beta + epsilon #epsilon ~ N(0, sigma2) tries = 200 tmp = NULL for(i in 1:tries) { # X = rnorm(n, 6, 2) Y = alpha + beta *X + rnorm(n, 0, sigma) regression = lm(Y~X) tmp = rbind(tmp,coef(regression)) plot(X,Y, xlim=c(0,25), ylim=c(0,15), lwd=3) if(i > 1) for(j in 1:(i-1)) abline(tmp[j,], col='pink', lwd = 3) abline(c(alpha, beta), col='blue', lwd = 3) points(X,Y, xlim=c(0,20), ylim=c(0,10), lwd=3) readline() abline(tmp[i,], col='red', lwd = 3) readline() abline(tmp[i,], col='pink', lwd = 3) readline() } #TRANSFORMATIONS... #y: readership, x: maps distributed (thousands, both) y=c(.6,6.7,5.3,4,6.55,2.15,6.6,5.75) x=c(80,220,140,120,180,100,200,160) cor(x,y) lm.fit=lm(y~x) abline(-1.8, .043) plot(x,y) abline(lm.fit$coefficients[1],lm.fit$coefficients[2]) plot(x,y) plot(x,lm.fit$residuals) abline(0,0) ox=1/(x) xt2=x^2 xt.5=x^.5 lx=log(x) plot(lx,(y)) plot(ox,(y)) plot(xt2,(y)) lm.fit=lm(y~x+xt2) plot(lm.fit$residuals) lm.fit=lm(y~lx) plot(lx,y) plot(lm.fit$residuals) plot(lm.fit$fitted.values,lm.fit$residuals) #y: plasma level, age: plasma=read.table('~/Desktop/TEACHING_summer2010/plasma.txt', header=TRUE) x=plasma[,1] y=plasma[,2] plot(x,y) lm.fit=lm(I(y)~x) abline(13.5,-2.2) plot(x,lm.fit$residuals) abline(0,0) hist(lm.fit$residuals) plot(x,log(y)) plot(x,log(y)) lm.fit=lm(I(log(y))~x) abline(2.6,-.23) plot(x,lm.fit$residuals) abline(0,0) abline(13.5,-2.2) plot(lm.fit$fitted.values,lm.fit$residuals) abline(0,0) #y: telephone usage by time in coverage: tele=read.table('~/Desktop/TEACHING_summer2010/tele.txt',sep=',',header=TRUE) plot(tele$months,tele$aveCALLS) lm.fit=lm(tele$aveCALLS~tele$months+I(tele$months^2)) abline(13.7,.75) points(10:30,(10:30)*2.3-.04*(10:30)^2,type='l') plot(lm.fit$residuals) plot(lm.fit$fitted.values, lm.fit$residuals) plot(lm.fit$residuals) abline(0,0) hist(lm.fit$residuals) #predict studio ability dwaine=read.table('~/Desktop/TEACHING_summer2010/dwaine.txt', header=TRUE) #collinearity #multiple regression... control confounding... prediction names(dwaine) dwaine$targPOP dwaine$dispoINC dwaine$sales plot(dwaine$dispoINC,dwaine$sales) plot(dwaine$targPOP,dwaine$sales) summary(lm(sales~dispoINC,data=dwaine)) summary(lm(sales~targPOP,data=dwaine)) lm.fit=(lm(sales~targPOP+dispoINC,data=dwaine)) plot(lm.fit$fitted.values,dwaine$sales) abline(0,1) plot(dwaine$targPOP,dwaine$dispoINC) cor(dwaine$targPOP,dwaine$dispoINC) patient=read.table('~/Desktop/TEACHING_summer2010/patient.txt', header=TRUE) names(patient) #collinearity bodyfat=read.table('~/Desktop/TEACHING_summer2010/bodyfat.txt', header=TRUE) names(patient) soap=read.table('~/Desktop/TEACHING_summer2010/soap.txt', header=TRUE) names(soap) mech=read.table('~/Desktop/TEACHING_summer2010/a_nova.txt', header=TRUE) names(mech) summary(lm(items~m,data=mech))