# Read in pollution data poll <- read.table("poll.dat", head=T, skip=20); par(mfrow=c(2,2),bty='n') # Four Plots on one page plot( poll$wind, poll$oxid ); plot( poll$temp, poll$oxid ); plot( poll$humi, poll$oxid ); plot( poll$inso, poll$oxid ); cor( poll$wind, poll$oxid ) cor( poll$temp, poll$oxid ) cor( poll$humi, poll$oxid ) cor( poll$inso, poll$oxid ) options(contrasts=c("contr.treatment")); ########################################################################### # explore regressions on each individual meteorological variable # -- look at R^2 measure of "fit": Highest correlation <-> highest R^2 summary( lm( oxid~wind, data=poll) )$r.squared summary( lm( oxid~temp, data=poll) )$r.squared summary( lm( oxid~humi, data=poll) )$r.squared summary( lm( oxid~inso, data=poll) )$r.squared ########################################################################### # explore regression on temperature fit <- lm( oxid~temp, data=poll ); print(fit); summary(fit); # Note "Residual Standard Error" is 3.996507 on df=28 # plot fitted least square line par(mfrow=c(1),bty='n') # Four Plots on one page plot( poll$temp, poll$oxid ) lines( poll$temp, fit$fitted.values, col=2) # messy as temp is not ordered: redo: i <- order(poll$temp); lines( poll$temp[i], fit$fitted.values[i], col=2); ########################################################################### # now predictions .. include exact central probability intervals at each x qt( 0.975, 30-2); newtemp <- 65:95 # range of TEMP values to predict OXID at pre <- predict.lm(fit, data.frame(temp=newtemp) , se.fit=T) plot( poll$temp, poll$oxid , xlim=c(65,92), ylim=c(0,32)) rse <- 3.997^2; # Residual SE from summary(fit) Yhat <- pre$fit; Ysd <- sqrt(rse+pre$se.fit^2); # mean and S.E. of predictive distns lines(newtemp,Yhat, col=2); lines(newtemp,Yhat+2.048*Ysd,col=3); # Upper 95% prediction envelope lines(newtemp,Yhat-2.048*Ysd,col=3); # Lower 95% prediction envelope ########################################################################### # Now, compare predictive distributions at 3 values of TEMP, and # also with predictive based on OXID as a normal random sample # Set up range for plots ... y <- seq(0,32,length=500) # Plot data and predictive line for reference hist(poll$oxid, nclass=15, prob=T, xlim=c(0,32), ylim=c(0,0.2), ylab="Predictive", xlab="oxid") # Then, first, the linear regression prediction at TEMP=65 degrees j <- 1; poll$temp[j]; m <- Yhat[j]; m; v <- Ysd[j]; v; lines(y, dt((y-m)/v, 28)/v); # Second, at TEMP=79 j <- 15; poll$temp[j]; m <- Yhat[j]; m; v <- Ysd[j]; v; lines(y,dt((y-m)/v,28)/v, col=2); # And third at TEMP=94 j <- 30 poll$temp[j] m <- Yhat[j]; m; v <- Ysd[j]; v; lines(y,dt((y-m)/v,28)/v, col=3); # Finally, compare the predictive from the normal random sample model: m <- mean(poll$oxid); # For normal sampling model, v <- sqrt(var(poll$oxid)*(1+1/n)) # scale factor of predictive t distn lines(y,dt((y-m)/v,29)/v, col=1, lty=2) # Student t density function overlay ########################################################################### # Exceedance levels: What's the chance of OXID>17.5 as a function of TEMP? plot(newtemp, 1-pt((17.5-Yhat)/Ysd,28), type="l", ylab="exceedance prob"); # .. and of OXID>19.5? lines(newtemp, 1-pt((19.5-Yhat)/Ysd,28), col=2); # .. and of OXID>21? lines(newtemp, 1-pt((21-Yhat)/Ysd,28), col=3); ########################################################################### # plot vs two variables -- spinning version spin( matrix(c(poll$oxid,poll$temp,poll$wind),30,3), collab=c("oxid","temp","wind"), highlight=rep(T,30));