# S-Plus code for beginning analysis of the Mercedes data used car prices merc <- read.table("mercedes") dim(merc); i <- order(merc[,5]); merc <- merc[i,] # Sort in order of mileage y <- log(merc[,2]) # Why *log* price? (NB: # not $) mod <- merc[,3] # Model: 0=500, 1=450, ..., 4=200 age <- merc[,4] # Age in units of 6-mo mile <- merc[,5] # Mileage in thousands vend <- merc[,6] # Four different vendors Mod <- as.factor(mod) # Treat as category, not number options(contrasts=c("contr.treatment")) plot( mile, y, xlab="Miles",ylab="Log Price",type='n') for (j in 0:4) { ok <- mod==j; points(mile[ok], y[ok], pch=paste(j), col=j+1); fit <- lm(y[ok]~mile[ok]); abline(fit, col=j+1); } # ------------ fit <- lm( y~ Mod + mile , x=T) names(fit) print(fit) summary(fit) plot( mile, y, xlab="Miles",ylab="Log Price",type='n') for (j in 0:4) { ok <- mod==j; points(mile[ok],y[ok],pch=paste(j),col=j+1); lines(mile[ok],fit$fitted.values[ok],col=j+1 ); } exp(-0.011) exp(-0.011+c(-1.96,1.96)*0.0017) # decrease in price per 1000 miles # now predictions .. pre <- predict.lm(fit,se.fit=T) plot( mile, y, xlab="Miles",ylab="Log Price",type='n') for (j in 0:4) { ok<-mod==j; points(mile[ok],y[ok],pch=paste(j),col=j+1) lines(mile[ok],pre$fit[ok],col=j+1) } # .. and again with pointwise 95% intervals ... # compute mean and S.E. of predictive distns ss <- 0.136^2; haty <- pre$fit; vy <- sqrt(ss+pre$se.fit^2) # and plot .... par(mfrow=c(3,2)) for (j in 0:4) { plot( mile, y, xlab="Miles",ylab="Log Price",type='n') ok<-mod==j; points(mile[ok],y[ok],pch=paste(j),col=j+1) lines(mile[ok],pre$fit[ok],col=j+1) lines(mile[ok],pre$fit[ok]+2*vy[ok],col=j+1,lty=j+1) lines(mile[ok],pre$fit[ok]-2*vy[ok],col=j+1,lty=j+1) } par(mfrow=c(1,1)) # on price scale, rather than log(price) scale plot( mile, exp(y), xlab="Miles",ylab="Log Price",type='n') for (j in 0:4) { ok<-mod==j; points(mile[ok],exp(y)[ok],pch=paste(j),col=j+1) lines(mile[ok],exp(pre$fit)[ok],col=j+1) } # compute and plot standardised residuals -- e <- fit$residuals/sqrt(var(fit$residuals)) qqnorm(e); abline(0,1) source("qqbayes.ssc"); qqbayes(e) plot( mile, fit$residuals, xlab="Miles",ylab="Residuals",type='n') for (j in 0:4) { ok<-mod==j; points(mile[ok],fit$residuals[ok],pch=paste(j),col=j+1) } abline(0,0) plot( age, fit$residuals, xlab="Age",ylab="Residuals",type='n') for (j in 0:4) { ok<-mod==j; points(age[ok],fit$residuals[ok],pch=paste(j),col=j+1) } abline(0,0) plot( vend, fit$residuals, xlab="Vendor",ylab="Residuals",type='n') for (j in 0:4) { ok<-mod==j; points(vend[ok],fit$residuals[ok],pch=paste(j),col=j+1) } abline(0,0) # ------------------------------------------------------------------------ # different slopes with miles ? newfit <- lm( y ~ Mod+mile+Mod/mile) # Try also: y ~ Mod/mile summary(newfit, corr=F) newpre <- predict.lm(newfit,se.fit=T) plot( mile, y, xlab="Miles",ylab="Log Price",type='n') for (j in 0:4) { ok<-mod==j; points(mile[ok],y[ok],pch=paste(j), col=j+1); lines(mile[ok],newpre$fit[ok],col=j+1); lines(mile[ok],newpre$fit[ok],col=j+1,lty=j+1); } # Subset F test -- are all "interactions" significant? print(fit); print(newfit) Qb <- 48*0.136^2; Qa <- 44*0.131^2; ssa <- 0.131^2; fobs <- (Qb-Qa)/(4*ssa) 1-pf(fobs,4,44) # doesn't look statistically significant overall # ... maybe treat Model 2 cars separate from the rest, and cater for the two # "special cases" ... high miles, but not so old #----------------------------------------------------------------------- # Check also age: plot( mile, age, xlab="Miles",ylab="Age",type='n') for (j in 0:4) { i <- mod==j; points(mile[i],age[i],pch=paste(j),col=j+1) } # compare models including Age variable rather than or with Miles # e.g., fit <- lm(y ~ Mod+age) summary(fit) # now maybe slopes are really not different ... pre <- predict.lm(fit,se.fit=T) plot( age, y, xlab="Age",ylab="Log Price",type='n') for (j in 0:4) { ok<-mod==j; points(age[ok],y[ok],pch=paste(j),col=j+1); lines(age[ok],pre$fit[ok],col=j+1); } # more models and comparisons ... summary( fit1 <- lm(y ~ Mod+age+Mod/age) , correlation=F) # subset F test for different slopes: Qb <- 48*0.1019^2; Qa <- 44*0.105^2; ssa <- 0.105^2; fobs <- (Qb-Qa)/(4*ssa) 1-pf(fobs,4,44) summary( fit2 <- lm(y ~ Mod+mile+age) , correlation=F) cor(mile,age) # mile and age say slightly different things: see the two "special cases" # and more ... summary( fit3 <- lm(y ~ Mod+mile+age+Mod/mile), correlation=F ) summary( fit4 <- lm(y ~ Mod+mile+age + Mod/(mile + age)), correlation=F )