### cpus data explorations
library(MASS) # library() stops with an error if the package is missing
names(cpus)
#> names(cpus)
#[1] "name"    "syct"    "mmin"    "mmax"    "cach"    "chmin"   "chmax"
#[8] "perf"    "estperf"

# Columns 2:8 hold the 6 continuous predictors followed by the response,
# perf (the performance); name and estperf are left out.
CPUs <- cpus[, 2:8]

# Turn each predictor into a 5-level factor: rank the values (ties are
# broken at random, so this step consumes random numbers) and cut the
# ranks into five intervals.
for (j in 1:6) {
  CPUs[[j]] <- cut(rank(CPUs[[j]], ties.method = "random"), 5)
}

fm <- lm(perf ~ ., CPUs)
# Box-Cox profile for the response over a narrow grid of lambda around 0;
# lambda = 0 is the log transformation, which the models further down in
# this file apply to perf.
boxcox(fm, lambda = seq(-0.15, 0.15, length.out = 10))



####

set.seed(38267251) # My phone number
cpus.samp <- sample(nrow(cpus), 100) # row indices of the 100 training cases

cpusTrain <- cpus[cpus.samp, 2:8] # omit name and manufacturer's estimate
cpusTest <- cpus[-cpus.samp, 2:8]

### orthodox linear models

cpus.l1 <- lm(log(perf) ~ ., cpusTrain)
cpus.lAIC <- stepAIC(cpus.l1, trace = FALSE)
# BIC uses penalty k = log(n), with n the number of training cases
cpus.lBIC <- stepAIC(cpus.l1, k = log(nrow(cpusTrain)), trace = FALSE)

####

testPred <- function(fit, data = cpusTest) {
  #
  # Root mean squared error, on the log(perf) scale, of the
  # predictions of a fitted model on the test data.
  #
  # fit:  a fitted model whose predict() method takes the new data as
  #       its second argument and returns predictions of log(perf)
  # data: data with a "perf" column plus whatever predictors `fit` needs
  #
  # Returns a single number: sqrt(mean((log(perf) - prediction)^2)).
  #
  testVals <- log(data[, "perf"])
  # pass `data` itself: `data[, ]` would collapse a one-column data frame
  predVals <- predict(fit, data)
  sqrt(sum((testVals - predVals)^2) / nrow(data))
}

# Test-set error (see testPred) for the full linear model and the two
# stepwise-selected models
testPred(cpus.l1)
testPred(cpus.lAIC)
testPred(cpus.lBIC)

####

library(rpart)
# Regression tree for log(perf) on the six predictors, fitted to the
# training data; minsplit = 3 lets nodes with as few as 3 cases be split
cpus.t1 <- rpart(log(perf) ~ syct + mmin + mmax + cach +
                chmin + chmax, cpusTrain, minsplit = 3)
testPred(cpus.t1)  # not good!

# Draw the tree and label its nodes
plot(cpus.t1)
text(cpus.t1)

# Print the fitted tree
cpus.t1

####

# Complexity-parameter plot for cpus.t1, used to pick a cp value for pruning
plotcp(cpus.t1)

# Prune the full tree at cp = 0.019 and re-check the test-set error
cpus.t2 <- prune(cpus.t1, cp=0.019)
testPred(cpus.t2)

###

# Test-set predictions from the full and pruned trees, and their
# correlations with the observed log(perf)
py.tree <- predict(cpus.t1, cpusTest)
py.tree2 <- predict(cpus.t2, cpusTest)
cor(cbind(log(cpusTest$perf), py.tree, py.tree2))

###

# Draw and label the pruned tree
plot(cpus.t2)
text(cpus.t2)

###

# Observed vs predicted log(perf) on the test set, side by side for the
# full and pruned trees; square panels, red line is y = x
par(mfrow = c(1,2), pty = "s")
plot(log(cpusTest$perf), py.tree, asp = 1)
abline(0, 1, col = "red")
plot(log(cpusTest$perf), py.tree2, asp = 1)
abline(0, 1, col = "red")

###
simpleBagging <- local({
  # Bootstrap sample: nrow(dataFrame) rows drawn with replacement.
  # drop = FALSE keeps a one-column data frame a data frame.
  bsample <- function(dataFrame) {
    dataFrame[sample(nrow(dataFrame), replace = TRUE), , drop = FALSE]
  }

  # Refit a model on bootstrap samples of its data.
  #
  # object: a fitted model that update() can refit with a new `data`
  #         argument (used in this file with an rpart tree)
  # data:   data frame to resample; by default the `data` argument
  #         recorded in the call that produced `object` is evaluated
  # nBags:  number of bootstrap refits (0 gives an empty bag)
  # ...:    accepted but not used
  #
  # Returns a list of nBags refitted models carrying class "bagRpart".
  function(object,
           data = eval(object$call$data), nBags = 200, ...) {
    bagsFull <- vector("list", nBags)
    for (j in seq_len(nBags)) {
      # update() re-evaluates the modified call from this frame, where
      # both bsample and data are visible
      bagsFull[[j]] <- update(object, data = bsample(data))
    }
    oldClass(bagsFull) <- "bagRpart"
    bagsFull
  }
})


# predict() method for "bagRpart" objects: the average, over all models in
# the bag, of each model's predictions for newdata.
#
# object:  list of fitted models with class "bagRpart"
# newdata: data passed as `newdata` to every model's predict() method
# ...:     accepted for compatibility with the generic; not used
#
# Returns one averaged prediction per element predicted by each model.
predict.bagRpart <- function(object, newdata, ...) {
  if (length(object) == 0L) {
    stop("cannot predict from an empty bag of models", call. = FALSE)
  }
  preds <- lapply(object, predict, newdata = newdata)
  # One column per model. Building the matrix with cbind() keeps two
  # dimensions even when newdata has a single row, where sapply() would
  # collapse to a plain vector and rowMeans() would fail. c() flattens
  # matrix-valued predictions so they are averaged element by element.
  rowMeans(do.call(cbind, lapply(preds, c)))
}

###
# Bag the full tree using simpleBagging's default arguments, then
# compute its test-set error
cpus.bag <- simpleBagging(cpus.t1)
testPred(cpus.bag)  # bit better!

# Test-set predictions from the bagged trees, and correlations of the
# observed log(perf) with the bagged, full-tree and pruned-tree predictions
py.bag <- predict(cpus.bag, cpusTest)
cor(cbind(log(cpusTest$perf), py.bag, py.tree, py.tree2))


# 2 x 2 layout of square panels; frame() skips the first panel so it
# stays blank, then observed vs predicted for bagging, the full tree
# and the pruned tree (red line is y = x)
par(mfrow = c(2,2), pty = "s")
frame()
plot(log(cpusTest$perf), py.bag, asp = 1)
abline(0, 1, col = "red")
plot(log(cpusTest$perf), py.tree, asp = 1)
abline(0, 1, col = "red")
plot(log(cpusTest$perf), py.tree2, asp = 1)
abline(0, 1, col = "red")


###
library(randomForest) # library() stops with an error if the package is missing

# Random forest for log(perf) on all other columns of the training data
cpus.rf <- randomForest(log(perf) ~ ., cpusTrain)
testPred(cpus.rf)

###
# Test-set predictions from the forest, and correlations (to 4 decimals)
# between the observed log(perf) and every set of predictions
py.rf <- predict(cpus.rf, cpusTest)
round(cor(cbind(log(cpusTest$perf), py.tree, py.tree2,
                py.bag, py.rf)), 4)

###

# Observed vs predicted log(perf) on the test set, one square panel per
# method (forest, bagging, full tree, pruned tree); red line is y = x
par(mfrow = c(2, 2), pty = "s")
with(cpusTest, {
  plot(log(perf), py.rf, asp = 1)
  abline(0, 1, col = "red")
  plot(log(perf), py.bag, asp = 1)
  abline(0, 1, col = "red")
  plot(log(perf), py.tree, asp = 1)
  abline(0, 1, col = "red")
  plot(log(perf), py.tree2, asp = 1)
  abline(0, 1, col = "red")
})
