rm(list = ls())
library(MASS)
library(tree)
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston)/2)
tree.boston <- tree(medv ~ ., data = Boston, subset = train)
summary(tree.boston)
##
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "rm" "lstat" "crim" "age"
## Number of terminal nodes: 7
## Residual mean deviance: 10.38 = 2555 / 246
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -10.1800 -1.7770 -0.1775 0.0000 1.9230 16.5800
plot(tree.boston)
text(tree.boston, pretty = 0)
cv.boston <- cv.tree(tree.boston)
cv.boston
## $size
## [1] 7 6 5 4 3 2 1
##
## $dev
## [1] 4380.849 4544.815 5601.055 6171.917 6919.608 10419.472 19630.870
##
## $k
## [1] -Inf 203.9641 637.2707 796.1207 1106.4931 3424.7810 10724.5951
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.boston$size, cv.boston$dev, type = "b")
prune.boston <- prune.tree(tree.boston, best = 7)
plot(prune.boston)
text(prune.boston, pretty = 0)
yhat <- predict(tree.boston, newdata = Boston[-train, ])
boston.test <- Boston[-train, "medv"]
plot(yhat, boston.test)
abline(0,1)
mean((yhat - boston.test)^2)
## [1] 35.28688
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag.boston <- randomForest(medv ~ ., data = Boston,
                           subset = train, mtry = 13, importance = TRUE)
bag.boston
##
## Call:
## randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 11.39601
## % Var explained: 85.17
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
plot(yhat.bag, boston.test)
abline(0, 1)
mean((yhat.bag - boston.test)^2)
## [1] 23.59273
bag.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                           mtry = 13, ntree = 10)
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
## [1] 24.87671
With a smaller number of trees \(f_1, f_2, \ldots, f_B\), we cannot ensure that every training observation appears in enough bootstrap samples to be fitted at least a few times, which is why the test MSE is slightly higher here.
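As a rough check (a sketch of my own, not part of the original lab; it reuses the train index and boston.test defined above), we can refit the bagged model for a few values of ntree and compare the resulting test MSEs:
set.seed(1)
ntree.grid <- c(10, 25, 100, 500)
bag.mse <- sapply(ntree.grid, function(B) {
  # refit the bagged model with B trees and compute its test MSE
  fit <- randomForest(medv ~ ., data = Boston, subset = train,
                      mtry = 13, ntree = B)
  pred <- predict(fit, newdata = Boston[-train, ])
  mean((pred - boston.test)^2)
})
plot(ntree.grid, bag.mse, type = "b",
     xlab = "Number of trees", ylab = "Test MSE")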
Building a random forest proceeds in the same way, except that we use a smaller value for the mtry argument. Here we use mtry = 6.
set.seed(1)
rf.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 6, importance = TRUE)
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
## [1] 19.62021
importance(rf.boston)
## %IncMSE IncNodePurity
## crim 16.697017 1076.08786
## zn 3.625784 88.35342
## indus 4.968621 609.53356
## chas 1.061432 52.21793
## nox 13.518179 709.87339
## rm 32.343305 7857.65451
## age 13.272498 612.21424
## dis 9.032477 714.94674
## rad 2.878434 95.80598
## tax 9.118801 364.92479
## ptratio 8.467062 823.93341
## black 7.579482 275.62272
## lstat 27.129817 6027.63740
varImpPlot(rf.boston)
library(gbm)
## Loaded gbm 2.1.5
set.seed(1)
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian",
                    n.trees = 5000,
                    interaction.depth = 4)
summary(boost.boston)
## var rel.inf
## rm rm 43.9919329
## lstat lstat 33.1216941
## crim crim 4.2604167
## dis dis 4.0111090
## nox nox 3.4353017
## black black 2.8267554
## age age 2.6113938
## ptratio ptratio 2.5403035
## tax tax 1.4565654
## indus indus 0.8008740
## rad rad 0.6546400
## zn zn 0.1446149
## chas chas 0.1443986
yhat.boost <- predict(boost.boston, newdata = Boston[-train, ],
                      n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 18.84709
par(mfrow = c(1, 2))
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian",
                    n.trees = 5000,
                    interaction.depth = 4,
                    shrinkage = 0.2,
                    verbose = FALSE)
yhat.boost <- predict(boost.boston, newdata = Boston[-train, ],
                      n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 18.33455
The default shrinkage parameter is \(\lambda = 0.001\), but this is easily modified. Here we take \(\lambda = 0.2\). In this case, using \(\lambda = 0.2\) leads to a slightly lower test MSE than \(\lambda = 0.001\).
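To see how sensitive the result is to \(\lambda\), one could sweep a small grid of shrinkage values; the sketch below is my own addition (the grid of values is arbitrary) and reuses the same train/test split:
set.seed(1)
shrink.grid <- c(0.001, 0.01, 0.1, 0.2)
boost.mse <- sapply(shrink.grid, function(lam) {
  # fit a boosted model with shrinkage lam and compute its test MSE
  fit <- gbm(medv ~ ., data = Boston[train, ],
             distribution = "gaussian",
             n.trees = 5000, interaction.depth = 4,
             shrinkage = lam, verbose = FALSE)
  pred <- predict(fit, newdata = Boston[-train, ], n.trees = 5000)
  mean((pred - boston.test)^2)
})
plot(shrink.grid, boost.mse, type = "b", log = "x",
     xlab = "Shrinkage", ylab = "Test MSE")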