# One-time setup: uncomment to install the package that provides `Smarket`.
# install.packages("ISLR")

# NOTE: the workspace is deliberately NOT cleared here. `rm(list = ls())`
# silently destroys whatever the caller already had in the session; start a
# fresh R session instead when a clean environment is needed.
library(ISLR)
data(Smarket)

# First look at the data: column names, dimensions, types and summaries.
names(Smarket)
dim(Smarket)
str(Smarket)
summary(Smarket)

# Pairwise correlations of every column except `Direction`, which is excluded
# by name. The matrix is computed once and reused below.
smarket_cor <- cor(Smarket[, names(Smarket) != "Direction"])
smarket_cor

# Flag the pairs whose absolute correlation exceeds 0.5. Comparison operators
# are vectorised over matrices, so no apply() call is needed.
abs(smarket_cor) > 0.5
Volume 和 Year 之間有比較高的相關係數(絕對值大於 0.5),從下面的圖也可以看出這個趨勢:
# Plotting the data shows that Volume is increasing over time. In other words,
# the average number of shares traded daily increased from 2001 to 2005.
# par() returns the previous settings; keep them and restore them afterwards,
# otherwise every later plot in the script is squeezed into the 1 x 2 layout.
old_par <- par(mfrow = c(1, 2))
plot(Smarket$Volume, xlab = "Observation index", ylab = "Volume")
plot(Smarket$Year, Smarket$Volume, xlab = "Year", ylab = "Volume")
par(old_par)
$ P(Y=1 \mid X) = \frac{\exp(\beta_0+\beta_1 X_1+\cdots+\beta_p X_p)}{1+\exp(\beta_0+\beta_1 X_1+\cdots+\beta_p X_p)} $
The fitted values returned by predict() with type = "response" are probabilities of the market going up, rather than down: the contrasts() function shows that R has created a dummy variable with a 1 for Up.
# Inspect how the response is coded. For a binomial glm() the first factor
# level is treated as failure (0) and all other levels as success (1).
str(Smarket$Direction)
contrasts(Smarket$Direction)

# Logistic regression of Direction on Lag1-Lag5 and Volume, fitted on every
# row of Smarket.
glm.log <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = binomial,
  data = Smarket
)
summary(glm.log)
Lag1 的 p-value 最小,係數為 -0.073。
代表:若 $X_{Lag1} > 0$,模型估計的看漲機率 $P(Y=1|X)$ 會降低;不過這個 p-value(約 0.15)仍然偏大,並沒有明確的證據顯示 Lag1 與 Direction 有關。
# Fitted probabilities P(Y = 1 | X) for the rows the model was trained on;
# type = "response" asks predict() for probabilities.
glm.probs <- predict(glm.log, type = "response")
head(glm.probs)
plot(glm.probs)

# Label a row "Up" when its fitted probability exceeds 0.5, otherwise "Down".
# The labels are stored as a factor carrying the levels of Direction so that
# the confusion matrix stays 2 x 2 even if one class is never predicted.
res_fit <- factor(
  ifelse(glm.probs > 0.5, "Up", "Down"),
  levels = levels(Smarket$Direction)
)

# Confusion matrix and share of correctly classified rows (training data).
table(Fitted = res_fit, True = Smarket$Direction)
mean(res_fit == Smarket$Direction)
訓練資料上的正確率約 52.16%,似乎只比亂猜(50%)好一點;換句話說,training error = 100% − 52.16% = 47.84%。
上面的 model 是把全部資料都拿去 fitting;接著改成分 train(2005 年以前)與 test(2005 年)資料去做:
# Hold out the rows with Year >= 2005 as the test set; rows before 2005 are
# the training set.
train <- Smarket$Year < 2005
Smarket.2005 <- Smarket[!train, ]

# Logistic regression on Lag1-Lag5 and Volume, fitted on the training rows
# only (selected through `subset`).
glm.log_2005 <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket, family = binomial, subset = train
)
summary(glm.log_2005)

# Predicted probabilities for the held-out rows.
glm.probs_2005 <- predict(
  glm.log_2005,
  newdata = Smarket.2005, type = "response"
)

# Label a row "Up" when its probability exceeds 0.5, otherwise "Down". Using
# a factor with the levels of Direction keeps the confusion matrix 2 x 2 even
# if one class is never predicted.
res_fit_2005 <- factor(
  ifelse(glm.probs_2005 > 0.5, "Up", "Down"),
  levels = levels(Smarket.2005$Direction)
)

# Confusion matrix and accuracy on the held-out rows.
table(Fitted = res_fit_2005, True = Smarket.2005$Direction)
mean(res_fit_2005 == Smarket.2005$Direction)
結果在 testing data 上的正確率只有 48%,比亂猜(50%)還差 QQ
注意:model 裡各個解釋變數的 p-value 都很大,所以我們試著只用 Lag1、Lag2 去 fit model(之後的 LDA、QDA、KNN 也都只用這兩個變數):
# Reduced logistic regression using only Lag1 and Lag2, fitted on the
# training rows (selected through `subset`).
glm.log_2005_x1x2 <- glm(
  Direction ~ Lag1 + Lag2,
  data = Smarket, family = binomial, subset = train
)

# Predicted probabilities for the held-out rows in Smarket.2005.
glm.probs_2005_x1x2 <- predict(
  glm.log_2005_x1x2,
  newdata = Smarket.2005, type = "response"
)

# Label a row "Up" when its probability exceeds 0.5, otherwise "Down". Using
# a factor with the levels of Direction keeps the confusion matrix 2 x 2 even
# if one class is never predicted.
res_fit_2005_x1x2 <- factor(
  ifelse(glm.probs_2005_x1x2 > 0.5, "Up", "Down"),
  levels = levels(Smarket.2005$Direction)
)

# Confusion matrix and accuracy on the held-out rows.
table(Fitted = res_fit_2005_x1x2, True = Smarket.2005$Direction)
mean(res_fit_2005_x1x2 == Smarket.2005$Direction)
結果在 testing data 上的正確率提升到約 56%。
ls() # list the objects currently defined in the workspace
library(MASS)

# Linear discriminant analysis of Direction on Lag1 and Lag2, fitted on the
# training rows (selected through `subset`).
lda_mod <- lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
lda_mod

# Reproduce by hand the "Prior probabilities of groups" printed by lda():
# prop.table() turns the raw class counts into proportions, so the numbers
# are directly comparable with the printed priors.
prop.table(table(Smarket$Direction[train]))

# Reproduce by hand the "Group means": the mean of Lag1 and Lag2 within each
# class, over the training rows.
colMeans(Smarket[train & Smarket$Direction == "Down", c("Lag1", "Lag2")])
colMeans(Smarket[train & Smarket$Direction == "Up", c("Lag1", "Lag2")])
# Apply the fitted LDA model to the held-out rows in Smarket.2005 and look at
# the structure of what predict() returns.
lda.pred <- predict(lda_mod, newdata = Smarket.2005)
str(lda.pred)

# Predicted class labels, their confusion matrix against the observed
# Direction, and the share of correct predictions.
res_lda <- lda.pred[["class"]]
table(fitted = res_lda, Smarket.2005$Direction)
mean(res_lda == Smarket.2005$Direction)
Note: the LDA predictions are almost identical to those of the logistic regression fitted on Lag1 and Lag2.
# Summary of the posterior class probabilities returned by predict().
summary(lda.pred[["posterior"]])
# lda.pred$posterior

# Number of rows whose posterior in the second column is at least 0.5
# (author's note: = 76 + 106).
sum(lda.pred[["posterior"]][, 2] >= 0.5)
# Same count for the first column (author's note: = 35 + 35).
sum(lda.pred[["posterior"]][, 1] >= 0.5)
# Quadratic discriminant analysis of Direction on Lag1 and Lag2, fitted on
# the training rows (selected through `subset`).
qda_mod <- qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
qda_mod

# Apply the model to the held-out rows in Smarket.2005 and look at the
# structure of what predict() returns.
qda.pred <- predict(qda_mod, newdata = Smarket.2005)
str(qda.pred)

# Predicted class labels, their confusion matrix against the observed
# Direction, and the share of correct predictions.
res_qda <- qda.pred[["class"]]
table(fitted = res_qda, Smarket.2005$Direction)
mean(res_qda == Smarket.2005$Direction)
library(class)

# Inputs for knn(): Lag1/Lag2 of the training rows, Lag1/Lag2 of the held-out
# rows, and the training labels.
train_X <- Smarket[train, c("Lag1", "Lag2")]
test_X <- Smarket[!train, c("Lag1", "Lag2")]
train_Y <- Smarket$Direction[train]

# Fix the random seed once, before the first knn() call.
set.seed(1)

# k = 1: predictions, confusion matrix and accuracy on the held-out rows.
KNN_1 <- knn(train = train_X, test = test_X, cl = train_Y, k = 1)
str(KNN_1)
table(Fitted = KNN_1, True = Smarket.2005$Direction)
mean(KNN_1 == Smarket.2005$Direction)

# k = 3: same evaluation with three neighbours.
KNN_3 <- knn(train = train_X, test = test_X, cl = train_Y, k = 3)
str(KNN_3)
table(Fitted = KNN_3, True = Smarket.2005$Direction)
mean(KNN_3 == Smarket.2005$Direction)
| logistic | LDA | QDA | KNN (k = 3) |
|---|---|---|---|
| 56% | 56% | 60% | 54% |