# exercise 5
# part B
# take-away: the optimal value of lambda increases as the true signal gets sparser
n = 10000
sparsity = 0.02                        # probability that a signal component is nonzero
lambdas = seq(0.1, 3, 0.1)
# sparse signal theta (mostly zeros) and noisy observations z
theta = rbinom(n, 4, 0.5) * rbinom(n, 4, sparsity)
stdev = rep(1, n)
z = rnorm(n, mean=theta, sd=stdev)
# soft-thresholding estimator: shrink the observation towards zero by lambda
estimator_func <- function(zi, lambda) {
  return(sign(zi) * max(abs(zi) - lambda, 0))
}
mse_func <- function(lambda) {
  theta_hat = mapply(estimator_func, z, lambda)
  return(sum((theta_hat - theta) ^ 2) / length(theta))
}
mse = mapply(mse_func, lambdas)
plot(lambdas, mse)
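# Optional check (not part of the original exercise): report the MSE-minimizing
# lambda to make the take-away above concrete; per that take-away, re-running
# with a smaller `sparsity` value should push this optimum towards a larger lambda.
lambdas[which.min(mse)]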
# Lasso
# take-away: in-sample MSE increases as lambda increases; the fit gets worse.
library(glmnet)
setwd("~/Google Drive/University of Texas/SDS 385; Statistical Models for Big Data/code")
X = data.matrix(read.csv("../data/diabetesX.csv", header=TRUE), rownames.force=FALSE)
y = data.matrix(read.csv("../data/diabetesY.csv", header=FALSE), rownames.force=FALSE)
model = glmnet(X, y)          # fits the full lasso path over a grid of lambdas
plot(model, xvar="lambda")    # coefficient paths against log(lambda)
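# Side note (illustration only; the lambda value 1.0 is arbitrary): coefficients
# at a single point on the path can be pulled out with coef(); the result holds
# the intercept plus one row per predictor.
coef(model, s = 1.0)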
mse = rep(0, length(model$lambda))
for (i in seq_along(model$lambda)) {
  beta = as.matrix(model$beta[, i])
  # in-sample MSE; model$a0[i] is the fitted intercept for the i-th lambda
  mse[i] = sum((y - X %*% beta - model$a0[i]) ^ 2) / length(y)
}
plot(model$lambda, mse, log='x')
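# Cross-check (a sketch, not in the original script): predict() applies the fitted
# intercept and coefficients for the whole lambda path at once, so the hand-rolled
# in-sample MSE above can be verified in one line.
fitted_all = predict(model, newx=X)                     # n x length(model$lambda)
mse_check = colMeans((as.vector(y) - fitted_all) ^ 2)   # in-sample MSE per lambda
max(abs(mse - mse_check))                               # should be ~0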
# cross validation
# take-away: out-of-sample MSE has a non-zero optimal value of lambda
n = length(y)
ntrain = round(0.8 * n)
# simple 80/20 split on the rows as ordered in the file (not shuffled)
trainX = X[1:ntrain, ]
trainY = y[1:ntrain]
testX = X[(ntrain + 1):n, ]
testY = y[(ntrain + 1):n]
model = glmnet(trainX, trainY)
predictY = predict(model, testX)    # test-set predictions for every lambda in the path
mse = rep(0, length(model$lambda))
for (i in seq_along(model$lambda)) {
  y_hat = predictY[, i]
  # average over the test points only (dividing by n would understate the MSE)
  mse[i] = sum((testY - y_hat) ^ 2) / length(testY)
}
plot(model$lambda, mse, log='x')
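# For comparison (not in the original code): glmnet's built-in cv.glmnet runs
# k-fold cross validation over the same kind of lambda path and reports the
# error-minimizing value directly.
cvfit = cv.glmnet(trainX, trainY)
plot(cvfit)          # cross-validated error curve with standard-error bars
cvfit$lambda.min     # lambda minimizing the cross-validated MSE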
# Cp statistic
# take-away: this curve has the same overall shape as the cross-validation curve,
# but it is bumpier. Note that `model` here is still the fit from the training split.
cp = rep(0, length(model$lambda))
for (i in seq_along(model$lambda)) {
  beta = as.matrix(model$beta[, i])
  y_hat = X %*% beta + model$a0[i]          # fitted values (intercept included)
  residuals = as.vector(y - y_hat)
  in_mse = sum(residuals ^ 2) / n           # error term of Cp on the full data
  # penalty: 2 * df * sigma_hat^2 / n, with df = number of nonzero coefficients
  penalty = 2 * nnzero(beta) * var(residuals) / n
  cp[i] = in_mse + penalty
}
plot(model$lambda, cp, log='x')
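# Quick comparison (optional): the lambda chosen by the Cp statistic vs. the
# lambda chosen by the single-split out-of-sample MSE computed above.
model$lambda[which.min(cp)]
model$lambda[which.min(mse)]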