"Oracle Property". Phrases this cool don't come along often, so let me try to introduce it.
So, what does the lasso's penalty term actually do? The usual figure explaining shrinkage estimation under the L1 norm appears in plenty of textbooks, but it didn't really click for me the first time I saw it. I hope that comparing lasso against OLS in R, while actually varying the conditions, helps make it concrete.
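Before the code, it helps to see where the shrinkage comes from. Under an orthonormal design the lasso has a closed-form solution: soft-thresholding of the OLS estimate. A minimal sketch of that operator (my own illustration, not part of the talk):

# Soft-thresholding: the closed-form lasso solution for a single
# coefficient under an orthonormal design.
# S(z, lambda) = sign(z) * max(|z| - lambda, 0)
soft_threshold <- function(z, lambda) sign(z) * pmax(abs(z) - lambda, 0)

# Small OLS estimates are snapped exactly to zero (variable selection);
# large ones are shifted toward zero by lambda (i.e. they stay biased).
z <- seq(-3, 3, by = 0.1)
plot(z, soft_threshold(z, lambda = 1), type = "l", lwd = 2,
     xlab = "OLS estimate", ylab = "lasso estimate")
abline(0, 1, lty = 2)  # 45-degree line = OLS (no shrinkage)

That constant bias on the large coefficients is exactly what the adaptive lasso, which appears below, is designed to remove.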
(Slides from the talk: www.slideshare.net)
The code used this time follows. The foreach blocks in the second half take quite a while, so either reduce the number of iterations or parallelize them (a sketch for the latter follows the package loads below).
require(dplyr)
require(foreach)
require(glmnet)
require(msgps)
require(hdm)
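If you do parallelize, registering a backend and switching %do% to %dopar% should be enough. A sketch assuming the doParallel package (not used in the runs below):

# Optional: register a parallel backend so foreach can use %dopar%.
# Remember to pass .packages = c("dplyr","glmnet","msgps","hdm") to
# foreach() so the workers can find the fitting functions.
require(doParallel)
cl <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl)
# ... replace %do% with %dopar% in the foreach blocks below ...
# stopCluster(cl)  # when finished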
# Generate toy data: n observations of p predictors, where only the last
# length(s) coefficients are non-zero (set to the values in s).
Data_with <- function(n, p, s, silent=TRUE){
  if(!silent){
    cat(paste0(n, " obs. of ", p, " variables.\n",
               "Last ", length(s), " variables are non-zero."))
  }
  beta <- c(rep(0, p-length(s)), s)   # true coefficient vector
  X <- matrix(rnorm(n*p), ncol=p)     # iid standard-normal design
  res <- list(
    X = X,
    y = 0 + X %*% beta + rnorm(n),    # linear model with N(0,1) noise
    beta = beta,
    non.zero.coeff = which(abs(beta) > 1e-8),  # abs(): also works for negative signals
    non.zero.value = s
  )
  if(!silent){
    str(res)
  }
  invisible(res)
}
# Three true signals with very different magnitudes: 1, 10, 100.
test_data <- Data_with(n=10^3, p=10, s= c(1,10,100))
str(test_data,max.level = 2)
attach(test_data)
system.time( lm.reg <- lm(y~X) )
summary(lm.reg)
system.time( lasso_cv <- cv.glmnet(x = X, y=y, alpha = 1) )
coef(lasso_cv, s="lambda.1se")
# Same setup, but now all three signals are equally strong.
test_data <- Data_with(n=10^3, p=10, s= c(100,100,100))
str(test_data,max.level = 2)
detach(test_data)  # drop the previously attached copy before re-attaching
attach(test_data)
system.time( lm.reg <- lm(y~X) )
summary(lm.reg)
system.time( lasso_cv <- cv.glmnet(x = X, y=y, alpha = 1) )
par(mfrow=c(1,2))
plot(lasso_cv)
abline(v=log(lasso_cv$lambda.1se), lty=2, col="blue")
lasso.reg <- glmnet(x = X, y=y, alpha = 1)
plot(lasso.reg, xvar="lambda", label=TRUE)
abline(v=log(lasso_cv$lambda.1se), lty=2, col="blue")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))
par(mfrow=c(1,1))
# Non-zero coefficients as a data frame; row i=1 of the sparse matrix is
# the intercept, so beta = i-1 is the column index in X.
fit.coeff <- coef(lasso_cv, s="lambda.1se") %>% summary %>% transmute(beta=i-1, Var=x)
coef(lasso_cv, s="lambda.1se")
fit.coeff %>% str
# Adaptive lasso path via msgps; gamma is the exponent of the adaptive weights.
system.time( alasso.reg <- msgps(X= X, y=c(y), penalty="alasso", gamma=1, lambda=0) )
par(mfrow=c(2,2))
plot(alasso.reg, stand.coef=FALSE, criterion="cp", main="Mallows' Cp")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))
plot(alasso.reg, stand.coef=FALSE, criterion="aicc",main="bias-corrected AIC (AICc)")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))
plot(alasso.reg, stand.coef=FALSE, criterion="gcv", main="generalized cross validation (GCV) ")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))
plot(alasso.reg, stand.coef=FALSE, criterion="bic", main="BIC")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))
par(mfrow=c(1,1))
summary(alasso.reg)
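msgps computes the whole adaptive lasso path, but the idea itself fits in a few lines: run a first-stage estimator, then penalize each variable in inverse proportion to its first-stage coefficient, so variables that look like real signals are shrunk less. A rough equivalent using glmnet's penalty.factor argument (a sketch for intuition only; this is not what msgps does internally):

# First stage: OLS gives initial estimates of each coefficient.
beta_init <- coef(lm(c(y) ~ X))[-1]

# Second stage: lasso with per-variable weights 1/|beta_init|^gamma.
# Large first-stage coefficients get small weights, i.e. little shrinkage;
# near-zero ones get huge weights and are knocked out. This differential
# treatment is what drives the oracle property.
gamma <- 1
w <- 1 / abs(beta_init)^gamma
alasso_cv <- cv.glmnet(x = X, y = y, alpha = 1, penalty.factor = w)
coef(alasso_cv, s = "lambda.1se")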
# Sweep the sample size n (p fixed at 10, three true signals of size 10),
# with 10 replications per n.
sample_sizes <- c(5:20*10, 400, 600, 800, 1000, 2000, 3000, 4000)
itr <- 1:10
conditions <- expand.grid(ss=sample_sizes, itr=itr) %>% arrange(ss)
# For each condition: generate fresh data and record the SSE of the
# coefficient estimates for OLS, CV-lasso, and adaptive lasso.
sse.sampsi <- foreach(Ci=1:NROW(conditions), .combine = rbind) %do% {
cond <- conditions[Ci,]
this_data <- Data_with(n=cond$ss, p=10, s= c(10,10,10), silent=TRUE)
beta <- this_data$beta
fit.lm <- lm(this_data$y~this_data$X)
sse.lm <- sum( (beta - coef(fit.lm)[-1])^2 )
fit.lasso_cv <- cv.glmnet(x = this_data$X, y=this_data$y, alpha = 1)
# Lasso (lambda.1se): rebuild the full-length coefficient vector, so that
# true signals the lasso dropped to zero also count toward the SSE.
coeff.lasso_cv <- coef(fit.lasso_cv, s="lambda.1se") %>%
  summary %>%
  transmute(beta=i-1, Var=x)
est.lasso <- rep(0, length(beta))
est.lasso[coeff.lasso_cv$beta[-1]] <- coeff.lasso_cv$Var[-1]
sse.lasso_cv <- sum( (beta - est.lasso)^2 )
fit.alasso <- msgps(X= this_data$X, y=c(this_data$y),
penalty="alasso", gamma=1, lambda=0)
coeff.alasso <- coef(fit.alasso) %>%
data.frame %>%
select(BIC) %>%
unlist
sse.alasso <- sum( (this_data$beta - coeff.alasso[-1])^2 )
data.frame(sample.size=cond$ss, lm=sse.lm, lasso=sse.lasso_cv, alasso=sse.alasso)
}
sse.summ.sampsi <- sse.sampsi %>%
  group_by(sample.size) %>%
  summarize(across(everything(), mean))  # mean SSE per sample size
sse.summ.sampsi
with(sse.summ.sampsi,{
plot (lm ~sample.size, type="l", lwd=2, col="green", ylim=c(0,0.35), ylab="mean SSE", log="x")
lines(lasso~sample.size, type="l", lwd=2, col="red")
legend("topright", legend = c("lm","lasso"), col = c("green","red"), lty = 1, lwd=2)
})
with(sse.summ.sampsi,{
plot (lm ~sample.size, type="l", lwd=2, col="green", ylim=c(0,0.35), ylab="mean SSE", log="x")
lines(lasso ~sample.size, type="l", lwd=2, col="red")
lines(alasso~sample.size, type="l", lwd=2, col="blue")
legend("topright", legend = c("lm","lasso","a-lasso"), col = c("green","red","blue"), lty = 1, lwd=2)
})
# Rigorous lasso (hdm): the penalty level is chosen theoretically rather
# than by cross validation. The penalty/control lists below just spell
# out the documented defaults.
n <- length(y)
system.time(
  nonpost_lasso.reg <- rlasso(
    c(y) ~ X,
    post = FALSE,
    intercept = TRUE,
    penalty = list(homoscedastic = FALSE,
                   X.dependent.lambda = FALSE,
                   lambda.start = NULL,
                   c = 1.1,
                   gamma = 0.1/log(n)),
    control = list(numIter = 15,
                   tol = 10^-5,
                   threshold = NULL)
  )
)
# Post-lasso: same penalty choice, but refit OLS on the selected variables.
system.time(
  post_lasso.reg <- rlasso(
    c(y) ~ X,
    post = TRUE,
    intercept = TRUE,
    penalty = list(homoscedastic = FALSE,
                   X.dependent.lambda = FALSE,
                   lambda.start = NULL,
                   c = 1.1,
                   gamma = 0.1/log(n)),
    control = list(numIter = 15,
                   tol = 10^-5,
                   threshold = NULL)
  )
)
summary(nonpost_lasso.reg, all=FALSE)
summary(post_lasso.reg, all=FALSE)
coef(post_lasso.reg)
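For reference, post-lasso is just a two-step recipe: use the lasso only to select variables, then refit plain OLS on the selected columns so the surviving coefficients lose their shrinkage bias. Hand-rolled, the idea looks like this (rlasso additionally uses its own "rigorous" penalty level, so this is only conceptually equivalent):

# Step 1: variable selection by lasso (drop the intercept row).
selected <- which(as.vector(coef(lasso_cv, s = "lambda.1se"))[-1] != 0)

# Step 2: unpenalized OLS refit on the selected columns only.
post_fit <- lm(c(y) ~ X[, selected, drop = FALSE])
coef(post_fit)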
# Now fix n = 1000 and sweep the dimension p up to 20,000 (p >> n at the end),
# with 30 replications per p.
dim_sizes <- c(1:10*10, 1:5*200, 2:5*1000, 1e4, 2e4)
itr <- 1:30
conditions <- expand.grid(ds=dim_sizes, itr=itr) %>% arrange(ds)
sse.dim_sizes <- foreach(Ci=1:NROW(conditions), .combine = rbind) %do% {
cond <- conditions[Ci,]
this_data <- Data_with(n=1000, p=cond$ds, s= c(10,10,10), silent=TRUE)
beta <- this_data$beta
# OLS: once p approaches n the fit degenerates and coefficients become NA,
# so sse.lm is NA there (which is the point of this comparison).
fit.lm <- lm(this_data$y~this_data$X)
sse.lm <- sum( (beta - coef(fit.lm)[-1])^2 )
fit.lasso_cv <- cv.glmnet(x = this_data$X, y=this_data$y, alpha = 1)
# As above: rebuild the full coefficient vector so that dropped true
# signals also count toward the SSE.
coeff.lasso_cv <- coef(fit.lasso_cv, s="lambda.1se") %>%
  summary %>%
  transmute(beta=i-1, Var=x)
est.lasso <- rep(0, length(beta))
est.lasso[coeff.lasso_cv$beta[-1]] <- coeff.lasso_cv$Var[-1]
sse.lasso_cv <- sum( (beta - est.lasso)^2 )
fit.alasso <- msgps(X= this_data$X, y=c(this_data$y), penalty="alasso", gamma=1)
coeff.alasso <- coef(fit.alasso) %>%
data.frame %>%
select(BIC) %>%
unlist
sse.alasso <- sum( (this_data$beta - coeff.alasso[-1])^2 )
plasso.reg <- rlasso(c(this_data$y) ~ this_data$X, post=TRUE, intercept=TRUE)
coeff.plasso <- coef(plasso.reg)
sse.plasso <- sum( (this_data$beta - coeff.plasso[-1])^2 )  # [-1] drops the intercept
data.frame(dim.size=cond$ds,
lm=sse.lm, lasso=sse.lasso_cv, alasso=sse.alasso, plasso=sse.plasso)
}
sse.summ.dim_sizes <- sse.dim_sizes %>%
  group_by(dim.size) %>%
  summarize(across(everything(), mean))
sse.summ.dim_sizes
with(sse.summ.dim_sizes,{
plot( lm ~dim.size, type="l", lwd=2, col="green", ylab="mean SSE", log="x")
lines(lasso ~dim.size, type="l", lwd=2, col="red")
lines(alasso~dim.size, type="l", lwd=2, col="blue")
legend("topright", legend = c("lm","lasso","a-lasso"),
col = c("green","red","blue"), lty = 1, lwd=2)
abline(v=1000, lty=3)  # p = n
})
with(sse.summ.dim_sizes,{
plot(lasso~dim.size, type="l", lwd=2, col="red", ylim=c(0,0.1), ylab="mean SSE",log="x")
lines(alasso~dim.size, type="l", lwd=2, col="blue")
legend("topleft", legend = c("lasso","a-lasso"), col = c("red","blue"), lty = 1, lwd=2)
abline(v=1000, lty=3)
})
with(sse.summ.dim_sizes,{
plot(lasso~dim.size, type="l", lwd=2, col="red", ylim=c(0,0.1), ylab="mean SSE",log="x")
lines(alasso~dim.size, type="l", lwd=2, col="blue")
lines(plasso~dim.size, type="l", lwd=2, col="orange")
legend("topleft", legend = c("lasso","a-lasso","post-lasso"),
col = c("red","blue","orange"), lty = 1, lwd=2)
abline(v=1000, lty=3)
})
# Timing check in the hardest setting: n = 1000, p = 20,000.
this_data <- Data_with(n=1000, p=2e4, s= rep(10,3),silent = FALSE)
system.time(
plasso.reg <- rlasso(c(this_data$y) ~ this_data$X, post=TRUE, intercept=TRUE)
)
system.time(
fit.alasso <- msgps(X= this_data$X, y=c(this_data$y), penalty="alasso", gamma=1)
)
Lessons learned. First: as cool as the phrase is, it did not land with the audience at all on the day (in fact, it fell completely flat); my presentation skills need work. Second: the introduction should have touched more on actual uses, like the payoff of compressed sensing or the interpretability you gain after sparse estimation. And with the final "rigorous lasso", computation time grows considerably, so does real-world work really need this much precision? At some point I would like to get into the cost-effectiveness and robustification discussion.