The "Oracle Property". There are not many phrases out there that sound this cool, so let me try to introduce it.
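Roughly, following Fan and Li (2001): an estimation procedure has the oracle property if it behaves asymptotically as though the true sparse model had been handed to it in advance, i.e.

$$
\Pr\big(\{j : \hat\beta_j \neq 0\} = \mathcal{A}\big) \to 1,
\qquad
\sqrt{n}\,\big(\hat\beta_{\mathcal{A}} - \beta_{\mathcal{A}}\big) \xrightarrow{d} N(0, \Sigma_{\mathcal{A}}),
$$

where $\mathcal{A} = \{j : \beta_j \neq 0\}$ is the true active set and $\Sigma_{\mathcal{A}}$ is the asymptotic covariance of the oracle estimator fit on $\mathcal{A}$ alone. The plain Lasso does not satisfy this in general; the adaptive Lasso (Zou 2006) does.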
So, what does the Lasso penalty term actually do? The figure explaining L1-norm shrinkage estimation appears in every textbook, but when I first saw it I could not develop any real intuition. I hope that actually comparing Lasso against OLS in R while varying the conditions will help.
Slides from the talk: www.slideshare.net
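As a warm-up, here is a minimal sketch of what the L1 penalty does to a single coefficient, under the simplifying assumption of an orthonormal design (the function name `soft_threshold` and the choice of lambda are mine): the Lasso solution is just the OLS estimate soft-thresholded by lambda, so small estimates are set exactly to zero and large ones are shrunk by a constant.

```r
# Minimal sketch (assumption: orthonormal design). The L1 penalty
# soft-thresholds the OLS estimate: coefficients below lambda in absolute
# value become exactly zero, larger ones are shifted toward zero by lambda.
soft_threshold <- function(b_ols, lambda) sign(b_ols) * pmax(abs(b_ols) - lambda, 0)

b_ols <- seq(-3, 3, by = 0.01)
plot(b_ols, soft_threshold(b_ols, lambda = 1), type = "l", lwd = 2, col = "red",
     xlab = "OLS estimate", ylab = "Lasso estimate")
abline(0, 1, lty = 2)  # OLS (no shrinkage) for reference
```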
The code used for this post is below. The foreach blocks in the second half take quite a while, so either reduce the number of iterations or parallelize them.
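If you do parallelize, a minimal sketch with doParallel might look like this (the worker count of 4 is arbitrary; `%dopar%` replaces `%do%` in the foreach calls below):

```r
# Sketch: parallelizing the simulation loops with doParallel.
#install.packages("doParallel")
require(doParallel)
cl <- makeCluster(4)      # arbitrary worker count; adjust to your machine
registerDoParallel(cl)
# ...then swap %do% for %dopar% and pass each worker the packages it needs:
# foreach(Ci = 1:NROW(conditions), .combine = rbind,
#         .packages = c("dplyr", "glmnet", "msgps", "hdm")) %dopar% { ... }
stopCluster(cl)
```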
```r
# comparison among lm, lasso, adaptive lasso, and rigorous lasso
require(dplyr)
require(foreach)
#install.packages("glmnet")
require(glmnet)
#install.packages("msgps")
require(msgps)
#install.packages("hdm")
require(hdm)

# functions ---------------------------------------------------------------

# generate data according to model: y ~ 0 + B_i*x_i + N(0,1)
Data_with <- function(n, p, s, silent=TRUE){
  # data with sample size (n), number of variables (p) and non-zero vector (s)
  if(! silent){
    cat(paste0(n, " obs. of ", p, " variables.\n",
               "Last ", length(s), " variables are non-zero."))
  }
  beta <- c(rep(0, p - length(s)), s)
  X <- matrix(rnorm(n*p), ncol=p)
  res <- list(
    X = X,
    y = 0 + X %*% beta + rnorm(n),
    beta = beta,
    non.zero.coeff = which(beta > 1e-8),
    non.zero.value = s
  )
  if(! silent){ str(res) }
  invisible(res)
}

# effect of shrinkage on estimates (lm vs lasso) ---------------------------
test_data <- Data_with(n=10^3, p=10, s=c(1,10,100))
str(test_data, max.level = 2)
attach(test_data)

system.time( lm.reg <- lm(y ~ X) )
summary(lm.reg)
system.time( lasso_cv <- cv.glmnet(x = X, y = y, alpha = 1) )
coef(lasso_cv, s="lambda.1se")

detach(test_data)  # avoid masking when attaching the next data set
test_data <- Data_with(n=10^3, p=10, s=c(100,100,100))
str(test_data, max.level = 2)
attach(test_data)

system.time( lm.reg <- lm(y ~ X) )
summary(lm.reg)
system.time( lasso_cv <- cv.glmnet(x = X, y = y, alpha = 1) )

par(mfrow=c(1,2))
plot(lasso_cv)
abline(v=log(lasso_cv$lambda.1se), lty=2, col="blue")

# plot solution path (Lasso) -----------------------------------------------
lasso.reg <- glmnet(x = X, y = y, alpha = 1)
plot(lasso.reg, xvar="lambda", label=TRUE)
abline(v=log(lasso_cv$lambda.1se), lty=2, col="blue")
abline(h=c(1,2,3), lty=2, col=c(1,2,3))
par(mfrow=c(1,1))

# map the sparse-matrix row index to a coefficient number (intercept = 0)
fit.coeff <- coef(lasso_cv, s="lambda.1se") %>% summary %>%
  transmute(beta = i - 1, Var = x)
coef(lasso_cv, s="lambda.1se")
fit.coeff %>% str

# using lasso with oracle property (adaptive lasso) -------------------------
system.time(
  alasso.reg <- msgps(X = X, y = c(y), penalty="alasso", gamma=1, lambda=0)
)
par(mfrow=c(2,2))
plot(alasso.reg, stand.coef=FALSE, criterion="cp",   main="Mallows' Cp")
abline(h=c(1,2,3), lty=2, col=c(1,2,3))
plot(alasso.reg, stand.coef=FALSE, criterion="aicc", main="bias-corrected AIC (AICc)")
abline(h=c(1,2,3), lty=2, col=c(1,2,3))
plot(alasso.reg, stand.coef=FALSE, criterion="gcv",  main="generalized cross validation (GCV)")
abline(h=c(1,2,3), lty=2, col=c(1,2,3))
plot(alasso.reg, stand.coef=FALSE, criterion="bic",  main="BIC")
abline(h=c(1,2,3), lty=2, col=c(1,2,3))
par(mfrow=c(1,1))
summary(alasso.reg)

# measure: estimate error vs sample size -------------
sample_sizes <- c(5:20*10, 400, 600, 800, 1000, 2000, 3000, 4000)
itr <- 1:10
conditions <- expand.grid(ss=sample_sizes, itr=itr) %>% arrange(ss)
sse.sampsi <- foreach(Ci=1:NROW(conditions), .combine = rbind) %do% {
  cond <- conditions[Ci,]
  this_data <- Data_with(n=cond$ss, p=10, s=c(10,10,10), silent=TRUE)
  beta <- this_data$beta

  fit.lm <- lm(this_data$y ~ this_data$X)
  sse.lm <- sum( (beta - coef(fit.lm)[-1])^2 )

  fit.lasso_cv <- cv.glmnet(x = this_data$X, y = this_data$y, alpha = 1)
  # full coefficient vector, zeros included; [-1] drops the intercept,
  # so coefficients shrunk to zero still count against the true beta
  coeff.lasso_cv <- as.vector(coef(fit.lasso_cv, s="lambda.1se"))[-1]
  sse.lasso_cv <- sum( (beta - coeff.lasso_cv)^2 )

  fit.alasso <- msgps(X = this_data$X, y = c(this_data$y),
                      penalty="alasso", gamma=1, lambda=0)
  coeff.alasso <- coef(fit.alasso) %>% data.frame %>% select(BIC) %>% unlist
  sse.alasso <- sum( (beta - coeff.alasso[-1])^2 )

  data.frame(sample.size=cond$ss, lm=sse.lm, lasso=sse.lasso_cv, alasso=sse.alasso)
}
sse.summ.sampsi <- sse.sampsi %>% group_by(sample.size) %>%
  summarize(across(everything(), mean))
sse.summ.sampsi

with(sse.summ.sampsi, {
  plot (lm    ~ sample.size, type="l", lwd=2, col="green",
        ylim=c(0,0.35), ylab="mean SSE", log="x")
  lines(lasso ~ sample.size, type="l", lwd=2, col="red")
  legend("topright", legend=c("lm","lasso"),
         col=c("green","red"), lty=1, lwd=2)
})
with(sse.summ.sampsi, {
  plot (lm     ~ sample.size, type="l", lwd=2, col="green",
        ylim=c(0,0.35), ylab="mean SSE", log="x")
  lines(lasso  ~ sample.size, type="l", lwd=2, col="red")
  lines(alasso ~ sample.size, type="l", lwd=2, col="blue")
  legend("topright", legend=c("lm","lasso","a-lasso"),
         col=c("green","red","blue"), lty=1, lwd=2)
})

# example(rlasso) ---------------------------------------------------------
# use Lasso with the rigorous penalty
n <- length(y)  # sample size, used by the default gamma below
system.time(
  nonpost_lasso.reg <- rlasso(
    y ~ X,
    post = FALSE,      # FALSE: plain Lasso, only the penalty is rigorous
    intercept = TRUE,  # TRUE: do not penalize the intercept term (beta_0)
    penalty = list(
      homoscedastic = FALSE,       # FALSE: do not assume homoscedastic errors
      X.dependent.lambda = FALSE,  # TRUE would select a data-dependent lambda
      lambda.start = NULL, c = 1.1, gamma = 0.1/log(n)),
    control = list(numIter = 15, tol = 10^-5, threshold = NULL))
)

# use Post-Lasso procedure with the rigorous penalty
system.time(
  post_lasso.reg <- rlasso(
    y ~ X,
    post = TRUE,       # TRUE: Post-Lasso on top of the rigorous penalty
    intercept = TRUE,  # TRUE: do not penalize the intercept term (beta_0)
    penalty = list(
      homoscedastic = FALSE,       # FALSE: do not assume homoscedastic errors
      X.dependent.lambda = FALSE,  # TRUE would select a data-dependent lambda
      lambda.start = NULL, c = 1.1, gamma = 0.1/log(n)),
    control = list(numIter = 15, tol = 10^-5, threshold = NULL))
)

# comparison of results (set all=FALSE to summarize selected variables only)
summary(nonpost_lasso.reg, all=FALSE)
summary(post_lasso.reg, all=FALSE)
coef(post_lasso.reg)

# measure: estimate error vs dimension size -------------
dim_sizes <- c(1:10*10, 1:5*200, 2:5*1000, 1e4, 2e4)
itr <- 1:30
conditions <- expand.grid(ds=dim_sizes, itr=itr) %>% arrange(ds)
sse.dim_sizes <- foreach(Ci=1:NROW(conditions), .combine = rbind) %do% {
  cond <- conditions[Ci,]
  this_data <- Data_with(n=1000, p=cond$ds, s=c(10,10,10), silent=TRUE)
  beta <- this_data$beta

  fit.lm <- lm(this_data$y ~ this_data$X)  # coefficients become NA once p > n
  sse.lm <- sum( (beta - coef(fit.lm)[-1])^2 )

  fit.lasso_cv <- cv.glmnet(x = this_data$X, y = this_data$y, alpha = 1)
  coeff.lasso_cv <- as.vector(coef(fit.lasso_cv, s="lambda.1se"))[-1]
  sse.lasso_cv <- sum( (beta - coeff.lasso_cv)^2 )

  fit.alasso <- msgps(X = this_data$X, y = c(this_data$y), penalty="alasso", gamma=1)
  coeff.alasso <- coef(fit.alasso) %>% data.frame %>% select(BIC) %>% unlist
  sse.alasso <- sum( (beta - coeff.alasso[-1])^2 )

  plasso.reg <- rlasso(c(this_data$y) ~ this_data$X, post=TRUE, intercept=TRUE)
  coeff.plasso <- coef(plasso.reg)
  sse.plasso <- sum( (beta - coeff.plasso[-1])^2 )  # [-1] drops the intercept

  data.frame(dim.size=cond$ds, lm=sse.lm, lasso=sse.lasso_cv,
             alasso=sse.alasso, plasso=sse.plasso)
}
sse.summ.dim_sizes <- sse.dim_sizes %>% group_by(dim.size) %>%
  summarize(across(everything(), mean))
sse.summ.dim_sizes

with(sse.summ.dim_sizes, {
  plot (lm     ~ dim.size, type="l", lwd=2, col="green", ylab="mean SSE", log="x")
  lines(lasso  ~ dim.size, type="l", lwd=2, col="red")
  lines(alasso ~ dim.size, type="l", lwd=2, col="blue")
  legend("topright", legend=c("lm","lasso","a-lasso"),
         col=c("green","red","blue"), lty=1, lwd=2)
  abline(v=1000, lty=3)
})
with(sse.summ.dim_sizes, {
  plot (lasso  ~ dim.size, type="l", lwd=2, col="red",
        ylim=c(0,0.1), ylab="mean SSE", log="x")
  lines(alasso ~ dim.size, type="l", lwd=2, col="blue")
  legend("topleft", legend=c("lasso","a-lasso"),
         col=c("red","blue"), lty=1, lwd=2)
  abline(v=1000, lty=3)
})
with(sse.summ.dim_sizes, {
  plot (lasso  ~ dim.size, type="l", lwd=2, col="red",
        ylim=c(0,0.1), ylab="mean SSE", log="x")
  lines(alasso ~ dim.size, type="l", lwd=2, col="blue")
  lines(plasso ~ dim.size, type="l", lwd=2, col="orange")
  legend("topleft", legend=c("lasso","a-lasso","post-lasso"),
         col=c("red","blue","orange"), lty=1, lwd=2)
  abline(v=1000, lty=3)
})

# measure: performance vs dimension size -------------
this_data <- Data_with(n=1000, p=2e4, s=rep(10,3), silent=FALSE)
system.time(
  plasso.reg <- rlasso(c(this_data$y) ~ this_data$X, post=TRUE, intercept=TRUE)
)
#coef(fit.lasso_cv)
system.time(
  fit.alasso <- msgps(X = this_data$X, y = c(this_data$y), penalty="alasso", gamma=1)
)
```
Lesson one: as cool as the phrase is, it did not land with the audience at all on the day (frankly, it bombed); my presentation skills need work. Lesson two: the introduction should have said more about actual uses, such as the appeal of compressed sensing or the interpretability you gain after sparse estimation. And since the final "rigorous lasso" comes with a noticeable increase in computation time, does a real-world application really need this much accuracy? At some point I would like to take up the cost-effectiveness and robustness questions.