
Lasso and the Oracle Property

The "Oracle Property". I don't think there are many phrases out there as cool as this one, so let me try to introduce it.

So, what does the Lasso penalty term actually do? The figure explaining shrinkage estimation under the L1 norm appears in all sorts of textbooks, but when I first saw it I couldn't get a concrete feel for it. I hope that actually comparing Lasso with OLS in R while varying the conditions helps build that intuition.
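For reference, the Lasso estimate is the solution to the L1-penalized least-squares problem, written here with the 1/(2n) scaling that glmnet uses (this is textbook material, not anything specific to the code below):

$$\hat{\beta}^{\mathrm{lasso}} = \arg\min_{\beta}\; \frac{1}{2n}\lVert y - X\beta\rVert_2^2 + \lambda\lVert\beta\rVert_1$$

With λ = 0 this is just OLS; as λ increases, the coefficients are shrunk toward zero and some become exactly zero, which is what the solution path plotted in the code shows.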

www.slideshare.net


Here is the code used this time. The foreach blocks in the second half take quite a while, so please reduce the number of iterations or parallelize them.
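The foreach loops below are written with %do% (sequential). As a minimal sketch, assuming the doParallel package is installed (it is not part of the script below), they can be switched to parallel execution like this:

# register a parallel backend, then replace %do% with %dopar%
require(doParallel)
cl <- makeCluster(parallel::detectCores() - 1)   # leave one core free
registerDoParallel(cl)
# foreach(..., .combine = rbind, .packages = c("dplyr", "glmnet", "msgps", "hdm")) %dopar% { ... }
# (the .packages argument loads the required packages on each worker)
stopCluster(cl)                                  # release the workers when done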

# comparison among lm, lasso, adaptive lasso, and rigorous lasso

require(dplyr)
require(foreach)

#install.packages("glmnet")
require(glmnet)
#install.packages("msgps")
require(msgps)
#install.packages("hdm")
require(hdm)


# functions ---------------------------------------------------------------

# generate data according to model: y ~ 0 + B_i*x_i + N(0,1) 
Data_with <- function(n,p,s, silent=TRUE){
  # data with sample size(n), number of variables(p) and non-zero vector(s)
  if(! silent){
    cat(paste0(n, " obs. of ", p, " variables.\n",
              "Last ", length(s)," variables are non-zero."))
  }
  beta <- c(rep(0,p-length(s)),s)
  X <-  matrix(rnorm(n*p), ncol=p)
  
  res <-list(
    X = X,
    y = 0 + X%*%beta + rnorm(n),
    beta = beta,
    non.zero.coeff = which(abs(beta) > 1e-8),
    non.zero.value = s 
  )
  if(! silent){
    str(res)
  }
  invisible(res)
}


# effect of shrinkage on estimates (lm vs lasso) ----------------------------------

test_data <- Data_with(n=10^3, p=10, s= c(1,10,100))
str(test_data,max.level = 2)
attach(test_data)

system.time(  lm.reg <- lm(y~X)  )
summary(lm.reg)

system.time(  lasso_cv  <- cv.glmnet(x = X, y=y, alpha = 1)  )
coef(lasso_cv, s="lambda.1se")

test_data <- Data_with(n=10^3, p=10, s= c(100,100,100))
str(test_data,max.level = 2)
attach(test_data)

system.time(  lm.reg <- lm(y~X)  )
summary(lm.reg)

system.time(  lasso_cv  <- cv.glmnet(x = X, y=y, alpha = 1)  )
par(mfrow=c(1,2))
plot(lasso_cv)
abline(v=log(lasso_cv$lambda.1se), lty=2, col="blue")

# plot solution path (Lasso) ------------------------------------------------------
lasso.reg <- glmnet(x = X, y=y, alpha = 1) 
plot(lasso.reg, xvar="lambda", label=TRUE)
abline(v=log(lasso_cv$lambda.1se), lty=2, col="blue")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))
par(mfrow=c(1,1))

fit.coeff <- coef(lasso_cv, s="lambda.1se") %>% summary %>% transmute(beta=i-1, Var=x)
coef(lasso_cv, s="lambda.1se")
fit.coeff %>% str

# using lasso with oracle property (adaptive lasso) -------------------------

system.time(  alasso.reg <- msgps(X= X, y=c(y), penalty="alasso",gamma=1, lambda=0)  )

par(mfrow=c(2,2))
plot(alasso.reg, stand.coef=FALSE, criterion="cp",  main="Mallows' Cp")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))

plot(alasso.reg, stand.coef=FALSE, criterion="aicc",main="bias-corrected AIC (AICc)")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))

plot(alasso.reg, stand.coef=FALSE, criterion="gcv", main="generalized cross validation (GCV) ")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))

plot(alasso.reg, stand.coef=FALSE, criterion="bic", main="BIC")
abline(h= c(1,2,3), lty=2, col= c(1,2,3))
par(mfrow=c(1,1))
summary(alasso.reg)

# measure: estimate error vs sample size -------------
sample_sizes <- c(5:20*10, 400, 600, 800, 1000,2000,3000, 4000)
itr <- 1:10
conditions <- expand.grid(ss=sample_sizes, itr=itr) %>% arrange(ss)

sse.sampsi <- foreach(Ci=1:NROW(conditions), .combine = rbind) %do% {
  
  cond <- conditions[Ci,] 
  this_data <- Data_with(n=cond$ss, p=10, s= c(10,10,10), silent=TRUE)
  beta <- this_data$beta
  
  fit.lm <- lm(this_data$y~this_data$X)
  sse.lm <- sum( (beta - coef(fit.lm)[-1])^2 )
  
  fit.lasso_cv   <- cv.glmnet(x = this_data$X, y=this_data$y, alpha = 1)
  coeff.lasso_cv <- coef(fit.lasso_cv, s="lambda.1se") %>% 
    summary %>%
    transmute(beta=i-1, Var=x)
  sse.lasso_cv <- sum( (beta[coeff.lasso_cv$beta[-1]] - coeff.lasso_cv$Var[-1])^2 )
  
  fit.alasso   <- msgps(X= this_data$X, y=c(this_data$y), 
                        penalty="alasso", gamma=1, lambda=0)
  coeff.alasso <- coef(fit.alasso) %>% 
    data.frame %>% 
    select(BIC) %>%
    unlist
  sse.alasso   <- sum( (this_data$beta - coeff.alasso[-1])^2 )
  
  data.frame(sample.size=cond$ss, lm=sse.lm, lasso=sse.lasso_cv, alasso=sse.alasso)
}

sse.summ.sampsi <- sse.sampsi %>% 
  group_by(sample.size) %>%
  summarize(across(everything(), mean))
sse.summ.sampsi

with(sse.summ.sampsi,{
  plot (lm   ~sample.size, type="l", lwd=2, col="green", ylim=c(0,0.35), ylab="mean SSE", log="x")
  lines(lasso~sample.size, type="l", lwd=2, col="red")
  legend("topright", legend = c("lm","lasso"), col = c("green","red"), lty = 1, lwd=2)
})

with(sse.summ.sampsi,{
  plot (lm    ~sample.size, type="l", lwd=2, col="green", ylim=c(0,0.35), ylab="mean SSE", log="x")
  lines(lasso ~sample.size, type="l", lwd=2, col="red")
  lines(alasso~sample.size, type="l", lwd=2, col="blue")
  legend("topright", legend = c("lm","lasso","a-lasso"), col = c("green","red","blue"), lty = 1, lwd=2)
})

# example(rlasso) ---------------------------------------------------------
# use Lasso with rigorous penalty
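# (note) in the homoscedastic, X-independent case the rigorous penalty level is,
# as far as I understand the hdm documentation, lambda = 2*c*sqrt(n)*qnorm(1 - gamma/(2*p)),
# which is where the c and gamma arguments below come in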
system.time( 
  nonpost_lasso.reg <- rlasso(
    c(y) ~ X,                # uses the currently attached test_data
    post      = FALSE,       # FALSE: Lasso with only the rigorous penalty (no post-estimation OLS)
    intercept = TRUE,        # TRUE: the intercept (beta_0) is not penalized
    penalty = list(homoscedastic = FALSE,        # FALSE: do not assume homoscedastic errors
                   X.dependent.lambda = FALSE,   # TRUE: choose lambda in a data-dependent way
                   lambda.start = NULL, 
                   c = 1.1, 
                   gamma = 0.1/log(nrow(X))),    # the default gamma, written out with n = nrow(X)
    control = list(numIter = 15,
                   tol = 10^-5,
                   threshold = NULL))
)

# use Post-Lasso procedure with rigorous penalty
system.time( 
  post_lasso.reg <- rlasso(
    c(y) ~ X, 
    post      = TRUE,        # TRUE: Post-Lasso (OLS refit) on top of the rigorous penalty
    intercept = TRUE,        # TRUE: the intercept (beta_0) is not penalized
    penalty = list(homoscedastic = FALSE,        # FALSE: do not assume homoscedastic errors
                   X.dependent.lambda = FALSE,   # TRUE: choose lambda in a data-dependent way
                   lambda.start = NULL, 
                   c = 1.1, 
                   gamma = 0.1/log(nrow(X))),
    control = list(numIter = 15,
                   tol = 10^-5,
                   threshold = NULL))
)
# compare the results (all=FALSE shows only the selected, non-zero coefficients)
summary(nonpost_lasso.reg, all=FALSE)
summary(post_lasso.reg, all=FALSE)
coef(post_lasso.reg)


# measure: estimate error vs dimension size -------------
dim_sizes <- c(1:10*10, 1:5*200, 2:5*1000, 1e4, 2e4)
itr <- 1:30
conditions <- expand.grid(ds=dim_sizes, itr=itr) %>% arrange(ds)

sse.dim_sizes <- foreach(Ci=1:NROW(conditions), .combine = rbind) %do% {
  
  cond <- conditions[Ci,] 
  this_data <- Data_with(n=1000, p=cond$ds, s= c(10,10,10), silent=TRUE)
  beta <- this_data$beta
  
  fit.lm <- lm(this_data$y~this_data$X)
  sse.lm <- sum( (beta - coef(fit.lm)[-1])^2 )
  
  fit.lasso_cv   <- cv.glmnet(x = this_data$X, y=this_data$y, alpha = 1)
  coeff.lasso_cv <- coef(fit.lasso_cv, s="lambda.1se") %>%
    summary %>% 
    transmute(beta=i-1, Var=x)
  sse.lasso_cv   <- sum( (beta[coeff.lasso_cv$beta[-1]] - coeff.lasso_cv$Var[-1])^2 )
  
  fit.alasso    <- msgps(X= this_data$X, y=c(this_data$y), penalty="alasso", gamma=1)
  coeff.alasso  <- coef(fit.alasso) %>% 
    data.frame %>%
    select(BIC) %>%
    unlist
  sse.alasso    <- sum( (this_data$beta - coeff.alasso[-1])^2 )
  
  plasso.reg    <- rlasso(c(this_data$y) ~ this_data$X, post=TRUE, intercept=TRUE)
  coeff.plasso  <- coef(plasso.reg)
  coeff.plasso  <- coeff.plasso[names(coeff.plasso) != "(Intercept)"]  # keep only the slope coefficients
  sse.plasso    <- sum( (this_data$beta - coeff.plasso)^2 )
  
  data.frame(dim.size=cond$ds, 
             lm=sse.lm, lasso=sse.lasso_cv, alasso=sse.alasso, plasso=sse.plasso)
  
}

sse.summ.dim_sizes <- sse.dim_sizes %>% 
  group_by(dim.size) %>% 
  summarize(across(everything(), mean))
sse.summ.dim_sizes

with(sse.summ.dim_sizes,{
  plot( lm    ~dim.size, type="l", lwd=2, col="green", ylab="mean SSE", log="x")
  lines(lasso ~dim.size, type="l", lwd=2, col="red")
  lines(alasso~dim.size, type="l", lwd=2, col="blue")
  
  legend("topright", legend = c("lm","lasso","a-lasso"),
         col = c("green","red","blue"), lty = 1, lwd=2)
  abline(v=1000, lty=3)
})

with(sse.summ.dim_sizes,{
  plot(lasso~dim.size, type="l", lwd=2, col="red", ylim=c(0,0.1), ylab="mean SSE",log="x")
  lines(alasso~dim.size,  type="l", lwd=2, col="blue")
  
  legend("topleft", legend = c("lasso","a-lasso"), col = c("red","blue"), lty = 1, lwd=2)
  abline(v=1000, lty=3)
})


with(sse.summ.dim_sizes,{
  plot(lasso~dim.size, type="l", lwd=2, col="red", ylim=c(0,0.1), ylab="mean SSE",log="x")
  lines(alasso~dim.size,  type="l", lwd=2, col="blue")
  lines(plasso~dim.size,  type="l", lwd=2, col="orange")
  
  legend("topleft", legend = c("lasso","a-lasso","post-lasso"), 
         col = c("red","blue","orange"), lty = 1, lwd=2)
  abline(v=1000, lty=3)
})


# measure: performance vs dimension size -------------
this_data <- Data_with(n=1000, p=2e4, s= rep(10,3),silent = FALSE)

system.time(
  plasso.reg <- rlasso(c(this_data$y) ~ this_data$X, post=TRUE, intercept=TRUE)
)

#coef(fit.lasso_cv)
system.time(
  fit.alasso <- msgps(X= this_data$X, y=c(this_data$y), penalty="alasso", gamma=1)
)


Reflection #1: for such a cool phrase, it did not land with the audience at all on the day (in fact, it fell completely flat). My presentation skills need work. #2: the introduction should have touched a bit more on practical uses, such as the appeal of compressed sensing or the interpretability you gain after sparse estimation. And with the final "rigorous lasso", the computation time goes up quite a bit, so at some point I would like to discuss whether that level of accuracy is really needed in real-world practice, along with the cost-benefit trade-off and robustness.