Simulations
simu.Rmd
Method comparison
To demonstrate the effectiveness of the proposed method, we ran simulation studies and compared the performance of six methods for a moderate-dimensional and a high-dimensional setting: CR-Lasso, sparse shooting S (SSS, Bottmer et al., 2022), robust Lars (RLars, Khan et al., 2007), MM-Lasso (Smucler & Yohai, 2017), sparse LTS (SLTS, Alfons et al., 2013) and Lasso (Tibshirani, 1996).
For the implementation of SSS, we utilized the R function sparseshooting (Wilms, 2020). RLars and SLTS were implemented using the R functions rlars and sparseLTS in the R package robustHD (Alfons, 2021). The R function mmlasso (Smucler, 2017) was employed for MM-Lasso. Lasso was implemented through the R function glmnet in the R package glmnet (Friedman et al., 2010).
By default in the utilized functions, Lasso and MM-Lasso employ cross-validation (CV) to select optimal tuning parameters. In contrast, SSS, RLars, and CR-Lasso utilize the Bayesian information criterion (BIC) for tuning parameter selection. SLTS uses a default tuning parameter without optimization.
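To illustrate the distinction between the two tuning strategies, the sketch below contrasts them for the plain Lasso: a CV-based penalty as returned by cv.glmnet, and a BIC-type criterion evaluated along a glmnet path. This is only a generic illustration under our own assumptions (toy data, number of nonzero coefficients used as the degrees of freedom), not the exact criteria implemented in the packages above.

## Minimal sketch (not the packages' internal code): selecting the Lasso
## penalty by cross-validation versus by a BIC-type criterion
library(glmnet)

set.seed(1)
x <- matrix(rnorm(200 * 50), 200, 50)
y <- drop(x[, 1:10] %*% rep(1, 10)) + rnorm(200, sd = 3)

## CV-based choice (default behaviour of cv.glmnet)
cv_fit <- cv.glmnet(x, y)
lambda_cv <- cv_fit$lambda.min

## BIC-type choice: n*log(RSS/n) + df*log(n) along the glmnet path,
## with df approximated by the number of nonzero coefficients
fit <- glmnet(x, y)
rss <- colSums((y - predict(fit, newx = x))^2)
bic <- nrow(x) * log(rss / nrow(x)) + fit$df * log(nrow(x))
lambda_bic <- fit$lambda[which.min(bic)]

c(lambda_cv = lambda_cv, lambda_bic = lambda_bic)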
For additional information regarding the packages used, please refer to the “Bone mineral density data” page for a more comprehensive overview.
##### methods compared
mtds = list(
  ## sparse shooting S, codes are from https://github.com/ineswilms/sparse-shooting-S
  ## package is available from https://github.com/PengSU517/shootings
  ## install this package directly using remotes::install_github("PengSU517/shootings")
  sss = function(y, x){shootings::sparseshooting(x, y)$coef},
  ## RLars, from package robustHD
  rlars = function(y, x){regcell::Rlars(y, x)$betahat},
  ## MM-Lasso, forked from https://github.com/esmucler/mmlasso
  ## the updated package is available from https://github.com/PengSU517/mmlasso
  ## install this package directly using remotes::install_github("PengSU517/mmlasso")
  ## this function can be effectively replaced by the pense package on CRAN
  mmlasso = function(y, x){mmlasso::mmlasso(x, y)$coef.MMLasso.ad},
  ## sparse LTS, from package robustHD
  slts = function(y, x){robustHD::sparseLTS(x, y)$coefficients},
  ## cellwise regularized Lasso without post regression, not used in simulations
  # cell_lasso = function(y, x){
  #   fit = regcell::sregcell_std(y = y, x = x, softbeta = TRUE, lambda_zeta = 1, penal = 1, penaldelta = 0)
  #   return(c(fit$intercept_hat, fit$betahat))
  # },
  ## cellwise regularized Lasso with post regression
  cell_lasso_post = function(y, x){
    fit = regcell::sregcell_std(y = y, x = x, softbeta = TRUE, lambda_zeta = 1, penal = 1, penaldelta = 0)
    return(c(fit$intercept_hat_post, fit$betahat_post))
  },
  ## Lasso, from package glmnet
  lasso = function(y, x){
    return(regcell::lassocv(y, x)$betahat)
  }
)
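All wrappers share the same interface: they take the response and the design matrix and return a coefficient vector whose first element is the intercept, which is what allows the simulation loop below to iterate over them interchangeably. A toy illustration with made-up data (assuming regcell and the other packages above are installed):

## Toy illustration of the common interface: every element of mtds maps
## (y, x) to a coefficient vector with the intercept first
set.seed(1)
x <- matrix(rnorm(100 * 20), 100, 20)
y <- drop(x[, 1:5] %*% rep(1, 5)) + rnorm(100)

betahat <- mtds$lasso(y, x) # e.g. the Lasso wrapper
length(betahat)             # 21 = intercept + 20 slopes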
Simulation settings
In our simulations, we set \(n = 200\), \(p = 50\) and \(\mathbf\beta = (\mathbf 1_{10}^\top, \mathbf 0_{p-10}^\top)^\top\) with an intercept term \(\beta_0 = 1\). Clean observation vectors \(\mathbf{\check x}_i\) were sampled from \(N(\mathbf 0, \mathbf\Sigma)\), and errors \(\varepsilon_i\) were sampled from \(N(0, 3^2)\), indicating a relatively high level of noise in the data. The correlation structure among predictors was given by \(\Sigma_{ij} = \rho^{|i-j|}\), and we set \(\rho = 0.5\). We also generated \(\mathbf{\check x}_i\) from the multivariate \(t_4\) distribution to simulate distributions with heavier tails.
To introduce cellwise outliers, we set the contamination proportion \(e\) to \(0\%\), \(2\%\) and \(5\%\) for all predictors and generated outliers independently. Outlying cells \(\Delta_{ij}\) and \(\zeta_{i}\) were randomly generated from \(N(\gamma, 1)\) and \(N(-\gamma, 1)\) with equal probability, where \(\gamma\) varies to simulate outliers with different magnitudes. The observed cells were then obtained via \({x}_{ij} = {\check x}_{ij} + \Delta_{ij}\).
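A minimal sketch of this data-generating process for the predictor cells, using mvtnorm for the multivariate draws (the simulations themselves rely on regcell::genevar, as shown further below):

## Minimal sketch of the data-generating process described above
## (the actual datasets are produced by regcell::genevar below)
library(mvtnorm)

set.seed(1)
n <- 200; p <- 50; rho <- 0.5; e <- 0.05; gamma <- 6
beta <- c(rep(1, 10), rep(0, p - 10)); beta0 <- 1

Sigma <- rho^abs(outer(1:p, 1:p, "-"))   # Sigma_ij = rho^|i-j|
x_clean <- rmvnorm(n, sigma = Sigma)     # or rmvt(n, sigma = Sigma, df = 4) for heavy tails
y <- beta0 + drop(x_clean %*% beta) + rnorm(n, sd = 3)

## cellwise contamination: each cell is shifted with probability e,
## by a value drawn from N(gamma, 1) or N(-gamma, 1) with equal probability
contam <- matrix(rbinom(n * p, 1, e), n, p)
shift  <- matrix(sample(c(-1, 1), n * p, replace = TRUE) * rnorm(n * p, mean = gamma), n, p)
x_obs  <- x_clean + contam * shift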
The following code was used to run the simulations in parallel.
library(doParallel) #### parallel computation
registerDoParallel(cores = 10)
getDoParWorkers()
###### data generation settings
{
  ms = 1:200 # simulation replication indices (200 runs)
  ns = c(200) # sample size
  ps = c(50) # number of candidate predictors
  prs = c(10) # number of informative candidate predictors
  es = c(0, 0.02, 0.05) # contamination rates
  rs = c(0.5) # correlation among predictors
  gammas = c(0, 2, 4, 6, 8) # magnitude of outlyingness
  # sigmas = c(2) # variance of residuals, not used
  dfs = c(4, Inf) # degrees of freedom generating predictors
  outtypes = c("cellwise") # types of contamination
}
{
  result <- foreach(m = ms,
                    .packages = c("robustHD", "robustbase", "mmlasso", "shootings", "cellWise", "regcell")) %:%
    foreach(n = ns) %:%
    foreach(p = ps) %:%
    foreach(pr = prs) %:%
    foreach(e = es) %:%
    foreach(r = rs) %:%
    foreach(gamma = gammas) %:%
    # foreach(sigma = sigmas) %:%
    foreach(df = dfs) %:%
    foreach(outtype = outtypes) %dopar% {
      {
        seed = m
        set.seed(seed = seed) ## set random seed
        beta = c(rep(1, pr), rep(0, p - pr)) #*c(1,-1) ### set beta
        dataset = regcell::genevar(n = n, p = p, e = e, r = r, beta = beta, intercept = 1,
                                   gamma = gamma, df = df, outtype = outtype, sigma = 3,
                                   mux = rep(0, p), scalex = 1) ## generate datasets
        x = dataset$x # the clean design matrix
        xc = dataset$xc # the contaminated design matrix
        y = dataset$y # the response
        ynew = dataset$ynew # another response vector for calculating prediction errors
      }
      rst = list()
      if (((e != 0) & (gamma != 0)) | ((e == 0) & (gamma == 0))) {
        for (mtd in 1:length(mtds)) {
          rst[[mtd]] = rep(NA, 27)
          try({
            timing = system.time({betahat = mtds[[mtd]](y, x)})["elapsed"] ## record computation time
            mspe = mean((ynew - cbind(1, xc) %*% betahat)^2) # mean squared prediction error
            mape = mean(abs(ynew - cbind(1, xc) %*% betahat)) # mean absolute prediction error
            rtmspe = sqrt(mean(sort((ynew - cbind(1, xc) %*% betahat)^2)[1:(0.9*n)])) ## root of trimmed MSPE
            tpf <- function(betahat, beta){sum((as.logical(betahat) == as.logical(beta))[1:pr])}
            tnf <- function(betahat, beta){sum((as.logical(betahat) == as.logical(beta))[-(1:pr)])}
            tp = tpf(betahat[-1], beta) # number of true positives
            tn = tnf(betahat[-1], beta) # number of true negatives
            rst[[mtd]] = c(m = m, n = n, p = p, pr = pr, e = e, r = r,
                           gamma = gamma, outtype = outtype, df = df,
                           method = names(mtds)[mtd], seed = seed,
                           MSPE = mspe, MAPE = mape, RTMSPE = rtmspe,
                           TP = tp, FP = (p - pr) - tn, Time = timing, betahat[2:11])
          }, TRUE
          )
        }
      } else {
        rst[[1]] = rep(NA, 27)
      }
      rst
    }
  save(result, file = "result_simu.RData")
}
Analysis
We ran \(200\) simulations for each scenario and used the root mean squared prediction error (RMSPE) to assess the prediction accuracy of the considered methods.
In addition, to assess the accuracy of variable selection, we employed \[ \mathrm{F}_1 = \frac{2\mathrm{TP}}{2\mathrm{TP} + \mathrm{FP} + \mathrm{FN}}, \] where TP, FP, and FN indicate true positives, false positives, and false negatives, respectively. While the \(\mathrm{F}_1\) score is commonly used for classification problems, it is also used to evaluate the performance of variable selection techniques, as in Bleich et al. (2014).
The advantage of using the \(\mathrm{F}_1\) score to measure variable selection is that it takes into account both the precision and recall of the selected variables. Precision measures the proportion of selected variables that are relevant, while recall measures the proportion of relevant variables that are selected. By combining precision and recall, the \(\mathrm{F}_1\) score provides a balanced evaluation of variable selection performance. This allows us to measure the effectiveness of each method in selection and prediction.
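For reference, a small hypothetical helper (not part of the analysis code below) showing how RMSPE, precision, recall and the \(\mathrm{F}_1\) score can be computed from an estimated coefficient vector, following the convention above that the intercept is the first element:

## Hypothetical helper illustrating the metrics: RMSPE on new data and
## precision, recall and F1 for variable selection
selection_metrics <- function(betahat, beta, ynew, xnew) {
  selected <- betahat[-1] != 0 # drop the intercept
  relevant <- beta != 0
  tp <- sum(selected & relevant)
  fp <- sum(selected & !relevant)
  fn <- sum(!selected & relevant)
  c(RMSPE     = sqrt(mean((ynew - cbind(1, xnew) %*% betahat)^2)),
    precision = tp / (tp + fp),
    recall    = tp / (tp + fn),
    F1        = 2 * tp / (2 * tp + fp + fn))
}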
The following code is employed to analyze the generated results.
library(dplyr)   # pipe and data manipulation
library(ggplot2) # plotting

result1 = as.data.frame(t(as.data.frame(result)))
names(result1) = c("m", "n", "p", "pr", "e", "r", "gamma", "outtype", "df", "method", "seed",
                   "MSPE", "MAPE", "RTMSPE", "TP", "FP", "Time", paste("betahat", 1:10))
result2 = result1 %>% mutate(across(c(1:7, 9, 11:21), as.numeric),
                             across(5:11, as.factor),
                             TN = p - pr - FP,
                             FN = pr - TP,
                             BACC = (TP/pr + TN/(p - pr))/2,
                             F1 = 2*TP/(2*TP + FP + FN),
                             RMSPE = sqrt(MSPE))
levels(result2$e) = c("e = 0%", "e = 2%", "e = 5%")
levels(result2$df) = c("t(4)", "Normal")
result2$df = factor(result2$df, levels = c("Normal" ,"t(4)" ))
levels(result2$gamma) = c(" ", "2", "4", "6", "8")
levels(result2$method) = c("CR-Lasso", "Lasso", "MM-Lasso", "RLars", "SLTS", "SSS")
result2$method = factor(result2$method, levels = c("CR-Lasso", "SSS", "RLars", "MM-Lasso", "SLTS", "Lasso"))
result2 = result2[complete.cases(result2),]
result3 = result2 %>% filter((p==50))
colorset = c("CR-Lasso" ="#F8766D",
"SSS" = "#C49A00",
"RLars" = "#53B400",
"MM-Lasso" = "#00C094",
"SLTS" = "#00B6EB",
"Lasso" = "#A58AFF")
p1 = ggplot(data = result3)+
geom_boxplot(aes(fill = method, x = gamma, y = RMSPE), outlier.size = 0.2, lwd = 0.4)+
facet_grid(df~e, scales = "free", space = "free_x") +
scale_fill_manual(values = colorset)+
theme_bw()+
labs(x = expression(gamma*": magnitude of outlyingness"))+
theme(text= element_text(size=20),
#axis.text.x = element_blank(),
legend.position = "bottom")
p1
summary = result3 %>% group_by(n,p, pr,e,r,gamma, df, method,outtype) %>% summarise(
TP = mean(TP, na.rm = T),
FP = mean(FP, na.rm = T),
TPR = mean(TP/pr, na.rm = T),
FPR = mean(FP/(p-pr), na.rm = T),
TNR = 1-FPR,
BACC = mean(BACC),
F1 = mean(F1))
p2 = ggplot(data = summary, aes(x = gamma, y = F1, group = method))+
geom_point(aes(color = method))+
geom_line(aes(linetype = method, color = method))+
facet_grid(df~e, scales = "free", space = "free_x") +
ylim(0,1)+
theme_bw()+
labs(x = expression(gamma*": magnitude of outlyingness"),
y = expression(F[1]))+
theme(text= element_text(size=20),
legend.position = "bottom")
p2