title: "lasso回归"
author: "Sun Xiaojie"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Vignette Title}
%\VignetteEncoding{UTF-8}
%\VignetteEngine{knitr::rmarkdown}
editor_options:
chunk_output_type: console
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
knitr::opts_chunk$set(fig.width = 7, fig.height = 5,collapse = TRUE)
knitr::opts_chunk$set(message = FALSE)
1.准备输入数据
load("TCGA-KIRC_sur_model.Rdata")
ls()
exprSet[1:4,1:4]
meta[1:4,1:4]
2.构建lasso回归模型
输入数据是表达矩阵(仅含tumor样本)和每个病人对应的生死(顺序必须一致)。exprSet/meta
x=t(exprSet)
y=meta$event
library(glmnet)
model_lasso <- glmnet(x, y,nlambda=10, alpha=1) #代码构建模型,nlambda参数决定模型的简洁性
print(model_lasso)
这里是举例子,所以只计算了10个λ值,解释一下输出结果三列的意思。
- Df 是自由度
- 列%Dev代表了由模型解释的残差的比例,对于线性模型来说就是模型拟合的R^2(R-squred)。它在0和1之间,越接近1说明模型的表现越好,如果是0,说明模型的预测结果还不如直接把因变量的均值作为预测值来的有效。
- Lambda 是构建模型的重要参数。
解释的残差百分比越高越好,但是构建模型使用的基因的数量也不能太多,需要取一个折中值。
2.1挑选合适的λ值
计算1000个,画图,筛选表现最好的λ值 #虚线之间的都可
cv_fit <- cv.glmnet(x=x, y=y, nlambda = 1000,alpha = 1)
plot(cv_fit)
两条虚线分别指示了两个特殊的λ值,一个是lambda.min,一个是lambda.1se,这两个值之间的lambda都认为是合适的。lambda.1se构建的模型最简单,即使用的基因数量少,而lambda.min则准确率更高一点,使用的基因数量更多一点。
2.2 用这两个λ值重新建模
model_lasso_min <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.min)
model_lasso_1se <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.1se)
这两个值体现在参数lambda上。有了模型,可以将筛选的基因挑出来了。所有基因存放于模型的子集beta中,用到的基因有一个s0值,没用的基因只记录了“.”,所以可以用下面代码挑出用到的基因。
head(model_lasso_min$beta,20)
choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
length(choose_gene_min)
length(choose_gene_1se)
3.模型预测和评估
3.1自己预测自己
newx参数是预测对象。输出结果lasso.prob是一个矩阵,第一列是min的预测结果,第二列是1se的预测结果,预测结果是概率,或者说百分比,不是绝对的0和1。
将每个样本的生死和预测结果放在一起,直接cbind即可。
lasso.prob <- predict(cv_fit, newx=x , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
re=cbind(y ,lasso.prob)
head(re)
3.2 箱线图
对预测结果进行可视化。以实际的生死作为分组,画箱线图整体上查看预测结果。
re=as.data.frame(re)
colnames(re)=c('event','prob_min','prob_1se')
re$event=as.factor(re$event)
library(ggpubr)
p1 = ggboxplot(re, x = "event", y = "prob_min",
color = "event", palette = "jco",
add = "jitter")+
scale_y_continuous(limits = c(0,1)) +
stat_compare_means()
p2 = ggboxplot(re, x = "event", y = "prob_1se",
color = "event", palette = "jco",
add = "jitter")+
scale_y_continuous(limits = c(0,1)) +
stat_compare_means()
library(patchwork)
p1+p2
可以看到,真实结果是生(0)的样本,预测的值就小一点(靠近0),真实结果是死(1)的样本,预测的值就大一点(靠近1),整体上趋势是对的,但不是完全准确,模型是可用的。
对比两个λ值构建的模型,差别不大,min的预测值准确一点。
3.3 ROC曲线
计算AUC取值范围在0.5-1之间,越接近于1越好。可以根据预测结果绘制ROC曲线。
library(ROCR)
library(caret)
# 自己预测自己
#min
pred_min <- prediction(re[,2], re[,1])
auc_min = performance(pred_min,"auc")@y.values[[1]]
perf_min <- performance(pred_min,"tpr","fpr")
plot(perf_min,colorize=FALSE, col="blue")
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_min,3)))
#1se
pred_1se <- prediction(re[,3], re[,1])
auc_1se = performance(pred_1se,"auc")@y.values[[1]]
perf_1se <- performance(pred_1se,"tpr","fpr")
plot(perf_1se,colorize=FALSE, col="red")
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_1se,3)))
- 强迫症选项,想把两个模型画一起。
plot(perf_min,colorize=FALSE, col="blue")
plot(perf_1se,colorize=FALSE, col="red",add = T)
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.3, labels = paste0("AUC = ",round(auc_min,3)),col = "blue")
text(0.8,0.2, labels = paste0("AUC = ",round(auc_1se,3)),col = "red")
-还可以用ggplot2画的更好看一点
tpr_min = performance(pred_min,"tpr")@y.values[[1]]
tpr_1se = performance(pred_1se,"tpr")@y.values[[1]]
dat = data.frame(tpr_min = perf_min@y.values[[1]],
fpr_min = perf_min@x.values[[1]],
tpr_1se = perf_1se@y.values[[1]],
fpr_1se = perf_1se@x.values[[1]])
library(ggplot2)
ggplot() +
geom_line(data = dat,aes(x = fpr_min, y = tpr_min),color = "blue") +
geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
theme_bw()+
annotate("text",x = .75, y = .25,
label = paste("AUC of min = ",round(auc_min,2)),color = "blue")+
scale_x_continuous(name = "fpr")+
scale_y_continuous(name = "tpr")
ggplot() +
geom_line(data = dat,aes(x = fpr_min, y = tpr_min),color = "blue") +
geom_line(data = dat,aes(x = fpr_1se, y = tpr_1se),color = "red")+
geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
theme_bw()+
annotate("text",x = .75, y = .25,
label = paste("AUC of min = ",round(auc_min,2)),color = "blue")+
annotate("text",x = .75, y = .15,label = paste("AUC of 1se = ",round(auc_1se,2)),color = "red")+
scale_x_continuous(name = "fpr")+
scale_y_continuous(name = "tpr")
5.切割数据构建模型并预测
5.1 切割数据
用R包caret切割数据,生成的结果是一组代表列数的数字,用这些数字来给表达矩阵和meta取子集即可。
library(caret)
set.seed(12345679)
sam<- createDataPartition(meta$event, p = .5,list = FALSE)
head(sam)
可查看两组一些临床参数切割比例
train <- exprSet[,sam]
test <- exprSet[,-sam]
train_meta <- meta[sam,]
test_meta <- meta[-sam,]
prop.table(table(train_meta$stage))
prop.table(table(test_meta$stage))
prop.table(table(test_meta$race))
prop.table(table(train_meta$race))
5.2 切割后的train数据集建模
和上面的建模方法一样。
#计算lambda
x = t(train)
y = train_meta$event
cv_fit <- cv.glmnet(x=x, y=y, nlambda = 1000,alpha = 1)
plot(cv_fit)
#构建模型
model_lasso_min <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.min)
model_lasso_1se <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.1se)
#挑出基因
head(model_lasso_min$beta)
choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
length(choose_gene_min)
length(choose_gene_1se)
4.模型预测
用训练集构建模型,预测测试集的生死,注意newx参数变了。
lasso.prob <- predict(cv_fit, newx=t(test), s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
re=cbind(event = test_meta$event ,lasso.prob)
head(re)
再次进行可视化
re=as.data.frame(re)
colnames(re)=c('event','prob_min','prob_1se')
re$event=as.factor(re$event)
library(ggpubr)
p1 = ggboxplot(re, x = "event", y = "prob_min",
color = "event", palette = "jco",
add = "jitter")+
scale_y_continuous(limits = c(0,1)) +
stat_compare_means()
p2 = ggboxplot(re, x = "event", y = "prob_1se",
color = "event", palette = "jco",
add = "jitter")+
scale_y_continuous(limits = c(0,1)) +
stat_compare_means()
library(patchwork)
p1+p2
再画ROC曲线
library(ROCR)
library(caret)
# 训练集模型预测测试集
#min
pred_min <- prediction(re[,2], re[,1])
auc_min = performance(pred_min,"auc")@y.values[[1]]
perf_min <- performance(pred_min,"tpr","fpr")
#1se
pred_1se <- prediction(re[,3], re[,1])
auc_1se = performance(pred_1se,"auc")@y.values[[1]]
perf_1se <- performance(pred_1se,"tpr","fpr")
tpr_min = performance(pred_min,"tpr")@y.values[[1]]
tpr_1se = performance(pred_1se,"tpr")@y.values[[1]]
dat = data.frame(tpr_min = perf_min@y.values[[1]],
fpr_min = perf_min@x.values[[1]],
tpr_1se = perf_1se@y.values[[1]],
fpr_1se = perf_1se@x.values[[1]])
ggplot() +
geom_line(data = dat,aes(x = fpr_min, y = tpr_min),color = "blue") +
geom_line(data = dat,aes(x = fpr_1se, y = tpr_1se),color = "red")+
geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
theme_bw()+
annotate("text",x = .75, y = .25,
label = paste("AUC of min = ",round(auc_min,2)),color = "blue")+
annotate("text",x = .75, y = .15,label = paste("AUC of 1se = ",round(auc_1se,2)),color = "red")+
scale_x_continuous(name = "fpr")+
scale_y_continuous(name = "tpr")
AUC值比不拆分时降低。