了解到一个发表在bioinformatics的用于多组学聚类的R包MOVICS,学习下他的Vignette。它支持10个聚类算法,综合考虑多组学数据,多算法聚类,并且可以直接做出发表级的图,很强大。还是做肿瘤资源多呀,除了TCGA上哪里去找这么大样本量的、同一批人的多组学数据呢。
使用browseVignettes("MOVICS")
这句代码即可召唤作者写的详细教程啦。
1.载入示例数据
library(MOVICS)
library(purrr)
load(system.file("extdata", "brca.tcga.RData", package = "MOVICS", mustWork = TRUE))
load(system.file("extdata", "brca.yau.RData", package = "MOVICS", mustWork = TRUE))
了解数据,发现是brca的多组学数据,如下:
names(brca.tcga)
## [1] "mRNA.expr" "lncRNA.expr" "meth.beta" "mut.status" "count"
## [6] "fpkm" "maf" "segment" "clin.info"
sapply(brca.tcga, dim)
## mRNA.expr lncRNA.expr meth.beta mut.status count fpkm maf segment
## [1,] 500 500 1000 30 13771 13771 120988 381953
## [2,] 643 643 643 643 643 643 10 5
## clin.info
## [1,] 643
## [2,] 5
names(brca.yau)
## [1] "mRNA.expr" "clin.info"
sapply(brca.tcga, dim)
## mRNA.expr lncRNA.expr meth.beta mut.status count fpkm maf segment
## [1,] 500 500 1000 30 13771 13771 120988 381953
## [2,] 643 643 643 643 643 643 10 5
## clin.info
## [1,] 643
## [2,] 5
# extract multi-omics data
mo.data <- brca.tcga[1:4]
count <- brca.tcga$count
fpkm <- brca.tcga$fpkm
maf <- brca.tcga$maf
segment <- brca.tcga$segment
surv.info <- brca.tcga$clin.info
head(surv.info)
## fustat futime PAM50 pstage age
## BRCA-A03L-01A 0 2442 LumA T3 34
## BRCA-A04R-01A 0 2499 LumB T1 36
## BRCA-A075-01A 0 518 LumB T2 42
## BRCA-A08O-01A 0 943 LumA T2 45
## BRCA-A0A6-01A 0 640 LumA T2 64
## BRCA-A0AD-01A 0 1157 LumA T1 83
给的是已经筛选过基因的数据,我们自己使用时是要自己筛选的,可以自己写代码筛选,也可以使用getElites函数来筛选。
2.getElites筛选基因
对缺失值的处理
na.action参数:含有缺失值的行可以选择删除na.action = "rm"
或者是插补na.action = "impute"
,前者是默认值
elite.pct : 选择前百分之多少的特征
elite.num :选择前多少个特征,如果同时指定pct和num,会以num为准
提供了5种筛选方法:
mad() 中位数绝对偏差
sd 标准差 cox
单因素Cox 比例风险回归
freq 二元组学数据
pca 主成分分析
cox按照p值筛选
如果使用cox方法,则需提供surv.info,列名必须有futime和fustat
freq按照突变频数或频率
freq只支持二元组学数据,需要注意freq下的elite.num>80就是是挑突变样本数量大于80的基因
elite.pct>0.2是突变样本数量/样本总数>0.2的基因
示例数据已经筛选好的,如果是未筛选的数据,可以把上面的前三个组学数据都用mad和cox方法筛选一下,示例代码如下(我整的):
if(F){
names(brca.tcga)
mo.data = lapply(brca.tcga[1:3], function(x){
x %>%
getElites(elite.num = 200) %>%
pluck('elite.dat') %>%
getElites(method= "cox",surv.info = surv.info,p.cutoff = 0.05) %>%
pluck('elite.dat')
})
mo.data$mut.status = getElites(dat = brca.tcga$mut.status,
method = "freq",
elite.pct = 0.02) %>%
pluck('elite.dat')
sapply(mo.data, dim)
}
3.获取聚类的最佳数量
运行超级慢
optk.brca <- getClustNum(data = mo.data,
is.binary = c(F,F,F,T), #二元数据写T
try.N.clust = 2:8,
fig.name = "CLUSTER NUMBER OF TCGA-BRCA")
optk.brca$N.clust
## [1] 3
不一定非要使用计算出来的最佳数量,可以结合背景知识另行选择,作者给的示例是乳腺癌所以选了5,因为乳腺癌公认的PAM50分型是分了5种亚型,我在这里是使用了3,试试而已,跑跑看。
4.多组学数据、多算法聚类
从getMOIC帮助文档可以看到10种聚类算法: “SNF”, “CIMLR”, “PINSPlus”, “NEMO”, “COCA”, “MoCluster”, “LRAcluster”, “ConsensusClustering”, “IntNMF”, “iClusterBayes”
moic.res.list <- getMOIC(data = mo.data,
methodslist = list("SNF", "PINSPlus", "NEMO", "COCA", "LRAcluster", "ConsensusClustering", "IntNMF", "CIMLR", "MoCluster","iClusterBayes"),
N.clust = 3,
type = c("gaussian", "gaussian", "gaussian", "binomial"))
5.整合不同算法
直接出个热图来
cmoic.brca <- getConsensusMOIC(moic.res.list = moic.res.list,
fig.name = "CONSENSUS HEATMAP",
distance = "euclidean",
linkage = "average")
6.量化样本相似度
getSilhouette(sil = cmoic.brca$sil, # a sil object returned by getConsensusMOIC()
fig.path = getwd(),
fig.name = "SILHOUETTE",
height = 5.5,
width = 5)
## png
## 2
7.基于聚类结果画多组学热图
获取画图数据
# convert beta value to M value for stronger signal
indata <- mo.data
indata$meth.beta <- log2(indata$meth.beta / (1 - indata$meth.beta))
# data normalization for heatmap
plotdata <- getStdiz(data = indata,
halfwidth = c(2,2,2,NA), # no truncation for mutation
centerFlag = c(T,T,T,F), # no center for mutation
scaleFlag = c(T,T,T,F)) # no scale for mutation
画图,带有临床信息的
# set color for each omics data
# if no color list specified all subheatmaps will be unified to green and red color pattern
mRNA.col <- c("#00FF00", "#008000", "#000000", "#800000", "#FF0000")
lncRNA.col <- c("#6699CC", "white" , "#FF3C38")
meth.col <- c("#0074FE", "#96EBF9", "#FEE900", "#F00003")
mut.col <- c("grey90" , "black")
col.list <- list(mRNA.col, lncRNA.col, meth.col, mut.col)
# extract PAM50, pathologic stage and age for sample annotation
annCol <- surv.info[,c("PAM50", "pstage", "age"), drop = FALSE]
# generate corresponding colors for sample annotation
annColors <- list(age = circlize::colorRamp2(breaks = c(min(annCol$age),
median(annCol$age),
max(annCol$age)),
colors = c("#0000AA", "#555555", "#AAAA00")),
PAM50 = c("Basal" = "blue",
"Her2" = "red",
"LumA" = "yellow",
"LumB" = "green",
"Normal" = "black"),
pstage = c("T1" = "green",
"T2" = "blue",
"T3" = "red",
"T4" = "yellow",
"TX" = "black"))
# comprehensive heatmap (may take a while)
getMoHeatmap(data = plotdata,
row.title = c("mRNA","lncRNA","Methylation","Mutation"),
is.binary = c(F,F,F,T), # the 4th data is mutation which is binary
legend.name = c("mRNA.FPKM","lncRNA.FPKM","M value","Mutated"),
clust.res = cmoic.brca$clust.res, # consensusMOIC results
clust.dend = NULL, # show no dendrogram for samples
show.rownames = c(F,F,F,F), # specify for each omics data
show.colnames = FALSE, # show no sample names
show.row.dend = c(F,F,F,F), # show no dendrogram for features
annRow = NULL, # no selected features
color = col.list,
annCol = annCol, # annotation for samples
annColors = annColors, # annotation color
width = 10, # width of each subheatmap
height = 5, # height of each subheatmap
fig.name = "COMPREHENSIVE HEATMAP OF CONSENSUSMOIC")
[图片上传失败...(image-c798f9-1713157170386)]
8.亚型之间的比较
常见的是比较生存情况-KMplot
surv.brca <- compSurv(moic.res = cmoic.brca,
surv.info = surv.info,
convt.time = "m", # convert day unit to month
surv.median.line = "h", # draw horizontal line at median survival
xyrs.est = c(5,10), # estimate 5 and 10-year survival
fig.name = "KAPLAN-MEIER CURVE OF CONSENSUSMOIC")
9.亚型之间的差异分析
runDEA(dea.method = "edger",
expr = count, # raw count data
moic.res = cmoic.brca,
prefix = "TCGA-BRCA") # prefix of figure name
## edger of CS1_vs_Others done...
## edger of CS2_vs_Others done...
## edger of CS3_vs_Others done...
10.鉴定marker基因
这个是选出edger计算的每个亚型的前100个基因
marker.up <- runMarker(moic.res = cmoic.brca,
dea.method = "edger", # name of DEA method
prefix = "TCGA-BRCA", # MUST be the same of argument in runDEA()
dat.path = getwd(), # path of DEA files
res.path = getwd(), # path to save marker files
p.cutoff = 0.05, # p cutoff to identify significant DEGs
p.adj.cutoff = 0.05, # padj cutoff to identify significant DEGs
dirct = "up", # direction of dysregulation in expression
n.marker = 100, # number of biomarkers for each subtype
doplot = TRUE, # generate diagonal heatmap
norm.expr = fpkm, # use normalized expression as heatmap input
annCol = annCol, # sample annotation in heatmap
annColors = annColors, # colors for sample annotation
show_rownames = FALSE, # show no rownames (biomarker name)
fig.name = "UPREGULATED BIOMARKER HEATMAP")
11.给外部数据预测亚型
NTP(nearest template prediction)和PAM(partition around medoids)两种算法,可以根据选中的marker来给外部数据预测亚型,templates是在上一步runMarker时生成出来的。
names(brca.yau) #这是外部数据
## [1] "mRNA.expr" "clin.info"
# run NTP in Yau cohort by using up-regulated biomarkers
yau.ntp.pred <- runNTP(expr = brca.yau$mRNA.expr,
templates = marker.up$templates, # the template has been already prepared in runMarker()
scaleFlag = TRUE, # scale input data (by default)
centerFlag = TRUE, # center input data (by default)
doPlot = TRUE, # to generate heatmap
fig.name = "NTP HEATMAP FOR YAU")
##
## CS1 CS2 CS3 <NA>
## 131 139 145 267
yau.pam.pred <- runPAM(train.expr = fpkm,
moic.res = cmoic.brca,
test.expr = brca.yau$mRNA.expr)
12.比较外部数据的亚型的生存情况
surv.yau <- compSurv(moic.res = yau.ntp.pred,
surv.info = brca.yau$clin.info,
convt.time = "m", # switch to month
surv.median.line = "hv", # switch to both
fig.name = "KAPLAN-MEIER CURVE OF NTP FOR YAU")
13.可以检查一致程度
可以对TCGA队列跑NTP和PAM,分别比较其结果和原来得到的亚型一致程度如何;
或者是比较NTP和PAM
runKappa(subt1 = yau.ntp.pred$clust.res$clust,
subt2 = yau.pam.pred$clust.res$clust,
subt1.lab = "NTP",
subt2.lab = "PAM",
fig.name = "CONSISTENCY HEATMAP FOR YAU")