说在前面
生信宝库在之前的两期推文:CBNplot:可视化富集通路中基因的调控关系和CBNplot:可视化富集分析中各通路的调控关系,分别从基因和通路的角度介绍了如何使用CBNplot对我们关注的生物学问题进行分析。但是说到解决具体的临床问题,那必须和临床特征相关联才具有实际意义,CBNplot的作者在开发时就考虑到这个角度。那么今天Immugent就来进行实操展示如何使用CBNplot对临床特征进行关联,从而找出和某一临床特征相关的通路或者基因。
同样,本期推文还是基于上两期产生的富集结果,如果小伙伴们想实操本期的代码还需要先跑一下之前的流程,下面开始进行本次的代码展示。
代码展示
CBNplot还可以评估生物功能相关通路信息与临床变量之间的关系,表达数据以外的变量可以用otherVar指定,也可以用otherVarName指定它们的名称。otherVar的顺序必须与表达式数据的列顺序相同,默认使用校正后p值低于0.05的所有显著通路。
bnCov <- bnpathplot(pway,
vstedTCGA,
nCategory = 1000,
adjpCutOff = 0.01,
expSample=rownames(metadata),
algo="hc", strType="normal",
otherVar=metadata,
otherVarName=c("Age", "Category"),
R=50, cl=parallel::makeCluster(10),
returnNet=T,
shadowText=T)
## Check DAG
igraph::is.dag(as.igraph(bnCov$av))
## Fit the parameter to network based on the data
bnFit <- bn.fit(bnCov$av, bnCov$df)
bnCov$plot
接下来,Immugent通过bnlearn函数cpdist执行条件概率查询,以阐明临床变量如何影响通路调节。首先,我们将推断出的网络与原始数据进行匹配。它们存储在命名列表中。除非另有说明,否则执行逻辑采样。执行cpdist,根据肿瘤类别使用ggdist可视化“与弹性纤维相关的分子”的分布。
library(bnlearn)
library(ggdist)
library(ggplot2)
candPath <- "Molecules associated with elastic fibres"
efz <- cpdist(bnFit, nodes=c(candPath), evidence=(Category==0))
efo <- cpdist(bnFit, nodes=c(candPath), evidence=(Category==1))
eft <- cpdist(bnFit, nodes=c(candPath), evidence=(Category==2))
effect = data.frame(
val = c(efz[,1], efo[,1], eft[,1]),
stage = c(rep("0",nrow(efz)), rep("1", nrow(efo)), rep("2", nrow(eft)))
)
disMean <- effect %>% group_by(stage) %>% summarise(mean=mean(val))
stageWMean <- paste0(disMean$stage, " (mean=", round(disMean$mean,3), ")")
effect$stageLabel <- c(rep(stageWMean[1],nrow(efz)), rep(stageWMean[2], nrow(efo)), rep(stageWMean[3], nrow(eft)))
ggplot(effect, aes(x=val, y=stage, color=stageLabel, fill=stageLabel)) +
scale_color_manual(values=c("steelblue","gold","tomato")) +
scale_fill_manual(values=c("steelblue","gold","tomato")) +
stat_dotsinterval() + theme_bw() + ggtitle(candPath)
library(bnlearn)
library(ggdist)
library(ggplot2)
candPath <- "Molecules associated with elastic fibres"
efz <- cpdist(bnFit, nodes=c(candPath), evidence=(Category==0))
efo <- cpdist(bnFit, nodes=c(candPath), evidence=(Category==1))
eft <- cpdist(bnFit, nodes=c(candPath), evidence=(Category==2))
effect = data.frame(
val = c(efz[,1], efo[,1], eft[,1]),
stage = c(rep("0",nrow(efz)), rep("1", nrow(efo)), rep("2", nrow(eft)))
)
disMean <- effect %>% group_by(stage) %>% summarise(mean=mean(val))
stageWMean <- paste0(disMean$stage, " (mean=", round(disMean$mean,3), ")")
effect$stageLabel <- c(rep(stageWMean[1],nrow(efz)), rep(stageWMean[2], nrow(efo)), rep(stageWMean[3], nrow(eft)))
ggplot(effect, aes(x=val, y=stage, color=stageLabel, fill=stageLabel)) +
scale_color_manual(values=c("steelblue","gold","tomato")) +
scale_fill_manual(values=c("steelblue","gold","tomato")) +
stat_dotsinterval() + theme_bw() + ggtitle(candPath)
## Reflect the difference in the plot modifying ggplot2 object
changeCol <- bnCov$plot$data
difMeanCCC <- difMeanCCC[changeCol$name]
names(difMeanCCC) <- changeCol$name
changeCol$color <- difMeanCCC
## Replace the color, change the legend
bnCov$plot$data <- changeCol
bnCov$plot + scale_color_continuous(low="blue",high="red",name="difference")
对于感兴趣通路中的基因,也可以纳入临床变量。我们研究了参与反应团的基因“与弹性纤维相关的分子”。
bnGeneCov <- bngeneplot(pway,
vstedTCGA, pathNum=43,
expSample=rownames(metadata),
otherVar=metadata,
hub=5, R=100,
otherVarName=c("Age","Category"),
cl=parallel::makeCluster(10),
returnNet=T)
## Plot
bnGeneCov$plot
## Check DAG
igraph::is.dag(as.igraph(bnGeneCov$av))
## Fit the parameter to network based on the data
bnFitGene <- bn.fit(bnGeneCov$av, bnGeneCov$df)
candGene <- "EFEMP1"
efz <- cpdist(bnFitGene, nodes=c(candGene), evidence=(Category==0))
efo <- cpdist(bnFitGene, nodes=c(candGene), evidence=(Category==1))
eft <- cpdist(bnFitGene, nodes=c(candGene), evidence=(Category==2))
effect = data.frame(
val = c(efz[,1], efo[,1], eft[,1]),
stage = c(rep("0",nrow(efz)), rep("1", nrow(efo)), rep("2", nrow(eft)))
)
disMean <- effect %>% group_by(stage) %>% summarise(mean=mean(val))
stageWMean <- paste0(disMean$stage, " (mean=", round(disMean$mean,3), ")")
effect$stageLabel <- c(rep(stageWMean[1],nrow(efz)), rep(stageWMean[2], nrow(efo)), rep(stageWMean[3], nrow(eft)))
ggplot(effect, aes(x=val, y=stage, color=stageLabel, fill=stageLabel)) +
scale_color_manual(values=c("steelblue","gold","tomato")) +
scale_fill_manual(values=c("steelblue","gold","tomato")) +
stat_dotsinterval() + theme_bw() + ggtitle(candGene)
candGene <- names(bnFitGene)
candGene <- candGene[candGene != "Category"]
efz2 <- cpdist(bnFitGene, nodes=candGene, evidence=(Category==0))
eft2 <- cpdist(bnFitGene, nodes=candGene, evidence=(Category==2))
difMean <- apply(eft2, 2, mean) - apply(efz2, 2, mean)
changeCol <- bnGeneCov$plot$data
difMean <- difMean[changeCol$name]
names(difMean) <- changeCol$name
changeCol$color <- difMean
## Replace shape and color
changeCol$shape <- rep(19, dim(bnGeneCov$plot$data)[1])
bnGeneCov$plot$data <- changeCol
bnGeneCov$plot + scale_color_continuous(low="blue",high="red",name="difference")
为了确认推断出的贝叶斯网络的有效性,我们可以关注一些已经被验证与临床信息相关或被纳入日常临床实践的基因。要获得包含特定基因的路径,可以使用获取路径函数。本次我们重点研究了MMP2基因,该基因已被报道与膀胱癌的临床进展相关。
pathSub <- obtainPath(pway, "MMP2")
bnGeneCov2 <- bngeneplot(pathSub,
vstedTCGA, pathNum=1:2,
expSample=rownames(metadata),
otherVar=metadata,
hub=5, R=50, algo="hc",
otherVarName=c("Age","Category"),
cl=parallel::makeCluster(10),
returnNet=T)
bnGeneCov2$plot
bnGeneCov2$av <- chooseEdgeDir(bnGeneCov2$av, bnGeneCov2$df, scoreType="mi-cg", debug=FALSE)
bnGeneCov2Fit <- bn.fit(bnGeneCov2$av, bnGeneCov2$df)
candGene <- "MMP2"
mz <- cpdist(bnGeneCov2Fit, nodes=c(candGene), evidence=(Category==0), method="ls")
mo <- cpdist(bnGeneCov2Fit, nodes=c(candGene), evidence=(Category==1), method="ls")
mt <- cpdist(bnGeneCov2Fit, nodes=c(candGene), evidence=(Category==2), method="ls")
effect = data.frame(
val = c(mz[,1], mo[,1], mt[,1]),
stage = c(rep("0",nrow(mz)), rep("1", nrow(mo)), rep("2", nrow(mt)))
)
disMean <- effect %>% group_by(stage) %>% summarise(mean=mean(val))
stageWMean <- paste0(disMean$stage, " (mean=", round(disMean$mean,3), ")")
effect$stageLabel <- c(rep(stageWMean[1],nrow(mz)), rep(stageWMean[2], nrow(mo)), rep(stageWMean[3], nrow(mt)))
ggplot(effect, aes(x=val, y=stage, color=stageLabel, fill=stageLabel)) +
scale_color_manual(values=c("steelblue","gold","tomato")) +
scale_fill_manual(values=c("steelblue","gold","tomato")) +
stat_dotsinterval() + theme_bw() + ggtitle(candGene)
metadata <- data.frame(tcgaData@colData) %>%
dplyr::select(age_at_diagnosis, gender, paper_mutation.in.TP53, paper_Combined.T.and.LN.category) %>% na.omit() %>%
filter(paper_mutation.in.TP53!="ND") %>%
filter(paper_Combined.T.and.LN.category!="ND")
table(metadata$paper_mutation.in.TP53)
## Set TP53 status to numeric of 0/1.
metadata$paper_mutation.in.TP53 <- as.numeric(as.factor(metadata$paper_mutation.in.TP53))-1
metadata$age_at_diagnosis <- as.numeric(scale(metadata$age_at_diagnosis))
metadata$paper_Combined.T.and.LN.category <- as.factor(metadata$paper_Combined.T.and.LN.category)
metadata$gender <- as.factor(metadata$gender)
set.seed(53) # Seed for split
trainIndex <- caret::createFolds(factor(metadata$paper_mutation.in.TP53), k = 5, list = TRUE, returnTrain=TRUE)
allnets <- list() ## Store network in the list
allClassRes <- list() ## Store prediction in the list
## Already VSTed DF
load("trainDf.rda")
load("testDf.rda")
for (f in seq_len(5)) {
nets <- list()
classRes <- list()
foldTrainIndex <- trainIndex[[f]]
## Recursively fit and test for significant pathways
for (pnum in seq(1, dim(subset(pway@result, p.adjust<1e-5))[1], 1)) {
cl <- parallel::makeCluster(12)
bnCovTrain <- bngeneplot(pway, assay(trainDf[[f]]), pathNum=pnum, layout="sugiyama",
expSample=rownames(metadata[foldTrainIndex,]), algo="hc", strType="normal",
otherVar=metadata[foldTrainIndex,], otherVarName=c("Age","Gender","TP53","Category"),
R=50, cl=cl, returnNet=T)
## Return only DF for testing
bnCovTest <- bngeneplot(pway, assay(testDf[[f]]), pathNum=pnum,
expSample=rownames(metadata[-foldTrainIndex,]),
otherVar=metadata[-foldTrainIndex,], otherVarName=c("Age","Gender","TP53","Category"),
onlyDf=T)
bnCovTrain$av <- chooseEdgeDir(bnCovTrain$av, bnCovTrain$df, scoreType="mi-cg", debug=FALSE)
## If DAG and TP53 have parents
if ( igraph::is.dag(bnlearn::as.igraph(bnCovTrain$av)) && length(bnCovTrain$av$nodes$TP53$parents) >= 1 ){
bnCovLargeFit <- bnlearn::bn.fit(bnCovTrain$av, bnCovTrain$df)
pred <- sigmoid::sigmoid(predict(bnCovLargeFit, node="TP53", data=bnCovTest, method = "bayes-lw")) # Use sigmoid function
classRes[[pway@result$Description[pnum]]] <- pred
nets[[pway@result$Description[pnum]]] <- bnCovTrain
} else {
message(paste0("Among pathway ", pway@result$Description[pnum], ", no parent node of TP53 is found, or inferred network is not dag."))
}
parallel::stopCluster(cl)
}
allnets[[f]] <- nets
allClassRes[[f]] <- classRes
}
library(pROC)
library(ggplotify)
rocDf <- c()
allRocList <- list()
for (f in seq_len(5)) {
correct <- metadata[-trainIndex[[f]],]$paper_mutation.in.TP53
predDf <- data.frame(allClassRes[[f]])
predDf$label <- correct
rocList <- list()
for (i in seq_len(dim(predDf)[2]-1)){
rocList[[names(predDf)[i]]] <- roc(predDf$label, predDf[,i], ci=TRUE, direction="<") # check direction
}
tmpRocDf <- data.frame(t(data.frame(purrr::map(rocList, function(x) as.numeric(x$auc)))))
colnames(tmpRocDf) <- c(paste0("auc",f))
tmpRocDf$name <- rownames(tmpRocDf)
allRocList[[f]] <- tmpRocDf
}
allRocListDf <- allRocList %>%
purrr::reduce(left_join, by = "name")
rocMean <- allRocListDf %>%
rowwise() %>%
mutate(Min = min(c_across(starts_with("auc")), na.rm=T),
Max = max(c_across(starts_with("auc")), na.rm=T),
Mean = mean(c_across(starts_with("auc")), na.rm=T),
Sd = sd(c_across(starts_with("auc")), na.rm=T)) %>%
select(name ,Mean, Sd) %>%
arrange(desc(Mean))
kable(rocMean, row.names=FALSE, booktab=TRUE) %>%
kable_styling(font_size = 10)
topPath <- rocMean[1,"name"]
candFold <- as.numeric(allRocListDf %>% filter(name == as.character(topPath)) %>% summarize(which.max(c_across(starts_with("auc")))))
## With the CI
candRoc <- data.frame(metadata[-trainIndex[[candFold]],]$paper_mutation.in.TP53)
colnames(candRoc) <- c("label")
candRoc$pred <- as.numeric(unlist(allClassRes[[candFold]][gsub("[.]", " ", as.character(topPath))]))
rocplot <- as.ggplot(function(){
rocobj1 <- plot.roc(candRoc$label, candRoc$pred, print.auc = TRUE, ci=TRUE, col="black", direction="<",
main = "Mutation in TP53", percent=TRUE)
ciobj <- ci.se(rocobj1, specificities = seq(0, 100, 5), progress="none")
plot(ciobj, type = "shape", col = "steelblue")
})
## Along with the network
topNet <- allnets[[candFold]][[gsub("[.]", " ", topPath)]]
topNet$plot + rocplot
topFit <- bnlearn::bn.fit(topNet$av, topNet$df)
candNodes <- names(topNet$av$nodes)
candNodes <- candNodes[!candNodes %in% c("TP53","Category","Gender")]
tp53low <- cpdist(topFit, nodes=candNodes, evidence=(TP53 < 0.5))
dim(tp53low)
tp53high <- cpdist(topFit, nodes=candNodes, evidence=(TP53 > 0.5))
dim(tp53high)
difMeanTp53 <- apply(tp53high, 2, mean) - apply(tp53low, 2, mean)
kable(head(difMeanTp53[order(abs(difMeanTp53), decreasing=T)]), col.names=c("difference"))
changeCol <- topNet$plot$data
difMeanTp53 <- difMeanTp53[changeCol$name]
names(difMeanTp53) <- changeCol$name
changeCol$color <- difMeanTp53
## Replace shape and color
topNet$plot$data <- changeCol
topNet$plot + scale_color_continuous(low="blue",high="red",name="difference")
说在最后
CBNplot可以说是为从事基础研究的科研工作者打开了一扇崭新的大门,这个工具不仅功能强大,而且很轻便,更是和Y叔的很多经典R包有无缝衔接。当然,Immugent这里只是根据官网介绍跑的标准化流程,在实际应用中可以更加的灵活使用。
好啦,截止到目前有关CBNplot的三篇推文就已经全部更新完毕。由于这个软件在今年3月才被开发出来,因此到目前还没有使用CBNplot进行分析的文章发表,后续如果有的话,生信宝库将会再出一期推文进行解读,敬请期待!