这里记录一下自己对单细胞对象做富集分析时的过程以及所用的代码,以免下次重头再找,非常麻烦。
参考文献:
单细胞各个亚群基因按照平均表达量排序后gsea分析 (qq.com)
单细胞各个亚群特异性高表达基因的数据库注释(包括GO,KEGG,ReactomePA) (qq.com)
ClusterProfiler包进行KEGG富集报错
1.GO,KEGG,ReactomePA富集分析
根据生信技能树的教程
(1). 先加载Seurat对象后进行取基因平均表达量
以Fibroblast 这个Seurat对象为例
av <- AverageExpression(Fibroblast ,
assays = "RNA")
av=av[[1]]
cg=names(tail(sort(apply(av, 1, sd)),1000))
pheatmap::pheatmap(cor(av[cg,]))
此时,变量av如下图所示,体现出了 亚群平均表达量:
(2). 使用com_go_kegg_ReactomePA_human.R函数
- 首先寻找各群的Markers基因, 需要安装一个包:COSGR
remotes::install_github(repo = 'genecell/COSGR')
#正式开始
input_sce = Fibroblast
table(Idents(input_sce))
pro = 'cosg_seurat_clusters'
if(T){
library(COSG)
marker_cosg <- cosg(
input_sce,
groups='all',
assay='RNA',
slot='data',
mu=1,
n_genes_user=100)
save(marker_cosg,file = paste0(pro,'_marker_cosg.Rdata'))
head(marker_cosg)
## Top10 genes
library(dplyr)
top_10 <- unique(as.character(apply(marker_cosg$names,2,head,10)))
sce.Scale <- ScaleData(input_sce ,features = top_10 )
DoHeatmap(sce.Scale, top_10)
## Top3 genes
top_3 <- unique(as.character(apply(marker_cosg$names,2,head,3)))
}
symbols_list <- as.list(as.data.frame(apply(marker_cosg$names,2,head,100)))
source('~/Code/com_go_kegg_ReactomePA_human.R')
com_go_kegg_ReactomePA_human(symbols_list, pro='Fibroblast' )
save(symbols_list, file = "./03-单细胞对象/symbols_list.RData")
-
加载函数com_go_kegg_ReactomePA_human
因为KEGG好像出问题了,所以在其中需要添加一点函数
首先将函数写入并另存为一个R文件,(为方便阅读,我分成了三段)
library(R.utils)
R.utils::setOption("clusterProfiler.download.method","auto")
com_go_kegg_ReactomePA_human <- function(symbols_list ,pro){
library(clusterProfiler)
library(org.Hs.eg.db)
library(ReactomePA)
library(ggplot2)
library(stringr)
# 首先全部的symbol 需要转为 entrezID
gcSample = lapply(symbols_list, function(y){
y=as.character(na.omit(select(org.Hs.eg.db,
keys = y,
columns = 'ENTREZID',
keytype = 'SYMBOL')[,2])
)
y
})
gcSample
# 第1个注释是 KEGG
xx <- compareCluster(gcSample, fun="enrichKEGG",
organism="hsa", pvalueCutoff=0.05)
dotplot(xx) +
scale_y_discrete(labels=function(x) str_wrap(x, width=50))
ggsave(paste0(pro,'_kegg.pdf'),width = 10,height = 8)
# 第2个注释是 ReactomePA
xx <- compareCluster(gcSample, fun="enrichPathway",
organism = "human",
pvalueCutoff=0.05)
dotplot(xx) +
scale_y_discrete(labels=function(x) str_wrap(x, width=50))
ggsave(paste0(pro,'_ReactomePA.pdf'),width = 10,height = 8)
# 然后是GO数据库的BP,CC,MF的独立注释
# Run full GO enrichment test for BP
formula_res <- compareCluster(
gcSample,
fun="enrichGO",
OrgDb="org.Hs.eg.db",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05
)
# Run GO enrichment test and merge terms
# that are close to each other to remove result redundancy
lineage1_ego <- simplify(
formula_res,
cutoff=0.5,
by="p.adjust",
select_fun=min
)
pdf(paste0(pro,'_GO_BP_cluster_simplified.pdf') ,width = 15,height = 8)
print(dotplot(lineage1_ego, showCategory=5) +
scale_y_discrete(labels=function(x) str_wrap(x, width=50)) )
dev.off()
write.csv(lineage1_ego@compareClusterResult,
file=paste0(pro,'_GO_BP_cluster_simplified.csv'))
# Run full GO enrichment test for CC
formula_res <- compareCluster(
gcSample,
fun="enrichGO",
OrgDb="org.Hs.eg.db",
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05
)
# Run GO enrichment test and merge terms
# that are close to each other to remove result redundancy
lineage1_ego <- simplify(
formula_res,
cutoff=0.5,
by="p.adjust",
select_fun=min
)
pdf(paste0(pro,'_GO_CC_cluster_simplified.pdf') ,width = 15,height = 8)
print(dotplot(lineage1_ego, showCategory=5) +
scale_y_discrete(labels=function(x) str_wrap(x, width=50)) )
dev.off()
write.csv(lineage1_ego@compareClusterResult,
file=paste0(pro,'_GO_CC_cluster_simplified.csv'))
# Run full GO enrichment test for MF
formula_res <- compareCluster(
gcSample,
fun="enrichGO",
OrgDb="org.Hs.eg.db",
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05
)
# Run GO enrichment test and merge terms
# that are close to each other to remove result redundancy
lineage1_ego <- simplify(
formula_res,
cutoff=0.5,
by="p.adjust",
select_fun=min
)
pdf(paste0(pro,'_GO_MF_cluster_simplified.pdf') ,width = 15,height = 8)
print(dotplot(lineage1_ego, showCategory=5) +
scale_y_discrete(labels=function(x) str_wrap(x, width=50)) )
dev.off()
write.csv(lineage1_ego@compareClusterResult,
file=paste0(pro,'_GO_MF_cluster_simplified.csv'))
}
- 接下来准备工作已经做完,开始真正的分析
symbols_list <- as.list(as.data.frame(apply(marker_cosg$names,2,head,100)))
source('com_go_kegg_ReactomePA_human.R')
com_go_kegg_ReactomePA_human(symbols_list, pro='pbmc' )
就三步
首先将symbols_list变量变为列表类型
其次加载函数
最后运行函数
运行的结果是pdf文件,因为隐私原因就不展示啦。
2.分组水平GSVA富集分析-HALLMARK基因集
-
基因集准备
genesets <- msigdbr(species = "Homo sapiens", category = "H")
genesets <- subset(genesets, select = c("gs_name","gene_symbol")) %>% as.data.frame()
genesets <- split(genesets$gene_symbol, genesets$gs_name)
现在这里的类群就是三群,因此不需要改变Ident
expr <- AverageExpression(Fibroblast, assays = "RNA", slot = "data")[[1]]
expr <- expr[rowSums(expr)>0,] #选取非零基因
expr <- as.matrix(expr)
GSVA富集分析
# gsva默认开启全部线程计算
gsva.res <- gsva(expr, genesets, method="ssgsea")
saveRDS(gsva.res, "gsva.res.rds")
gsva.df <- data.frame(Genesets=rownames(gsva.res), gsva.res, check.names = F)
write.csv(gsva.df, "gsva_res.csv", row.names = F)
pheatmap::pheatmap(gsva.res, show_colnames = T, scale = "row")
3.分组水平GSVA富集分析-KEGG基因集
基因集准备
genesets <- msigdbr(species = "Homo sapiens", category = "C2")
genesets <- subset(genesets, gs_subcat=="CP:KEGG",
select = c("gs_name", "gene_symbol")) %>% as.data.frame()
genesets <- split(genesets$gene_symbol, genesets$gs_name)
gsva富集分析
gsva.res <- gsva(expr, genesets, method="ssgsea")
saveRDS(gsva.res, "KEGG_gsva.res.rds")
gsva.df <- data.frame(Genesets=rownames(gsva.res), gsva.res, check.names = F)
write.csv(gsva.df, "KEGG_gsva_res.csv", row.names = F)
# 画图
P <- pheatmap::pheatmap(gsva.res, show_colnames = T, scale = "row")
ggsave(P, file = "./KEGG_pheatmap.pdf", width = 16, height = 30)