新组装的基因组并没有上传和生成RefSeq基因组信息。而且一系列需要or.db或者构建or.db的步骤尝试构建数据库都不成功,要么缺乏物种信息,要么缺乏gene symbol。因此,我们利用自己的数据在eggNOG-mapper上进行功能注释的信息,同样可以进行富集分析。参考Y叔clusterProfile的手册,以及一系列网上搜索的结果:https://zhuanlan.zhihu.com/p/35510434;https://cloud.tencent.com/developer/article/1607669?from=14588;
//www.greatytc.com/p/d484003dced5;
https://www.omicsclass.com/article/1513
直接上整理出来的脚本吧:
##universal enrichment analysis
library(magrittr)
library(clusterProfiler)
library(stringr)
library(dplyr)
library(ggplot2)
library(enrichplot)
#下载安装注释数据库, 有参考基因组的,有注释结果的make a database
#source("https://bioconductor.org/biocLite.R")
library("AnnotationHub")
BiocManager::install("biomaRt")
library("biomaRt")
hub <- AnnotationHub::AnnotationHub() #snapshotDate(): 2020-04-27
query(hub, "Erisoma") #搜索有参考基因组的KEGG注释信息
#非模式物种以及没有参考基因组的物种,不适合用eggGO和eggKEGG
gene_list <- read.table("Sl.expand.gene.txt", header = F)
gene <- as.character(gene_list[,1])
##1、GO 富集分析
#先处理emapper的数据,#注意去除title的注释符号“#”
egg <- read.table("Sl.out.emapper.annotations", sep="\t", header=T)
#提取GO数据
gene_ids <- egg$query
eggnog_annoations_go <- str_split(egg$GOs, ",")
gene_to_go <- data.frame(gene = rep(gene_ids,
times = sapply(eggnog_annoations_go, length)),
term = unlist(eggnog_annoations_go))
gene2go <- filter(gene_to_go, term != "-")
#term2gene <- gene_to_go[,c(2,1)]
term2gene <- gene2go[,c(2,1)]
colnames(term2gene)[1] <- "ID"
df1 <- go2term(term2gene$ID) ##根据ID生成GO注释信息
df2 <- go2ont(term2gene$ID) ##根据ID进行分类:BP, CC, MF
colnames(df1)[1] <- "ID"
colnames(df2)[1] <- "ID"
df <- left_join(term2gene, df1, by = "ID")
df3 <- left_join(df, df2, by = "ID")
#df3_BP <- filter(df3, Ontology == "BP")
#df_three <- split(df3, with(df3, Ontology))
colnames(df3)[4] <- "Class"
gid2gene <- df3[, c("ID", "gene", "Class")]
gid2name <- df3[, c("ID", "Term", "Class")]
ego <- enricher(gene, TERM2GENE = gid2gene, TERM2NAME = gid2name,
pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
dotplot(ego)
result <- as.data.frame(ego) #转化为数据框方便看富集的结果
gid2gene <- split(gid2gene, with(gid2gene, Class)) #拆分成BP,MF,CC三个数据框
gid2name <- split(gid2name, with(gid2name, Class)) #拆分成BP,MF,CC三个数据框
#gid2gene <- df3_BP[, c("ID", "gene")]
#gid2name <- df3_BP[, c("ID", "Term")]
#如果只是想关注某一GO分类的富集情况可运行如下:
ego_BP <- enricher(gene, TERM2GENE = gid2gene[['BP']][c(1,2)],
TERM2NAME = gid2name[['BP']][c(1,2)],
pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
dotplot(ego_BP, title = "BP")
ego_CC <- enricher(gene, TERM2GENE = gid2gene[['CC']][c(1,2)],
TERM2NAME = gid2name[['CC']][c(1,2)],
pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
ego_MF <- enricher(gene, TERM2GENE = gid2gene[['MF']][c(1,2)],
TERM2NAME = gid2name[['MF']][c(1,2)],
pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
#以下可以将三个分类都整合到一个图内
library(ggpubr)
BP <- dotplot(ego_BP, title = "BP")
CC <- dotplot(ego_CC, title = "CC")
MF <- dotplot(ego_MF, title = "MF")
ggarrange(BP, CC, MF, ncol = 1, nrow = 3, align = "hv")
#下面的数据是TBtools整理出来的
#dat <- read.table("out.emapper.annotations.GO.txt", header = F, sep = "\t")
#colnames(dat)[1] <- "gene"
#colnames(dat)[2] <- "Gid"
#term2gene <- dat[,c(2,1)]
#这里是先富集分析再给id注释和分类,然后用ggplot2画,因为没法用dotplot画
df<-enricher(gene=gene,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
TERM2GENE = term2gene)
df <- as.data.frame(df) #一定要转成数据框
df1 <- go2term(df$ID) ##根据ID生成注释信息
colnames(df1)[1] <- "ID"
ego <- left_join(df, df1, by = "ID")
df2 <- go2ont(df$ID) ##根据ID进行分类:BP, CC, MF
colnames(df2)[1] <- "ID"
ego1 <- left_join(ego, df2, by = "ID")
df3 <- ego1 %>% select(c("Term", "Ontology", "p.adjust", "Count"))
df3 <- na.omit(df3)
df_m <- df3[c(1:20),] #提取前20个term
df_m <- df_m[order(df_m$Count, decreasing = F),] #按照count排序
df_m$Term <- factor(df_m$Term, levels = df_m$Term) #给排好序的term固定变量,作图时就能按照这个顺序
ggplot(df_m,aes(x=Term,y=Count))+
geom_point(aes(size = Count, color=-log10(p.adjust))) +
facet_grid(Ontology~., scales = "free", space = "free") +
coord_flip()+labs(x="")+
theme_bw() + scale_color_steps(low = "blue", high = "red")
#color越红,p值越小
#接上面GO的分析,此处不需要count排序只要pvalue的柱形图,等同于TBtools里的图
df3 <- ego1 %>% select(c("Term", "Ontology", "pvalue"))
df3 <- na.omit(df3)
ggplot(df3[c(1:20),],aes(x=Term,y=-log10(pvalue)))+
geom_bar(aes(fill=Ontology))+
coord_flip()+labs(x="")+
theme_bw()
接着提取KEGG注释信息并富集分析。。。。
# 2. KEGG pathway
#提取KEGG数据,要注意KO pathway与k number的区别,前者才可转化为代谢功能
gene_ko <- egg %>%
dplyr::select(GID = query, Ko = KEGG_ko) #这里的KEGG_ko是K number
gene_ko[,2]<-gsub("ko:","",gene_ko[,2]) #替换去除ko:
gene_ko_list <- str_split(gene_ko$Ko, ",")
gene2ko <- data.frame(gene = rep(gene_ids,
times = sapply(gene_ko_list, length)),
term = unlist(gene_ko_list))
gene2ko <- filter(gene2ko, term != "-")
gene_pathway <- egg %>%
dplyr::select(GID = query, Pathway = KEGG_Pathway) #这里的才是Pathway
gene_pathway_list <- str_split(gene_pathway$Pathway, ",")
gene2pathway <- data.frame(gene = rep(gene_ids,
times = sapply(gene_pathway_list, length)),
term = unlist(gene_pathway_list))
gene2pathway <- filter(gene2pathway, term != "-")
term2gene <- gene2pathway[,c(2,1)]
colnames(term2gene)[1] <- "ID"
pathway2name <- ko2name(term2gene$ID) #根据pathway的id输出注释信息
pathway2name <- na.omit(pathway2name)
pathway2name <- unique.data.frame(pathway2name)
colnames(pathway2name)[1] <- "ID"
ko2gene <- term2gene[grep(pattern = "^ko", term2gene$ID),]
#这里的id有的是map开头的需要去除,因为ko2name只能注释ko开头的
kegg <- enricher(gene, TERM2GENE = ko2gene, TERM2NAME = pathway2name,
pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
dotplot(kegg)
#browseKEGG(kegg, 'ko05034') #在pathway通路图上标记富集到的基因,会链接到KEGG官网