题目出自【生信技能树】公众号文章:2011年的表达芯片分析和2019年的区别
要求:走GEO标准分析流程分析GSE27447
代码主要复制于Jimmy大佬的github:https://github.com/jmzeng1314/GEO/tree/master/GSE42872_main
主要步骤:
step1 - 读入GSE27447数据
step2 - 检查表达矩阵
step3 - 差异分析
step4 - GO/KEGG数据库注释
step5 - GSEA基因集富集分析(施工中)
step6 - GSVA基因集变异分析(施工中)
step1 - 读入GSE27447数据
rm(list=ls())
options(stringsAsFactors = F)
library(AnnoProbe) # 生信技能树出品的GEO数据下载利器
library(Biobase)
gset <- geoChina("GSE27447")
gse27447 <- gset[[1]]
exprSet <- exprs(gse27447)
boxplot(exprSet,las=2)
根据boxplot可以得出,原始数据需要log2处理。
exprSet <- log2(exprSet+1)
boxplot(exprSet,las=2)
exprSet[1:4,1:4]
# GSM678364 GSM678365 GSM678366 GSM678367
#7892501 3.221012 4.502222 1.271969 1.849167
#7892502 6.909593 7.305615 6.956150 7.078983
#7892503 5.125911 8.659878 5.395697 5.660467
#7892504 11.045603 10.060453 10.845937 11.145225
#行名是探针,列名是样本,中间的数据是某样本中某探针的表达量。
gse27447@annotation
#[1] "GPL6244"
checkGPL(gse27447@annotation)
#[1] TRUE
# checkGPL()
结果是TRUE说明AnnoProbe包中存在"GPL6244" 平台数据,于是可以使用另外两个超级厉害的函数idmap()
和filterEM()
,得到探针对应的基因名,然后把表达矩阵的探针名转换为基因名。
ids <- idmap(gse27447@annotation)
dat <- filterEM(exprSet,ids)
dim(dat)
#[1] 18837 19
dat <- dat[order(rownames(dat)),]
获取临床信息,从中进一步获取分组信息:
pd <- pData(gse27447)
library(stringr)
group_list=str_split(pd$title,' ',simplify = T)[,1]
table(group_list)
#group_list
#non-triple triple
# 14 5
所以是5个三阴乳腺癌样本,14个非三阴乳腺癌样本。
把表达矩阵和分组信息保存到Rdata文件。
save(dat,group_list,file = 'step1-output.Rdata')
step2 - 检查表达矩阵
rm(list = ls()) ##清空Enviroment
options(stringsAsFactors = F)
load('step1-output.Rdata') ##读入第一步得到的数据,然后检查数据
table(group_list)
#group_list
#non-triple triple
# 14 5
dat[1:4,1:4]
# GSM678364 GSM678365 GSM678366 GSM678367
#A1CF 6.654092 6.074542 6.258994 6.609746
#A2M 10.266259 11.034813 10.494956 11.450283
#A2ML1 6.375017 6.118954 5.683851 4.278468
#A3GALT2 4.424096 4.144055 4.906982 5.065266
2.1 样本相关性分析
dim(dat)
#[1] 18837 19
ac=data.frame(groups=group_list)
rownames(ac)=colnames(dat) #把ac的行名给到n的列名,即对每一个探针标记上分组信息
pheatmap(cor(dat),annotation_col = ac)
2.2 主成分分析(PCA)
# 我们是对样本做主成分分析,所以要求行名是样本名,列名是探针(基因)名,所以需要对表达矩阵进行转置
dat=t(dat)
dat=as.data.frame(dat)
dat=cbind(dat,group_list) ##给表达矩阵加上分组信息
library("FactoMineR")
library("factoextra")
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#dat最后一列是group_list。pca分析需要一个纯数值矩阵,所以将dat最后一列去掉以后赋值给dat.pca
(fviz_pca_ind(dat.pca,geom.ind = "point",
col.ind = dat$group_list,
palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE,
legend.title = "Groups"
))
ggsave('all_samples_PCA.png')
可以看到,两组样本并没有分开╮(╯▽╰)╭
不过大家都是肿瘤样本,相似性较高也可以理解。
2.3 取sd最大的1000个基因画热图
rm(list = ls()) ## 魔幻操作,一键清空~
load(file = 'step1-output.Rdata')
dat[1:4,1:4]
cg=names(tail(sort(apply(dat,1,sd)),1000))
##使用 apply()
函数计算获取表达矩阵dat每行(即每个基因表达量)的方差,从小到大排序后,取最大的1000个方差,获取其对应的基因名,赋值给变量cg
##然后就可以对这1000个基因画热图
library(pheatmap)
pheatmap(dat[cg,],show_colnames =F,show_rownames = F)
优化一下,比如对dat[cg,]
归一化:
n=t(scale(t(dat[cg,])))
n[n>2]=2
n[n< -2]= -2
n[1:4,1:4]
pheatmap(n,show_colnames =F,show_rownames = F)
给样本添加分组信息:
ac=data.frame(g=group_list)
rownames(ac)=colnames(n) #把ac的行名给到n的列名,即对每一个样本标记上分组信息
(heatmap <- pheatmap(n,show_colnames =F,show_rownames = F,
annotation_col=ac))
library(ggplot2)
ggsave(filename = 'heatmap_top1000_sd.png',heatmap)
step3 - 差异分析
3.1 使用limma包做差异分析
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load(file = 'step1-output.Rdata') ##载入数据
dat[1:4,1:4]
# GSM678364 GSM678365 GSM678366 GSM678367
#A1CF 6.654092 6.074542 6.258994 6.609746
#A2M 10.266259 11.034813 10.494956 11.450283
#A2ML1 6.375017 6.118954 5.683851 4.278468
#A3GALT2 4.424096 4.144055 4.906982 5.065266
group_list[group_list=='non-triple'] <- 'non_triple'
table(group_list)
#group_list
#non_triple triple
# 14 5
使用箱线图boxplot
肉眼观察一下前几个基因的数据分布。
boxplot(unlist(dat[1,])~group_list)
自定义一个函数,用
ggpubr包
画漂亮点的boxplot
library(ggpubr)
bp=function(g){ #定义一个函数g,函数为{}里的内容
df=data.frame(gene=g,stage=group_list)
p <- ggboxplot(df, x = "stage", y = "gene",
color = "stage", palette = "jco",
add = "jitter")
p + stat_compare_means()
}
p1 <- bp(as.numeric(dat[1,]))
p2 <- bp(as.numeric(dat[2,]))
p3 <- bp(as.numeric(dat[3,]))
p4 <- bp(as.numeric(dat[4,]))
library(patchwork)
(bp_4 <- p1|p2|p3|p4)
以上也只属于check data,下面开始真正的差异分析
library(limma)
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
exprSet=dat
rownames(design)=colnames(exprSet)
contrast.matrix<-makeContrasts("triple-non_triple",levels = design)
contrast.matrix ##这个矩阵声明,我们要把 triple 组跟 non_triple 进行差异分析比较
# Contrasts
#Levels triple-non_triple
# non_triple -1
# triple 1
自定义函数DEG,功能是做差异分析。
DEG <- function(exprSet,design,contrast.matrix){
fit <- lmFit(exprSet,design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
return(nrDEG)
}
把数据喂给DEG函数,获取差异分析结果,并保存到Rdata
deg <- DEG(exprSet,design,contrast.matrix)
head(deg)
# logFC AveExpr t P.Value adj.P.Val B
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433
#NEK11 -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194
#HOXB3 -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988
#EGR2 2.140287 6.380807 5.680934 1.350247e-05 0.06003476 2.575932
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696
# DNAH5 -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157
save(deg,file = 'deg.Rdata')
手动检查deg前几个基因表达情况
bp(as.numeric(dat[rownames(dat)=='ARFGEF3',]))
可以看到,与non-triple组比较,triple组表达ARFGEF3显著下调,与deg结果一致。
3.2 差异分析结果的可视化
3.2.1 火山图
设置P.Value
和logFC
的阈值,将基因分为up,down,stable三类
df <- deg
colnames(deg)
df$v <- -log10(df$P.Value)
p_thred <- 0.05
logFC_thred <- 1.5
df$groups = ifelse(df$P.Value > p_thred, "stable",
ifelse(df$logFC > logFC_thred, "up",
ifelse(df$logFC < -logFC_thred, "down",
"stable")))
table(df$groups)
# down stable up
# 164 18554 119
library(ggplot2)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_thred,3),
'\nThe number of up gene is ',nrow(df[df$groups =='up',]) ,
'\nThe number of down gene is ',nrow(df[df$groups =='down',])
)
p <- ggplot(data = df, aes(x = logFC, y = v)) +
geom_point(alpha=0.4, size=1.75,
aes(color=groups)) +
scale_color_manual(values=c("blue", "grey","red")) +
geom_vline(xintercept=c(-logFC_thred,logFC_thred),lty=4,col="black",lwd=0.8) +
geom_hline(yintercept = -log10(p_thred),lty=4,col="black",lwd=0.8) +
labs(x="logFC",y="-log10(P.value)")+
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
theme_bw()
print(p)
3.2.2 热图
load(file = 'step1-output.Rdata')
dat[1:4,1:4]
# GSM678364 GSM678365 GSM678366 GSM678367
#A1CF 6.654092 6.074542 6.258994 6.609746
#A2M 10.266259 11.034813 10.494956 11.450283
#A2ML1 6.375017 6.118954 5.683851 4.278468
#A3GALT2 4.424096 4.144055 4.906982 5.065266
table(group_list)
#group_list
#non-triple triple
# 14 5
x=deg$logFC #deg取logFC这列并将其重新赋值给x
names(x)=rownames(deg) #deg取probe_id这列,并将其作为名字给x
cg=c(names(head(sort(x),100)),#对x进行从小到大排列,取前100及后100,并取其对应的探针名,作为向量赋值给cg
names(tail(sort(x),100)))
n=t(scale(t(dat[cg,])))
n[n>2]=2
n[n< -2]= -2
ac=data.frame(groups=group_list)
rownames(ac)=colnames(n)
pheatmap(n,show_colnames =F,show_rownames = F,
annotation_col=ac)
step4 - GO/KEGG数据库注释
参考:【生信技能树】公众号文章:为R包写一本书(像Y叔致敬)
4.0 整理数据
设置P.Value和logFC的阈值,将基因分为UP
,DOWN
,stable
3组, deg矩阵新建一列g
储存基因分组信息。
不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。
rm(list = ls()) ## 魔幻操作,一键清空~
load(file = 'deg.Rdata')
head(deg)
# logFC AveExpr t P.Value adj.P.Val B
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433
#NEK11 -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194
#HOXB3 -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988
#EGR2 2.140287 6.380807 5.680934 1.350247e-05 0.06003476 2.575932
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696
#DNAH5 -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157
p_thred <- 0.05
logFC_thred <- 1.5
deg$g=ifelse(deg$P.Value>p_thred,'stable',
ifelse( deg$logFC > logFC_thred,'UP',
ifelse( deg$logFC < -logFC_thred,'DOWN','stable') )
)
table(deg$g)
# DOWN stable UP
# 164 18554 119
head(deg)
# logFC AveExpr t P.Value adj.P.Val B g
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433 DOWN
#NEK11 -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194 DOWN
#HOXB3 -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988 stable
#EGR2 2.140287 6.380807 5.680934 1.350247e-05 0.06003476 2.575932 UP
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696 DOWN
#DNAH5 -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157 DOWN
富集分析需要基因的ENTREZID
,使用Y叔神作clusterProfiler包
里的bitr()
函数获取对应关系,deg矩阵转存为DEG,并加上ENTREZID
列
deg$SYMBOL=rownames(deg)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(unique(deg$SYMBOL), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)
head(df)
# SYMBOL ENTREZID
#1 ARFGEF3 57221
#2 NEK11 79858
#3 HOXB3 3213
#4 EGR2 1959
#5 C3orf14 57415
#6 DNAH5 1767
DEG=deg
head(DEG)
# logFC AveExpr t P.Value adj.P.Val B g SYMBOL
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433 DOWN ARFGEF3
#NEK11 -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194 DOWN NEK11
#HOXB3 -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988 stable HOXB3
#EGR2 2.140287 6.380807 5.680934 1.350247e-05 0.06003476 2.575932 UP EGR2
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696 DOWN C3orf14
#DNAH5 -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157 DOWN DNAH5
DEG=merge(DEG,df,by='SYMBOL')
head(DEG)
# SYMBOL logFC AveExpr t P.Value adj.P.Val B g ENTREZID
#1 A1CF -0.24279450 6.481373 -0.78084875 0.44383564 0.8444129 -5.353833 stable 29974
#2 A2M 0.74751848 11.069348 1.89303707 0.07258966 0.5827869 -4.132066 stable 2
#3 A2ML1 0.99851595 4.933066 0.92880174 0.36382423 0.8085547 -5.242500 stable 144568
#4 A3GALT2 0.03320466 4.269750 0.08468268 0.93333708 0.9894294 -5.625240 stable 127550
#5 A4GALT -0.28893317 6.883433 -0.71439103 0.48305962 0.8637983 -5.397984 stable 53947
#6 A4GNT 0.43189200 6.202285 1.60275087 0.12432137 0.6497896 -4.528644 stable 51146
save(DEG,file = 'anno_DEG.Rdata')
分别取出上调基因和下调基因,合并为差异基因
gene_up <- DEG[DEG$g == 'UP','ENTREZID']
gene_down <- DEG[DEG$g == 'DOWN','ENTREZID']
gene_diff <- c(gene_up,gene_down)
gene_all <- as.character(DEG[ ,'ENTREZID'] )
geneList <- DEG$logFC
names(geneList) <- DEG$ENTREZID
geneList <- sort(geneList,decreasing = T)
4.1 KEGG
4.1.1 上调基因集
library(ggplot2)
library(clusterProfiler)
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
head(kk.up)[,1:6]
# ID Description GeneRatio BgRatio pvalue p.adjust
#hsa04640 hsa04640 Hematopoietic cell lineage 7/53 88/7130 3.334829e-06 0.000506894
#hsa04662 hsa04662 B cell receptor signaling pathway 6/53 78/7130 2.152723e-05 0.001636069
#hsa04062 hsa04062 Chemokine signaling pathway 6/53 175/7130 1.769682e-03 0.081549397
#hsa05340 hsa05340 Primary immunodeficiency 3/53 35/7130 2.146037e-03 0.081549397
#hsa04921 hsa04921 Oxytocin signaling pathway 5/53 147/7130 4.506704e-03 0.133247928
#hsa04064 hsa04064 NF-kappa B signaling pathway 4/53 95/7130 5.259787e-03 0.133247928
KEGG分析上调基因集结果可视化:
1)上调基因所属信号通路(气泡图)
dotplot(kk.up)
2)查看第一个结果hsa04640的信号通路示意图:
browseKEGG(kk.up, 'hsa04640')
3)上调基因所属信号通路(条带图)
(gg_barplot <- barplot(kk.up,showCategory=20))
4)通路与基因之间的关系可视化:
cnetplot(kk.up, categorySize="pvalue", foldChange=geneList,colorEdge = TRUE)
5)通路与通路之间的关系图:
emapplot(kk.up)
6)热图展现通路与基因之间的关系:
heatplot(kk.up)
4.1.2 下调基因集
同样的方法计算下调基因集KEGG
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
head(kk.down)[,1:6]
# ID Description GeneRatio BgRatio pvalue p.adjust
#hsa04915 hsa04915 Estrogen signaling pathway 6/62 130/7130 0.0008719172 0.1464821
#hsa00910 hsa00910 Nitrogen metabolism 2/62 16/7130 0.0082547476 0.4221930
#hsa04921 hsa04921 Oxytocin signaling pathway 5/62 147/7130 0.0087691281 0.4221930
#hsa04934 hsa04934 Cushing syndrome 5/62 152/7130 0.0100522154 0.4221930
#hsa04360 hsa04360 Axon guidance 5/62 177/7130 0.0184322243 0.4366278
#hsa04927 hsa04927 Cortisol synthesis and secretion 3/62 65/7130 0.0186643448 0.4366278
dotplot(kk.down );ggsave('kk.down.dotplot.png')
4.1.2 差异基因集
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk.diff)[,1:6]
dotplot(kk.diff );ggsave('kk.diff.dotplot.png')
合并上下调基因后,几乎分析不出信号通路。╮(╯▽╰)╭
4.1.3 Pathway Enrichment
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
kegg_plot <- function(up_kegg,down_kegg){
dat=rbind(up_kegg,down_kegg)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group
dat=dat[order(dat$pvalue,decreasing = F),]
g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) +
geom_bar(stat="identity") +
scale_fill_gradient(low="blue",high="red",guide = FALSE) +
scale_x_discrete(name ="Pathway names") +
scale_y_continuous(name ="log10P-value") +
coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
ggtitle("Pathway Enrichment")
}
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down.png')
这张图其实就是上调基因和下调基因pathway条带图合并,可以看到
Oxytocin signaling pathway
同时存在于上调基因和下调基因pathway,刚好跟差异基因集的分析结果一样。
4.1.4 GSEA
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.9,
verbose = FALSE)
head(kk_gse)[,1:6]
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
4.2 GO
g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff)
go_enrich_results <- lapply( g_list , function(gene) {
lapply( c('BP','MF','CC') , function(ont) {
cat(paste('Now process ',ont ))
ego <- enrichGO(gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print( head(ego) )
return(ego)
})
})
save(go_enrich_results,file = 'go_enrich_results.Rdata')
load(file = 'go_enrich_results.Rdata')
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC')
for (i in 1:3){
for (j in 1:3){
fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
cat(paste0(fn,'\n'))
png(fn,res=150,width = 1080)
dotplot(go_enrich_results[[i]][[j]],title=paste0('dotplot_',n1[i],'_',n2[j])) %>% print()
dev.off()
}
}
未完待续……
step5 - GSEA基因集富集分析
step6 - GSVA基因集变异分析