step1-Load Data and Quality Control
一、数据下载和读入
数据下载和读入,有与下载很慢,直接导入下载好的数据。
##数据下载和读入
#GEO Accession viewer https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63067
suppressPackageStartupMessages(library(GEOquery))
eSet <- getGEO('GSE63067',
destdir = ".",
getGPL= F)
save(eSet,file = 'GSE6306.Rdata')
load('GSE63067_exprSet.Rdata')
(1)提取表达矩阵
#(1)提取表达矩阵
#eSet #查看eSet
exprSet <- exprSet
# class(eSet)#eSet的的类型
#
# #str(eSet) #查看结构
#
# length(eSet)#查看list有多少个元素;eSet这个list只有一个元素
#exprSet <- exprs(eSet[[1]])#exprs函数提取表达矩阵
exprSet[1:5,1:5]#查看表达矩阵
## GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881
## 1007_s_at 6.317799 6.220083 5.801907 6.322376 6.090536
## 1053_at 5.411170 5.445441 4.764031 5.170063 5.771946
## 117_at 6.139097 7.166847 5.842836 6.899896 6.499958
## 121_at 6.548681 6.317436 6.224910 5.986812 6.181588
## 1255_g_at 4.000176 4.130992 3.986064 3.904768 4.006547
dim(exprSet)
## [1] 54675 18
(2)提取临床信息
#(2)提取临床信息
# pdata <- pData(eSet[[1]])#样本分组信息
# head(pdata)
#
# samples <- sampleNames(eSet[[1]])#有多少GSM
# head(samples)
#构建样本信息
group_list=c(rep('Steatosis',2),rep('Non-alcoholic steatohepatitis',9),rep('Healthy',7))
group_list=factor(group_list)
head(group_list)
## [1] Steatosis Steatosis
## [3] Non-alcoholic steatohepatitis Non-alcoholic steatohepatitis
## [5] Non-alcoholic steatohepatitis Non-alcoholic steatohepatitis
## Levels: Healthy Non-alcoholic steatohepatitis Steatosis
(3)质控
一定要用boxplot看一下各个样本的表达量是不是在同一条水平线上,如果不在同一条平行线上,需要进行质控。
# 质控,一定要看boxplot
#有几个样本的表达量与其他样本不在同一条水平线上,质控
boxplot(exprSet,las = 2, cex = 0.3)
#质控
library(limma)
exprSet <- normalizeBetweenArrays(exprSet)
#View(exprSet)
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
boxplot(exprSet, col = cols,main="expression value",las=2)
二、ID转换和过滤
通常需要转换ID和过滤某些探针
if(! require("hgu133plus2.db")) install("hgu133plus2.db")
library(hgu133plus2.db)
ls('package:hgu133plus2.db')#查看hugene10sttranscriptcluster.db有多少个对象
## [1] "hgu133plus2" "hgu133plus2.db"
## [3] "hgu133plus2_dbconn" "hgu133plus2_dbfile"
## [5] "hgu133plus2_dbInfo" "hgu133plus2_dbschema"
## [7] "hgu133plus2ACCNUM" "hgu133plus2ALIAS2PROBE"
## [9] "hgu133plus2CHR" "hgu133plus2CHRLENGTHS"
## [11] "hgu133plus2CHRLOC" "hgu133plus2CHRLOCEND"
## [13] "hgu133plus2ENSEMBL" "hgu133plus2ENSEMBL2PROBE"
## [15] "hgu133plus2ENTREZID" "hgu133plus2ENZYME"
## [17] "hgu133plus2ENZYME2PROBE" "hgu133plus2GENENAME"
## [19] "hgu133plus2GO" "hgu133plus2GO2ALLPROBES"
## [21] "hgu133plus2GO2PROBE" "hgu133plus2MAP"
## [23] "hgu133plus2MAPCOUNTS" "hgu133plus2OMIM"
## [25] "hgu133plus2ORGANISM" "hgu133plus2ORGPKG"
## [27] "hgu133plus2PATH" "hgu133plus2PATH2PROBE"
## [29] "hgu133plus2PFAM" "hgu133plus2PMID"
## [31] "hgu133plus2PMID2PROBE" "hgu133plus2PROSITE"
## [33] "hgu133plus2REFSEQ" "hgu133plus2SYMBOL"
## [35] "hgu133plus2UNIGENE" "hgu133plus2UNIPROT"
#找到hugene10sttranscriptclusterSYMBOL,找到对应的个呢名字
ids <- toTable(hgu133plus2SYMBOL)#totable获得对应关系
length(unique(ids$symbol))#长度,unique除去重复后元素个数
## [1] 20183
tail(sort(table(ids$symbol)))#查看重复的基因
##
## DNAH1 TCF3 CFLAR NRP2 HFE YME1L1
## 13 13 14 14 15 22
table(sort(table(ids$symbol)))#table统计重复的基因
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 9426 5151 2841 1383 741 339 168 67 29 17 7 5 5 2 1
## 22
## 1
plot(table(sort(table(ids$symbol))))#
探针对应到基因和过滤表达矩阵
#探针对应到基因
#过滤表达矩阵
head(ids)
## probe_id symbol
## 1 1053_at RFC2
## 2 117_at HSPA6
## 3 121_at PAX8
## 4 1255_g_at GUCA1A
## 5 1316_at THRA
## 6 1320_at PTPN21
head(exprSet)
## GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881
## 1007_s_at 6.282738 6.212412 5.821629 6.298573 6.055701
## 1053_at 5.408377 5.423271 4.734848 5.192682 5.747881
## 117_at 6.113114 7.211946 5.868391 6.845572 6.434376
## 121_at 6.506860 6.313513 6.273058 5.970227 6.143037
## 1255_g_at 3.998576 4.130220 3.963418 3.921568 4.036102
## 1294_at 6.531638 6.630746 6.092748 6.262294 6.333928
## GSM1539882 GSM1539883 GSM1539884 GSM1539885 GSM1539886
## 1007_s_at 6.060390 6.210868 6.039242 6.277970 5.765510
## 1053_at 5.387612 5.180355 5.577747 5.348066 5.105622
## 117_at 5.613340 5.658942 5.364068 5.627686 5.619727
## 121_at 6.281549 5.997775 6.572306 6.549663 6.590396
## 1255_g_at 4.039059 3.979454 4.108688 4.153167 4.005298
## 1294_at 6.303366 6.226402 6.603085 6.208452 6.153673
## GSM1539887 GSM1539888 GSM1539889 GSM1539890 GSM1539891
## 1007_s_at 6.286630 6.156261 6.082300 5.664447 6.876097
## 1053_at 5.856089 5.255968 4.968610 4.889945 5.165363
## 117_at 6.269895 5.546778 5.456274 6.442117 5.650169
## 121_at 6.573122 6.351897 6.526092 6.375800 6.609897
## 1255_g_at 4.073529 4.098591 3.995851 4.140052 3.994319
## 1294_at 6.393489 6.159599 6.438031 6.139996 5.674530
## GSM1539892 GSM1539893 GSM1539894
## 1007_s_at 6.255433 6.143778 6.619000
## 1053_at 5.297843 5.042678 4.825426
## 117_at 5.546913 5.666350 5.498495
## 121_at 6.432975 6.175153 6.291851
## 1255_g_at 4.228946 4.043580 4.022117
## 1294_at 6.113279 6.579910 6.183669
#rownames(exprSet) #rownames(exprSet)为表达矩阵exprSet的探针;
length(rownames(exprSet))
## [1] 54675
table(rownames(exprSet) %in% ids$probe_id)#x %in% y 的意思是“对x里的每个元素进行判断,判断它是否在y中存在,存在就返回TRUE,不存在就返回FALSE”。
##
## FALSE TRUE
## 12743 41932
#table(rownames(exprSet) %in% ids$probe_id)
exprSet <- exprSet[rownames(exprSet) %in% ids$probe_id,]
dim(exprSet)
## [1] 41932 18
#ids过滤探针
ids <- ids[match(rownames(exprSet),ids$probe_id),]
head(ids)
## probe_id symbol
## 1 1053_at RFC2
## 2 117_at HSPA6
## 3 121_at PAX8
## 4 1255_g_at GUCA1A
## 5 1316_at THRA
## 6 1320_at PTPN21
exprSet[1:5,1:5]
## GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881
## 1053_at 5.408377 5.423271 4.734848 5.192682 5.747881
## 117_at 6.113114 7.211946 5.868391 6.845572 6.434376
## 121_at 6.506860 6.313513 6.273058 5.970227 6.143037
## 1255_g_at 3.998576 4.130220 3.963418 3.921568 4.036102
## 1316_at 4.834299 4.879104 4.829465 4.881184 4.858719
#合并表达矩阵和ids
merge2 <- function(exprSet, ids){
tmp <- by(exprSet,
ids$symbol,
function(x) rownames(x)[which.max( rowMeans(x))])
probes <- as.character(tmp)
print(dim(exprSet))
exprSet <- exprSet[rownames(exprSet) %in% probes,]
print(dim(exprSet))
rownames(exprSet) <- ids[match(rownames(exprSet), ids$probe_id),2]
return(exprSet)
}
new_exprSet <- merge2(exprSet,ids)
## [1] 41932 18
## [1] 20183 18
#View(new_exprSet)
#查看某个基因的表达水平
new_exprSet['GAPDH',]
## GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881 GSM1539882
## 12.05410 12.21721 11.88043 11.81438 11.80473 12.05410
## GSM1539883 GSM1539884 GSM1539885 GSM1539886 GSM1539887 GSM1539888
## 11.88043 12.14874 12.00448 11.59026 12.27825 12.05883
## GSM1539889 GSM1539890 GSM1539891 GSM1539892 GSM1539893 GSM1539894
## 12.12765 11.55415 11.68287 12.04985 12.19551 11.80945
boxplot(new_exprSet, las = 2)
#new_exprSet['MALAT1',]
new_exprSet['ACTB',]
## GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881 GSM1539882
## 12.53922 12.27825 11.94215 12.19997 11.91843 12.08690
## GSM1539883 GSM1539884 GSM1539885 GSM1539886 GSM1539887 GSM1539888
## 12.07694 11.88981 11.67853 11.95695 11.86387 12.29221
## GSM1539889 GSM1539890 GSM1539891 GSM1539892 GSM1539893 GSM1539894
## 13.01597 11.65957 12.06532 11.78780 12.33845 11.61137
boxplot(new_exprSet, las = 2) #las=2样本名称为竖排列
#中间三个存在存在差异
# 从上面的图中可以看到有一个样本的中位数和其他样本有些不在一条水平显示,这个normalizeBetweenArrays函数,
# 可以把他拉回正常水平,normalizeBetweenArrays只能是在同一个数据集里面使用。这一步可以省略。
library(limma)
exprSet <- normalizeBetweenArrays(new_exprSet)
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
boxplot(exprSet, col = cols,main="expression value",las=2)
三、绘图
转置矩阵,生成s三列列矩阵
#绘图,转置矩阵,生成s三列列矩阵
library("reshape2")
exprSet_L <- melt(exprSet)
colnames(exprSet_L) <- c('probe','sample','value')
head(exprSet_L)
## probe sample value
## 1 PAX8 GSM1539877 6.459490
## 2 CYP2A6 GSM1539877 11.972124
## 3 SCARB1 GSM1539877 9.084149
## 4 TTLL12 GSM1539877 6.098230
## 5 CYTOR GSM1539877 4.608951
## 6 ADAM32 GSM1539877 3.766005
#样本分组信息
head(group_list)
## [1] Steatosis Steatosis
## [3] Non-alcoholic steatohepatitis Non-alcoholic steatohepatitis
## [5] Non-alcoholic steatohepatitis Non-alcoholic steatohepatitis
## Levels: Healthy Non-alcoholic steatohepatitis Steatosis
exprSet_L$group <- rep(group_list, each=nrow(exprSet))
head(exprSet_L)
## probe sample value group
## 1 PAX8 GSM1539877 6.459490 Steatosis
## 2 CYP2A6 GSM1539877 11.972124 Steatosis
## 3 SCARB1 GSM1539877 9.084149 Steatosis
## 4 TTLL12 GSM1539877 6.098230 Steatosis
## 5 CYTOR GSM1539877 4.608951 Steatosis
## 6 ADAM32 GSM1539877 3.766005 Steatosis
ggplot2
library(ggplot2)
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+
geom_boxplot()
print(p)
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+
geom_violin()
print(p)
p=ggplot(exprSet_L,aes(value,fill=group))+
geom_histogram(bins = 200)+
facet_wrap(~sample, nrow = 4)
print(p)
p=ggplot(exprSet_L,aes(value,col=group))+geom_density()+
facet_wrap(~sample, nrow = 4)
print(p)
p=ggplot(exprSet_L,aes(value,col=group))+geom_density()
print(p)
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
print(p)
p=p+stat_summary(fun.y="mean",geom="point",shape=23,size=3,fill="red")
print(p)
p=p+theme_set(theme_set(theme_bw(base_size=20)))
print(p)
p=p+theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1),axis.title=element_blank())
print(p)
hclust
## hclust
colnames(exprSet)=paste(group_list,1:ncol(exprSet),sep='_')
# Define nodePar
nodePar <- list(lab.cex = 0.4, pch = c(NA, 19),
cex = 0.7, col = "blue")
hc=hclust(dist(t(exprSet)))
par(mar=c(5,5,5,10))
png('hclust.tif',res=120)
plot(as.dendrogram(hc), nodePar = nodePar, horiz = TRUE)
dev.off()
## png
## 2
PCA
## PCA
library(ggfortify)
df=as.data.frame(t(exprSet))
df$group=group_list
png('pca.png',res=120)
pp <- autoplot(prcomp( df[,1:(ncol(df)-1)] ),
data=df,
colour = 'group')+
theme_bw()
pp
dev.off()
参考:生信技能树Jimmy大神的解读GEO数据存放规律及下载,一文就够