I. Network analysis of liver expression data from female mice: finding modules related to body weight
1.1 数据的导入
# Start from a clean workspace and move to the directory holding the input files.
# NOTE(review): rm(list = ls()) and a hard-coded setwd() are kept from the
# original tutorial; adjust the path to your own machine.
rm(list = ls())
setwd('E:/gsj/RWD/WGCNA/')
library(WGCNA)
# Original author's note: this step must not be skipped if variables are to be saved.
options(stringsAsFactors = FALSE)
# Read the expression data.
femData <- read.csv("LiverFemale3600.csv")
# Inspect the file layout; if it does not match what is expected, adjust the
# column selection below.
# (Fixed: the original called the non-existent function `name()`.)
names(femData)
head(femData)
# WGCNA expects the expression matrix with samples in rows and genes in columns,
# so drop the first 8 columns and transpose the rest.
datExpr0 <- as.data.frame(t(femData[, -c(1:8)]))
names(datExpr0) <- femData$substanceBXH
rownames(datExpr0) <- names(femData)[-c(1:8)]
dim(datExpr0)
1.2 检查是否有离群值
# Check for genes/samples with too many missing values; allOK is TRUE when
# nothing needs to be removed.
gsg <- goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK
# If the result is not TRUE, remove the offending genes and samples.
if (!gsg$allOK) {
  # Optionally, print the gene and sample names that were removed.
  # (Fixed: the original had a misplaced parenthesis that passed `collapse`
  # to names() and left printFlush() unclosed.)
  if (sum(!gsg$goodGenes) > 0) {
    printFlush(paste("Removing genes:",
                     paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
  }
  if (sum(!gsg$goodSamples) > 0) {
    printFlush(paste("Removing samples:",
                     paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))
  }
  # Remove the offending genes and samples from the data.
  datExpr0 <- datExpr0[gsg$goodSamples, gsg$goodGenes]
}
# Step two: cluster the samples to see whether any of them are outliers.
sampleTree <- hclust(dist(datExpr0), method = "average")
sizeGrWindow(12, 9)
par(cex = 0.6, mar = c(0, 4, 2, 0))
plot(
  sampleTree,
  main = "Sample clustering to detect outliers",
  sub = "",
  xlab = "",
  cex.lab = 1.5,
  cex.axis = 1.5,
  cex.main = 2
)
# Original author's note: in the resulting plot sample F2_221 is an outlier;
# either delete it by hand or choose a height cut-off to drop it.
abline(h = 15, col = "red")
# Cut the tree at the chosen height.
clust <- cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
table(clust)
# Cluster 1 holds the samples that are kept.
keepSamples <- (clust == 1)
datExpr <- datExpr0[keepSamples, ]
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
# After the steps above, datExpr is the object used for all downstream analysis.
1.3 导入临床信息
# Load the clinical trait data.
traitData <- read.csv("ClinicalTraits.csv")
dim(traitData)
names(traitData)
# Drop columns that are not needed.
allTraits <- traitData[, -c(31, 16)]
allTraits <- allTraits[, c(2, 11:36)]
dim(allTraits)
names(allTraits)
# Every sample has matching clinical information (body weight, length, ...).
# (Fixed: R is case-sensitive; the data viewer is `View()`, not `view()`.)
View(allTraits)
# Align the clinical information with the sample names of the expression data.
femaleSamples <- rownames(datExpr)
traitRows <- match(femaleSamples, allTraits$Mice)
datTraits <- allTraits[traitRows, -1]
rownames(datTraits) <- allTraits[traitRows, 1]
View(datTraits)
# Before network construction and module detection, visualise how the
# clinical traits relate to the sample dendrogram.
sampleTree2 <- hclust(dist(datExpr), method = "average")
# Map trait values to colours: white = low, red = high, grey = missing.
traitColors <- numbers2colors(datTraits, signed = FALSE)
# Plot the sample dendrogram with the trait colours underneath.
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits),
                    main = "Sample dendrogram and trait heatmap")
# If everything looks fine, save the objects for the next steps.
save(datExpr, datTraits, file = "FemaleLiver-01-dataInput.RData")
2.0 构建R环境
######################################
#如果你关掉了R,那么就需要运行下面几步
#rm(list = ls())
#setwd('E:/gsj/RWD/WGCNA/')
#library(WGCNA)
#options(stringsAsFactors = FALSE)#这一步不能省
#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程,但是如果是在本机上使用的话,这一步过程可以跳过
#lnames = load(file = "FemaleLiver-01-dataInput.RData")
#以上,成功导入第一步生成的datExpr和datTraits
2.1 确定合适的阈值
# There are several ways to build the gene network; this tutorial uses the
# automatic one-step construction.
# Candidate soft-thresholding powers: 1..10, then 12..20 in steps of 2.
powers <- c(1:10, seq(from = 12, to = 20, by = 2))
# Let pickSoftThreshold evaluate each candidate power.
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Show the results in two side-by-side panels.
sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 <- 0.9
# Panel 1: values from columns 2 and 3 of sft$fitIndices against the power.
plot(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  xlab = "Soft Threshold (power)",
  ylab = "Scale Free Topology Model Fit,signed R^2",
  type = "n",
  main = "Scale independence"
)
text(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  labels = powers,
  cex = cex1,
  col = "red"
)
abline(h = 0.90, col = "green")
# Panel 2: column 5 of sft$fitIndices against the power.
plot(
  sft$fitIndices[, 1],
  sft$fitIndices[, 5],
  xlab = "Soft Threshold (power)",
  ylab = "Mean Connectivity",
  type = "n",
  main = "Mean connectivity"
)
text(
  sft$fitIndices[, 1],
  sft$fitIndices[, 5],
  labels = powers,
  cex = cex1,
  col = "red"
)
# Original author's note: based on these two plots a power of 6 was chosen.
2.2 一步构建网络图和模块选择
# One-step network construction and module detection.
net <- blockwiseModules(
  datExpr,
  power = 6,
  TOMType = "unsigned",
  minModuleSize = 30,
  reassignThreshold = 0,
  mergeCutHeight = 0.25,
  numericLabels = TRUE,
  pamRespectsDendro = FALSE,
  saveTOMs = TRUE,
  saveTOMFileBase = "femaleMouseTOM",
  verbose = 3
)
# mergeCutHeight is the threshold used when merging modules.
# Original author's notes: the settings above are lower bounds and different
# data types call for different settings; if the computer cannot cope, use the
# block-wise approach for large data sets instead.
# Tabulate the module labels to see how many modules there are and how many
# genes each contains. Original author's note: their run gave 18 modules,
# labelled 1 to 18 in order of decreasing size, with label 0 holding the
# unassigned genes.
table(net$colors)
# The gene dendrogram is stored in net$dendrograms[[1]]. The code below draws
# it together with a colour band showing which module each gene belongs to.
# Set the window size.
sizeGrWindow(12, 9)
mergedColors <- labels2colors(net$colors)
# Draw the plot.
plotDendroAndColors(
  net$dendrograms[[1]],
  mergedColors[net$blockGenes[[1]]],
  "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05
)
# Original author's note: recutBlockwiseTrees can be used to change some of
# the parameters afterwards.
# Save the results.
moduleLabels <- net$colors
moduleColors <- labels2colors(net$colors)
MEs <- net$MEs
geneTree <- net$dendrograms[[1]]
save(
  MEs, moduleLabels, moduleColors, geneTree,
  file = "FemaleLiver-02-networkConstruction-auto.RData"
)
3.0 构建R环境
#rm(list = ls())
#setwd('E:/gsj/RWD/WGCNA/')
#library(WGCNA)
#options(stringsAsFactors = FALSE)#这一步不能省
#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程,但是如果是在本机上使用的话,这一步过程可以跳过
#lnames = load(file = "FemaleLiver-01-dataInput.RData")
#以上,成功导入第一步生成的datExpr和datTraits
#lnames = load(file = "FemaleLiver-02-networkConstruction-auto.RData");
#lnames
#以上,成功导入第二步生成的参数
3.1 计算模块和性状之间的相关性
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
# Recompute the module eigengenes using the colour labels and reorder them.
MEs0 <- moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs <- orderMEs(MEs0)  # eigengene matrix (samples x modules), one column per colour
moduleTraitCor <- cor(MEs, datTraits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)
# Build the cell labels showing the correlation with its p-value in brackets.
sizeGrWindow(10, 6)
textMatrix <- paste0(
  signif(moduleTraitCor, 2), "\n(",
  signif(moduleTraitPvalue, 1), ")"
)
dim(textMatrix) <- dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Draw the heatmap with the correlation values written into the cells.
labeledHeatmap(
  Matrix = moduleTraitCor,
  xLabels = names(datTraits),
  yLabels = names(MEs),
  ySymbols = names(MEs),
  colorLabels = FALSE,
  colors = greenWhiteRed(50),
  textMatrix = textMatrix,
  setStdMargins = FALSE,
  cex.text = 0.5,
  zlim = c(-1, 1),
  main = "Module-trait relationships"
)
# This heatmap is used to find modules related to the traits; the remaining
# steps concentrate on the weight trait.
3.2 确定相关模块中的显著相关基因
# Correlating modules with traits identifies the modules to study, but a
# module still contains many genes, so the most important genes have to be
# found in a further step.
# First, the gene-to-module correlation matrix.
weight <- as.data.frame(datTraits$weight_g)
names(weight) <- "weight"  # the weight column on its own, as a data.frame
modNames <- substring(names(MEs), 3)  # module names without their first two characters
# Correlation of every gene (columns of datExpr) with every module eigengene
# (columns of MEs).
geneModuleMembership <- as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneModuleMembership), nSamples)
)
names(geneModuleMembership) <- paste0("MM", modNames)
names(MMPvalue) <- paste0("p.MM", modNames)
# Next, the gene-to-trait correlation matrix.
# Original author's note: this can only be computed for continuous traits.
geneTraitSignificance <- as.data.frame(cor(datExpr, weight, use = "p"))
GSPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)
)
names(geneTraitSignificance) <- paste0("GS.", names(weight))
names(GSPvalue) <- paste0("p.GS.", names(weight))
3.3 筛选出和性状以及模块相关性都很高的基因
# Original author's note: in the heatmap the brown module is the one most
# strongly related to the weight trait.
module <- "brown"
column <- match(module, modNames)
moduleGenes <- moduleColors == module
sizeGrWindow(7, 7)
par(mfrow = c(1, 1))
# Absolute module membership against absolute gene significance for the genes
# of the chosen module.
verboseScatterplot(
  abs(geneModuleMembership[moduleGenes, column]),
  abs(geneTraitSignificance[moduleGenes, 1]),
  xlab = paste("Module Membership in", module, "module"),
  ylab = "Gene significance for body weight",
  main = "Module membership vs. gene significance\n",
  cex.main = 1.2,
  cex.lab = 1.2,
  cex.axis = 1.2,
  col = module
)
# Original author's notes: the plot relates gene significance to module
# membership within the brown module; genes highly correlated with the trait
# tend to sit in modules highly correlated with the trait. At this point the
# module most related to the trait of interest, and its genes, have been found.
3.4 将网络分析结果输出
# List all probe IDs.
names(datExpr)
# List the probes assigned to the brown module.
names(datExpr)[moduleColors == "brown"]
# Load an annotation file used to translate probe IDs.
annot <- read.csv(file = "GeneAnnotation.csv")
dim(annot)
names(annot)
probes <- names(datExpr)
probes2annot <- match(probes, annot$substanceBXH)
# Number of probes without an annotation entry.
sum(is.na(probes2annot))
# Build a data.frame holding probe ID, gene symbol, LocusLink ID, module
# colour, gene significance for weight and its p-value.
geneInfo0 <- data.frame(substanceBXH = probes,
                        geneSymbol = annot$gene_symbol[probes2annot],
                        LocusLinkID = annot$LocusLinkID[probes2annot],
                        moduleColor = moduleColors,
                        geneTraitSignificance,
                        GSPvalue)
# Module indices ordered by decreasing absolute correlation with weight.
modOrder <- order(-abs(cor(MEs, weight, use = "p")))
# Append the module-membership and p-value columns in that order.
# Iterating over modOrder itself (instead of 1:ncol(...)) visits the same
# indices in the same order and is safe when there are no modules.
for (mod in modOrder) {
  oldNames <- names(geneInfo0)
  geneInfo0 <- data.frame(geneInfo0,
                          geneModuleMembership[, mod],
                          MMPvalue[, mod])
  names(geneInfo0) <- c(oldNames,
                        paste0("MM.", modNames[mod]),
                        paste0("p.MM.", modNames[mod]))
}
# Sort genes by module colour, then by decreasing absolute gene significance.
geneOrder <- order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight))
geneInfo <- geneInfo0[geneOrder, ]
write.csv(geneInfo, file = "geneInfo.csv")
4.1 选取感兴趣的基因进行功能注释或者其他分析
# Load the probe annotation table.
annot <- read.csv(file = "GeneAnnotation.csv")
# Look up each probe of the data set in the annotation table.
probes <- names(datExpr)
probes2annot <- match(probes, annot$substanceBXH)
# LocusLink IDs in the same order as the probes.
allLLIDs <- annot$LocusLinkID[probes2annot]
# Modules of interest.
intModules <- c("brown", "red", "salmon")
for (module in intModules) {
  # Probes belonging to this module.
  modGenes <- (moduleColors == module)
  # Their LocusLink IDs.
  modLLIDs <- allLLIDs[modGenes]
  # One output file per module, without row or column names.
  fileName <- paste0("LocusLinkIDs-", module, ".txt")
  write.table(
    as.data.frame(modLLIDs),
    file = fileName,
    row.names = FALSE,
    col.names = FALSE
  )
}
# All probes in the analysis serve as background for the enrichment analysis.
fileName <- "LocusLinkIDs-all.txt"
write.table(
  as.data.frame(allLLIDs),
  file = fileName,
  row.names = FALSE,
  col.names = FALSE
)
# The files written above can be used for enrichment analysis of the modules
# of interest.
5.0 构建R环境
#rm(list = ls())
#setwd('E:/gsj/RWD/WGCNA/')
#library(WGCNA)
#options(stringsAsFactors = FALSE)#这一步不能省
#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程,但是如果是在本机上使用的话,这一步过程可以跳过
#lnames = load(file = "FemaleLiver-01-dataInput.RData")
#以上,成功导入第一步生成的datExpr和datTraits
#lnames = load(file = "FemaleLiver-02-networkConstruction-auto.RData");
#lnames
#以上,成功导入第二步生成的参数
5.1 可视化基因互作网络
# Network visualisation.
# Heatmap over all genes. Original author's note: this step is very slow and
# not recommended.
dissTOM <- 1 - TOMsimilarityFromExpr(datExpr, power = 6)
plotTOM <- dissTOM^7
diag(plotTOM) <- NA
sizeGrWindow(9, 9)
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
# Randomly select 400 genes.
# (Fixed: in the original the statements below were fused onto comment lines,
# so `select`, `selectTOM`, `selectTree` and `plotDiss` were never created and
# the final plot was never drawn.)
nSelect <- 400
# For reproducibility, we set the random seed.
set.seed(10)
# ncol(datExpr) is the number of genes; used directly so this section also
# works after a fresh session in which nGenes has not been recomputed.
select <- sample(ncol(datExpr), size = nSelect)
selectTOM <- dissTOM[select, select]
# There is no simple way of restricting a clustering tree to a subset of
# genes, so we must re-cluster.
selectTree <- hclust(as.dist(selectTOM), method = "average")
selectColors <- moduleColors[select]
# Open a graphical window.
sizeGrWindow(9, 9)
# Raising the dissimilarity to a power (7 here) changes the effective colour
# palette; setting the diagonal to NA also improves the clarity of the plot.
plotDiss <- selectTOM^7
diag(plotDiss) <- NA
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
5.2 可视化网络图中的eigengenes
# Cluster the modules by their eigengenes and draw a heatmap, including the
# trait of interest.
# Recalculate module eigengenes.
# (Fixed: the original computed this twice in a row with identical arguments.)
MEs <- moduleEigengenes(datExpr, moduleColors)$eigengenes
# Original author's note: this can only be computed for continuous traits.
weight <- as.data.frame(datTraits$weight_g)
names(weight) <- "weight"
# Add the weight column to the eigengenes and reorder.
MET <- orderMEs(cbind(MEs, weight))
# Combined plot of the relationships among the eigengenes and the trait.
# (Fixed: the original repeated this identical plot a second time.)
sizeGrWindow(5, 7.5)
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0, 4, 1, 2), marHeatmap = c(3, 4, 1, 2),
                      cex.lab = 0.8, xLabelsAngle = 90)
# The code below draws the two panels separately.
# Plot the dendrogram.
sizeGrWindow(6, 6)
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0, 4, 2, 0),
                      plotHeatmaps = FALSE)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot).
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3, 4, 2, 2),
                      plotDendrograms = FALSE, xLabelsAngle = 90)
6.0 构建R环境
#rm(list = ls())
#setwd('E:/gsj/RWD/WGCNA/')
#library(WGCNA)
#options(stringsAsFactors = FALSE)#这一步不能省
#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程,但是如果是在本机上使用的话,这一步过程可以跳过
#lnames = load(file = "FemaleLiver-01-dataInput.RData")
#以上,成功导入第一步生成的datExpr和datTraits
#lnames = load(file = "FemaleLiver-02-networkConstruction-auto.RData");
#lnames
#以上,成功导入第二步生成的参数
6.1 将数据导出VisANT
# Topological overlap for all genes, plus the probe annotation table.
TOM <- TOMsimilarityFromExpr(datExpr, power = 6)
annot <- read.csv(file = "GeneAnnotation.csv")
# Module to export.
module <- "brown"
# Probes belonging to that module and the matching part of the TOM.
probes <- names(datExpr)
inModule <- (moduleColors == module)
modProbes <- probes[inModule]
modTOM <- TOM[inModule, inModule]
dimnames(modTOM) <- list(modProbes, modProbes)
# Export the whole module in VisANT format.
vis <- exportNetworkToVisANT(
  modTOM,
  file = paste0("VisANTInput-", module, ".txt"),
  weighted = TRUE,
  threshold = 0,
  probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol)
)
# Original author's note: the brown module is large, so also export only the
# 30 probes ranked highest by softConnectivity.
nTop <- 30
IMConn <- softConnectivity(datExpr[, modProbes])
top <- (rank(-IMConn) <= nTop)
vis <- exportNetworkToVisANT(
  modTOM[top, top],
  file = paste0("VisANTInput-", module, "-top30.txt"),
  weighted = TRUE,
  threshold = 0,
  probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol)
)
6.2 导出Cytoscape
# Compute the topological overlap matrix (again, in case it is not in memory).
TOM <- TOMsimilarityFromExpr(datExpr, power = 6)
# Load the annotation table.
annot <- read.csv(file = "GeneAnnotation.csv")
# Modules to export.
modules <- c("brown", "red")
# Probes that belong to any of the chosen modules, and their gene symbols.
probes <- names(datExpr)
inModule <- moduleColors %in% modules
modProbes <- probes[inModule]
modGenes <- annot$gene_symbol[match(modProbes, annot$substanceBXH)]
# Matching part of the topological overlap matrix.
modTOM <- TOM[inModule, inModule]
dimnames(modTOM) <- list(modProbes, modProbes)
# Write edge and node list files that Cytoscape can read.
moduleTag <- paste(modules, collapse = "-")
cyt <- exportNetworkToCytoscape(
  modTOM,
  edgeFile = paste0("CytoscapeInput-edges-", moduleTag, ".txt"),
  nodeFile = paste0("CytoscapeInput-nodes-", moduleTag, ".txt"),
  weighted = TRUE,
  threshold = 0.02,
  nodeNames = modProbes,
  altNodeNames = modGenes,
  nodeAttr = moduleColors[inModule]
)