通过查看基于表达丰度进行癌症分子亚型聚类的文献中相关的部分,并进行学习,将相关方法应用于George的数据。
2020-European Urology-Identification of Differential Tumor Subtypes of T1 Bladder Cancer(待进行聚类数的确定和Marker/differentially expressed基因的筛选)
01.文献方法部分的阅读:
总概:确定聚类数与不同类中的样品组成>得到marker/differentially expressed基因
01.确定聚类数
Features的选择:log10转化的、以均值为中心的2945个蛋白编码基因的FPKM谱,这些基因的FPKM标准差高于所有基因标准差的85th。
工具:ConsensusClusterPlus 。
流程/具体参数:
①通过组合clusterAlg和distance两个参数变量(ConsensusClusterPlus R包中的变量),并评估结果进行选择,选择出3个聚类的结果(具体参数:a- 3 clusters: Pearson, PAM, fItem = 0.85, reps = 20,000; b- 4 clusters: Pearson, k-means, fItem = 0.875, reps = 5000; c- 5 clusters: Spearman, PAM, fItem = 0.875, reps = 50,000。 )
②进行病人样品生存分析,在三个结果中确定一个最优的分类。
③文章进一步鉴定了核心样品和非核心样品。
02.Marker/differentially expressed基因的筛选
[图片上传失败...(image-48153-1637819437099)]
工具:edgeR
流程:通过edgeR/FC进行不同亚型间差异基因的计算
02.流程的应用
01.流程的开展
library(CancerSubtypes)
library(ComplexHeatmap)
(.packages())
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2020-EBioMedicine-Identification")
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/George_2015_sum_FPKM.txt",header = T,sep = "\t")
library(ConsensusClusterPlus)
#01.features的选择
#按标准差选择出变化最大的前25%基因
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_85<-George_FPKM_sd[George_FPKM_sd$sd_value>quantile(George_FPKM_sd$sd_value,0.85),]
George_FPKM_85<-George_FPKM[rownames(George_FPKM_sd_85),]
#对基因进行log10转化后进行中心化
George_FPKM_85_log<-log10(George_FPKM_85+1)
George_FPKM_85_log10_scaled<-t(scale(t(George_FPKM_85_log)))
#02.使用ConsensusClusterPlus进行聚类,选用不同的distances和clusterAlg参数进行组合。
distances_value<-c("pearson", "spearman", "euclidean", "binary", "maximum", "canberra", "minkowski")
clusterAlg_value<-c("hc","pam","km")
for(i in distances_value){
for(j in clusterAlg_value){
file_name<-paste("George_FPKM_85_log10_scaled",i,j,sep="_")#文件夹名称的设置
assign(file_name,ConsensusClusterPlus(George_FPKM_85_log10_scaled,#生成相关文件夹
maxK=10,#设置最大聚类的类数
reps=20000,#重复次数
distance=i,#距离计算方法
clusterAlg=j,
title=file_name,
plot= 'png',
writeTable=T))
}
}
distances和clusterAlg参数图片红字中最后两部分。根据软件运行提示,km参数只能与euclidean联用,所以km与其他参数的组合结果实际应该为km参数与euclidean参数联用的结果。根据肉眼对所有的聚类结果进行观测,在K=3或K=4时有着较明确的类群之间的界限。
02.总结
文章对癌症的亚型聚类和亚型marker鉴定是分开的。文章先是进行聚类获得亚型后,再在亚型之间进行marker基因的鉴定。
文章首先进行了一致性聚类。在此阶段,文章组合使用了ConsensusClusterPlus 工具中lusterAlg和distance两个参数变量,得到了三个人工筛选后的结果,之后,进一步使用生存分析,确定其中一个结果用于后续分析。而用于聚类的基因,文章较为直接,使用的是变化top25%的基因。
之后,文章在获得亚型分类和亚型中样品归属的情况下,进行差异表达基因的分析,鉴定出亚型特异性的基因类群。
目前,通过相同方法,在George的样本中进行了聚类实验。依据聚类结果(排除个人认为较差的结果后),大部分结果在K=3、K=4时,亚型间差异性较明显(如George_FPKM_85_log10_scaled_spearman_pam、George_FPKM_85_log10_scaled_pearson_pam、George_FPKM_85_log10_scaled_pearson等)
03.根据个人判定出较优的聚类结果分析是否存在与一些SCLC marker基因的联动
library(CancerSubtypes)
library(ComplexHeatmap)
(.packages())
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2020-EBioMedicine-Identification")
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/George_2015_sum_FPKM.txt",header = T,sep = "\t")
library(ConsensusClusterPlus)
#01.features的选择
#按标准差选择出变化最大的前25%基因
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_85<-George_FPKM_sd[George_FPKM_sd$sd_value>quantile(George_FPKM_sd$sd_value,0.85),]
George_FPKM_85<-George_FPKM[rownames(George_FPKM_sd_85),]
#对基因进行log10转化后进行中心化
George_FPKM_85_log<-log10(George_FPKM_85+1)
George_FPKM_85_log10_scaled<-George_FPKM_85_log
for(i in colnames(George_FPKM_85_log)){
median_tmp<-median(George_FPKM_85_log[,i])
George_FPKM_85_log10_scaled[,i]<-George_FPKM_85_log[,i]-median_tmp
}
George_FPKM_85_log10_scaled<-as.matrix(George_FPKM_85_log10_scaled)
#02.使用ConsensusClusterPlus进行聚类,选用不同的distances和clusterAlg参数进行组合。
distances_value<-c("pearson", "spearman", "euclidean", "binary", "maximum", "canberra", "minkowski")
clusterAlg_value<-c("hc","pam","km")
for(i in distances_value){
for(j in clusterAlg_value){
file_name<-paste("George_FPKM_85_log10_scaled",i,j,sep="_")#文件夹名称的设置
assign(file_name,ConsensusClusterPlus(George_FPKM_85_log10_scaled,#生成相关文件夹
maxK=10,#设置最大聚类的类数
reps=20000,#重复次数
distance=i,#距离计算方法
clusterAlg=j,
title=file_name,
plot= 'png',
writeTable=T))
}
}
library(ComplexHeatmap)
library("stringr")
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2020-EBioMedicine-Identification")
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/00.patients/George_2015_sum_FPKM.txt",header = T,sep = "\t")
#读取文件夹名称
file_names = list.files(pattern = "George_FPKM_85_log")
#遍历文件夹,生成图片
for(i in file_names){#遍历文件夹
for(j in 2:10){#遍历文件夹下需要的文件
file_name_pre<-paste(i,".k=",j,sep="")#设置生成图片的前缀
K_names<-paste("k=",j,sep="")#获得K值
file_names1<-paste(i,K_names,"consensusClass.csv",sep=".")#设置consensusClass.csv名称
file_names2<-paste(i,K_names,"consensusMatrix.csv",sep=".")#设置consensusMatrix.csv名称
file_class<-paste(i,file_names1,sep="/")#设置路径+consensusClass.csv名称
class<-read.table(file_class,sep = ",")#读取consensusClass.csv文件
file_consensusMatrix<-paste(i,file_names2,sep="/")#设置路径+consensusMatrix.csv名称
consensusMatrix<-read.table(file_consensusMatrix,header = T,sep=",")#读取consensusMatrix.csv文件
consensusMatrix<-consensusMatrix[,-1]
colnames(consensusMatrix)<-class[,1]
rownames(consensusMatrix)<-class[,1]
class1<-class[order(class$V2),]#按聚类结果进行排序
George_FPKM<-George_FPKM[,class1[,1]]#表达丰度数据的colnames也按聚类结果进行排序
#设置Markers
Two_annotation_genes_FPKM_NEUROD1<-as.vector(as.matrix(George_FPKM["NEUROD1",]))
Two_annotation_genes_FPKM_NEUROD2<-as.vector(as.matrix(George_FPKM["NEUROD2",]))
Two_annotation_genes_FPKM_NEUROD4<-as.vector(as.matrix(George_FPKM["NEUROD4",]))
Two_annotation_genes_FPKM_NOTCH1<-as.vector(as.matrix(George_FPKM["NOTCH1",]))
Two_annotation_genes_FPKM_NOTCH2<-as.vector(as.matrix(George_FPKM["NOTCH2",]))
Two_annotation_genes_FPKM_NOTCH3<-as.vector(as.matrix(George_FPKM["NOTCH3",]))
Two_annotation_genes_FPKM_NOTCH4<-as.vector(as.matrix(George_FPKM["NOTCH4",]))
Two_annotation_genes_FPKM_POU2F3<-as.vector(as.matrix(George_FPKM["POU2F3",]))
Two_annotation_genes_FPKM_ASCL1<-as.vector(as.matrix(George_FPKM["ASCL1",]))
Two_annotation_genes_FPKM_YAP1<-as.vector(as.matrix(George_FPKM["YAP1",]))
Two_annotation_genes_FPKM_MYC<-as.vector(as.matrix(George_FPKM["MYC",]))
Two_annotation_genes_FPKM_CALCA<-as.vector(as.matrix(George_FPKM["CALCA",]))
Two_annotation_genes_FPKM_UCHL1<-as.vector(as.matrix(George_FPKM["UCHL1",]))
Two_annotation_genes_FPKM_INSM1<-as.vector(as.matrix(George_FPKM["INSM1",]))
Two_annotation_genes_FPKM_SCG2<-as.vector(as.matrix(George_FPKM["SCG2",]))
Two_annotation_genes_FPKM_REST<-as.vector(as.matrix(George_FPKM["REST",]))
Two_annotation_genes_FPKM_CDH5<-as.vector(as.matrix(George_FPKM["CDH5",]))
Two_annotation_genes_FPKM_EZH2<-as.vector(as.matrix(George_FPKM["EZH2",]))
Two_annotation_genes_FPKM_CD44<-as.vector(as.matrix(George_FPKM["CD44",]))
Two_annotation_genes_FPKM_EPCAM<-as.vector(as.matrix(George_FPKM["EPCAM",]))
Two_annotation_genes_FPKM_HES1<-as.vector(as.matrix(George_FPKM["HES1",]))
Two_annotation_genes_FPKM_DDC<-as.vector(as.matrix(George_FPKM["DDC",]))
Two_annotation_genes_FPKM_CLDN4<-as.vector(as.matrix(George_FPKM["CLDN4",]))
Two_annotation_genes_FPKM_GRP<-as.vector(as.matrix(George_FPKM["GRP",]))
Two_annotation_genes_FPKM_CHGA<-as.vector(as.matrix(George_FPKM["CHGA",]))
Two_annotation_genes_FPKM_CALCB<-as.vector(as.matrix(George_FPKM["CALCB",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM["NCAM1",]))
Two_annotation_genes_FPKM_CHGB<-as.vector(as.matrix(George_FPKM["CHGB",]))
Two_annotation_genes_FPKM_SYP<-as.vector(as.matrix(George_FPKM["SYP",]))
Two_annotation_genes_FPKM_TTF1<-as.vector(as.matrix(George_FPKM["TTF1",]))
Two_annotation_genes_FPKM_ATOH1<-as.vector(as.matrix(George_FPKM["ATOH1",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM["NCAM1",]))
Two_annotation_genes_FPKM_CD24<-as.vector(as.matrix(George_FPKM["CD24",]))
Two_annotation_genes_FPKM_CADM1<-as.vector(as.matrix(George_FPKM["CADM1",]))
Two_annotation_genes_FPKM_ALCAM<-as.vector(as.matrix(George_FPKM["ALCAM",]))
Two_annotation_genes_FPKM_CD151<-as.vector(as.matrix(George_FPKM["CD151",]))
Two_annotation_genes_FPKM_EPHA2<-as.vector(as.matrix(George_FPKM["EPHA2",]))
Two_annotation_genes_FPKM_NIPAL4<-as.vector(as.matrix(George_FPKM["NIPAL4",]))
#设置marker在热图中的注释
ha = HeatmapAnnotation(NEUROD1 = anno_barplot(Two_annotation_genes_FPKM_NEUROD1),
NEUROD2 = anno_barplot(Two_annotation_genes_FPKM_NEUROD2),
NEUROD4 = anno_barplot(Two_annotation_genes_FPKM_NEUROD4),
NOTCH1 = anno_barplot(Two_annotation_genes_FPKM_NOTCH1),
NOTCH2 = anno_barplot(Two_annotation_genes_FPKM_NOTCH2),
NOTCH3 = anno_barplot(Two_annotation_genes_FPKM_NOTCH3),
HES1= anno_barplot(Two_annotation_genes_FPKM_HES1),
YAP1= anno_barplot(Two_annotation_genes_FPKM_YAP1),
REST= anno_barplot(Two_annotation_genes_FPKM_REST),
#NOTCH4 = anno_barplot(Two_annotation_genes_FPKM_NOTCH4),
CD44= anno_barplot(Two_annotation_genes_FPKM_CD44),
POU2F3 = anno_barplot(Two_annotation_genes_FPKM_POU2F3),
MYC= anno_barplot(Two_annotation_genes_FPKM_MYC),
INSM1= anno_barplot(Two_annotation_genes_FPKM_INSM1),
SCG2= anno_barplot(Two_annotation_genes_FPKM_SCG2),
GRP= anno_barplot(Two_annotation_genes_FPKM_GRP),
ASCL1= anno_barplot(Two_annotation_genes_FPKM_ASCL1),
DDC= anno_barplot(Two_annotation_genes_FPKM_DDC),
CALCA= anno_barplot(Two_annotation_genes_FPKM_CALCA),
CALCB= anno_barplot(Two_annotation_genes_FPKM_CALCB),
CHGA= anno_barplot(Two_annotation_genes_FPKM_CHGA),
UCHL1= anno_barplot(Two_annotation_genes_FPKM_UCHL1),
CLDN4= anno_barplot(Two_annotation_genes_FPKM_CLDN4),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
EZH2= anno_barplot(Two_annotation_genes_FPKM_EZH2),
EPCAM= anno_barplot(Two_annotation_genes_FPKM_EPCAM),
CHGB= anno_barplot(Two_annotation_genes_FPKM_CHGB),
SYP= anno_barplot(Two_annotation_genes_FPKM_SYP),
TTF1= anno_barplot(Two_annotation_genes_FPKM_TTF1),
ATOH1= anno_barplot(Two_annotation_genes_FPKM_ATOH1),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
CD24= anno_barplot(Two_annotation_genes_FPKM_CD24),
CADM1= anno_barplot(Two_annotation_genes_FPKM_CADM1),
ALCAM= anno_barplot(Two_annotation_genes_FPKM_ALCAM),
CD151= anno_barplot(Two_annotation_genes_FPKM_CD151),
EPHA2= anno_barplot(Two_annotation_genes_FPKM_EPHA2),
NIPAL4= anno_barplot(Two_annotation_genes_FPKM_NIPAL4),
annotation_height = c(rep(1,35)))
#对聚类结果文件进行排序
for(m in rownames(consensusMatrix)){
consensusMatrix[m,]<-as.numeric(consensusMatrix[m,])
}
consensusMatrix<-consensusMatrix[class1$V1,class1$V1]
png.names<-paste(file_name_pre,"png",sep=".")#设置生成图片名称
D_F<-paste(i,png.names,sep="/")#设置路径+名称
consensusMatrix_heatmap<-Heatmap(consensusMatrix,cluster_rows = F,cluster_columns = F,bottom_annotation=ha,show_column_names=F)#绘制热图
png(filename = D_F,width=1200,height = 1900)#生成图片
print(consensusMatrix_heatmap)
dev.off()
}}
2021-Cancer Cell-Patterns of transcription factor programs and immune pathway activation define four major subtypes of SCLC with distinct therapeutic vulnerabilities
01.文献方法部分的阅读
总概:确定聚类数与不同类中的样品组成>得到marker/differentially expressed基因
01.确定聚类数
Features的选择:双峰指数≥1.5,平均表达值≥25th,sd value≥50th。
聚类工具/方法:NMF
流程/具体参数:
①计算符合双峰指数≥1.5,平均表达值≥25th,sd value≥50th条件下的基因。
②进行NMF聚类,使用cophenetic correlation coefficient maximization进行确定聚类数。但K=3与K=4下的评价几乎一样,进而都用于后续分析,在后续分析中进行人工筛选确定。
02.Marker基因的筛选
工具/方法:glmnet
其他:此外文章检测了亚型特异性的marker基因:使用glmnet来进行。①筛选最优的alpha和features。②确定markers:使用glmet采用惩罚极大似然法拟合广义线性模型。
而文章中的差异表达基因是通过NMF进行的。
相关知识阅读:
Lasso:
通过构建模型来预测数据的分类
https://cloud.tencent.com/developer/article/1621581
https://www.biowolf.cn/m/view.php?aid=326
https://www.zsccy.xyz/post/2018-09-13-glmnet1/
NMF:
https://zhuanlan.zhihu.com/p/22043930
https://cloud.tencent.com/developer/article/1806266
https://blog.csdn.net/weixin_39595008/article/details/111579636
//www.greatytc.com/p/f5cbbe771ce2
02.流程的应用
表达量的处理对后续分析有着较大的影响。
首先在表达量的处理过程中,文章提及是使用log2进行了转化,但是具体是log2(expression+1)还是仅仅log2(expression)没有提及(上图),但是由于NMF对数据要求是>0,所以这边使用log2(expression+1)进行。除此之外,文章还提及Byers 2012的文章,在该文章中,对基因表达数据进行了normalization(使用的方法是分位数标准化法,下图),但Gay 2021没明确是否有进行此步骤。所以下面的应用分多种情况进行:①不进行分位数标准化的流程应用;进行分位数标准化的应用;③不进行log2转化且不进行分位数标准化的流程应用;④进行分位数标准化并将标准化步骤调整至筛选获得基因后的应用。
01.流程的开展
①不进行分位数标准化的流程应用:
library(BimodalIndex)
library(NMF)
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2021-Cancer-Cell-Patterns")
#读取数据
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/George_2015_sum_FPKM.txt")
#①计算符合双峰指数≥1.5,平均表达值≥25th,sd value≥50th条件下的基因。
#计算平均值,并获得means >=25th的基因名
George_FPKM<-log2(George_FPKM+1)
George_FPKM_mean<-rowMeans(George_FPKM)
George_FPKM_Means_25th_names<-names(George_FPKM_mean[George_FPKM_mean>=quantile(George_FPKM_mean,0.25)])
#计算标准差,并获得sd>=50th的基因名
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_50th_names<-rownames(George_FPKM_sd[George_FPKM_sd$sd_value>=quantile(George_FPKM_sd$sd_value,0.5),])
#获得前sd>=50th的基因名和means>=25th的基因名的交集
intersect_means_sd_names<-intersect(George_FPKM_sd_50th_names,George_FPKM_Means_25th_names)
#计算交集基因的双峰指数,并获得双峰指数≥1.5的基因
George_FPKM_bi <- bimodalIndex(George_FPKM[intersect_means_sd_names,], verbose=FALSE)
dim(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
George_FPKM_bi_1.5<-rownames(George_FPKM_bi[George_FPKM_bi$BI>1.5,])
#获得符合条件的基因矩阵
George_FPKM_filtered<-George_FPKM[George_FPKM_bi_1.5,]
#NMF的运算过程由文章释放的code中获得,但仅仅就只有很小的以部分
for(ik in c(2:7)){
use.this.k <- ik
res <- nmf(George_FPKM_filtered, use.this.k, nrun=200 ,.options='vt')
hm <- consensusmap(res)
}
estim.r <- nmf(George_FPKM_filtered, 2:7, nrun=50, seed=123456)
plot(estim.r)
依据结果(本地化运行1),最优的聚类数应该为3,与文章结果描述中出现最优聚类数同时为3/4不同。可能是文章采取了分位数标准化?下面试一下分位数标准化后数据的应用,看是否能有类似与文章的结果。
②进行分位数标准化的应用
library(BimodalIndex)
library(NMF)
library("preprocessCore")
#read the FPKM values from George 2015.
George_FPKM<-read.table("George_2015_sum_FPKM.txt")
#normalization based upon quantiles using preprocessCore package
George_FPKM<-as.matrix(George_FPKM)
George_FPKM<-normalize.quantiles(George_FPKM,keep.names=T)
#calculating the bimodal index, mean FPKM value, stand deviation value across samples of genes. then the genes with bimodal index ≥ 1.5, mean FPKM value ≥ 25th and stand deviation value ≥ 50th were retained.
##calculating mean FPKM value of genes across samples and get gene names of genes with the mean FPKM value ≥ 25th
George_FPKM<-log2(George_FPKM+1)
George_FPKM_mean<-rowMeans(George_FPKM)
George_FPKM_Means_25th_names<-names(George_FPKM_mean[George_FPKM_mean>=quantile(George_FPKM_mean,0.25)])
##calculating stand deviation value of genes across samples and get gene names of genes with the stand deviation value ≥ 50th
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_50th_names<-rownames(George_FPKM_sd[George_FPKM_sd$sd_value>=quantile(George_FPKM_sd$sd_value,0.5),])
#obtain intersection names
intersect_means_sd_names<-intersect(George_FPKM_sd_50th_names,George_FPKM_Means_25th_names)
#calculating the bimodal index value and get gene names of genes with the bimodal index value > 1.5 from intersection genes
George_FPKM_bi <- bimodalIndex(George_FPKM[intersect_means_sd_names,], verbose=FALSE)
dim(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
George_FPKM_bi_1.5<-rownames(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
#obtaining the FPKM value of genes with bimodal index ≥ 1.5, mean FPKM value ≥ 25th and stand deviation value ≥ 50th
George_FPKM_filtered<-George_FPKM[George_FPKM_bi_1.5,]
dim(George_FPKM_filtered)
#running NMF
for(ik in c(2:7)){
use.this.k <- ik
res <- nmf(George_FPKM_filtered, use.this.k, nrun=200 ,.options='vt')
hm <- consensusmap(res)
}
estim.r <- nmf(George_FPKM_filtered, 2:7, nrun=50, seed=123456)
plot(estim.r)
依据结果(本地化运行2),最优的聚类数仍应该为2,与文章结果描述中出现最优聚类数同时为3/4不同。进一步猜测是否有可能文章没有使用log2对表达量进行转化,以下开展不进行log2转化的本地化运行3。
③不进行log2转化且不进行分位数标准化的流程应用:
library(BimodalIndex)
library(NMF)
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2021-Cancer-Cell-Patterns")
#读取数据
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/George_2015_sum_FPKM.txt")
#①计算符合双峰指数≥1.5,平均表达值≥25th,sd value≥50th条件下的基因。
#计算平均值,并获得means >=25th的基因名
George_FPKM_mean<-rowMeans(George_FPKM)
George_FPKM_Means_25th_names<-names(George_FPKM_mean[George_FPKM_mean>=quantile(George_FPKM_mean,0.25)])
#计算标准差,并获得sd>=50th的基因名
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_50th_names<-rownames(George_FPKM_sd[George_FPKM_sd$sd_value>=quantile(George_FPKM_sd$sd_value,0.5),])
#获得前sd>=50th的基因名和means>=25th的基因名的交集
intersect_means_sd_names<-intersect(George_FPKM_sd_50th_names,George_FPKM_Means_25th_names)
#计算交集基因的双峰指数,并获得双峰指数≥1.5的基因
George_FPKM_bi <- bimodalIndex(George_FPKM[intersect_means_sd_names,], verbose=FALSE)
dim(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
George_FPKM_bi_1.5<-rownames(George_FPKM_bi[George_FPKM_bi$BI>1.5,])
#获得符合条件的基因矩阵
George_FPKM_filtered<-George_FPKM[George_FPKM_bi_1.5,]
dim(George_FPKM_filtered)
#NMF的运算过程由文章释放的code中获得,但仅仅就只有很小的一部分,且没有相关注释文件
for(ik in c(2:7)){
use.this.k <- ik
res <- nmf(George_FPKM_filtered, use.this.k, nrun=200 ,.options='vt')
hm <- consensusmap(res)
}
estim.r <- nmf(George_FPKM_filtered, 2:7, nrun=50, seed=123456)
plot(estim.r)
依据结果(本地化运行3),最优的聚类数仍应该为2,与文章结果描述中出现最优聚类数同时为3/4不同。在经过三次不同猜测下的运行,仍与文章结果相差较远,猜测可能是源数据差异、R包版本,或者是标准化步骤开展位置的问题,下面将标准化步骤进行调整,进行一次运行。
④进行分位数标准化并将标准化步骤调整至筛选获得基因后的应用
library(BimodalIndex)
library(NMF)
library("preprocessCore")
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2021-Cancer-Cell-Patterns")
#读取数据
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/George_2015_sum_FPKM.txt")
#①计算符合双峰指数≥1.5,平均表达值≥25th,sd value≥50th条件下的基因。
#计算平均值,并获得means≥25th的基因名
George_FPKM<-log2(George_FPKM+1)
George_FPKM_mean<-rowMeans(George_FPKM)
George_FPKM_Means_25th_names<-names(George_FPKM_mean[George_FPKM_mean>=quantile(George_FPKM_mean,0.25)])
#计算标准差,并获得前sd 50th的基因名
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_50th_names<-rownames(George_FPKM_sd[George_FPKM_sd$sd_value>=quantile(George_FPKM_sd$sd_value,0.5),])
#获得前sd 25th的基因名和means前50th的基因名的交集
intersect_means_sd_names<-intersect(George_FPKM_sd_50th_names,George_FPKM_Means_25th_names)
#计算交集基因的双峰指数,并获得双峰指数≥1.5的基因
George_FPKM_bi <- bimodalIndex(George_FPKM[intersect_means_sd_names,], verbose=FALSE)
dim(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
George_FPKM_bi_1.5<-rownames(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
#获得符合条件的基因矩阵
George_FPKM_filtered<-George_FPKM[George_FPKM_bi_1.5,]
#查看基因数量
dim(George_FPKM_filtered)
#使用preprocessCore进行分位数标准化
George_FPKM_filtered<-as.matrix(George_FPKM_filtered)
George_FPKM_filtered<-normalize.quantiles(George_FPKM_filtered,keep.names=T)
#NMF的运算过程由文章释放的code中获得,但仅仅就只有很小的以部分
for(ik in c(2:7)){
use.this.k <- ik
res <- nmf(George_FPKM_filtered, use.this.k, nrun=200 ,.options='vt')
hm <- consensusmap(res)
}
estim.r <- nmf(George_FPKM_filtered, 2:7, nrun=50, seed=123456)
plot(estim.r)
依据结果(本地化运行4),最优的聚类数应该为3/4,与文章结果描述中出现最优聚类数同时为3/4相似,但在整个折线图仍有着差异(k=6)。
02.总结
文章对癌症的亚型聚类和亚型marker鉴定是分开的。文章先是进行聚类获得亚型后,再在亚型之间进行marker基因的鉴定。
文章首先进行了NMF聚类。在此阶段,文章使用NMF聚类后,认为K=3或K=4的时候为同时最优,之后通过其他分析和认为判定,选取K=4。用于聚类的基因,文章文章的限定条件包括双峰指数、平均表达值、标准差。
之后,文章在获得亚型分类和亚型中样品归属的情况下,进行marker基因的鉴定,鉴定出亚型特异性的基因类群。
目前,通过相同方法,在George的样本中进行了聚类实验。因为文章中对表达数据处理和聚类分析的描述并不是特别的详细(特别是在表达数据处理部分),有部分过程需要进行猜测。最后在 本地化运行4 中获得了与文章中描述的”最优同时为3/4“,但整个折线图仍有差异。其他三次运行结果中,有两次最优为K=2,一次最优为K=3(依据cophenetic值的大小进行判断)。而我同时看到有人进行最优K值的判断是当cophenetic开始下降或者下降最大前的K为最优的K。
03.根据个人判定出较优的聚类结果分析是否存在与一些SCLC marker基因的联动
1-
library(BimodalIndex)
library(NMF)
library(ComplexHeatmap)
library("stringr")
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2021-Cancer-Cell-Patterns")
#读取数据
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/George_2015_sum_FPKM.txt")
#①计算符合双峰指数≥1.5,平均表达值≥25th,sd value≥50th条件下的基因。
George_FPKM1<-George_FPKM
#计算平均值,并获得means >=25th的基因名
George_FPKM<-log2(George_FPKM+1)
George_FPKM_mean<-rowMeans(George_FPKM)
George_FPKM_Means_25th_names<-names(George_FPKM_mean[George_FPKM_mean>=quantile(George_FPKM_mean,0.25)])
#计算标准差,并获得sd>=50th的基因名
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_50th_names<-rownames(George_FPKM_sd[George_FPKM_sd$sd_value>=quantile(George_FPKM_sd$sd_value,0.5),])
#获得前sd>=50th的基因名和means>=25th的基因名的交集
intersect_means_sd_names<-intersect(George_FPKM_sd_50th_names,George_FPKM_Means_25th_names)
#计算交集基因的双峰指数,并获得双峰指数≥1.5的基因
George_FPKM_bi <- bimodalIndex(George_FPKM[intersect_means_sd_names,], verbose=FALSE)
dim(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
#774
George_FPKM_bi_1.5<-rownames(George_FPKM_bi[George_FPKM_bi$BI>1.5,])
#获得符合条件的基因矩阵
George_FPKM_filtered<-George_FPKM[George_FPKM_bi_1.5,]
estim.r <- nmf(George_FPKM_filtered, 2:7, nrun=1000, seed=123456)
png("1-estim.r.plot.png",width=500,height = 500)
plot(estim.r)
dev.off()
#
for(ik in c(2:7)){
use.this.k <- ik
res <- nmf(George_FPKM_filtered, use.this.k, nrun=1000 ,.options='vt')
consensusmap_name<-paste("1-k=",ik," consensusmap.png",sep="")
png(consensusmap_name,width=500,height = 500)
consensusmap(res)
dev.off()
#获得分组信息
group<-predict(res,what="chc")
group_names<-paste("1-k=",ik,"group.txt",sep=".")
write.table(group,group_names,sep="\t")
sorted_group<-names(sort(group))
George_FPKM1<-George_FPKM1[,sorted_group]#表达丰度数据的colnames也按聚类结果进行排序
#设置Markers
Two_annotation_genes_FPKM_NEUROD1<-as.vector(as.matrix(George_FPKM1["NEUROD1",]))
Two_annotation_genes_FPKM_NEUROD2<-as.vector(as.matrix(George_FPKM1["NEUROD2",]))
Two_annotation_genes_FPKM_NEUROD4<-as.vector(as.matrix(George_FPKM1["NEUROD4",]))
Two_annotation_genes_FPKM_NOTCH1<-as.vector(as.matrix(George_FPKM1["NOTCH1",]))
Two_annotation_genes_FPKM_NOTCH2<-as.vector(as.matrix(George_FPKM1["NOTCH2",]))
Two_annotation_genes_FPKM_NOTCH3<-as.vector(as.matrix(George_FPKM1["NOTCH3",]))
Two_annotation_genes_FPKM_NOTCH4<-as.vector(as.matrix(George_FPKM1["NOTCH4",]))
Two_annotation_genes_FPKM_POU2F3<-as.vector(as.matrix(George_FPKM1["POU2F3",]))
Two_annotation_genes_FPKM_ASCL1<-as.vector(as.matrix(George_FPKM1["ASCL1",]))
Two_annotation_genes_FPKM_YAP1<-as.vector(as.matrix(George_FPKM1["YAP1",]))
Two_annotation_genes_FPKM_MYC<-as.vector(as.matrix(George_FPKM1["MYC",]))
Two_annotation_genes_FPKM_CALCA<-as.vector(as.matrix(George_FPKM1["CALCA",]))
Two_annotation_genes_FPKM_UCHL1<-as.vector(as.matrix(George_FPKM1["UCHL1",]))
Two_annotation_genes_FPKM_INSM1<-as.vector(as.matrix(George_FPKM1["INSM1",]))
Two_annotation_genes_FPKM_SCG2<-as.vector(as.matrix(George_FPKM1["SCG2",]))
Two_annotation_genes_FPKM_REST<-as.vector(as.matrix(George_FPKM1["REST",]))
Two_annotation_genes_FPKM_CDH5<-as.vector(as.matrix(George_FPKM1["CDH5",]))
Two_annotation_genes_FPKM_EZH2<-as.vector(as.matrix(George_FPKM1["EZH2",]))
Two_annotation_genes_FPKM_CD44<-as.vector(as.matrix(George_FPKM1["CD44",]))
Two_annotation_genes_FPKM_EPCAM<-as.vector(as.matrix(George_FPKM1["EPCAM",]))
Two_annotation_genes_FPKM_HES1<-as.vector(as.matrix(George_FPKM1["HES1",]))
Two_annotation_genes_FPKM_DDC<-as.vector(as.matrix(George_FPKM1["DDC",]))
Two_annotation_genes_FPKM_CLDN4<-as.vector(as.matrix(George_FPKM1["CLDN4",]))
Two_annotation_genes_FPKM_GRP<-as.vector(as.matrix(George_FPKM1["GRP",]))
Two_annotation_genes_FPKM_CHGA<-as.vector(as.matrix(George_FPKM1["CHGA",]))
Two_annotation_genes_FPKM_CALCB<-as.vector(as.matrix(George_FPKM1["CALCB",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CHGB<-as.vector(as.matrix(George_FPKM1["CHGB",]))
Two_annotation_genes_FPKM_SYP<-as.vector(as.matrix(George_FPKM1["SYP",]))
Two_annotation_genes_FPKM_TTF1<-as.vector(as.matrix(George_FPKM1["TTF1",]))
Two_annotation_genes_FPKM_ATOH1<-as.vector(as.matrix(George_FPKM1["ATOH1",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CD24<-as.vector(as.matrix(George_FPKM1["CD24",]))
Two_annotation_genes_FPKM_CADM1<-as.vector(as.matrix(George_FPKM1["CADM1",]))
Two_annotation_genes_FPKM_ALCAM<-as.vector(as.matrix(George_FPKM1["ALCAM",]))
Two_annotation_genes_FPKM_CD151<-as.vector(as.matrix(George_FPKM1["CD151",]))
Two_annotation_genes_FPKM_EPHA2<-as.vector(as.matrix(George_FPKM1["EPHA2",]))
Two_annotation_genes_FPKM_NIPAL4<-as.vector(as.matrix(George_FPKM1["NIPAL4",]))
#设置marker在热图中的注释
ha = HeatmapAnnotation(NEUROD1 = anno_barplot(Two_annotation_genes_FPKM_NEUROD1),
NEUROD2 = anno_barplot(Two_annotation_genes_FPKM_NEUROD2),
NEUROD4 = anno_barplot(Two_annotation_genes_FPKM_NEUROD4),
NOTCH1 = anno_barplot(Two_annotation_genes_FPKM_NOTCH1),
NOTCH2 = anno_barplot(Two_annotation_genes_FPKM_NOTCH2),
NOTCH3 = anno_barplot(Two_annotation_genes_FPKM_NOTCH3),
HES1= anno_barplot(Two_annotation_genes_FPKM_HES1),
YAP1= anno_barplot(Two_annotation_genes_FPKM_YAP1),
REST= anno_barplot(Two_annotation_genes_FPKM_REST),
#NOTCH4 = anno_barplot(Two_annotation_genes_FPKM_NOTCH4),
CD44= anno_barplot(Two_annotation_genes_FPKM_CD44),
POU2F3 = anno_barplot(Two_annotation_genes_FPKM_POU2F3),
MYC= anno_barplot(Two_annotation_genes_FPKM_MYC),
INSM1= anno_barplot(Two_annotation_genes_FPKM_INSM1),
SCG2= anno_barplot(Two_annotation_genes_FPKM_SCG2),
GRP= anno_barplot(Two_annotation_genes_FPKM_GRP),
ASCL1= anno_barplot(Two_annotation_genes_FPKM_ASCL1),
DDC= anno_barplot(Two_annotation_genes_FPKM_DDC),
CALCA= anno_barplot(Two_annotation_genes_FPKM_CALCA),
CALCB= anno_barplot(Two_annotation_genes_FPKM_CALCB),
CHGA= anno_barplot(Two_annotation_genes_FPKM_CHGA),
UCHL1= anno_barplot(Two_annotation_genes_FPKM_UCHL1),
CLDN4= anno_barplot(Two_annotation_genes_FPKM_CLDN4),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
EZH2= anno_barplot(Two_annotation_genes_FPKM_EZH2),
EPCAM= anno_barplot(Two_annotation_genes_FPKM_EPCAM),
CHGB= anno_barplot(Two_annotation_genes_FPKM_CHGB),
SYP= anno_barplot(Two_annotation_genes_FPKM_SYP),
TTF1= anno_barplot(Two_annotation_genes_FPKM_TTF1),
ATOH1= anno_barplot(Two_annotation_genes_FPKM_ATOH1),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
CD24= anno_barplot(Two_annotation_genes_FPKM_CD24),
CADM1= anno_barplot(Two_annotation_genes_FPKM_CADM1),
ALCAM= anno_barplot(Two_annotation_genes_FPKM_ALCAM),
CD151= anno_barplot(Two_annotation_genes_FPKM_CD151),
EPHA2= anno_barplot(Two_annotation_genes_FPKM_EPHA2),
NIPAL4= anno_barplot(Two_annotation_genes_FPKM_NIPAL4),
annotation_height = c(rep(1,35)))
#对聚类结果文件进行排序
sorted_consensus<-res@consensus[sorted_group,sorted_group]
res_consensus1<-sorted_consensus
sorted_consensus_names<-paste("1-1res_ik=",ik,"consensus.txt",sep=".")
write.table(res_consensus1,sorted_consensus_names,sep="\t")
consensusMatrix_heatmap<-Heatmap(sorted_consensus,
cluster_rows = F,
cluster_columns = F,
bottom_annotation=ha,
show_column_names=F)#绘制热图
png.names<-paste("1-k=",ik,"png",sep=".")#设置生成图片名称
png(filename = png.names,width=1200,height = 1900)#生成图片
print(consensusMatrix_heatmap)
dev.off()
}
#并进行一致性聚类
library(ConsensusClusterPlus)
distances_value<-c("pearson", "spearman", "euclidean", "binary", "maximum", "canberra", "minkowski")
clusterAlg_value<-c("hc","pam","km")
George_FPKM_filtered<-as.matrix(George_FPKM_filtered)
for(i in distances_value){
for(j in clusterAlg_value){
file_name<-paste("1-George_FPKM_filtered",i,j,sep="_")#文件夹名称的设置
assign(file_name,ConsensusClusterPlus(George_FPKM_filtered,#生成相关文件夹
maxK=10,#设置最大聚类的类数
reps=20000,#重复次数
distance=i,#距离计算方法
clusterAlg=j,
title=file_name,
plot= 'png',
writeTable=T))
}
}
#读取文件夹名称
file_names = list.files(pattern = "1-George_FPKM_filtered")
#遍历文件夹,生成图片
for(i in file_names){#遍历文件夹
for(j in 2:10){#遍历文件夹下需要的文件
file_name_pre<-paste(i,".k=",j,sep="")#设置生成图片的前缀
K_names<-paste("k=",j,sep="")#获得K值
file_names1<-paste(i,K_names,"consensusClass.csv",sep=".")#设置consensusClass.csv名称
file_names2<-paste(i,K_names,"consensusMatrix.csv",sep=".")#设置consensusMatrix.csv名称
file_class<-paste(i,file_names1,sep="/")#设置路径+consensusClass.csv名称
class<-read.table(file_class,sep = ",")#读取consensusClass.csv文件
file_consensusMatrix<-paste(i,file_names2,sep="/")#设置路径+consensusMatrix.csv名称
consensusMatrix<-read.table(file_consensusMatrix,header = T,sep=",")#读取consensusMatrix.csv文件
consensusMatrix<-consensusMatrix[,-1]
colnames(consensusMatrix)<-class[,1]
rownames(consensusMatrix)<-class[,1]
class1<-class[order(class$V2),]#按聚类结果进行排序
George_FPKM1<-George_FPKM1[,class1[,1]]#表达丰度数据的colnames也按聚类结果进行排序
#设置Markers
Two_annotation_genes_FPKM_NEUROD1<-as.vector(as.matrix(George_FPKM1["NEUROD1",]))
Two_annotation_genes_FPKM_NEUROD2<-as.vector(as.matrix(George_FPKM1["NEUROD2",]))
Two_annotation_genes_FPKM_NEUROD4<-as.vector(as.matrix(George_FPKM1["NEUROD4",]))
Two_annotation_genes_FPKM_NOTCH1<-as.vector(as.matrix(George_FPKM1["NOTCH1",]))
Two_annotation_genes_FPKM_NOTCH2<-as.vector(as.matrix(George_FPKM1["NOTCH2",]))
Two_annotation_genes_FPKM_NOTCH3<-as.vector(as.matrix(George_FPKM1["NOTCH3",]))
Two_annotation_genes_FPKM_NOTCH4<-as.vector(as.matrix(George_FPKM1["NOTCH4",]))
Two_annotation_genes_FPKM_POU2F3<-as.vector(as.matrix(George_FPKM1["POU2F3",]))
Two_annotation_genes_FPKM_ASCL1<-as.vector(as.matrix(George_FPKM1["ASCL1",]))
Two_annotation_genes_FPKM_YAP1<-as.vector(as.matrix(George_FPKM1["YAP1",]))
Two_annotation_genes_FPKM_MYC<-as.vector(as.matrix(George_FPKM1["MYC",]))
Two_annotation_genes_FPKM_CALCA<-as.vector(as.matrix(George_FPKM1["CALCA",]))
Two_annotation_genes_FPKM_UCHL1<-as.vector(as.matrix(George_FPKM1["UCHL1",]))
Two_annotation_genes_FPKM_INSM1<-as.vector(as.matrix(George_FPKM1["INSM1",]))
Two_annotation_genes_FPKM_SCG2<-as.vector(as.matrix(George_FPKM1["SCG2",]))
Two_annotation_genes_FPKM_REST<-as.vector(as.matrix(George_FPKM1["REST",]))
Two_annotation_genes_FPKM_CDH5<-as.vector(as.matrix(George_FPKM1["CDH5",]))
Two_annotation_genes_FPKM_EZH2<-as.vector(as.matrix(George_FPKM1["EZH2",]))
Two_annotation_genes_FPKM_CD44<-as.vector(as.matrix(George_FPKM1["CD44",]))
Two_annotation_genes_FPKM_EPCAM<-as.vector(as.matrix(George_FPKM1["EPCAM",]))
Two_annotation_genes_FPKM_HES1<-as.vector(as.matrix(George_FPKM1["HES1",]))
Two_annotation_genes_FPKM_DDC<-as.vector(as.matrix(George_FPKM1["DDC",]))
Two_annotation_genes_FPKM_CLDN4<-as.vector(as.matrix(George_FPKM1["CLDN4",]))
Two_annotation_genes_FPKM_GRP<-as.vector(as.matrix(George_FPKM1["GRP",]))
Two_annotation_genes_FPKM_CHGA<-as.vector(as.matrix(George_FPKM1["CHGA",]))
Two_annotation_genes_FPKM_CALCB<-as.vector(as.matrix(George_FPKM1["CALCB",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CHGB<-as.vector(as.matrix(George_FPKM1["CHGB",]))
Two_annotation_genes_FPKM_SYP<-as.vector(as.matrix(George_FPKM1["SYP",]))
Two_annotation_genes_FPKM_TTF1<-as.vector(as.matrix(George_FPKM1["TTF1",]))
Two_annotation_genes_FPKM_ATOH1<-as.vector(as.matrix(George_FPKM1["ATOH1",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CD24<-as.vector(as.matrix(George_FPKM1["CD24",]))
Two_annotation_genes_FPKM_CADM1<-as.vector(as.matrix(George_FPKM1["CADM1",]))
Two_annotation_genes_FPKM_ALCAM<-as.vector(as.matrix(George_FPKM1["ALCAM",]))
Two_annotation_genes_FPKM_CD151<-as.vector(as.matrix(George_FPKM1["CD151",]))
Two_annotation_genes_FPKM_EPHA2<-as.vector(as.matrix(George_FPKM1["EPHA2",]))
Two_annotation_genes_FPKM_NIPAL4<-as.vector(as.matrix(George_FPKM1["NIPAL4",]))
#设置marker在热图中的注释
ha = HeatmapAnnotation(NEUROD1 = anno_barplot(Two_annotation_genes_FPKM_NEUROD1),
NEUROD2 = anno_barplot(Two_annotation_genes_FPKM_NEUROD2),
NEUROD4 = anno_barplot(Two_annotation_genes_FPKM_NEUROD4),
NOTCH1 = anno_barplot(Two_annotation_genes_FPKM_NOTCH1),
NOTCH2 = anno_barplot(Two_annotation_genes_FPKM_NOTCH2),
NOTCH3 = anno_barplot(Two_annotation_genes_FPKM_NOTCH3),
HES1= anno_barplot(Two_annotation_genes_FPKM_HES1),
YAP1= anno_barplot(Two_annotation_genes_FPKM_YAP1),
REST= anno_barplot(Two_annotation_genes_FPKM_REST),
#NOTCH4 = anno_barplot(Two_annotation_genes_FPKM_NOTCH4),
CD44= anno_barplot(Two_annotation_genes_FPKM_CD44),
POU2F3 = anno_barplot(Two_annotation_genes_FPKM_POU2F3),
MYC= anno_barplot(Two_annotation_genes_FPKM_MYC),
INSM1= anno_barplot(Two_annotation_genes_FPKM_INSM1),
SCG2= anno_barplot(Two_annotation_genes_FPKM_SCG2),
GRP= anno_barplot(Two_annotation_genes_FPKM_GRP),
ASCL1= anno_barplot(Two_annotation_genes_FPKM_ASCL1),
DDC= anno_barplot(Two_annotation_genes_FPKM_DDC),
CALCA= anno_barplot(Two_annotation_genes_FPKM_CALCA),
CALCB= anno_barplot(Two_annotation_genes_FPKM_CALCB),
CHGA= anno_barplot(Two_annotation_genes_FPKM_CHGA),
UCHL1= anno_barplot(Two_annotation_genes_FPKM_UCHL1),
CLDN4= anno_barplot(Two_annotation_genes_FPKM_CLDN4),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
EZH2= anno_barplot(Two_annotation_genes_FPKM_EZH2),
EPCAM= anno_barplot(Two_annotation_genes_FPKM_EPCAM),
CHGB= anno_barplot(Two_annotation_genes_FPKM_CHGB),
SYP= anno_barplot(Two_annotation_genes_FPKM_SYP),
TTF1= anno_barplot(Two_annotation_genes_FPKM_TTF1),
ATOH1= anno_barplot(Two_annotation_genes_FPKM_ATOH1),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
CD24= anno_barplot(Two_annotation_genes_FPKM_CD24),
CADM1= anno_barplot(Two_annotation_genes_FPKM_CADM1),
ALCAM= anno_barplot(Two_annotation_genes_FPKM_ALCAM),
CD151= anno_barplot(Two_annotation_genes_FPKM_CD151),
EPHA2= anno_barplot(Two_annotation_genes_FPKM_EPHA2),
NIPAL4= anno_barplot(Two_annotation_genes_FPKM_NIPAL4),
annotation_height = c(rep(1,35)))
#对聚类结果文件进行排序
for(m in rownames(consensusMatrix)){
consensusMatrix[m,]<-as.numeric(consensusMatrix[m,])
}
consensusMatrix<-consensusMatrix[class1$V1,class1$V1]
png.names<-paste(file_name_pre,"png",sep=".")#设置生成图片名称
D_F<-paste(i,png.names,sep="/")#设置路径+名称
consensusMatrix_heatmap<-Heatmap(consensusMatrix,cluster_rows = F,cluster_columns = F,bottom_annotation=ha,show_column_names=F)#绘制热图
png(filename = D_F,width=1200,height = 1900)#生成图片
print(consensusMatrix_heatmap)
dev.off()
}}
2-
rm(list=ls())
library("preprocessCore")
library(BimodalIndex)
library(NMF)
library(ComplexHeatmap)
library("stringr")
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2021-Cancer-Cell-Patterns")
#读取数据
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/George_2015_sum_FPKM.txt")
#使用preprocessCore进行分位数标准化
George_FPKM<-as.matrix(George_FPKM)
George_FPKM<-normalize.quantiles(George_FPKM,keep.names=T)
#①计算符合双峰指数≥1.5,平均表达值≥25th,sd value≥50th条件下的基因。
George_FPKM1<-George_FPKM
#计算平均值,并获得means >=25th的基因名
George_FPKM<-log2(George_FPKM+1)
George_FPKM_mean<-rowMeans(George_FPKM)
George_FPKM_Means_25th_names<-names(George_FPKM_mean[George_FPKM_mean>=quantile(George_FPKM_mean,0.25)])
#计算标准差,并获得sd>=50th的基因名
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_50th_names<-rownames(George_FPKM_sd[George_FPKM_sd$sd_value>=quantile(George_FPKM_sd$sd_value,0.5),])
#获得前sd>=50th的基因名和means>=25th的基因名的交集
intersect_means_sd_names<-intersect(George_FPKM_sd_50th_names,George_FPKM_Means_25th_names)
#计算交集基因的双峰指数,并获得双峰指数≥1.5的基因
George_FPKM_bi <- bimodalIndex(George_FPKM[intersect_means_sd_names,], verbose=FALSE)
dim(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
#774
George_FPKM_bi_1.5<-rownames(George_FPKM_bi[George_FPKM_bi$BI>1.5,])
#获得符合条件的基因矩阵
George_FPKM_filtered<-George_FPKM[George_FPKM_bi_1.5,]
estim.r <- nmf(George_FPKM_filtered, 2:7, nrun=1000, seed=123456)
png("2-estim.r.plot.png",width=500,height = 500)
plot(estim.r)
dev.off()
#
for(ik in c(2:7)){
use.this.k <- ik
res <- nmf(George_FPKM_filtered, use.this.k, nrun=1000 ,.options='vt')
consensusmap_name<-paste("2-k=",ik," consensusmap.png",sep="")
png(consensusmap_name,width=500,height = 500)
consensusmap(res)
dev.off()
#获得分组信息
group<-predict(res)
sorted_group<-names(sort(group))
George_FPKM1<-George_FPKM1[,sorted_group]#表达丰度数据的colnames也按聚类结果进行排序
#设置Markers
Two_annotation_genes_FPKM_NEUROD1<-as.vector(as.matrix(George_FPKM1["NEUROD1",]))
Two_annotation_genes_FPKM_NEUROD2<-as.vector(as.matrix(George_FPKM1["NEUROD2",]))
Two_annotation_genes_FPKM_NEUROD4<-as.vector(as.matrix(George_FPKM1["NEUROD4",]))
Two_annotation_genes_FPKM_NOTCH1<-as.vector(as.matrix(George_FPKM1["NOTCH1",]))
Two_annotation_genes_FPKM_NOTCH2<-as.vector(as.matrix(George_FPKM1["NOTCH2",]))
Two_annotation_genes_FPKM_NOTCH3<-as.vector(as.matrix(George_FPKM1["NOTCH3",]))
Two_annotation_genes_FPKM_NOTCH4<-as.vector(as.matrix(George_FPKM1["NOTCH4",]))
Two_annotation_genes_FPKM_POU2F3<-as.vector(as.matrix(George_FPKM1["POU2F3",]))
Two_annotation_genes_FPKM_ASCL1<-as.vector(as.matrix(George_FPKM1["ASCL1",]))
Two_annotation_genes_FPKM_YAP1<-as.vector(as.matrix(George_FPKM1["YAP1",]))
Two_annotation_genes_FPKM_MYC<-as.vector(as.matrix(George_FPKM1["MYC",]))
Two_annotation_genes_FPKM_CALCA<-as.vector(as.matrix(George_FPKM1["CALCA",]))
Two_annotation_genes_FPKM_UCHL1<-as.vector(as.matrix(George_FPKM1["UCHL1",]))
Two_annotation_genes_FPKM_INSM1<-as.vector(as.matrix(George_FPKM1["INSM1",]))
Two_annotation_genes_FPKM_SCG2<-as.vector(as.matrix(George_FPKM1["SCG2",]))
Two_annotation_genes_FPKM_REST<-as.vector(as.matrix(George_FPKM1["REST",]))
Two_annotation_genes_FPKM_CDH5<-as.vector(as.matrix(George_FPKM1["CDH5",]))
Two_annotation_genes_FPKM_EZH2<-as.vector(as.matrix(George_FPKM1["EZH2",]))
Two_annotation_genes_FPKM_CD44<-as.vector(as.matrix(George_FPKM1["CD44",]))
Two_annotation_genes_FPKM_EPCAM<-as.vector(as.matrix(George_FPKM1["EPCAM",]))
Two_annotation_genes_FPKM_HES1<-as.vector(as.matrix(George_FPKM1["HES1",]))
Two_annotation_genes_FPKM_DDC<-as.vector(as.matrix(George_FPKM1["DDC",]))
Two_annotation_genes_FPKM_CLDN4<-as.vector(as.matrix(George_FPKM1["CLDN4",]))
Two_annotation_genes_FPKM_GRP<-as.vector(as.matrix(George_FPKM1["GRP",]))
Two_annotation_genes_FPKM_CHGA<-as.vector(as.matrix(George_FPKM1["CHGA",]))
Two_annotation_genes_FPKM_CALCB<-as.vector(as.matrix(George_FPKM1["CALCB",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CHGB<-as.vector(as.matrix(George_FPKM1["CHGB",]))
Two_annotation_genes_FPKM_SYP<-as.vector(as.matrix(George_FPKM1["SYP",]))
Two_annotation_genes_FPKM_TTF1<-as.vector(as.matrix(George_FPKM1["TTF1",]))
Two_annotation_genes_FPKM_ATOH1<-as.vector(as.matrix(George_FPKM1["ATOH1",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CD24<-as.vector(as.matrix(George_FPKM1["CD24",]))
Two_annotation_genes_FPKM_CADM1<-as.vector(as.matrix(George_FPKM1["CADM1",]))
Two_annotation_genes_FPKM_ALCAM<-as.vector(as.matrix(George_FPKM1["ALCAM",]))
Two_annotation_genes_FPKM_CD151<-as.vector(as.matrix(George_FPKM1["CD151",]))
Two_annotation_genes_FPKM_EPHA2<-as.vector(as.matrix(George_FPKM1["EPHA2",]))
Two_annotation_genes_FPKM_NIPAL4<-as.vector(as.matrix(George_FPKM1["NIPAL4",]))
#设置marker在热图中的注释
ha = HeatmapAnnotation(NEUROD1 = anno_barplot(Two_annotation_genes_FPKM_NEUROD1),
NEUROD2 = anno_barplot(Two_annotation_genes_FPKM_NEUROD2),
NEUROD4 = anno_barplot(Two_annotation_genes_FPKM_NEUROD4),
NOTCH1 = anno_barplot(Two_annotation_genes_FPKM_NOTCH1),
NOTCH2 = anno_barplot(Two_annotation_genes_FPKM_NOTCH2),
NOTCH3 = anno_barplot(Two_annotation_genes_FPKM_NOTCH3),
HES1= anno_barplot(Two_annotation_genes_FPKM_HES1),
YAP1= anno_barplot(Two_annotation_genes_FPKM_YAP1),
REST= anno_barplot(Two_annotation_genes_FPKM_REST),
#NOTCH4 = anno_barplot(Two_annotation_genes_FPKM_NOTCH4),
CD44= anno_barplot(Two_annotation_genes_FPKM_CD44),
POU2F3 = anno_barplot(Two_annotation_genes_FPKM_POU2F3),
MYC= anno_barplot(Two_annotation_genes_FPKM_MYC),
INSM1= anno_barplot(Two_annotation_genes_FPKM_INSM1),
SCG2= anno_barplot(Two_annotation_genes_FPKM_SCG2),
GRP= anno_barplot(Two_annotation_genes_FPKM_GRP),
ASCL1= anno_barplot(Two_annotation_genes_FPKM_ASCL1),
DDC= anno_barplot(Two_annotation_genes_FPKM_DDC),
CALCA= anno_barplot(Two_annotation_genes_FPKM_CALCA),
CALCB= anno_barplot(Two_annotation_genes_FPKM_CALCB),
CHGA= anno_barplot(Two_annotation_genes_FPKM_CHGA),
UCHL1= anno_barplot(Two_annotation_genes_FPKM_UCHL1),
CLDN4= anno_barplot(Two_annotation_genes_FPKM_CLDN4),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
EZH2= anno_barplot(Two_annotation_genes_FPKM_EZH2),
EPCAM= anno_barplot(Two_annotation_genes_FPKM_EPCAM),
CHGB= anno_barplot(Two_annotation_genes_FPKM_CHGB),
SYP= anno_barplot(Two_annotation_genes_FPKM_SYP),
TTF1= anno_barplot(Two_annotation_genes_FPKM_TTF1),
ATOH1= anno_barplot(Two_annotation_genes_FPKM_ATOH1),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
CD24= anno_barplot(Two_annotation_genes_FPKM_CD24),
CADM1= anno_barplot(Two_annotation_genes_FPKM_CADM1),
ALCAM= anno_barplot(Two_annotation_genes_FPKM_ALCAM),
CD151= anno_barplot(Two_annotation_genes_FPKM_CD151),
EPHA2= anno_barplot(Two_annotation_genes_FPKM_EPHA2),
NIPAL4= anno_barplot(Two_annotation_genes_FPKM_NIPAL4),
annotation_height = c(rep(1,35)))
#对聚类结果文件进行排序
sorted_consensus<-res@consensus[sorted_group,sorted_group]
consensusMatrix_heatmap<-Heatmap(sorted_consensus,
cluster_rows = F,
cluster_columns = F,
bottom_annotation=ha,
show_column_names=F)#绘制热图
png.names<-paste("2-k=",ik,"png",sep=".")#设置生成图片名称
png(filename = png.names,width=1200,height = 1900)#生成图片
print(consensusMatrix_heatmap)
dev.off()
}
#并进行一致性聚类
library(ConsensusClusterPlus)
distances_value<-c("pearson", "spearman", "euclidean", "binary", "maximum", "canberra", "minkowski")
clusterAlg_value<-c("hc","pam","km")
George_FPKM_filtered<-as.matrix(George_FPKM_filtered)
for(i in distances_value){
for(j in clusterAlg_value){
file_name<-paste("2-George_FPKM_filtered",i,j,sep="_")#文件夹名称的设置
assign(file_name,ConsensusClusterPlus(George_FPKM_filtered,#生成相关文件夹
maxK=10,#设置最大聚类的类数
reps=20000,#重复次数
distance=i,#距离计算方法
clusterAlg=j,
title=file_name,
plot= 'png',
writeTable=T))
}
}
#读取文件夹名称
file_names = list.files(pattern = "2-George_FPKM_filtered")
#遍历文件夹,生成图片
for(i in file_names){#遍历文件夹
for(j in 2:10){#遍历文件夹下需要的文件
file_name_pre<-paste(i,".k=",j,sep="")#设置生成图片的前缀
K_names<-paste("k=",j,sep="")#获得K值
file_names1<-paste(i,K_names,"consensusClass.csv",sep=".")#设置consensusClass.csv名称
file_names2<-paste(i,K_names,"consensusMatrix.csv",sep=".")#设置consensusMatrix.csv名称
file_class<-paste(i,file_names1,sep="/")#设置路径+consensusClass.csv名称
class<-read.table(file_class,sep = ",")#读取consensusClass.csv文件
file_consensusMatrix<-paste(i,file_names2,sep="/")#设置路径+consensusMatrix.csv名称
consensusMatrix<-read.table(file_consensusMatrix,header = T,sep=",")#读取consensusMatrix.csv文件
consensusMatrix<-consensusMatrix[,-1]
colnames(consensusMatrix)<-class[,1]
rownames(consensusMatrix)<-class[,1]
class1<-class[order(class$V2),]#按聚类结果进行排序
George_FPKM1<-George_FPKM1[,class1[,1]]#表达丰度数据的colnames也按聚类结果进行排序
Two_annotation_genes_FPKM_NEUROD1<-as.vector(as.matrix(George_FPKM1["NEUROD1",]))
Two_annotation_genes_FPKM_NEUROD2<-as.vector(as.matrix(George_FPKM1["NEUROD2",]))
Two_annotation_genes_FPKM_NEUROD4<-as.vector(as.matrix(George_FPKM1["NEUROD4",]))
Two_annotation_genes_FPKM_NOTCH1<-as.vector(as.matrix(George_FPKM1["NOTCH1",]))
Two_annotation_genes_FPKM_NOTCH2<-as.vector(as.matrix(George_FPKM1["NOTCH2",]))
Two_annotation_genes_FPKM_NOTCH3<-as.vector(as.matrix(George_FPKM1["NOTCH3",]))
Two_annotation_genes_FPKM_NOTCH4<-as.vector(as.matrix(George_FPKM1["NOTCH4",]))
Two_annotation_genes_FPKM_POU2F3<-as.vector(as.matrix(George_FPKM1["POU2F3",]))
Two_annotation_genes_FPKM_ASCL1<-as.vector(as.matrix(George_FPKM1["ASCL1",]))
Two_annotation_genes_FPKM_YAP1<-as.vector(as.matrix(George_FPKM1["YAP1",]))
Two_annotation_genes_FPKM_MYC<-as.vector(as.matrix(George_FPKM1["MYC",]))
Two_annotation_genes_FPKM_CALCA<-as.vector(as.matrix(George_FPKM1["CALCA",]))
Two_annotation_genes_FPKM_UCHL1<-as.vector(as.matrix(George_FPKM1["UCHL1",]))
Two_annotation_genes_FPKM_INSM1<-as.vector(as.matrix(George_FPKM1["INSM1",]))
Two_annotation_genes_FPKM_SCG2<-as.vector(as.matrix(George_FPKM1["SCG2",]))
Two_annotation_genes_FPKM_REST<-as.vector(as.matrix(George_FPKM1["REST",]))
Two_annotation_genes_FPKM_CDH5<-as.vector(as.matrix(George_FPKM1["CDH5",]))
Two_annotation_genes_FPKM_EZH2<-as.vector(as.matrix(George_FPKM1["EZH2",]))
Two_annotation_genes_FPKM_CD44<-as.vector(as.matrix(George_FPKM1["CD44",]))
Two_annotation_genes_FPKM_EPCAM<-as.vector(as.matrix(George_FPKM1["EPCAM",]))
Two_annotation_genes_FPKM_HES1<-as.vector(as.matrix(George_FPKM1["HES1",]))
Two_annotation_genes_FPKM_DDC<-as.vector(as.matrix(George_FPKM1["DDC",]))
Two_annotation_genes_FPKM_CLDN4<-as.vector(as.matrix(George_FPKM1["CLDN4",]))
Two_annotation_genes_FPKM_GRP<-as.vector(as.matrix(George_FPKM1["GRP",]))
Two_annotation_genes_FPKM_CHGA<-as.vector(as.matrix(George_FPKM1["CHGA",]))
Two_annotation_genes_FPKM_CALCB<-as.vector(as.matrix(George_FPKM1["CALCB",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CHGB<-as.vector(as.matrix(George_FPKM1["CHGB",]))
Two_annotation_genes_FPKM_SYP<-as.vector(as.matrix(George_FPKM1["SYP",]))
Two_annotation_genes_FPKM_TTF1<-as.vector(as.matrix(George_FPKM1["TTF1",]))
Two_annotation_genes_FPKM_ATOH1<-as.vector(as.matrix(George_FPKM1["ATOH1",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CD24<-as.vector(as.matrix(George_FPKM1["CD24",]))
Two_annotation_genes_FPKM_CADM1<-as.vector(as.matrix(George_FPKM1["CADM1",]))
Two_annotation_genes_FPKM_ALCAM<-as.vector(as.matrix(George_FPKM1["ALCAM",]))
Two_annotation_genes_FPKM_CD151<-as.vector(as.matrix(George_FPKM1["CD151",]))
Two_annotation_genes_FPKM_EPHA2<-as.vector(as.matrix(George_FPKM1["EPHA2",]))
Two_annotation_genes_FPKM_NIPAL4<-as.vector(as.matrix(George_FPKM1["NIPAL4",]))
#设置marker在热图中的注释
ha = HeatmapAnnotation(NEUROD1 = anno_barplot(Two_annotation_genes_FPKM_NEUROD1),
NEUROD2 = anno_barplot(Two_annotation_genes_FPKM_NEUROD2),
NEUROD4 = anno_barplot(Two_annotation_genes_FPKM_NEUROD4),
NOTCH1 = anno_barplot(Two_annotation_genes_FPKM_NOTCH1),
NOTCH2 = anno_barplot(Two_annotation_genes_FPKM_NOTCH2),
NOTCH3 = anno_barplot(Two_annotation_genes_FPKM_NOTCH3),
HES1= anno_barplot(Two_annotation_genes_FPKM_HES1),
YAP1= anno_barplot(Two_annotation_genes_FPKM_YAP1),
REST= anno_barplot(Two_annotation_genes_FPKM_REST),
#NOTCH4 = anno_barplot(Two_annotation_genes_FPKM_NOTCH4),
CD44= anno_barplot(Two_annotation_genes_FPKM_CD44),
POU2F3 = anno_barplot(Two_annotation_genes_FPKM_POU2F3),
MYC= anno_barplot(Two_annotation_genes_FPKM_MYC),
INSM1= anno_barplot(Two_annotation_genes_FPKM_INSM1),
SCG2= anno_barplot(Two_annotation_genes_FPKM_SCG2),
GRP= anno_barplot(Two_annotation_genes_FPKM_GRP),
ASCL1= anno_barplot(Two_annotation_genes_FPKM_ASCL1),
DDC= anno_barplot(Two_annotation_genes_FPKM_DDC),
CALCA= anno_barplot(Two_annotation_genes_FPKM_CALCA),
CALCB= anno_barplot(Two_annotation_genes_FPKM_CALCB),
CHGA= anno_barplot(Two_annotation_genes_FPKM_CHGA),
UCHL1= anno_barplot(Two_annotation_genes_FPKM_UCHL1),
CLDN4= anno_barplot(Two_annotation_genes_FPKM_CLDN4),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
EZH2= anno_barplot(Two_annotation_genes_FPKM_EZH2),
EPCAM= anno_barplot(Two_annotation_genes_FPKM_EPCAM),
CHGB= anno_barplot(Two_annotation_genes_FPKM_CHGB),
SYP= anno_barplot(Two_annotation_genes_FPKM_SYP),
TTF1= anno_barplot(Two_annotation_genes_FPKM_TTF1),
ATOH1= anno_barplot(Two_annotation_genes_FPKM_ATOH1),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
CD24= anno_barplot(Two_annotation_genes_FPKM_CD24),
CADM1= anno_barplot(Two_annotation_genes_FPKM_CADM1),
ALCAM= anno_barplot(Two_annotation_genes_FPKM_ALCAM),
CD151= anno_barplot(Two_annotation_genes_FPKM_CD151),
EPHA2= anno_barplot(Two_annotation_genes_FPKM_EPHA2),
NIPAL4= anno_barplot(Two_annotation_genes_FPKM_NIPAL4),
annotation_height = c(rep(1,35)))
#对聚类结果文件进行排序
for(m in rownames(consensusMatrix)){
consensusMatrix[m,]<-as.numeric(consensusMatrix[m,])
}
consensusMatrix<-consensusMatrix[class1$V1,class1$V1]
png.names<-paste(file_name_pre,"png",sep=".")#设置生成图片名称
D_F<-paste(i,png.names,sep="/")#设置路径+名称
consensusMatrix_heatmap<-Heatmap(consensusMatrix,cluster_rows = F,cluster_columns = F,bottom_annotation=ha,show_column_names=F)#绘制热图
png(filename = D_F,width=1200,height = 1900)#生成图片
print(consensusMatrix_heatmap)
dev.off()
}}
3-
rm(list=ls())
library("preprocessCore")
library(BimodalIndex)
library(NMF)
library(ComplexHeatmap)
library("stringr")
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2021-Cancer-Cell-Patterns")
#读取数据
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/George_2015_sum_FPKM.txt")
#使用preprocessCore进行分位数标准化
George_FPKM<-as.matrix(George_FPKM)
#①计算符合双峰指数≥1.5,平均表达值≥25th,sd value≥50th条件下的基因。
George_FPKM1<-George_FPKM
#计算平均值,并获得means >=25th的基因名
George_FPKM_mean<-rowMeans(George_FPKM)
George_FPKM_Means_25th_names<-names(George_FPKM_mean[George_FPKM_mean>=quantile(George_FPKM_mean,0.25)])
#计算标准差,并获得sd>=50th的基因名
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_50th_names<-rownames(George_FPKM_sd[George_FPKM_sd$sd_value>=quantile(George_FPKM_sd$sd_value,0.5),])
#获得前sd>=50th的基因名和means>=25th的基因名的交集
intersect_means_sd_names<-intersect(George_FPKM_sd_50th_names,George_FPKM_Means_25th_names)
#计算交集基因的双峰指数,并获得双峰指数≥1.5的基因
George_FPKM_bi <- bimodalIndex(George_FPKM[intersect_means_sd_names,], verbose=FALSE)
dim(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
#774
George_FPKM_bi_1.5<-rownames(George_FPKM_bi[George_FPKM_bi$BI>1.5,])
#获得符合条件的基因矩阵
George_FPKM_filtered<-George_FPKM[George_FPKM_bi_1.5,]
estim.r <- nmf(George_FPKM_filtered, 2:7, nrun=1000, seed=123456)
png("2-estim.r.plot.png",width=500,height = 500)
plot(estim.r)
dev.off()
#
for(ik in c(2:7)){
use.this.k <- ik
res <- nmf(George_FPKM_filtered, use.this.k, nrun=1000 ,.options='vt')
consensusmap_name<-paste("3-k=",ik," consensusmap.png",sep="")
png(consensusmap_name,width=1000,height = 1000)
consensusmap(res)
dev.off()
#获得分组信息
group<-predict(res)
sorted_group<-names(sort(group))
George_FPKM1<-George_FPKM1[,sorted_group]#表达丰度数据的colnames也按聚类结果进行排序
#设置Markers
Two_annotation_genes_FPKM_NEUROD1<-as.vector(as.matrix(George_FPKM1["NEUROD1",]))
Two_annotation_genes_FPKM_NEUROD2<-as.vector(as.matrix(George_FPKM1["NEUROD2",]))
Two_annotation_genes_FPKM_NEUROD4<-as.vector(as.matrix(George_FPKM1["NEUROD4",]))
Two_annotation_genes_FPKM_NOTCH1<-as.vector(as.matrix(George_FPKM1["NOTCH1",]))
Two_annotation_genes_FPKM_NOTCH2<-as.vector(as.matrix(George_FPKM1["NOTCH2",]))
Two_annotation_genes_FPKM_NOTCH3<-as.vector(as.matrix(George_FPKM1["NOTCH3",]))
Two_annotation_genes_FPKM_NOTCH4<-as.vector(as.matrix(George_FPKM1["NOTCH4",]))
Two_annotation_genes_FPKM_POU2F3<-as.vector(as.matrix(George_FPKM1["POU2F3",]))
Two_annotation_genes_FPKM_ASCL1<-as.vector(as.matrix(George_FPKM1["ASCL1",]))
Two_annotation_genes_FPKM_YAP1<-as.vector(as.matrix(George_FPKM1["YAP1",]))
Two_annotation_genes_FPKM_MYC<-as.vector(as.matrix(George_FPKM1["MYC",]))
Two_annotation_genes_FPKM_CALCA<-as.vector(as.matrix(George_FPKM1["CALCA",]))
Two_annotation_genes_FPKM_UCHL1<-as.vector(as.matrix(George_FPKM1["UCHL1",]))
Two_annotation_genes_FPKM_INSM1<-as.vector(as.matrix(George_FPKM1["INSM1",]))
Two_annotation_genes_FPKM_SCG2<-as.vector(as.matrix(George_FPKM1["SCG2",]))
Two_annotation_genes_FPKM_REST<-as.vector(as.matrix(George_FPKM1["REST",]))
Two_annotation_genes_FPKM_CDH5<-as.vector(as.matrix(George_FPKM1["CDH5",]))
Two_annotation_genes_FPKM_EZH2<-as.vector(as.matrix(George_FPKM1["EZH2",]))
Two_annotation_genes_FPKM_CD44<-as.vector(as.matrix(George_FPKM1["CD44",]))
Two_annotation_genes_FPKM_EPCAM<-as.vector(as.matrix(George_FPKM1["EPCAM",]))
Two_annotation_genes_FPKM_HES1<-as.vector(as.matrix(George_FPKM1["HES1",]))
Two_annotation_genes_FPKM_DDC<-as.vector(as.matrix(George_FPKM1["DDC",]))
Two_annotation_genes_FPKM_CLDN4<-as.vector(as.matrix(George_FPKM1["CLDN4",]))
Two_annotation_genes_FPKM_GRP<-as.vector(as.matrix(George_FPKM1["GRP",]))
Two_annotation_genes_FPKM_CHGA<-as.vector(as.matrix(George_FPKM1["CHGA",]))
Two_annotation_genes_FPKM_CALCB<-as.vector(as.matrix(George_FPKM1["CALCB",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CHGB<-as.vector(as.matrix(George_FPKM1["CHGB",]))
Two_annotation_genes_FPKM_SYP<-as.vector(as.matrix(George_FPKM1["SYP",]))
Two_annotation_genes_FPKM_TTF1<-as.vector(as.matrix(George_FPKM1["TTF1",]))
Two_annotation_genes_FPKM_ATOH1<-as.vector(as.matrix(George_FPKM1["ATOH1",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CD24<-as.vector(as.matrix(George_FPKM1["CD24",]))
Two_annotation_genes_FPKM_CADM1<-as.vector(as.matrix(George_FPKM1["CADM1",]))
Two_annotation_genes_FPKM_ALCAM<-as.vector(as.matrix(George_FPKM1["ALCAM",]))
Two_annotation_genes_FPKM_CD151<-as.vector(as.matrix(George_FPKM1["CD151",]))
Two_annotation_genes_FPKM_EPHA2<-as.vector(as.matrix(George_FPKM1["EPHA2",]))
Two_annotation_genes_FPKM_NIPAL4<-as.vector(as.matrix(George_FPKM1["NIPAL4",]))
#设置marker在热图中的注释
ha = HeatmapAnnotation(NEUROD1 = anno_barplot(Two_annotation_genes_FPKM_NEUROD1),
NEUROD2 = anno_barplot(Two_annotation_genes_FPKM_NEUROD2),
NEUROD4 = anno_barplot(Two_annotation_genes_FPKM_NEUROD4),
NOTCH1 = anno_barplot(Two_annotation_genes_FPKM_NOTCH1),
NOTCH2 = anno_barplot(Two_annotation_genes_FPKM_NOTCH2),
NOTCH3 = anno_barplot(Two_annotation_genes_FPKM_NOTCH3),
HES1= anno_barplot(Two_annotation_genes_FPKM_HES1),
YAP1= anno_barplot(Two_annotation_genes_FPKM_YAP1),
REST= anno_barplot(Two_annotation_genes_FPKM_REST),
#NOTCH4 = anno_barplot(Two_annotation_genes_FPKM_NOTCH4),
CD44= anno_barplot(Two_annotation_genes_FPKM_CD44),
POU2F3 = anno_barplot(Two_annotation_genes_FPKM_POU2F3),
MYC= anno_barplot(Two_annotation_genes_FPKM_MYC),
INSM1= anno_barplot(Two_annotation_genes_FPKM_INSM1),
SCG2= anno_barplot(Two_annotation_genes_FPKM_SCG2),
GRP= anno_barplot(Two_annotation_genes_FPKM_GRP),
ASCL1= anno_barplot(Two_annotation_genes_FPKM_ASCL1),
DDC= anno_barplot(Two_annotation_genes_FPKM_DDC),
CALCA= anno_barplot(Two_annotation_genes_FPKM_CALCA),
CALCB= anno_barplot(Two_annotation_genes_FPKM_CALCB),
CHGA= anno_barplot(Two_annotation_genes_FPKM_CHGA),
UCHL1= anno_barplot(Two_annotation_genes_FPKM_UCHL1),
CLDN4= anno_barplot(Two_annotation_genes_FPKM_CLDN4),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
EZH2= anno_barplot(Two_annotation_genes_FPKM_EZH2),
EPCAM= anno_barplot(Two_annotation_genes_FPKM_EPCAM),
CHGB= anno_barplot(Two_annotation_genes_FPKM_CHGB),
SYP= anno_barplot(Two_annotation_genes_FPKM_SYP),
TTF1= anno_barplot(Two_annotation_genes_FPKM_TTF1),
ATOH1= anno_barplot(Two_annotation_genes_FPKM_ATOH1),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
CD24= anno_barplot(Two_annotation_genes_FPKM_CD24),
CADM1= anno_barplot(Two_annotation_genes_FPKM_CADM1),
ALCAM= anno_barplot(Two_annotation_genes_FPKM_ALCAM),
CD151= anno_barplot(Two_annotation_genes_FPKM_CD151),
EPHA2= anno_barplot(Two_annotation_genes_FPKM_EPHA2),
NIPAL4= anno_barplot(Two_annotation_genes_FPKM_NIPAL4),
annotation_height = c(rep(1,35)))
#对聚类结果文件进行排序
sorted_consensus<-res@consensus[sorted_group,sorted_group]
consensusMatrix_heatmap<-Heatmap(sorted_consensus,
cluster_rows = F,
cluster_columns = F,
bottom_annotation=ha,
show_column_names=F)#绘制热图
png.names<-paste("3-k=",ik,"png",sep=".")#设置生成图片名称
png(filename = png.names,width=1200,height = 1900)#生成图片
print(consensusMatrix_heatmap)
dev.off()
}
#并进行一致性聚类
library(ConsensusClusterPlus)
distances_value<-c("pearson", "spearman", "euclidean", "binary", "maximum", "canberra", "minkowski")
clusterAlg_value<-c("hc","pam","km")
George_FPKM_filtered<-as.matrix(George_FPKM_filtered)
for(i in distances_value){
for(j in clusterAlg_value){
file_name<-paste("3-George_FPKM_filtered",i,j,sep="_")#文件夹名称的设置
assign(file_name,ConsensusClusterPlus(George_FPKM_filtered,#生成相关文件夹
maxK=10,#设置最大聚类的类数
reps=20000,#重复次数
distance=i,#距离计算方法
clusterAlg=j,
title=file_name,
plot= 'png',
writeTable=T))
}
}
#读取文件夹名称
file_names = list.files(pattern = "3-George_FPKM_filtered")
#遍历文件夹,生成图片
for(i in file_names){#遍历文件夹
for(j in 2:10){#遍历文件夹下需要的文件
file_name_pre<-paste(i,".k=",j,sep="")#设置生成图片的前缀
K_names<-paste("k=",j,sep="")#获得K值
file_names1<-paste(i,K_names,"consensusClass.csv",sep=".")#设置consensusClass.csv名称
file_names2<-paste(i,K_names,"consensusMatrix.csv",sep=".")#设置consensusMatrix.csv名称
file_class<-paste(i,file_names1,sep="/")#设置路径+consensusClass.csv名称
class<-read.table(file_class,sep = ",")#读取consensusClass.csv文件
file_consensusMatrix<-paste(i,file_names2,sep="/")#设置路径+consensusMatrix.csv名称
consensusMatrix<-read.table(file_consensusMatrix,header = T,sep=",")#读取consensusMatrix.csv文件
consensusMatrix<-consensusMatrix[,-1]
colnames(consensusMatrix)<-class[,1]
rownames(consensusMatrix)<-class[,1]
class1<-class[order(class$V2),]#按聚类结果进行排序
George_FPKM1<-George_FPKM1[,class1[,1]]#表达丰度数据的colnames也按聚类结果进行排序
#设置Markers
Two_annotation_genes_FPKM_NEUROD1<-as.vector(as.matrix(George_FPKM1["NEUROD1",]))
Two_annotation_genes_FPKM_NEUROD2<-as.vector(as.matrix(George_FPKM1["NEUROD2",]))
Two_annotation_genes_FPKM_NEUROD4<-as.vector(as.matrix(George_FPKM1["NEUROD4",]))
Two_annotation_genes_FPKM_NOTCH1<-as.vector(as.matrix(George_FPKM1["NOTCH1",]))
Two_annotation_genes_FPKM_NOTCH2<-as.vector(as.matrix(George_FPKM1["NOTCH2",]))
Two_annotation_genes_FPKM_NOTCH3<-as.vector(as.matrix(George_FPKM1["NOTCH3",]))
Two_annotation_genes_FPKM_NOTCH4<-as.vector(as.matrix(George_FPKM1["NOTCH4",]))
Two_annotation_genes_FPKM_POU2F3<-as.vector(as.matrix(George_FPKM1["POU2F3",]))
Two_annotation_genes_FPKM_ASCL1<-as.vector(as.matrix(George_FPKM1["ASCL1",]))
Two_annotation_genes_FPKM_YAP1<-as.vector(as.matrix(George_FPKM1["YAP1",]))
Two_annotation_genes_FPKM_MYC<-as.vector(as.matrix(George_FPKM1["MYC",]))
Two_annotation_genes_FPKM_CALCA<-as.vector(as.matrix(George_FPKM1["CALCA",]))
Two_annotation_genes_FPKM_UCHL1<-as.vector(as.matrix(George_FPKM1["UCHL1",]))
Two_annotation_genes_FPKM_INSM1<-as.vector(as.matrix(George_FPKM1["INSM1",]))
Two_annotation_genes_FPKM_SCG2<-as.vector(as.matrix(George_FPKM1["SCG2",]))
Two_annotation_genes_FPKM_REST<-as.vector(as.matrix(George_FPKM1["REST",]))
Two_annotation_genes_FPKM_CDH5<-as.vector(as.matrix(George_FPKM1["CDH5",]))
Two_annotation_genes_FPKM_EZH2<-as.vector(as.matrix(George_FPKM1["EZH2",]))
Two_annotation_genes_FPKM_CD44<-as.vector(as.matrix(George_FPKM1["CD44",]))
Two_annotation_genes_FPKM_EPCAM<-as.vector(as.matrix(George_FPKM1["EPCAM",]))
Two_annotation_genes_FPKM_HES1<-as.vector(as.matrix(George_FPKM1["HES1",]))
Two_annotation_genes_FPKM_DDC<-as.vector(as.matrix(George_FPKM1["DDC",]))
Two_annotation_genes_FPKM_CLDN4<-as.vector(as.matrix(George_FPKM1["CLDN4",]))
Two_annotation_genes_FPKM_GRP<-as.vector(as.matrix(George_FPKM1["GRP",]))
Two_annotation_genes_FPKM_CHGA<-as.vector(as.matrix(George_FPKM1["CHGA",]))
Two_annotation_genes_FPKM_CALCB<-as.vector(as.matrix(George_FPKM1["CALCB",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CHGB<-as.vector(as.matrix(George_FPKM1["CHGB",]))
Two_annotation_genes_FPKM_SYP<-as.vector(as.matrix(George_FPKM1["SYP",]))
Two_annotation_genes_FPKM_TTF1<-as.vector(as.matrix(George_FPKM1["TTF1",]))
Two_annotation_genes_FPKM_ATOH1<-as.vector(as.matrix(George_FPKM1["ATOH1",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CD24<-as.vector(as.matrix(George_FPKM1["CD24",]))
Two_annotation_genes_FPKM_CADM1<-as.vector(as.matrix(George_FPKM1["CADM1",]))
Two_annotation_genes_FPKM_ALCAM<-as.vector(as.matrix(George_FPKM1["ALCAM",]))
Two_annotation_genes_FPKM_CD151<-as.vector(as.matrix(George_FPKM1["CD151",]))
Two_annotation_genes_FPKM_EPHA2<-as.vector(as.matrix(George_FPKM1["EPHA2",]))
Two_annotation_genes_FPKM_NIPAL4<-as.vector(as.matrix(George_FPKM1["NIPAL4",]))
#设置marker在热图中的注释
ha = HeatmapAnnotation(NEUROD1 = anno_barplot(Two_annotation_genes_FPKM_NEUROD1),
NEUROD2 = anno_barplot(Two_annotation_genes_FPKM_NEUROD2),
NEUROD4 = anno_barplot(Two_annotation_genes_FPKM_NEUROD4),
NOTCH1 = anno_barplot(Two_annotation_genes_FPKM_NOTCH1),
NOTCH2 = anno_barplot(Two_annotation_genes_FPKM_NOTCH2),
NOTCH3 = anno_barplot(Two_annotation_genes_FPKM_NOTCH3),
HES1= anno_barplot(Two_annotation_genes_FPKM_HES1),
YAP1= anno_barplot(Two_annotation_genes_FPKM_YAP1),
REST= anno_barplot(Two_annotation_genes_FPKM_REST),
#NOTCH4 = anno_barplot(Two_annotation_genes_FPKM_NOTCH4),
CD44= anno_barplot(Two_annotation_genes_FPKM_CD44),
POU2F3 = anno_barplot(Two_annotation_genes_FPKM_POU2F3),
MYC= anno_barplot(Two_annotation_genes_FPKM_MYC),
INSM1= anno_barplot(Two_annotation_genes_FPKM_INSM1),
SCG2= anno_barplot(Two_annotation_genes_FPKM_SCG2),
GRP= anno_barplot(Two_annotation_genes_FPKM_GRP),
ASCL1= anno_barplot(Two_annotation_genes_FPKM_ASCL1),
DDC= anno_barplot(Two_annotation_genes_FPKM_DDC),
CALCA= anno_barplot(Two_annotation_genes_FPKM_CALCA),
CALCB= anno_barplot(Two_annotation_genes_FPKM_CALCB),
CHGA= anno_barplot(Two_annotation_genes_FPKM_CHGA),
UCHL1= anno_barplot(Two_annotation_genes_FPKM_UCHL1),
CLDN4= anno_barplot(Two_annotation_genes_FPKM_CLDN4),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
EZH2= anno_barplot(Two_annotation_genes_FPKM_EZH2),
EPCAM= anno_barplot(Two_annotation_genes_FPKM_EPCAM),
CHGB= anno_barplot(Two_annotation_genes_FPKM_CHGB),
SYP= anno_barplot(Two_annotation_genes_FPKM_SYP),
TTF1= anno_barplot(Two_annotation_genes_FPKM_TTF1),
ATOH1= anno_barplot(Two_annotation_genes_FPKM_ATOH1),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
CD24= anno_barplot(Two_annotation_genes_FPKM_CD24),
CADM1= anno_barplot(Two_annotation_genes_FPKM_CADM1),
ALCAM= anno_barplot(Two_annotation_genes_FPKM_ALCAM),
CD151= anno_barplot(Two_annotation_genes_FPKM_CD151),
EPHA2= anno_barplot(Two_annotation_genes_FPKM_EPHA2),
NIPAL4= anno_barplot(Two_annotation_genes_FPKM_NIPAL4),
annotation_height = c(rep(1,35)))
#对聚类结果文件进行排序
for(m in rownames(consensusMatrix)){
consensusMatrix[m,]<-as.numeric(consensusMatrix[m,])
}
consensusMatrix<-consensusMatrix[class1$V1,class1$V1]
png.names<-paste(file_name_pre,"png",sep=".")#设置生成图片名称
D_F<-paste(i,png.names,sep="/")#设置路径+名称
consensusMatrix_heatmap<-Heatmap(consensusMatrix,cluster_rows = F,cluster_columns = F,bottom_annotation=ha,show_column_names=F)#绘制热图
png(filename = D_F,width=1200,height = 1900)#生成图片
print(consensusMatrix_heatmap)
dev.off()
}}
4-
library(BimodalIndex)
library(NMF)
library("preprocessCore")
setwd("D:/project/01.SCLC/03.数据分析/00.SCLC_expression_file/01.grouping/2021-Cancer-Cell-Patterns")
#读取数据
George_FPKM<-read.table("D:/project/01.SCLC/03.数据分析/George_2015_sum_FPKM.txt")
George_FPKM1<-George_FPKM
#①计算符合双峰指数≥1.5,平均表达值≥25th,sd value≥50th条件下的基因。
#计算平均值,并获得means≥25th的基因名
George_FPKM<-log2(George_FPKM+1)
George_FPKM_mean<-rowMeans(George_FPKM)
George_FPKM_Means_25th_names<-names(George_FPKM_mean[George_FPKM_mean>=quantile(George_FPKM_mean,0.25)])
#计算标准差,并获得前sd 50th的基因名
George_FPKM_sd<-data.frame()
for(i in rownames(George_FPKM)){
George_FPKM_sd[i,"sd_value"]<-sd(George_FPKM[i,])
George_FPKM_sd[i,"Gene_names"]<-i
}
George_FPKM_sd_50th_names<-rownames(George_FPKM_sd[George_FPKM_sd$sd_value>=quantile(George_FPKM_sd$sd_value,0.5),])
#获得前sd 25th的基因名和means前50th的基因名的交集
intersect_means_sd_names<-intersect(George_FPKM_sd_50th_names,George_FPKM_Means_25th_names)
#计算交集基因的双峰指数,并获得双峰指数≥1.5的基因
George_FPKM_bi <- bimodalIndex(George_FPKM[intersect_means_sd_names,], verbose=FALSE)
dim(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
George_FPKM_bi_1.5<-rownames(George_FPKM_bi[George_FPKM_bi$BI>=1.5,])
#获得符合条件的基因矩阵
George_FPKM_filtered<-George_FPKM[George_FPKM_bi_1.5,]
#查看基因数量
dim(George_FPKM_filtered)
#使用preprocessCore进行分位数标准化
George_FPKM_filtered<-as.matrix(George_FPKM_filtered)
George_FPKM_filtered<-normalize.quantiles(George_FPKM_filtered,keep.names=T)
estim.r <- nmf(George_FPKM_filtered, 2:7, nrun=1000, seed=123456)
png("4-estim.r.plot.png",width=500,height = 500)
plot(estim.r)
dev.off()
#
for(ik in c(2:7)){
use.this.k <- ik
res <- nmf(George_FPKM_filtered, use.this.k, nrun=1000 ,.options='vt')
consensusmap_name<-paste("4-k=",ik," consensusmap.png",sep="")
png(consensusmap_name,width=1000,height = 1000)
consensusmap(res)
dev.off()
#获得分组信息
group<-predict(res)
sorted_group<-names(sort(group))
George_FPKM1<-George_FPKM1[,sorted_group]#表达丰度数据的colnames也按聚类结果进行排序
#设置Markers
Two_annotation_genes_FPKM_NEUROD1<-as.vector(as.matrix(George_FPKM1["NEUROD1",]))
Two_annotation_genes_FPKM_NEUROD2<-as.vector(as.matrix(George_FPKM1["NEUROD2",]))
Two_annotation_genes_FPKM_NEUROD4<-as.vector(as.matrix(George_FPKM1["NEUROD4",]))
Two_annotation_genes_FPKM_NOTCH1<-as.vector(as.matrix(George_FPKM1["NOTCH1",]))
Two_annotation_genes_FPKM_NOTCH2<-as.vector(as.matrix(George_FPKM1["NOTCH2",]))
Two_annotation_genes_FPKM_NOTCH3<-as.vector(as.matrix(George_FPKM1["NOTCH3",]))
Two_annotation_genes_FPKM_NOTCH4<-as.vector(as.matrix(George_FPKM1["NOTCH4",]))
Two_annotation_genes_FPKM_POU2F3<-as.vector(as.matrix(George_FPKM1["POU2F3",]))
Two_annotation_genes_FPKM_ASCL1<-as.vector(as.matrix(George_FPKM1["ASCL1",]))
Two_annotation_genes_FPKM_YAP1<-as.vector(as.matrix(George_FPKM1["YAP1",]))
Two_annotation_genes_FPKM_MYC<-as.vector(as.matrix(George_FPKM1["MYC",]))
Two_annotation_genes_FPKM_CALCA<-as.vector(as.matrix(George_FPKM1["CALCA",]))
Two_annotation_genes_FPKM_UCHL1<-as.vector(as.matrix(George_FPKM1["UCHL1",]))
Two_annotation_genes_FPKM_INSM1<-as.vector(as.matrix(George_FPKM1["INSM1",]))
Two_annotation_genes_FPKM_SCG2<-as.vector(as.matrix(George_FPKM1["SCG2",]))
Two_annotation_genes_FPKM_REST<-as.vector(as.matrix(George_FPKM1["REST",]))
Two_annotation_genes_FPKM_CDH5<-as.vector(as.matrix(George_FPKM1["CDH5",]))
Two_annotation_genes_FPKM_EZH2<-as.vector(as.matrix(George_FPKM1["EZH2",]))
Two_annotation_genes_FPKM_CD44<-as.vector(as.matrix(George_FPKM1["CD44",]))
Two_annotation_genes_FPKM_EPCAM<-as.vector(as.matrix(George_FPKM1["EPCAM",]))
Two_annotation_genes_FPKM_HES1<-as.vector(as.matrix(George_FPKM1["HES1",]))
Two_annotation_genes_FPKM_DDC<-as.vector(as.matrix(George_FPKM1["DDC",]))
Two_annotation_genes_FPKM_CLDN4<-as.vector(as.matrix(George_FPKM1["CLDN4",]))
Two_annotation_genes_FPKM_GRP<-as.vector(as.matrix(George_FPKM1["GRP",]))
Two_annotation_genes_FPKM_CHGA<-as.vector(as.matrix(George_FPKM1["CHGA",]))
Two_annotation_genes_FPKM_CALCB<-as.vector(as.matrix(George_FPKM1["CALCB",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CHGB<-as.vector(as.matrix(George_FPKM1["CHGB",]))
Two_annotation_genes_FPKM_SYP<-as.vector(as.matrix(George_FPKM1["SYP",]))
Two_annotation_genes_FPKM_TTF1<-as.vector(as.matrix(George_FPKM1["TTF1",]))
Two_annotation_genes_FPKM_ATOH1<-as.vector(as.matrix(George_FPKM1["ATOH1",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CD24<-as.vector(as.matrix(George_FPKM1["CD24",]))
Two_annotation_genes_FPKM_CADM1<-as.vector(as.matrix(George_FPKM1["CADM1",]))
Two_annotation_genes_FPKM_ALCAM<-as.vector(as.matrix(George_FPKM1["ALCAM",]))
Two_annotation_genes_FPKM_CD151<-as.vector(as.matrix(George_FPKM1["CD151",]))
Two_annotation_genes_FPKM_EPHA2<-as.vector(as.matrix(George_FPKM1["EPHA2",]))
Two_annotation_genes_FPKM_NIPAL4<-as.vector(as.matrix(George_FPKM1["NIPAL4",]))
#设置marker在热图中的注释
ha = HeatmapAnnotation(NEUROD1 = anno_barplot(Two_annotation_genes_FPKM_NEUROD1),
NEUROD2 = anno_barplot(Two_annotation_genes_FPKM_NEUROD2),
NEUROD4 = anno_barplot(Two_annotation_genes_FPKM_NEUROD4),
NOTCH1 = anno_barplot(Two_annotation_genes_FPKM_NOTCH1),
NOTCH2 = anno_barplot(Two_annotation_genes_FPKM_NOTCH2),
NOTCH3 = anno_barplot(Two_annotation_genes_FPKM_NOTCH3),
HES1= anno_barplot(Two_annotation_genes_FPKM_HES1),
YAP1= anno_barplot(Two_annotation_genes_FPKM_YAP1),
REST= anno_barplot(Two_annotation_genes_FPKM_REST),
#NOTCH4 = anno_barplot(Two_annotation_genes_FPKM_NOTCH4),
CD44= anno_barplot(Two_annotation_genes_FPKM_CD44),
POU2F3 = anno_barplot(Two_annotation_genes_FPKM_POU2F3),
MYC= anno_barplot(Two_annotation_genes_FPKM_MYC),
INSM1= anno_barplot(Two_annotation_genes_FPKM_INSM1),
SCG2= anno_barplot(Two_annotation_genes_FPKM_SCG2),
GRP= anno_barplot(Two_annotation_genes_FPKM_GRP),
ASCL1= anno_barplot(Two_annotation_genes_FPKM_ASCL1),
DDC= anno_barplot(Two_annotation_genes_FPKM_DDC),
CALCA= anno_barplot(Two_annotation_genes_FPKM_CALCA),
CALCB= anno_barplot(Two_annotation_genes_FPKM_CALCB),
CHGA= anno_barplot(Two_annotation_genes_FPKM_CHGA),
UCHL1= anno_barplot(Two_annotation_genes_FPKM_UCHL1),
CLDN4= anno_barplot(Two_annotation_genes_FPKM_CLDN4),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
EZH2= anno_barplot(Two_annotation_genes_FPKM_EZH2),
EPCAM= anno_barplot(Two_annotation_genes_FPKM_EPCAM),
CHGB= anno_barplot(Two_annotation_genes_FPKM_CHGB),
SYP= anno_barplot(Two_annotation_genes_FPKM_SYP),
TTF1= anno_barplot(Two_annotation_genes_FPKM_TTF1),
ATOH1= anno_barplot(Two_annotation_genes_FPKM_ATOH1),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
CD24= anno_barplot(Two_annotation_genes_FPKM_CD24),
CADM1= anno_barplot(Two_annotation_genes_FPKM_CADM1),
ALCAM= anno_barplot(Two_annotation_genes_FPKM_ALCAM),
CD151= anno_barplot(Two_annotation_genes_FPKM_CD151),
EPHA2= anno_barplot(Two_annotation_genes_FPKM_EPHA2),
NIPAL4= anno_barplot(Two_annotation_genes_FPKM_NIPAL4),
annotation_height = c(rep(1,35)))
#对聚类结果文件进行排序
sorted_consensus<-res@consensus[sorted_group,sorted_group]
consensusMatrix_heatmap<-Heatmap(sorted_consensus,
cluster_rows = F,
cluster_columns = F,
bottom_annotation=ha,
show_column_names=F)#绘制热图
png.names<-paste("4-k=",ik,"png",sep=".")#设置生成图片名称
png(filename = png.names,width=1200,height = 1900)#生成图片
print(consensusMatrix_heatmap)
dev.off()
}
#并进行一致性聚类
library(ConsensusClusterPlus)
distances_value<-c("pearson", "spearman", "euclidean", "binary", "maximum", "canberra", "minkowski")
clusterAlg_value<-c("hc","pam","km")
George_FPKM_filtered<-as.matrix(George_FPKM_filtered)
for(i in distances_value){
for(j in clusterAlg_value){
file_name<-paste("4-George_FPKM_filtered",i,j,sep="_")#文件夹名称的设置
assign(file_name,ConsensusClusterPlus(George_FPKM_filtered,#生成相关文件夹
maxK=10,#设置最大聚类的类数
reps=20000,#重复次数
distance=i,#距离计算方法
clusterAlg=j,
title=file_name,
plot= 'png',
writeTable=T))
}
}
#读取文件夹名称
file_names = list.files(pattern = "3-George_FPKM_filtered")
#遍历文件夹,生成图片
for(i in file_names){#遍历文件夹
for(j in 2:10){#遍历文件夹下需要的文件
file_name_pre<-paste(i,".k=",j,sep="")#设置生成图片的前缀
K_names<-paste("k=",j,sep="")#获得K值
file_names1<-paste(i,K_names,"consensusClass.csv",sep=".")#设置consensusClass.csv名称
file_names2<-paste(i,K_names,"consensusMatrix.csv",sep=".")#设置consensusMatrix.csv名称
file_class<-paste(i,file_names1,sep="/")#设置路径+consensusClass.csv名称
class<-read.table(file_class,sep = ",")#读取consensusClass.csv文件
file_consensusMatrix<-paste(i,file_names2,sep="/")#设置路径+consensusMatrix.csv名称
consensusMatrix<-read.table(file_consensusMatrix,header = T,sep=",")#读取consensusMatrix.csv文件
consensusMatrix<-consensusMatrix[,-1]
colnames(consensusMatrix)<-class[,1]
rownames(consensusMatrix)<-class[,1]
class1<-class[order(class$V2),]#按聚类结果进行排序
George_FPKM1<-George_FPKM1[,class1[,1]]#表达丰度数据的colnames也按聚类结果进行排序
#设置Markers
Two_annotation_genes_FPKM_NEUROD1<-as.vector(as.matrix(George_FPKM1["NEUROD1",]))
Two_annotation_genes_FPKM_NEUROD2<-as.vector(as.matrix(George_FPKM1["NEUROD2",]))
Two_annotation_genes_FPKM_NEUROD4<-as.vector(as.matrix(George_FPKM1["NEUROD4",]))
Two_annotation_genes_FPKM_NOTCH1<-as.vector(as.matrix(George_FPKM1["NOTCH1",]))
Two_annotation_genes_FPKM_NOTCH2<-as.vector(as.matrix(George_FPKM1["NOTCH2",]))
Two_annotation_genes_FPKM_NOTCH3<-as.vector(as.matrix(George_FPKM1["NOTCH3",]))
Two_annotation_genes_FPKM_NOTCH4<-as.vector(as.matrix(George_FPKM1["NOTCH4",]))
Two_annotation_genes_FPKM_POU2F3<-as.vector(as.matrix(George_FPKM1["POU2F3",]))
Two_annotation_genes_FPKM_ASCL1<-as.vector(as.matrix(George_FPKM1["ASCL1",]))
Two_annotation_genes_FPKM_YAP1<-as.vector(as.matrix(George_FPKM1["YAP1",]))
Two_annotation_genes_FPKM_MYC<-as.vector(as.matrix(George_FPKM1["MYC",]))
Two_annotation_genes_FPKM_CALCA<-as.vector(as.matrix(George_FPKM1["CALCA",]))
Two_annotation_genes_FPKM_UCHL1<-as.vector(as.matrix(George_FPKM1["UCHL1",]))
Two_annotation_genes_FPKM_INSM1<-as.vector(as.matrix(George_FPKM1["INSM1",]))
Two_annotation_genes_FPKM_SCG2<-as.vector(as.matrix(George_FPKM1["SCG2",]))
Two_annotation_genes_FPKM_REST<-as.vector(as.matrix(George_FPKM1["REST",]))
Two_annotation_genes_FPKM_CDH5<-as.vector(as.matrix(George_FPKM1["CDH5",]))
Two_annotation_genes_FPKM_EZH2<-as.vector(as.matrix(George_FPKM1["EZH2",]))
Two_annotation_genes_FPKM_CD44<-as.vector(as.matrix(George_FPKM1["CD44",]))
Two_annotation_genes_FPKM_EPCAM<-as.vector(as.matrix(George_FPKM1["EPCAM",]))
Two_annotation_genes_FPKM_HES1<-as.vector(as.matrix(George_FPKM1["HES1",]))
Two_annotation_genes_FPKM_DDC<-as.vector(as.matrix(George_FPKM1["DDC",]))
Two_annotation_genes_FPKM_CLDN4<-as.vector(as.matrix(George_FPKM1["CLDN4",]))
Two_annotation_genes_FPKM_GRP<-as.vector(as.matrix(George_FPKM1["GRP",]))
Two_annotation_genes_FPKM_CHGA<-as.vector(as.matrix(George_FPKM1["CHGA",]))
Two_annotation_genes_FPKM_CALCB<-as.vector(as.matrix(George_FPKM1["CALCB",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CHGB<-as.vector(as.matrix(George_FPKM1["CHGB",]))
Two_annotation_genes_FPKM_SYP<-as.vector(as.matrix(George_FPKM1["SYP",]))
Two_annotation_genes_FPKM_TTF1<-as.vector(as.matrix(George_FPKM1["TTF1",]))
Two_annotation_genes_FPKM_ATOH1<-as.vector(as.matrix(George_FPKM1["ATOH1",]))
Two_annotation_genes_FPKM_NCAM1<-as.vector(as.matrix(George_FPKM1["NCAM1",]))
Two_annotation_genes_FPKM_CD24<-as.vector(as.matrix(George_FPKM1["CD24",]))
Two_annotation_genes_FPKM_CADM1<-as.vector(as.matrix(George_FPKM1["CADM1",]))
Two_annotation_genes_FPKM_ALCAM<-as.vector(as.matrix(George_FPKM1["ALCAM",]))
Two_annotation_genes_FPKM_CD151<-as.vector(as.matrix(George_FPKM1["CD151",]))
Two_annotation_genes_FPKM_EPHA2<-as.vector(as.matrix(George_FPKM1["EPHA2",]))
Two_annotation_genes_FPKM_NIPAL4<-as.vector(as.matrix(George_FPKM1["NIPAL4",]))
#设置marker在热图中的注释
ha = HeatmapAnnotation(NEUROD1 = anno_barplot(Two_annotation_genes_FPKM_NEUROD1),
NEUROD2 = anno_barplot(Two_annotation_genes_FPKM_NEUROD2),
NEUROD4 = anno_barplot(Two_annotation_genes_FPKM_NEUROD4),
NOTCH1 = anno_barplot(Two_annotation_genes_FPKM_NOTCH1),
NOTCH2 = anno_barplot(Two_annotation_genes_FPKM_NOTCH2),
NOTCH3 = anno_barplot(Two_annotation_genes_FPKM_NOTCH3),
HES1= anno_barplot(Two_annotation_genes_FPKM_HES1),
YAP1= anno_barplot(Two_annotation_genes_FPKM_YAP1),
REST= anno_barplot(Two_annotation_genes_FPKM_REST),
#NOTCH4 = anno_barplot(Two_annotation_genes_FPKM_NOTCH4),
CD44= anno_barplot(Two_annotation_genes_FPKM_CD44),
POU2F3 = anno_barplot(Two_annotation_genes_FPKM_POU2F3),
MYC= anno_barplot(Two_annotation_genes_FPKM_MYC),
INSM1= anno_barplot(Two_annotation_genes_FPKM_INSM1),
SCG2= anno_barplot(Two_annotation_genes_FPKM_SCG2),
GRP= anno_barplot(Two_annotation_genes_FPKM_GRP),
ASCL1= anno_barplot(Two_annotation_genes_FPKM_ASCL1),
DDC= anno_barplot(Two_annotation_genes_FPKM_DDC),
CALCA= anno_barplot(Two_annotation_genes_FPKM_CALCA),
CALCB= anno_barplot(Two_annotation_genes_FPKM_CALCB),
CHGA= anno_barplot(Two_annotation_genes_FPKM_CHGA),
UCHL1= anno_barplot(Two_annotation_genes_FPKM_UCHL1),
CLDN4= anno_barplot(Two_annotation_genes_FPKM_CLDN4),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
EZH2= anno_barplot(Two_annotation_genes_FPKM_EZH2),
EPCAM= anno_barplot(Two_annotation_genes_FPKM_EPCAM),
CHGB= anno_barplot(Two_annotation_genes_FPKM_CHGB),
SYP= anno_barplot(Two_annotation_genes_FPKM_SYP),
TTF1= anno_barplot(Two_annotation_genes_FPKM_TTF1),
ATOH1= anno_barplot(Two_annotation_genes_FPKM_ATOH1),
NCAM1= anno_barplot(Two_annotation_genes_FPKM_NCAM1),
CD24= anno_barplot(Two_annotation_genes_FPKM_CD24),
CADM1= anno_barplot(Two_annotation_genes_FPKM_CADM1),
ALCAM= anno_barplot(Two_annotation_genes_FPKM_ALCAM),
CD151= anno_barplot(Two_annotation_genes_FPKM_CD151),
EPHA2= anno_barplot(Two_annotation_genes_FPKM_EPHA2),
NIPAL4= anno_barplot(Two_annotation_genes_FPKM_NIPAL4),
annotation_height = c(rep(1,35)))
#对聚类结果文件进行排序
for(m in rownames(consensusMatrix)){
consensusMatrix[m,]<-as.numeric(consensusMatrix[m,])
}
consensusMatrix<-consensusMatrix[class1$V1,class1$V1]
png.names<-paste(file_name_pre,"png",sep=".")#设置生成图片名称
D_F<-paste(i,png.names,sep="/")#设置路径+名称
consensusMatrix_heatmap<-Heatmap(consensusMatrix,cluster_rows = F,cluster_columns = F,bottom_annotation=ha,show_column_names=F)#绘制热图
png(filename = D_F,width=1200,height = 1900)#生成图片
print(consensusMatrix_heatmap)
dev.off()
}}