拟时序(pseudotime)分析,又称细胞轨迹(cell trajectory)分析,根据不同细胞亚群基因表达量随时间的变化情况通过拟时分析可以来推断发育过程细胞的分化轨迹或细胞亚型的演化过程。并非一定要不同时间段做实验的结果,因为细胞本身存在拟时序变化,细胞是有变化的,可以做拟时序分析。
1.
1.1 数据预处理
# 安装并加载所需的R包
#BiocManager::install("monocle")
library(monocle)
#载入注释后的数据
load("sub_immune_B.Rdata")
sub_immune_B
## An object of class Seurat
## 36611 features across 54803 samples within 1 assay
## Active assay: RNA (36611 features, 2000 variable features)
## 3 layers present: counts, data, scale.data
## 4 dimensional reductions calculated: pca, harmony, umap, tsne
# 通过seurat结果导入数据
data <- as(as.matrix(sub_immune_B@assays$RNA@counts), 'sparseMatrix')
pd <- new('AnnotatedDataFrame', data = sub_immune_B@meta.data)
fData <- data.frame(gene_short_name = row.names(data), row.names = row.names(data))
fd <- new('AnnotatedDataFrame', data = fData)
#构建S4对象,CellDataSet
HSMM <- newCellDataSet(data,
phenoData = pd,
featureData = fd,
lowerDetectionLimit = 0.5,
expressionFamily = negbinomial.size() # 单细胞稀疏矩阵的话用negbinomial.size(),如果是UMI的话不要标准化;FPKM值用tobit();logFPKM值用gaussianff()
)
# 估计size factors 和 dispersions
HSMM <- estimateSizeFactors(HSMM)
HSMM <- estimateDispersions(HSMM)
# 过滤低质量细胞
HSMM <- detectGenes(HSMM, min_expr = 3)
print(head(fData(HSMM)))
## gene_short_name num_cells_expressed use_for_ordering
## MIR1302-2HG MIR1302-2HG 0 FALSE
## FAM138A FAM138A 0 FALSE
## OR4F5 OR4F5 0 FALSE
## AL627309.1 AL627309.1 0 FALSE
## AL627309.3 AL627309.3 0 FALSE
## AL627309.2 AL627309.2 0 FALSE
expressed_genes <- row.names(subset(fData(HSMM),
num_cells_expressed >= 10))
head(pData(HSMM))
orig.ident nCount_RNA nFeature_RNA percent.mt RNA_snn_res.0.8 seurat_clusters Sample RNA_snn_res.0.5 celltype Disease Size_Factor num_genes_expressed
## Sample1_GGCACGTCGAAGTCCAT SeuratProject 993 412 0 14 14 N11 18 B_cell Normal 0.2883241 20
## Sample1_GCCATGGATGTAGTCTC SeuratProject 1057 564 0 4 4 N11 7 B_cell Normal 0.3069069 17
## Sample1_GGAGAACGCTCGAGTGA SeuratProject 1223 632 0 8 8 N11 11 B_cell Normal 0.3551061 59
## Sample1_TCGGTCTATGGTCTAAG SeuratProject 1078 328 0 2 2 N11 1 B_cell Normal 0.3130044 16
## Sample1_AAGAACATACCGGACGT SeuratProject 1090 436 0 13 13 N11 17 B_cell Normal 0.3164886 31
## Sample1_CGTAAGTGCTGATTCAC SeuratProject 1089 572 0 3 3 N11 6 B_cell Normal 0.3161983 35
1.2 选择基因
diff_test_res <- differentialGeneTest(HSMM[expressed_genes,],
fullModelFormulaStr = "~ RNA_snn_res.0.8")
# 找出那些在不同细胞类型之间有显著差异表达的基因,以用于细胞排序。 作者:小云爱生信 https://www.bilibili.com/read/cv33990870/ 出处:bilibili
ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
ordering_genes_top <- row.names(diff_test_res)[order(diff_test_res$qval)][1:2000] # 选择top2000的基因
HSMM <- setOrderingFilter(HSMM, ordering_genes)
HSMM_top <- setOrderingFilter(HSMM, ordering_genes_top)
plot_ordering_genes(HSMM_top)
★ 这里选择基因的方式有很多,说明文档中建议以下4种选择基因的方式:
1.选择发育差异表达基因
2.选择clusters差异表达基因
3.选择离散程度高的基因
4.自定义发育marker基因
# 降维 & 排序
HSMM_top <- reduceDimension(HSMM_top, max_components = 3,
num_dim = 20,
method = 'DDRTree') # DDRTree方式
HSMM_top <- orderCells(HSMM_top)
1.3 可视化
1.3.1 基于各种“类型”可视化
colour = c('#C6DADF','#E26EB8','#918FBE','#A2C5A6','#EED784','#D4B796','#F5BDC8','#F0BDD8','#77A1EC','#F8D39C','#F17591','#CCC9E6')
a1 <- plot_cell_trajectory(HSMM_top, color_by = "RNA_snn_res.0.8") + scale_color_manual(values = colour)
a2 <- plot_cell_trajectory(HSMM_top, color_by = "State") + scale_color_manual(values = colour)
a3 <- plot_cell_trajectory(HSMM_top, color_by = "Pseudotime")
a4 <- plot_cell_trajectory(HSMM_top, color_by = "Disease") + scale_color_manual(values = colour)
(a1 + a2) / (a3 + a4)
1.3.2 添加“树形图”
p1 <- plot_cell_trajectory(HSMM_top, x = 1, y = 2, color_by = "RNA_snn_res.0.8") +
theme(legend.position='none',panel.border = element_blank()) + #去掉第一个的legend
scale_color_manual(values = colour)
p2 <- plot_complex_cell_trajectory(HSMM_top, x = 1, y = 3,
color_by = "RNA_snn_res.0.8")+
scale_color_manual(values = colour) +
theme(legend.title = element_blank())
p1 | p2
1.3.3 细胞密度图和热图
# 沿时间轴的细胞密度图
p_density <- ggplot(df, aes(Pseudotime, colour = RNA_snn_res.0.8, fill = RNA_snn_res.0.8)) +
geom_density(bw = 0.5, size = 1,alpha = 0.5) +
theme_classic2() +
scale_fill_manual(name = "", values = colour) +
scale_color_manual(name= "", values = colour )
# 寻找拟时差异基因(qvalue体现基因与拟时的密切程度)绘制热图。
Time_diff <- differentialGeneTest(HSMM_top[ordering_genes_top,], cores = 4,
fullModelFormulaStr = "~sm.ns(Pseudotime)")
Time_diff <-Time_diff[, c(5, 2, 3, 4, 1, 6, 7)] # 把gene放前面,也可以不改
#write.csv(Time_diff, "Time_T_cell_all.csv", row.names = F)
Time_genes <- Time_diff %>% arrange(qval) %>% head(50) %>% pull(gene_short_name) %>% as.character() # 提取Top50gene
p_pseudotime <- plot_pseudotime_heatmap(HSMM_top[Time_genes,],
cores = 6,
num_clusters = 3,
show_rownames = T, return_heatmap = T,
hmcols = colorRampPalette(c("navy", "white", "firebrick3"))(100))
# 提取cluste对应的基因集
clusters <- cutree(p_pseudotime$tree_row, k = 3)
clustering <- data.frame(clusters)
clustering[,1] <- as.character(clustering[,1])
colnames(clustering) <- "Gene_Clusters"
table(clustering)
write.csv(clustering, "clustering_B_cell.csv", row.names = F)
p_density
p_pseudotime
1.3.4 指定基因的可视化
# 直接查看一些基因随细胞状态等的表达变化,选择前4个top基因并将其对象取出
keygenes <- head(Time_genes, 4)
HSMM_subset <- HSMM_top[keygenes,]
# 可视化:以state/cluster/pseudotime进行
p1 <- plot_genes_in_pseudotime(HSMM_subset, color_by = "State")
p2 <- plot_genes_in_pseudotime(HSMM_subset, color_by = "RNA_snn_res.0.8")
p3 <- plot_genes_in_pseudotime(HSMM_subset, color_by = "Pseudotime")
p1|p2|p3
# 也可以自己指定基因
s.genes <-c("SELL","CCR7","IL7R","CD84","CCL5","S100A4")
p1 <- plot_genes_jitter(HSMM_top[s.genes,],grouping = "State", color_by = "State")
p2 <- plot_genes_violin(HSMM_top[s.genes,],grouping = "State", color_by = "State")
p3 <-plot_genes_in_pseudotime(HSMM_top[s.genes,], color_by = "State")
p1|p2|p3
1.3.5 分支点的差异分析
在monocle中的分析中发现,细胞群能从一个轨迹分叉成不同的分支点,Monocle采用分支表达式分析建模,主要是BEAM函数,可以将分叉过程重构为一个分支轨迹,从而分析不同细胞命运下的差异。
# BEAM进行统计分析(此处,分析branch_point = 1这个分支处的细胞命名分叉是如何进行的)
BEAM_res <- BEAM(HSMM_top[Time_genes,], branch_point = 3, cores = 2)
# 对影响分析的基因根据qvalue进行排序
BEAM_res <-BEAM_res[order(BEAM_res$qval),]
BEAM_res <-BEAM_res[,c("gene_short_name", "pval", "qval")]
head(BEAM_res)
## gene_short_name pval qval
## CD52 CD52 0 0
## LAPTM5 LAPTM5 0 0
## BCAR3 BCAR3 0 0
## NIBAN1 NIBAN1 0 0
## PTPRC PTPRC 0 0
## REL REL 0 0
write.csv(BEAM_res, "BEAM_res.csv", row.names = F)
# 绘制与分支相关的基因热图
plot_genes_branched_heatmap(HSMM_top[row.names(subset(BEAM_res,qval < 1e-4)),], branch_point = 3,
num_clusters = 3, cores = 4, use_gene_short_name= T, show_rownames = T)
# 也可选TOP 100基因可视化
BEAM_genes <- top_n(BEAM_res, n = 100, desc(qval)) %>% pull(gene_short_name) %>% as.character() # 这里前面已选择TOP50“Time_genes“
plot_genes_branched_heatmap(HSMM_top[BEAM_genes,], branch_point = 3, num_clusters = 3, show_rownames = T, return_heatmap = T)
ggsave("BEAM_heatmap.pdf",p$ph_res, width = 6.5, height = 10)
2.
2.1 数据预处理
# 安装并加载所需的R包
library(tidyverse)
library(viridis)
library(tidydr)
library(ggforce)
# 提取散点(细胞信息)
data_df <- t(reducedDimS(HSMM_top)) %>% as.data.frame() %>%
select_('Component 1' = 1, 'Component 2' = 2) %>%
rownames_to_column("Cells") %>%
mutate(pData(HSMM_top)$State,
pData(HSMM_top)$Pseudotime,
pData(HSMM_top)$Sample,
pData(HSMM_top)$Disease,
pData(HSMM_top)$RNA_snn_res.0.8)
head(data_df)
## cells Component_1 Component_2 State Pseudotime Sample Disease Cluster
## 1 Sample1_GGCACGTCGAAGTCCAT -0.2556758 -0.39954223 12 16.312038 N11 Normal 14
## 2 Sample1_GCCATGGATGTAGTCTC -6.3751636 -2.72973839 1 9.154760 N11 Normal 4
## 3 Sample1_GGAGAACGCTCGAGTGA -4.8346884 10.82667623 10 32.885713 N11 Normal 8
## 4 Sample1_TCGGTCTATGGTCTAAG 2.4437216 -0.08049483 6 18.501786 N11 Normal 2
## 5 Sample1_AAGAACATACCGGACGT 0.6226999 -0.58489902 5 17.082317 N11 Normal 13
## 6 Sample1_CGTAAGTGCTGATTCAC -12.6889563 -4.95611584 1 2.298391 N11 Normal 3
# 修改列名
colnames(data_df) <- c("cells","Component_1","Component_2","State",
"Pseudotime","Sample","Disease","Cluster")
# 提取轨迹线
reduced_dim_coords <- reducedDimK(HSMM_top)
ica_space_df <- Matrix::t(reduced_dim_coords) %>%
as.data.frame() %>%
select_(prin_graph_dim_1 = 1, prin_graph_dim_2 = 2) %>%
mutate(sample_name = rownames(.), sample_state = rownames(.))
dp_mst <- minSpanningTree(HSMM_top)
edge_df <- dp_mst %>%
igraph::as_data_frame() %>%
select_(source = "from", target = "to") %>%
left_join(ica_space_df %>% select_(source="sample_name", source_prin_graph_dim_1="prin_graph_dim_1", source_prin_graph_dim_2="prin_graph_dim_2"), by = "source") %>%
left_join(ica_space_df %>% select_(target="sample_name", target_prin_graph_dim_1="prin_graph_dim_1", target_prin_graph_dim_2="prin_graph_dim_2"), by = "target")
head(edge_df)
## source target source_prin_graph_dim_1 source_prin_graph_dim_2 target_prin_graph_dim_1 target_prin_graph_dim_2
## 1 Y_1 Y_127 -1.9808689 -1.6954309 -1.974843 -1.69475171
## 2 Y_2 Y_27 -9.6308815 -4.2806869 -9.842792 -4.34748024
## 3 Y_2 Y_61 -9.6308815 -4.2806869 -9.418861 -4.21382639
## 4 Y_3 Y_108 3.2228720 0.1019687 3.330707 0.04139265
## 5 Y_3 Y_132 3.2228720 0.1019687 3.114318 0.16312487
## 6 Y_4 Y_59 0.8357722 3.9965879 1.055543 3.74468161
2.2 可视化及美化
# 饼图信息
Cellratio <- prop.table(table(data_df$State, data_df$Disease), margin = 2) # 计算各组样本不同细胞群比例
Cellratio <- as.data.frame(Cellratio)
colnames(Cellratio) <- c('State',"Disease","Freq")
# 可视化
ggplot() +
geom_point(data = data_df, aes(x = Component_1,
y = Component_2,
color =Pseudotime)) +
scale_color_viridis() + # 调整颜色
theme_bw() +
theme_dr(arrow = grid::arrow(length = unit(0, "inches"))) + # 坐标轴主题修改
theme(
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.ticks.length = unit(0.8, "lines"),
axis.ticks = element_blank(),
axis.line = element_blank(),
axis.title = element_text(size=15),
) +
geom_segment(aes_string(x = "source_prin_graph_dim_1",
y = "source_prin_graph_dim_2",
xend = "target_prin_graph_dim_1",
yend = "target_prin_graph_dim_2"),
size=0.75, linetype = "solid", na.rm = TRUE, data= edge_df) + # 添加轨迹线
geom_arc(arrow = arrow(length = unit(0.15, "inches"), type = "closed", angle = 30),
aes(x0 = 1, y0 = -3, r = 4, start = -2, end = 0.5),
lwd = 1.5, color = "grey") +
geom_arc_bar(data = subset(Cellratio, State == '3'), stat = "pie",
aes(x0 = 0.1,y0 = 0.5,r0 = 0, r = 0.8, amount = Freq,fill = Disease)) +
scale_fill_manual(values = c("#3D6590", "#F7D46E","#DC615F")) # 自定义组别(Disease)颜色