5.细胞类型注释
多样本的注释和单样本的是一样的。
用的之前的自动注释
library(celldex)
library(SingleR)
ls("package:celldex")
f = "ref_BlueprintEncode.RData"
if(!file.exists(f)){
ref <- celldex::BlueprintEncodeData()
save(ref,file = f)
}
ref <- get(load(f))
library(BiocParallel)
scRNA = sce.all
test = scRNA@assays$RNA$data
pred.scRNA <- SingleR(test = test,
ref = ref,
labels = ref$label.main,
clusters = scRNA@active.ident)
pred.scRNA$pruned.labels
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
scRNA <- RenameIdents(scRNA,new.cluster.ids)
save(scRNA,file = "scRNA.Rdata")
p2 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) + NoLegend()
p1+p2
6.分组可视化及组间细胞比例比较
示例中:54 55 56是实验组,57 58 59是对照
如果看不到,需要自己去提取,方法如下:
library(tinyarray)
edat = geo_download("GSE231920")
pd = edat$pd
从右边点pd可以看到pd的内容
6.1 ==和%in%
==
表示判断是否相等,会把x和y一一比较,相等返回TRUE,不相等返回FALSE。且==
只会把x的第一个位置和y的第一个位置比较,x的第二个位置和y的第二个位置比较,不会错位来比较。
%in%
可以实现错位比较!只要x里面的元素在y里面是有的,就可以返回TRUE,不受位置限制。当x和y的长度(也就是元素个数)不一样时,%in%
也同样好用(这时候就不考虑==了)。
x <- c('a','b','c')
y <- c('c','b','a')
x == y
## [1] FALSE TRUE FALSE
x %in% y #x里的每个元素在y里面有没有
## [1] TRUE TRUE TRUE
y %in% x #y里的每个元素在x里面有没有
## [1] TRUE TRUE TRUE
#产生出来的逻辑值会和%in% 前面的那个数据一一对应。
a = c(1,4,6,2,5)
b = c(2,5,7)
a %in% b #5个逻辑值,和a一一对应,a里的每个元素在b里面有没有
## [1] FALSE FALSE FALSE TRUE TRUE
b %in% a #3个逻辑值,和b一一对应,b里的每个元素在a里面有没有
## [1] TRUE TRUE FALSE
6.2 ifelse
可以根据逻辑值是T还是F产生不同的值
age <- 0:28
ifelse(age>18,'+','-')
## [1] "-" "-" "-" "-" "-" "-" "-" "-" "-" "-" "-" "-" "-" "-" "-" "-" "-" "-" "-"
## [20] "+" "+" "+" "+" "+" "+" "+" "+" "+" "+"
#结合前面的%in%
ifelse(a %in% b,"在","不在")
## [1] "不在" "不在" "不在" "在" "在"
6.3 添加分组信息
scRNA$group = ifelse(scRNA$orig.ident %in% c("sample1","sample2","sample3"), "treat","control")
DimPlot(scRNA, reduction = "umap", group.by = "group")
这里写scRNAgroup其实是一样的,都会给meta.data添加一列,名为group。
6.4 计算每个亚群的细胞数量和占全部细胞的比例
# 每种细胞的数量和比例
cell_counts <- table(Idents(scRNA))
cell.all <- cbind(cell_counts = cell_counts,
cell_Freq = round(prop.table(cell_counts)*100,2))
#各组中每种细胞的数量和比例
cell.num.group <- table(Idents(scRNA), scRNA$group)
cell.freq.group <- round(prop.table(cell.num.group, margin = 2) *100,2)
cell.all = cbind(cell.all,cell.num.group,cell.freq.group)
cell.all = cell.all[,c(1,3,4,2,5,6)]
colnames(cell.all) = paste(rep(c("all","control","treat"),times = 2),
rep(c("count","freq"),each = 3),sep = "_")
cell.all
## all_count control_count treat_count all_freq control_freq treat_freq
##CD4+ T-cells 1007 283 724 24.98 13.88 36.35
##Fibroblasts 762 664 98 18.90 32.56 4.92
##B-cells 667 183 484 16.55 8.97 24.30
##CD8+ T-cells 547 204 343 13.57 10.00 17.22
##Neutrophils 378 260 118 9.38 12.75 5.92
##Monocytes 311 208 103 7.72 10.20 5.17
##Adipocytes 244 164 80 6.05 8.04 4.02
##NK cells 87 51 36 2.16 2.50 1.81
##Endothelial cells 28 22 6 0.69 1.08 0.30
7.差异分析
7.1 找某一细胞内部的组间差异基因
sub.obj = subset(scRNA,idents = "NK cells")
dfmarkers <- FindMarkers(scRNA, ident.1 = "treat", group.by = "group",min.pct = 0.25, logfc.threshold = 0.25,verbose = F)
head(dfmarkers)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## XIST 0.000000e+00 -13.156832 0.000 0.655 0.000000e+00
## RPS4Y1 0.000000e+00 12.921082 0.650 0.000 0.000000e+00
## DDX3Y 0.000000e+00 12.553759 0.531 0.000 0.000000e+00
## RPS26 2.582808e-301 1.878636 0.905 0.733 6.290945e-297
## IGHG4 2.033975e-197 6.489441 0.444 0.037 4.954154e-193
## CFD 2.462775e-197 -4.424015 0.142 0.575 5.998581e-193
7.2 找conserved marker基因
西湖镜像
options(BioC_mirror="https://mirrors.westlake.edu.cn/bioconductor")
if(!require("multtest"))BiocManager::install('multtest')
if(!require("metap"))install.packages('metap')
sub.markers <- FindConservedMarkers(scRNA, ident.1 = "NK cells", grouping.var = "group", min.pct = 0.25, logfc.threshold = 0.25,verbose = F)
head(sub.markers)
FindConservedMarkers和前面的FindMarkers 不大一样。它结合了“分组”和“细胞类型”,是找出不同细胞类型之间在不同条件(treat和control)下都有差异的基因。结合上面的代码来看,就是找出在control组内部NK和其他细胞的差异基因,再找出treat组内部NK和其他细胞的差异基因,二者取交集,就是NK和其他细胞的差异保守的基因,即conservedmarkers。
7.3 组间比较的气泡图
markers.to.plot = c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
"CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
"GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "PRSS57") #一组感兴趣的基因
#如果idents有NA会报错https://github.com/satijalab/seurat/issues/8772
#scRNA <- subset(scRNA, seurat_annotations %in% na.omit(scRNA$seurat_annotations))
DotPlot(scRNA, features = markers.to.plot, cols = c("blue", "red"),
dot.scale = 8, split.by = "group") +
RotatedAxis()
7.4 FeaturePlot
一行是一个基因,一列是一个分组
FeaturePlot(scRNA, features = c("CD3D", "GNLY", "IFI6", "PRSS57"), split.by = "group", max.cutoff = 3, cols = c("grey",
"red"), reduction = "umap")
7.5 VlnPlot
把每种细胞类型分成两组来展示
plots <- VlnPlot(scRNA, features = c("LYZ", "ISG15", "MS4A1", "IFI6"),
split.by = "group", group.by = "seurat_annotations",
pt.size = 0, combine = FALSE)
library(patchwork)
wrap_plots(plots = plots, ncol = 2)
8.伪bulk 转录组差异分析
8.1 单细胞样本整合为伪bulk转录组样本
每个组要有多个样本才能做。
AggregateExpression是把单细胞数据整合为常规转录组数据的方式。group.by = c("seurat_annotations","orig.ident", "group")就是同一细胞类型、同一样本、同一分组的细胞汇总在一起成为一个“样本”。
bulk <- AggregateExpression(scRNA, return.seurat = T, slot = "counts", assays = "RNA", group.by = c("seurat_annotations","orig.ident", "group"))
colnames(bulk)#整合成了多个“样本”
可以只取出一种细胞,然后找treat和control之间的差异基因,这和常规转录组的差异分析是一样的。
sub <- subset(bulk, seurat_annotations == "Monocytes")
colnames(sub)
##[1] "Monocytes_sample1_treat" "Monocytes_sample2_treat" "Monocytes_sample3_treat"
##[4] "Monocytes_sample4_control" "Monocytes_sample5_control" "Monocytes_sample6_control"
Idents(sub) <- "group"
de_markers <- FindMarkers(sub, ident.1 = "treat", ident.2 = "control", slot = "counts", test.use = "DESeq2",
verbose = F)
de_markers$gene <- rownames(de_markers)
8.2 逻辑值连接符号
&
是AND,|
(shift+回车上方的)是OR
8.3 火山图
k1 = de_markers$avg_log2FC< -1 & de_markers$p_val <0.01
k2 = de_markers$avg_log2FC> 1 & de_markers$p_val <0.01
de_markers$change <- ifelse(k1,"down",ifelse(k2,"up","not"))
#满足k1就是down,不满足的话再看是否满足k2,满足k2就是up,不满足就是not。
library(ggplot2)
ggplot(de_markers, aes(avg_log2FC, -log10(p_val),color = change)) +
geom_point(size = 2, alpha = 0.5) +
geom_vline(xintercept = c(1,-1),linetype = 4)+
geom_hline(yintercept = -log10(0.01),linetype = 4)+
scale_color_manual(values = c("#2874C5", "grey", "#f87669"))+
theme_bw() +
ylab("-log10(unadjusted p-value)")