使用AUCell识别单细胞rna数据中具有活性“基因集”(i.e. gene signatures)的细胞。AUCell使用“曲线下面积”(Area Under the Curve,AUC)来计算输入基因集的一个关键子集是否在每个细胞的表达基因中富集。AUC分数在所有细胞的分布允许探索signatures的相对表达。
AUCell允许在单细胞rna数据中识别具有活性基因集(如签名、基因模块)的细胞。简言之,运行AUCell的工作流基于三个步骤:
- Build the rankings
- Calculate the Area Under the Curve (AUC)
- Set the assignment thresholds
其实我们发现在SCENIC 包的分析过程中,已经封装了AUCell。在单细胞数据的下游分析中往往聚焦于某个有意思的基因集(gene set),已经发展出许多的富集方法。但是大部分的富集分析往往都是单向的:输入基因集输出通路(生物学意义),但是很少有可以从基因集富集信息反馈到样本上来的。AUCell在做这样的尝试。
应用案例可以参考:
下面通过一个简短的示例来说明它是如和运作的。
library(AUCell)
library(clusterProfiler)
library(ggplot2)
library(Seurat)
library(SeuratData)
##seurat RDS object
cd8.seurat <- pbmc3k.final
cells_rankings <- AUCell_buildRankings(cd8.seurat@assays$RNA@data) # 关键一步
# ?AUCell_buildRankings
##load gene set, e.g. GSEA lists from BROAD
c5 <- read.gmt("c5.cc.v7.1.symbols.gmt") ## ALL GO tm
这个c5.cc.v7.1.symbols.gmt
文件可以在GSEA上面下载,要做下游分析通路是要会背的。
这里记录的是每个通路及其对应的基因集:
> head(c5$ont)
[1] GO_FILOPODIUM GO_FILOPODIUM GO_FILOPODIUM GO_FILOPODIUM GO_FILOPODIUM GO_FILOPODIUM
999 Levels: GO_FILOPODIUM GO_NUCLEAR_CHROMOSOME_TELOMERIC_REGION GO_CLATHRIN_SCULPTED_VESICLE GO_I_BAND ... GO_PROXIMAL_DENDRITE
> head(c5$gene)
[1] "ABI1" "ARL4C" "ABI2" "FARP1" "CDK5" "TUBB3"
geneSets <- lapply(unique(c5$ont), function(x){print(x);c5$gene[c5$ont == x]})
names(geneSets) <- unique(c5$ont)
此时,我们可以根据GO通路找到对应的基因:
geneSets$GO_I_BAND
[1] "DNAJB6" "MYZAP" "STUB1" "MYL12B" "MYL9" "NEBL" "GLRX3" "PDLIM5" "CFL2" "FERMT2" "LDB3"
[12] "AHNAK2" "SYNPO" "FBXO32" "C10orf71" "MYOM3" "XIRP2" "KLHL40" "CRYAB" "CSRP1" "ADRA1A" "CTNNB1"
[23] "DES" "SYNPO2" "DMD" "SMTNL1" "ALDOA" "FHL2" "FHL3" "FKBP1A" "FKBP1B" "PALLD" "FLNA"
[34] "FLNB" "FLNC" "SYNE2" "OBSL1" "FRG1" "FBXO22" "ANKRD2" "ITGB1BP2" "ANKRD1" "PDLIM3" "BMP10"
[45] "BIN1" "FBXL22" "ANK1" "ANK2" "ANK3" "PARVB" "PRICKLE4" "HRC" "HSPB1" "KY" "CAVIN4"
[56] "JUP" "KCNA5" "KCNE1" "KCNN2" "KRT8" "KRT19" "MTM1" "MYH6" "MYH7" "MYL3" "PPP1R12A"
[67] "PPP1R12B" "NEB" "NOS1" "NRAP" "ATP2B4" "PAK1" "PDE4B" "MYOZ2" "PGM5" "PPP2R5A" "PPP3CA"
[78] "PPP3CB" "PARVA" "SCN3B" "PSEN1" "PSEN2" "JPH1" "JPH2" "TRIM54" "MYOZ1" "RYR1" "RYR2"
[89] "RYR3" "S100A1" "SCN1A" "SCN5A" "SCN8A" "PDLIM2" "SLC2A1" "SLC4A1" "SLC8A1" "SMN1" "SMN2"
[100] "DST" "SRI" "ACTC1" "TTN" "CACNA1C" "CACNA1D" "CACNA1S" "SYNPO2L" "FHOD3" "CSRP3" "ACTN4"
[111] "SYNC" "CAPN3" "OBSCN" "CASQ1" "CASQ2" "MYPN" "TRIM63" "SORBS2" "MYO18B" "TCAP" "PDLIM4"
[122] "CAV3" "ACTN1" "MYOM1" "FBP2" "ACTN2" "KAT2B" "AKAP4" "IGFN1" "PDLIM1" "NEXN" "MYOM2"
[133] "MYOZ3" "PDLIM7" "HOMER1" "FHL5" "MYOT" "BAG3" "NOS1AP" "HDAC4"
计算AUC:
?AUCell_calcAUC
cells_AUC <- AUCell_calcAUC(geneSets, cells_rankings, aucMaxRank=nrow(cells_rankings)*0.1)
找一些通路(该找哪些通路呢?)
> length(rownames(cells_AUC@assays@data$AUC))
[1] 958
> grep("REG",rownames(cells_AUC@assays@data$AUC),value = T)
[1] "GO_NUCLEAR_CHROMOSOME_TELOMERIC_REGION" "GO_CHROMOSOME_CENTROMERIC_REGION"
[3] "GO_CYTOPLASMIC_REGION" "GO_PROTEASOME_REGULATORY_PARTICLE_BASE_SUBCOMPLEX"
[5] "GO_PERINUCLEAR_REGION_OF_CYTOPLASM" "GO_CHROMOSOMAL_REGION"
[7] "GO_CELL_CORTEX_REGION" "GO_PARANODE_REGION_OF_AXON"
[9] "GO_CONDENSED_CHROMOSOME_CENTROMERIC_REGION" "GO_CONDENSED_NUCLEAR_CHROMOSOME_CENTROMERIC_REGION"
[11] "GO_PLASMA_MEMBRANE_REGION" "GO_CHROMOSOME_TELOMERIC_REGION"
[13] "GO_MEMBRANE_REGION" "GO_PROTEASOME_REGULATORY_PARTICLE_LID_SUBCOMPLEX"
[15] "GO_MYELIN_SHEATH_ABAXONAL_REGION" "GO_MYELIN_SHEATH_ADAXONAL_REGION"
[17] "GO_JUXTAPARANODE_REGION_OF_AXON" "GO_REGION_OF_CYTOSOL"
[19] "GO_CENTRAL_REGION_OF_GROWTH_CONE"
我们选择GO_PERINUCLEAR_REGION_OF_CYTOPLASM
。
> ##set gene set of interest here for plotting
> geneSet <- "GO_PERINUCLEAR_REGION_OF_CYTOPLASM"
> aucs <- as.numeric(getAUC(cells_AUC)[geneSet, ])
> cd8.seurat$AUC <- aucs
> df<- data.frame(cd8.seurat@meta.data, cd8.seurat@reductions$umap@cell.embeddings)
> head(df)
orig.ident nCount_RNA nFeature_RNA seurat_annotations percent.mt RNA_snn_res.0.5 seurat_clusters AUC UMAP_1
AAACATACAACCAC pbmc3k 2419 779 Memory CD4 T 3.0177759 1 1 0.06102966 -4.232792
AAACATTGAGCTAC pbmc3k 4903 1352 B 3.7935958 3 3 0.08560310 -4.892886
AAACATTGATCAGC pbmc3k 3147 1129 Memory CD4 T 0.8897363 1 1 0.08328099 -5.508639
AAACCGTGCTTCCG pbmc3k 2639 960 CD14+ Mono 1.7430845 2 2 0.07669723 11.332233
AAACCGTGTATGCG pbmc3k 980 521 NK 1.2244898 6 6 0.06250478 -7.450703
AAACGCACTGGTAC pbmc3k 2163 781 Memory CD4 T 1.6643551 1 1 0.08204075 -3.509504
UMAP_2
AAACATACAACCAC -4.152139
AAACATTGAGCTAC 10.985685
AAACATTGATCAGC -7.211088
AAACCGTGCTTCCG 3.161727
AAACCGTGTATGCG 1.092022
AAACGCACTGGTAC -6.087042
我们看到每个细胞现在都加上AUC值了,下面做一下可视化。
class_avg <- df %>%
group_by( seurat_annotations) %>%
summarise(
UMAP_1 = median(UMAP_1),
UMAP_2 = median(UMAP_2)
)
ggplot(df, aes(UMAP_1, UMAP_2)) +
geom_point(aes(colour = AUC)) + viridis::scale_color_viridis(option="A") +
ggrepel::geom_label_repel(aes(label = seurat_annotations),
data = class_avg,
size = 6,
label.size = 0,
segment.color = NA
)+ theme(legend.position = "none") + theme_bw()
关键是,你要找到基因集啊。
https://bioconductor.org/packages/release/bioc/html/AUCell.html