1. 首先GEO下载的数据可能会有问题:
1.1 genes.tsv文件只有一列,缺第一列和第三列,遇到这种情况第一列ENSG123433这种字符串补上即可,第三列全部写Gene Expression;
1.2 .mtx文件也可能有问题:主要是第一列和第二列顺序错误,需要调换。第一列是基因编号,第二列是细胞编号才对;另外第三列count必须是整数,如果不是整数要改为整数
1.3 下载的数据的分组信息在xxx_metadata.tsv里面,需要自己写代码整合到barcodes.tsv文件中,详见后面的代码
上述问题解决后,把文件改为barcodes.tsv.gz,features.tsv.gz,matrix.mtx.gz就可以使用Seurat包的Read10X正常读取。
这里以GSE151674的数据为例进行分析,代码如下:
```
############################################################################
#####chongzuo chongzuo chongzuo########
setwd('F://danxibaofenxi20230529//GSE151674//analysis')
#BiocManager::install('SoupX')
rm(list=ls())
library(dplyr)
#install.packages('dplyr')
library(Seurat)
#BiocManager::install('lazyeval')
library(patchwork)
## =============1.Load the PBMC dataset
data_dir <- "."
pbmc.data <- Read10X(data.dir = data_dir)
pbmc.data
pbmc <- CreateSeuratObject(counts = pbmc.data,
project = "pbmc3k",
min.cells = 3,
min.features = 200)
head(pbmc@meta.data)
library(tidyverse) #加载tidyverse
#BiocManager::install('tidyverse')
#nstall.packages("tidyverse")
rownames(pbmc@meta.data) -> pbmc@meta.data$rowLeo #反向赋值,增加了1列
head(pbmc@meta.data) #展示如下
str_split(pbmc$rowLeo,"_")
head(str_split(pbmc$rowLeo,"_",simplify=T) [,3])
str_split(pbmc$rowLeo,"_",simplify=T) [,3] -> pbmc@meta.data$group #反向赋值,实现分组
head(pbmc@meta.data)
names(pbmc@meta.data)
unique(pbmc$group)
####
## =============3.QC
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
grep(pattern = "^mt-", rownames(pbmc), value = T)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^mt-")
##
# QC指标
head(pbmc@meta.data, 5)
summary(pbmc@meta.data$nCount_RNA)
# QC指标使用小提琴图可视化
#VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# 密度曲线图
data <- pbmc@meta.data
library(ggplot2)
p <- ggplot(data = data, aes(x=nFeature_RNA)) + geom_density()
p
# 两个指标之间的相关性
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
# 过滤
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
## =============4.标准化
# LogNormalize
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
# 标准化后的值保存在:pbmc[["RNA"]]@data
normalized.data <- pbmc[["RNA"]]@data
normalized.data[1:20,1:4]
dim(normalized.data)
##5、选择高变基因
## =============5.鉴定高变基因
# 高变基因:在一些细胞中表达高,另一些细胞中表达低的基因
# 变异指标: mean-variance relationship
# 默认返回两千个高变基因,用于下游如PCA降维分析。
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# 提取前10的高变基因
top10 <- head(VariableFeatures(pbmc), 10)
top10
all <- head(VariableFeatures(pbmc), 2000)
write.csv(all,'allGaoBianGene.csv')
# 展示高变基因
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
#6、Scaling the data 归一化数据
## =============6.Scaling the data
# 归一化处理:每一个基因在所有细胞中的均值变为0,方差标为1,对于降维来说是必需步骤
# 归一化后的值保存在:pbmc[["RNA"]]@scale.data
pbmc <- ScaleData(pbmc)
scale.data <- pbmc[["RNA"]]@scale.data
dim(scale.data)
scale.data[1:10,1:4]
# 可以选择全部基因归一化 !注意! !注意!
#all.genes <- rownames(pbmc)
#pbmc <- ScaleData(pbmc, features = all.genes)
## =============7.降维
# PCA降维,默认使用前面2000个高变基因,可以使用features改变用于降维的基因集
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
# 可视化
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 1:6, cells = 500, balanced = TRUE)
## =============8.确定使用PC个数
# each PC essentially representing a ‘metafeature’
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:20)
ElbowPlot(pbmc)
## =============9.对细胞聚类
# 首先基于PCA空间构建一个基于欧氏距离的KNN图
pbmc <- FindNeighbors(pbmc, dims = 1:20)
# 聚类并最优化
# resolution参数:值越大,细胞分群数越多,
# 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells
# Optimal resolution often increases for larger datasets.
pbmc <- FindClusters(pbmc, resolution = 0.5)
# 查看聚类数ID
head(Idents(pbmc), 5)
# 查看每个类别多少个细胞
head(pbmc@meta.data)
table(pbmc@meta.data$seurat_clusters)
##
## =============10.将细胞在低维空间可视化UMAP/tSNE
pbmc <- RunUMAP(pbmc, dims = 1:20)
pbmc <- RunTSNE(pbmc, dims = 1:20)
DimPlot(pbmc,split.by = 'group')
# 可视化
#DimPlot(pbmc, reduction = "umap", label = T, label.size = 5)
#DimPlot(pbmc, reduction = "tsne", label = T, label.size = 5)
saveRDS(pbmc, file = "pbmc_tutorial.rds")
####
###1.细胞注释
############细胞注释2
# 清空当前环境变量
#rm(list=ls())
options(stringsAsFactors = F)
# 加载包
library(Seurat)
library(SingleR)
#BiocManager::install('celldex')
refdata <- HumanPrimaryCellAtlasData()
save(refdata,file = "refdata.RData")
load("./refdata.RData")
getwd()
refdata
head(colnames(refdata))
head(rownames(refdata))
# 查看共有多少种细胞类型
unique(refdata@colData@listData[["label.main"]])
# 加载pbmc结果
#pbmc <- readRDS("pbmc_tutorial.rds")
# 使用的数据为标化后的数据
testdata <- GetAssayData(pbmc, slot="data")
dim(testdata)
testdata[1:30,1:4]
clusters <- pbmc@meta.data$seurat_clusters
table(clusters)
cellpred <- SingleR(test = testdata,
ref = refdata,
labels = refdata$label.main,
method = "cluster",
clusters = clusters,
assay.type.test = "logcounts",
assay.type.ref = "logcounts")
str(cellpred,max.level = 3)
metadata <- cellpred@metadata
head(metadata)
celltype = data.frame(ClusterID = rownames(cellpred),
celltype = cellpred$labels,
stringsAsFactors = F)
celltype
write.csv(celltype, "celltype_anno_SingleR.csv")
levels(pbmc)
new.cluster.ids <- c("Neutrophils","Neutrophils","DC",
"Neutrophils","Neutrophils","Neutrophils",
"Epithelial_cells","Monocyte","Neutrophils",
"HSC_-G-CSF","Fibroblasts","DC","Neutrophils",
"Neutrophils","DC","Neutrophils","Neutrophils",
"Neutrophils","Neutrophils","Macrophage","Neutrophils",
"Gametocytes","HSC_-G-CSF")
names(new.cluster.ids) <- levels(pbmc)
new.cluster.ids
pbmc <- RenameIdents(pbmc, new.cluster.ids)
###
VlnPlot(pbmc,features = c("Irak4", "Cdk9", 'Tyk2', 'Jak1', 'Jak2', 'Jak3'),idents = '0', split.by = 'group',split.plot = TRUE)
DotPlot(pbmc,features = "Irak4",split.by ='group',cols = c('blue','yellow','pink', 'green', 'orange'))
DimPlot(pbmc,split.by = 'group', reduction = "umap", cols = c('blue','yellow','pink', 'green', 'orange','black', 'purple', 'red'))
DimPlot(pbmc,split.by = 'group', reduction = "tsne", cols = c('blue','yellow','pink', 'green', 'orange','black', 'purple', 'red'))
###其他图
VlnPlot(pbmc,features = 'Jak2',split.by = 'group', cols = c('blue','yellow','pink'))
names(pbmc@meta.data)
```