前言
Scanpy是基于Python语言的类似Seurat的单细胞转录组分析工具,本文汇集了Scanpy的常规操作。
-
加载模块和设置
import time
print('Start time: ' + time.strftime('%Y-%m-%d %H:%M:%S',time.localtime()))
import numpy as np
import pandas as pd
import scanpy as sc
import louvain
import matplotlib
matplotlib.use('AGG')
import matplotlib.pyplot as plt
from matplotlib import rcParams
sc.settings.verbosity = 3
sc.settings.autosave = True
results_file = './write/81libs.h5ad'
sc.settings.autosave = True
sc.settings.set_figure_params(dpi=300, frameon=False)
-
读入矩阵
inpath= 'Matrix/scanpy_mat'
adata2 = sc.read_10x_mtx(path, var_names='gene_symbols', cache=True)
adata2.obs['batch1'] = 'C7-4'
adata= sc.read_10x_h5("./matrix.h5", genome=None, gex_only=True)
adata = sc.read_csv('./matrix.tsv',delimiter='\t')
adata = sc.read_h5ad('./result.h5ad')
adata = sc.read('./result.h5ad')
-
合并多个对象
#load data
adata1 = sc.read("30libs.h5ad")
adata2 = sc.read("22libs.h5ad")
adata3 = sc.read("28libs.h5ad")
nadata1 = adata1.raw.to_adata()
nadata1.obs['batch1'] = adata1.obs['batch']
nadata2 = adata2.raw.to_adata()
nadata2.obs['batch1'] = adata2.obs['batch']
nadata3 = adata3.raw.to_adata()
nadata3.obs['batch1'] = adata3.obs['batch']
datalist = [nadata2, nadata3]
nadata1 = nadata1[nadata1.obs.batch != "C7-4" , :]
path= mat+'C7-4'+'/04.Matrix/scanpy_mat'
datalist.append(sc.read_10x_mtx(path, var_names='gene_symbols', cache=True))
datalist[2].obs['batch1'] = 'C7-4'
path= mat+'C5-2'+'/04.Matrix/scanpy_mat'
datalist.append(sc.read_10x_mtx(path, var_names='gene_symbols', cache=True))
datalist[3].obs['batch1'] = 'C5-2'
adata = nadata1.concatenate(datalist, join='outer')
-
添加注释
adata.obs['lable'] = "NA"
c=[ i for i, word in enumerate(list(adata.obs['batch1'])) if word.startswith('A') ]
adata.obs['lable'][c]= "A"
c=[ i for i, word in enumerate(list(adata.obs['batch1'])) if word.startswith('B') ]
adata.obs['lable'][c]= "B"
c=[ i for i, word in enumerate(list(adata.obs['batch1'])) if word.startswith('C') ]
adata.obs['lable'][c]= "C"
c=[ i for i, word in enumerate(list(adata.obs['batch1'])) if word.startswith('Y') ]
adata.obs['lable'][c]= "Y"
-
输出矩阵
#save raw matrix
pd.DataFrame(data=adata.X.todense().T, index=adata.var_names,columns=adata.obs_names).to_csv('raw_mat.csv',sep="\t",
# float_format='%.0f')
print(str(len(datalist)+1)+' datasets were merged!')
-
质控
#quality control
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_cells=3)
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
adata.obs['percent_mito'] = adata.obs['pct_counts_mt']
adata.obs['n_counts'] = adata.obs['total_counts']
adata = adata[adata.obs.pct_counts_mt < 10, :]
sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'],jitter=0.4, multi_panel=True, save='_qc.pdf', stripplot=False)
plt.savefig('./figures/01.22libs_qc.pdf')
sc.pp.calculate_qc_metrics(data, percent_top=None, log1p=False, inplace=True)
sc.pl.violin(data, ['n_genes_by_counts', 'total_counts'],jitter=False, multi_panel=True,stripplot=False)
plt.savefig("QC_violin.pdf")
data = data[data.obs.n_genes < 2500, :]
print(data.obs['n_genes_by_counts'].median(),'\n')
print(data.obs['total_counts'].median(),"\n")
-
常规聚类
#normalize scale pca umap
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
adata.raw = adata
sc.pp.highly_variable_genes(adata, n_top_genes=3000)
#sc.pl.highly_variable_genes(adata)
adata = adata[:, adata.var['highly_variable']]
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, n_comps=50, random_state=int(2020), svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40, random_state=int(2020))
sc.tl.umap(adata, random_state=int(2020))
#cluster
sc.tl.louvain(adata, random_state=int(2020))
sc.pl.umap(adata, color=['louvain'], save='_cluster1.pdf')
sc.pl.umap(adata, color=['lable'], save='_cluster2.pdf')
#plt.savefig('./figures/02.22libs_cluster.pdf')
adata.obs['clusters'] = adata.obs['louvain']
-
聚类点图
data.obsm['X_umap']=data.obsm['spatial']
sc.pl.umap(data, color=['leiden'])
plt.savefig("Umap2.pdf")
-
基因表达点图
#plot markers
rcParams['figure.figsize'] = 3,3
marker = ["CD68","CD14","CD163","CD3G","CD4","CD8A"]
marker = [gene for gene in marker if (gene in adata.raw.var._stat_axis.values.tolist())]
sc.pl.umap(adata, color=marker, s=50, frameon=False, ncols=3, vmax='p99', save='_l1.pdf')
rcParams['figure.figsize'] = 3,3
marker = ["RTKN2","CHN2","LRRD1","KIAA0408","CCDC144A"]
marker = [gene for gene in marker if (gene in adata.raw.var._stat_axis.values.tolist())]
sc.pl.umap(adata, color=marker, s=50, frameon=False, ncols=3, vmax='p99', save='_l2.pdf')
-
保存结果
adata.write(results_file)
adata.obs.to_csv('./write/Scanpy_cell_info.csv', sep='\t', header=True, index=True)
adata.write(results_file, compression='gzip')
-
找DEG
# Finding marker genes
sc.settings.verbosity = 2
sc.tl.rank_genes_groups(adata, 'louvain', method='wilcoxon')
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, swap_axes=True, show_gene_labels=True, vmin=-3, vmax=3, cmap='bwr')
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
trgpc = pd.DataFrame({group + '_' + key[:1]: result[key][group] for group in groups for key in ['names', 'scores', 'logfoldchanges','pvals_adj']}).head(20)
trgpc.to_csv(path_or_buf="./write/top20_genes_per_cluster.csv", sep='\t', header=True, index=True)
-
更改基因名或细胞名
adata.var_names = new_gene_names
adata.obs_names='cell_'+adata.obs_names