GSVA其实就是pathway级别的差异分析,根据GSVA的原理其实就是计算每个sample的GSEA然后得出类似pathway enrich score,把这个可以当作芯片的表达数据一样,再用limma包分析差异基因。
参考以下两个例子:
使用GSVA方法计算某基因集在各个样本的表现
充分理解GSVA和GSEA
以及我一个例子:
rm(list=ls())
suppressMessages(library(GSVA))
suppressMessages(library(GSVAdata))
suppressMessages(library(GSEABase))
suppressMessages(library(limma))
#读入gmt文件,这个可以从MSigDB上下载,这边选的上gene symbol根据自己的data来选择
gmt_file="~/c5.bp.v7.1.symbols.gmt"
geneset <- getGmt(gmt_file)
#自行读入exp文件,我这边为行为gene symbol,列为样本(4个control,5个treatment样本)
es <- gsva(as.matrix(exp), geneset,
min.sz=10, max.sz=500, verbose=TRUE)
#得到gsva计算的数值后再用limma包做差异分析得到差异的pathway
design <- model.matrix(~ factor(c(rep("cont",4),rep("treatment",5))))
colnames(design) <- c("ALL", "contvstreatment")
row.names(design)<-colnames(exp)
fit <- lmFit(es, design)
fit <- eBayes(fit)
#这边是总的
allGeneSets <- topTable(fit, coef="contvstreatment", number=Inf)
#这边是差异的
adjPvalueCutoff <- 0.001
DEgeneSets <- topTable(fit, coef="contvstreatment", number=Inf,
p.value=adjPvalueCutoff, adjust="BH")
res <- decideTests(fit, p.value=adjPvalueCutoff)
例子2
gmt_file="~/Desktop/GoogleDrive/HPC/gmt/common/c5.bp.v7.1.symbols.gmt"
geneset <- getGmt(gmt_file)
tpm_1st_pass<-read.csv(paste0(data_dir,"TPM_GenePass.csv"),row.names = 1)
gsva_analysis<-function(sample,design,geneset,p,out_dir){
exp<-get(sample)
es <- gsva(as.matrix(exp), geneset,
min.sz=10, max.sz=500, verbose=TRUE)
colnames(design) <- c("ALL", "ALvsFast")
row.names(design)<-colnames(exp)
fit <- lmFit(es, design)
fit <- eBayes(fit)
allGeneSets <- topTable(fit, coef="ALvsFast", number=Inf)
DEgeneSets <- allGeneSets[allGeneSets$P.Value<p,]
write.csv(es,paste0(out_dir_table,sample,"_GSVA_es_matrix.csv"))
write.csv(allGeneSets,paste0(out_dir_table,sample,"_GSVA_degs_analysis_matrix.csv"))
write.csv(DEgeneSets,paste0(out_dir_table,sample,"_GSVA_degs_sig_p_",as.character(p),"_analysis_matrix.csv"))
output <- list(a = es, b = allGeneSets,c=DEgeneSets)
return(output)
}
p=0.05
design_1st <- model.matrix(~ factor(c(rep("AL",3),rep("Fast",3))))
gsva_1st<-gsva_analysis(sample="tpm_1st_pass",design=design_1st,geneset=geneset,p=p,out_dir=out_dir_table)
gsva_1st_es<-gsva_1st$a
gsva_1st_degs<-gsva_1st$b
gsva_1st_degs_sig<-gsva_1st$c
包装为一个函数,使用sbatch,下面是mouse data转化为human gene symbol然后run gsva
#!/usr/bin/env Rscript
#use for mouse data to human gsva analysis
#all data files and gmt files should be in the same folder
#@author: CFJiang 04252020
suppressMessages(library(argparse))
rm(list = ls())
options(stringsAsFactors=F)
parser <- ArgumentParser(description="")
parser$add_argument('-d', '--data_dir', metavar="*", help="path for your data")
parser$add_argument('-f', '--data_filename', metavar="*", help="data file name")
parser$add_argument('-p', '--prefix', metavar="*", help="prefix for your output")
parser$add_argument('-g1', '--group1', metavar="*", help="group1 name")
parser$add_argument('-g2', '--group2', metavar="*", help="group2 name")
parser$add_argument('-t', '--type', metavar="*", help="type for either KEGG or GO")
args <- parser$parse_args()
if (is.null(args$type)) { args$type <-"KEGG" }
#useage & example
data_dir= as.character(args$data_dir) #your data folder
data_filename= as.character(args$data_filename) #your data file full name
prefix= as.character(args$prefix) #your out put flile prefix
group1= as.character(args$group1) #your group1
group2= as.character(args$group2) #your group2
type=as.character(args$type) #your analysis type (KEGG or GO)
gsva_all_in_one<-function(data_dir,data_filename,prefix,group1,group2,type){
rm(list = ls())
options(stringsAsFactors=F)
suppressMessages(library(GSVA))
suppressMessages(library(GSEABase))
suppressMessages(library(limma))
suppressMessages(library(Hmisc))
change_name<-function(rt){
ann<-read.csv(paste0(data_dir,"/HHOM_MouseHumanSequence_mouse_human_combined.csv"),row.names = 1,header = T)
com_id<-intersect(row.names(ann),row.names(rt))
ann_com<-ann[com_id,]
rt_com<-rt[com_id,]
row.names(rt_com)<-as.character(ann_com$Symbol)
return(rt_com)
}
#read your data .csv
your_data<-read.csv(paste0(data_dir,"/",data_filename),header = T)
your_data<-your_data[!duplicated(your_data[,1]),]
row.names(your_data)<-as.character(your_data[,1])
your_data<-your_data[,-1]
#change the mouse gene name to human gene name
new_rt<-change_name(your_data)
#gsva analysis
gsva_de<-function(data_dir,new_rt,prefix,group1,group2,type){
new_dir<-paste0(data_dir,"/",prefix)
dir.create(new_dir,recursive = T)
setwd(new_dir)
if (type=="KEGG"){
gmt_file=paste0(data_dir,"/c2.cp.kegg.v7.1.symbols.gmt")
}else{
gmt_file=paste0(data_dir,"/c5.bp.v7.1.symbols.gmt")
}
geneset <- getGmt(gmt_file)
es <- gsva(as.matrix(new_rt), geneset,
min.sz=10, max.sz=500, verbose=TRUE)
gs<-paste0("(",as.character(group1),"|",as.character(group2),").*")
level<-c(as.character(group1),as.character(group2))
design <- model.matrix(~factor(gsub(gs, "\\1", colnames(new_rt)),levels = level))
compare<-paste0(as.character(group1),"vs",as.character(group2))
colnames(design)<-c("ALL", compare)
row.names(design)<-colnames(new_rt)
fit <- lmFit(es, design)
fit <- eBayes(fit)
allGeneSets <- topTable(fit, coef=compare, number=Inf)
adjPvalueCutoff <- 0.05
DEgeneSets <- topTable(fit, coef=compare, number=Inf,
p.value=adjPvalueCutoff, adjust="BH")
write.csv(es,paste0(prefix,"_",compare,"_all_gsva_matrix_",type,".csv"))
write.csv(allGeneSets,paste0(prefix,"_",compare,"_all_deg_gsva_",type,".csv"))
write.csv(DEgeneSets,paste0(prefix,"_",compare,"_padj_",as.character(adjPvalueCutoff),"_deg_gsva_",type,".csv"))
}
gsva_de(data_dir=data_dir,new_rt = new_rt ,prefix = prefix ,group1 = group1,group2 = group2,type=type)
}
# data_dir= "~/Desktop/GSVA/ori_data" #your data folder
# data_filename= "nonCD45_T_count_test.csv" #your data file full name
# prefix= "nonCD45_T" #your out put flile prefix
# group1= "FB6" #your group1
# group2= "FT2" #your group2
gsva_all_in_one(data_dir=data_dir,data_filename =data_filename,prefix = prefix,group1 = group1,group2=group2,type=type )
shell脚本如下
#! /bin/bash
#SBATCH --time=10:00:00
#SBATCH --cpus-per-task=10
#SBATCH --mem=16g
ml R/3.5
data_dir=
Rscript ./GSVA_mouse.R -d ${data_dir} -f nonCD45_T_count_test.csv -p nonCD45_T -g1 FB6 -g2 FT2 -t KEGG