使用R处理一代测序的结果数据

一、一代测序的结果以*.ab1和*.seq储存，其中*.seq文件可以使用记事本打开，储存的是测序的序列结果（ATCG序列，而且是峰值最高的信号所代表的碱基），而*.ab1文件需要用特殊的可视化软件打开，储存的是序列信息及测序时每个碱基的信号强弱，可以通过该文件识别杂合位点（信号比最高信号稍低，比背景信号高）
二、本文主要介绍通过R包--sangerseqR来处理该类数据
1)安装R包（该包在bioconductor中，通过biocmanager安装即可）
2）读入文件

library(sangerseqR)
#读入*.ab1文件
seq<-readsangerseq("C://Users/shinelon/Desktop/g2001137145-C-3_3R.ab1")
#seq是一个sangerseq对象
> class(seq)
[1] "sangerseq"
attr(,"package")
[1] "sangerseqR"
#看一下seq的结构
> str(seq)
Formal class 'sangerseq' [package "sangerseqR"] with 7 slots
  ..@ primarySeqID  : chr "From ab1 file"
  ..@ primarySeq    :Formal class 'DNAString' [package "Biostrings"] with 5 slots
  .. .. ..@ shared         :Formal class 'SharedRaw' [package "XVector"] with 2 slots
  .. .. .. .. ..@ xp                    :<externalptr> 
  .. .. .. .. ..@ .link_to_cached_object:<environment: 0x000000001515c418> 
  .. .. ..@ offset         : int 0
  .. .. ..@ length         : int 830
  .. .. ..@ elementMetadata: NULL
  .. .. ..@ metadata       : list()
  ..@ secondarySeqID: chr "From ab1 file"
  ..@ secondarySeq  :Formal class 'DNAString' [package "Biostrings"] with 5 slots
  .. .. ..@ shared         :Formal class 'SharedRaw' [package "XVector"] with 2 slots
  .. .. .. .. ..@ xp                    :<externalptr> 
  .. .. .. .. ..@ .link_to_cached_object:<environment: 0x000000001515c418> 
  .. .. ..@ offset         : int 0
  .. .. ..@ length         : int 830
  .. .. ..@ elementMetadata: NULL
  .. .. ..@ metadata       : list()
  ..@ traceMatrix   : int [1:9919, 1:4] 0 0 0 0 0 0 1 2 3 4 ...
  ..@ peakPosMatrix : num [1:830, 1:4] 3 20 36 47 61 75 86 97 109 116 ...
  ..@ peakAmpMatrix : int [1:830, 1:4] 413 1602 920 1425 750 1320 885 991 777 884 ...

#对该对象的详细描述可以查阅帮助文档了解
 ?sangerseq

Slots

primarySeqID
Source of the primary basecalls. Functions that modify these calls, such as makeBaseCalls and setAllelePhase will also change this value.

secondarySeqID
Source of the secondary basecalls. See above.

primarySeq
The primary Basecalls formatted as a DNAString object.

secondarySeq
The secondary Basecalls formatted as a DNAString object.

traceMatrix
A numerical matrix containing 4 columns corresponding to the normalized signal values for the chromatogram traces. Column order = A,C,G,T.
对于色谱信号的标准化的平均值

peakPosMatrix
A numerical matrix containing the position of the maximum peak values for each base within each Basecall window. If no peak was detected for a given base in a given window, then "NA". Column order = A,C,G,T.
色谱信号最高峰所在的位置

peakAmpMatrix
A numerical matrix containing the maximum peak amplitudes for each base within each Basecall window. If no peak was detected for a given base in a given window, then 0. Column order = A,C,G,T.
色谱信号在每个碱基处的高度

三、使用

seq<-makeBaseCalls(seq,ratio=0.2)#将高度比（某峰比上该处最高峰）的值大于0.2，认为是真正的信号
peakAmp<-peakAmpMatrix(seq)
peakAmp<-as.data.frame(peakAmp)
colnames(peakAmp)<-c("A","C","G","T")
#计算ratio，用每个碱基的第二大的峰值除以最高峰
peakAmp$ratio<-apply(peakAmp,1,function(x){a=sort(x,decreasing = T);a[2]/a[1]})
peakAmp$sig<-ifelse(peakAmp$ratio>0.2,T,F)
peakAmp$primaryseq<-t(str_split(primarySeq(seq,string = T),pattern = "",simplify = T))
peakAmp$secondary<-t(str_split(secondarySeq(seq,string = T),pattern = "",simplify = T))
#检验一下primaryseq和secondaryseq是否根据AMP计算的
#peakAmp$gussprimary<-apply(peakAmp[,1:4],1,function(x){c("A","C","G","T")[which(x==max(x))]})
#identical(as.character(peakAmp$primaryseq),as.character(peakAmp$gussprimary))
#[1] TRUE
#secondary没法检验，因为需要考虑碱基的合并及ratio的阈值
guss写错了，应该改成guess,手误！
#可视化测序结果
chromatogram(seq, width = 200, height = 2, showcalls = "both")

peakAmp

chromatogram

四、我把这个过程写成了shiny程序，可自行取用
app.R

library(sangerseqR)
library(shiny)
library(stringr)
library(openxlsx)

ui <- fluidPage(
  headerPanel("Sangerseq"),
  mainPanel(fileInput(inputId = "ab1.input",label = "输入ab1文件",accept = ".ab1"),
            textInput(inputId = "ratio.input",label = "输入ratio",value = 0.2),
            actionButton(inputId = "start",label = "运行"),
            plotOutput(outputId = "chromatogram.output",width = "2000px"),
            tableOutput(outputId = "table.output"),
            textInput(inputId = "filename",label = "保存的文件名",value = paste("sangerseq",Sys.time(),".xlsx",sep = "")),
            downloadButton(outputId = "res.download",label = "下载结果"))
)

server <- function(input, output, session) {
  
  seq=reactive({
    input$start
    isolate({seq=readsangerseq(input$ab1.input$datapath);seq=makeBaseCalls(seq,ratio = input$ratio.input);return(seq)})
  })
  output$chromatogram.output=renderPlot({
    chromatogram(seq(), width = 200, height = 2, showcalls = "both")})
  peakAmp<-reactive({
    peakAmp<-peakAmpMatrix(seq())
    peakAmp<-as.data.frame(peakAmp)
    colnames(peakAmp)<-c("A","C","G","T")
    peakAmp$ratio<-apply(peakAmp,1,function(x){a=sort(x,decreasing = T);a[2]/a[1]})
    peakAmp$sig<-ifelse(peakAmp$ratio>input$ratio.input,T,F)
    peakAmp$primaryseq<-t(str_split(primarySeq(seq(),string = T),pattern = "",simplify = T))
    peakAmp$secondaryseq<-t(str_split(secondarySeq(seq(),string = T),pattern = "",simplify = T))
    return(peakAmp)
  })
  output$table.output<-renderTable({head(peakAmp())},rownames = T)
  output$res.download<-downloadHandler(
    filename = input$filename,
    content = function(file){
      write.xlsx(peakAmp(),file = file)
    },
    contentType = "xlsx"
  )
  
  
}

shinyApp(ui, server)

shiny.sangerseq