count normalization的代码

在前面的count normalization的方法部分，我初步介绍了一些常见的count normalization的方法，这部分我会写出对应的代码。用的数据集是我自己的一个数据。

欢迎大家用自己数据集检查代码是否有错误

# 总共是37336个基因
> head(data)
          WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2
AT1G01010       15       15      152      106
AT1G01020      353      461      255      279
AT1G03987        1        0        3        0
AT1G01030       23       39       74       56
AT1G01040      695      746     1831     1611
AT1G03993        0        0        0        0

> str(data)
'data.frame':   37336 obs. of  4 variables:
 $ WT_0h_R1: int  15 353 1 23 695 0 412 0 64 13 ...
 $ WT_0h_R2: int  15 461 0 39 746 0 541 0 118 7 ...
 $ WT_4h_R1: int  152 255 3 74 1831 0 517 0 349 0 ...
 $ WT_4h_R2: int  106 279 0 56 1611 0 512 0 629 0 ...

CPM

公式
$CPM_i=\frac{X_i}{N}10^6$

CPM其实是最简单的，就是你对应基因的count除以你样本的总count，从而得到你对应基因在对应样本里面的百分比。

data_CPM <- (sweep(data, 2, colSums(data), FUN = "/"))*1e6

> head(data_CPM)
             WT_0h_R1   WT_0h_R2   WT_4h_R1  WT_4h_R2
AT1G01010  0.36330819  0.3272056  7.0490344  5.413802
AT1G01020  8.54985270 10.0561188 11.8256828 14.249536
AT1G03987  0.02422055  0.0000000  0.1391257  0.000000
AT1G01030  0.55707256  0.8507346  3.4317668  2.860122
AT1G01040 16.83327940 16.2730252 84.9130397 82.279578
AT1G03993  0.00000000  0.0000000  0.0000000  0.000000

我们可以来检查下

> colSums(data_CPM)
WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2 
   1e+06    1e+06    1e+06    1e+06 

> data[2,2] / sum(data[, 2]) * 1e6
[1] 10.05612

FPKM

公式：
$FPKM_i=\frac{\frac{X_i}{l_i}}{N}10^9FPKM_i=\frac{\frac{X_i}{l_i}}{N}10^9$
这里就要涉及到基因长度的问题了，我在之前的方法部分也提到过，不同的算法对于长度的界定是不一样的，我这里用的基因长度是直接把外显子overlap，然后去掉冗余部分，得到累加之后的长度。因为我count是利用featureCount得到的，用的GTF是Araport11的GTF，所以我直接用GenomicFeatures包来得到外显子去冗余累加之后的长度。

# 先得到外显子长度
library(GenomicFeatures)
Tx_At <- makeTxDbFromGFF("~/reference/annoation/Athaliana/Araport11/Araport11_GFF3_genes_transposons.201606.gtf",
                         chrominfo = Seqinfo(seqnames = paste0("Chr",c(1:5,"C","M")),
                                             seqlengths = c(30427671,19698289,23459830,18585056,26975502,154478,366924)))
# 会报一些warning
Import genomic features from the file as a GRanges object ... OK
Prepare the 'metadata' data frame ... OK
Make the TxDb object ... OK
Warning messages:
1: In .get_cds_IDX(mcols0$type, mcols0$phase) :
  The "phase" metadata column contains non-NA values for features of
  type stop_codon, exon. This information was ignored.
2: In .reject_transcripts(bad_tx, because) :
  The following transcripts were rejected because they have stop
  codons that cannot be mapped to an exon: AT1G07320.4, AT1G18180.2,
  AT1G30230.1, AT1G36380.1, AT1G52415.1, AT1G52940.1, AT2G18110.1,
  AT2G35130.1, AT2G35130.3, AT2G39050.1, AT3G13445.1, AT3G17660.1,
  AT3G17660.3, AT3G52070.2, AT3G54680.1, AT3G59450.2, AT4G13730.2,
  AT4G17730.1, AT4G39620.2, AT5G01520.2, AT5G22794.1, AT5G27710.2,
  AT5G45240.2

# 然后发现少6个基因
> genes(Tx_At)
GRanges object with 37330 ranges and 1 metadata column:
            seqnames        ranges strand |     gene_id
               <Rle>     <IRanges>  <Rle> | <character>
  AT1G01010     Chr1     3631-5899      + |   AT1G01010
  AT1G01020     Chr1     6788-9130      - |   AT1G01020
  AT1G01030     Chr1   11649-13714      - |   AT1G01030
  AT1G01040     Chr1   23121-31227      + |   AT1G01040
  AT1G01050     Chr1   31170-33171      - |   AT1G01050
        ...      ...           ...    ... .         ...
  ATMG09730     ChrM 181268-181347      + |   ATMG09730
  ATMG09740     ChrM 191954-192025      - |   ATMG09740
  ATMG09950     ChrM 333651-333725      - |   ATMG09950
  ATMG09960     ChrM 347266-347348      + |   ATMG09960
  ATMG09980     ChrM 359666-359739      - |   ATMG09980
  -------
  seqinfo: 7 sequences from an unspecified genome

# 我们可以看下缺失的6个基因，刚好就是报warning的那几个基因
> rownames(data)[!rownames(data) %in% names(genes(Tx_At)) ]
[1] "AT1G36380" "AT1G52415" "AT1G52940" "AT2G18110" "AT2G39050" "AT3G54680"
> gene_notin <- rownames(data)[!rownames(data) %in% names(genes(Tx_At)) ]
> data[gene_notin, ]
          WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2
AT1G36380       75       82      140      179
AT1G52415        0        0        0        0
AT1G52940        0        0        0        0
AT2G18110      522      488      622      686
AT2G39050       26       31      133      182
AT3G54680     1236     1777      727      728

之所以会不包括这6个基因，应该是这6个基因的stop condon部分并不在exon里面的缘故……

后来我发现gff并不会报这个错

Tx_At <- makeTxDbFromGFF("~/reference/annoation/Athaliana/Araport11/Araport11_GFF3_genes_transposons.201606.gff",
                         chrominfo = Seqinfo(seqnames = paste0("Chr",c(1:5,"C","M")),
                                             seqlengths = c(30427671,19698289,23459830,18585056,26975502,154478,366924)))

Import genomic features from the file as a GRanges object ... OK
Prepare the 'metadata' data frame ... OK
Make the TxDb object ... OK
Warning message:
In .get_cds_IDX(mcols0$type, mcols0$phase) :
  The "phase" metadata column contains non-NA values for features of type exon. This
  information was ignored.

> genes(Tx_At)
GRanges object with 38194 ranges and 1 metadata column:
            seqnames            ranges strand |     gene_id
               <Rle>         <IRanges>  <Rle> | <character>
  AT1G01010     Chr1         3631-5899      + |   AT1G01010
  AT1G01020     Chr1         6788-9130      - |   AT1G01020
  AT1G01030     Chr1       11649-13714      - |   AT1G01030
  AT1G01040     Chr1       23121-31227      + |   AT1G01040
  AT1G01046     Chr1       28500-28706      + |   AT1G01046
        ...      ...               ...    ... .         ...
    MIR854c     Chr5 11838096-11838316      + |     MIR854c
    MIR854d     Chr5 11689861-11690081      - |     MIR854d
    MIR854e     Chr5 11942467-11942687      + |     MIR854e
     MIR855     Chr2   4674427-4674698      + |      MIR855
    MIR858b     Chr1 26772633-26772935      + |     MIR858b
  -------
  seqinfo: 7 sequences from an unspecified genome

> rownames(data)[!rownames(data) %in% names(genes(Tx_At)) ]
character(0)

然后我们就可以得到对应基因的外显子信息

> exons_gene <- exonsBy(Tx_At, by = "gene")
> exons_gene
GRangesList object of length 38194:
$AT1G01010
GRanges object with 6 ranges and 2 metadata columns:
      seqnames    ranges strand |   exon_id     exon_name
         <Rle> <IRanges>  <Rle> | <integer>   <character>
  [1]     Chr1 3631-3913      + |         1 NAC001:exon:1
  [2]     Chr1 3996-4276      + |         2 NAC001:exon:2
  [3]     Chr1 4486-4605      + |         3 NAC001:exon:3
  [4]     Chr1 4706-5095      + |         4 NAC001:exon:4
  [5]     Chr1 5174-5326      + |         5 NAC001:exon:5
  [6]     Chr1 5439-5899      + |         6 NAC001:exon:6
  -------
  seqinfo: 7 sequences from an unspecified genome

$AT1G01020
GRanges object with 14 ranges and 2 metadata columns:
       seqnames    ranges strand |   exon_id    exon_name
          <Rle> <IRanges>  <Rle> | <integer>  <character>
   [1]     Chr1 6788-7069      - |     26363 ARV1:exon:14
   [2]     Chr1 7157-7232      - |     26364 ARV1:exon:13
   [3]     Chr1 7157-7450      - |     26365 ARV1:exon:12
   [4]     Chr1 7384-7450      - |     26366 ARV1:exon:11
   [5]     Chr1 7564-7649      - |     26367 ARV1:exon:10
   ...      ...       ...    ... .       ...          ...
  [10]     Chr1 8417-8464      - |     26372  ARV1:exon:5
  [11]     Chr1 8571-8737      - |     26373  ARV1:exon:4
  [12]     Chr1 8571-9130      - |     26374  ARV1:exon:3
  [13]     Chr1 8594-8737      - |     26375  ARV1:exon:2
  [14]     Chr1 8594-9130      - |     26376  ARV1:exon:1
  -------
  seqinfo: 7 sequences from an unspecified genome

...
<38192 more elements>

# 我们只用到37336个基因的外显子信息
> exons_gen_expr <- exons_gene[rownames(data)]
> exons_gen_expr
GRangesList object of length 37336:
$AT1G01010
GRanges object with 6 ranges and 2 metadata columns:
      seqnames    ranges strand |   exon_id     exon_name
         <Rle> <IRanges>  <Rle> | <integer>   <character>
  [1]     Chr1 3631-3913      + |         1 NAC001:exon:1
  [2]     Chr1 3996-4276      + |         2 NAC001:exon:2
  [3]     Chr1 4486-4605      + |         3 NAC001:exon:3
  [4]     Chr1 4706-5095      + |         4 NAC001:exon:4
  [5]     Chr1 5174-5326      + |         5 NAC001:exon:5
  [6]     Chr1 5439-5899      + |         6 NAC001:exon:6
  -------
  seqinfo: 7 sequences from an unspecified genome

$AT1G01020
GRanges object with 14 ranges and 2 metadata columns:
       seqnames    ranges strand |   exon_id    exon_name
          <Rle> <IRanges>  <Rle> | <integer>  <character>
   [1]     Chr1 6788-7069      - |     26363 ARV1:exon:14
   [2]     Chr1 7157-7232      - |     26364 ARV1:exon:13
   [3]     Chr1 7157-7450      - |     26365 ARV1:exon:12
   [4]     Chr1 7384-7450      - |     26366 ARV1:exon:11
   [5]     Chr1 7564-7649      - |     26367 ARV1:exon:10
   ...      ...       ...    ... .       ...          ...
  [10]     Chr1 8417-8464      - |     26372  ARV1:exon:5
  [11]     Chr1 8571-8737      - |     26373  ARV1:exon:4
  [12]     Chr1 8571-9130      - |     26374  ARV1:exon:3
  [13]     Chr1 8594-8737      - |     26375  ARV1:exon:2
  [14]     Chr1 8594-9130      - |     26376  ARV1:exon:1
  -------
  seqinfo: 7 sequences from an unspecified genome

...
<37334 more elements>

然后我们做去冗余，统计长度等操作

# reduce去冗余
> reduce(exons_gen_expr)
GRangesList object of length 37336:
$AT1G01010
GRanges object with 6 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]     Chr1 3631-3913      +
  [2]     Chr1 3996-4276      +
  [3]     Chr1 4486-4605      +
  [4]     Chr1 4706-5095      +
  [5]     Chr1 5174-5326      +
  [6]     Chr1 5439-5899      +
  -------
  seqinfo: 7 sequences from an unspecified genome

$AT1G01020
GRanges object with 7 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]     Chr1 6788-7069      -
  [2]     Chr1 7157-7450      -
  [3]     Chr1 7564-7649      -
  [4]     Chr1 7762-7835      -
  [5]     Chr1 7942-7987      -
  [6]     Chr1 8236-8464      -
  [7]     Chr1 8571-9130      -
  -------
  seqinfo: 7 sequences from an unspecified genome

$AT1G03987
GRanges object with 1 range and 0 metadata columns:
      seqnames      ranges strand
         <Rle>   <IRanges>  <Rle>
  [1]     Chr1 11101-11372      +
  -------
  seqinfo: 7 sequences from an unspecified genome

...
<37333 more elements>

# width统计长度
> width(reduce(exons_gen_expr))
IntegerList of length 37336
[["AT1G01010"]] 283 281 120 390 153 461
[["AT1G01020"]] 282 294 86 74 46 229 560
[["AT1G03987"]] 272
[["AT1G01030"]] 1525 380
[["AT1G01040"]] 1331 114 211 395 220 173 123 161 234 151 183 165 96 629 98 191 906 165 407 326
[["AT1G03993"]] 788
[["AT1G01050"]] 255 82 121 66 108 66 29 124 171 143
[["AT1G03997"]] 283
[["AT1G01060"]] 225 666 1074 81 270 82 62 112 849
[["AT1G01070"]] 611 152 406 117 545
...
<37326 more elements>

# sum下
> gene_exon_length <- sum(width(reduce(exons_gen_expr)))
> head(gene_exon_length)
AT1G01010 AT1G01020 AT1G03987 AT1G01030 AT1G01040 AT1G03993 
     1688      1571       272      1905      6279       788 

# 然后再看看基因的顺序和我们data上的基因顺序是不是一致的
# 是一样的
> mean(names(gene_exon_length) == rownames(data))
[1] 1

参考文章

Htseq Count To Fpkm

这样我们就可以计算了

# 先除以长度
> data_count_length <- sweep(data, 2, gene_exon_length , FUN = "/")
> head(data_count_length)
             WT_0h_R1    WT_0h_R2   WT_4h_R1   WT_4h_R2
AT1G01010 0.008886256 0.008886256 0.09004739 0.06279621
AT1G01020 0.224697645 0.293443666 0.16231700 0.17759389
AT1G03987 0.003676471 0.000000000 0.01102941 0.00000000
AT1G01030 0.012073491 0.020472441 0.03884514 0.02939633
AT1G01040 0.110686415 0.118808728 0.29160694 0.25656952
AT1G03993 0.000000000 0.000000000 0.00000000 0.00000000

# 再除以文库的总count
> data_FRPKM <- sweep(data_count_length, 2, colSums(data), FUN = "/")*1e9
> head(data_FRPKM)
            WT_0h_R1  WT_0h_R2   WT_4h_R1  WT_4h_R2
AT1G01010 0.21522997 0.1938422  4.1759683  3.207229
AT1G01020 5.44229962 6.4010941  7.5274874  9.070360
AT1G03987 0.08904612 0.0000000  0.5114915  0.000000
AT1G01030 0.29242654 0.4465798  1.8014524  1.501376
AT1G01040 2.68088540 2.5916587 13.5233381 13.103930
AT1G03993 0.00000000 0.0000000  0.0000000  0.000000

检查下

# 我们会发现各个文库之间的FPKM总和是不一样的，这也是FPKM受到人质疑的原因
> colSums(data_FRPKM)
WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2 
752371.2 716130.4 616267.3 608064.9

# 我们检查一个基因
> rownames(data_FRPKM)[10]
[1] "AT1G01070"
> data_FRPKM[rownames(data_FRPKM)[10],]
  WT_0h_R1   WT_0h_R2   WT_4h_R1   WT_4h_R2 
0.17196455 0.08339484 0.00000000 0.00000000 

# 手动算一个基因的FPKM值
> gene_exon_length["AT1G01070"]
AT1G01070 
     1831 
> (data["AT1G01070",1] / 1831) / sum(data[, 1]) * 1e9
[1] 0.1719646

> (data["AT1G01070", ] / 1831) / colSums(data) * 1e9
           WT_0h_R1   WT_0h_R2 WT_4h_R1 WT_4h_R2
AT1G01070 0.1719646 0.08339484        0        0

TPM

公式：
$TPM_i=\frac{\frac{X_i}{l_i}}{\sum_j\frac{X_j}{l_j}}10^6$

> data_TPM <- sweep(data_count_length, 2, colSums(data_count_length), FUN = "/") * 1e6
> head(data_TPM)
           WT_0h_R1  WT_0h_R2   WT_4h_R1  WT_4h_R2
AT1G01010 0.2860689 0.2706800  6.7762290  5.274484
AT1G01020 7.2335303 8.9384480 12.2146472 14.916763
AT1G03987 0.1183540 0.0000000  0.8299832  0.000000
AT1G01030 0.3886732 0.6236013  2.9231673  2.469105
AT1G01040 3.5632485 3.6189762 21.9439494 21.550216
AT1G03993 0.0000000 0.0000000  0.0000000  0.000000

> colSums(data_TPM)
WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2 
   1e+06    1e+06    1e+06    1e+06

DESeq2的norm count

library(DESeq2)

coldata <- data.frame(row.names = colnames(data), 
                      type = gsub("_R\\d+", "", colnames(data)))

> coldata
          type
WT_0h_R1 WT_0h
WT_0h_R2 WT_0h
WT_4h_R1 WT_4h
WT_4h_R2 WT_4h


dds <- DESeqDataSetFromMatrix(countData = data,
                              colData = coldata,
                              design= ~ type)

> dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing


> resultsNames(dds)
[1] "Intercept"           "type_WT_4h_vs_WT_0h"

> res <- lfcShrink(dds, coef=2, type="apeglm")
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895


> data_DESeq2_norm <- DESeq2::counts(dds, normalized = TRUE)
> head(data_DESeq2_norm)
            WT_0h_R1  WT_0h_R2    WT_4h_R1   WT_4h_R2
AT1G01010  16.519753  14.86971  143.445091  100.86482
AT1G01020 388.764862 456.99580  240.648014  265.48381
AT1G03987   1.101317   0.00000    2.831153    0.00000
AT1G01030  25.330288  38.66125   69.835110   53.28707
AT1G01040 765.415238 739.52032 1727.947115 1532.95488
AT1G03993   0.000000   0.00000    0.000000    0.00000

还有个我没用到的链接
https://gist.github.com/slowkow/c6ab0348747f86e2748b

edgeR的TMM

不太会……不敢说……懒得写

下一部分我会探究不同的normaliztion的差异