slowkow · October 8, 2024 02:00 · mostafaBio · Sep 22, 2016 · slowkow · Nov 6, 2016
diff --git a/counts_to_tpm.R b/counts_to_tpm.R
 #' Convert counts to transcripts per million (TPM).
 #' 
 #' Convert a numeric matrix of features (rows) and conditions (columns) with
 #' raw feature counts to transcripts per million.
 #' 
 #'    Lior Pachter. Models for transcript quantification from RNA-Seq.
 #'    arXiv:1104.3889v2 
 #'    
 #'    Wagner, et al. Measurement of mRNA abundance using RNA-seq data:
 #'    RPKM measure is inconsistent among samples. Theory Biosci. 24 July 2012.
 #'    doi:10.1007/s12064-012-0162-3
 #'    
 #' @param counts A numeric matrix of raw feature counts i.e.
 #'  fragments assigned to each gene.
 #' @param featureLength A numeric vector with feature lengths.
 #' @param meanFragmentLength A numeric vector with mean fragment lengths.
 #' @return tpm A numeric matrix normalized by library size and feature length.
 counts_to_tpm <- function(counts, featureLength, meanFragmentLength) {
  
  # Ensure valid arguments.
  stopifnot(length(featureLength) == nrow(counts))
  stopifnot(length(meanFragmentLength) == ncol(counts))
  
  # Compute effective lengths of features in each library.
  effLen <- do.call(cbind, lapply(1:ncol(counts), function(i) {
    featureLength - meanFragmentLength[i] + 1
  }))
  
  # Exclude genes with length less than the mean fragment length.
  idx <- apply(effLen, 1, function(x) min(x) > 1)
  counts <- counts[idx,]
  effLen <- effLen[idx,]
  featureLength <- featureLength[idx]
  
  # Process one column at a time.
  tpm <- do.call(cbind, lapply(1:ncol(counts), function(i) {
    rate = log(counts[,i]) - log(effLen[,i])
    denom = log(sum(exp(rate)))
    exp(rate - denom + log(1e6))
  }))

  # Copy the row and column names from the original matrix.
  colnames(tpm) <- colnames(counts)
  rownames(tpm) <- rownames(counts)
  return(tpm)
 }
	#' Convert counts to transcripts per million (TPM).
	#'
	#' Convert a numeric matrix of features (rows) and conditions (columns) with
	#' raw feature counts to transcripts per million.
	#'
	#' Lior Pachter. Models for transcript quantification from RNA-Seq.
	#' arXiv:1104.3889v2
	#'
	#' Wagner, et al. Measurement of mRNA abundance using RNA-seq data:
	#' RPKM measure is inconsistent among samples. Theory Biosci. 24 July 2012.
	#' doi:10.1007/s12064-012-0162-3
	#'
	#' @param counts A numeric matrix of raw feature counts i.e.
	#' fragments assigned to each gene.
	#' @param featureLength A numeric vector with feature lengths.
	#' @param meanFragmentLength A numeric vector with mean fragment lengths.
	#' @return tpm A numeric matrix normalized by library size and feature length.
	counts_to_tpm <- function(counts, featureLength, meanFragmentLength) {

	# Ensure valid arguments.
	stopifnot(length(featureLength) == nrow(counts))
	stopifnot(length(meanFragmentLength) == ncol(counts))

	# Compute effective lengths of features in each library.
	effLen <- do.call(cbind, lapply(1:ncol(counts), function(i) {
	featureLength - meanFragmentLength[i] + 1
	}))

	# Exclude genes with length less than the mean fragment length.
	idx <- apply(effLen, 1, function(x) min(x) > 1)
	counts <- counts[idx,]
	effLen <- effLen[idx,]
	featureLength <- featureLength[idx]

	# Process one column at a time.
	tpm <- do.call(cbind, lapply(1:ncol(counts), function(i) {
	rate = log(counts[,i]) - log(effLen[,i])
	denom = log(sum(exp(rate)))
	exp(rate - denom + log(1e6))
	}))

	# Copy the row and column names from the original matrix.
	colnames(tpm) <- colnames(counts)
	rownames(tpm) <- rownames(counts)
	return(tpm)
	}