0.需求
想要查询TCGA里的乳腺癌每个样本是什么亚型,并选出LumA和LumB亚型的样本。
这个信息在xena网站上有现成的,如下:
1.读取数据
其实不光是BRCA,各种癌症的亚型都在这里
rm(list = ls())
a = read.delim("TCGASubtype.20170308.tsv.gz")
head(a)
## sampleID Subtype_mRNA
## 1 TCGA-OR-A5J1-01 steroid-phenotype-high+proliferation
## 2 TCGA-OR-A5J2-01 steroid-phenotype-high+proliferation
## 3 TCGA-OR-A5J3-01 steroid-phenotype-high
## 4 TCGA-OR-A5J4-01 <NA>
## 5 TCGA-OR-A5J5-01 steroid-phenotype-high
## 6 TCGA-OR-A5J6-01 steroid-phenotype-low
## Subtype_DNAmeth Subtype_protein Subtype_miRNA Subtype_CNA
## 1 CIMP-high <NA> miRNA_1 Quiet
## 2 CIMP-low 1 miRNA_1 Noisy
## 3 CIMP-intermediate 3 miRNA_6 Chromosomal
## 4 CIMP-high <NA> miRNA_6 Chromosomal
## 5 CIMP-intermediate <NA> miRNA_2 Chromosomal
## 6 CIMP-low 2 miRNA_1 Noisy
## Subtype_Integrative Subtype_other Subtype_Selected
## 1 COC3 C1A ACC.CIMP-high
## 2 COC3 C1A ACC.CIMP-low
## 3 COC2 C1A ACC.CIMP-intermediate
## 4 <NA> <NA> ACC.CIMP-high
## 5 COC2 C1A ACC.CIMP-intermediate
## 6 COC1 C1B ACC.CIMP-low
2.选出BRCA样本
library(stringr)
k = str_starts(a$Subtype_Selected,"BRCA");table(k)
## k
## FALSE TRUE
## 6516 1218
a = a[k,]
table(a$Subtype_mRNA)
##
## Basal Her2 LumA LumB Normal
## 193 82 581 219 143
可以看到各种亚型的样本数量统计
3.提取需要的亚型样本
brca_exp.Rdata里面存放的是brca的表达矩阵。可以通过TCGAbiolinks下载得到。
# 表达矩阵
load("brca_exp.Rdata")
brca[1:4,1:4]
## TCGA-E2-A1L7-11A-33R-A144-07
## ENSG00000000003.15 4209
## ENSG00000000005.6 71
## ENSG00000000419.13 1611
## ENSG00000000457.14 1217
## TCGA-E2-A1L7-01A-11R-A144-07
## ENSG00000000003.15 1689
## ENSG00000000005.6 16
## ENSG00000000419.13 1810
## ENSG00000000457.14 1098
## TCGA-AR-A0U0-01A-11R-A109-07
## ENSG00000000003.15 1590
## ENSG00000000005.6 0
## ENSG00000000419.13 2073
## ENSG00000000457.14 889
## TCGA-BH-A28O-01A-11R-A22K-07
## ENSG00000000003.15 4583
## ENSG00000000005.6 135
## ENSG00000000419.13 1531
## ENSG00000000457.14 1445
head(colnames(brca))
## [1] "TCGA-E2-A1L7-11A-33R-A144-07"
## [2] "TCGA-E2-A1L7-01A-11R-A144-07"
## [3] "TCGA-AR-A0U0-01A-11R-A109-07"
## [4] "TCGA-BH-A28O-01A-11R-A22K-07"
## [5] "TCGA-A2-A0D4-01A-11R-A00Z-07"
## [6] "TCGA-E9-A1R4-01A-21R-A14D-07"
# 缩短id
colnames(brca) = str_sub(colnames(brca),1,15)
rownames(a) = NULL
a = tibble::column_to_rownames(a,"sampleID")
# 匹配两个表格的顺序
s = intersect(colnames(brca),rownames(a))
exp = brca[,s]
a = a[s,]
# 提取
k1 = a$Subtype_mRNA %in% c("LumA","LumB");table(k1)
## k1
## FALSE TRUE
## 415 800
a = a[k1,]
exp = exp[,k1]
exp[1:4,1:4]
## TCGA-E2-A1L7-01 TCGA-A2-A0D4-01
## ENSG00000000003.15 1689 1550
## ENSG00000000005.6 16 45
## ENSG00000000419.13 1810 2710
## ENSG00000000457.14 1098 2026
## TCGA-E9-A1R4-01 TCGA-AO-A1KQ-01
## ENSG00000000003.15 711 825
## ENSG00000000005.6 15 3
## ENSG00000000419.13 1943 2626
## ENSG00000000457.14 1285 1058