感谢刘博出了一些针对R for data science_dplyr部分的题目。我稍微对刘博的题目做了改动,并做了对应解答。
不一定对╮(╯_╰)╭
题目一
还是上一章节的 cuffdiff 结果,根据变化倍数 log2FC 和 显著性 q_value,利用本章节所学知识筛选出上调和下调表达基因,并分别统计上调、下调基因的数目。
筛选标准:
- 上调:log2FC >= 1 且 q_value < 0.5
- 下调:log2FC <= -1 且 q_value < 0.5
data <- readr::read_delim("rawdata/test_data.txt", delim = "\t")
> head(data, n = 5)
# A tibble: 5 x 8
Gene_name Ctrl_fpkm Treat_fpkm log2FC test_stat p_value q_value significant
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 Gene_1 0 0 0 0 1 1 no
2 Gene_2 0 0.217 Inf 0 1 1 no
3 Gene_3 0 0.175 Inf 0 1 1 no
4 Gene_4 0 0 0 0 1 1 no
5 Gene_5 0.721 1.10 0.613 0 1 1 no
data %>%
mutate(., result_type = case_when(
log2FC >= 1 & q_value < 0.5 ~ "up",
log2FC <= 1 & q_value < 0.5 ~ "down",
TRUE ~ "nosig"
)) %>%
group_by(., result_type) %>%
summarise(n = n())
# A tibble: 3 x 2
result_type n
<chr> <int>
1 down 6194
2 nosig 16003
3 up 1087
题目二
已知我有一系列候选基因 candidate_Gene ,如何根据问题 1 中的表达量数据,得到他们所对应的表达量?
> set.seed("20200402")
> (candidate_Gene <- paste0("Gene", "_", sample(1:50, 20)))
[1] "Gene_11" "Gene_30" "Gene_39" "Gene_18" "Gene_47" "Gene_4" "Gene_42" "Gene_37"
[9] "Gene_3" "Gene_36" "Gene_23" "Gene_8" "Gene_50" "Gene_28" "Gene_17" "Gene_25"
[17] "Gene_1" "Gene_38" "Gene_48" "Gene_15"
> data %>%
+ filter(., Gene_name %in% candidate_Gene) %>%
+ select(., 1:3) %>%
+ head()
# A tibble: 6 x 3
Gene_name Ctrl_fpkm Treat_fpkm
<chr> <dbl> <dbl>
1 Gene_1 0 0
2 Gene_3 0 0.175
3 Gene_4 0 0
4 Gene_8 0 0.346
5 Gene_11 0 0
6 Gene_15 0 0
题目三
假如有一个各种激素处理的表达矩阵 exp_data ,利用本章节所学知识怎么提取某一种激素的表达矩阵,比如 CK ?
set.seed("20200402")
exp_data <- tibble(
Gene = paste0("Gene", "_", 1:50),
"JA_1h" = runif(50, min = 1, max = 20),
"JA_2h" = runif(50, min = 2, max = 42),
"JA_3h" = runif(50, min = 5, max = 50),
"CK_1h" = runif(50, min = 1, max = 20),
"CK_2h" = runif(50, min = 2, max = 42),
"CK_3h" = runif(50, min = 5, max = 50),
"SA_1h" = runif(50, min = 1, max = 20),
"SA_2h" = runif(50, min = 2, max = 42),
"SA_3h" = runif(50, min = 5, max = 50)
)
> head(exp_data)
# A tibble: 6 x 10
Gene JA_1h JA_2h JA_3h CK_1h CK_2h CK_3h SA_1h SA_2h SA_3h
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Gene_1 19.8 24.0 48.6 12.6 33.9 41.2 10.3 6.37 28.2
2 Gene_2 3.35 23.8 33.5 10.9 36.3 35.1 19.3 35.7 37.2
3 Gene_3 13.7 35.4 31.3 13.8 37.6 30.1 17.4 15.4 11.7
4 Gene_4 12.7 15.4 7.51 10.3 12.5 25.1 5.68 27.3 25.4
5 Gene_5 4.49 39.9 19.1 17.2 20.2 30.3 2.63 39.7 29.5
6 Gene_6 18.3 12.5 12.1 2.24 20.8 5.05 6.16 39.5 38.4
exp_data %>%
select(., starts_with("CK"))
# A tibble: 50 x 3
CK_1h CK_2h CK_3h
<dbl> <dbl> <dbl>
1 12.6 33.9 41.2
2 10.9 36.3 35.1
3 13.8 37.6 30.1
4 10.3 12.5 25.1
5 17.2 20.2 30.3
6 2.24 20.8 5.05
7 9.74 6.94 19.8
8 18.5 25.4 48.6
9 1.74 6.50 24.8
10 12.4 33.2 29.5
# ... with 40 more rows
题目四
假如,我有下面一个表达芯片数据,一个基因对应多个芯片 ID,计算出该基因的表达量。如果按照一个基因中对应的芯片 ID 的表达量最大值即为该基因表达量怎么求?同样如果是按照芯片 ID 的表达量的平均值为该基因的表达量 怎么求?得到基因的表达量后,筛选出表达量最高的 10 个基因和表达量最低的 10 个基因。
set.seed("20200402")
array_data <- tibble(
Gene_name = rep(paste0("Gene", "_", 1:100), each = 3),
Array_ID = paste0("Array", "_", 1:300),
Expression = runif(300, min = 2, max = 42)
)
> head(array_data)
# A tibble: 6 x 3
Gene_name Array_ID Expression
<chr> <chr> <dbl>
1 Gene_1 Array_1 41.7
2 Gene_1 Array_2 6.94
3 Gene_1 Array_3 28.7
4 Gene_2 Array_4 26.6
5 Gene_2 Array_5 9.35
6 Gene_2 Array_6 38.4
# 均值和最大值
# 但有一个小问题,就是Gene_name的输出顺序不怎么舒服……
array_data %>%
group_by(Gene_name) %>%
summarise(Gene_expr_mean = mean(Expression),
Gene_expr_max = max(Expression))
# A tibble: 100 x 3
Gene_name Gene_expr_mean Gene_expr_max
<chr> <dbl> <dbl>
1 Gene_1 25.8 41.7
2 Gene_10 19.5 40.3
3 Gene_100 30.3 40.2
4 Gene_11 17.1 21.0
5 Gene_12 24.1 39.1
6 Gene_13 8.93 17.8
7 Gene_14 17.5 26.5
8 Gene_15 15.2 27.6
9 Gene_16 26.4 36.4
10 Gene_17 14.7 24.0
# ... with 90 more rows
# 这是我搜索得到的一个方法
array_data %>%
group_by(Gene_name) %>%
summarise(Gene_expr_mean = mean(Expression),
Gene_expr_max = max(Expression)) %>%
arrange(., as.numeric(stringr::str_extract(Gene_name, "\\d+")))
# A tibble: 100 x 3
Gene_name Gene_expr_mean Gene_expr_max
<chr> <dbl> <dbl>
1 Gene_1 25.8 41.7
2 Gene_2 24.8 38.4
3 Gene_3 22.2 32.9
4 Gene_4 27.9 38.4
5 Gene_5 12.9 24.0
6 Gene_6 35.7 39.0
7 Gene_7 30.2 38.0
8 Gene_8 24.4 35.7
9 Gene_9 19.5 24.6
10 Gene_10 19.5 40.3
# ... with 90 more rows
# 回到原始那个问题
# 得到基因的表达量后,筛选出表达量最高的 10 个基因和表达量最低的 10 个基因。
# 我这里以平均值为例
array_data %>%
group_by(Gene_name) %>%
summarise(Gene_expr_mean = mean(Expression),
Gene_expr_max = max(Expression)) %>%
mutate(r_up = rank(Gene_expr_mean),
r_down = rank(desc(Gene_expr_mean))) %>%
filter(r_up <= 10 | r_down <= 10)
# A tibble: 20 x 5
Gene_name Gene_expr_mean Gene_expr_max r_up r_down
<chr> <dbl> <dbl> <dbl> <dbl>
1 Gene_100 30.3 40.2 91 10
2 Gene_13 8.93 17.8 3 98
3 Gene_26 12.5 22.3 7 94
4 Gene_29 7.45 11.9 1 100
5 Gene_31 8.72 10.6 2 99
6 Gene_34 30.9 40.8 92 9
7 Gene_36 9.07 9.57 4 97
8 Gene_37 10.6 24.0 5 96
9 Gene_38 37.0 41.7 100 1
10 Gene_49 10.8 16.5 6 95
11 Gene_5 12.9 24.0 10 91
12 Gene_56 32.2 34.9 97 4
13 Gene_59 12.8 20.2 9 92
14 Gene_6 35.7 39.0 99 2
15 Gene_75 31.1 41.1 93 8
16 Gene_77 12.6 15.9 8 93
17 Gene_80 31.8 39.7 95 6
18 Gene_84 31.6 34.2 94 7
19 Gene_89 32.1 39.1 96 5
20 Gene_98 33.1 36.9 98 3
关于arrange对于数字字母混合列的排序,参考了How to sort alphanumeric column by natural numbers in a datatable?
题目五
假设我们通过5mC(5 甲基胞嘧啶)DNA 甲基化高通量数据上游分析得到下面矩阵(请无视怎么来的)。甲基化的胞嘧啶 C 我们测序得到仍然是 C,但是没有甲基化的胞嘧啶 C 就会被测出来 变为 T,那么一个位点上 DNA 甲基化的值 = 甲基化的 C 除以该位点 C+ T 的数目。简化:甲基化值 meth = C/(C+T)。DNA 甲基化的种类有 CG、CHG、CHH 三种(H 为 ATC)
Gene_name:基因名
Chr:基因在哪一条染色体
position:表示染色体上的位置
meth_Type:DNA 甲基化的类型(CG、CHG、CHH)
C:该位点 C 的数目
T:该位点 T 的数目
set.seed("20200402")
meth_data <- tibble(
Gene_name = rep(paste0("Gene", "_", 1:5), each = 3),
Chr = "chr01",
position = seq(1, 43, by=3),
meth_Type = sample(c("CG", "CHG", "CHH"), 15, replace = T),
C = round(runif(15, min = 1, max = 20)),
T = round(runif(15, min = 2, max = 42))
)
head(meth_data)
# A tibble: 6 x 6
Gene_name Chr position meth_Type C T
<chr> <chr> <dbl> <chr> <dbl> <dbl>
1 Gene_1 chr01 1 CG 5 27
2 Gene_1 chr01 4 CHH 14 22
3 Gene_1 chr01 7 CHH 11 33
4 Gene_2 chr01 10 CHH 9 30
5 Gene_2 chr01 13 CG 12 3
6 Gene_2 chr01 16 CHH 7 37
- 不区分 DNA 甲基化类型,分别按照位点平均和区域平均求每个基因的 DNA 甲基化水平,找出 DNA 甲基化最高的2个基因、最低的2个基因。
- 位点平均即为先求单个基因内单个位点的 meth,然后取平均值;下图 2)
- 区域平均即为单个基因内所有的 C 除以所有 C 和 T 的和,即 total_C/(total_C + total_T);下图 1)
图片来源链接:https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5856372/
meth_data %>%
mutate(meth_site = C / (C + T)) %>%
group_by(Gene_name) %>%
summarise(Average = mean(meth_site),
Poll = sum(C) / (sum(C) + sum(T)))
# A tibble: 5 x 3
Gene_name Average Poll
<chr> <dbl> <dbl>
1 Gene_1 0.265 0.268
2 Gene_2 0.397 0.286
3 Gene_3 0.271 0.267
4 Gene_4 0.284 0.284
5 Gene_5 0.225 0.231
# 手动检查Gene_1,看对不对
> (5/32 + 14/36 + 11/44) / 3
[1] 0.2650463
> (5+14+11) / (5+14+11+27+22+33)
[1] 0.2678571
# 得到表达量最高的2个和最低的2个
# 我感觉这样做不太行了,还是应该把Average和Poll的两种算法拆开来好点
meth_data %>%
mutate(meth_site = C / (C + T)) %>%
group_by(Gene_name) %>%
summarise(Average = mean(meth_site),
Poll = sum(C) / (sum(C) + sum(T))) %>%
mutate(r_up_averge = rank(Average),
r_down_average = rank(Average),
r_up_poll = rank(Poll),
r_down_poll = rank(Poll)
)
# A tibble: 5 x 7
Gene_name Average Poll r_up_averge r_down_average r_up_poll r_down_poll
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Gene_1 0.265 0.268 2 2 3 3
2 Gene_2 0.397 0.286 5 5 5 5
3 Gene_3 0.271 0.267 3 3 2 2
4 Gene_4 0.284 0.284 4 4 4 4
5 Gene_5 0.225 0.231 1 1 1 1
- 区分 DNA 甲基化类型,分别按照位点平均和区域平均求每个基因的三种 DNA 甲基化水平,分别找出三种 DNA 甲基化中最高的十个基因和最低的十个基因。
# 我把Average和Poll两种算法拆开来了
# 以Average为例
meth_level %>%
ungroup() %>%
select(-Poll) -> meth_level_Average
> meth_level_Average
# A tibble: 11 x 3
Gene_name meth_Type Average
<chr> <chr> <dbl>
1 Gene_1 CG 0.156
2 Gene_1 CHH 0.319
3 Gene_2 CG 0.8
4 Gene_2 CHH 0.195
5 Gene_3 CG 0.253
6 Gene_3 CHH 0.306
7 Gene_4 CG 0.214
8 Gene_4 CHG 0.438
9 Gene_4 CHH 0.2
10 Gene_5 CG 0.183
11 Gene_5 CHG 0.308
meth_level_Average %>%
group_by(meth_Type) %>%
arrange(meth_Type) %>%
mutate(rank_up = rank(Average),
rank_down = rank(desc(Average))) %>%
filter(rank_up <= 2 | rank_down <= 2)
# A tibble: 10 x 5
# Groups: meth_Type [3]
Gene_name meth_Type Average rank_up rank_down
<chr> <chr> <dbl> <dbl> <dbl>
1 Gene_1 CG 0.156 1 5
2 Gene_2 CG 0.8 5 1
3 Gene_3 CG 0.253 4 2
4 Gene_5 CG 0.183 2 4
5 Gene_4 CHG 0.438 2 1
6 Gene_5 CHG 0.308 1 2
7 Gene_1 CHH 0.319 4 1
8 Gene_2 CHH 0.195 1 4
9 Gene_3 CHH 0.306 3 2
10 Gene_4 CHH 0.2 2 3
题目六
已知我们通过某修饰的蛋白组数据,通过前期分析整合得到下面信息,求有多少个基因含有修饰位点、每一个基因中包含几个修饰位点、平均每个基因有多少个修饰位点、每一个基因中前后修饰位点间隔距离以及每个基因中修饰间隔的平均距离。
# 数据
Proteinaccession Position
A0A0N7KCG8 92
A0A0N7KCG8 97
A0A0N7KCG8 138
A0A0N7KCG8 261
A0A0N7KD63 16
A0A0N7KD71 191
A0A0N7KDI2 14
A0A0N7KEK0 86
A0A0N7KEL2 112
A0A0N7KEN1 498
A0A0N7KEN1 513
A0A0N7KFI2 241
A0A0N7KFL5 11
A0A0N7KG02 356
A0A0N7KGS3 137
A0A0N7KH16 81
A0A0N7KH54 148
A0A0N7KH54 184
A0A0N7KI17 359
A0A0N7KI20 77
A0A0N7KI20 224
A0A0N7KI20 282
A0A0N7KIR0 18
A0A0N7KIR1 104
A0A0N7KIR1 285
A0A0N7KJ67 81
A0A0N7KJB1 342
A0A0N7KJF4 78
A0A0N7KK10 235
A0A0N7KK10 256
A0A0N7KK10 279
A0A0N7KK90 387
A0A0N7KKI3 21
A0A0N7KKT9 50
A0A0N7KLH2 307
A0A0N7KLN6 9
A0A0N7KLY1 1033
A0A0N7KMN9 220
# 你按下ctrl + c,你的数据就到了粘贴板上
# file = "clipboard"就代表从粘贴板复制数据
data <- read.table(file = "clipboard", header = T, sep = "", stringsAsFactors = F)
# 有多少个基因含有修饰位点
# 这里就标识了,共28个基因
# A tibble: 38 x 2
# Groups: Proteinaccession [28]
# 当然,你unique也行
> data %>%
+ pull(Proteinaccession) %>%
+ unique() %>%
+ length()
[1] 28
# 每一个基因中包含几个修饰位点
> data %>%
+ group_by(Proteinaccession) %>%
+ summarise(n = n())
# A tibble: 28 x 2
Proteinaccession n
<chr> <int>
1 A0A0N7KCG8 4
2 A0A0N7KD63 1
3 A0A0N7KD71 1
4 A0A0N7KDI2 1
5 A0A0N7KEK0 1
6 A0A0N7KEL2 1
7 A0A0N7KEN1 2
8 A0A0N7KFI2 1
9 A0A0N7KFL5 1
10 A0A0N7KG02 1
# ... with 18 more rows
# 平均每个基因有多少个修饰位点
> data %>%
+ group_by(Proteinaccession) %>%
+ summarise(n = n()) %>%
+ summarise(total = sum(n)) %>%
+ as.numeric()
[1] 38
> data %>%
+ group_by(Proteinaccession) %>%
+ summarise(n = n()) %>%
+ summarise(total = sum(n)) %>%
+ as.numeric() / 28
[1] 1.357143
# 当然,这是想复杂了,简单的其实是
> dim(data)[1] / 28
[1] 1.357143
# 每一个基因中前后修饰位点间隔距离
> data %>%
+ group_by(Proteinaccession) %>%
+ mutate(distance = Position - lag(Position))
# A tibble: 38 x 3
# Groups: Proteinaccession [28]
Proteinaccession Position distance
<chr> <int> <int>
1 A0A0N7KCG8 92 NA
2 A0A0N7KCG8 97 5
3 A0A0N7KCG8 138 41
4 A0A0N7KCG8 261 123
5 A0A0N7KD63 16 NA
6 A0A0N7KD71 191 NA
7 A0A0N7KDI2 14 NA
8 A0A0N7KEK0 86 NA
9 A0A0N7KEL2 112 NA
10 A0A0N7KEN1 498 NA
# ... with 28 more rows
# 每个基因中修饰间隔的平均距离
# NaN的应该就是只有一个修饰的
> data %>%
+ group_by(Proteinaccession) %>%
+ mutate(distance = Position - lag(Position)) %>%
+ summarise(distance = mean(distance,na.rm = T))
# A tibble: 28 x 2
Proteinaccession distance
<chr> <dbl>
1 A0A0N7KCG8 56.3
2 A0A0N7KD63 NaN
3 A0A0N7KD71 NaN
4 A0A0N7KDI2 NaN
5 A0A0N7KEK0 NaN
6 A0A0N7KEL2 NaN
7 A0A0N7KEN1 15
8 A0A0N7KFI2 NaN
9 A0A0N7KFL5 NaN
10 A0A0N7KG02 NaN
# ... with 18 more rows