> flights
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>
1 2013 1 1 517 515 2 830 819 11
2 2013 1 1 533 529 4 850 830 20
3 2013 1 1 542 540 2 923 850 33
4 2013 1 1 544 545 -1 1004 1022 -18
5 2013 1 1 554 600 -6 812 837 -25
6 2013 1 1 554 558 -4 740 728 12
7 2013 1 1 555 600 -5 913 854 19
8 2013 1 1 557 600 -3 709 723 -14
9 2013 1 1 557 600 -3 838 846 -8
10 2013 1 1 558 600 -2 753 745 8
# ... with 336,766 more rows, and 10 more variables: carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
- filter():按值筛选观测
- arrange():对行进行重新排序
- select():按名称选取变量
- mutate():使用现有变量的函数创建新变量
- summarize():创建摘要统计
一个简单的筛选操作:dplyr::filter(flights, month == 1)
> dplyr::filter(flights, month == 1)
# A tibble: 27,004 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ... with 26,994 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
# flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
# distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 此处有两种过滤方式
dplyr::filter(flights, arr_delay <= 120, dep_delay <=120)
dplyr::filter(flights, !(arr_delay > 120 | dep_delay > 120))
# 示例 dplyr::filter(flights, dest == "IAH" | dest == "HOU")
# 示例 dplyr::filter(flights, month %in% c(7,8,9))
课后问题:dplyr包中对筛选有用的另一个函数为between, 其作用是什么?
> arrange(flights, year, month, dep_delay)
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int>
1 2013 1 11 1900 1930 -30 2233 2243 -10 DL 1435
2 2013 1 29 1703 1730 -27 1947 1957 -10 F9 837
3 2013 1 12 1354 1416 -22 1606 1650 -44 FL 349
4 2013 1 21 2137 2159 -22 2232 2316 -44 DL 2155
5 2013 1 20 704 725 -21 1025 1035 -10 AS 11
6 2013 1 12 2050 2110 -20 2310 2355 -45 B6 529
7 2013 1 12 2134 2154 -20 4 50 -46 B6 515
8 2013 1 14 2050 2110 -20 2329 2355 -26 B6 529
9 2013 1 4 2140 2159 -19 2241 2316 -35 DL 2155
10 2013 1 11 1947 2005 -18 2209 2230 -21 9E 4033
# ... with 336,766 more rows, and 8 more variables: tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
> arrange(flights, desc(month))
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int>
1 2013 12 1 13 2359 14 446 445 1 B6 745
2 2013 12 1 17 2359 18 443 437 6 B6 839
3 2013 12 1 453 500 -7 636 651 -15 US 1895
4 2013 12 1 520 515 5 749 808 -19 UA 1487
5 2013 12 1 536 540 -4 845 850 -5 AA 2243
6 2013 12 1 540 550 -10 1005 1027 -22 B6 939
7 2013 12 1 541 545 -4 734 755 -21 EV 3819
8 2013 12 1 546 545 1 826 835 -9 UA 1441
9 2013 12 1 549 600 -11 648 659 -11 US 2167
10 2013 12 1 550 600 -10 825 854 -29 B6 605
# ... with 336,766 more rows, and 8 more variables: tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
> arrange(flights, desc(is.na(arr_time)))
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 1 1 2016 1930 46 NA 2220 NA EV
2 2013 1 1 NA 1630 NA NA 1815 NA EV
3 2013 1 1 NA 1935 NA NA 2240 NA AA
4 2013 1 1 NA 1500 NA NA 1825 NA AA
5 2013 1 1 NA 600 NA NA 901 NA B6
6 2013 1 2 2041 2045 -4 NA 2359 NA B6
7 2013 1 2 2145 2129 16 NA 33 NA UA
8 2013 1 2 NA 1540 NA NA 1747 NA EV
9 2013 1 2 NA 1620 NA NA 1746 NA EV
10 2013 1 2 NA 1355 NA NA 1459 NA EV
# ... with 336,766 more rows, and 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
# dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 按照名称选择列,并且可以搭配使用辅助函数
> select(flights, starts_with("mo"), ends_with("delay"), contains("sche"))
# A tibble: 336,776 x 5
month dep_delay arr_delay sched_dep_time sched_arr_time
<int> <dbl> <dbl> <int> <int>
1 1 2 11 515 819
2 1 4 20 529 830
3 1 2 33 540 850
4 1 -1 -18 545 1022
5 1 -6 -25 600 837
6 1 -4 12 558 728
7 1 -5 19 600 854
8 1 -3 -14 600 723
9 1 -3 -8 600 846
10 1 -2 8 600 745
# ... with 336,766 more rows
> select(flights, time_hour ,air_time, everything())
# A tibble: 336,776 x 19
time_hour air_time year month day dep_time sched_dep_time dep_delay
<dttm> <dbl> <int> <int> <int> <int> <int> <dbl>
1 2013-01-01 05:00:00 227 2013 1 1 517 515 2
2 2013-01-01 05:00:00 227 2013 1 1 533 529 4
3 2013-01-01 05:00:00 160 2013 1 1 542 540 2
4 2013-01-01 05:00:00 183 2013 1 1 544 545 -1
5 2013-01-01 06:00:00 116 2013 1 1 554 600 -6
6 2013-01-01 05:00:00 150 2013 1 1 554 558 -4
7 2013-01-01 06:00:00 158 2013 1 1 555 600 -5
8 2013-01-01 06:00:00 53 2013 1 1 557 600 -3
9 2013-01-01 06:00:00 140 2013 1 1 557 600 -3
10 2013-01-01 06:00:00 138 2013 1 1 558 600 -2
# ... with 336,766 more rows, and 11 more variables: arr_time <int>,
# sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>
# 示例
> var <- c("year", "month", "day", "dep_delay")
> select(flights, one_of(var))
# A tibble: 336,776 x 4
year month day dep_delay
<int> <int> <int> <dbl>
1 2013 1 1 2
2 2013 1 1 4
3 2013 1 1 2
4 2013 1 1 -1
5 2013 1 1 -6
6 2013 1 1 -4
7 2013 1 1 -5
8 2013 1 1 -3
9 2013 1 1 -3
10 2013 1 1 -2
# ... with 336,766 more rows
Chevy 指出下列可用辅助函数,大家可以自行疯狂了解一波:starts_with()、ends_with()、contains()
> mutate(select(flights, year:day, ends_with("delay"), distance, air_time),
+ gain = arr_delay - dep_delay,
+ hours = air_time / 60,
+ gain_per_hour = gain / hours)
# A tibble: 336,776 x 10
year month day dep_delay arr_delay distance air_time gain hours gain_per_hour
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 2 11 1400 227 9 3.78 2.38
2 2013 1 1 4 20 1416 227 16 3.78 4.23
3 2013 1 1 2 33 1089 160 31 2.67 11.6
4 2013 1 1 -1 -18 1576 183 -17 3.05 -5.57
5 2013 1 1 -6 -25 762 116 -19 1.93 -9.83
6 2013 1 1 -4 12 719 150 16 2.5 6.4
7 2013 1 1 -5 19 1065 158 24 2.63 9.11
8 2013 1 1 -3 -14 229 53 -11 0.883 -12.5
9 2013 1 1 -3 -8 944 140 -5 2.33 -2.14
10 2013 1 1 -2 8 733 138 10 2.3 4.35
# ... with 336,766 more rows
如果只是想依据旧有数据创建一个仅包含新变量的 data.frame,可以使用 transmute():
> transmute(flights,
+ gain = arr_delay - dep_delay,
+ hours = air_time / 60,
+ gain_per_hour = gain / hours)
# A tibble: 336,776 x 3
gain hours gain_per_hour
<dbl> <dbl> <dbl>
1 9 3.78 2.38
2 16 3.78 4.23
3 31 2.67 11.6
4 -17 3.05 -5.57
5 -19 1.93 -9.83
6 16 2.5 6.4
7 24 2.63 9.11
8 -11 0.883 -12.5
9 -5 2.33 -2.14
10 10 2.3 4.35
# ... with 336,766 more rows
- 算术运算符:+、-、*、/、^
- x / sum(x)
- y - mean(y)
- 模运算符:%/% 和 %%
- x == y * (x %/% y) + (x %% y)
- 对数函数:log2()、log10()、log()
- 偏移函数:lead()、lag()
- 累加和滚动聚合:cumsum()、cumprod()、cummin()和cummax()
- 更加深入的需求可以了解RcppRoll包
- 逻辑比较:<、 <=、 >、 >=、 != 和 ==
- 排秩函数:min_rank()
> transmute(flights,
+ dep_time_new = (dep_time %/% 100) * 60 + dep_time %% 100 )
# A tibble: 336,776 x 1
dep_time_new
<dbl>
1 317
2 333
3 342
4 344
5 354
6 354
7 355
8 357
9 357
10 358
# ... with 336,766 more rows
比较air_time 和 arr_time - dep_time,有什么区别,如何解决。
> summarise(flights, delay = mean(dep_delay, na.rm = T))
# A tibble: 1 x 1
delay
<dbl>
1 12.6
> by_day <- group_by(flights, year, month, day)
> by_day
# A tibble: 336,776 x 19
# Groups: year, month, day [365]
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 1 1 517 515 2 830 819 11 UA
2 2013 1 1 533 529 4 850 830 20 UA
3 2013 1 1 542 540 2 923 850 33 AA
4 2013 1 1 544 545 -1 1004 1022 -18 B6
5 2013 1 1 554 600 -6 812 837 -25 DL
6 2013 1 1 554 558 -4 740 728 12 UA
7 2013 1 1 555 600 -5 913 854 19 B6
8 2013 1 1 557 600 -3 709 723 -14 EV
9 2013 1 1 557 600 -3 838 846 -8 B6
10 2013 1 1 558 600 -2 753 745 8 AA
# ... with 336,766 more rows, and 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
# dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
> summarise(by_day, delay = mean(dep_delay, na.rm = T))
# A tibble: 365 x 4
# Groups: year, month [?]
year month day delay
<int> <int> <int> <dbl>
1 2013 1 1 11.5
2 2013 1 2 13.9
3 2013 1 3 11.0
4 2013 1 4 8.95
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.55
9 2013 1 9 2.28
10 2013 1 10 2.84
# ... with 355 more rows
> by_dest <- group_by(flights, dest)
> delay <- summarise(by_dest,
+ count = n(),
+ dis = mean(distance, na.rm = T),
+ delay = mean(arr_delay, na.rm = T)
+ )
> delay
# A tibble: 105 x 4
dest count dis delay
<chr> <int> <dbl> <dbl>
1 ABQ 254 1826 4.38
2 ACK 265 199 4.85
3 ALB 439 143 14.4
4 ANC 8 3370 -2.5
5 ATL 17215 757. 11.3
6 AUS 2439 1514. 6.02
7 AVL 275 584. 8.00
8 BDL 443 116 7.05
9 BGR 375 378 8.03
10 BHM 297 866. 16.9
# ... with 95 more rows
> delay <- dplyr::filter(delay, count > 20, dest != "NHL")
> delays <- flights %>%
+ group_by(dest) %>%
+ summarise(
+ count = n(),
+ dis = mean(distance, na.rm = T),
+ delay = mean(arr_delay, na.rm = T)
+ ) %>%
+ dplyr::filter(count > 20, dest != "NHL")
> delays
# A tibble: 97 x 4
dest count dis delay
<chr> <int> <dbl> <dbl>
1 ABQ 254 1826 4.38
2 ACK 265 199 4.85
3 ALB 439 143 14.4
4 ATL 17215 757. 11.3
5 AUS 2439 1514. 6.02
6 AVL 275 584. 8.00
7 BDL 443 116 7.05
8 BGR 375 378 8.03
9 BHM 297 866. 16.9
10 BNA 6333 758. 11.8
# ... with 87 more rows
我们都知道,数据里掺杂着 NA 值的时候会给计算带来无法预料的麻烦。事实上除了 NA 之外还有另外几种特殊值——NA:缺失数据;NaN:无意义的数,比如 sqrt(-2);Inf:正无穷大;-Inf:负无穷大。所以在处理数据的时候我们需要去除这些 NA 值,例如可以使用 base R 的函数 na.omit()。