# dplyr包
# 数据集:iris
# 安装、加载包及常用功能
# Point CRAN and Bioconductor at mirror servers before installing anything.
# install.packages() reads the "repos" option and Bioconductor tooling reads
# "BioC_mirror"; option names are case-sensitive, so a misspelled name is
# stored but never consulted.
options(repos = c(CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror = "https://mirrors.ustc.edu.cn/bioc/")
# Install dplyr only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
# Small demo data set: rows 1-2, 51-52 and 101-102 of iris.
test <- iris[c(1:2, 51:52, 101:102), ]
# mutate(): add a new column computed from existing ones
mutate(test, new = Sepal.Length * Sepal.Width)
# Subsetting
## select(): pick columns, by position or by name
select(test, 1)
select(test, c(1, 5))
select(test, Sepal.Length)
select(test, Petal.Length, Petal.Width)
vars <- c("Petal.Length", "Petal.Width")
# all_of() selects the columns named in a character vector. It replaces the
# superseded one_of() and errors (instead of only warning) when a name is
# missing from the data.
select(test, all_of(vars))
## filter(): keep the rows that satisfy a condition
filter(test, Species == "setosa")
# Conditions passed as separate arguments are combined with a logical AND.
filter(test, Species == "setosa", Sepal.Length > 5)
filter(test, Species %in% c("setosa", "versicolor"))
# arrange(): sort the whole table by one or more columns
arrange(test, Sepal.Length)        # ascending order is the default
arrange(test, desc(Sepal.Length))  # desc() sorts from largest to smallest
# summarise(): collapse the table into summary statistics
summarise(
  test,
  mean(Sepal.Length),
  sd(Sepal.Length)
)
# group_by() on its own only marks the grouping; combined with summarise()
# the statistics are computed once per group.
group_by(test, Species)
summarise(
  group_by(test, Species),
  mean(Sepal.Length),
  sd(Sepal.Length)
)
# The pipe %>% passes the data set on its left to the function on its right,
# so the nested call above can be written as a top-to-bottom chain.
test %>%
  group_by(Species) %>%
  summarise(
    mean(Sepal.Length),
    sd(Sepal.Length)
  )
# count(): number of rows for each unique value of a column
count(test, Species)
# Relational data: joining two tables
# Only matters on R < 4.0, where data.frame() turned strings into factors by
# default. TRUE/FALSE are spelled out because T and F can be reassigned.
options(stringsAsFactors = FALSE)
# Left-hand demo table. The third z value is an upper-case "C" so that the
# data agrees with the join results recorded further down in this script.
test1 <- data.frame(
  x = c("b", "e", "f", "x"),
  z = c("A", "B", "C", "D"),
  stringsAsFactors = FALSE
)
test1
#>   x z
#> 1 b A
#> 2 e B
#> 3 f C
#> 4 x D
# Right-hand demo table. The "+" continuation prompts copied from the console
# have been removed; inside the call they are a syntax error.
test2 <- data.frame(
  x = c("a", "b", "c", "d", "e", "f"),
  y = c(1, 2, 3, 4, 5, 6),
  stringsAsFactors = FALSE
)
test2
#>   x y
#> 1 a 1
#> 2 b 2
#> 3 c 3
#> 4 d 4
#> 5 e 5
#> 6 f 6
## inner_join(): keep only the rows whose key appears in both tables
# No `by` is given here, so dplyr joins on the shared column and prints a
# message naming it. The recorded console output is kept as comments so the
# script still parses.
inner_join(test1, test2)
#> Joining, by = "x"
#>   x z y
#> 1 b A 2
#> 2 e B 5
#> 3 f C 6
## left_join(): keep every row of the left table, matching on column x
# Rows of the left table without a match get NA in the columns taken from the
# right table. Recorded console output is kept as comments.
left_join(test1, test2, by = "x")
#>   x z  y
#> 1 b A  2
#> 2 e B  5
#> 3 f C  6
#> 4 x D NA
# Swapping the arguments makes test2 the table whose rows are all kept.
left_join(test2, test1, by = "x")
#>   x y    z
#> 1 a 1 <NA>
#> 2 b 2    A
#> 3 c 3 <NA>
#> 4 d 4 <NA>
#> 5 e 5    B
#> 6 f 6    C
## full_join(): keep every key value that occurs in either table
# Columns from the table that lacks a given key are filled with NA. Recorded
# console output is kept as comments.
full_join(test1, test2, by = "x")
#>   x    z  y
#> 1 b    A  2
#> 2 e    B  5
#> 3 f    C  6
#> 4 x    D NA
#> 5 a <NA>  1
#> 6 c <NA>  3
#> 7 d <NA>  4
## semi_join(): return the rows of x that have a match in y
# Only the columns of x appear in the result. Recorded console output is kept
# as comments.
semi_join(x = test1, y = test2, by = "x")
#>   x z
#> 1 b A
#> 2 e B
#> 3 f C
## anti_join(): return the rows of x that have no match in y
# Recorded console output is kept as comments.
anti_join(x = test2, y = test1, by = "x")
#>   x y
#> 1 a 1
#> 2 c 3
#> 3 d 4
#简单合并,类似cbind和rbind
#bind_rows(),列数相同
#bind_cols(),行数相同