目标:结合正则表达式,实现
确定与某种模式匹配的字符串
找出匹配位置
提取匹配内容
替换匹配内容
基于匹配拆分字符串
1. 匹配检测
1.1 str_detect()
#返回逻辑向量
> str_detect(c("huang","si","yuan"),"a")
[1] TRUE FALSE TRUE
#能匹配上几个向量元素
> sum(str_detect(c("huang","si","yuan"),"a"))
[1] 2
#匹配上的向量元素的占比
> mean(str_detect(c("huang","si","yuan"),"a"))
[1] 0.6666667
还能选取出匹配某种模式的元素
预备知识:逻辑取子集,如
> hsy <- c("huang","si","yuan")
> hsy[c(TRUE,TRUE,FALSE)]
[1] "huang" "si"
继续
> hsy <- c("huang","si","yuan")
> hsy[str_detect(hsy,"a")]
[1] "huang" "yuan"
其他方法
> str_subset(hsy,"a")
[1] "huang" "yuan"
更实用的场景
针对数据框的某一列,筛选出符合条件的行记录
> df <- tibble(
+ word=words,
+ i=seq_along(word) #添加行号
+ )
> df %>% filter(str_detect(word,"x$"))
# A tibble: 4 x 2
word i
<chr> <int>
1 box 108
2 sex 747
3 six 772
4 tax 841
1.2 str_count()
#返回每一个元素匹配的次数
> str_count(hsy,"a")
[1] 1 0 1
#平均每个元素匹配的次数
> mean(str_count(hsy,"a"))
[1] 0.6666667
2. 提取匹配内容
这里提取的是匹配到的内容本身,与上面提取能匹配上的向量元素有区别
2.1 str_extract()
sentences数据集是stringr包自带的,为720个元素的字符串向量
先提取能匹配上的句子/行看看
> has_red_blue <- str_subset(sentences,"red|blue")
> head(has_red_blue)
[1] "Glue the sheet to the dark blue background."
[2] "Two blue fish swam in the tank."
[3] "The colt reared and threw the tall rider."
[4] "The wide road shimmered in the hot sun."
[5] "See the cat glaring at the scared mouse."
[6] "A wisp of cloud hung in the blue air."
提取匹配内容,注意str_extract()只会提取每个元素的第一个匹配
> matches <- str_extract(has_red_blue,"red|blue")
> head(matches)
[1] "blue" "blue" "red" "red" "red" "blue"
2.2 str_extract_all()
如何提取多个匹配呢?
先来看看有没有多次匹配的行
> more <- has_red_blue[str_count(has_red_blue,"red|blue") > 1]
> more
[1] "It is hard to erase blue or red ink."
用str_extract_all()提取每个元素的所有匹配
> str_extract_all(more,"red|blue") #返回列表
[[1]]
[1] "blue" "red"
> str_extract_all(more,"red|blue",simplify = T) #返回矩阵
[,1] [,2]
[1,] "blue" "red"
> head(str_extract_all(has_red_blue,"red|blue",simplify = T)) #每一行长度自动统一
[,1] [,2]
[1,] "blue" ""
[2,] "blue" ""
[3,] "red" ""
[4,] "red" ""
[5,] "red" ""
[6,] "blue" ""
3. 分组匹配
str_match()
可以给出每个分组的详细匹配内容,比括号搭配\1, \2方便
> two_words <- "(a|the) ([^ ]+)"
> has_two_words <- sentences %>% str_subset(two_words) %>% head(10)
> has_two_words %>% str_extract(two_words) #给出模式的完整匹配
[1] "the smooth" "the sheet" "the depth" "a chicken" "the parked" "the sun"
[7] "the huge" "the ball" "the woman" "a helps"
> has_two_words %>% str_match(two_words) #给出完整匹配以及分组匹配
[,1] [,2] [,3]
[1,] "the smooth" "the" "smooth"
[2,] "the sheet" "the" "sheet"
[3,] "the depth" "the" "depth"
[4,] "a chicken" "a" "chicken"
[5,] "the parked" "the" "parked"
[6,] "the sun" "the" "sun"
[7,] "the huge" "the" "huge"
[8,] "the ball" "the" "ball"
[9,] "the woman" "the" "woman"
[10,] "a helps" "a" "helps"
4. 替换匹配内容
str_replace()
str_replace()只替换每个元素的第一个匹配,str_replace_all()替换所有匹配
> hsy <- c("huang","si","yuan")
> str_replace(hsy,"[aeiou]"," ")
[1] "h ang" "s " "y an"
> str_replace_all(hsy, "[aeiou]", " ")
[1] "h  ng" "s "    "y  n"
同时执行多种替换
> x <- c("1 house", "2 cars", "3 people")
> str_replace_all(x, c("1" = "one","2" = "two", "3" = "three"))
[1] "one house" "two cars" "three people"
5. 拆分
str_split()
str_split()返回列表,加了simplify之后变为矩阵
> sentences %>% head(4) %>% str_split(" ",simplify = T)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks." ""
[2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" "background." ""
[3,] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
[4,] "These" "days" "a" "chicken" "leg" "is" "a" "rare" "dish."
如何提取str_split()返回的列表元素
> "a|b|c" %>% str_split("\\|") %>% .[[1]]
[1] "a" "b" "c"
> "a|b|c" %>% str_split("\\|") %>% .[[1]] %>% .[2]
[1] "b"
6. 定位匹配内容
str_locate()
> str_locate(hsy,"[aeiou]")
start end
[1,] 2 2
[2,] 2 2
[3,] 2 2
> str_locate_all(hsy,"[aeiou]")
[[1]]
start end
[1,] 2 2
[2,] 3 3
[[2]]
start end
[1,] 2 2
[[3]]
start end
[1,] 2 2
[2,] 3 3
7. 使用regex()
调整模式规则
str_view_all(hsy,regex("[aeiou]",ignore_case = T,multiline = T,comments = T,dotall = T))
ignore_case = T:不区分大小写
multiline = T:^和$分别表示每一行的开头和结尾,而不是整个字符串的开头和结尾
comments = T:允许在模式中添加注释和空白以提高可读性,空白以及#之后的内容会被忽略
dotall = T:点号.能够代表换行符