library(rvest)
## Loading required package: xml2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Page to scrape.
Newsurl <- "https://edition.cnn.com/cnn10"
# Parse the page, select every node matching the ".cd__description" CSS
# selector, and keep only the text of those nodes.
articlebody <- html_text(
  html_nodes(read_html(Newsurl), ".cd__description")
)
articlebody
## [1] "Today's show discusses response plans, closures, market drops, and testing related to the new coronavirus. We also report on the rescue of Rockstar Freddy. Transcript and Newsquiz"
# Split on single spaces and keep the tokens of the first element.
oneword <- strsplit(x = articlebody, split = " ")[[1]]
oneword
## [1] "Today's" "show" "discusses" "response" "plans,"
## [6] "closures," "market" "drops," "and" "testing"
## [11] "related" "to" "the" "new" "coronavirus."
## [16] "We" "also" "report" "on" "the"
## [21] "rescue" "of" "Rockstar" "Freddy." "Transcript"
## [26] "and" "Newsquiz"
# Locate the tokens that contain "coronavirus".
grep(pattern = "coronavirus", x = oneword) # positions of the matches
## [1] 15
grepl(pattern = "coronavirus", x = oneword) # one TRUE/FALSE per token
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE
# Append "weather" to every token; paste() is vectorised and joins the
# pieces with a single space.
paste(oneword, "weather", sep = " ")
## [1] "Today's weather" "show weather" "discusses weather"
## [4] "response weather" "plans, weather" "closures, weather"
## [7] "market weather" "drops, weather" "and weather"
## [10] "testing weather" "related weather" "to weather"
## [13] "the weather" "new weather" "coronavirus. weather"
## [16] "We weather" "also weather" "report weather"
## [19] "on weather" "the weather" "rescue weather"
## [22] "of weather" "Rockstar weather" "Freddy. weather"
## [25] "Transcript weather" "and weather" "Newsquiz weather"
# Replace every ", " in the full text with " %and% ".
gsub(pattern = ", ", replacement = " %and% ", x = articlebody)
## [1] "Today's show discusses response plans %and% closures %and% market drops %and% and testing related to the new coronavirus. We also report on the rescue of Rockstar Freddy. Transcript and Newsquiz"
字串處理通常會搭配正規表示式 (Regular Expression)
Regular Expression (正規表示式)是指一組能用來表示字串共同格式 (common structure)的樣式 (Pattern),像是@符號會固定出現在email中,或是手機號碼固定是10碼,等等樣式。 在所有的程式語言中,只要用到字串比對與字串取代等字串相關功能,都會用到正規表示式。雖然正規表示式在不同程式語言中會有些許差異,但核心概念是相同的。
以下是常見的範例:
語法 | 正規表達 | 範例 |
---|---|---|
整數 | [0-9]+ | 5815 |
浮點數 | [0-9]+[.][0-9]+ | 58.15 |
純英文字串 | [A-Za-z]+ | CGUIM |
Email | [a-zA-Z0-9_]+@[a-zA-Z0-9._]+ | im@gmail.com |
URL網址 | http://[a-zA-Z0-9./_]+ | http://www.yahoo.com.tw/ |
# Quantifier examples. grep() is not anchored: an element is returned as soon
# as the pattern matches anywhere inside it.
stringVector <- c("a", "abc", "ac", "abbc", "abbbc", "abbbbc")
# "*": "a" followed by zero or more "b"
grep("ab*", stringVector, value = TRUE) # return value in stringVector
## [1] "a" "abc" "ac" "abbc" "abbbc" "abbbbc"
# "+": "a" followed by one or more "b"
grep("ab+", stringVector, value = TRUE)
## [1] "abc" "abbc" "abbbc" "abbbbc"
# "?": the "b" between "a" and "c" is optional
grep("ab?c", stringVector, value = TRUE)
## [1] "abc" "ac"
# "{n}": exactly n repetitions of the preceding item
grep("a{1}b{1}c{1}", stringVector, value = TRUE)
## [1] "abc"
grep("ab{2}c", stringVector, value = TRUE)
## [1] "abbc"
# With nothing after the quantifier, "ab{3}" also matches inside "abbbbc",
# so it selects the same elements as "ab{3,}". identical() compares the two
# result vectors as a whole; `==` would compare element by element and
# silently recycle if the lengths ever differed.
identical(
  grep("ab{3}", stringVector, value = TRUE),
  grep("ab{3,}", stringVector, value = TRUE)
)
## [1] TRUE
# Anchor and word-boundary examples.
# `value = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
stringVector <- c(
  "abc", "bcd", "cde", "def", "abc def", "bcdefg abc", "blablammmc", "k a"
)
# "^": the match must begin at the start of the string
grep("^bc", stringVector, value = TRUE)
## [1] "bcd" "bcdefg abc"
grep("^b", stringVector, value = TRUE)
## [1] "bcd" "bcdefg abc" "blablammmc"
# "$": the match must finish at the end of the string
grep("bc$", stringVector, value = TRUE)
## [1] "abc" "bcdefg abc"
grep("c$", stringVector, value = TRUE)
## [1] "abc" "bcdefg abc" "blablammmc"
# "\b": word boundary. The backslash has to be doubled inside an R string.
grep("\\ba", stringVector, value = TRUE) # "\" is needed!
## [1] "abc" "abc def" "bcdefg abc" "k a"
# "\B": any position that is not a word boundary
grep("\\Ba", stringVector, value = TRUE)
## [1] "blablammmc"
grep("\\bde", stringVector, value = TRUE)
## [1] "def" "abc def"
grep("\\Bde", stringVector, value = TRUE)
## [1] "cde" "bcdefg abc"
# Character-class examples on phone numbers, e-mail addresses and URLs.
# `value = TRUE` is used instead of `T`, which can be reassigned.
stringVector <- c(
  "03-1234567", "02-87654321", "0988123456",
  "07-118", "0-888", "03548445",
  "csim@mail.cgu.edu.tw", "csim@.", "csim@", "@gms.",
  "http://www.is.cgu.edu.tw/", "https://www.yahoo.com.tw/"
)
# two digits, a dash, then seven or eight digits
grep("[0-9]{2}-[0-9]{7,8}", stringVector, value = TRUE)
## [1] "03-1234567" "02-87654321"
grep("09[0-9]{8}", stringVector, value = TRUE) # cellphone
## [1] "0988123456"
# "|" is alternation: either prefix is accepted
grep("03-|02-", stringVector, value = TRUE) # Taipei or Hualien
## [1] "03-1234567" "02-87654321"
# at least two letters/dots on each side of "@"
grep("[A-Za-z.]{2,}@[A-Za-z.]{2,}", stringVector, value = TRUE) # email
## [1] "csim@mail.cgu.edu.tw"
grep("http:|https:", stringVector, value = TRUE) # url
## [1] "http://www.is.cgu.edu.tw/" "https://www.yahoo.com.tw/"
# The same kind of searches written with shorthand classes:
# "\d" matches a digit and "\w" a word character (backslashes are doubled
# inside R string literals).
# `value = TRUE` is used instead of `T`, which can be reassigned.
stringVector <- c(
  "03-2118800", "02-23123456", "0988123456",
  "07-118", "0-888", "csim@mail.cgu.edu.tw", "http://www.is.cgu.edu.tw/"
)
grep("\\d{2}-\\d{7,8}", stringVector, value = TRUE)
## [1] "03-2118800" "02-23123456"
# ten consecutive digits
grep("\\d{10}", stringVector, value = TRUE)
## [1] "0988123456"
grep("\\w+@[a-zA-Z0-9._]+", stringVector, value = TRUE)
## [1] "csim@mail.cgu.edu.tw"
# require at least two characters on each side of "@"
grep("\\w{2,}@[a-zA-Z0-9._]{2,}", stringVector, value = TRUE)
## [1] "csim@mail.cgu.edu.tw"