網頁爬蟲 Webscraping

用 css 路徑定位的話,我們可以用幾種方式告訴程式我們要範例中的 a 標籤:

  • a 指定要標籤 a

  • #foo 指定要 id 為 foo 的標籤

  • .bar 指定要 class 為 bar 的標籤

# IMDb home page: fan-favourite titles, their scores, and their links
Newsurl <- "https://www.imdb.com/"
con <- read_html(Newsurl)

# The title anchors carry both the display name and the link target,
# so they are selected once and read twice.
title_links <- html_nodes(con, ".fan-picks a.ipc-poster-card__title")
articlebody <- html_text(title_links)
articlebody2 <- paste0("https://www.imdb.com", html_attr(title_links, "href"))

# Scores are matched by a separate selector under .fan-picks.
articlebody1 <- html_text(
  html_nodes(con, ".fan-picks .ipc-poster-card__rating-star-group span"),
  trim = TRUE
)

article <- data.frame(content = articlebody, score = articlebody1, url = articlebody2)
head(article)
##    content score
## 1     1917   8.4
## 2 寄生上流   8.6
## 3 鋒迴路轉   8.0
## 4     小丑   8.5
## 5 兔嘲男孩   8.0
## 6 賽道狂人   8.1
##                                                                url
## 1 https://www.imdb.com/title/tt8579674/?ref_=hm_fanfav_tt_1_pd_fp1
## 2 https://www.imdb.com/title/tt6751668/?ref_=hm_fanfav_tt_2_pd_fp1
## 3 https://www.imdb.com/title/tt8946378/?ref_=hm_fanfav_tt_3_pd_fp1
## 4 https://www.imdb.com/title/tt7286456/?ref_=hm_fanfav_tt_4_pd_fp1
## 5 https://www.imdb.com/title/tt2584384/?ref_=hm_fanfav_tt_5_pd_fp1
## 6 https://www.imdb.com/title/tt1950186/?ref_=hm_fanfav_tt_6_pd_fp1
# ETtoday trending news
Newsurl <- "https://www.ettoday.net/"
con <- read_html(Newsurl)

# Headline text and link are both read from the same anchor nodes.
hot_links <- html_nodes(con, "div.block .block_content .part_list_1 .txt a")
articlebody <- html_text(hot_links, trim = TRUE)
articlebody1 <- paste0("https://www.ettoday.net", html_attr(hot_links, "href"))

article <- data.frame(content = articlebody, url = articlebody1)
head(article)
##                                                                  content
## 1              去日本爽玩確診!案54被爆回台「連玩3夜店」 名嘴笑:挫咧等
## 2 影/畫面曝光!桃機湧萬人逃難返國潮 防疫車隊排爆...人龍繞2排看不到盡頭
## 3                                             《李屍朝鮮》播兩季要改劇名
## 4                 獨/直擊好市多搶購潮!闖3關...結帳長龍嘆:與櫃檯的距離
## 5                      遭開3槍推下山!她最後合照放閃文:你沒外面說的糟糕
## 6                         屏東男心情不好拔鐮刀猛砍路人!她噴血苦求:救我
##                                                 url
## 1 https://www.ettoday.net/news/20200319/1670966.htm
## 2 https://www.ettoday.net/news/20200319/1671203.htm
## 3 https://www.ettoday.net/news/20200318/1670983.htm
## 4 https://www.ettoday.net/news/20200319/1671261.htm
## 5 https://www.ettoday.net/news/20200319/1671058.htm
## 6 https://www.ettoday.net/news/20200318/1670493.htm
# Scrape the ETtoday news list page (the undated URL)
Newsurl <- "https://www.ettoday.net/news/news-list"

con <- read_html(paste0(Newsurl, ".htm"))

# Each headline is one <h3>. Selecting the <h3> nodes once and then taking a
# single match per node (html_node, singular) keeps the four columns aligned:
# a headline that lacks a piece yields NA instead of shifting later rows.
news_items <- con %>%
  html_nodes("div.block div.block_content div.part_list_2 h3")

# time of news
news_time <- news_items %>%
  html_node(".date") %>%
  html_text(trim = TRUE)
# class of news
news_class <- news_items %>%
  html_node("em.tag") %>%
  html_text(trim = TRUE)
# headline anchor: supplies both the title text and the link
news_links <- news_items %>%
  html_node("a")
# content
news_content <- html_text(news_links, trim = TRUE)
# url
news_url <- html_attr(news_links, "href")
# Only site-relative hrefs ("/news/...") need the host prepended; absolute or
# missing hrefs are left untouched rather than turned into malformed URLs.
is_site_relative <- grepl("^/(?!/)", news_url, perl = TRUE)
news_url[is_site_relative] <-
  paste0("https://www.ettoday.net", news_url[is_site_relative])

article <- data.frame(time = news_time,
                      class = news_class,
                      content = news_content,
                      url = news_url)
head(article)
##               time class
## 1 2020/03/19 22:00  時尚
## 2 2020/03/19 21:56  社會
## 3 2020/03/19 21:55  影劇
## 4 2020/03/19 21:54  生活
## 5 2020/03/19 21:54  影劇
## 6 2020/03/19 21:50  生活
##                                                          content
## 1         李玉璽買耳機不花冤枉錢 讚萬寶龍精品級設計、戴了超舒服
## 2 北市監理所集體貪汙!繼所長之後 科長也收受代檢廠賄賂遭羈押禁見
## 3   小賈斯汀寵妻無極限! 曬超甜居家隔離照「肉麻土味情話」閃瞎網
## 4     新冠肺炎病例破百「未來14天為防疫關鍵期」莊人祥分析兩種情況
## 5 免排隊!譚艾珍超強「十步驟DIY布口罩」 隨身帶「防疫5寶」保健康
## 6          發燒38度「台北街頭狂奔」!移工默遞紙條…轉逃跑原因曝光
##                                                 url
## 1 https://www.ettoday.net/news/20200319/1671826.htm
## 2 https://www.ettoday.net/news/20200319/1671971.htm
## 3 https://www.ettoday.net/news/20200319/1671978.htm
## 4 https://www.ettoday.net/news/20200319/1671742.htm
## 5 https://www.ettoday.net/news/20200319/1671015.htm
## 6 https://www.ettoday.net/news/20200319/1671970.htm
#' Scrape the ETtoday news list for one date
#'
#' Builds the list-page URL by joining `Newsurl`, `year`, `month`, `day` and
#' `"0.htm"` with `-`, e.g.
#' "https://www.ettoday.net/news/news-list-2020-3-18-0.htm" for
#' `news_webscrap(2020, 3, 18)`, downloads that page and extracts one row per
#' headline (`<h3>` inside `div.part_list_2`).
#'
#' @param year,month,day Single values identifying the date; they are pasted
#'   into the URL as-is (no zero padding is added).
#' @param Newsurl URL prefix of the list page, without the date suffix.
#' @return A data.frame with columns `time`, `class`, `content` and `url`,
#'   one row per headline. A piece missing from a headline is `NA`.
news_webscrap <- function(year, month, day,
                          Newsurl = "https://www.ettoday.net/news/news-list") {
  # The URL is built with paste(), which would silently vectorise.
  stopifnot(length(year) == 1L, length(month) == 1L, length(day) == 1L)

  which_date_you_want <- paste(Newsurl, year, month, day, "0.htm", sep = "-")
  con <- read_html(which_date_you_want)

  # Select the headline nodes once; html_node() (singular) then returns exactly
  # one result per headline, so the columns cannot drift out of alignment when
  # a headline lacks a date, tag or link.
  headlines <- con %>%
    html_nodes("div.block div.block_content div.part_list_2 h3")

  # time of news
  news_time <- headlines %>%
    html_node(".date") %>%
    html_text(trim = TRUE)
  # class of news
  news_class <- headlines %>%
    html_node("em.tag") %>%
    html_text(trim = TRUE)
  # headline anchor: supplies both the title text and the link
  headline_links <- headlines %>%
    html_node("a")
  news_content <- html_text(headline_links, trim = TRUE)
  news_url <- html_attr(headline_links, "href")

  # Prepend the host only to site-relative hrefs ("/news/..."); absolute or
  # missing hrefs are kept as they are instead of becoming malformed URLs.
  is_site_relative <- grepl("^/(?!/)", news_url, perl = TRUE)
  news_url[is_site_relative] <-
    paste0("https://www.ettoday.net", news_url[is_site_relative])

  data.frame(time = news_time,
             class = news_class,
             content = news_content,
             url = news_url)
}
# Scrape the 5 December list page for three different years.
article20191205 <- news_webscrap(2019, 12, 5)
article20181205 <- news_webscrap(2018, 12, 5)
article20111205 <- news_webscrap(2011, 12, 5)
head(article20191205)
##               time class
## 1 2019/12/05 23:53  電競
## 2 2019/12/05 23:53  地方
## 3 2019/12/05 23:53  國際
## 4 2019/12/05 23:52  大陸
## 5 2019/12/05 23:50  財經
## 6 2019/12/05 23:45  影劇
##                                                                 content
## 1            LIGHTSPEED無線技術樹立新標竿 羅技G502入選2019百大最佳產品
## 2        吃早餐連2次吃到毛!新北女氣炸PO網…老闆娘哭了「公布監視器畫面」
## 3          中駐美外交官見地方官需「通知國務院」 中國對等反制美國外交官
## 4               香港罕見發生規模1.4地震「搖晃數秒鐘」 市民驚:天有異象
## 5                                    中美晶轉投資台特化獲台積電供應商獎
## 6 扣嫂婚前告白謝和弦:我相信你一定會好起來!結婚3年「目睹老公呼麻」爆哭
##                                                 url
## 1 https://www.ettoday.net/news/20191205/1595474.htm
## 2 https://www.ettoday.net/news/20191205/1595470.htm
## 3 https://www.ettoday.net/news/20191205/1595465.htm
## 4 https://www.ettoday.net/news/20191205/1595473.htm
## 5 https://www.ettoday.net/news/20191205/1595466.htm
## 6 https://www.ettoday.net/news/20191205/1595471.htm
# Show the first six headlines scraped for 2018-12-05.
head(article20181205, n = 6L)
##               time class
## 1 2018/12/05 23:50  社會
## 2 2018/12/05 23:46  財經
## 3 2018/12/05 23:44  影劇
## 4 2018/12/05 23:44  社會
## 5 2018/12/05 23:43  國際
## 6 2018/12/05 23:40  時尚
##                                                        content
## 1     男帶小學生在SOGO百貨「做色色的事」法官認為情有可原還減刑
## 2     全力支持綠能產業穩定資金需求 櫃買鬆綁外國人發行外債資格
## 3 薛之謙驗毒結果出爐! 黃毅清嗆「野雞律師函」:給腦殘粉看就行
## 4   莽夫潑酸妻兒上吊亡!鄰曝「酒後瘋狗目」 兒曾怨:回家是恐怖
## 5                 西沙群島浪花礁建新平台設施 美智庫:軍事用途
## 6             伊薩米勒穿猿人迷彩致敬日本潮牌 見偶像洛基太興奮
##                                                 url
## 1 https://www.ettoday.net/news/20181205/1323753.htm
## 2 https://www.ettoday.net/news/20181205/1323849.htm
## 3 https://www.ettoday.net/news/20181205/1323847.htm
## 4 https://www.ettoday.net/news/20181205/1323775.htm
## 5 https://www.ettoday.net/news/20181205/1323705.htm
## 6 https://www.ettoday.net/news/20181205/1323561.htm