使用 R 抓取动态内容网站的困难

问题描述 投票:0回答:1
library(readr)
library(tidyverse)
library(rvest)

# Spotrac MLB pitching salary rankings page.
salary_link <- "https://www.spotrac.com/mlb/rankings/salary/pitching/"

# Fetch the page with read_html().
salary_page <- salary_link |>
  read_html()

# Text of every ".info" element and every ".rank-position" element.
salaries <- salary_page |>
  html_elements(".info") |>
  html_text()
teams <- salary_page |>
  html_elements(".rank-position") |>
  html_text()

当我尝试从该网站抓取数据时,我只获得前 100 个元素,而页面上应该有 308 个元素。

我尝试使用 rvest 进行网页抓取，但经过一番搜索，我了解到可能需要借助 RSelenium 来处理。如果您能介绍一下具体做法，我将不胜感激。

r web-scraping rvest rselenium
1个回答
0
投票

幸运的是，rvest 1.0.4 引入了 `read_html_live()`。它的行为与 `read_html()` 足够相似，您可以直接用它替换原来的调用并获得所需的内容。

# Load the page with read_html_live() in place of read_html().
salary_page <- salary_link |>
  rvest::read_html_live()

仅供参考,您可以根据需要获取各个组件:

# Extract the text of the salary (".info") and team (".rank-position") nodes.
salaries <- salary_page |>
  html_elements(".info") |>
  html_text()
teams <- salary_page |>
  html_elements(".rank-position") |>
  html_text()

# Combine both vectors; trimws() strips whitespace around the team labels.
out <- data.frame(salary = salaries, teams = trimws(teams))
head(out)
#          salary teams
# 1 $70,000,000     LAD
# 2 $43,333,334     TEX
# 3 $43,333,333     HOU
# 4 $40,000,000     TEX
# 5 $36,000,000     NYY
# 6 $35,416,670     WSH

或者，您只需多花一点功夫就可以获取整张表格：

# Take the first <table> on the page and parse it with html_table().
# html_element() is the current name for the superseded html_node().
out <- salary_page |>
  html_element("table") |>
  html_table()

# Keep rows whose Player is neither NA nor empty, and drop columns whose
# name is an empty string.
out <- out[
  !is.na(out$Player) & nzchar(out$Player),
  nzchar(colnames(out))
]
head(out)
# # A tibble: 6 × 5
#   Player                                                                                                                                                 POS     Age Throws salary     
#   <chr>                                                                                                                                                  <chr> <int> <chr>  <chr>      
# 1 "Ohtani\n\t\t\t\n            \t\n            \n                                \n            \n\t\t\t\n\t\t\t\t\tShohei Ohtani\n\t\t\t\t\t  LAD"       SP/DH    29 R      $70,000,000
# 2 "Scherzer\n\t\t\t\n            \t\n            \n                                \n            \n\t\t\t\n\t\t\t\t\tMax Scherzer\n\t\t\t\t\t  TEX"      SP       39 R      $43,333,334
# 3 "Verlander\n\t\t\t\n            \t\n            \n                                \n            \n\t\t\t\n\t\t\t\t\tJustin Verlander\n\t\t\t\t\t  HOU" SP       41 R      $43,333,333
# 4 "deGrom\n\t\t\t\n            \t\n            \n                                \n            \n\t\t\t\n\t\t\t\t\tJacob deGrom\n\t\t\t\t\t  TEX"        SP       36 R      $40,000,000
# 5 "Cole\n\t\t\t\n            \t\n            \n                                \n            \n\t\t\t\n\t\t\t\t\tGerrit Cole\n\t\t\t\t\t  NYY"           SP       33 R      $36,000,000
# 6 "Corbin\n\t\t\t\n            \t\n            \n                                \n            \n\t\t\t\n\t\t\t\t\tPatrick Corbin\n\t\t\t\t\t  WSH"      SP       34 L      $35,416,670

# Split each multi-line Player cell into three captures: the text before the
# first newline (last), everything in between (full), and the text after the
# final newline (team). trimws() then strips the whitespace padding from each.
out2 <- strcapture(
  "^([^\n]*)\n(.*)\n([^\n]*)$",
  out$Player,
  list(last = "", full = "", team = "")
) |>
  lapply(trimws) |>
  as.data.frame()

# Replace the raw Player column (the first column) with the parsed columns.
out3 <- cbind(out[, -1], out2)
head(out3)
#     POS Age Throws      salary      last             full team
# 1 SP/DH  29      R $70,000,000    Ohtani    Shohei Ohtani  LAD
# 2    SP  39      R $43,333,334  Scherzer     Max Scherzer  TEX
# 3    SP  41      R $43,333,333 Verlander Justin Verlander  HOU
# 4    SP  36      R $40,000,000    deGrom     Jacob deGrom  TEX
# 5    SP  33      R $36,000,000      Cole      Gerrit Cole  NYY
# 6    SP  34      L $35,416,670    Corbin   Patrick Corbin  WSH
© www.soinside.com 2019 - 2024. All rights reserved.