library(readr)
library(tidyverse)
library(rvest)
# Initial attempt: parse the Spotrac pitching-salary ranking page with
# read_html() and pull the text of the salary and team elements.
salary_link <- "https://www.spotrac.com/mlb/rankings/salary/pitching/"
salary_page <- salary_link |>
  read_html()
salaries <- salary_page |>
  html_elements(".info") |>
  html_text()
teams <- salary_page |>
  html_elements(".rank-position") |>
  html_text()
当我尝试从该网站抓取数据时,我只获得前 100 个元素,而页面上应该有 308 个元素。
我尝试使用 rvest 进行网页抓取,但经过一番搜索,我了解到可能需要借助 RSelenium 来处理。如果您能帮忙讲解一下,我将不胜感激。
幸运的是,rvest 1.0.4 引入了 read_html_live()。它的行为与 read_html() 足够相似,您可以直接用它替换,并得到所需的结果。
# Load the page with rvest's live reader in place of read_html().
salary_page <- salary_link |>
  rvest::read_html_live()
仅供参考,您可以根据需要获取各个组件:
# Extract the text of the salary and team elements, then pair them up in a
# data frame (team text has surrounding whitespace trimmed).
salaries <- salary_page |>
  html_elements(".info") |>
  html_text()
teams <- salary_page |>
  html_elements(".rank-position") |>
  html_text()
out <- data.frame(
  salary = salaries,
  teams = trimws(teams)
)
head(out)
# salary teams
# 1 $70,000,000 LAD
# 2 $43,333,334 TEX
# 3 $43,333,333 HOU
# 4 $40,000,000 TEX
# 5 $36,000,000 NYY
# 6 $35,416,670 WSH
或者您只需稍加处理就可以得到整张表格:
# Parse the first <table> on the page into a data frame, then drop rows
# with a missing/empty Player and columns whose name is empty.
# html_element() is used instead of the superseded html_node().
out <- salary_page |>
  html_element("table") |>
  html_table()
out <- out[
  !is.na(out$Player) & nzchar(out$Player),
  nzchar(colnames(out))
]
head(out)
# # A tibble: 6 × 5
# Player POS Age Throws salary
# <chr> <chr> <int> <chr> <chr>
# 1 "Ohtani\n\t\t\t\n \t\n \n \n \n\t\t\t\n\t\t\t\t\tShohei Ohtani\n\t\t\t\t\t LAD" SP/DH 29 R $70,000,000
# 2 "Scherzer\n\t\t\t\n \t\n \n \n \n\t\t\t\n\t\t\t\t\tMax Scherzer\n\t\t\t\t\t TEX" SP 39 R $43,333,334
# 3 "Verlander\n\t\t\t\n \t\n \n \n \n\t\t\t\n\t\t\t\t\tJustin Verlander\n\t\t\t\t\t HOU" SP 41 R $43,333,333
# 4 "deGrom\n\t\t\t\n \t\n \n \n \n\t\t\t\n\t\t\t\t\tJacob deGrom\n\t\t\t\t\t TEX" SP 36 R $40,000,000
# 5 "Cole\n\t\t\t\n \t\n \n \n \n\t\t\t\n\t\t\t\t\tGerrit Cole\n\t\t\t\t\t NYY" SP 33 R $36,000,000
# 6 "Corbin\n\t\t\t\n \t\n \n \n \n\t\t\t\n\t\t\t\t\tPatrick Corbin\n\t\t\t\t\t WSH" SP 34 L $35,416,670
# Split the multi-line Player cell into last name, full name and team:
# the first line, everything in between, and the last line of each value.
out2 <- strcapture(
  "^([^\n]*)\n(.*)\n([^\n]*)$",
  out$Player,
  list(last = "", full = "", team = "")
) |>
  lapply(trimws) |>
  as.data.frame()
# Replace the raw Player column (column 1) with the parsed columns.
out3 <- cbind(out[, -1], out2)
head(out3)
# POS Age Throws salary last full team
# 1 SP/DH 29 R $70,000,000 Ohtani Shohei Ohtani LAD
# 2 SP 39 R $43,333,334 Scherzer Max Scherzer TEX
# 3 SP 41 R $43,333,333 Verlander Justin Verlander HOU
# 4 SP 36 R $40,000,000 deGrom Jacob deGrom TEX
# 5 SP 33 R $36,000,000 Cole Gerrit Cole NYY
# 6 SP 34 L $35,416,670 Corbin Patrick Corbin WSH