我正在使用 RSelenium 从可折叠表格中抓取数据。我已经成功抓取了主表,但还想抓取每家公司的子表数据。
library(tidyverse)
library(RSelenium)
library(rvest)
library(netstat)
library(data.table)
# Start a Selenium server and a Chrome client on a free port.
# rsDriver() already opens the browser session for the returned client, so no
# extra remDr$open() call is made here (a second open() would start another
# browser session and orphan the first one).
remote_driver <- rsDriver(
  browser = "chrome",
  chromever = "126.0.6478.182",
  verbose = FALSE,
  port = free_port()
)
remDr <- remote_driver$client
remDr$navigate("https://graphics.axios.com/2019-02-11-harris-polls/index.html?initialWidth=787&childId=av-2019-02-11-harris-polls-N8S34&parentTitle=The%20Axios%20Harris%20Poll%20100%20reputation%20rankings&parentUrl=https%3A%2F%2Fwww.axios.com%2F2019%2F03%2F06%2Faxios-harris-poll-corporate-reputations")
# Click the element that expands the main table.
expand_table <- remDr$findElement(using = 'xpath', '/html/body/div/div[2]/div[1]/span')
expand_table$clickElement()
# Locate the main table and hand the HTML source over to rvest.
# NOTE(review): getPageSource() presumably returns the source of the whole
# page rather than only this element - confirm if a single table is expected.
data_table <- remDr$findElement(using = 'xpath', '/html/body/div/div[2]/table')
data_table_html <- data_table$getPageSource()
page <- read_html(data_table_html %>% unlist())
# html_table() on a parsed document yields a list with one entry per <table>.
df <- html_table(page)
如有任何帮助,我们将不胜感激。谢谢!
我认为在这种情况下,需要写一个循环,依次展开每个子表并提取其 HTML 表格。我尝试用 findElements 定位“+”元素,但收到了错误。
# Attempt to collect every "+" expander element by class.
# This call fails: 'annotation expand' is two space-separated classes, and the
# "class name" locator rejects compound class names (see the Selenium message
# quoted right after this line).
# NOTE(review): a CSS selector such as '.annotation.expand' is presumably the
# usual workaround - verify against the page markup.
sub_tables <- remDr$findElements(using = 'class name', 'annotation expand')
Selenium 消息:不允许使用复合类名
除非必须使用 {RSelenium},否则在这种情况下直接从 Javascript 源码中提取数据似乎更简单。
如果您在加载该页面并操作表格时查看浏览器的“网络”(Network)选项卡,可以通过搜索(Windows 版 Chrome 中按 CTRL+F,例如搜索第一家公司“Wegmans”)找到实际的数据源。唯一有意义的匹配是一个压缩(minified)的 js 脚本,所有公司的全部明细都嵌入其中。让我们先从页面源码中提取脚本 url,然后用 {httr2} 获取脚本,并写出一个只返回创建数据对象那部分的正则表达式。此时我们就得到了一段有效的 Javascript。
我们可以在 V8 js 引擎中对其求值,并把结果对象转换为有效的 JSON 字符串,该字符串恰好可以很好地解析为半扁平的数据框。
代码与结果数据框:
library(stringr)
library(dplyr)
library(rvest)
library(httr2)
library(V8)
#> Using V8 engine 11.8.172.13
url_ <- "https://graphics.axios.com/2019-02-11-harris-polls/index.html"
# get link to javascript: pick the first <script> that actually carries a
# src attribute, so an inline script placed earlier cannot yield NA
js_attr_src <-
  read_html(url_) |>
  html_element("script[src]") |>
  html_attr("src")
js_attr_src
#> [1] "js/app.b1e00630c4751b7ccae0.min.js?b1e00630c4751b7ccae0"
# build full script src url & fetch it as text;
# the dot is escaped so that it only matches a literal "." in "index.html"
js_content_str <-
  str_replace(url_, "index\\.html$", js_attr_src) |>
  request() |>
  req_perform() |>
  resp_body_string()
# dataset is embedded in the minified js script,
# start of the dataset object:
# r.exports=[{"2019_rank":1,"2019_rq":83,industry:"Groceries",history:[...],dimensions:{AFFINITY:{score:81.9,rank:5},...},"2018_rank":2,"2018_rq":82.8,change:-1,company:"Wegmans"}
# end of the dataset object:
# ,TRAJECTORY:{score:51,rank:99}},change:null,company:"U.S. Government"}]}
# extract a part that would match with "\[\{.*?\}\]" and is between
# "r\.exports=" and "\}" (non-inclusive, positive lookbehind & lookahead)
js <- str_extract(js_content_str, "(?<=r\\.exports=)\\[\\{.*?\\}\\](?=\\})")
# str_extract() returns NA when the pattern does not match (e.g. the script
# layout changed); fail here with a clear message instead of passing NA on
stopifnot("dataset object not found in the js script" = !is.na(js))
# let's check if start and end of the extracted js snippet are what we expected
str_trunc(js, 80, side = "right")
#> [1] "[{\"2019_rank\":1,\"2019_rq\":83,industry:\"Groceries\",history:[{year:\"2009\",rank:..."
str_trunc(js, 80, side = "left")
#> [1] "...nk:99},TRAJECTORY:{score:51,rank:99}},change:null,company:\"U.S. Government\"}]"
# Run the extracted snippet through the V8 JavaScript engine and serialise
# the resulting object as a pretty-printed (2-space indented) JSON string.
ctx <- v8()
json_str <- ctx$eval(str_glue("JSON.stringify({js}, null, 2)"))
# Peek at the first 80 characters of the JSON string.
str_view(str_trunc(json_str, 80, side = "right"))
#> [1] │ [
#> │ {
#> │ "2019_rank": 1,
#> │ "2019_rq": 83,
#> │ "industry": "Groceries",
#> │ ...
# Peek at the last 80 characters of the JSON string.
str_view(str_trunc(json_str, 80, side = "left"))
#> [1] │ ... 99
#> │ }
#> │ },
#> │ "change": null,
#> │ "company": "U.S. Government"
#> │ }
#> │ ]
# Parse the JSON with nested data frames flattened into columns; the result
# is an almost flat tibble (history stays nested).
flat_tbl <- as_tibble(jsonlite::fromJSON(json_str, flatten = TRUE))