在 R 中取消列出抓取的信息?

问题描述 投票:0回答:1

我使用下面的代码来抓取瑞士(CH)医生的数据。我已经成功获取了医生的姓名以及邮政编码和州名,但很难获取完整的地址信息——这些信息只有在单击医生的图片之后才会显示。

# Packages the scraping script needs attached, listed in load order.
requiredpackages <- c(
  "rvest",
  "httr",
  "RSelenium",
  "dplyr",
  "tidyverse",
  "netstat",
  "seleniumPipes",
  "data.table",
  "ggplot2",
  "forcats",
  "robotstxt",
  "Rcrawler",
  "xlsx",
  "arsenal",
  "mailR"
)

#' Attach packages, installing any that are not yet available
#'
#' @param packages Character vector of package names.
#' @return `packages`, invisibly. Called for its side effect of attaching
#'   every listed package to the search path.
install_load <- function(packages) {
  for (p in packages) {
    # system.file() returns "" when the package cannot be found. This is the
    # cheap per-package check; installed.packages() scans every library and
    # was previously re-run on each iteration.
    if (!nzchar(system.file(package = p))) {
      install.packages(p)
    }
    library(p, character.only = TRUE)
  }
  invisible(packages)
}

# Attach (installing when necessary) every package the scraper relies on.
install_load(requiredpackages)

# Start a Selenium server together with a Chrome session on an unused port.
rs_driver_object <- rsDriver(
  browser = "chrome",
  chromever = "105.0.5195.52",
  verbose = FALSE,
  port = free_port()
)

remDr <- rs_driver_object$client
remDr$navigate("https://www.doctorfmh.ch/en")
# The parentheses matter: without them the method is only referenced,
# never called, and the cookies stay in place.
remDr$deleteAllCookies()
remDr$maxWindowSize()

# Type the search term into the search field and submit it with Enter.
search_box <- remDr$findElement(using = "id", "input-50")
search_box$clickElement()
search_box_click <- search_box$sendKeysToElement(list("Bern", key = "enter"))

# Scroll to the bottom repeatedly, pausing between scrolls, so that more of
# the result list is present in the page before elements are collected.
for (i in seq_len(45)) {
  remDr$executeScript("window.scrollTo(0, document.body.scrollHeight);")
  Sys.sleep(3)
}

# An XPath step needs a node test, so the expression must start with "//*[".
# A bare "//[" is not valid XPath and the lookup fails.
doc_titles <- remDr$findElements(
  using = "xpath",
  '//*[contains(concat( " ", @class, " " ), concat( " ", "ttl", " " ))]'
)

Titles <- lapply(doc_titles, function(x) {
  x$getElementText() %>% unlist()
}) %>% flatten_chr()

doc_locations <- remDr$findElements(
  using = "xpath",
  '//*[contains(concat( " ", @class, " " ), concat( " ", "subttl", " " ))]//span'
)

Locations <- lapply(doc_locations, function(x) {
  x$getElementText() %>% unlist()
}) %>% flatten_chr()

# One row per doctor; data.frame() requires both vectors to line up.
DF <- data.frame(Titles, Locations)

我用红色标出了我无法抓取的信息。在上面代码得到的结果列表中单击每一位医生,即可看到这些信息。

enter image description here

r web-scraping rselenium
1个回答
1
投票

这是您可以考虑的方法:

library(RSelenium)
url <- "https://www.doctorfmh.ch/en"
# Launch a standalone Firefox Selenium server in Docker, exposed on port 4445.
# NOTE: shell() exists only in R for Windows; on other platforms start the
# container with system() instead.
shell("docker run -d -p 4445:4444 selenium/standalone-firefox")
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4445L,
  browserName = "firefox"
)

remDr$open()

# Look up a single element by XPath; returns NULL instead of raising an
# error when the lookup fails.
find_or_null <- function(xpath) {
  tryCatch(remDr$findElement("xpath", xpath), error = function(e) NULL)
}

# Number of doctors (result-list entries) to open.
n_doctors <- 4L
list_Adress <- vector("list", n_doctors)

# Each iteration repeats the whole search from the start page and then opens
# the i-th entry of the result list. The numbered prints are progress markers.
for (i in seq_len(n_doctors)) {
  print(i)

  remDr$navigate(url)

  print("1")
  Sys.sleep(3)

  remDr$deleteAllCookies()
  remDr$maxWindowSize()

  print("2")
  Sys.sleep(3)

  remDr$executeScript("scroll(0,700);")

  print("3")
  Sys.sleep(7)

  search_box <- remDr$findElement(using = "id", "input-50")
  search_box$sendKeysToElement(list("Bern", key = "enter"))

  print("4")
  Sys.sleep(10)

  # The search term is sent a second time, exactly as in the original flow.
  search_box <- remDr$findElement(using = "id", "input-50")
  search_box$sendKeysToElement(list("Bern", key = "enter"))

  search_box$clickElement()

  print("5")
  Sys.sleep(3)

  start_Search <- remDr$findElement(
    "css selector",
    "#app > div > div.wrp.parent > article > div > section > form > div.cf.mb-3 > div:nth-child(6) > div:nth-child(1) > button > span"
  )
  start_Search$clickElement()

  remDr$executeScript("scroll(0,10000);")

  print("6")
  Sys.sleep(3)

  # Absolute XPath of the i-th entry in the result list.
  xpath <- paste0(
    "/html/body/div[7]/div/div[2]/div[2]/div/article/div/section/div/div/div[2]/div/div[1]/div[2]/div[",
    i,
    "]/div/div[1]/div/div[3]"
  )
  web_Obj_Doctor <- remDr$findElement("xpath", xpath)

  print("7")
  Sys.sleep(3)

  web_Obj_Doctor$clickElement()

  print("8")
  Sys.sleep(3)

  remDr$executeScript("scroll(0,1200);")

  # Try the first address location; fall back to the second one if it is
  # not found.
  xpath_Adress <- "/html/body/div[7]/div/div[2]/div[2]/div/article/div/section/div/div/div[2]/div/div[3]/div[2]/div/div[1]/div/div[3]/div/div"
  web_Obj_Address <- find_or_null(xpath_Adress)

  if (is.null(web_Obj_Address)) {
    xpath_Adress <- "/html/body/div[7]/div/div[2]/div[2]/div/article/div/section/div/div/div[2]/div/div[3]/div[2]/div/div[1]/div/div[2]/div/div/div[1]/p"
    web_Obj_Address <- find_or_null(xpath_Adress)
  }

  # Record NA when neither location matched, so one missing address does
  # not abort the whole loop.
  list_Adress[[i]] <- if (is.null(web_Obj_Address)) {
    NA_character_
  } else {
    web_Obj_Address$getElementText()[[1]]
  }

  print("9")
  Sys.sleep(3)
}


list_Adress

[[1]]
[1] "Address of workplace, Senior Physician\nKinderklinik Inselspital\nPneumologie\nFreiburgstrasse 15\n3010 Bern, BE\n\nTel.   031 632 21 11\n    location_on\n\nphone"

[[2]]
[1] "Address of workplace, Senior Physician\nKinderklinik Inselspital\nPneumologie\nFreiburgstrasse 15\n3010 Bern, BE\n\nTel.   031 632 21 11\n    location_on\n\nphone"

[[3]]
[1] "Hausärztepraxis OST\nUntere Zollgasse 28\n3072 Ostermundigen, BE"

[[4]]
[1] "Herr Dr. med.\nRolf Abderhalden\nArztpraxis\nJungfraustrasse 15A\n3600 Thun, BE"
© www.soinside.com 2019 - 2024. All rights reserved.