我使用下面的代码抓取瑞士(CH)医生的数据。我已经能够获取医生的姓名以及邮政编码和州名,但无法获取完整的地址信息——这些信息只有在点击医生的照片之后才会显示。
# Packages this scraping script depends on; the vector is passed to
# install_load(), which attaches each one (installing it first if missing).
requiredpackages <- c(
  "rvest", "httr", "RSelenium", "dplyr", "tidyverse", "netstat",
  "seleniumPipes", "data.table", "ggplot2", "forcats", "robotstxt",
  "Rcrawler", "xlsx", "arsenal", "mailR"
)
#' Attach packages, installing any that are not yet available
#'
#' @param packages Character vector of package names.
#' @return `NULL`, invisibly. Called for its side effect of attaching each
#'   package to the search path.
install_load <- function(packages) {
  for (p in packages) {
    # system.file() returns "" for a package that is not installed. This is
    # much cheaper than rebuilding the whole installed.packages() matrix on
    # every iteration just to look up a single name.
    if (!nzchar(system.file(package = p))) {
      install.packages(p)
    }
    # Attach in both cases; library() still errors if the install failed.
    library(p, character.only = TRUE)
  }
  invisible(NULL)
}
# Attach every dependency listed in requiredpackages.
install_load(requiredpackages)

# Start a Selenium server with a Chrome session on a free local port.
rs_driver_object <- rsDriver(
  browser = "chrome",
  chromever = "105.0.5195.52",
  verbose = FALSE,
  port = free_port()
)
remDr <- rs_driver_object$client

remDr$navigate("https://www.doctorfmh.ch/en")
# The parentheses are required: without them the method is only looked up
# and never called, so no cookies would be deleted.
remDr$deleteAllCookies()
remDr$maxWindowSize()

# Type the search term into the search field and submit it with Enter.
search_box <- remDr$findElement(using = "id", "input-50")
search_box$clickElement()
search_box_click <- search_box$sendKeysToElement(list("Bern", key = "enter"))

# Scroll to the bottom of the page 45 times, pausing 3 seconds each time.
# NOTE(review): presumably this makes the page load further results as it
# scrolls -- confirm that 45 rounds are enough for the full result list.
for (i in seq_len(45L)) {
  remDr$executeScript("window.scrollTo(0, document.body.scrollHeight);")
  Sys.sleep(3)
}

# An XPath step needs a node test: "//*[...]" matches any element, whereas
# "//[...]" is a syntax error.
doc_titles <- remDr$findElements(
  using = "xpath",
  '//*[contains(concat( " ", @class, " " ), concat( " ", "ttl", " " ))]'
)
# vapply() checks that every element yields exactly one string.
Titles <- vapply(
  doc_titles,
  function(x) unlist(x$getElementText()),
  character(1)
)

doc_locations <- remDr$findElements(
  using = "xpath",
  '//*[contains(concat( " ", @class, " " ), concat( " ", "subttl", " " ))]//span'
)
Locations <- vapply(
  doc_locations,
  function(x) unlist(x$getElementText()),
  character(1)
)

# One row per result; data.frame() errors if the two vectors differ in length.
DF <- data.frame(Titles, Locations)
我无法抓取的信息已用红色标出。在上面代码得到的结果列表中点击任意一位医生,即可看到这些信息。
这是您可以考虑的方法:
library(RSelenium)

url <- "https://www.doctorfmh.ch/en"

# Start a standalone Selenium/Firefox container, mapped to local port 4445.
# shell() exists only in R for Windows, so fall back to system() elsewhere.
docker_cmd <- "docker run -d -p 4445:4444 selenium/standalone-firefox"
if (.Platform$OS.type == "windows") {
  shell(docker_cmd)
} else {
  system(docker_cmd)
}

remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4445L,
  browserName = "firefox"
)
remDr$open()

# Look up one element by XPath; returns NULL instead of raising an error
# when the element cannot be found.
find_or_null <- function(xpath) {
  tryCatch(remDr$findElement("xpath", xpath), error = function(e) NULL)
}

# Number of search results (doctors) to visit.
n_doctors <- 4L
list_Adress <- vector("list", n_doctors)

# For each result: reload the site, repeat the search, open the i-th result
# and read its address block. The numbered print() calls are progress markers.
for (i in seq_len(n_doctors)) {
  print(i)
  remDr$navigate(url)
  print("1")
  Sys.sleep(3)

  remDr$deleteAllCookies()
  remDr$maxWindowSize()
  print("2")
  Sys.sleep(3)

  remDr$executeScript("scroll(0,700);")
  print("3")
  Sys.sleep(7)

  search_box <- remDr$findElement(using = "id", "input-50")
  search_box$sendKeysToElement(list("Bern", key = "enter"))
  print("4")
  Sys.sleep(10)

  # NOTE(review): the search term is typed a second time here; presumably a
  # workaround for the first submission not registering -- confirm before
  # removing.
  search_box <- remDr$findElement(using = "id", "input-50")
  search_box$sendKeysToElement(list("Bern", key = "enter"))
  search_box$clickElement()
  print("5")
  Sys.sleep(3)

  start_Search <- remDr$findElement("css selector", "#app > div > div.wrp.parent > article > div > section > form > div.cf.mb-3 > div:nth-child(6) > div:nth-child(1) > button > span")
  start_Search$clickElement()
  remDr$executeScript("scroll(0,10000);")
  print("6")
  Sys.sleep(3)

  # Absolute XPath of the i-th entry in the result list.
  xpath <- paste0("/html/body/div[7]/div/div[2]/div[2]/div/article/div/section/div/div/div[2]/div/div[1]/div[2]/div[", i, "]/div/div[1]/div/div[3]")
  web_Obj_Doctor <- remDr$findElement("xpath", xpath)
  print("7")
  Sys.sleep(3)

  web_Obj_Doctor$clickElement()
  print("8")
  Sys.sleep(3)

  remDr$executeScript("scroll(0,1200);")

  # Try the first address location, then a second one if it is not found.
  xpath_Adress <- "/html/body/div[7]/div/div[2]/div[2]/div/article/div/section/div/div/div[2]/div/div[3]/div[2]/div/div[1]/div/div[3]/div/div"
  web_Obj_Address <- find_or_null(xpath_Adress)
  if (is.null(web_Obj_Address)) {
    xpath_Adress <- "/html/body/div[7]/div/div[2]/div[2]/div/article/div/section/div/div/div[2]/div/div[3]/div[2]/div/div[1]/div/div[2]/div/div/div[1]/p"
    web_Obj_Address <- find_or_null(xpath_Adress)
  }

  # Store NA when neither location matched, so that one missing address does
  # not abort the whole loop.
  list_Adress[[i]] <- if (is.null(web_Obj_Address)) {
    NA_character_
  } else {
    web_Obj_Address$getElementText()[[1]]
  }
  print("9")
  Sys.sleep(3)
}

list_Adress
[[1]]
[1] "Address of workplace, Senior Physician\nKinderklinik Inselspital\nPneumologie\nFreiburgstrasse 15\n3010 Bern, BE\n\nTel. 031 632 21 11\n location_on\n\nphone"
[[2]]
[1] "Address of workplace, Senior Physician\nKinderklinik Inselspital\nPneumologie\nFreiburgstrasse 15\n3010 Bern, BE\n\nTel. 031 632 21 11\n location_on\n\nphone"
[[3]]
[1] "Hausärztepraxis OST\nUntere Zollgasse 28\n3072 Ostermundigen, BE"
[[4]]
[1] "Herr Dr. med.\nRolf Abderhalden\nArztpraxis\nJungfraustrasse 15A\n3600 Thun, BE"