抓取问题和回答的代码工作正常,除非一个帖子的回答有多页

问题描述 投票:0回答:1

下面的代码可以抓取所有问题和回答,以及它们的作者和日期,但是我无法弄清楚如何抓取分布在多个页面上的回答,例如这里的第二个帖子

https://www.healthboards.com/boards/aspergers-syndrome/index2.html

阿斯伯格和自己说话

回答共有两页:第一页有15条,第二页有3条,而我只抓取到了第一页的回答

library(rvest)
library(dplyr)
library(stringr)
library(purrr)
library(tidyr)
library(RCurl)
library(xlsx)
#install.packages("xlsx")
# Scrape thread titles, thread links, authors and number of views
# from a single forum index page.

url <- "https://www.healthboards.com/boards/aspergers-syndrome/index2.html"

# Parse the index page once; every extraction below queries this document.
h <- read_html(url)

# Thread titles: text of the anchors matched in the thread list.
threads <- html_text(html_nodes(h, "#threadslist .alt1 div > a"))
threads

# Thread links: href attribute of the same anchors.
thread_links <- html_attr(
  html_nodes(h, "#threadslist .alt1 div > a"),
  name = "href"
)
thread_links


# Thread starters: text of the small-font div, with tabs, carriage
# returns and newlines removed.
thread_starters <- html_text(
  html_nodes(h, "#threadslist .alt1 div.smallfont")
)
thread_starters <- str_replace_all(
  thread_starters,
  pattern = "\t|\r|\n",
  replacement = ""
)

thread_starters

# View counts: drop the thousands separators before converting to numeric.
views <- html_text(html_nodes(h, ".alt2:nth-child(6)"))
views <- as.numeric(
  str_replace_all(views, pattern = ",", replacement = "")
)

# Custom functions to scrape author IDs and posts

# Extract the text of every post found in one thread page.
#
# link: handed unchanged to read_html(). Elsewhere in this script the
#       function is called on page HTML already downloaded with getURL().
# Returns a character vector, one element per node matched by the selector,
# with tabs/CR/LF removed and surrounding whitespace trimmed.
scrape_posts <- function(link){
  post_nodes <- html_nodes(read_html(link), css = ".smallfont~ hr+ div")
  raw_text <- html_text(post_nodes)
  flattened <- str_replace_all(raw_text, pattern = "\t|\r|\n", replacement = "")
  str_trim(flattened)
}



# Extract the text of the leading header cell of each post table in one
# thread page (presumably the post date, going by the function name).
#
# link: handed unchanged to read_html().
# Returns a character vector, one element per `td.thead:first-child` found
# inside a table whose id starts with "post", with tabs/CR/LF removed and
# surrounding whitespace trimmed.
scrape_dates <- function(link){
  header_cells <- html_nodes(
    read_html(link),
    css = "table[id^='post'] td.thead:first-child"
  )
  raw_text <- html_text(header_cells)
  flattened <- str_replace_all(raw_text, pattern = "\t|\r|\n", replacement = "")
  str_trim(flattened)
}




# Extract the text of every <div> whose id contains "postmenu" in one
# thread page (presumably the author menu of each post).
#
# link: handed unchanged to read_html().
# Returns a character vector in document order, with tabs/CR/LF removed and
# surrounding whitespace trimmed.
scrape_author_ids <- function(link){
  divs <- html_nodes(read_html(link), "div")

  # Divs without an id yield NA here; which() skips them.
  has_postmenu_id <- str_detect(html_attr(divs, "id"), pattern = "postmenu")
  menu_divs <- divs[which(has_postmenu_id)]

  menu_text <- html_text(menu_divs)
  flattened <- str_replace_all(menu_text, pattern = "\t|\r|\n", replacement = "")
  str_trim(flattened)
}


# Download the raw HTML of every thread once; the three scrapers below are
# all applied to these same downloaded pages.
htmls <- map(thread_links, getURL)

# Create master dataset

# Start with one row per thread carrying list-columns (authors, posts,
# dates), then expand to one row per post.
master_data <- 
  tibble(threads, thread_starters, thread_links) %>%
  mutate(
    post_author_id = map(htmls, scrape_author_ids),
    post = map(htmls, scrape_posts),
    fec = map(htmls, scrape_dates)
  ) %>%
  select(threads:post_author_id, post, thread_links, fec) %>%
  # Name the list-columns explicitly: calling unnest() without `cols` is
  # deprecated since tidyr 1.0.0.
  unnest(cols = c(post_author_id, post, fec))

master_data$thread_starters
threads
# A bare `post` used to be evaluated here, before `post` is assigned below,
# which stops a fresh run with "object 'post' not found"; it was removed.
titles <- master_data$threads
# NOTE: the misspelled name is kept on purpose because data.frame() below
# uses it as the column header of the exported sheet.
therad_starters <- master_data$thread_starters
#views<-master_data$views

post_author <- master_data$post_author_id
post <- master_data$post
fech <- master_data$fec
employ.data <- data.frame(titles, therad_starters, post_author, post, fech)


write.xlsx(employ.data, "C:/2.xlsx")

无法弄清楚如何包含第二页..

r web-scraping rvest
1个回答
1
投票

快速查看了您的代码和该网站:页面中有一个 class 为 vbmenu_control 的 td 元素,其中包含页数信息(在您的例子中是"第2页,共2页")。您可以用一个简单的正则表达式提取总页数,例如

# Extract the total number of pages from a pager label such as "page 2 of 2".
#
# label: character vector of pager labels ("page X of Y", case-insensitive).
# Returns a numeric vector holding Y for each label, or NA where the label
# does not have that form. Unlike a pattern hard-coded to "page 2 of ", this
# works whatever the current page is.
page_count <- function(label) {
  pattern <- "^\\s*page\\s+\\d+\\s+of\\s+(\\d+)\\s*$"
  total <- ifelse(
    grepl(pattern, label, ignore.case = TRUE),
    sub(pattern, "\\1", label, ignore.case = TRUE),
    NA_character_
  )
  as.numeric(total)
}

a <- "page 2 of 2"
b <- page_count(a)
> b
[1] 2

并添加条件 if (b > 1)。如果条件为 TRUE,就可以循环抓取形如 ...-talk-yourself-i.html 的链接,其中 i 依次取从 1 到总页数的序列中的每个值。

© www.soinside.com 2019 - 2024. All rights reserved.