我试图从 IMDB 页面上抓取电影的标准数据(标题、年份、评分、流派),但在获取流派这一步卡住了。页面上已经不再有以前那种 .genre 类(在线教程/操作指南中通常使用的就是它),而我对 rvest 还不够熟悉,不知道该如何提取这项数据。下面的代码主要由 ChatGPT 生成,我一直在对它进行排错。
如何获取流派?任何帮助将不胜感激。
# Load necessary libraries
library(rvest)
library(dplyr)
#' Scrape basic movie information from IMDB
#'
#' For every title in `movie_names` this runs an IMDB title search, follows
#' the first hit and reads the year, rating, genres and runtime.
#'
#' @param movie_names Character vector of movie titles to search for.
#' @return A data frame with one row per title and the columns `Movie`,
#'   `Year` (numeric), `Rating` (numeric), `Genres` (genre names joined with
#'   ", ") and `Runtime`. Values that cannot be located on the page are `NA`.
#'   An empty `movie_names` yields a zero-row data frame with these columns.
scrape_imdb <- function(movie_names) {
  base_url <- "https://www.imdb.com/find?q="

  # Zero-row template; also the return value when there is nothing to scrape.
  empty_df <- data.frame(
    Movie = character(),
    Year = numeric(),
    Rating = numeric(),
    Genres = character(),
    Runtime = character(),
    stringsAsFactors = FALSE
  )
  if (length(movie_names) == 0) {
    return(empty_df)
  }

  # Scrape a single title and return it as a one-row data frame.
  scrape_one <- function(movie_name) {
    # reserved = TRUE so that characters such as "&", "?" or "#" inside a
    # title are escaped instead of being read as part of the query string.
    search_url <- paste0(
      base_url,
      URLencode(movie_name, reserved = TRUE),
      "&s=tt"
    )
    search_page <- read_html(search_url)

    # Link of the first search hit; NA when the search returned nothing.
    first_href <- search_page |>
      html_elements(".ipc-metadata-list-summary-item__c a") |>
      html_attr("href")
    first_href <- first_href[1]

    if (is.na(first_href)) {
      warning(
        "No IMDB search result for '", movie_name, "'",
        call. = FALSE
      )
      return(data.frame(
        Movie = movie_name,
        Year = NA_real_,
        Rating = NA_real_,
        Genres = NA_character_,
        Runtime = NA_character_,
        stringsAsFactors = FALSE
      ))
    }

    # The year is taken from the first <span> of the search result list.
    movie_year <- search_page |>
      html_elements(".ipc-metadata-list-summary-item__c span") |>
      html_text()
    movie_year <- movie_year[1]

    movie_page <- read_html(paste0("https://www.imdb.com", first_href))

    # NOTE(review): the ".sc-..." class names below look auto-generated and
    # may change whenever IMDB redeploys; re-check them if Rating or Runtime
    # start coming back as NA.
    movie_rating <- movie_page |>
      html_elements(".sc-bde20123-1.cMEQkK") |>
      html_text()
    movie_rating <- movie_rating[1]

    # Third <li> of the matched list; presumably the runtime entry - verify
    # against the live page.
    movie_runtime <- movie_page |>
      html_elements(".sc-d8941411-2.cdJsTz li") |>
      html_text()
    movie_runtime <- movie_runtime[3]

    # Genres are rendered as link "buttons" whose href contains "genre", so
    # match on that rather than on a class name.
    genre_labels <- movie_page |>
      html_elements(
        xpath = "//a[@role='button'][contains(@href,'genre')]"
      ) |>
      html_text()
    movie_genres <- if (length(genre_labels) > 0) {
      # unique() guards against the same genre link being matched twice.
      paste(unique(genre_labels), collapse = ", ")
    } else {
      NA_character_
    }

    data.frame(
      Movie = movie_name,
      Year = as.numeric(movie_year),
      Rating = as.numeric(movie_rating),
      Genres = movie_genres,
      Runtime = movie_runtime,
      stringsAsFactors = FALSE
    )
  }

  # Build all rows first and bind once instead of growing a data frame
  # inside a loop.
  rows <- lapply(movie_names, scrape_one)
  do.call(rbind, rows)
}
# Example usage: look up three well-known titles and show the scraped table
movie_names <- c(
  "The Matrix",
  "Inception",
  "Interstellar"
)
movie_info <- scrape_imdb(movie_names)
print(movie_info)
以下代码:
# Fetch one title page and list the text of every genre link on it.
html <- read_html("https://www.imdb.com/title/tt10838180/")
# Genre links are anchors with role="button" whose href mentions "genre".
html_text(
  html_elements(
    html,
    xpath = "//a[@role='button'][contains(@href,'genre')]"
  )
)
将输出:
[1] "Action" "Sci-Fi"