我想用 R 抓取大学课程目录。我的代码已经相当不错了,但是学科和子学科的课程分配尚未按照我想要的方式工作。
这是我的代码:
# loading needed libraries -----------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, rvest, xml2)
html_code <- read_html(https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=93fdb6be384979f7300d263ba0c094be&tx_hbulvp_pi1%5Bsem%5D=39)
# Eine rekursive Funktion, um Informationen unter jedem h-Tags zu sammeln
extract_module_info <- function(node, module_path = c()) {
# Basisfall: Wenn der Knoten leer ist, beenden Sie die Rekursion
if (length(node) == 0) return(tibble())
current_tag <- node %>% html_name()
current_text <- node %>% html_text(trim = TRUE)
# Aktualisieren des Pfades mit dem aktuellen Modul/Submodul
new_path <- c(module_path, current_text)
# Suchen nach dem nächsten div, das die Details enthält
details_node <- node %>% html_node(xpath = "./following-sibling::div[1]")
# Sammeln von Detailinformationen, wenn vorhanden
if (!is.null(details_node) && length(html_nodes(details_node, 'tr')) > 0) {
details <- html_nodes(details_node, 'tr') %>%
map_df(~{
tibble(
ModulePath = list(new_path),
CourseDesc = html_nodes(.x, '.expander') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .),
CourseElse = html_nodes(.x, 'td:nth-child(2)') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .),
CourseVAK = html_nodes(.x, 'td:nth-child(1)') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .),
CourseTitle = html_nodes(.x, 'strong') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .),
CourseTeacher = html_nodes(.x, 'td ~ td + td') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .)
)
})
} else {
details <- tibble(ModulePath = list(new_path))
}
# Rekursiver Abstieg zum nächsten h-Tag, falls vorhanden
next_node <- node %>% html_node(xpath = "./following-sibling::*[self::h2 or self::h3 or self::h4][1]")
child_details <- extract_module_info(next_node, new_path)
# Kombinieren der aktuellen Details mit den rekursiv gesammelten Details
bind_rows(details, child_details)
}
# Anwendung der Funktion auf das gesamte Dokument, startend mit dem ersten h2-Tag
results <- html_nodes(html_code, 'h2') %>% map_df(~extract_module_info(.x))
# Ausgabe der Ergebnisse
print(results)
问题出现在变量 ModulePath 上。让我们以 VAK ID SZHB 0806 为例来看看课程设置:
> results |> slice(544) |> select(CourseVAK )
# A tibble: 1 × 1
CourseVAK
<chr>
1 SZHB 0806
还有 ModulePath 的值:
> results |> slice(544) |> select(ModulePath) |> pull()
[[1]]
[1] "Language Center of the Universities in the State of Bremen"
[2] "Arabic"
[3] "Chinese"
[4] "German"
[5] "German sign language"
[6] "English"
[7] "French"
[8] "Hebrew (modern)"
[9] "Italian"
[10] "Japanese"
[11] "Catalan"
[12] "Korean"
[13] "Croatian"
[14] "Kurdish"
[15] "Latin"
[16] "Dutch"
[17] "Polish"
不幸的是,所有 h4 标题都已达到相应的课程报价。
我想要的是这样的结果:
[[1]]
[1] "Language Center of the Universities in the State of Bremen"
[2] "Polish"
当然,您现在可以删除除第一个和最后一个之外的所有元素。但考虑到我不知道其他页面的结构,这是一种可靠的方法吗?也许标题将来会更加嵌套。
很酷的代码和不错的项目。我知道当 html 代码没有正确嵌套时的痛苦。总是让事情变得更困难。
我重写了代码的逻辑,因为对我来说,利用不同样式标题的隐式排序似乎更容易。如果你利用保持这种方式的优势,你可以使用 tidyverse 中漂亮的
fill
函数复制以前节点的值。
请仔细检查,所有内容仍然按预期提取值。
编辑:我添加了一项修改,该修改还删除了偶尔的“p Strong”标签作为标题。当然,它变得更加复杂,只是没有时间考虑更漂亮的东西,但我认为它有效。
# loading needed libraries -----------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, rvest, xml2)
html_code_1 <- read_html("https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=93fdb6be384979f7300d263ba0c094be&tx_hbulvp_pi1%5Bsem%5D=39")
html_code_2 <- read_html("https://www.uni-bremen.de/studium/starten-studieren/veranstaltungsverzeichnis?tx_hbulvp_pi1%5Bmodule%5D=54c6fd5c0b74c8c6b7f81ab2939a7196&tx_hbulvp_pi1%5Bsem%5D=40")
result <- html_elements(html_code_2, '.tx-hbulvp-pi1-module') |>
## map over the different modules
map_dfr(function(main_module) {
## extract the children of these, which are studium generale, etc.
html_children(main_module) |>
map_dfr(function(headers) {
if (as.character(headers) |> str_detect("h2")) {
return(tibble(h2 = html_text(headers)))
} else if (as.character(headers) |> str_detect("h3")) {
return(tibble(h3 = html_text(headers)))
} else if (as.character(headers) |> str_detect("h4")) {
return(tibble(h4 = html_text(headers)))
} else if (as.character(headers) |> str_detect("<p")) {
return(tibble(p_strong = html_text(headers)))
} else if (as.character(headers) |> str_detect("div")) {
content <- map_dfr(headers |> html_elements("tr"),
function(tr) {
tibble(
CourseDesc = html_nodes(tr, '.expander') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .),
CourseElse = html_nodes(tr, 'td:nth-child(2)') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .),
CourseVAK = html_nodes(tr, 'td:nth-child(1)') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .),
CourseTitle = html_nodes(tr, 'strong') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .),
CourseTeacher = html_nodes(tr, 'td ~ td + td') %>%
html_text(trim = TRUE) %>%
ifelse(length(.) == 0, NA_character_, .)
)
})
return(content)
}
})
}) |>
fill(h2, .direction = "down") |>
group_by(h2) |>
fill(h3, .direction = "down") |>
group_by(h3,h2) |>
fill(h4, .direction = "down")|>
group_by(h2,h3,h4)
if("p_strong" %in% names(result)){
result <- result |>
fill(p_strong, .direction = "down") |>
filter(!is.na(CourseDesc)) |>
select(h2, h3, h4,p_strong, everything())
} else {
result <- result |>
filter(!is.na(CourseDesc)) |>
select(h2, h3, h4, everything())
}