我抓取了一个网站,现在需要清理“service”列,该列是一个字符串。
在 fl_data 数据集的 service 列中,可以看到 Testing Services、Prevention Services 等多种服务。每种服务的名称位于换行符 `\n` 和 `:` 之间,但并非每一行都包含所有服务。
我需要将服务类型设置为列名称,并将值设置为内部元素。如果服务类型不在字符串中,那么它应该是 NA。
这是我的数据集:
# Base search URL for organization listings; the zero-based page index is
# appended to it to address one page of results.
url_base <- "https://npin.cdc.gov/search?=type%3Aorganization&page="

# Scrape every requested results page into a data frame and row-bind the
# pages into `raw_data`. Each column is the text of the nodes matched by one
# CSS selector, so this assumes every selector matches the same number of
# nodes on a page (data.frame() errors otherwise) -- TODO confirm on the site.
raw_data <- map_df(0:0, function(i) {
  cat(".") # progress marker: one dot per page fetched

  # paste0() rather than sprintf(): the URL contains a literal "%3A", which
  # sprintf() would parse as a conversion specification, consuming `i` and
  # corrupting the query string instead of appending the page number.
  pg <- read_html(paste0(url_base, i))

  data.frame(
    org_name = html_text2(
      html_nodes(pg, ".block-field-blocknodeorganizationtitle")
    ),
    street = html_text(html_nodes(pg, ".address-line1")),
    city = html_text(html_nodes(pg, ".locality")),
    state = html_text(html_nodes(pg, ".administrative-area")),
    zip = html_text(html_nodes(pg, ".postal-code")),
    service = html_text2(html_nodes(pg, ".services-fieldset")),
    stringsAsFactors = FALSE
  )
})
# Restrict the scraped organizations to Florida, then strip the boilerplate
# eligibility notice from the free-text `service` column (first match only).
fl_data <- raw_data |>
  filter(state == "FL") |>
  mutate(
    service = str_remove(
      service,
      "Services\nPlease contact organization for eligibility requirements"
    )
  )
您可以使用 for 循环来提取各项服务及其对应的项目。在 `result` 中,各项目之间以 `,` 分隔。
library(tidyverse)
library(rvest)
# Fetch the first page of organization search results.
url <- "https://npin.cdc.gov/search?=type%3Aorganization&page=0"
content <- read_html(url)

# One ".services-fieldset" node and one title node are expected per
# organization; they are paired by position below.
services <- content %>% html_nodes(".services-fieldset")
org_name <- content %>%
  html_nodes(".block-field-blocknodeorganizationtitle") %>%
  html_text2()

# Empty template so `result` keeps its three character columns even when the
# page yields no services at all.
empty_result <- data.frame(
  org_name = character(),
  service = character(),
  item = character()
)

# Collect one single-row data frame per (organization, service type) pair and
# bind them once at the end instead of growing `result` row by row.
rows <- list()
for (i in seq_along(services)) {
  temp <- services[i] %>% html_nodes(".field__items")

  # The first ".field__items" group is skipped, as in the original loop
  # (presumably the eligibility notice -- confirm against the page markup).
  # seq_along(temp)[-1] is empty when there are fewer than two groups,
  # whereas 2:length(temp) would count backwards.
  for (j in seq_along(temp)[-1]) {
    # html_node() yields NA when the group has no label, keeping the row
    # well-formed instead of producing a short vector.
    label <- temp[j] %>% html_node(".field-label") %>% html_text()
    label <- gsub(":", "", label, fixed = TRUE)

    items <- temp[j] %>% html_nodes(".field__item") %>% html_text()

    rows[[length(rows) + 1L]] <- data.frame(
      org_name = org_name[i],
      service = label,
      item = paste0(items, collapse = ",")
    )
  }
}

result <- bind_rows(c(list(empty_result), rows))
result |> tibble()
#> # A tibble: 32 × 3
#> org_name service item
#> <chr> <chr> <chr>
#> 1 Eastport Health Care Incorporated Testing Services Gonorrhea Test…
#> 2 Eastport Health Care Incorporated Care and Treatment Services Family Plannin…
#> 3 Alamosa County Public Health Testing Services TB Testing
#> 4 Alamosa County Public Health Care and Treatment Services Mpox Vaccine,H…
#> 5 Alamo Navajo Health Center Testing Services TB Testing,Gon…
#> 6 Alamo Navajo Health Center Prevention Services TB Prevention/…
#> 7 Alamo Navajo Health Center Care and Treatment Services Family Plannin…
#> 8 AIDS Resource Group Testing Services Hepatitis C Te…
#> 9 AIDS Resource Group Prevention Services STD/STI Preven…
#> 10 AIDS Resource Group Support Services Support Groups…
#> # ℹ 22 more rows
创建于 2024-03-14,使用 reprex v2.1.0