我正在从一组特定对象的页面中提取不同项目的信息。编写代码时,我所用的选择器在第一个页面上能取到我要找的信息。但是,每个对象提供的信息条目有多有少,结果最终得到的矩阵中含有许多 NA 值,而且信息的顺序也是错的。我检查了这些页面,发现同一个选择器在某些对象的页面上取到的是一种信息,而在其他对象的页面上取到的却是另一种信息。
有没有一种方法,可以按所需变量的名称来获取对应的信息,而不受页面结构或信息条目数量变化的影响?
第一部分(向量 Var 是通过先前的过程获得的):
# EOL data-page URLs to scrape; each one is built from its numeric page id.
Var <- paste0(
  "https://eol.org/pages/",
  c(
    "401504", "3089826", "52361", "2967667", "587416",
    "3096662", "3096667", "18009694", "2967662", "2967669"
  ),
  "/data"
)
这是我的代码:
# Scrape the species name plus a fixed set of temperature and depth readings
# from one data page, using positional CSS selectors.
#
# Arguments:
#   url  Address of the page; passed straight to read_html().
#
# Returns:
#   An unnamed character vector of length 7, in this order:
#     name, Tmax, Tmin, Tmin (extra), Dmax, Dmax (extra), Dmin (extra).
#   An element is NA when its selector matches nothing on the page.
#
# Fixes relative to the previous version:
#   * the "extra" depth values are now read from their own nodes (they used to
#     be copies of the Dmin / Dmax nodes because of a copy-paste slip);
#   * Dmax is cleaned from its own text instead of from the Dmax (extra) text,
#     and its " m" unit is actually stripped;
#   * the result is returned visibly instead of through a trailing assignment.
# Values that were computed but never returned (Dmin, Tmax extra) are no longer
# fetched; the returned vector keeps its original length and order.
GiveMeData <- function(url) {
  page <- read_html(url)

  name_selector <- paste0(
    "body > div.l-basic-main > div.l-tabs > div > div > ",
    "div.names-wrapper > div.names > h1 > i"
  )

  # Selector for the value cell of the list item at a given position.
  trait_selector <- function(position) {
    sprintf(
      paste0(
        "body > div.l-basic-main > div.l-content > div > ",
        "div.l-below-filters > ul > li:nth-child(%d) > ",
        "div.trait-data > div.trait-val"
      ),
      position
    )
  }

  # Text of the value cell at `position`, with the trailing unit and any
  # remaining newlines removed.
  trait_text <- function(position, unit) {
    raw <- html_text(html_node(page, trait_selector(position)))
    raw <- gsub(paste0(unit, "\n"), "", raw, fixed = TRUE)
    gsub("\n", "", raw, fixed = TRUE)
  }

  celsius <- "degrees Celsius"
  metres <- " m"

  # NOTE(review): "Dmax" and "Dmin (extra)" both point at list item 24, exactly
  # as in the original selectors, so they always yield the same value. One of
  # the two positions is presumably a typo -- confirm against the page.
  c(
    html_text(html_node(page, name_selector)), # name
    trait_text(54L, celsius),                  # Tmax
    trait_text(38L, celsius),                  # Tmin
    trait_text(53L, celsius),                  # Tmin (extra)
    trait_text(24L, metres),                   # Dmax
    trait_text(27L, metres),                   # Dmax (extra)
    trait_text(24L, metres)                    # Dmin (extra)
  )
}
# Scrape every page listed in Var. A page that fails still yields NULL, so the
# remaining pages are processed, but the failure is now reported as a warning
# naming the URL instead of being discarded silently.
output2 <- lapply(Var, function(x) {
  tryCatch(
    GiveMeData(x),
    error = function(e) {
      warning(
        "GiveMeData failed for ", x, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
})
这些不同网页上的数据没有固定的顺序,而且它们所在的 HTML 标签也没有标准化的标识。有些字段有多个条目,而有些字段则完全缺失。因此,无法依靠固定位置的 CSS 选择器来选取所需的数据,您需要使用其他方法。
关键是要意识到每个数据条目都是独立的,并且文本可以充当标签。因此,您可以构建一个包含页面上每个条目的数据框。对于数字值,您可以提取数字并获取具有多个条目的字段的最大值/最小值。
我已经修改了您的 GiveMeData 函数来实现这一点:
library(rvest)
library(magrittr)
# Read every trait entry on one data page and summarise it per label.
#
# Each "trait-data" element on the page is parsed into a label and a value.
# The first number found in the value is extracted, and entries sharing a
# label are collapsed to one row holding the smallest and largest number.
#
# Arguments:
#   url  Address of the page; passed straight to read_html().
#
# Returns:
#   A tibble with one row per label and the columns
#     species  name from the page heading (NA if none is found),
#     measure  the entry label,
#     min/max  smallest / largest number among the entries for that label
#              (NA when none of them contains a number),
#     value    text of the first entry with its numbers removed.
#   A page without usable entries gives a zero-row tibble with these columns.
GiveMeData <- function(url) {
  # Strings used to find the nodes and to parse their text
  number_regex <- "-?[[:digit:]]+[.]*[[:digit:]]{0,10}"
  species_xpath <- "//div[@class='names']//i"
  genus_xpath <- "//div[@class='names']/h1"
  all_data_xpath <- "//div[@class='trait-data']"

  empty_result <- function() {
    tibble::tibble(
      species = character(),
      measure = character(),
      min = numeric(),
      max = numeric(),
      value = character()
    )
  }

  page <- read_html(url)

  # Prefer the italicised name; if there is none, fall back to the heading.
  species <- html_text(html_nodes(page, xpath = species_xpath))
  if (length(species) == 0) {
    species <- html_text(html_nodes(page, xpath = genus_xpath))
  }
  # Keep exactly one name so it is never recycled into extra rows below.
  species <- if (length(species) == 0) NA_character_ else species[1]

  # One character vector of text fragments per entry on the page
  entries <- strsplit(
    html_text(html_nodes(page, xpath = all_data_xpath)),
    "\n\n"
  )
  if (length(entries) == 0) {
    return(empty_result())
  }

  # Turn each entry into a one-row data frame: first fragment is the label,
  # the remaining fragments (minus blanks and URI lines) form the value.
  parsed <- lapply(entries, function(parts) {
    parts <- parts[which(parts != "" & !grepl("URI:", parts))]
    value <- paste0(parts[-1], collapse = " ")
    value <- gsub("\n", "", value)
    value <- gsub(" ([(]m(in|ax)[)])", "", value)
    data.frame(label = parts[1], value = value, stringsAsFactors = FALSE)
  })
  df <- do.call("rbind", parsed)

  # First number in each value; vapply keeps the column numeric even when no
  # entry contains a number.
  matches <- regmatches(df$value, gregexpr(number_regex, df$value))
  df$number <- vapply(
    matches,
    function(m) if (length(m) > 0) as.numeric(m[1]) else NA_real_,
    numeric(1)
  )

  # Remove the numbers and any leading / trailing spaces from the value text
  df$value <- gsub(number_regex, "", df$value)
  df$value <- gsub("(^ +)|( +$)", "", df$value)

  # Range that ignores missing numbers but stays NA when all are missing
  # (range(..., na.rm = TRUE) alone would give Inf / -Inf with a warning).
  number_range <- function(x) {
    if (all(is.na(x))) c(NA_real_, NA_real_) else range(x, na.rm = TRUE)
  }

  # One summary row per label
  rows <- lapply(split.data.frame(df, df$label), function(group) {
    limits <- number_range(group$number)
    data.frame(
      species = species,
      measure = group$label[1],
      min = limits[1],
      max = limits[2],
      value = group$value[1],
      stringsAsFactors = FALSE
    )
  })
  if (length(rows) == 0) {
    return(empty_result())
  }

  result <- do.call("rbind", rows)
  rownames(result) <- NULL
  # Namespace-qualified: only rvest and magrittr are attached by this script.
  tibble::as_tibble(result)
}
现在,当您这样调用时:
# Scrape every page listed in Var. A page that fails still yields NULL, so the
# remaining pages are processed, but the failure is now reported as a warning
# naming the URL instead of being discarded silently.
output2 <- lapply(Var, function(x) {
  tryCatch(
    GiveMeData(x),
    error = function(e) {
      warning(
        "GiveMeData failed for ", x, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
})
您将得到如下的 output2 值:
# Print the list of per-page results built by the lapply() call above.
output2
#> [[1]]
#> # A tibble: 15 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Bispira melanosti~ body length 100 100 mm
#> 2 Bispira melanosti~ ecomorphological guild NA NA planktonic
#> 3 Bispira melanosti~ geographic distribution in~ NA NA South Atlantic
#> 4 Bispira melanosti~ habitat is NA NA marine benthic b~
#> 5 Bispira melanosti~ latitude 7.24 30.3 degrees
#> 6 Bispira melanosti~ longitude -97.6 93.7 degrees
#> 7 Bispira melanosti~ trophic guild NA NA suspension feeder
#> 8 Bispira melanosti~ water depth 0.9 75 m
#> 9 Bispira melanosti~ water dissolved O2 concent~ 4.54 4.87 mL/L
#> 10 Bispira melanosti~ water nitrate concentration 1.35 2.93 µmol/L
#> 11 Bispira melanosti~ water O2 saturation 87.2 97.4 percent
#> 12 Bispira melanosti~ water phosphate concentrat~ 0.129 0.354 µmol/L
#> 13 Bispira melanosti~ water salinity 36.3 36.4 PSU
#> 14 Bispira melanosti~ water silicate concentrati~ 1.49 2.17 µmol/L
#> 15 Bispira melanosti~ water temperature 20.5 24.6 degrees Celsius
#>
#> [[2]]
#> # A tibble: 10 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Branchiomma b~ body length 100 100 mm
#> 2 Branchiomma b~ ecomorphological gui~ NA NA planktonic
#> 3 Branchiomma b~ geographic distribut~ NA NA Coral Sea
#> 4 Branchiomma b~ habitat NA NA marine
#> 5 Branchiomma b~ habitat is NA NA marine benthic biome
#> 6 Branchiomma b~ introduced range inc~ NA NA Spanish Exclusive Economic Zon~
#> 7 Branchiomma b~ invasive in NA NA Cyprus
#> 8 Branchiomma b~ latitude 18.2 23.9 degrees
#> 9 Branchiomma b~ longitude -97.8 -90.7 degrees
#> 10 Branchiomma b~ trophic guild NA NA suspension feeder
#>
#> [[3]]
#> # A tibble: 8 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Branchiomma body length 100 100 mm
#> 2 Branchiomma ecomorphological guild NA NA planktonic
#> 3 Branchiomma feeding structure NA NA buccal organ absent or oc~
#> 4 Branchiomma geographic distribution inclu~ NA NA Red Sea
#> 5 Branchiomma habitat is NA NA marine benthic biome
#> 6 Branchiomma marine larval development str~ NA NA maternally derived nutrit~
#> 7 Branchiomma pattern of oogenesis NA NA extraovarian
#> 8 Branchiomma trophic guild NA NA suspension feeder
#>
#> [[4]]
#> # A tibble: 9 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Branchiomma luculla~ body length 100 100 mm
#> 2 Branchiomma luculla~ ecomorphological guild NA NA planktonic
#> 3 Branchiomma luculla~ geographic distribution incl~ NA NA English Channel
#> 4 Branchiomma luculla~ habitat is NA NA marine benthic b~
#> 5 Branchiomma luculla~ latitude 30.6 50.8 degrees
#> 6 Branchiomma luculla~ longitude -4.14 32.3 degrees
#> 7 Branchiomma luculla~ pattern of oogenesis NA NA extraovarian
#> 8 Branchiomma luculla~ substrate type NA NA bedrock
#> 9 Branchiomma luculla~ trophic guild NA NA suspension feeder
#>
#> [[5]]
#> # A tibble: 22 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Branchiomma bo~ body size 50 50 mm
#> 2 Branchiomma bo~ ecomorphological guild NA NA planktonic
#> 3 Branchiomma bo~ ecosystem engineering NA NA organism is an ecosyste~
#> 4 Branchiomma bo~ feeding structure NA NA buccal organ absent or ~
#> 5 Branchiomma bo~ geographic distribution in~ NA NA Skagerrak
#> 6 Branchiomma bo~ habitat is NA NA caves
#> 7 Branchiomma bo~ latitude 36.6 80.7 degrees
#> 8 Branchiomma bo~ locomotion NA NA non-motile / semi-motile
#> 9 Branchiomma bo~ longitude -20 57.8 degrees
#> 10 Branchiomma bo~ marine larval development ~ NA NA maternally derived nutr~
#> # ... with 12 more rows
#>
#> [[6]]
#> # A tibble: 5 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Branchiomma picta body length 100 100 mm
#> 2 Branchiomma picta ecomorphological guild NA NA planktonic
#> 3 Branchiomma picta geographic distribution includ~ NA NA Seto Inland Sea
#> 4 Branchiomma picta habitat is NA NA marine benthic bio~
#> 5 Branchiomma picta trophic guild NA NA suspension feeder
#>
#> [[7]]
#> # A tibble: 4 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Branchiomma wyvillei body length 100 100 mm
#> 2 Branchiomma wyvillei ecomorphological guild NA NA planktonic
#> 3 Branchiomma wyvillei habitat is NA NA marine benthic biome
#> 4 Branchiomma wyvillei trophic guild NA NA suspension feeder
#>
#> [[8]]
#> # A tibble: 5 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Branchiomma nigromacul~ body length 100 100 mm
#> 2 Branchiomma nigromacul~ ecomorphological guild NA NA planktonic
#> 3 Branchiomma nigromacul~ geographic distribution in~ NA NA South Atlantic
#> 4 Branchiomma nigromacul~ habitat is NA NA marine benthic b~
#> 5 Branchiomma nigromacul~ trophic guild NA NA suspension feeder
#>
#> [[9]]
#> # A tibble: 7 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Branchiomma bo~ body length 100 100 mm
#> 2 Branchiomma bo~ ecomorphological gui~ NA NA planktonic
#> 3 Branchiomma bo~ geographic distribut~ NA NA Israeli part of the Mediterran~
#> 4 Branchiomma bo~ habitat NA NA marine
#> 5 Branchiomma bo~ habitat is NA NA marine benthic biome
#> 6 Branchiomma bo~ introduced range inc~ NA NA Israeli part of the Mediterran~
#> 7 Branchiomma bo~ trophic guild NA NA suspension feeder
#>
#> [[10]]
#> # A tibble: 5 x 5
#> species measure min max value
#> <chr> <chr> <dbl> <dbl> <chr>
#> 1 Branchiomma spongi~ body length 100 100 mm
#> 2 Branchiomma spongi~ ecomorphological guild NA NA planktonic
#> 3 Branchiomma spongi~ geographic distribution i~ NA NA European waters (ERMS~
#> 4 Branchiomma spongi~ habitat is NA NA marine benthic biome
#> 5 Branchiomma spongi~ trophic guild NA NA suspension feeder
现在您可以轻松访问页面上的所有数据,并根据需要建立每个物种的数据框。