大多数时候我无法向以下网站提出请求:
https://www.adondevivir.com/proyectos-etapa-pre-venta-en-construccion.html
library(rvest);library(tibble);library(httr2)
# URL of the page that every request in this script targets.
base_url <- "https://www.adondevivir.com/proyectos-etapa-pre-venta-en-construccion.html"

# Plain rvest fetch. It succeeds only intermittently; when it does, the
# result is the page's underlying HTML document.
parsed_base_url <- read_html(base_url)
# THIS NEVER WORKS
# httr2 variant of the same fetch: browser-like headers and a Linux/Chrome
# user-agent string are attached to the request before it is performed.
# The response object is stored in `pagina_parsed`.
pagina_parsed <- req_perform(
  request(base_url) |>
    req_headers(
      Referer = "https://www.adondevivir.com/",
      Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      `Accept-Language` = "es-419,es;q=0.6",
      `Accept-Encoding` = "gzip, deflate, br, zstd",
      `Cache-Control` = "max-age=0",
      `Sec-Ch-Ua` = '"Brave";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
      Priority = "u=0, i"
    ) |>
    req_user_agent(
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
    )
)
为什么我大多数时候都无法成功请求这个页面(更不用说带上上述请求头的 httr2 请求根本就不起作用)?有没有办法用 httr2 克服这个“问题”?这与 cookie 或页面保护自身不被抓取的方式有关吗?
我想我可以反复重试该请求直到成功为止,但那样我并不能弄清它失败的原因。
这与 cookie 或页面保护自身不被抓取的方式有关吗?
两者皆有。该网站受 Cloudflare 保护,而 Cloudflare 会使出一系列花招来干扰自动化工具和无头浏览器。如果您在浏览器的新会话或隐身模式下打开该网站,应该能实际看到 Cloudflare 的 JavaScript 质询正在运行。如果您打开了 DevTools 的“网络”选项卡(启用“保留日志”,或许再开启网络节流来放慢速度),还能看到更多线索,了解它在探测哪些内容。如果 Cloudflare 认定您的请求合法,就会设置 cookie,在会话期间为您放行。您可以在 httr2 中重复使用这些 cookie。
半手动方法可能是这样的:在浏览器的 DevTools 中把已通过验证的页面请求复制为 cURL 命令,再用 httr2::curl_translate() 将其转换为 httr2 代码:
library(rvest)
library(httr2)
# Translate a curl command line into the equivalent httr2 code.
# The generated code is what appears in the `#>` output right after this call.
# NOTE(review): the command was presumably copied from the browser's DevTools
# ("Copy as cURL") after the Cloudflare challenge had passed -- confirm.
# The `cookie` header carries the __cf_bm, sessionId and cf_clearance values
# captured from that one session.
# NOTE(review): these values are expected to expire, so a fresh command
# probably has to be captured before re-running -- verify.
# The raw string r"(...)" keeps the quotes and backslash line continuations
# of the curl command exactly as written, so nothing inside it needs escaping.
curl_translate(r"(curl 'https://www.adondevivir.com/proyectos-etapa-pre-venta-en-construccion.html' \
-H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8' \
-H 'accept-language: en-GB,en;q=0.9' \
-H 'cookie: __cf_bm=MbgHRAOsR8nrNwVtUAXzb0HMRrBSK4hbiiNtO3mg41A-1722860033-1.0.1.1-WvLWPer9d5s3PyuFjRZeCwyIRmiyELE5bs40JWH4Txc4OZXWvFUaSgqUbvbf_xYKnpePSYWv5GY4btcLBR_vvc6pt01F1sLPXt0QVrPolJk; sessionId=b5a7a506-4689-4188-88e7-fe005fc154ab; cf_clearance=vchooYPfzwbG.fmcao_YheRm7DPILRmr8xhNSWLUabQ-1722860102-1.0.1.1-tYQZHx6sTEzzRUS.UHz1rjZNq1a1VcSoOcn7l0EjRqbFeHNUwzsHyhTjW2R0RU_Tnv.6L5WbxxEy8m3xxcepaw' \
-H 'priority: u=0, i' \
-H 'sec-ch-ua: "Not)A;Brand";v="99", "Brave";v="127", "Chromium";v="127"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-model: ""' \
-H 'sec-ch-ua-platform: "Windows"' \
-H 'sec-ch-ua-platform-version: "15.0.0"' \
-H 'sec-fetch-dest: document' \
-H 'sec-fetch-mode: navigate' \
-H 'sec-fetch-site: none' \
-H 'sec-fetch-user: ?1' \
-H 'sec-gpc: 1' \
-H 'upgrade-insecure-requests: 1' \
-H 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36')")
#> request("https://www.adondevivir.com/proyectos-etapa-pre-venta-en-construccion.html") |>
#> req_headers(
#> accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
#> `accept-language` = "en-GB,en;q=0.9",
#> cookie = "__cf_bm=MbgHRAOsR8nrNwVtUAXzb0HMRrBSK4hbiiNtO3mg41A-1722860033-1.0.1.1-WvLWPer9d5s3PyuFjRZeCwyIRmiyELE5bs40JWH4Txc4OZXWvFUaSgqUbvbf_xYKnpePSYWv5GY4btcLBR_vvc6pt01F1sLPXt0QVrPolJk; sessionId=b5a7a506-4689-4188-88e7-fe005fc154ab; cf_clearance=vchooYPfzwbG.fmcao_YheRm7DPILRmr8xhNSWLUabQ-1722860102-1.0.1.1-tYQZHx6sTEzzRUS.UHz1rjZNq1a1VcSoOcn7l0EjRqbFeHNUwzsHyhTjW2R0RU_Tnv.6L5WbxxEy8m3xxcepaw",
#> priority = "u=0, i",
#> `sec-gpc` = "1",
#> `upgrade-insecure-requests` = "1",
#> `user-agent` = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
#> ) |>
#> req_perform()
# Replay the translated request (browser headers plus the captured cookies),
# then extract the posting-card description texts from the returned HTML.
# Everything runs inside local() so the intermediate objects do not leak into
# the workspace; the final value is still auto-printed.
local({
  listing_response <- req_perform(
    req_headers(
      request("https://www.adondevivir.com/proyectos-etapa-pre-venta-en-construccion.html"),
      accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      `accept-language` = "en-GB,en;q=0.9",
      cookie = "__cf_bm=MbgHRAOsR8nrNwVtUAXzb0HMRrBSK4hbiiNtO3mg41A-1722860033-1.0.1.1-WvLWPer9d5s3PyuFjRZeCwyIRmiyELE5bs40JWH4Txc4OZXWvFUaSgqUbvbf_xYKnpePSYWv5GY4btcLBR_vvc6pt01F1sLPXt0QVrPolJk; sessionId=b5a7a506-4689-4188-88e7-fe005fc154ab; cf_clearance=vchooYPfzwbG.fmcao_YheRm7DPILRmr8xhNSWLUabQ-1722860102-1.0.1.1-tYQZHx6sTEzzRUS.UHz1rjZNq1a1VcSoOcn7l0EjRqbFeHNUwzsHyhTjW2R0RU_Tnv.6L5WbxxEy8m3xxcepaw",
      priority = "u=0, i",
      `sec-gpc` = "1",
      `upgrade-insecure-requests` = "1",
      `user-agent` = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    )
  )

  # Parse the response body as HTML and collect the text of every matching
  # <h3> description node.
  card_nodes <- html_elements(
    resp_body_html(listing_response),
    "h3[data-qa='POSTING_CARD_DESCRIPTION']"
  )
  card_texts <- html_text(card_nodes)

  # Show only the leading entries (head() default), each shortened to a
  # width of 80.
  stringr::str_trunc(head(card_texts), 80)
})
#> [1] "Vive en el distrito patriota de Lima, hogar de libertadores. Disfruta de una ..."
#> [2] "Obra en curso - 47% vendido! Un proyecto inigualable ubicado en la zona monum..."
#> [3] "¡Vive frente al Campo de Marte en Jesús María! Presentamos \"Salaverry 571\", u..."
#> [4] "¡Vive en la mejor zona de Surquillo! Lobby, Sala de niños, Sala de Usos Multi..."
#> [5] "Grupo Lar, única inmobiliaria en Perú, con presencia en 5 países en simultáne..."
#> [6] "Proyecto exclusivo en la Nueva Santa Catalina, a 5 minutos del centro financi..."
创建于 2024-08-05,使用 reprex v2.1.0