As the title says, I want to automatically scrape all the PDF files stored in a Power BI tool embedded in a website. The website is the following: [website link]. To download each file, you have to click "Click to View Housing Element". I haven't found a way to download all the files at once. I also tried web-scraping tools and Selenium in Python, but it didn't work: the scraping tools don't seem to be able to select elements inside the Power BI tool. On top of that, when you visit the site you have to accept a condition before the Power BI tool becomes visible (see screenshot). I think the fact that we can't access the page directly adds some complexity.
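I suspect the reason the scrapers can't select anything is that the Power BI report is rendered inside an iframe, and Selenium cannot see into an iframe until the driver switches context into it. A minimal sketch of what I mean (assuming the report is the page's first iframe; I haven't confirmed the actual locator):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.hcd.ca.gov/planning-and-community-development/housing-open-data-tools/housing-element-download-tool')
input("Accept the conditions in the browser, then press Enter here...")

# Wait for the Power BI iframe to load, then switch the driver's context into it.
# Assumption: the report is the first <iframe> on the page; adjust the locator if not.
WebDriverWait(driver, 40).until(
    EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, 'iframe'))
)

# Element lookups now run inside the embedded report instead of the outer page
links = driver.find_elements(By.XPATH, "//a[contains(text(), 'Click to View Housing Element')]")

# Switch back to the top-level document when done
driver.switch_to.default_content()

Note that Power BI tables typically render rows lazily as you scroll, so even inside the frame a single find_elements call may not see every link.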
Here is my Python code:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Set up the WebDriver (Chrome in this case)
options = webdriver.ChromeOptions()
# Set your download path
download_folder = 'downloaded_files'
# Create the download folder if it doesn't exist
if not os.path.exists(download_folder):
    os.makedirs(download_folder)
driver = webdriver.Chrome(options=options)
# Open the URL
driver.get('https://www.hcd.ca.gov/planning-and-community-development/housing-open-data-tools/housing-element-download-tool')
# Wait for you to manually click the accept button and press Enter
input("Please manually click the 'Click here to accept' button and press Enter here once done...")
# Ensure the page is fully loaded and document is ready
WebDriverWait(driver, 40).until(lambda d: d.execute_script('return document.readyState') == 'complete')
# Use a more precise XPath based on the HTML structure you provided
download_links = driver.find_elements(By.XPATH, "//a[contains(@href, 'housing-elements') and contains(text(), 'Click to View Housing Element')]")
# Check if any links were found
if not download_links:
    print("No download links were found. Please check the XPath or ensure the page is loaded properly.")
else:
    print(f"Found {len(download_links)} download links.")
# Loop through each link, extract the URL, and download the file
for index, link in enumerate(download_links, start=1):
    file_url = link.get_attribute('href')
    file_name = os.path.join(download_folder, f"housing_element_{index}.pdf")
    # Download the file
    try:
        response = requests.get(file_url)
        response.raise_for_status()  # Check if the request was successful
        with open(file_name, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {file_name}")
    except Exception as e:
        print(f"Failed to download {file_url}: {e}")
# Close the browser after downloading all files
driver.quit()
print("All files have been downloaded.")
Best regards
I found that they are using
https://wabi-us-gov-iowa-api.analysis.usgovcloudapi.net/public/reports/querydata?synchronous=true
and that a POST to this API endpoint returns all the PDF download links. Below is sample code that automates the whole process; I'd guess it is faster than Selenium.
Note: for some reason I couldn't reach your target site directly, so I used a VPN to inspect which web request was fetching those URLs, and eventually found this API request. I used
concurrent.futures
to speed up the downloads, so if you have any connection problems, try reducing the number of workers to 50.
import requests
import json
import concurrent.futures
out = []
def getUrl():
    pdf_url = set()
    url = "https://wabi-us-gov-iowa-api.analysis.usgovcloudapi.net/public/reports/querydata?synchronous=true"  # API endpoint returning all the PDF URLs
    # Body for the request
    data = {"version":"1.0.0","queries":[{"Query":{"Commands":[{"SemanticQueryDataShapeCommand":{"Query":{"Version":2,"From":[{"Name":"h","Entity":"housing element mapping (2)","Type":0},{"Name":"h1","Entity":"housing-elements-docs","Type":0}],"Select":[{"Column":{"Expression":{"SourceRef":{"Source":"h"}},"Property":"County"},"Name":"housing element mapping (2).county"},{"Column":{"Expression":{"SourceRef":{"Source":"h"}},"Property":"Jurisdiction"},"Name":"housing element mapping (2).jurisdiction"},{"Column":{"Expression":{"SourceRef":{"Source":"h"}},"Property":"Planning Period"},"Name":"housing element mapping (2).planning_period"},{"Column":{"Expression":{"SourceRef":{"Source":"h"}},"Property":"Received Date"},"Name":"housing element mapping (2).received date"},{"Column":{"Expression":{"SourceRef":{"Source":"h"}},"Property":"type"},"Name":"housing element mapping (2).type"},{"Column":{"Expression":{"SourceRef":{"Source":"h"}},"Property":"LinkText"},"Name":"housing element mapping (2).LinkText"},{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"h"}},"Property":"View Link"}},"Function":3},"Name":"Min(housing element mapping (2).View Link)"}],"Where":[{"Condition":{"Not":{"Expression":{"In":{"Expressions":[{"Column":{"Expression":{"SourceRef":{"Source":"h1"}},"Property":"Column1 - Copy"}}],"Values":[[{"Literal":{"Value":"null"}}]]}}}}}],"OrderBy":[{"Direction":1,"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"h"}},"Property":"Jurisdiction"}}}]},"Binding":{"Primary":{"Groupings":[{"Projections":[0,1,2,3,4,5,6]}]},"DataReduction":{"DataVolume":3,"Primary":{"Window":{"Count":500}}},"SuppressedJoinPredicates":[6],"Version":1},"ExecutionMetricsKind":1}}]},"CacheKey":"{\"Commands\":[{\"SemanticQueryDataShapeCommand\":{\"Query\":{\"Version\":2,\"From\":[{\"Name\":\"h\",\"Entity\":\"housing element mapping (2)\",\"Type\":0},{\"Name\":\"h1\",\"Entity\":\"housing-elements-docs\",\"Type\":0}],\"Select\":[{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"h\"}},\"Property\":\"County\"},\"Name\":\"housing element mapping (2).county\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"h\"}},\"Property\":\"Jurisdiction\"},\"Name\":\"housing element mapping (2).jurisdiction\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"h\"}},\"Property\":\"Planning Period\"},\"Name\":\"housing element mapping (2).planning_period\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"h\"}},\"Property\":\"Received Date\"},\"Name\":\"housing element mapping (2).received date\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"h\"}},\"Property\":\"type\"},\"Name\":\"housing element mapping (2).type\"},{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"h\"}},\"Property\":\"LinkText\"},\"Name\":\"housing element mapping (2).LinkText\"},{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"h\"}},\"Property\":\"View Link\"}},\"Function\":3},\"Name\":\"Min(housing element mapping (2).View Link)\"}],\"Where\":[{\"Condition\":{\"Not\":{\"Expression\":{\"In\":{\"Expressions\":[{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"h1\"}},\"Property\":\"Column1 - Copy\"}}],\"Values\":[[{\"Literal\":{\"Value\":\"null\"}}]]}}}}}],\"OrderBy\":[{\"Direction\":1,\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"h\"}},\"Property\":\"Jurisdiction\"}}}]},\"Binding\":{\"Primary\":{\"Groupings\":[{\"Projections\":[0,1,2,3,4,5,6]}]},\"DataReduction\":{\"DataVolume\":3,\"Primary\":{\"Window\":{\"Count\":500}}},\"SuppressedJoinPredicates\":[6],\"Version\":1},\"ExecutionMetricsKind\":1}}]}","QueryId":"","ApplicationContext":{"DatasetId":"35670b73-8451-456d-b6eb-d4f3362fbf6a","Sources":[{"ReportId":"b5ad39f0-b74f-4b12-a794-3af501a30416","VisualId":"d84782124001a62ecd43"}]}}],"cancelQueries":[],"modelId":629761}
    resp = requests.post(url, data=json.dumps(data, separators=(',', ':')))
    ds = resp.json()['results'][0]['result']['data']['dsr']['DS'][0]
    # One batch of links sits in the string value dictionary...
    for i in ds['ValueDicts']['D4']:
        pdf_url.add(i)
    # ...and the rest sit in the row data. Some complex JSON structure; if you find an easier way to do this, please do.
    for row in ds['PH'][0]['DM0']:
        for val in row['C']:
            pdf_url.add(val)
    # The cells mix URLs with other column values (counties, dates, etc.); keep only the links
    pdf_url_ready = []
    for i in pdf_url:
        if isinstance(i, str) and 'http' in i:
            pdf_url_ready.append(i)
    return pdf_url_ready

def downloadPdf(url, n):
    # Stream the response to disk in chunks instead of buffering the whole file in memory
    resp = requests.get(url, stream=True)
    with open(f"pdf-file-{n}.pdf", 'wb') as pdf_file:
        for chunk in resp.iter_content(chunk_size=8192):
            pdf_file.write(chunk)

# Used concurrent.futures to speed up the downloads
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    future_to_url = [executor.submit(downloadPdf, url, n) for n, url in enumerate(getUrl())]
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            data = future.result()
        except Exception as exc:
            data = str(type(exc))
        finally:
            out.append(data)
            print(str(len(out)), end="\r")
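One more note: the query body caps the result window at 500 rows ("Window":{"Count":500}), so if the report ever holds more documents than that, you may need to raise the count. And if digging through the nested dsr structure feels too brittle, a cruder fallback is to regex-scan the raw response text for links. This sketch assumes every document URL ends in ".pdf", which I haven't verified against the full dataset:

import re

def getUrlByRegex(resp_text):
    # Pull anything that looks like a PDF link straight out of the raw JSON text.
    # Assumption: every document URL ends in ".pdf"; non-PDF attachments would be missed.
    return sorted(set(re.findall(r'https?://[^"\s\\]+?\.pdf', resp_text, re.IGNORECASE)))

You would call it as getUrlByRegex(resp.text) right after the POST above, in place of the ValueDicts/PH traversal.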
Let me know if this works for you, or if you'd rather stick strictly with Selenium.