I am writing a web-scraping job that scrapes data from a website for every state and year. At the moment it runs for about 2 minutes and then throws a Selenium TimeoutException.
What I have tried so far: I kept sleep(5) in place and increased the WebDriverWait timeout up to 400, but it didn't help. Can you help me? Please suggest or post whatever is needed to get rid of this exception and make the job run efficiently.
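This is roughly how I added the waits (simplified; 400 is the largest timeout I tried, and panel-body is the element class the page actually uses):

time.sleep(5)  # fixed delay I kept in while debugging
elements = WebDriverWait(driver, 400).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body")))

The full job is below: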
import os
import stat
import ast
import time
import threading
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import requests
import boto3
from apify_client import ApifyClient
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def main():
    print("Aditya")
    # Initialize the main ApifyClient instance
    client = ApifyClient(os.environ['APIFY_TOKEN'], api_url=os.environ['APIFY_API_BASE_URL'])
    default_kv_store_client = client.key_value_store(os.environ['APIFY_DEFAULT_KEY_VALUE_STORE_ID'])
    actor_input = default_kv_store_client.get_record(os.environ['APIFY_INPUT_KEY'])['value']
    # access keys redacted here
    s3 = boto3.client('s3',
                      aws_access_key_id='A',
                      aws_secret_access_key='n')
    today_date = datetime.today().strftime('%Y-%m-%d')
    generate_xmlfiles(s3, today_date)
    extract_data_to_xml_files(s3, today_date)
    print("Uploaded successfully")
def generate_xmlfiles(s3, today_date):
    # Write the full list of scraped mods.xml URLs to S3 as a single text file
    s3.put_object(Bucket='datahub-raw-zone', Key='Bankruptcy/' + today_date + "/xmlfiles.txt", Body=str(scrape_data()))
    print("Exported files to s3")
def scrape_data():
    years = [2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013,
             2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003]
    states = ["alnb", "alsb", "almb"]
    data = []
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--headless")
    # Make sure the bundled chromedriver is executable before launching Chrome
    st = os.stat('/usr/src/app/chromedriver')
    os.chmod('/usr/src/app/chromedriver', st.st_mode | stat.S_IEXEC)
    driver = webdriver.Chrome(service=Service('/usr/src/app/chromedriver'), options=options)
    with requests.Session() as session:
        for state in states:
            with ThreadPoolExecutor() as executor:
                futures = []
                for year in years:
                    url = ('https://www.govinfo.gov/app/collection/uscourts/bankruptcy/'
                           + state + '/' + str(year)
                           + '/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D')
                    response = session.get(url)
                    if response.status_code == 200:
                        # NOTE: every submitted task shares this single driver instance
                        futures.append(executor.submit(scrape_year, driver, url, state, year))
                for future in futures:
                    data += future.result()
            print("Loaded " + state)
    driver.quit()
    return data
def scrape_year(driver, url, state, year):
    print("scraping data for state " + state.capitalize() + " for " + str(year))
    driver.get(url)
    # Wait for the result panels to render before reading the page source
    WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body")))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Duplicate dict keys collapse to the last one, so this effectively matches only div.panel-collapse.collapse.in
    bankruptcy_element = soup.findAll('div', {"class": "panel-collapse collapse in"})
    # Convert each PDF link on the page into the corresponding mods.xml URL
    return [{year: xmlfile['href'].replace(".pdf", "/mods.xml").replace("/pdf", "").replace("/pkg/", "/").replace("/content", "").replace("https://www.govinfo.gov/metadata/granule", "https://www.govinfo.gov/content/pkg")}
            for i in bankruptcy_element
            for xmlfile in i.findAll('a', href=True)
            if "pdf" in xmlfile['href']]
def extract_data_to_xml_files(s3, today_date):
    # Read back the xmlfiles.txt written by generate_xmlfiles() (same bucket and prefix)
    bucket = 'datahub-raw-zone'
    result = s3.list_objects(Bucket=bucket, Prefix='Bankruptcy/' + today_date)
    if 'Contents' not in result:
        return
    filtered_objects = None
    for obj in result['Contents']:
        if "xmlfiles.txt" in obj['Key']:
            filtered_objects = obj
    if filtered_objects is None:
        return
    data = s3.get_object(Bucket=bucket, Key=filtered_objects.get('Key'))
    contents = data['Body'].read()
    urls_str = contents.decode('utf-8')  # decode the byte string to a string
    urls_list = ast.literal_eval(urls_str)  # parse the string representation of the list back into a Python list
    threads = []
    for i in urls_list:
        print("Current Url " + str(i))
        year, url = next(iter(i.items()))
        filename = url.split('/')[6]
        print(filename)
        response = requests.get(url)
        if response.status_code == 200:
            thread = threading.Thread(target=load_xml_intofolders, args=(s3, url, today_date, filename, response, year))
            threads.append(thread)
            thread.start()
    for thread in threads:
        thread.join()
def load_xml_intofolders(s3, url, today_date, filename, response, year):
    # The response was already fetched (and checked) by the caller; upload it straight to S3
    s3.put_object(Bucket='datahub-raw-zone',
                  Key='Bankruptcy/' + today_date + '/' + url.split('-')[1] + '/' + str(year) + '/' + filename + ".xml",
                  Body=str(response.text))
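One thing I am considering (untested) is to stop sharing a single Chrome instance across the ThreadPoolExecutor workers and instead give every (state, year) task its own short-lived headless driver, roughly like the sketch below. The names make_driver, scrape_one_year and scrape_all are just placeholders I made up for this sketch; the chromedriver path, the Chrome options, the govinfo URL pattern and the panel-body wait are taken from my script above, while the 60-second timeout and the cap of 4 workers are guesses.

# Untested idea: one short-lived headless Chrome per (state, year) task instead of a shared driver.
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

def make_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--headless")
    return webdriver.Chrome(service=Service('/usr/src/app/chromedriver'), options=options)

def scrape_one_year(state, year):
    # Hypothetical per-task worker: it owns its driver for the whole page load.
    url = ('https://www.govinfo.gov/app/collection/uscourts/bankruptcy/'
           + state + '/' + str(year)
           + '/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D')
    driver = make_driver()
    try:
        driver.get(url)
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body")))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        panels = soup.find_all('div', {"class": "panel-collapse collapse in"})
        # Return the raw PDF hrefs here; the mods.xml rewriting would stay as in my script.
        return [{year: a['href']} for p in panels for a in p.find_all('a', href=True)
                if "pdf" in a['href']]
    finally:
        driver.quit()

def scrape_all(states, years, workers=4):
    # Cap the worker count so this doesn't launch 20+ Chrome processes at once.
    results = []
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(scrape_one_year, s, y) for s in states for y in years]
        for future in futures:
            results += future.result()
    return results

Would that be the right direction, or is there a better way to get rid of the TimeoutException and speed this up?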