Selenium TimeoutException (tried sleep and wait time)


I am writing a web-scraping job that collects data from the site for all states and years. At the moment it runs for about 2 minutes and then fails with a Selenium TimeoutException.

What I have tried so far: I kept sleep(5) in place and increased the WebDriverWait timeout to 400, but it did not help. Could you help me? Please suggest or post whatever is needed to get rid of this exception and run the job efficiently. A sketch of the waits I tried is shown below.
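For reference, this is roughly how those waits were wired in (a minimal sketch, assuming the same .panel-body locator and govinfo.gov pages used in the full script; wait_for_panels is just an illustrative helper name, and 5 / 400 are the values mentioned above):

            import time
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC

            def wait_for_panels(driver, url):
                driver.get(url)
                time.sleep(5)  # fixed pause that was tried first
                # Explicit wait raised to 400 seconds; it still raises TimeoutException
                # if no element with class "panel-body" shows up within that window
                return WebDriverWait(driver, 400).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body"))
                )

The full script follows: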

            import os
            from apify_client import ApifyClient
            import concurrent.futures
            import chromedriver_autoinstaller
            import requests
            import subprocess
            from selenium import webdriver
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC
            from bs4 import BeautifulSoup
            import boto3
            from datetime import datetime
            from selenium.webdriver.chrome.service import Service
            import stat
            import json
            import ast
            import threading
            import time
            from concurrent.futures import ThreadPoolExecutor
            
            
            def main():
                print("Aditya")
                # Initialize the main ApifyClient instance
                client = ApifyClient(os.environ['APIFY_TOKEN'], api_url=os.environ['APIFY_API_BASE_URL'])
                default_kv_store_client = client.key_value_store(os.environ['APIFY_DEFAULT_KEY_VALUE_STORE_ID'])
                actor_input = default_kv_store_client.get_record(os.environ['APIFY_INPUT_KEY'])['value']
                s3 = boto3.client('s3',aws_access_key_id='A', 
                                  aws_secret_access_key='n'
                                  )
                today_date=datetime.today().strftime('%Y-%m-%d')                  
                generate_xmlfiles(s3,today_date)
                extract_data_to_xml_files(s3,today_date)
                print("Uploaded successfully")
            
            def generate_xmlfiles(s3,today_date):
                s3.put_object(Bucket='datahub-raw-zone', Key='Bankruptcy/'+today_date+"/xmlfiles.txt", Body=(str(scrape_data())))
                print("Exported files to s3")

         
            def scrape_data():
                years = [2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003]
                data = []
                states = ["alnb","alsb","almb"]

                options = webdriver.ChromeOptions()
                options.add_argument("--no-sandbox")
                options.add_argument("--disable-dev-shm-usage")
                options.add_argument("--headless")

                # Make sure the bundled chromedriver is executable before starting Chrome
                st = os.stat('/usr/src/app/chromedriver')
                os.chmod('/usr/src/app/chromedriver', st.st_mode | stat.S_IEXEC)
                driver = webdriver.Chrome(service=Service('/usr/src/app/chromedriver'), options=options)

                with requests.Session() as session:
                    for state in states:
                        with ThreadPoolExecutor() as executor:
                            futures = []
                            for year in years:
                                url = 'https://www.govinfo.gov/app/collection/uscourts/bankruptcy/'+state+'/'+str(year)+'/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
                                response = session.get(url)
                                if response.status_code == 200:
                                    # Note: every task submitted here shares this single driver instance
                                    futures.append(executor.submit(scrape_year, driver, url, state, year))

                            for future in futures:
                                data += future.result()

                        print("Loaded " + state)
                driver.quit()
                return data
                
            def scrape_year(driver, url, state, year):
                print("scraping data for state " + state.capitalize() + " for " + str(year))
                driver.get(url)
                # Wait until the result panels have rendered before reading the page source
                WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body")))
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # The original findAll dict repeated the "class" key, so only its last value was used;
                # matching "panel-body" alone keeps the panels that contain the document links
                panels = soup.find_all('div', class_="panel-body")
                return [{year: xmlfile['href'].replace(".pdf", "/mods.xml").replace("/pdf", "").replace("/pkg/", "/")
                               .replace("/content", "").replace("https://www.govinfo.gov/metadata/granule", "https://www.govinfo.gov/content/pkg")}
                        for panel in panels
                        for xmlfile in panel.find_all('a', href=True)
                        if "pdf" in xmlfile['href']]
            
                
                        
            def extract_data_to_xml_files(s3,today_date):
                # Same bucket and prefix that generate_xmlfiles wrote to
                bucket = 'datahub-raw-zone'
                result = s3.list_objects(Bucket=bucket, Prefix='Bankruptcy/'+today_date)
                if 'Contents' in result:
                    objects = result['Contents']
                    filtered_objects = None
                    for obj in objects:
                        if "xmlfiles.txt" in obj['Key']:
                            filtered_objects = obj
                    if filtered_objects is None:
                        return
                    data = s3.get_object(Bucket=bucket, Key=filtered_objects.get('Key'))
                    contents = data['Body'].read()
                    urls_str = contents.decode('utf-8')  # decode the byte string to a string
                    urls_list = ast.literal_eval(urls_str)  # parse the string representation of the list into a Python list
                    threads = []
                    for i in urls_list:
                        print("Current Url " + str(i))
                        year, url = next(iter(i.items()))
                        filename = url.split('/')[6]
                        print(filename)
                        response = requests.get(url)
                        if response.status_code == 200:
                            thread = threading.Thread(target=load_xml_intofolders, args=(s3, url, today_date, filename, response, year))
                            threads.append(thread)
                            thread.start()
                    for thread in threads:
                        thread.join()
                    
                    
            def load_xml_intofolders(s3,url,today_date,filename,response,year):
                # The caller already fetched the URL and checked the status, so reuse that response
                if response.status_code == 200:
                    # url.split('-')[1] is the court code (e.g. 'alnb') taken from the package name in the URL
                    s3.put_object(Bucket='datahub-raw-zone', Key='Bankruptcy/'+today_date+'/'+url.split('-')[1]+'/'+str(year)+'/'+filename+".xml", Body=str(response.text))
python-3.x asynchronous web-scraping async-await apify