使用 python 和 selenium 并行化网页抓取[编辑和更新]

问题描述 投票:0回答:1

我正在尝试从巴西健康数据存储库(SISAB)获取健康数据。我想获取 2018 年至 2024 年间,按性别(男/女)和 22 个不同年龄组划分的、每个月因焦虑/抑郁而就诊的次数。我有一份焦虑/抑郁(A/D)诊断代码列表(共 54 个),并且想分别查看每个代码对应的就诊次数。我有一个数据框,其中每一行都是这些变量的一种组合(54 个代码 × 2 个性别 × 22 个年龄组 × 7 年):

# Load the table of query combinations; each row holds one
# code / year / sex / codeType / agelower / ageupper combination to scrape.
combns = pd.read_csv("/.../data/Anxiety_Depression_combinations.csv")

code    year    sex codeType    agelower    ageupper
0   F32 2018    2   CID 0   200
1   F320    2018    2   CID 0   200
2   F321    2018    2   CID 0   200
3   F322    2018    2   CID 0   200
4   F323    2018    2   CID 0   200
... ... ... ... ... ... ...
16627   P01 2024    3   CIAP    100 200
16628   P02 2024    3   CIAP    100 200
16629   P74 2024    3   CIAP    100 200
16630   P79 2024    3   CIAP    100 200
16631   P82 2024    3   CIAP    100 200
16632 rows × 6 columns

对于数据框中的每一行,我更改设置,然后抓取生成的 html 表格并将其保存为 Excel 文件。我有这段代码可以在无头模式下打开网络浏览器,并选择一些常用设置:

# Start a headless Chrome session, open the SISAB production report page and
# apply the filter settings that are common to every query.
url = "https://sisab.saude.gov.br/paginas/acessoRestrito/relatorio/federal/saude/RelSauProducao.xhtml"
DRIVER_PATH = '/home/kvemuri/.local/share/binman_chromedriver/linux64/128.0.6613.119/chromedriver'
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
driver = webdriver.Chrome(executable_path=DRIVER_PATH,options=options)

# Navigate to the SISAB website.
driver.get(url)

# Set type of service - Individual
driver.find_element(By.ID,"tpProducao").send_keys('Atendimento Individual')

# Set column to month (Competencia)
driver.find_element(By.XPATH,'//*[@id="selectLinha"]/optgroup[1]/option[6]').click()

# Set Type of team: open the dropdown, tick three entries, close the dropdown
driver.find_element(By.XPATH,'//*[@id="filtroEquipeProf"]/div[1]').click()
driver.find_element(By.XPATH,'//*[@id="filtroEquipeProf"]/div[1]/ul/li[2]/a/label/input').click()
driver.find_element(By.XPATH,'//*[@id="filtroEquipeProf"]/div[1]/ul/li[5]/a/label/input').click()
driver.find_element(By.XPATH,'//*[@id="filtroEquipeProf"]/div[1]/ul/li[9]/a/label/input').click()
driver.find_element(By.XPATH,'//*[@id="filtroEquipeProf"]/div[1]').click()

# Set place of service
driver.find_element(By.XPATH,'//*[@id="filtrosLocalTipoEstabelecimento"]/div/button').click()
driver.find_element(By.XPATH,'//*[@id="filtrosLocalTipoEstabelecimento"]/div/ul/li[2]/a/label/input').click()
driver.find_element(By.XPATH,'//*[@id="filtrosLocalTipoEstabelecimento"]/div/button').click()

# Set type of service - Individual (sent a second time, as in the original script)
driver.find_element(By.ID,"tpProducao").send_keys('Atendimento Individual')

# Set type of attendance - select all
driver.find_element(By.XPATH,'//*[@id="divTipoAtendimento"]/div/button/span').click()
driver.find_element(By.XPATH,'//*[@id="divTipoAtendimento"]/div/ul/li[1]/a/label').click()
driver.find_element(By.XPATH,'//*[@id="divTipoAtendimento"]/div/button/span').click()

# Set type of professional
driver.find_element(By.XPATH,'//*[@id="filtroEquipeProf"]/div[2]/button/span').click()
driver.find_element(By.XPATH,'//*[@id="filtroEquipeProf"]/div[2]/ul/li[12]/a/label').click()
driver.find_element(By.XPATH,'//*[@id="filtroEquipeProf"]/div[2]/ul/li[19]/a/label').click()
driver.find_element(By.XPATH,'//*[@id="filtroEquipeProf"]/div[2]/button/span').click()

# Select 'years' button for age
driver.find_element(By.XPATH,'//*[@id="tpIdade:2"]').click()

我定义了以下函数:

def createXpathPattern(code,codeType):
    """Return the XPath of the page element that represents a diagnosis code.

    Args:
        code: Diagnosis code inserted into the element id (e.g. "F32").
        codeType: 'CID' selects the id suffix ``-1``; any other value
            selects the suffix ``-4``.

    Returns:
        The XPath string. The original version built this string but never
        returned it, so every caller received None.
    """
    if codeType == 'CID':
        return '//*[@id="cid-{}-1"]'.format(code)
    return '//*[@id="cid-{}-4"]'.format(code)


def setDxCode(code, codeType):
    """Click one diagnosis code inside the page's code-selection modal.

    Uses the module-level ``driver``. Callers invoke this once to select a
    code and a second time to unselect it.

    Args:
        code: Diagnosis code typed into the modal's search box.
        codeType: 'CID' targets the element id ending in ``-1``; any other
            value targets the id ending in ``-4``.
    """
    xpath_template = '//*[@id="cid-{}-1"]' if codeType == 'CID' else '//*[@id="cid-{}-4"]'
    entry_xpath = xpath_template.format(code)

    # Open the modal and give it a moment to appear.
    driver.find_element(By.ID,"btnAddCid").click()
    time.sleep(1)

    # Filter the list down to the requested code, click it, then reset the filter.
    search_input = driver.find_element(By.XPATH,'//*[@id="dtBasicExample_filter"]/label/input')
    search_input.clear()
    search_input.send_keys(code)
    driver.find_element(By.XPATH,entry_xpath).click()
    search_input.clear()

    # Click the second button of the modal (presumably "confirm" - verify on the page).
    driver.find_element(By.XPATH,'//*[@id="modal-default-cid"]/div/div/div[2]/button[2]/span').click()
    
# function to set age group and sex

def setAgeLimit(agelower, ageupper):
    """Type the lower and upper age bounds into the page's two age inputs.

    Uses the module-level ``driver``. Each input is cleared before the new
    value is typed.

    Args:
        agelower: Lower bound, written to the "idadeInicio" input.
        ageupper: Upper bound, written to the "idadeFim" input.
    """
    bounds = (
        ('//*[@id="idadeInicio"]', agelower),
        ('//*[@id="idadeFim"]', ageupper),
    )
    for field_xpath, bound in bounds:
        age_field = driver.find_element(By.XPATH,field_xpath)
        age_field.clear()
        age_field.send_keys(str(bound))
    

def setSex(sex = 1):
    """Click one entry of the sex filter dropdown.

    Uses the module-level ``driver``. The dropdown button is clicked before
    and after the entry, so the dropdown is toggled twice per call.

    Args:
        sex: 1-based position of the list entry to click.
    """
    dropdown_xpath = '//*[@id="filtrosCaracteristicaUsuario"]/div/button/span'
    entry_xpath = '//*[@id="filtrosCaracteristicaUsuario"]/div/ul/li[{}]/a/label'.format(sex)

    for xpath in (dropdown_xpath, entry_xpath, dropdown_xpath):
        driver.find_element(By.XPATH,xpath).click()
    
# Read the "value" attribute of every month checkbox in the "competencia"
# dropdown; the position of a value in yearMonth matches the position of its
# checkbox on the page.
monthBoxes = driver.find_elements(By.CSS_SELECTOR,"#competencia > div > ul > li > a> label > input")
yearMonth = [box.get_attribute("value") for box in monthBoxes]


def getMonthsforYear(year):
    """Return the entries of the global ``yearMonth`` list that contain ``year``.

    The match is a substring test on the string form of ``year``.
    """
    needle = str(year)
    return [entry for entry in yearMonth if needle in entry]


def ClickMonthCheckbox(monthsList):
    """Click the month dropdown button, then click the checkbox of each listed month.

    Uses the module-level ``driver`` and ``yearMonth``.

    Args:
        monthsList: Values that must all be present in ``yearMonth``.
    """
    driver.find_element(By.XPATH,'//*[@id="competencia"]/div/button/span').click()
    for wanted in monthsList:
        # XPath positions are 1-based, list indices are 0-based.
        position = yearMonth.index(wanted) + 1
        checkbox_xpath = '//*[@id="competencia"]/div/ul/li[{}]/a/label/input'.format(position)
        driver.find_element(By.XPATH,checkbox_xpath).click()


def SelectAllMonthsInaYear(year):
    """Click every month checkbox of ``year``, then click the dropdown button again.

    Because it only clicks, calling it twice for the same year undoes the
    first call; callers use that to unselect the months.
    """
    ClickMonthCheckbox(getMonthsforYear(year))
    driver.find_element(By.XPATH,'//*[@id="competencia"]/div/button/span').click()
    

def checkboxIsSelected(monthsList):
    """Print, for each listed month, whether its checkbox is currently selected.

    Uses the module-level ``driver`` and ``yearMonth``; clicks the month
    dropdown button once before inspecting the checkboxes.
    """
    driver.find_element(By.XPATH,'//*[@id="competencia"]/div/button/span').click()
    for wanted in monthsList:
        position = yearMonth.index(wanted) + 1  # XPath positions are 1-based
        box = driver.find_element(
            By.XPATH,
            '//*[@id="competencia"]/div/ul/li[{}]/a/label/input'.format(position),
        )
        print(box.is_selected())


def checkboxYearIsSelected(year):
    """Print the selection state of every month checkbox belonging to ``year``.

    The dropdown button is clicked here before and after the check, in
    addition to the click performed inside ``checkboxIsSelected``.
    """
    dropdown_xpath = '//*[@id="competencia"]/div/button/span'
    driver.find_element(By.XPATH,dropdown_xpath).click()
    checkboxIsSelected(getMonthsforYear(year))
    driver.find_element(By.XPATH,dropdown_xpath).click()
    

def getHTMLtable():
    """Submit the current query and return the result table as a DataFrame.

    Uses the module-level ``driver``. Clicks the submit control, sets the
    table's page-length selector to 100 rows, and parses the outer HTML of
    the element with id "tabela".

    Returns:
        A DataFrame concatenating every table that pandas finds in that HTML.

    Raises:
        Whatever selenium raises when an element is missing; callers treat
        any failure as "no table for this query".
    """
    # Local import keeps this snippet self-contained.
    from io import StringIO

    driver.find_element(By.XPATH,'//*[@id="j_idt44"]/div[3]/div[1]/div/div[2]/div[2]/div[5]/div/div/div[1]/label').click() # Click on submit button
    driver.find_element(By.XPATH,'//*[@id="tabela_length"]/label/select').send_keys('100') # Set number of rows in table to 100
    table_html = driver.find_element(By.XPATH,'//*[@id="tabela"]').get_attribute('outerHTML')
    # Passing literal HTML to read_html is deprecated in recent pandas;
    # a file-like wrapper works on old and new versions alike.
    # thousands='.' because the page uses '.' as the thousands separator.
    tables = pd.read_html(StringIO(table_html),thousands ='.')
    return pd.concat(tables)


def SetParametersAndGetTable(agemin,agemax,sex,year, code, codeType):
    """Apply one query's filters, scrape the result table, then undo the filters.

    Uses the module-level ``driver`` through the helper functions.

    Args:
        agemin: Lower age bound typed into the page.
        agemax: Upper age bound typed into the page.
        sex: Position of the sex entry to click (1 -> "Both sexes",
            2 -> "Males", anything else -> "Females" in the output label).
        year: Year whose month checkboxes are selected.
        code: Diagnosis code to select.
        codeType: Code family passed on to ``setDxCode``.

    Returns:
        A DataFrame with columns code, sex, agelower, ageupper, YearMonth
        ('%Y-%m' strings) and individual.visits, or None when the table
        could not be read or reshaped.
    """
    if sex == 1:
        sexL = "Both sexes"
    elif sex == 2:
        sexL = "Males"
    else:
        sexL = "Females"

    dlCode = code + "." + str(year) + "." + sexL + "." + str(agemin) + "." + str(agemax)

    setAgeLimit(agelower = agemin,ageupper = agemax)
    SelectAllMonthsInaYear(year)
    setSex(sex = sex)
    setDxCode(code = code, codeType = codeType)
    time.sleep(1)
    try:
        table_dt = getHTMLtable()
        table_dt['code'] = code
        table_dt['sex'] = sexL
        table_dt['agelower'] = agemin
        table_dt['ageupper'] = agemax
        # rename() returns a new frame, so the later column assignment does not
        # write into a slice of the scraped table (avoids SettingWithCopyWarning).
        table_dt = table_dt[['code','sex', 'agelower', 'ageupper','Competência','Atendimento Individual']].rename(
            columns = {'Competência':"YearMonth",'Atendimento Individual':"individual.visits"})
        table_dt['YearMonth'] = pd.to_datetime(table_dt['YearMonth'], format = "%Y%m").dt.strftime('%Y-%m')
        return table_dt
    except Exception:
        # Exception (not a bare except) so Ctrl-C still stops a multi-day run.
        print(f'Table not found for {dlCode}')
        return None
    finally:
        # The filters are toggles: clicking the same entries again unselects
        # them. Doing this exactly once, on success and on failure, keeps the
        # page clean for the next query.
        setDxCode(code = code, codeType = codeType) # Unselect code
        time.sleep(1)
        setSex(sex = sex) # Unselect Sex
        SelectAllMonthsInaYear(year) # Unselect yearMonth parameters


def ApplyRows(row):
    """Scrape the table for one row of the combinations frame and save it.

    On success the table is written to an Excel file and stored in the
    module-level ``Anx_Dep_dict``. On any failure the row's parameters are
    appended to the module-level ``noRun`` list.

    Args:
        row: Mapping/Series with the keys year, sex, agelower, ageupper,
            code and codeType.
    """
    year = row['year']
    sex = row['sex']
    agelower = row['agelower']
    ageupper = row['ageupper']
    code = row['code']
    codeType = row['codeType']

    if sex == 1:
        sexL = "Both sexes"
    elif sex == 2:
        sexL = "Males"
    else:
        sexL = "Females"

    anx_dep_key = code + "." + str(year) + "." + str(sex) + "." + str(agelower) + "." + str(ageupper)
    print(anx_dep_key)
    table_name = "~/sisab/data/anxiety_depression_with_codes/sisab_anx_dep_code_{}_{}_{}_{}_to_{}.xlsx".format(code,str(year),sexL,str(agelower),str(ageupper))
    print(table_name)

    try:
        print("Scraping code {}, year {}, for {} in age range {} to {}".format(code,str(year),sexL,str(agelower),str(ageupper)))

        table_dt = SetParametersAndGetTable(agemin = agelower,agemax = ageupper,sex = sex,year = year, code = code, codeType = codeType)
        if table_dt is None:
            # The scraper signals "no table" with None; make that explicit
            # instead of relying on an AttributeError from None.to_excel.
            raise ValueError("no table returned for {}".format(anx_dep_key))
        table_dt.to_excel(table_name,index = False,header = True)
        Anx_Dep_dict[anx_dep_key] = table_dt
    except Exception as err:
        # Report why the row failed so skipped rows can be diagnosed later.
        print("Row {} not run: {!r}".format(anx_dep_key, err))
        notRun = {'Code':code,'codeType':codeType,'Year':year,'Sex':sex,'agelower':agelower,'ageupper':ageupper}
        noRun.append(notRun)

然后将 ApplyRows() 函数应用于组合数据帧的每一行。

# Parameters of rows that could not be scraped (filled by ApplyRows).
noRun = []
# Scraped tables keyed by "code.year.sex.agelower.ageupper" (filled by ApplyRows).
Anx_Dep_dict = {}
# Run ApplyRows once per row of combns, sequentially.
combns.apply(ApplyRows, axis = 1)

我的问题是,每行运行大约需要 20 秒,大约 17000 行,这意味着如果没有问题的话,所有行需要 3 天以上才能完成运行。

我想看看是否有办法将其与 Dask 并行化。我不确定是否需要使用 Dask.dataframe 或 dask.delayed 或两者,以及如何使用它们。我正在寻找有关如何执行此操作的任何指示,并且希望得到任何帮助。

非常感谢!!

更新2024-11-07

我采纳了 @EuanG 的建议,并尝试使用 concurrent.futures 库来并行化代码。这是我更新后的代码:

import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from itertools import compress
import time
from concurrent.futures import ThreadPoolExecutor, wait



def get_driver(headless):
    """Create and return a new Chrome WebDriver.

    Args:
        headless: When truthy, Chrome is started with ``--headless=new``.

    Returns:
        The new driver; no page is loaded yet.
    """
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument("--headless=new")
    return webdriver.Chrome(executable_path=DRIVER_PATH,options=chrome_options)
   
def browserSettings(browser):
    """Open the SISAB report page in ``browser`` and apply the shared filters.

    The clicks below run in a fixed order: each dropdown is opened, its
    entries are clicked, and the dropdown is clicked again.

    Args:
        browser: Selenium WebDriver to configure.
    """
    def click(xpath):
        browser.find_element(By.XPATH,xpath).click()

    def choose_individual_service():
        browser.find_element(By.ID,"tpProducao").send_keys('Atendimento Individual')

    browser.get("https://sisab.saude.gov.br/paginas/acessoRestrito/relatorio/federal/saude/RelSauProducao.xhtml")

    # Type of service - Individual
    choose_individual_service()

    first_pass = (
        # Column = month (Competencia)
        '//*[@id="selectLinha"]/optgroup[1]/option[6]',
        # Type of team
        '//*[@id="filtroEquipeProf"]/div[1]',
        '//*[@id="filtroEquipeProf"]/div[1]/ul/li[2]/a/label/input',
        '//*[@id="filtroEquipeProf"]/div[1]/ul/li[5]/a/label/input',
        '//*[@id="filtroEquipeProf"]/div[1]/ul/li[9]/a/label/input',
        '//*[@id="filtroEquipeProf"]/div[1]',
        # Place of service
        '//*[@id="filtrosLocalTipoEstabelecimento"]/div/button',
        '//*[@id="filtrosLocalTipoEstabelecimento"]/div/ul/li[2]/a/label/input',
        '//*[@id="filtrosLocalTipoEstabelecimento"]/div/button',
    )
    for xpath in first_pass:
        click(xpath)

    # Type of service is sent a second time at this point.
    choose_individual_service()

    second_pass = (
        # Type of attendance - select all
        '//*[@id="divTipoAtendimento"]/div/button/span',
        '//*[@id="divTipoAtendimento"]/div/ul/li[1]/a/label',
        '//*[@id="divTipoAtendimento"]/div/button/span',
        # Type of professional
        '//*[@id="filtroEquipeProf"]/div[2]/button/span',
        '//*[@id="filtroEquipeProf"]/div[2]/ul/li[12]/a/label',
        '//*[@id="filtroEquipeProf"]/div[2]/ul/li[19]/a/label',
        '//*[@id="filtroEquipeProf"]/div[2]/button/span',
        # 'years' button for age
        '//*[@id="tpIdade:2"]',
    )
    for xpath in second_pass:
        click(xpath)

以及更新后的函数:

def setDxCode(code, codeType,browser):
    """Click one diagnosis code inside the page's code-selection modal.

    Callers invoke this once to select a code and a second time to unselect it.

    Args:
        code: Diagnosis code typed into the modal's search box.
        codeType: 'CID' targets the element id ending in ``-1``; any other
            value targets the id ending in ``-4``.
        browser: Selenium WebDriver showing the report page.
    """
    xpath_template = '//*[@id="cid-{}-1"]' if codeType == 'CID' else '//*[@id="cid-{}-4"]'
    entry_xpath = xpath_template.format(code)

    # Open the modal and give it a moment to appear.
    browser.find_element(By.ID,"btnAddCid").click()
    time.sleep(1)

    # Filter the list down to the requested code, click it, then reset the filter.
    search_input = browser.find_element(By.XPATH,'//*[@id="dtBasicExample_filter"]/label/input')
    search_input.clear()
    search_input.send_keys(code)
    browser.find_element(By.XPATH,entry_xpath).click()
    search_input.clear()

    # Click the second button of the modal (presumably "confirm" - verify on the page).
    browser.find_element(By.XPATH,'//*[@id="modal-default-cid"]/div/div/div[2]/button[2]/span').click()
    
# function to set age group and sex
def setAgeLimit(agelower, ageupper,browser):
    """Type the lower and upper age bounds into the page's two age inputs.

    Each input is cleared before the new value is typed.

    Args:
        agelower: Lower bound, written to the "idadeInicio" input.
        ageupper: Upper bound, written to the "idadeFim" input.
        browser: Selenium WebDriver showing the report page.
    """
    bounds = (
        ('//*[@id="idadeInicio"]', agelower),
        ('//*[@id="idadeFim"]', ageupper),
    )
    for field_xpath, bound in bounds:
        age_field = browser.find_element(By.XPATH,field_xpath)
        age_field.clear()
        age_field.send_keys(str(bound))
    
def setSex(browser,sex = 1):
    """Click one entry of the sex filter dropdown.

    The dropdown button is clicked before and after the entry, so the
    dropdown is toggled twice per call.

    Args:
        browser: Selenium WebDriver showing the report page.
        sex: 1-based position of the list entry to click.
    """
    dropdown_xpath = '//*[@id="filtrosCaracteristicaUsuario"]/div/button/span'
    entry_xpath = '//*[@id="filtrosCaracteristicaUsuario"]/div/ul/li[{}]/a/label'.format(sex)

    for xpath in (dropdown_xpath, entry_xpath, dropdown_xpath):
        browser.find_element(By.XPATH,xpath).click()
    
# Get list of all months in the months column in the webpage
#monthBoxes = driver.find_elements(By.CSS_SELECTOR,"#competencia > div > ul > li > a> label > input")
#yearMonth = []
#for i in range(len(monthBoxes)):
#    yearMonth.append(monthBoxes[i].get_attribute("value"))

def GetYearMonth(browser):
    """Return the "value" attribute of every month checkbox, in page order.

    Args:
        browser: Selenium WebDriver showing the report page.
    """
    checkboxes = browser.find_elements(By.CSS_SELECTOR,"#competencia > div > ul > li > a> label > input")
    return [box.get_attribute("value") for box in checkboxes]

def getMonthsforYear(year,browser):
    """Return the month values read from the page that contain ``year``.

    The match is a substring test on the string form of ``year``.
    """
    available = GetYearMonth(browser = browser)
    needle = str(year)
    return [entry for entry in available if needle in entry]

def ClickMonthCheckbox(monthsList,browser):
    """Click the month dropdown button, then click the checkbox of each listed month.

    Args:
        monthsList: Values that must all be present among the page's month values.
        browser: Selenium WebDriver showing the report page.
    """
    # Read the month values first; their order gives each checkbox's position.
    available = GetYearMonth(browser = browser)
    browser.find_element(By.XPATH,'//*[@id="competencia"]/div/button/span').click()
    for wanted in monthsList:
        position = available.index(wanted) + 1  # XPath positions are 1-based
        checkbox_xpath = '//*[@id="competencia"]/div/ul/li[{}]/a/label/input'.format(position)
        browser.find_element(By.XPATH,checkbox_xpath).click()

def SelectAllMonthsInaYear(year,browser):
    """Click every month checkbox of ``year``, then click the dropdown button again.

    Because it only clicks, calling it twice for the same year undoes the
    first call; callers use that to unselect the months.
    """
    months_of_year = getMonthsforYear(year, browser = browser)
    ClickMonthCheckbox(months_of_year, browser = browser)
    browser.find_element(By.XPATH,'//*[@id="competencia"]/div/button/span').click()
    
def checkboxIsSelected(monthsList,browser):
    """Print, for each listed month, whether its checkbox is currently selected.

    Args:
        monthsList: Values that must all be present among the page's month values.
        browser: Selenium WebDriver showing the report page.
    """
    # The global yearMonth list no longer exists in this version of the
    # script, so read the month values from this browser instead.
    yearMonth = GetYearMonth(browser = browser)
    browser.find_element(By.XPATH,'//*[@id="competencia"]/div/button/span').click()
    for month in monthsList:
        monthIdx = str(yearMonth.index(month) + 1)  # XPath positions are 1-based
        monthxpath = '//*[@id="competencia"]/div/ul/li[{}]/a/label/input'.format(monthIdx)
        checkbox = browser.find_element(By.XPATH,monthxpath)
        print(checkbox.is_selected())

def checkboxYearIsSelected(year,browser):
    """Print the selection state of every month checkbox belonging to ``year``.

    The dropdown button is clicked here before and after the check, in
    addition to the click performed inside ``checkboxIsSelected``.

    Args:
        year: Year whose months are inspected.
        browser: Selenium WebDriver showing the report page.
    """
    browser.find_element(By.XPATH,'//*[@id="competencia"]/div/button/span').click()
    # Both helpers require the browser; the original calls omitted it and
    # raised TypeError.
    TmpMonthsList = getMonthsforYear(year, browser = browser)
    checkboxIsSelected(TmpMonthsList, browser = browser)
    browser.find_element(By.XPATH,'//*[@id="competencia"]/div/button/span').click()
    
def getHTMLtable(browser, timeout = 10):
    """Submit the current query and return the result table as a DataFrame.

    Clicks the submit control, waits for the table's page-length selector,
    sets it to 100 rows, and parses the outer HTML of the element with id
    "tabela".

    Args:
        browser: Selenium WebDriver showing the report page.
        timeout: Seconds to wait for the page-length selector to appear
            after submitting.

    Returns:
        A DataFrame concatenating every table that pandas finds in that HTML.

    Raises:
        Whatever selenium raises when an element is missing or the wait
        times out; callers treat any failure as "no table for this query".
    """
    # Local import keeps this snippet self-contained.
    from io import StringIO

    browser.find_element(By.XPATH,'//*[@id="j_idt44"]/div[3]/div[1]/div/div[2]/div[2]/div[5]/div/div/div[1]/label').click() # Click on submit button
    # The result is presumably rendered some time after the click; looking the
    # selector up immediately fails whenever the page is slower than the
    # script, which shows up as seemingly random missing rows.
    length_select = WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.XPATH,'//*[@id="tabela_length"]/label/select'))
    )
    length_select.send_keys('100') # Set number of rows in table to 100
    table_html = browser.find_element(By.XPATH,'//*[@id="tabela"]').get_attribute('outerHTML')
    # Passing literal HTML to read_html is deprecated in recent pandas;
    # a file-like wrapper works on old and new versions alike.
    # thousands='.' because the page uses '.' as the thousands separator.
    tables = pd.read_html(StringIO(table_html),thousands ='.')
    return pd.concat(tables)

def SetParametersAndGetTable(agemin,agemax,sex,year, code, codeType,browser):
    """Apply one query's filters, scrape the result table, then undo the filters.

    Args:
        agemin: Lower age bound typed into the page.
        agemax: Upper age bound typed into the page.
        sex: Position of the sex entry to click (1 -> "Both sexes",
            2 -> "Males", anything else -> "Females" in the output label).
        year: Year whose month checkboxes are selected.
        code: Diagnosis code to select.
        codeType: Code family passed on to ``setDxCode``.
        browser: Selenium WebDriver showing the report page.

    Returns:
        A DataFrame with columns code, sex, agelower, ageupper, YearMonth
        ('%Y-%m' strings) and individual.visits, or None when the table
        could not be read or reshaped.
    """
    if sex == 1:
        sexL = "Both sexes"
    elif sex == 2:
        sexL = "Males"
    else:
        sexL = "Females"

    dlCode = code + "." + str(year) + "." + sexL + "." + str(agemin) + "." + str(agemax)

    setAgeLimit(agelower = agemin,ageupper = agemax,browser = browser)
    SelectAllMonthsInaYear(year,browser = browser)
    setSex(sex = sex,browser = browser)
    setDxCode(code = code, codeType = codeType,browser = browser)
    time.sleep(1)
    try:
        table_dt = getHTMLtable(browser = browser)
        table_dt['code'] = code
        table_dt['sex'] = sexL
        table_dt['agelower'] = agemin
        table_dt['ageupper'] = agemax
        # rename() returns a new frame, so the later column assignment does not
        # write into a slice of the scraped table (avoids SettingWithCopyWarning).
        table_dt = table_dt[['code','sex', 'agelower', 'ageupper','Competência','Atendimento Individual']].rename(
            columns = {'Competência':"YearMonth",'Atendimento Individual':"individual.visits"})
        table_dt['YearMonth'] = pd.to_datetime(table_dt['YearMonth'], format = "%Y%m").dt.strftime('%Y-%m')
        return table_dt
    except Exception:
        # Exception (not a bare except) so Ctrl-C still stops a long run.
        print(f'Table not found for {dlCode}')
        return None
    finally:
        # The filters are toggles: clicking the same entries again unselects
        # them. Doing this exactly once, on success and on failure, keeps the
        # page clean for the next query.
        setDxCode(code = code, codeType = codeType,browser = browser) # Unselect code
        time.sleep(1)
        setSex(sex = sex,browser = browser) # Unselect Sex
        SelectAllMonthsInaYear(year,browser = browser) # Unselect yearMonth parameters

def getVarsFromDF(df,rownum):
    """Collect one row's query parameters plus its derived key and file name.

    Args:
        df: DataFrame with the columns year, sex, agelower, ageupper, code
            and codeType.
        rownum: Index label of the row to read (used with ``df.loc``).

    Returns:
        A dict with the six raw values, ``sexL`` (display label for the
        sex code), ``anx_dep_key`` (dot-joined identifier) and
        ``table_name`` (target Excel path).
    """
    year, sex, agelower, ageupper, code, codeType = (
        df.loc[rownum, column]
        for column in ('year', 'sex', 'agelower', 'ageupper', 'code', 'codeType')
    )

    # 1 and 2 have their own labels; every other value is labelled "Females".
    sexL = "Both sexes" if sex == 1 else ("Males" if sex == 2 else "Females")

    anx_dep_key = ".".join((code, str(year), str(sex), str(agelower), str(ageupper)))
    table_name = "~/sisab/data/anxiety_depression_with_codes/parallel_download/sisab_anx_dep_code_{}_{}_{}_{}_to_{}.xlsx".format(code,str(year),sexL,str(agelower),str(ageupper))

    return {
        'year': year,
        'sex': sex,
        'sexL': sexL,
        'ageupper': ageupper,
        'agelower': agelower,
        'code': code,
        'codeType': codeType,
        'anx_dep_key': anx_dep_key,
        'table_name': table_name,
    }
        
def getRowArgs(df,rownum,browser):
    """Scrape, save and return the table for one row of the combinations frame.

    On success the table is written to an Excel file and stored in the
    module-level ``Anx_Dep_dict``. On any failure the row's parameters are
    appended to the module-level ``noRun`` list.

    Args:
        df: DataFrame with the columns year, sex, agelower, ageupper, code
            and codeType.
        rownum: Index label of the row to process (used with ``df.loc``).
        browser: Selenium WebDriver already configured for the report page.

    Returns:
        The scraped DataFrame, or None when the row could not be processed.
    """
    year = df.loc[rownum,'year']
    sex = df.loc[rownum,'sex']
    agelower = df.loc[rownum,'agelower']
    ageupper = df.loc[rownum,'ageupper']
    code = df.loc[rownum,'code']
    codeType = df.loc[rownum,'codeType']

    if sex == 1:
        sexL = "Both sexes"
    elif sex == 2:
        sexL = "Males"
    else:
        sexL = "Females"

    anx_dep_key = code + "." + str(year) + "." + str(sex) + "." + str(agelower) + "." + str(ageupper)
    print(anx_dep_key)
    table_name = "~/sisab/data/anxiety_depression_with_codes/parallel_download/sisab_anx_dep_code_{}_{}_{}_{}_to_{}.xlsx".format(code,str(year),sexL,str(agelower),str(ageupper))
    print(table_name)

    try:
        print("Scraping code {}, year {}, for {} in age range {} to {}".format(code,str(year),sexL,str(agelower),str(ageupper)))

        table_dt = SetParametersAndGetTable(agemin = agelower,agemax = ageupper,sex = sex,year = year, code = code, codeType = codeType,browser = browser)
        if table_dt is None:
            # The scraper signals "no table" with None; make that explicit
            # instead of relying on an AttributeError from None.to_excel.
            raise ValueError("no table returned for {}".format(anx_dep_key))
        table_dt.to_excel(table_name,index = False,header = True)
        Anx_Dep_dict[anx_dep_key] = table_dt
        return table_dt
    except Exception as err:
        # Report why the row failed so skipped rows can be diagnosed later.
        print("Row {} not run: {!r}".format(anx_dep_key, err))
        notRun = {'Code':code,'codeType':codeType,'Year':year,'Sex':sex,'agelower':agelower,'ageupper':ageupper}
        noRun.append(notRun)
        return None
        
def run_process(headless,df, number):
    """Scrape one row of ``df`` in a browser of its own.

    Args:
        headless: Passed to ``get_driver``; truthy starts Chrome headless.
            The original ignored this argument and always opened a visible
            window.
        df: Combinations DataFrame.
        number: Index label of the row to scrape.

    Returns:
        The DataFrame returned by ``getRowArgs``, or None when that row failed.
    """
    Browser = get_driver(headless = headless)
    try:
        browserSettings(browser = Browser)
        return getRowArgs(df = df, rownum = number,browser = Browser)
    finally:
        # quit() ends the whole WebDriver session, whereas close() only closes
        # the current window. Running it in finally stops browsers piling up
        # when page setup raises.
        Browser.quit()

最后在多线程中运行代码:

# Parameters of rows that could not be scraped (filled by getRowArgs).
noRun = []
# Scraped tables keyed by "code.year.sex.agelower.ageupper" (filled by getRowArgs).
Anx_Dep_dict = {}
# One future per row of combns, in row order.
futures = []
# Row numbers whose task raised instead of returning (e.g. browser start-up
# or page setup failed); these rows never reach noRun.
failedRows = []

if __name__ == '__main__':
    # scrape and crawl
    with ThreadPoolExecutor(max_workers = 3) as executor:
        for number in range(0,combns.shape[0]):
            futures.append(
                executor.submit(run_process, headless = True, df = combns, number = number)
            )

    # Leaving the with-block already waits for every task; wait() is kept
    # for clarity and now sits inside the __main__ guard.
    wait(futures)

    # A task that raised stores its exception in the future and stays silent
    # unless asked. Surfacing it here is what reveals rows that were dropped
    # without being written or recorded in noRun.
    for number, future in enumerate(futures):
        error = future.exception()
        if error is not None:
            failedRows.append(number)
            print("Row {} raised {!r}".format(number, error))

    print("{} rows raised an exception; {} rows were recorded in noRun".format(len(failedRows), len(noRun)))

虽然这看起来效果不错,但我遇到了部分行被遗漏的问题,而且遗漏看起来是随机的。我不确定该如何诊断或解决这个问题。有什么建议吗?

再次感谢!

python selenium-webdriver parallel-processing concurrent.futures
1个回答
-2
投票

我建议您使用 DrissionPage python 库进行网页抓取和自动化项目。

DrissionPage是一个基于Python的Web自动化工具。 它可以控制浏览器、发送和接收数据包,并将两者合而为一。 可以兼顾浏览器自动化的便利性和请求的高效率。 它功能强大,内置众多人性化设计和便捷功能。 其语法简洁优雅,代码量少,对新手友好。

请参考以下网址: https://drissionpage.cn/dp40docs/

© www.soinside.com 2019 - 2024. All rights reserved.