我想使用 Selenium 抓取网页,但无法绕过 Cloudflare 的机器人检查。之前我一直使用下面的代码从该网站(见代码中的 url)获取数据表,但现在由于 Cloudflare 的机器人检查,下面的代码失败了。我正在使用
undetected_chromedriver
但这似乎也不起作用。
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd
from datetime import datetime
# Silence pandas' chained-assignment warning for the whole script.
pd.options.mode.chained_assignment = None

# Previous location of the same report:
# https://www.nccpl.com.pk/en/market-information/fipi-lipi/fipi-sector-wise
FIPI_URL = 'https://www.nccpl.com.pk/en/portfolio-investments/fipi-sector-wise'

# Column names applied to the scraped table, in the order the table is read.
RAW_COLUMNS = ['CLIENT_TYPE', 'SEC_CODE', 'SECTOR_NAME', 'MARKET_TYPE',
               'BUY_VOLUME', 'BUY_VALUE', 'SELL_VOLUME', 'SELL_VALUE',
               'NET_VOLUME', 'NET_VALUE', 'USD']
# Columns that are converted from formatted text to float.
NUMERIC_COLUMNS = ['BUY_VOLUME', 'BUY_VALUE', 'SELL_VOLUME', 'SELL_VALUE',
                   'NET_VOLUME', 'NET_VALUE', 'USD']
# Final column order: 'Date' is inserted right after 'CLIENT_TYPE'.
OUTPUT_COLUMNS = ['CLIENT_TYPE', 'Date'] + RAW_COLUMNS[1:]


def format_picker_date(day):
    """Return *day* as the text written into the date pickers.

    The format is ``D/MM/YYYY``: the day has no leading zero, the month is
    zero-padded and the year has four digits (e.g. ``5/02/2024``).
    """
    return '{}/{}/{}'.format(day.day, day.strftime('%m'), day.strftime('%Y'))


def fill_date_picker(browser, element_id, picker_date, after_click, after_fill):
    """Scroll to the input with id *element_id*, click it and set its value.

    *after_click* and *after_fill* are the pauses (in seconds) taken after
    clicking the input and after writing the value, respectively.
    """
    picker = wait(browser, 10).until(
        EC.presence_of_element_located((By.ID, element_id)))
    t.sleep(2)
    browser.execute_script('arguments[0].scrollIntoView();', picker)
    picker.click()
    t.sleep(after_click)
    # Pass the date as a script argument instead of splicing it into the
    # JavaScript source, so no quoting/escaping is involved.
    browser.execute_script('arguments[0].value = arguments[1];', picker, picker_date)
    t.sleep(after_fill)


def fetch_fipi_table(picker_date):
    """Load the report page, search for *picker_date* and return the first HTML table.

    The same date is written into both date pickers. The browser is always
    closed, even when a step fails, so failed runs do not leak Chrome
    processes.
    """
    from io import StringIO  # local import: only needed for read_html below

    # NOTE(review): headless mode may itself be what the bot check detects;
    # try running without '--headless' if the page never loads -- confirm.
    options = Options()
    options.add_argument('--headless')
    browser = uc.Chrome(options=options)
    try:
        browser.get(FIPI_URL)
        fill_date_picker(browser, 'popupDatepicker', picker_date, 3, 5)
        fill_date_picker(browser, 'popupDatepicker1', picker_date, 4, 1)
        search_button = browser.find_element(
            By.XPATH, '//button[@class="search_btn"]/parent::div')
        t.sleep(2)
        search_button.click()
        # Give the page time to render the result table before reading it.
        t.sleep(10)
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas versions.
        return pd.read_html(StringIO(browser.page_source))[0]
    finally:
        browser.quit()


def tidy_fipi_table(raw, report_date):
    """Return a cleaned copy of the scraped table *raw*.

    Steps: name the eleven columns, add a ``Date`` column holding
    *report_date*, drop rows whose ``MARKET_TYPE`` is ``'TOTAL'``, relabel the
    last remaining row as the ``FIPI_NET`` / ``x9999`` / ``All`` row, and
    convert the numeric columns to float (``$``, ``,`` and ``)`` are removed,
    ``(`` becomes a minus sign, so ``(1,500)`` turns into ``-1500.0``).
    """
    table = raw.copy()
    table.columns = RAW_COLUMNS
    table['Date'] = report_date
    table = table[OUTPUT_COLUMNS]
    table = table[table.MARKET_TYPE != 'TOTAL'].copy()
    # Positions after reordering: 0 = CLIENT_TYPE, 2 = SEC_CODE, 3 = SECTOR_NAME.
    table.iloc[-1:, 0] = 'FIPI_NET'
    table.iloc[-1:, 2:4] = ['x9999', 'All']
    table[NUMERIC_COLUMNS] = (table[NUMERIC_COLUMNS]
                              .replace(r'[\$,)]', '', regex=True)
                              .replace('[(]', '-', regex=True)
                              .astype(float))
    return table


dates = pd.bdate_range('2024-02-28', '2024-02-28')
for d in dates:
    try:
        report_date = datetime(d.year, d.month, d.day)
        df_lipi = tidy_fipi_table(
            fetch_fipi_table(format_picker_date(report_date)), report_date)
    except Exception as exc:
        # Report which date failed and why, then carry on with the next date.
        print(d, repr(exc))
有没有办法绕过 Cloudflare?我添加了
headless
的参数,但这似乎并不能解决问题。
undetected_chromedriver 通常可以帮助绕过检测,但有时也会失败
原因可能是脚本的行为不像真人操作
尝试添加随机时间延迟
替代解决方案
尝试轮换您的 IP 地址
使用 requests-rotating-proxy 包
另外,尝试添加随机的请求头(例如随机的 User-Agent)
# randint is not imported anywhere in the snippet; without this import the
# line below raises NameError.
from random import randint

# Append a random number to the User-Agent string so it differs between runs.
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + str(randint(1, 100)))