我正在尝试在此站点上自动进行网络抓取,通常运行一次之后它就会返回 HTTP 403 错误。我尝试按照此处类似问题中的建议更改用户代理,但无济于事。最初我计划抓取 100 多个结果,后来改成只抓取 19 个结果,但仍然被阻止。有什么方法可以避免被该网站以 403 拒绝访问吗?任何帮助都将不胜感激!
代码
from urllib.request import Request, urlopen
import requests
from itertools import cycle
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
# Walk the JAMA Network search-result pages and collect the URL of every
# article listed, accumulating them in `journal_url` for the detail pass.
journal_url = []
pagesToScrape = 1  # number of search-result pages to fetch

for page in range(1, pagesToScrape + 1):
    search_url = (
        "https://jamanetwork.com/searchresults?q=vegan"
        "&exPrm_qqq=%7bDEFAULT_BOOST_FUNCTION%7d%22vegan%22"
        f"&exPrm_hl.q=vegan&page={page}"
    )
    # A browser-like User-Agent; the bare urllib default gets rejected.
    request = Request(search_url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(request).read()
    soup = BeautifulSoup(html, "html.parser")

    # Each search hit renders its title inside an <h3> wrapping an <a>.
    headings = soup.find_all('h3', attrs={'class': 'article--title at-sr-item-title-link'})
    journal_url.extend(h.a['href'] for h in headings)
# Visit each collected article page and extract its title, acceptance date,
# and online-publication date; one row per article is appended to `date`.
date = []
for journal in journal_url:
    journal_req = Request(journal, headers={'User-Agent': 'Mozilla/5.0'})
    journal_webpage = urlopen(journal_req).read()
    journal_soup = BeautifulSoup(journal_webpage, "html.parser")

    # Article title; skip pages that have none (e.g. non-article results).
    try:
        title = journal_soup.find('h1', attrs={'class': 'meta-article-title '}).get_text()
    except AttributeError:
        continue

    # Acceptance date.  Fall back to the submission date when the page has
    # no "Accepted for Publication" entry.  (The original code placed a bare
    # `except:` after `except AttributeError: continue`, which both swallowed
    # every other exception and made the fallback unreachable: a missing tag
    # raises AttributeError, which the first handler consumed.)
    try:
        accept = journal_soup.find("strong", text="Accepted for Publication:").next_sibling.strip(" .")
    except AttributeError:
        try:
            accept = journal_soup.find("strong", text="Submitted for Publication:").next_sibling.strip(" ;")
        except AttributeError:
            continue

    try:
        accept = datetime.strptime(accept, '%B %d, %Y')  # str -> datetime
    except ValueError:
        continue

    # Online-publication date; the trailing ". doi:" boilerplate is stripped.
    try:
        publish = journal_soup.find("strong", text="Published Online:").next_sibling.strip(". doi:")
    except AttributeError:
        continue

    try:
        # BUG FIX: the original parsed `accept` here a second time, so
        # `publish` always equalled `accept` and the computed interval
        # below was always zero.
        publish = datetime.strptime(publish, '%B %d, %Y')  # str -> datetime
    except ValueError:
        continue

    timeBetweenPublish = publish - accept  # timedelta between the two dates
    date.append([title, accept, publish, timeBetweenPublish])
    print(title)
# Persist the collected rows to disk as a CSV file.
column_names = [
    'Title',
    'Accepted for Publication Date',
    'Published Online Date',
    'Time Between Accepted Date and Published Date',
]
df = pd.DataFrame(date, columns=column_names)
df.to_csv('date.csv')