我想用 asyncio/aiohttp 并行发送 Web 请求,从 https://www.officialcharts.com/ 抓取一些数据。我参照一篇文章(原文此处为链接)中给出的代码做了实现。
我先后尝试了两种不同的做法。第一种如下。
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import re
import json
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
from IPython.display import clear_output
import memory_profiler
import spotipy
import spotipy.util as util
import pandas as pd
from more_itertools import unique_everseen
# Build the list of chart-week stamps used to form the chart URLs:
# 2500 consecutive weekly dates formatted as YYYYMMDD.
weeks = []
d = date(1970, 1, 1)
# Move forward to the first date whose weekday() is 6 (Sunday).
d += timedelta(days=6 - d.weekday())
# Both statements belong to the loop body: record the current week,
# then step one week ahead. The loop index itself is not needed.
for _ in range(2500):
    weeks.append(d.strftime('%Y%m%d'))
    d += timedelta(days=7)
import asyncio
from aiohttp import ClientSession
import nest_asyncio
# NOTE(review): presumably required because this script is run from
# IPython/Jupyter (IPython.display is imported above), where an event loop
# is already running — confirm.
nest_asyncio.apply()
# Module-level accumulator: run() appends the list of gathered responses here.
result = []
async def fetch(url, session):
    """Issue a GET for *url* through *session* and return what read() yields.

    Args:
        url: Address to request.
        session: Object whose ``get(url)`` is usable as an async context
            manager (an aiohttp ``ClientSession`` in this script).
    """
    async with session.get(url) as resp:
        body = await resp.read()
    return body
async def run(r):
    """Fetch the chart pages for the first *r* entries of ``weeks``.

    All requests are scheduled at once inside a single ClientSession, so the
    connection pool is shared. The gathered list of results is appended to
    the module-level ``result`` list; nothing is returned.
    """
    async with ClientSession() as session:
        # Schedule one task per week; every task starts immediately.
        pending = [
            asyncio.ensure_future(
                fetch(
                    'https://www.officialcharts.com/charts/singles-chart/' + weeks[idx] + '/',
                    session,
                )
            )
            for idx in range(r)
        ]
        # gather() keeps the results in the same order as the tasks.
        bodies = await asyncio.gather(*pending)
        result.append(bodies)
# Drive run() for the first 5 chart weeks to completion on the event loop.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(5))
loop.run_until_complete(future)
print('Done')
# run() appended the gathered list to `result`; report whether its first
# entry is missing. None is a singleton, so test identity rather than
# equality (PEP 8).
print(result[0][0] is None)
上面这段代码的问题是:一旦发出的请求超过 1000 个,它就会失败。
那篇文章的作者为了解决这个问题给出了另一种实现,并声称用它可以发出多达 1 万(10K)个请求。我照着他的第二种实现写了下面的代码。
import random
import asyncio
from aiohttp import ClientSession
import nest_asyncio
# NOTE(review): presumably required because this script is run from
# IPython/Jupyter, where an event loop is already running — confirm.
nest_asyncio.apply()
# Module-level accumulator: run() appends the list returned by gather() here.
result = []
async def fetch(url, session):
    """GET *url* via *session*, log two response headers, return the body.

    Prints the ``DATE`` and ``DELAY`` response headers (``None`` when a
    header is absent) together with the final response URL, then returns
    whatever ``response.read()`` yields.
    """
    async with session.get(url) as resp:
        headers = resp.headers
        delay_header = headers.get("DELAY")
        # Named *_header so the local does not shadow an imported `date`.
        date_header = headers.get("DATE")
        print("{}:{} with delay {}".format(date_header, resp.url, delay_header))
        payload = await resp.read()
    return payload
async def bound_fetch(sem, url, session):
    """Fetch *url* while holding *sem* and return the fetched body.

    Args:
        sem: Semaphore limiting how many fetches run at the same time.
        url: Address to request.
        session: Client session passed through to ``fetch``.

    Returns:
        Whatever ``fetch(url, session)`` returns.
    """
    async with sem:
        # The result must be returned. Awaiting fetch() without `return`
        # makes this coroutine resolve to None, so gather() in the caller
        # would collect a list of None instead of the response bodies.
        return await fetch(url, session)
async def run(r):
    """Schedule semaphore-guarded fetches for the first *r* entries of ``weeks``.

    One ClientSession is shared by every request and a Semaphore(1000) is
    handed to each ``bound_fetch`` task. The list produced by gather() (one
    entry per task, in task order) is appended to the module-level
    ``result``; nothing is returned.
    """
    # Upper bound on the number of fetches allowed inside the semaphore at once.
    gate = asyncio.Semaphore(1000)
    async with ClientSession() as session:
        pending = [
            asyncio.ensure_future(
                bound_fetch(
                    gate,
                    'https://www.officialcharts.com/charts/singles-chart/' + weeks[idx] + '/',
                    session,
                )
            )
            for idx in range(r)
        ]
        outcomes = await asyncio.gather(*pending)
        result.append(outcomes)
# Number of chart weeks to request in this trial run.
number = 5
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(number))
loop.run_until_complete(future)
print('Done')
# Report whether the first gathered entry is missing. None is a singleton,
# so test identity rather than equality (PEP 8).
print(result[0][0] is None)
但不知为什么,这段代码拿不到任何响应内容。
PS:我并非计算机专业出身,写这些只是出于兴趣,所以并不清楚 asyncio 代码内部到底发生了什么。
尝试使用最新版本。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# python 3.7.2
from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
# Size of the semaphore created in scrape(); bounds how many scrape_one
# calls may be in progress at once.
limit = 10
# HTTP status codes scrape_one accepts; any other status makes it return False.
http_ok = [200]
async def scrape(url_list):
    """Scrape every URL in *url_list* and return the results in order.

    A single ClientSession is shared by all requests, and a semaphore of
    size ``limit`` is passed to each ``scrape_bounded`` task.

    Returns:
        A list with one entry per URL, in the same order as *url_list*;
        each entry is whatever ``scrape_bounded`` returned for that URL.
    """
    gate = Semaphore(limit)
    async with ClientSession() as session:
        pending = [
            ensure_future(scrape_bounded(target, gate, session))
            for target in url_list
        ]
        # Must be awaited while the session is still open.
        outcomes = await gather(*pending)
    return outcomes
async def scrape_bounded(url, sem, session):
    """Run ``scrape_one(url, session)`` while holding *sem*; return its result."""
    async with sem:
        outcome = await scrape_one(url, session)
    return outcome
async def scrape_one(url, session):
    """Download *url* through *session*.

    Args:
        url: Address to request.
        session: Client session used to issue the GET request.

    Returns:
        The body obtained from ``response.read()`` when the status code is
        listed in ``http_ok``; ``False`` when the connection could not be
        established or the status code is not accepted.
    """
    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        # print() does not interpolate %-style placeholders from extra
        # arguments (unlike logging calls), so build the message explicitly.
        print(f'Scraping {url} failed due to the connection problem')
        return False
    # The status is still readable after the response context has exited.
    if response.status not in http_ok:
        print(f'Scraping {url} failed due to the return code {response.status}')
        return False
    return content
if __name__ == '__main__':
    # Demo: scrape two sample endpoints and print the list of results.
    urls = [
        'http://demin.co:8080/1',
        'http://demin.co:8080/2',
    ]
    # `run` is asyncio.run: it creates the loop, runs scrape(), and closes it.
    res = run(scrape(urls))
    print(res)
这是从一个真实项目中抽取出来的模板,可以按预期工作。