我正在做一个大学项目,但我需要让下面的代码与 socks4(而不是 tor/socks5)一起工作。我尝试把 SOCKS5Agent 修改为 SOCKS4Agent,但随后收到如下错误:
原始代码:https://stackoverflow.com/a/33944924/11219616
我的代码:
import scrapy.core.downloader.handlers.http11 as handler
from twisted.internet import reactor
from txsocksx.http import SOCKS4Agent
from twisted.internet.endpoints import TCP4ClientEndpoint
from scrapy.core.downloader.webclient import _parse
class TorScrapyAgent(handler.ScrapyAgent):
    """Scrapy agent that tunnels requests through a SOCKS4 proxy.

    When ``request.meta['proxy']`` holds a ``socks4://`` URL, the request
    is sent through a txsocksx SOCKS4Agent; otherwise the stock Scrapy
    agent behaviour is used unchanged.
    """

    _Agent = SOCKS4Agent

    def _get_agent(self, request, timeout):
        # Guard clauses: fall back to the default agent unless a
        # socks4:// proxy is explicitly configured for this request.
        proxy_url = request.meta.get('proxy')
        if not proxy_url:
            return super(TorScrapyAgent, self)._get_agent(request, timeout)

        scheme, _, host, port, _ = _parse(proxy_url)
        if scheme != 'socks4':
            return super(TorScrapyAgent, self)._get_agent(request, timeout)

        proxy_endpoint = TCP4ClientEndpoint(reactor, host, port)
        return self._Agent(reactor, proxyEndpoint=proxy_endpoint)
class TorHTTPDownloadHandler(handler.HTTP11DownloadHandler):
    """HTTP11 download handler that delegates every download to
    :class:`TorScrapyAgent` so socks4 proxies are honoured."""

    def download_request(self, request, spider):
        # Per-spider size limits take precedence over the handler defaults.
        max_size = getattr(spider, 'download_maxsize', self._default_maxsize)
        warn_size = getattr(spider, 'download_warnsize', self._default_warnsize)
        agent = TorScrapyAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=max_size,
            warnsize=warn_size,
        )
        return agent.download_request(request)
我收到错误:
Traceback (most recent call last):
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "C:\Python27\lib\site-packages\twisted\python\failure.py", line 491, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "C:\Python27\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\Python27\lib\site-packages\ometa\protocol.py", line 53, in dataReceived
self._parser.receive(data)
File "C:\Python27\lib\site-packages\ometa\tube.py", line 41, in receive
status = self._interp.receive(data)
File "C:\Python27\lib\site-packages\ometa\interp.py", line 48, in receive
for x in self.next:
File "C:\Python27\lib\site-packages\ometa\interp.py", line 177, in apply
for x in self._apply(f, ruleName, argvals):
File "C:\Python27\lib\site-packages\ometa\interp.py", line 110, in _apply
for x in rule():
File "C:\Python27\lib\site-packages\ometa\interp.py", line 256, in parse_Or
for x in self._eval(subexpr):
File "C:\Python27\lib\site-packages\ometa\interp.py", line 241, in parse_And
for x in self._eval(subexpr):
File "C:\Python27\lib\site-packages\ometa\interp.py", line 440, in parse_Action
val = eval(expr.data, self.globals, self._localsStack[-1])
File "<string>", line 1, in <module>
File "C:\Python27\lib\site-packages\txsocksx\client.py", line 276, in serverResponse
raise e.socks4ErrorMap.get(status)()
RequestRejectedOrFailed
我从 @drunkpig 在 https://github.com/scrapy/scrapy/issues/747 的回答中得到了这个方法,并将其修改为 Socks4。
txsocksx
必须从 https://github.com/unk2k/txsocksx 安装,如果您使用 python3.x
有关替代方案,请参阅https://github.com/habnabit/txsocksx#19
第1步
settings.py
添加以下行,告诉程序在哪里可以找到 socks4 代理
PROXY_FILE = os.path.dirname(__file__) +"/US_100_S5_20220531.txt"
第2步
最重要的部分来了:
s4downloader.py
from typing import List
from txsocksx.http import SOCKS4Agent
from twisted.internet import reactor
from twisted.internet.endpoints import TCP4ClientEndpoint
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, ScrapyAgent
import random
from urllib.parse import urlsplit
from loguru import logger
# Ref https://txsocksx.readthedocs.io/en/latest/#txsocksx.http.SOCKS4Agent
import certifi, os
os.environ["SSL_CERT_FILE"] = certifi.where() # if not setted , you'll got an ERROR : certificate verify failed')] [<twisted.python.failure.Failure OpenSSL.SSL.Error: [('STORE routines', '', 'unregistered scheme')
class Socks4DownloadHandler(HTTP11DownloadHandler):
    """HTTP11 download handler that routes downloads through SOCKS4
    proxies via :class:`ScrapySocks4Agent`."""

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download."""
        # The agent needs the spider's settings to locate PROXY_FILE.
        proxy_agent = ScrapySocks4Agent(
            spider.settings,
            contextFactory=self._contextFactory,
            pool=self._pool,
            crawler=self._crawler,
        )
        return proxy_agent.download_request(request)
class ScrapySocks4Agent(ScrapyAgent):
    """Scrapy agent that sends each request through a SOCKS4 proxy chosen
    at random from a pool loaded from the ``PROXY_FILE`` setting."""

    def __init__(self, settings, **kwargs):
        """Initialise the agent and load the proxy pool once.

        :param settings: Scrapy settings; must provide ``PROXY_FILE``, the
            path of a text file with one proxy per line, in the form
            ``host:port`` or ``user:pass@host:port``.
        :param kwargs: forwarded unchanged to ``ScrapyAgent``.
        """
        super(ScrapySocks4Agent, self).__init__(**kwargs)
        self.__proxy_file = settings['PROXY_FILE']
        self._s4proxy_pool: List = self.__get_s4proxy_pool()

    def _get_agent(self, request, timeout):
        """Build a SOCKS4Agent wired to one randomly chosen proxy."""
        _, proxy_host, proxy_port, proxy_user, proxy_pass = self.__random_choose_proxy()
        # urlsplit() yields None when a proxy line has no credentials;
        # the original bytes(map(ord, ...)) crashed with TypeError in that
        # case. Encode to bytes (txsocksx expects bytes), using the empty
        # string for anonymous proxies. latin-1 reproduces the byte-per-
        # codepoint mapping of the old map(ord, ...) idiom.
        proxy_user = (proxy_user or '').encode('latin-1')
        proxy_pass = (proxy_pass or '').encode('latin-1')
        proxyEndpoint = TCP4ClientEndpoint(reactor, proxy_host, proxy_port)
        # NOTE(review): 'login' is the SOCKS5 auth-method name; the
        # installed txsocksx fork appears to accept it here — confirm
        # against that fork's SOCKS4Agent/endpoint signature.
        agent = SOCKS4Agent(reactor, proxyEndpoint=proxyEndpoint,
                            endpointArgs=dict(methods={'login': [proxy_user, proxy_pass]}))
        return agent

    def __get_s4proxy_pool(self) -> List:
        """Parse the proxy file into (schema, host, port, user, pass) tuples.

        Blank lines and lines starting with ``#`` are skipped.

        :return: list of 5-tuples; user/pass are None when absent.
        """
        proxy_list = []
        with open(self.__proxy_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                # Prefix a scheme so urlsplit extracts host/port/credentials.
                info = urlsplit(f"socks4://{line}")
                proxy_list.append((info.scheme, info.hostname, info.port,
                                   info.username, info.password))
        return proxy_list

    def __random_choose_proxy(self):
        """Return one random (schema, host, port, user, pass) tuple and log it."""
        p = random.choice(self._s4proxy_pool)
        logger.info("use proxy {}", p)
        return p
第三步
最后,告诉你的蜘蛛使用 Socks4DownloadHandler:
class MySpider(scrapy.Spider):
    """Example spider that routes all HTTP(S) traffic through the
    SOCKS4-aware download handler."""

    name = "myname"
    allowed_domains = ["oh.com"]

    # Both schemes share the same handler path; name it once.
    _HANDLER = "asos.s4downloader.Socks4DownloadHandler"

    custom_settings = {
        # other configurations
        "DOWNLOAD_HANDLERS": {
            "http": _HANDLER,
            "https": _HANDLER,
        },
        # other configurations
    }
第四步
$scrapy crawl you-spider