How to write a DownloadHandler for Scrapy that makes SOCKS4 requests through txsocksx

Question

I'm working on a university project, and I need to make the code below work with SOCKS4 instead of Tor/SOCKS5. I tried changing SOCKS5Agent to SOCKS4Agent, but then I get the error shown below.

Original code: https://stackoverflow.com/a/33944924/11219616

My code:

import scrapy.core.downloader.handlers.http11 as handler
from twisted.internet import reactor
from txsocksx.http import SOCKS4Agent
from twisted.internet.endpoints import TCP4ClientEndpoint
from scrapy.core.downloader.webclient import _parse


class TorScrapyAgent(handler.ScrapyAgent):
    _Agent = SOCKS4Agent

    def _get_agent(self, request, timeout):
        proxy = request.meta.get('proxy')

        if proxy:
            # _parse returns (scheme, netloc, host, port, path)
            proxy_scheme, _, proxy_host, proxy_port, _ = _parse(proxy)

            if proxy_scheme == 'socks4':
                endpoint = TCP4ClientEndpoint(reactor, proxy_host, proxy_port)
                # route the request through the SOCKS4 proxy endpoint
                return self._Agent(reactor, proxyEndpoint=endpoint)

        return super(TorScrapyAgent, self)._get_agent(request, timeout)


class TorHTTPDownloadHandler(handler.HTTP11DownloadHandler):
    def download_request(self, request, spider):
        agent = TorScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
                               maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
                               warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))

        return agent.download_request(request)
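
To exercise a handler like this, it has to be registered through DOWNLOAD_HANDLERS in settings.py; the module path below is a hypothetical example:

DOWNLOAD_HANDLERS = {
    'http': 'myproject.handlers.TorHTTPDownloadHandler',   # hypothetical path
    'https': 'myproject.handlers.TorHTTPDownloadHandler',
}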

I get this error:

Traceback (most recent call last):
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
File "C:\Python27\lib\site-packages\twisted\python\failure.py", line 491, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
File "C:\Python27\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
    defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\Python27\lib\site-packages\ometa\protocol.py", line 53, in dataReceived
    self._parser.receive(data)
File "C:\Python27\lib\site-packages\ometa\tube.py", line 41, in receive
    status = self._interp.receive(data)
File "C:\Python27\lib\site-packages\ometa\interp.py", line 48, in receive
    for x in self.next:
File "C:\Python27\lib\site-packages\ometa\interp.py", line 177, in apply
    for x in self._apply(f, ruleName, argvals):
File "C:\Python27\lib\site-packages\ometa\interp.py", line 110, in _apply
    for x in rule():
File "C:\Python27\lib\site-packages\ometa\interp.py", line 256, in parse_Or
    for x in self._eval(subexpr):
File "C:\Python27\lib\site-packages\ometa\interp.py", line 241, in parse_And
    for x in self._eval(subexpr):
File "C:\Python27\lib\site-packages\ometa\interp.py", line 440, in parse_Action
    val = eval(expr.data, self.globals, self._localsStack[-1])
File "<string>", line 1, in <module>
File "C:\Python27\lib\site-packages\txsocksx\client.py", line 276, in serverResponse
    raise e.socks4ErrorMap.get(status)()
RequestRejectedOrFailed
Tags: python, python-2.7, scrapy, twisted
1 Answer

I took this from @drunkpig's answer on https://github.com/scrapy/scrapy/issues/747 and adapted it for SOCKS4 as well.

If you're on Python 3.x, txsocksx has to be installed from https://github.com/unk2k/txsocksx.

For an alternative, see https://github.com/habnabit/txsocksx#19
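
Assuming the fork keeps a standard setup script, installing straight from Git would look something like this (pip's usual VCS-install syntax; adjust the URL/branch to whatever the fork actually provides):

$ pip install git+https://github.com/unk2k/txsocksx.git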

Step 1

In settings.py, add the following line to tell the program where to find the SOCKS4 proxies:

import os

PROXY_FILE = os.path.dirname(__file__) + "/US_100_S5_20220531.txt"
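
The pool loader in Step 2 wraps each non-comment line in socks4:// and parses it with urlsplit, so the file is expected to hold one user:pass@host:port entry per line (lines starting with # are skipped). The entries below are made-up placeholders:

# US_100_S5_20220531.txt
user1:secret1@198.51.100.10:1080
user2:secret2@198.51.100.11:1080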

Step 2

Now comes the most important part:

s4downloader.py

from typing import List

from txsocksx.http import SOCKS4Agent
from twisted.internet import reactor
from twisted.internet.endpoints import TCP4ClientEndpoint
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, ScrapyAgent
import random
from urllib.parse import urlsplit
from loguru import logger

# Ref: https://txsocksx.readthedocs.io/en/latest/#txsocksx.http.SOCKS4Agent

import certifi, os

# If this is not set, you'll get an error like:
#   certificate verify failed ... OpenSSL.SSL.Error: [('STORE routines', '', 'unregistered scheme')]
os.environ["SSL_CERT_FILE"] = certifi.where()


class Socks4DownloadHandler(HTTP11DownloadHandler):

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        settings = spider.settings
        agent = ScrapySocks4Agent(settings, contextFactory=self._contextFactory, pool=self._pool, crawler=self._crawler)
        return agent.download_request(request)


class ScrapySocks4Agent(ScrapyAgent):
    def __init__(self, settings, **kwargs):
        """
        init proxy pool
        """
        super(ScrapySocks4Agent, self).__init__(**kwargs)
        self.__proxy_file = settings['PROXY_FILE']
        self._s4proxy_pool: List = self.__get_s4proxy_pool()

    def _get_agent(self, request, timeout):
        _, proxy_host, proxy_port, proxy_user, proxy_pass = self.__random_choose_proxy()
        # txsocksx expects bytes for the credentials; this is just a roundabout
        # proxy_user.encode() (odd, possibly a bug in the Python 3 port)
        proxy_user = bytes(map(ord, proxy_user))
        proxy_pass = bytes(map(ord, proxy_pass))
        proxyEndpoint = TCP4ClientEndpoint(reactor, proxy_host, proxy_port)
        agent = SOCKS4Agent(reactor, proxyEndpoint=proxyEndpoint,
                            endpointArgs=dict(methods={'login': [proxy_user, proxy_pass]}))
        return agent

    def __get_s4proxy_pool(self) -> List:
        """
        return proxy pool
        :return:
        """
        proxy_list = []
        with open(self.__proxy_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                else:
                    proxy_info = urlsplit(f"socks4://{line}")
                    schema, user, passwd, host, port = proxy_info.scheme, proxy_info.username, proxy_info.password, proxy_info.hostname, proxy_info.port
                    proxy_list.append((schema, host, port, user, passwd))

        return proxy_list

    def __random_choose_proxy(self):
        """
        schema, host, port, user, pass
        :return:
        """
        p = random.choice(self._s4proxy_pool)
        logger.info("use proxy {}", p)
        return p
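
A quick way to check that a pool entry parses the way __get_s4proxy_pool expects, using the same urlsplit call (the entry is a made-up example):

from urllib.parse import urlsplit

line = "user1:secret1@198.51.100.10:1080"  # made-up pool entry
p = urlsplit(f"socks4://{line}")
print(p.scheme, p.username, p.password, p.hostname, p.port)
# socks4 user1 secret1 198.51.100.10 1080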

Step 3

Finally, tell your spider to use the Socks4DownloadHandler:

class MySpider(scrapy.Spider):
    name = "myname"
    allowed_domains = ["oh.com"]
    custom_settings = {
        # other configurations

        "DOWNLOAD_HANDLERS": {
            'http': 'asos.s4downloader.Socks4DownloadHandler',
            'https': 'asos.s4downloader.Socks4DownloadHandler',
        },
        # other configurations

    }
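
Equivalently, the handlers can be turned on project-wide in settings.py instead of per spider; custom_settings simply overrides the project settings:

DOWNLOAD_HANDLERS = {
    'http': 'asos.s4downloader.Socks4DownloadHandler',
    'https': 'asos.s4downloader.Socks4DownloadHandler',
}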

Step 4

$ scrapy crawl your-spider