Web scraping with 'scrapy' crawls 0 pages and scrapes 0 items


I set up a proxy scraper for a website, but I'm not getting anything back.

import scrapy
from scrapy.item import Field, Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose

class ProxyServersPro(Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    country = scrapy.Field()
    speed = scrapy.Field()
    protocol = scrapy.Field()
    anon = scrapy.Field()

class ProxyServersPro(CrawlSpider):
    name = "ProxyServersProCrawler"
    start_urls = ["https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/1"]
    allowed_domains = ['proxyservers.pro']

    rules = {
        Rule(LinkExtractor(allow=r'page'), callback = 'parse_item')
        }

    def parse_item(self, response):
        item = ItemLoader(ProxyServersPro(), response=response)
        item.add_xpath('ip', '//*[@id="content-content"]/div/div/div[1]/table/tbody/tr[1]/td[2]/a/text()')
        item.add_xpath('port', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[3]/span/text()')
        item.add_xpath('country', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[4]/text()')
        item.add_xpath('speed', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[5]/div[1]/div/div/text()')
        item.add_xpath('protocol', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[7]/text()')
        item.add_xpath('anon', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[8]/text()')
        return item.load_item()

This is what the console says:

2019-03-24 04:53:27 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

Can anyone figure out what is going on? Thanks.
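
A quick first check for a "Crawled 0 pages" log like this one (assuming Scrapy is installed locally) is to open the start URL in scrapy shell and see what actually comes back:

scrapy shell "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/1"
>>> response.status       # anything other than 200 suggests blocking or redirection
>>> response.text[:500]   # does the raw HTML contain the proxy table at all?

A non-200 status, or a body without the proxy table, points at the server refusing the request rather than at the XPath expressions.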

python web-scraping scrapy
1 Answer

Here is a working example; take a look:

# -*- coding: utf-8 -*-
from scrapy import Item, Field
from scrapy.http import Request
from scrapy.spiders import CrawlSpider


class ProxyServersPro(Item):
    ip = Field()
    port = Field()
    country = Field()
    speed = Field()      # left empty below: the speed cell is rendered client-side
    protocol = Field()
    anon = Field()


class ProxyServers(CrawlSpider):
    name = "ProxyServersProCrawler"
    allowed_domains = ['proxyservers.pro']

    # Browser-like request headers, sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

    # The first five result pages.
    start_url = [
        'https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/%d' % page
        for page in range(1, 6)
    ]

    def start_requests(self):
        # Issue the requests explicitly so the custom headers are applied.
        for url in self.start_url:
            yield Request(url, callback=self.parse_companies, headers=self.headers)

    def parse_companies(self, response):
        # One <tr> per proxy in the results table.
        for row in response.xpath('//table[@class="table table-hover"]/tbody/tr'):
            item = ProxyServersPro()
            item['ip'] = row.xpath('./td[2]/a/text()').extract_first()
            item['port'] = row.xpath('./td[3]/text()').extract_first()
            item['country'] = row.xpath('./td[4]/text()').extract_first()
            item['protocol'] = row.xpath('./td[7]/text()').extract_first()
            item['anon'] = row.xpath('./td[8]/text()').extract_first()
            yield item
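
To run the spider, assuming the code above is saved as a standalone file named proxy_spider.py (a name chosen here for illustration), you can use:

scrapy runspider proxy_spider.py -o proxies.json

The -o flag writes the yielded items to a feed file; inside a full Scrapy project the equivalent is scrapy crawl ProxyServersProCrawler.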

Also, the port and speed values are not in the page content the site sends; they are loaded on the fly, so we cannot get them through XPath.
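
If those client-rendered fields are really needed, one workaround is to let a real browser render the page first and only then parse the resulting DOM with Scrapy's selectors. Below is a minimal sketch using Selenium (assumptions: selenium and a matching ChromeDriver are installed; this is untested against the site):

from selenium import webdriver
from scrapy.selector import Selector

options = webdriver.ChromeOptions()
options.add_argument('--headless')       # no visible browser window
driver = webdriver.Chrome(options=options)

driver.get('https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/1')
# Parse the rendered DOM instead of the raw HTML response.
sel = Selector(text=driver.page_source)
for row in sel.xpath('//table[@class="table table-hover"]/tbody/tr'):
    ip = row.xpath('./td[2]/a/text()').get()
    port = row.xpath('./td[3]//text()').get()   # after rendering, the cell holds its final text
    print(ip, port)

driver.quit()

Headless-rendering integrations such as scrapy-splash or scrapy-playwright achieve the same thing without leaving Scrapy, at the cost of extra setup.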
