Inserting multiple input fields before running scrapy

Question (1 vote, 1 answer)

I am referencing a Stack Overflow answer that is similar to my GUI application, but my application is a little different: when the application is executed, the user is prompted to type in the Scrapy search keywords.

It looks like this:

[Screenshot of the console prompts]

I am trying to move this logic into the GUI, but I am not sure how to do it.

This is what the GUI looks like right now:

[Screenshot of the current GUI]

I want the GUI to have input fields where the user can enter the required information before the Scrapy script is run.

Here is some of the Scrapy script; the GUI code I am working from follows after it.

yellowpage_spider.py:

import scrapy
import sys
import random
import csv
from scrape.items import Item
from var_dump import var_dump


search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second Location:")
third_location = input("Third Location:")
fourth_location = input("Fourth Location:")
fifth_location = input("Fifth Location:")
sixth_location = input("Sixth Location:")
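# Note: these module-level prompts run as soon as this file is imported,
# i.e. before Scrapy has even started the crawl.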




# city = [
#     "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "Fort Worth", 
#     "San Diego", "Dallas", "San Jose", "Austin", "Columbus", "Indianapolis",  "Seattle", "St. Paul", "Nashville", 
#     "Louisville", "Plano"
# ]

# rancity = random.choice(city)


class YellowSpider(scrapy.Spider):


    name = "yellow"

    # start_urls = [
    #     "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location
    #     # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
    #     # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
    # ]

    def start_requests(self):
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location, self.parse)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + second_location, self.parse2)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location, self.parse3)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location, self.parse4)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fifth_location, self.parse5)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + sixth_location, self.parse6)
        # yield scrapy.Request('http://www.example.com/3.html', self.parse)

    def __init__(self):
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def parse(self, response):
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile)

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse2(self, response):
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile2)

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse2)

    def parse3(self, response):
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile3)

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse3)
        # ... (the rest of the spider was omitted in the question: parse4 to parse6
        # and the businessprofile callbacks follow the same pattern)
This is the GUI.

gui.py:

from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets


class ScrapyWorker(QtCore.QObject):
    logChanged = QtCore.pyqtSignal(str)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.setProgram('scrapy')
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, spider):
        self._process.setWorkingDirectory(project)
        self._process.setArguments(['crawl', spider])
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        data = self._process.readAllStandardOutput().data().decode()
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        self._process.kill()

    def spiders(self, project):
        process = QtCore.QProcess()
        process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        process.setWorkingDirectory(project)
        loop = QtCore.QEventLoop()
        process.finished.connect(loop.quit)
        process.start('scrapy', ['list'])
        loop.exec_()
        return process.readAllStandardOutput().data().decode().split()


class MainWindow(QtWidgets.QMainWindow):
    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)
        self.project_le = QtWidgets.QLineEdit()
        self.project_button = QtWidgets.QPushButton('Select Project')
        self.spider_combobox = QtWidgets.QComboBox()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()
        self.input = QtWidgets.QLineEdit()
        self.input1 = QtWidgets.QLineEdit()
        self.input2 = QtWidgets.QLineEdit()
        self.input3 = QtWidgets.QLineEdit()
        self.input4 = QtWidgets.QLineEdit()
        self.input5 = QtWidgets.QLineEdit()
        self.input6 = QtWidgets.QLineEdit()

        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)
        lay = QtWidgets.QVBoxLayout(central_widget)

        hlay = QtWidgets.QHBoxLayout()
        hlay.addWidget(self.project_le)
        hlay.addWidget(self.project_button)
        lay.addLayout(hlay)

        hlay2 = QtWidgets.QHBoxLayout()
        hlay2.addWidget(QtWidgets.QLabel("Input The Search Item :"))
        hlay2.addWidget(self.input, 1)
        hlay3 = QtWidgets.QHBoxLayout()
        hlay4 = QtWidgets.QHBoxLayout()
        hlay5 = QtWidgets.QHBoxLayout()
        hlay6 = QtWidgets.QHBoxLayout()
        hlay7 = QtWidgets.QHBoxLayout()
        hlay8 = QtWidgets.QHBoxLayout()
        hlay3.addWidget(QtWidgets.QLabel("Location :"))
        hlay3.addWidget(self.input1, 1)
        hlay4.addWidget(QtWidgets.QLabel("Location 2 :"))
        hlay4.addWidget(self.input2, 1)
        hlay5.addWidget(QtWidgets.QLabel("Location 3 :"))
        hlay5.addWidget(self.input3, 1)
        hlay6.addWidget(QtWidgets.QLabel("Location 4 :"))
        hlay6.addWidget(self.input4, 1)
        hlay7.addWidget(QtWidgets.QLabel("Location 5 :"))
        hlay7.addWidget(self.input5, 1)
        hlay8.addWidget(QtWidgets.QLabel("Location 6 :"))
        hlay8.addWidget(self.input6, 1)
        lay.addLayout(hlay2)
        lay.addLayout(hlay3)
        lay.addLayout(hlay4)
        lay.addLayout(hlay5)
        lay.addLayout(hlay6)
        lay.addLayout(hlay7)
        lay.addLayout(hlay8)
        lay.addWidget(self.start_stop_button)
        lay.addWidget(self.text_edit)
        self.start_stop_button.setEnabled(False)

        self.scrapy_worker = ScrapyWorker(self)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))
        self.start_stop_button.toggled.connect(self.on_checked)
        self.project_button.clicked.connect(self.select_project)
        self.resize(640, 480)

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        if state:
            filename = self.project_le.text()
            finfo = QtCore.QFileInfo(filename)
            directory = finfo.dir().absolutePath()
            self.scrapy_worker.run(directory, self.spider_combobox.currentText())
            self.start_stop_button.setText('Stop')
        else:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()

    @QtCore.pyqtSlot()
    def select_project(self):
        filename, _ = QtWidgets.QFileDialog.getOpenFileName(
            self,
            "Select .cfg file",
            QtCore.QDir.currentPath(),
            "Configure File (*.cfg)"
        )
        if filename:
            self.project_le.setText(filename)
            finfo = QtCore.QFileInfo(filename)
            directory = finfo.dir().absolutePath()
            spiders = self.scrapy_worker.spiders(directory)
            self.spider_combobox.clear()
            self.spider_combobox.addItems(spiders)
            self.start_stop_button.setEnabled(True if spiders else False)

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        prev_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(prev_cursor)


if __name__ == '__main__':
    import sys
    app = QtWidgets.QApplication(sys.argv)
    app.setStyle('fusion')
    w = MainWindow()
    w.show()
    sys.exit(app.exec_())

Tags: python, pyqt, scrapy, pyqt5

1 Answer (2 votes)

First, you have to modify your spider so that it takes its parameters directly from the command line instead of calling input() (when the spider is launched from a GUI through QProcess there is no interactive console to answer those prompts):

yellowpage_spider.py:

import json
import scrapy
from scrape.items import Item


class YellowSpider(scrapy.Spider):
    name = "yellow"

    def __init__(self, *args, **kwargs):
        super(YellowSpider, self).__init__(*args, **kwargs)
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def start_requests(self):
        if not hasattr(self, 'parameters'):
            return
        parameters = json.loads(self.parameters)
        search_item = parameters['search_item']
        locations = parameters['locations']
        for location in locations:
            url = "https://www.yellowpages.com/search?search_terms={}&geo_location_terms={}".format(search_item, location)
            yield scrapy.Request(url=url, callback=self.parse, meta={'location': location})

    def parse(self, response):
        location = response.meta['location']
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile, meta={'location': location})

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse, meta={'location': location})

    def businessprofile(self, response):
        location = response.meta['location']
        for business in response.css('header#main-header'):
            item = Item()
            item['business_name'] = business.css('div.sales-info h1::text').extract()
            w = business.css('a.secondary-btn.website-link::attr(href)').extract()
            item['website'] = str(w).strip('[]')
            item['location'] = location
            s = business.css('a.email-business::attr(href)').extract()
            item['email'] = [item[7:] for item in s]
            item['phonenumber'] = business.css('p.phone::text').extract_first()
            for x in item['email']:
                if x not in self.seen_emails:
                    if item['email']:
                        if item['phonenumber']:
                            if item['website']:
                                self.seen_emails.append(x)
                                yield item

The previous code now requires an argument called parameters, passed with -a on the command line:

scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany", "brazil"]}'
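For reference (my illustration, not part of the original answer): Scrapy hands every "-a key=value" pair to the spider as a plain string attribute, which is why start_requests() above decodes self.parameters with json.loads() before using it:

import json

# Hypothetical stand-in for the string Scrapy would assign to self.parameters.
raw = '{"search_item": "house", "locations": ["usa", "germany", "brazil"]}'
parameters = json.loads(raw)
print(parameters["search_item"])    # house
print(parameters["locations"][-1])  # brazil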

So, in the GUI, we now have to build that parameters argument from the GUI inputs.

gui.py:

import os
import json
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets
import utils


dir_path = os.path.dirname(os.path.abspath(__file__))
icons_dir = os.path.join(dir_path, 'assets', 'icons')


class ScrapyWorker(QtCore.QObject):
    logChanged = QtCore.pyqtSignal(str)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, program, arguments):
        self._process.setWorkingDirectory(project)
        self._process.setProgram('scrapy')
        self._process.setArguments(arguments)
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        data = self._process.readAllStandardOutput().data().decode()
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        self._process.kill()

class LocationWidget(QtWidgets.QWidget):
    def __init__(self, parent=None):
        super(LocationWidget, self).__init__(parent)
        self.lay = QtWidgets.QVBoxLayout(self)
        self.lay.setContentsMargins(0, 0, 0, 0)
        self.lay.addStretch()
        self.setContentsMargins(0, 0, 0, 0)
        self.widgets = []
        self.create_row()

    def create_row(self):
        widget = QtWidgets.QWidget()
        widget.setContentsMargins(0, 0, 0, 0)
        hlay = QtWidgets.QHBoxLayout(widget)
        hlay.setContentsMargins(0, 0, 0, 0)
        lineedit = QtWidgets.QLineEdit()
        button = QtWidgets.QToolButton(clicked=self.on_clicled)
        button.setFocusPolicy(QtCore.Qt.NoFocus)
        hlay.addWidget(lineedit)
        hlay.addWidget(button)
        button.setIconSize(QtCore.QSize(24, 24))
        button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))
        self.widgets.append(widget)
        self.lay.insertWidget(-1, widget)

    @QtCore.pyqtSlot()
    def on_clicled(self):
        button = self.sender()
        widget = button.parentWidget()
        if self.lay.indexOf(widget) == (self.lay.count()-1):
            self.create_row()
        else:
            self.lay.removeWidget(widget)
            widget.deleteLater()
            self.widgets.remove(widget)
        for widget in self.widgets:
            button = widget.findChild(QtWidgets.QToolButton)
            button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'remove.png')))
        self.widgets[-1].findChild(QtWidgets.QToolButton).setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))

    def get_locations(self):
        locations = []
        for widget in self.widgets:
            le = widget.findChild(QtWidgets.QLineEdit)
            if le.text():
                locations.append(le.text())
        return locations

class YellowWidget(QtWidgets.QMainWindow):
    def __init__(self, parent=None):
        super(YellowWidget, self).__init__(parent)
        self.setWindowTitle('Yellow Pages Scrapper')
        self.scrapy_worker = ScrapyWorker(self)
        self.search_item_le = QtWidgets.QLineEdit()
        self.location_widget = LocationWidget()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()

        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)
        lay = QtWidgets.QGridLayout(central_widget)
        lay.addWidget(QtWidgets.QLabel("<b>Search:</b>"), 0, 0)
        lay.addWidget(self.search_item_le, 0, 1)
        lay.addWidget(QtWidgets.QLabel("<b>Locations:</b>"), 1, 0, alignment=QtCore.Qt.AlignTop|QtCore.Qt.AlignLeft)
        lay.addWidget(self.location_widget, 1, 1, alignment=QtCore.Qt.AlignTop)
        lay.addWidget(self.start_stop_button, 2, 0, 1, 2)
        lay.addWidget(self.text_edit, 3, 0, 1, 2)

        self.start_stop_button.toggled.connect(self.on_checked)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        if state:
            # crapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany"]}'
            search_item = self.search_item_le.text()
            locations = self.location_widget.get_locations()
            directory, program, args = utils.create_arguments(search_item, locations)
            self.scrapy_worker.run(directory, program, args)
            self.start_stop_button.setText('Stop')
        else:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        prev_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(prev_cursor)

if __name__ == '__main__':
    import sys
    app = QtWidgets.QApplication(sys.argv)
    app.setStyle('fusion')
    w = YellowWidget()
    w.resize(640, 480)
    w.show()
    sys.exit(app.exec_())
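A note on how the pieces fit together (my summary, not part of the original answer): ScrapyWorker.run() takes the working directory, program name and argument list built by utils.create_arguments() and, through QProcess, effectively runs the same command shown earlier from inside the Scrapy project folder, for example:

scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany"]}'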

In gui.py I used a function from a utils.py file:

utils.py:

import os
import json


def create_arguments(search_item, locations):
    program = 'scrapy'
    dir_path = os.path.dirname(os.path.abspath(__file__))
    directory = os.path.join(dir_path, 'scrape')
    d = {"search_item": search_item, "locations": locations}
    argument = 'parameters={}'.format(json.dumps(d))
    return directory, program, ['crawl', 'yellow', "-a", argument]
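As a quick check (hypothetical usage, not from the original answer), create_arguments() can be called on its own to inspect exactly what will be handed to QProcess; this assumes utils.py sits next to the scrape/ project directory, as in the layout above:

import utils

directory, program, args = utils.create_arguments("house", ["usa", "germany"])
print(directory)  # .../scrape  (the QProcess working directory)
print(program)    # scrapy
print(args)       # ['crawl', 'yellow', '-a', 'parameters={"search_item": "house", "locations": ["usa", "germany"]}']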

Obtaining the following:

[Screenshot of the resulting GUI]

The complete project is here.
