我是Python和网络抓取的新手。请原谅我的无知。在此程序中,我想按计划定期运行Scrapy Spider。我已经尝试过 "schedule" 包,但没有成功,因为 do() 方法需要一个可调用对象作为参数;我尝试传递类名 "Myspider",但未产生所需的输出。我也尝试过 cron 作业,但无法弄清楚如何将 Spider 作为 cron 作业运行。请帮助我了解如何按计划运行此程序。任何帮助,将不胜感激。我在 Mac OS 上使用 Python 3.7。
import csv
import os
import random
from time import sleep
import scrapy
import schedule
import time
class Myspider(scrapy.Spider):
    """Scrape the product name and price from a fixed list of URLs and
    append one CSV row per product to data.csv."""

    name = "spider1"

    # NOTE(review): this runs once at class-definition (import) time, not on
    # every crawl.  Its net effect is simply to empty data.csv (an open in
    # "w" mode would do the same in one line); consider moving the reset into
    # a pipeline's open_spider() so each scheduled run starts fresh.
    with open("data.csv", "a") as filee:
        if os.stat("data.csv").st_size != 0:
            filee.truncate(0)
        # The with-statement closes the file; the explicit close() the
        # original called here was redundant.

    def start_requests(self):
        """Yield one request per product URL.

        Yields:
            scrapy.Request for each hard-coded product page, parsed by
            ``self.parse``.
        """
        # Renamed from ``list``, which shadowed the builtin.
        urls = [
            "https://www.example.com/item1",
            "https://www.example.com/item2",
            "https://www.example.com/item3",
            "https://www.example.com/item4",
            "https://www.example.com/item5",
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)
            # NOTE(review): a blocking sleep stalls the whole Twisted
            # reactor; prefer the DOWNLOAD_DELAY / RANDOMIZE_DOWNLOAD_DELAY
            # settings for polite crawling.  Kept to preserve behavior.
            sleep(random.randint(0, 5))

    def parse(self, response):
        """Extract the product name and price from *response* and append
        them as one row to data.csv.

        Raises:
            IndexError: if the expected CSS selectors match nothing.
        """
        product_name = response.css('#pd-h1-cartridge::text')[0].extract()
        # Index [3] picks a specific text node out of the matched price
        # elements -- assumes the page layout is stable; TODO confirm.
        product_price = response.css(
            '.product-price .is-current, .product-price_total .is-current, .product-price_total ins, .product-price ins').css(
            '::text')[3].extract()
        with open('data.csv', 'a+') as csv_file:
            itemwriter = csv.writer(csv_file, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            itemwriter.writerow([str(product_name).strip(),
                                 str(product_price).strip()])
            # The with-statement closes the file; no explicit close() needed.
def run_spider():
    """Run the spider once, in a fresh process.

    ``schedule``'s ``do()`` needs a callable that actually performs the
    crawl; the original ``do(Myspider)`` merely constructed a spider object
    on each tick and never crawled anything.  A subprocess is used because
    Twisted's reactor cannot be restarted inside one process, so calling
    CrawlerProcess.start() a second time would fail.
    """
    import subprocess
    subprocess.run(["scrapy", "crawl", "spider1"])


# Fire the crawl every 120 seconds; poll pending jobs every 10 seconds.
schedule.every(120).seconds.do(run_spider)
while True:
    schedule.run_pending()
    time.sleep(10)
您可以使用 apscheduler 包;它提供的 TwistedScheduler 运行在 Scrapy 自己的 Twisted reactor 上,因此可以在同一个进程内按固定间隔反复调度爬虫:
pip install apscheduler
# -*- coding: utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler
from Demo.spiders.baidu import YourSpider
# Build the crawler with the project's settings, then let APScheduler's
# Twisted-based scheduler enqueue a fresh crawl every 10 seconds.
crawler_process = CrawlerProcess(get_project_settings())
job_scheduler = TwistedScheduler()
job_scheduler.add_job(
    crawler_process.crawl, "interval", args=[YourSpider], seconds=10
)
job_scheduler.start()
# stop_after_crawl=False keeps the reactor alive so later jobs can run.
crawler_process.start(False)