I want to reduce the time my code needs to finish scraping the pages; I am using Selenium. I used Scrapy for this scraping project, but the email element is rendered by JavaScript, so it stays hidden from Scrapy.
Scrapy was otherwise perfect, and I don't know whether there is a way to reduce Selenium's runtime, another method, or a different tool or package that fits this case better.
I would appreciate any information or documentation where I can learn more.
Here is the code:
# import the necessary packages
import csv
import logging
import time

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

logging.basicConfig(level=logging.INFO)

path = r"C:\Users\HP\Desktop\scraping\chromedriver.exe"
options = Options()
options.headless = True
driver = Chrome(executable_path=path, options=options)

data = []
# the page number is filled in with .format() inside the loop
base_url = "https://www.mdpi.com/search?sort=pubdate&page_no={}&page_count=50&year_from=1996&year_to=2024&q=biomaterials&view=default"

start_time = time.ctime()
print(start_time)

for page_no in range(218, 219):
    # visit the search results page
    logging.info(f"Scraping page number : {page_no}")
    url = base_url.format(page_no)
    driver.get(url)
    time.sleep(1)
    # extract the article links from the results page
    article_links = driver.find_elements(By.XPATH, "//a[@class='title-link']")
    article_hrefs = [lnk.get_attribute("href") for lnk in article_links]
    # loop through all the article links to extract the specified information
    for href in article_hrefs:
        # visit the article page
        logging.info(f"Scraping article: {href}")
        driver.get(href)
        time.sleep(1)
        # extract the title, author name and author email from the article page
        title_element = driver.find_element(By.XPATH, "//h1[contains(@class,'title')]")
        title = title_element.text
        author_element = driver.find_element(By.XPATH, "//a[@class='profile-card-drop']")
        author_name = author_element.text
        email_elements = driver.find_elements(By.XPATH, "//a[contains(@class,'email')]")
        # keep only the first email element, if any
        if email_elements:
            email_element = email_elements[0]
            email = email_element.get_attribute("href")
            data.append({"Title": title, "Link": href, "Author": author_name, "Email": email})

# to know how much time it took to scrape all pages
end_time = time.ctime()
print(end_time)
driver.quit()

# save the data in the CSV file
with open('emails.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=["Title", "Author", "Link", "Email"])
    writer.writeheader()
    writer.writerows(data)
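For reference, the fixed time.sleep(1) calls above are the most obvious place where time is lost: they always wait a full second, even when the page is ready sooner. A minimal, untested sketch of replacing them with explicit waits (reusing the driver, url and XPaths from the script above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

wait = WebDriverWait(driver, timeout=10)

driver.get(url)
# block only until the article links are present (or raise TimeoutException after 10 s)
wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='title-link']")))
article_links = driver.find_elements(By.XPATH, "//a[@class='title-link']")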
The email addresses are protected by Cloudflare's email-protection script (and here they are actually double-encoded). I found a decoding script online, but it only handled a single encoded string, so I had to modify it.
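For context, the decoding scripts that circulate online handle the standard single-pass scheme: in the hex string after '#', the first byte is used as an XOR key for every following byte pair. A small self-contained illustration with a made-up encoded string (the function name and sample value are just for this example):

# standard single-pass Cloudflare decoding: the first hex byte is the XOR key
def decode_single(encoded_hex):
    key = int(encoded_hex[:2], 16)
    return ''.join(chr(int(encoded_hex[i:i + 2], 16) ^ key)
                   for i in range(2, len(encoded_hex), 2))

# "a@b.c" XOR-ed with the key 0x42 encodes to "422302206c21"
print(decode_single("422302206c21"))  # -> a@b.c

On MDPI the decoded value is itself encoded again, which is what the modified decode_email_protection below accounts for.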
Here is how to scrape the website with Scrapy (no Selenium):
import scrapy
import logging


def decode_email_protection(encoded_string):
    # first pass: standard Cloudflare decoding - the first hex byte after '#' is the XOR key
    encoded_data = encoded_string.split('#')[-1]
    r = int(encoded_data[:2], 16)
    email = ''.join([chr(int(encoded_data[i:i + 2], 16) ^ r) for i in range(2, len(encoded_data), 2)])
    # second pass: the result is itself encoded, with the key at hex positions 4-6
    encoded_data = email.split('#')[-1]
    r = int(encoded_data[4:6], 16)
    encoded_data = encoded_data[:4] + encoded_data[6:]
    email = ''.join([chr(int(encoded_data[i:i + 2], 16) ^ r) for i in range(0, len(encoded_data), 2)])
    return email


class ExampleSpider(scrapy.Spider):
    name = "example_spider"
    allowed_domains = ["mdpi.com"]
    base_url = "https://www.mdpi.com/search?sort=pubdate&page_no={}&page_count=50&year_from=1996&year_to=2024&q=biomaterials&view=default"

    def start_requests(self):
        for page_no in range(218, 219):
            yield scrapy.Request(url=self.base_url.format(page_no), cb_kwargs={"page_no": page_no})

    def parse(self, response, page_no):
        self.log(f"Scraping page number : {page_no}", logging.INFO)
        article_hrefs = response.xpath("//a[@class='title-link']/@href").getall()
        for href in article_hrefs:
            yield response.follow(url=href, callback=self.parse_page)

    def parse_page(self, response):
        self.log(f"Scraping article: {response.url}", logging.INFO)
        title = response.xpath("//h1[contains(@class,'title')]/text()").get(default="").strip()
        authors = response.xpath("//a[@class='profile-card-drop']//text()").getall()
        authors = [i.strip() for i in authors]
        email_href = response.xpath("//a[contains(@class,'email')]/@href").get(default="")
        email = decode_email_protection(email_href)
        yield {
            "Title": title,
            "Link": response.url,
            "Authors": authors,
            "Email": email,
        }
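To try the spider without creating a full Scrapy project, a minimal stand-alone runner could look like the sketch below, appended to the same file (assuming Scrapy >= 2.1 for the FEEDS setting; the output file name is just an example):

# run the spider and export the yielded items to emails.csv
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "FEEDS": {"emails.csv": {"format": "csv"}},
    "LOG_LEVEL": "INFO",
})
process.crawl(ExampleSpider)
process.start()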