我目前正在尝试 scrapy-playwright。我也尝试过使用硒,但无济于事。
我可以访问主体和根 div。我知道该页面是使用 JavaScript 动态加载的,但我似乎无法在根 div 中加载任何内容。每次尝试访问无限滚动 div 时,我都会得到 null。(或任何其他与此相关的 div)这是我当前的代码:
import scrapy
from OpenSupply.items import FacItem
from scrapy_playwright.page import PageMethod
class FacspiderSpider(scrapy.Spider):
    """Scrape facility rows from opensupplyhub.org using Playwright-rendered pages.

    The listing is built client-side, so the request is routed through
    scrapy-playwright and waits for the JS-rendered list container.
    """

    name = "facspider"

    def start_requests(self):
        url = "https://opensupplyhub.org/facilities?facility_type=Final%20Product%20Assembly&sort_by=contributors_desc"
        yield scrapy.Request(url, meta=dict(
            playwright = True,
            playwright_include_page = True,
            playwright_page_methods = [
                # Block until the dynamically loaded list container exists,
                # otherwise the response body only has the empty root div.
                PageMethod('wait_for_selector', 'div.infinite-scroll'),
            ],
            errback = self.errback
        ))

    async def parse(self, response):
        # playwright_include_page=True hands us the live browser page; it must
        # be closed here or every successful request leaks a Playwright page.
        # (Scrapy supports async callbacks, which close() requires.)
        page = response.meta["playwright_page"]
        await page.close()
        for comp in response.css('div.infinite-scroll div'):
            # BUG FIX: construct a fresh item per row. The original created one
            # FacItem before the loop and mutated/re-yielded that single shared
            # instance, so every yielded item aliased the same object.
            facitem = FacItem()
            facitem['test'] = comp.css('span').get()
            yield facitem

    async def errback(self, failure):
        # Close the Playwright page on request failure as well, so errors
        # do not leak browser pages.
        page = failure.request.meta["playwright_page"]
        await page.close()
我尝试使用Scrapy-Splash,但无法获得任何输出。我也尝试过使用不同的标题。
这应该可以做到:
import scrapy
import urllib.parse
class OpensupplyhubSpider(scrapy.Spider):
    """Query the Open Supply Hub JSON API directly instead of rendering the JS UI.

    Yields one ``{'name': ...}`` dict per facility and follows the API's
    ``next`` link for pagination.
    """

    name = "opensupplyhub"
    base_url = 'https://opensupplyhub.org/api/facilities/'
    # Query string mirroring what the site's own frontend sends.
    params = {
        'facility_type': 'Final Product Assembly',
        'sort_by': 'contributors_desc',
        'number_of_public_contributors': 'true',
        'pageSize': '50',
        'page': '1'
    }
    # NOTE(review): the X-Oar-Client-Key below is a hard-coded credential;
    # move it to a setting/environment variable rather than committing it.
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://opensupplyhub.org/facilities?facility_type=Final%20Product%20Assembly&sort_by=contributors_desc',
        'Credentials': 'same-origin',
        'X-Oar-Client-Key': 'bbd21248d53d958583f36a87b84067d5',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    }

    def start_requests(self):
        # Build the first page URL from the canonical params dict.
        url = f'{self.base_url}?{urllib.parse.urlencode(self.params)}'
        yield scrapy.Request(
            url=url,
            headers=self.headers,
            callback=self.parse,
        )

    def parse(self, response):
        # Parse the JSON body once; the original called response.json() twice.
        data = response.json()
        for item in data['features']:
            yield {'name': item['properties']['name']}
        next_page = data.get('next', '')
        if next_page:
            # Use the spider's logger instead of a bare print for pagination tracing.
            self.logger.info("following next page: %s", next_page)
            yield scrapy.Request(
                url=next_page,
                headers=self.headers,
                callback=self.parse,
            )