Scraping an infinite-scroll (AJAX) page

Question (votes: 0, answers: 1)

I have tried different approaches, but they all seem too inefficient for scraping 10,000 items. I was told I have to use something like the site's API, but I don't know how to talk to it.

Website: https://aitoptools.com/

Here is my code:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time


def main():
    driver = webdriver.Chrome()  # You need to have ChromeDriver installed and its path set correctly
    driver.get("https://aitoptools.com/")
    driver.implicitly_wait(10)

    num_scrolls = 10
    for _ in range(num_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(15)  # Wait for content to load (you might need to adjust the time)

    page_source = driver.page_source
    driver.quit()
    soup = BeautifulSoup(page_source, "html.parser")

    titles = soup.find_all("h2", class_="elementor-heading-title elementor-size-default")
    for title in titles:
        print(title.text)
    print(len(titles))


if __name__ == "__main__":
    main()

The XMLHttpRequest that I think I could mimic: (screenshot)

python ajax selenium-webdriver web-scraping beautifulsoup
1 Answer
0 votes

It can be done with the requests and BeautifulSoup libraries.

import json

import requests
from bs4 import BeautifulSoup

def get_title(page):

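    # Form fields mimicking the jet_engine_ajax "listing_load_more" request the listing sends as you scroll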
    payload = {
        "action": "jet_engine_ajax",
        "handler": "listing_load_more",
        "query[post_status][]": "publish",
        "query[post_type][]": "tool",
        "query[posts_per_page]": 12,
        "query[paged]": 1,
        "query[ignore_sticky_posts]": 1,
        "query[orderby]": "meta_value_num",
        "query[meta_key]": "popularity",
        "query[meta_type]": "NUMERIC",
        "query[suppress_filters]": False,
        "query[jet_smart_filters]": "jet-engine/default",
        "widget_settings[lisitng_id]": 43,
        "widget_settings[posts_num]": 12,
        "widget_settings[columns]": 3,
        "widget_settings[columns_tablet]": 2,
        "widget_settings[columns_mobile]": 1,
        "widget_settings[is_archive_template]": "",
        "widget_settings[post_status][]": "publish",
        "widget_settings[use_random_posts_num]": "",
        "widget_settings[max_posts_num]": 9,
        "widget_settings[not_found_message]": "No data was found",
        "widget_settings[is_masonry]": False,
        "widget_settings[equal_columns_height]": "yes",
        "widget_settings[use_load_more]": "yes",
        "widget_settings[load_more_id]": "",
        "widget_settings[load_more_type]": "scroll",
        "widget_settings[load_more_offset][unit]": "px",
        "widget_settings[load_more_offset][size]": 1000,
        "widget_settings[use_custom_post_types]": "yes",
        "widget_settings[custom_post_types][]": "tool",
        "widget_settings[hide_widget_if]": "",
        "widget_settings[carousel_enabled]": "",
        "widget_settings[slides_to_scroll]": 1,
        "widget_settings[arrows]": True,
        "widget_settings[arrow_icon]": "fa fa-angle-left",
        "widget_settings[dots]": "",
        "widget_settings[autoplay]": True,
        "widget_settings[autoplay_speed]": 5000,
        "widget_settings[infinite]": True,
        "widget_settings[center_mode]": "",
        "widget_settings[effect]": "slide",
        "widget_settings[speed]": 500,
        "widget_settings[inject_alternative_items]": "",
        "widget_settings[scroll_slider_enabled]": "",
        "widget_settings[scroll_slider_on][]": ["desktop", "tablet", "mobile"],
        "widget_settings[custom_query]": False,
        "widget_settings[custom_query_id]": "",
        "widget_settings[_element_id]": "",
        "page_settings[post_id]": False,
        "page_settings[queried_id]": False,
        "page_settings[element_id]": False,
        "page_settings[page]": page,
        "listing_type": False,
        "isEditMode": False,
        "addedPostCSS[]": 43
    }

    ajaxlisting_url = "https://aitoptools.com/?gclid=CjwKCAjw_uGmBhBREiwAeOfsd1dsGiFy6fi4gPYJAqzH87j-KFxpm04zduFu6MpCcUrUTYTuiPaXQhoCXGMQAvD_BwE&nocache=1691950841"
    res = requests.post(ajaxlisting_url, data=payload)
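    # The endpoint answers with JSON; the rendered listing markup sits under data.html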
    html = json.loads(res.text)["data"]["html"]
    soup = BeautifulSoup(html, "html.parser")

    for i in soup.select(".elementor-size-default"):
        print(i.text)
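
get_title only fetches a single page of 12 results, so to approach the 10,000 items from the question you would call it once per page. A minimal sketch of that loop, assuming the page number simply keeps increasing until the catalogue runs out (the ~840-page bound and the one-second pause are my own guesses, not something the endpoint documents):

import time

if __name__ == "__main__":
    # posts_per_page is 12 above, so ~10,000 tools is roughly 840 pages
    for page in range(1, 841):
        get_title(page)
        time.sleep(1)  # brief pause between requests to avoid hammering the server

If a page past the end comes back empty, get_title simply prints nothing, so you could also change it to return the list of titles and break out of the loop once that list is empty.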
