我尝试了不同的方法,但在抓取 10,000 个项目时,它们的效率似乎都很低。有人告诉我,应该使用网站的 API 之类的接口,但我不知道该如何调用它。
这是我的代码:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
def main():
    """Scroll aitoptools.com in Chrome and print every matching heading.

    Opens the site in a Selenium-driven Chrome session, scrolls to the
    bottom of the page ``num_scrolls`` times (sleeping after each scroll so
    lazily loaded content can arrive), then parses the final page source
    with BeautifulSoup and prints the text of each ``<h2>`` carrying the
    classes ``elementor-heading-title elementor-size-default``, followed by
    the number of headings found.

    The browser is always shut down, even if navigation or scrolling
    raises, so no Chrome/ChromeDriver process is left behind.
    """
    driver = webdriver.Chrome()  # You need to have ChromeDriver installed and its path set correctly
    try:
        driver.get("https://aitoptools.com/")
        driver.implicitly_wait(10)
        num_scrolls = 10
        for _ in range(num_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(15)  # Wait for content to load (you might need to adjust the time)
        # Grab the HTML before quitting; the session is gone afterwards.
        page_source = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    titles = soup.find_all("h2", class_="elementor-heading-title elementor-size-default")
    for title in titles:
        print(title.text)
    print(len(titles))


if __name__ == "__main__":
    main()
可以通过使用 requests 和 BeautifulSoup 库来完成。
import json
import requests
from bs4 import BeautifulSoup
def get_title(page):
    """Fetch one page of the tool listing and print the titles it contains.

    Sends the ``listing_load_more`` form (action ``jet_engine_ajax``) to
    aitoptools.com via POST, reads the HTML fragment found under
    ``data.html`` in the JSON response, and prints the text of every
    element matching the CSS selector ``.elementor-size-default``.

    Args:
        page: Page number sent as ``page_settings[page]``.

    Returns:
        A list with the text of each matched element, in document order.
        The list is empty when the fragment contains no matching element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 30 seconds.
        KeyError: If the JSON response lacks ``data`` or ``html``.
    """
    # NOTE(review): "query[paged]" stays at 1 while "page_settings[page]"
    # carries the requested page -- presumably the endpoint paginates on the
    # latter; confirm against the site's own load-more requests.
    payload = {
        "action": "jet_engine_ajax",
        "handler": "listing_load_more",
        "query[post_status][]": "publish",
        "query[post_type][]": "tool",
        "query[posts_per_page]": 12,
        "query[paged]": 1,
        "query[ignore_sticky_posts]": 1,
        "query[orderby]": "meta_value_num",
        "query[meta_key]": "popularity",
        "query[meta_type]": "NUMERIC",
        "query[suppress_filters]": False,
        "query[jet_smart_filters]": "jet-engine/default",
        "widget_settings[lisitng_id]": 43,
        "widget_settings[posts_num]": 12,
        "widget_settings[columns]": 3,
        "widget_settings[columns_tablet]": 2,
        "widget_settings[columns_mobile]": 1,
        "widget_settings[is_archive_template]": "",
        "widget_settings[post_status][]": "publish",
        "widget_settings[use_random_posts_num]": "",
        "widget_settings[max_posts_num]": 9,
        "widget_settings[not_found_message]": "No data was found",
        "widget_settings[is_masonry]": False,
        "widget_settings[equal_columns_height]": "yes",
        "widget_settings[use_load_more]": "yes",
        "widget_settings[load_more_id]": "",
        "widget_settings[load_more_type]": "scroll",
        "widget_settings[load_more_offset][unit]": "px",
        "widget_settings[load_more_offset][size]": 1000,
        "widget_settings[use_custom_post_types]": "yes",
        "widget_settings[custom_post_types][]": "tool",
        "widget_settings[hide_widget_if]": "",
        "widget_settings[carousel_enabled]": "",
        "widget_settings[slides_to_scroll]": 1,
        "widget_settings[arrows]": True,
        "widget_settings[arrow_icon]": "fa fa-angle-left",
        "widget_settings[dots]": "",
        "widget_settings[autoplay]": True,
        "widget_settings[autoplay_speed]": 5000,
        "widget_settings[infinite]": True,
        "widget_settings[center_mode]": "",
        "widget_settings[effect]": "slide",
        "widget_settings[speed]": 500,
        "widget_settings[inject_alternative_items]": "",
        "widget_settings[scroll_slider_enabled]": "",
        "widget_settings[scroll_slider_on][]": ["desktop", "tablet", "mobile"],
        "widget_settings[custom_query]": False,
        "widget_settings[custom_query_id]": "",
        "widget_settings[_element_id]": "",
        "page_settings[post_id]": False,
        "page_settings[queried_id]": False,
        "page_settings[element_id]": False,
        "page_settings[page]": page,
        "listing_type": False,
        "isEditMode": False,
        "addedPostCSS[]": 43,
    }
    ajaxlisting_url = "https://aitoptools.com/?gclid=CjwKCAjw_uGmBhBREiwAeOfsd1dsGiFy6fi4gPYJAqzH87j-KFxpm04zduFu6MpCcUrUTYTuiPaXQhoCXGMQAvD_BwE&nocache=1691950841"

    # Without a timeout requests waits indefinitely on an unresponsive
    # server, which would stall a long multi-page crawl.
    res = requests.post(ajaxlisting_url, data=payload, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    res.raise_for_status()

    html = res.json()["data"]["html"]
    soup = BeautifulSoup(html, "html.parser")
    titles = [node.text for node in soup.select(".elementor-size-default")]
    for title in titles:
        print(title)
    return titles