我目前正在使用 Selenium 和 BeautifulSoup 进行网络抓取项目,以基于主题标签从 Instagram 提取帖子数据。但是,在运行我的代码时,它返回 0 个帖子。这是我到目前为止所做的:
设置 Selenium webdriver 并初始化它。
使用提供的主题标签导航到 Instagram 主题标签页面。
将会话 ID(sessionid)cookie 添加到 WebDriver。
刷新页面以应用会话 ID cookie。
等待页面加载。
提取 HTML 内容并使用 BeautifulSoup 对其进行解析。
定位页面的主要部分(`<main>` 元素)。
通过遍历帖子元素提取帖子 URL、用户名和点赞数。
将点赞数超过 1000 的帖子添加到 post_data 列表中。
然而,尽管执行了这些步骤,我在提取的数据中得到了 0 个帖子。你能告诉我我可能做错了什么或建议一种替代方法来检索所需的帖子数据吗?任何帮助将不胜感激。谢谢!
from json.tool import main
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pprint import pprint
from selenium_stealth import stealth
import json
import time
def setup_webdriver(
    webdriver_path,
    user_data_dir='/Users/ronnydiaz/Library/Application Support/Google/Chrome/Default',
):
    """Create a new Chrome webdriver instance using Service.

    Args:
        webdriver_path: Filesystem path to the chromedriver executable.
        user_data_dir: Value passed to Chrome's ``--user-data-dir`` switch.
            Defaults to the path this script originally hard-coded. Pass a
            falsy value (``None`` or ``''``) to omit the switch entirely.

    Returns:
        The ``webdriver.Chrome`` instance built from the configured options.
    """
    service = Service(webdriver_path)
    options = webdriver.ChromeOptions()
    if user_data_dir:
        # NOTE(review): the default path ends in ".../Chrome/Default", which
        # looks like a profile directory rather than the user-data root --
        # confirm against the Chrome documentation before relying on it.
        options.add_argument(f'--user-data-dir={user_data_dir}')
    options.add_argument("start-maximized")
    # Drop the automation switch/extension that Chrome adds under Selenium.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    return webdriver.Chrome(service=service, options=options)
def _parse_like_count(text):
    """Convert like-count text ('1,234', '12.5k', '2m likes') to an int; 0 if unparseable."""
    import re

    match = re.search(r'(\d[\d,]*(?:\.\d+)?)\s*([kKmM])?\b', text or '')
    if match is None:
        return 0
    number = float(match.group(1).replace(',', ''))
    scale = {'k': 1_000, 'm': 1_000_000}.get((match.group(2) or '').lower(), 1)
    return int(round(number * scale))


def get_post_data(driver, hashtag, session_id=None, min_likes=1000, timeout=10):
    """Get the post data for the given hashtag.

    Opens the Instagram tag page, installs the ``sessionid`` cookie, reloads
    the page, waits for post links to appear and then parses the rendered
    HTML with BeautifulSoup.

    Args:
        driver: Selenium webdriver used to load the page.
        hashtag: Tag to look up; a leading ``#`` is ignored.
        session_id: Value for the ``sessionid`` cookie. When ``None`` the
            value originally embedded in this script is used.
        min_likes: Only posts with strictly more likes than this are returned.
        timeout: Seconds to wait for post links to be present.

    Returns:
        A list of dicts with keys ``post_url``, ``username`` and ``likes``.
        The list is empty when the browser was redirected to a login page or
        no post links appeared within ``timeout`` seconds.
    """
    # Imported locally so the block does not need a new file-level import.
    from selenium.common.exceptions import TimeoutException

    if session_id is None:
        # NOTE(review): a live session cookie should not be committed to
        # source; prefer passing ``session_id`` in from the caller.
        session_id = '37384277226%3AeIYsLt3LDOCNoW%3A3%3AAYfZbBm3JLc1fAaagWhF2ly0ZoruzLUCrKJMwRVRcQ'

    # A leading '#' would be read as a URL fragment and break the tag path.
    tag = hashtag.lstrip('#')
    driver.get(f"https://www.instagram.com/explore/tags/{tag}/")
    driver.add_cookie({'name': 'sessionid', 'value': session_id,
                       'domain': '.instagram.com'})
    # The cookie is stored under 'sessionid'; looking it up as 'session_id'
    # (as before) can only ever print None.
    print(driver.get_cookie("sessionid"))
    # Refresh the page to apply the session ID cookie
    driver.refresh()

    print(f"Attempting: {driver.current_url}")
    # Check for the login redirect *before* waiting: on a login page the wait
    # below would time out and raise, so the check was never reached.
    if "login" in driver.current_url:
        print("Failed/ redir to login")
        return []

    # Wait for the page to load and post elements to be present
    wait = WebDriverWait(driver, timeout)
    try:
        wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'a[href*="/p/"]')))
    except TimeoutException:
        if "login" in driver.current_url:
            print("Failed/ redir to login")
        else:
            print("Timed out waiting for post links")
        return []

    # Extract the HTML content and parse it using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    main_section = soup.find('main')
    print(main_section)
    # Fall back to the whole document when there is no <main> element.
    container = soup if main_section is None else main_section

    # Extract post URLs, usernames, and likes
    post_data = []
    for post_element in container.select('a[href*="/p/"]'):
        post_url = post_element['href']
        # NOTE(review): this takes the first <span> following the link in
        # document order; whether that is the username depends on the page
        # markup -- verify against the live DOM.
        username_element = post_element.find_next('span')
        username = username_element.text if username_element else ''
        # NOTE(review): 'zV_Nj' looks like a generated class name and
        # presumably changes between site builds. If it no longer matches,
        # every post gets 0 likes and is filtered out, which would produce
        # an empty result -- verify against the live DOM.
        likes_element = post_element.find_next('span', class_='zV_Nj')
        likes_count = _parse_like_count(likes_element.text) if likes_element else 0
        if likes_count > min_likes:
            post_data.append({
                'post_url': f"https://www.instagram.com{post_url}",
                'username': username,
                'likes': likes_count,
            })
    return post_data
def save_post_data(post_data, filename):
    """Write ``post_data`` to ``filename`` as JSON and report how many posts it holds.

    The file is opened in text write mode (replacing any existing content)
    and the data is serialized with a 4-space indent. Afterwards the number
    of entries in ``post_data`` is printed.
    """
    with open(filename, mode='w') as sink:
        json.dump(obj=post_data, fp=sink, indent=4)
    print(f"Total posts: {len(post_data)}")
if __name__ == '__main__':
    # Set up the webdriver
    webdriver_path = '/usr/local/bin/chromedriver'
    driver = setup_webdriver(webdriver_path)
    try:
        # Get the post data for the hashtag 'health'
        hashtag = 'health'
        post_data = get_post_data(driver, hashtag)
        # Save the post data to a JSON file
        filename = 'post_data.json'
        save_post_data(post_data, filename)
    finally:
        # Close the browser even when scraping or saving raises, so a failed
        # run does not leave a Chrome process behind.
        driver.quit()