使用 Python 3.12.3 和 Selenium,我尝试在抓取之前加载更多行,并且对这个过程非常陌生。理想情况下是全部或至少尽可能多,但网站可能会在某个时候自动限制页面上的评论总数。通过手动点击,我可以再点击至少 10 次而不会失败。任何帮助将不胜感激。如果我可以提供任何其他背景信息,请告诉我。
这是我要单击的部分的屏幕截图:显示更多按钮。我始终无法单击此部分,我相信这是由于 :before pseudo-element(伪元素)造成的;此外,页面上还有许多 div.Button__container,导致我尝试单击时经常按到我无意按下的按钮。
这是我用来提取评论的脚本,但它无法实际单击加载更多行,因此我希望使用 selenium 来弥补这一差距:
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from IPython.display import display, Image
# Launch a Chrome session. NOTE(review): the browser is only used for the
# page title here -- the actual fetching below goes through `requests`, so
# the two are not connected.
driver = webdriver.Chrome()
title = driver.title

# NOTE(review): the original `for i in range(0, 1000, 100)` loop re-fetched
# the SAME url ten times -- the "after" cursor inside reviewFilters never
# changed, so no additional pages were ever loaded. Fetch once; real
# pagination needs a changing cursor (or the GraphQL endpoint, see below).
# McCurdy - I'm Glad my Mom Died
response = requests.get('https://www.goodreads.com/book/show/26074156/reviews?reviewFilters={%22workId%22:%22kca://work/amzn1.gr.work.v1.FSsY8ohzUZCeEXoBsiEYqw%22,%22after%22:%22NjgxNSwxNTAwNjU3MjE4NDI1%22}')
print(response.status_code)
time.sleep(6)

doc = BeautifulSoup(response.text, 'html.parser')

# FIX: `book_tags_` was referenced but never defined (NameError). Collect
# every element that carries an aria-label; the rating widgets expose their
# value as 'Rating N out of 5' in that attribute, and the filter below
# skips any non-rating matches.
book_tags_ = doc.select('[aria-label]')

ratings = []
# Loop through all elements found
for tag in book_tags_:
    # Get the aria-label attribute from the current element
    aria_label = tag.get('aria-label')
    # Check if aria-label is not None and contains the expected format
    if aria_label and 'Rating ' in aria_label and ' out of 5' in aria_label:
        # Split the aria-label to extract the desired text
        rating_text = aria_label.split('Rating ')[1].split(' out of 5')[0]
        # Append the rating_text to the list of ratings
        ratings.append(rating_text)
    else:
        print(f"Skipping element with aria-label: {aria_label}")

# Create a dataframe from the list of ratings. (The original also built an
# empty 5-column DataFrame earlier that was immediately overwritten here;
# that dead assignment is removed.)
df = pd.DataFrame({'Rating': ratings})
# This is repeated for additional fields
这是我用来尝试按下按钮的代码:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, ElementClickInterceptedException
# Initialize WebDriver
driver = webdriver.Chrome() # or another driver you're using
# Reviews page url; the reviewFilters query parameter carries the workId and
# an "after" pagination cursor (percent-encoded JSON).
url = "https://www.goodreads.com/book/show/59364173/reviews?reviewFilters={%22workId%22:%22kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k%22,%22after%22:%22MjYwMTYsMTY2MDc1MjY5MjY2Mw%22}"
# Open the page
driver.get(url)
# Fixed wait for the initial page render before any clicking starts.
time.sleep(20)
def click_show_more(max_clicks=None):
    """Repeatedly click the reviews' 'Show more' button until it is gone.

    max_clicks: optional cap on how many times to click (None = keep going
    until the button can no longer be located). Stops on timeout or any
    unexpected error.
    """
    # FIX: the page contains many `div.Button__container` wrappers, so the
    # original selector frequently matched (and clicked) the wrong control.
    # Target the actual <button> inside the divider's container; the JS
    # click below also sidesteps the ::before pseudo-element overlay that
    # intercepts native clicks.
    selector = ('div.Divider.Divider--contents.Divider--largeMargin '
                'div.Button__container button')
    clicks = 0
    while max_clicks is None or clicks < max_clicks:
        try:
            # Scroll to the bottom of the page to ensure button is in view
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Find the 'Show More' button
            show_more_button = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
            )
            # Scroll the element into view
            driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
            # Ensure the element is clickable
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            # Click via JavaScript so an overlay cannot intercept the click.
            driver.execute_script("arguments[0].click();", show_more_button)
            # Wait for the clicked button to go stale, i.e. the page swapped
            # it out after loading more reviews.
            WebDriverWait(driver, 10).until(EC.staleness_of(show_more_button))
            clicks += 1
        except TimeoutException:
            print("No more 'Show More' button found or timed out.")
            break
        except StaleElementReferenceException:
            print("StaleElementReferenceException: Trying to find the button again.")
            continue
        except ElementClickInterceptedException:
            print("ElementClickInterceptedException: Element is being obstructed.")
            continue
        except Exception as e:
            print(f"Error clicking 'Show More' button: {e}")
            break


# Remember to call your function
click_show_more()
注意:此答案包含实现目标的不同方法。 (使用的模块:requests、bs4、pandas、time)
根据您的问题,我认为您正在尝试获取与评论数据相关的所有信息。好吧,我发现了一个比使用 Selenium 更好的解决方案,以下是我的思路:
您的目标应用程序有一个 Graphql API 端点
https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql
,它从服务器获取所有评论详细信息,因此,如果我们发送请求以在 POST 正文上指定 resourseId
到此端点,我们可以在帮助下轻松获取这些数据python 请求库和一些编码。这是我的代码:
注意:为了避免速率限制,我使用 time.sleep(3) 来最小化线程。
import time
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
import pandas
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def get_data(content):
    """Print every review in one GraphQL `getReviews` response page.

    content: the decoded JSON body of a getReviews response
             (content['data']['getReviews']['edges'] holds the reviews).
    Returns the parsed reviews as a list of dicts (the original printed
    only; returning the records is backward compatible for callers that
    ignore the result).
    """
    records = []
    data = content['data']['getReviews']['edges']
    for i in data:
        id_user = i['node']['creator']['id']
        rev_img = i['node']['creator']['imageUrlSquare']
        is_author = i['node']['creator']['isAuthor']
        follower_count = i['node']['creator']['followersCount']
        name = i['node']['creator']['name']
        # The review text arrives as HTML; strip the markup.
        review = BeautifulSoup(i['node']['text'], "lxml").text
        review_create_data = i['node']['createdAt']
        # createdAt is a unix timestamp in milliseconds.
        result_ms = pandas.to_datetime(review_create_data, unit='ms')
        review_liked = i['node']['likeCount']
        rating = i['node']['rating']
        records.append({
            'name': name, 'user_id': id_user, 'is_author': is_author,
            'date': result_ms, 'image': rev_img, 'followers': follower_count,
            'review': review, 'rating': rating, 'likes': review_liked,
        })
        # FIX: corrected the garbled "Is reviewr is author" label.
        print(f"reviewers_name: {name}\nreviewer_user_id: {id_user}\nIs reviewer the author: {is_author}\nreview_date: {result_ms}\nreviewer_image: {rev_img}\nreviewer_follower: {follower_count}\nreview: {review}\nreview_ratings: {rating}\nreview_liked: {review_liked}\n========================================")
    return records
def gatherNextPage(resourceId):
    """Page through ALL Goodreads reviews of a work via its GraphQL API.

    resourceId: the work id (e.g. "kca://work/amzn1.gr.work.v1...."),
                i.e. the `workId` value inside the page url's
                reviewFilters parameter.

    Each page (100 reviews) is handed to get_data() for printing; loops
    until the API returns no nextPageToken.
    """
    url = "https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0",
        "X-Api-Key": "da2-xpgsdydkbregjhpr6ejzqdhuwy"
    }
    # The GraphQL document is identical for every request -- define it once
    # (the original duplicated this whole string for the follow-up pages).
    query = "query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {\n getReviews(filters: $filters, pagination: $pagination) {\n ...BookReviewsFragment\n __typename\n }\n}\n\nfragment BookReviewsFragment on BookReviewsConnection {\n totalCount\n edges {\n node {\n ...ReviewCardFragment\n __typename\n }\n __typename\n }\n pageInfo {\n prevPageToken\n nextPageToken\n __typename\n }\n __typename\n}\n\nfragment ReviewCardFragment on Review {\n __typename\n id\n creator {\n ...ReviewerProfileFragment\n __typename\n }\n recommendFor\n updatedAt\n createdAt\n spoilerStatus\n lastRevisionAt\n text\n rating\n shelving {\n shelf {\n name\n webUrl\n __typename\n }\n taggings {\n tag {\n name\n webUrl\n __typename\n }\n __typename\n }\n webUrl\n __typename\n }\n likeCount\n viewerHasLiked\n commentCount\n}\n\nfragment ReviewerProfileFragment on User {\n id: legacyId\n imageUrlSquare\n isAuthor\n ...SocialUserFragment\n textReviewsCount\n viewerRelationshipStatus {\n isBlockedByViewer\n __typename\n }\n name\n webUrl\n contributor {\n id\n works {\n totalCount\n __typename\n }\n __typename\n }\n __typename\n}\n\nfragment SocialUserFragment on User {\n viewerRelationshipStatus {\n isFollowing\n isFriend\n __typename\n }\n followersCount\n __typename\n}\n"

    next_token = None
    while True:
        time.sleep(3)  # crude rate limiting, as noted above
        pagination = {"limit": 100}
        if next_token:
            pagination["after"] = next_token
        payload = {
            "operationName": "getReviews",
            "variables": {
                "filters": {
                    "resourceType": "WORK",
                    # FIX: the original hardcoded one specific work id in the
                    # follow-up-page payload, ignoring the resourceId
                    # argument -- pagination for any other book silently
                    # fetched the wrong work.
                    "resourceId": resourceId,
                },
                "pagination": pagination,
            },
            "query": query,
        }
        # verify=False matches the InsecureRequestWarning suppression at the
        # top of the script.
        resp = requests.post(url, headers=headers, json=payload, verify=False)
        data = resp.json()
        get_data(data)
        next_token = data['data']['getReviews']['pageInfo']['nextPageToken']
        if not next_token:
            break
gatherNextPage("kca://work/amzn1.gr.work.v1.FSsY8ohzUZCeEXoBsiEYqw")
关于 resourceId:好吧,resourceId 实际上就是您网址中的 workId,例如
reviewFilters={%22workId%22:%22kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k%22,%22after%22:%22MjYwMTYsMTY2MDc1MjY5MjY2Mw%22}
中 workId 对应的值。
希望这会有所帮助。
谢谢