我目前正在从某个 Reddit 子版块中检索数据,目标是获取该子版块自创建以来的所有帖子。但是,我遇到了一个限制:只能访问最新 1000 个帖子的信息。以下是我当前使用的代码:
是否有解决方法或替代方案,可以让我获取某个 Reddit 子版块自最早一条帖子以来的全部帖子?如能提供任何指导或帮助,我将不胜感激!谢谢。
# Handle for the subreddit to scrape. `reddit` is not defined in this snippet;
# presumably it is an authenticated praw.Reddit client created earlier — TODO confirm.
subreddit = reddit.subreddit("AfricanCichlids")
%%time
def scrape_subreddit_data(subreddit, *, request_delay=2):
    """Collect per-submission fields from ``subreddit.new()`` into column lists.

    Iterates ``subreddit.new(limit=None)`` and, for every submission, appends
    one value to each column list, so all lists end up with the same length
    and the result can be passed directly to ``pd.DataFrame``.

    Note that ``limit=None`` only asks for as many items as the listing will
    yield; per the discussion accompanying this snippet, Reddit caps listings
    at roughly the newest 1000 posts, so this does not reach back to the
    creation of the subreddit.

    Args:
        subreddit: Object exposing ``new(limit=None)`` that yields submissions
            (presumably a ``praw`` Subreddit -- confirm against the caller).
        request_delay: Seconds to pause per submission before its comment
            tree is expanded. Defaults to 2; pass 0 to disable the pause.

    Returns:
        dict mapping column name to a list with one entry per submission.
        ``'Comments'`` holds, per submission, the list of comment bodies.
        ``'Author'`` holds whatever object the submission exposes as
        ``author`` (it is not converted to a string here).
    """
    data = {
        'Title': [],
        'Author': [],
        'Upvotes': [],
        'Downvotes': [],
        'Score': [],
        'Number of Comments': [],
        'Creation Time': [],
        'Is NSFW': [],
        'Is Spoiler': [],
        'Is Locked': [],
        'Is Archived': [],
        'Subreddit Subscribers': [],
        'Subreddit Active Users': [],
        'Subreddit Creation Time': [],
        'Comments': [],
        # Add more fields as needed
    }

    for submission in subreddit.new(limit=None):
        data['Title'].append(submission.title)
        data['Author'].append(submission.author)
        # 'Upvotes' is taken from `ups`; the net value (ups - downs) is kept
        # separately under 'Score'.
        data['Upvotes'].append(submission.ups)
        data['Downvotes'].append(submission.downs)
        data['Score'].append(submission.ups - submission.downs)
        data['Number of Comments'].append(submission.num_comments)
        # created_utc is interpreted as seconds since the Unix epoch.
        data['Creation Time'].append(
            pd.to_datetime(submission.created_utc, unit='s')
        )
        data['Is NSFW'].append(submission.over_18)
        data['Is Spoiler'].append(submission.spoiler)
        data['Is Locked'].append(submission.locked)
        data['Is Archived'].append(submission.archived)

        # Subreddit-level values are read through the submission, so they are
        # repeated on every row.
        parent = submission.subreddit
        data['Subreddit Subscribers'].append(parent.subscribers)
        data['Subreddit Active Users'].append(parent.accounts_active)
        data['Subreddit Creation Time'].append(
            pd.to_datetime(parent.created_utc, unit='s')
        )

        # Rate limiting: pause before the comment tree is expanded.
        if request_delay > 0:
            time.sleep(request_delay)

        # Presumably resolves every "load more comments" placeholder so that
        # .list() yields only objects with a `body` -- confirm in PRAW docs.
        submission.comments.replace_more(limit=None)
        data['Comments'].append(
            [comment.body for comment in submission.comments.list()]
        )
        # Add more fields as needed

    return data
# Run the scraper on the subreddit handle; `data` maps each column name to a
# list holding one entry per submission.
data = scrape_subreddit_data(subreddit)
# Build a DataFrame from the column lists (one column per key in `data`).
df = pd.DataFrame(data)
由于 Reddit API 的限制,要检索某个子版块自创建以来的所有帖子可能比较困难。对于大多数列表(包括子版块中的帖子列表),Reddit 的 API 通常只允许请求检索最新的 1000 项。因此,除非您就此问题直接联系 Reddit,否则我认为这是无法做到的;即使存在某些变通方法,也不建议采用,因为这样做会违反 Reddit 的政策。