import tweepy
import csv
import json
import nltk
import re
def scrub_text(string):
    """Normalize *string* and keep only recognized English words.

    Every run of non-alphabetic characters is collapsed into a single
    space, the text is lowercased, and tokens not found in the NLTK
    ``words`` vocabulary are dropped.

    Parameters:
        string: arbitrary text (e.g. concatenated tweet bodies).

    Returns:
        A single space-joined string of lowercase English words.
    """
    # Fetch/build the vocabulary only once per process.  The original code
    # called nltk.download() and rebuilt the ~236k-entry set on every call,
    # which is slow and touches the network check each time.
    if not hasattr(scrub_text, "_words"):
        nltk.download('words', quiet=True)
        scrub_text._words = set(nltk.corpus.words.words())
    words = scrub_text._words

    # Collapse anything that is not a letter into a space, then lowercase.
    string = re.sub(r'[^a-zA-Z]+', ' ', string).lower()

    # Keep only tokens present in the English vocabulary.  The
    # ``not w.isalpha()`` guard is kept for parity with the original
    # behavior, although non-alpha tokens cannot survive the substitution
    # above.
    return " ".join(w for w in nltk.wordpunct_tokenize(string)
                    if w.lower() in words or not w.isalpha())
def get_all_tweets():
    """Download a user's recent tweets and write the scrubbed text to tweets.txt.

    Prompts for a Twitter handle, authenticates with credentials from
    ``twitter_credentials.json`` (keys: API_KEY, API_SECRET, ACCESS_TOKEN,
    ACCESS_SECRET), pages backwards through the user timeline with
    ``max_id``, scrubs the combined text via :func:`scrub_text`, and writes
    the result to ``tweets.txt``.

    NOTE: the user-timeline endpoint returns at most ~3200 of an account's
    most recent tweets; very active accounts cannot be fully archived this
    way.
    """
    with open('twitter_credentials.json') as cred_data:
        info = json.load(cred_data)
    consumer_key = info['API_KEY']
    consumer_secret = info['API_SECRET']
    access_key = info['ACCESS_TOKEN']
    access_secret = info['ACCESS_SECRET']

    screen_name = input("Enter twitter Handle: ")

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    # BUG FIX: the original read the access token from the credentials file
    # but never attached it, leaving the OAuth handler only half-configured.
    auth.set_access_token(access_key, access_secret)
    # NOTE(review): wait_on_rate_limit_notify was removed in tweepy >= 4;
    # kept here because the original targets the tweepy 3.x API.
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True,
                     timeout=500000, retry_count=10, retry_delay=100)

    all_the_tweets = []
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    all_the_tweets.extend(new_tweets)

    # Page backwards: max_id is exclusive of the tweet it names, hence -1.
    # BUG FIX: computing oldest_tweet inside the loop (only while we still
    # have results) avoids the IndexError the original raised on an empty
    # or protected timeline.
    while new_tweets:
        oldest_tweet = all_the_tweets[-1].id - 1
        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest_tweet)
        all_the_tweets.extend(new_tweets)
        print('...%s tweets downloaded' % len(all_the_tweets))

    # BUG FIX: the original stringified a list of utf-8 *bytes* objects, so
    # scrub_text received "[[b'...'], ...]" with bracket and b'' artifacts.
    # Join the plain tweet texts instead.
    combined_text = " ".join(tweet.text for tweet in all_the_tweets)
    scrubbed = scrub_text(combined_text)

    # ``with`` closes the file; the original's explicit f.close() inside the
    # with-block was redundant.  Pin the encoding so non-ASCII survivors do
    # not crash on platforms with a non-UTF-8 default.
    with open('tweets.txt', 'w', encoding='utf-8') as f:
        f.write(scrubbed)
上面的python代码应该从特定用户下载所有tweet。它似乎适用于大多数句柄,但是当我将其用于@realDonaldTrump时,有时会得到800,有时会得到1。我从来没有接近所有推文。由于帐户的活跃程度,我认为存在问题,但是我认为应该有一种解决方法。
Twitter 的用户时间轴 API 最多只能返回某个账号最近的约 3200 条推文（参见 Twitter 官方开发者文档中 user_timeline 的说明），实际能取回的数量还可能取决于推文的年龄以及分页方式。很遗憾，无法通过该标准 API 获取账号的全部历史推文；若确实需要检索所有推文，可以考虑使用收费的 Full Archive 全量搜索 API。
至于每次运行结果数量差异悬殊（有时 800 条、有时仅 1 条）的问题，这更像是分页或速率限制引起的偶发故障——正常情况下多次运行的结果不应相差这么大。