在我的脚本中,我想从 Instagram 的标签搜索页抓取数据。代码里 Selenium 会按照我设置的条件滚动搜索结果,但滚动加载完成后,当我把页面内容交给 bs4 对象解析时,它只返回最初加载的那批结果(大约 60–70 条),拿不到滚动之后才加载出来的数据。
import time
import re
import json
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Chrome options suitable for running in constrained/headless environments.
chop = Options()
chop.add_argument("--disable-gpu")
chop.add_argument("--no-sandbox")
chop.add_argument("--disable-dev-shm-usage")

# BUG FIX 1: the executable_path string literal was split across two physical
# lines in the original, which is a syntax error — it must stay on one line.
# BUG FIX 2: the original then created a SECOND bare `webdriver.Chrome()`,
# silently replacing this configured driver and discarding the options above.
driver = webdriver.Chrome(
    executable_path="c:/Users/Haseeb-Ahmad/3D Objects/chromedriver.exe",
    options=chop,
)
driver.implicitly_wait(10)

# Log in first so the login pop-up does not block the tag page later on.
driver.get('https://www.instagram.com/accounts/login/')
email_input = driver.find_elements_by_css_selector('form input')[0]
password_input = driver.find_elements_by_css_selector('form input')[1]
email_input.send_keys('#username')       # placeholder credentials
password_input.send_keys('#password')
password_input.send_keys(Keys.ENTER)     # login so i can avoid the login-popup
time.sleep(5)
try:
    SCROLL_PAUSE_TIME = 1
    driver.get("https://www.instagram.com/explore/tags/pakistan/")

    # Scroll to the bottom a bounded number of times so more posts load.
    # (The original loop scrolled twice per iteration and tracked a redundant
    # boolean flag + counter; a plain bounded for-loop does the same job.)
    for _ in range(11):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)

    bsObj = BeautifulSoup(driver.page_source, features='html.parser')
    scripts = bsObj.find_all('script', type="text/javascript",
                             text=re.compile('window._sharedData'))
    # Strip the "window._sharedData = " prefix and the trailing ";".
    stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
    # BUG FIX: the original subscript chain was missing two closing brackets
    # ("['graphql'['hashtag'[...]") — a syntax error. Correct chain below.
    data = (json.loads(stringified_json)
            ['entry_data']['TagPage'][0]
            ['graphql']['hashtag']['edge_hashtag_to_media']['edges'])
    # NOTE(review): this likely explains the asked problem — window._sharedData
    # holds only the INITIAL page payload; posts loaded while scrolling arrive
    # via XHR and never appear in this script tag. To capture scrolled-in
    # posts, scrape the live DOM elements (or the GraphQL endpoint) instead.
    print(len(data))  # check how many edges were returned
finally:
    # Always release the browser, even if scraping/parsing fails.
    driver.quit()
我在Twitter上也遇到过同样的问题,我发现bs4并不是最有效的库。
我改用 Selenium 直接从 DOM 中提取,效果很好。
这是我的代码:
# Accumulators for everything harvested while scrolling.
account_names = []
account_tags = []
account_link = []


def scroll():
    """Scroll the results page to the bottom, harvesting account names,
    handles and profile links from the live DOM after every scroll step.

    Relies on a module-level Selenium ``driver`` that already shows the
    search-results page. Results accumulate in the three module-level lists
    (with duplicates — each pass re-reads everything currently in the DOM).
    The XPaths/CSS classes are brittle: they are tied to Twitter's current
    generated markup and will break when the site layout changes.
    """
    # Original declared `global account_name` / `acctname` / `act_link` —
    # none of those globals served any purpose and have been removed.
    SCROLL_PAUSE_TIME = 1
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to the bottom and give new content time to load.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)

        # Display names.
        name_elems = driver.find_elements_by_xpath(
            '//*[@id="react-root"]/div/div/div/main/div/div/div/div/div/div[2]'
            '/div/div/section/div/div/div/div/div/div/div/div[2]/div[1]/div[1]'
            '/a/div/div[1]/div[1]/span/span')
        account_names.extend(el.text for el in name_elems)

        # @handles.
        handle_elems = driver.find_elements_by_xpath(
            '//*[@id="react-root"]/div/div/div/main/div/div/div/div/div/div[2]'
            '/div/div/section/div/div/div/div/div/div/div/div[2]/div[1]/div[1]'
            '/a/div/div[2]/div/span')
        account_tags.extend(el.text for el in handle_elems)

        # Profile links, via the class the anchor currently carries.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        anchors = soup.find_all(
            'a', href=True,
            class_='css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 '
                   'r-dnmrzs r-1ny4l3l')
        account_link.extend(a['href'] for a in anchors)

        # Stop once scrolling no longer grows the page.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


scroll()
希望这会有所帮助。