我的目标是:抓取(scrape)用户在可汗学院(Khan Academy)完成的项目数量。
为此,我需要解析用户的个人资料页面。但我需要先反复点击 show more 按钮,
才能看到用户做过的所有项目,然后再抓取它们。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException
from bs4 import BeautifulSoup
# Example profile used for the scrape.
driver = webdriver.Chrome()
driver.get('https://www.khanacademy.org/profile/trekcelt/projects')

# Keep clicking the "show more" button until it can no longer be found.
while True:
    try:
        # Wait up to 10 seconds for the button to be present in the DOM.
        showmore_project = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'showMore_17tx5ln'))
        )
        # NOTE: this is a native Selenium click on the located element. As
        # reported in the text that follows this script, the click can land on
        # a project link instead of the button and navigate away from the
        # profile page, which is why the final count comes out as 0.
        showmore_project.click()
    except TimeoutException:
        # No button showed up within the timeout: stop clicking.
        break
    except StaleElementReferenceException:
        # The located element went stale before it could be clicked: stop.
        break

# Parse whatever page the browser is currently showing.
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Collect every element carrying the project-title class.
project = soup.find_all(class_='title_1usue9n')
# Report how many projects were found.
print(len(project))
这段代码中的 print(len(project)) 返回 0。
这是不正常的,因为当你手动查看 https://www.khanacademy.org/profile/trekcelt/projects
时,可以看到项目数量绝对不是 0。
奇怪的是:一开始可以看到(通过 webdriver 打开的浏览器)这段代码工作正常,但随后 selenium 点击的不是 show more 按钮,
而是其中某个项目的链接,导致页面跳转,这就是我们得到 0 的原因。
我不明白该如何修改代码,才能让 selenium 只点击正确的按钮,而不点击其他元素。
查看以下实现以获得所需的行为。脚本运行时,请仔细查看滚动条以查看进度。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Count the projects on a Khan Academy profile by expanding the list with the
# "show more" link until it is gone, then parsing the rendered HTML.
# The driver is used as a context manager, so everything that needs the
# browser (including reading page_source) stays inside the `with` block.
with webdriver.Chrome() as driver:
    wait = WebDriverWait(driver, 10)
    driver.get('https://www.khanacademy.org/profile/trekcelt/projects')

    while True:
        try:
            # Locate the link directly under the element whose class starts
            # with "showMore"; waits up to 10 seconds for it to be present.
            showmore = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '[class^="showMore"] > a')
                )
            )
            # Click through JavaScript so the event is dispatched on this
            # exact element rather than on whatever is under the pointer.
            driver.execute_script("arguments[0].click();", showmore)
        except Exception:
            # Deliberately broad: any failure to find or click the link
            # (timeout, stale element, ...) is treated as "nothing more to
            # load" and ends the loop.
            break

    # Parse the fully expanded page and count the project titles.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    project = soup.find_all(class_='title_1usue9n')
    print(len(project))
另一种方式是:
# Alternative click loop: uses a native click after scrolling the link into
# view, then waits for the loading spinner to disappear before looking for
# the next "show more" link. Expects `wait` (a WebDriverWait) to be defined
# already, as in the script shown before this fragment.
while True:
    try:
        showmore = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '[class^="showMore"] > a')
            )
        )
        # Accessing this property is done for its side effect: it scrolls
        # the element into view so the native click targets it.
        showmore.location_once_scrolled_into_view
        showmore.click()
        # Block until the loading spinner is no longer visible.
        wait.until(
            EC.invisibility_of_element_located(
                (
                    By.CSS_SELECTOR,
                    '[class^="spinnerContainer"] > img[class^="loadingSpinner"]',
                )
            )
        )
    except Exception:
        # Deliberately broad: any failure to find or click the link, or to
        # see the spinner go away in time, ends the loop.
        break
此时输出:
381
我修改了已接受的答案,以提高脚本的性能。具体的实现方式见代码中的注释。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from bs4 import BeautifulSoup
import time
# Timed variant: count the projects on a Khan Academy profile while avoiding
# the 10-second explicit wait on the final iteration.
start_time = time.time()

# Example profile used for the scrape.
with webdriver.Chrome() as driver:
    driver.get('https://www.khanacademy.org/profile/trekcelt/projects')

    # Wait (up to 10 seconds) only once, for the first "show more" element to
    # be present after the page has loaded, and click it.
    showmore_project = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'showMore_17tx5ln'))
    )
    showmore_project.click()

    # Keep clicking the button until it can no longer be found.
    while True:
        try:
            # Look the button up without an explicit wait: as soon as it no
            # longer exists, NoSuchElementException is raised immediately,
            # which saves the 10-second timeout at the end.
            # find_element(By.CSS_SELECTOR, ...) is used because the
            # find_element_by_* helpers were removed in Selenium 4.
            showmore_project = driver.find_element(
                By.CSS_SELECTOR, '.showMore_17tx5ln [role="button"]'
            )
            # Click through JavaScript so the event is dispatched on this
            # exact element and cannot land on a different one.
            driver.execute_script("arguments[0].click();", showmore_project)
        except StaleElementReferenceException:
            # The button was replaced between lookup and click: look it up
            # again.
            continue
        except NoSuchElementException:
            # No button left: stop clicking.
            break

    # Parse the expanded profile page.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Collect every element carrying the project-title class.
    project = soup.find_all(class_='title_1usue9n')
    # Report how many projects were found.
    print(len(project))

# Report the elapsed wall-clock time in seconds.
print(time.time() - start_time)
执行时间1:14.343502759933472 秒;执行时间2:13.955228090286255 秒。希望这对你有所帮助!