我正在尝试抓取 Power BI 仪表板上的一个动态表格,具体来说是页面上的第 2 个表格。我遇到了两个问题。主要问题是:垂直滚动抓取时无法收集到所有行的数据,在大约 2000 行中只能抓到 248 行左右。第二个问题是:水平滚动之后只能收集到 9 列,因为其余的列已经被移出了 DOM。对于第二个问题,我打算把代码运行两次,第二次不做水平滚动。
如果有人能帮我弄清楚如何抓取所有行,我将不胜感激。我看到表格确实一直滚动到了底部,所以问题似乎在于:需要先等待数据加载完成,然后再进行抓取。
# Initialize the driver, replace with your own setup
#
# Script from the question: scrape the second table of a public Power BI
# report by dragging its scroll bars and reading the rows found in the DOM
# after every drag. Hard line wraps and lost indentation have been repaired
# so the script parses; the scrolling logic itself is unchanged.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, MoveTargetOutOfBoundsException
import pandas as pd
import time

# Initialize the WebDriver and navigate to the URL
driver = webdriver.Chrome()
driver.get("https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection")
wait = WebDriverWait(driver, 20)

# Pick the second table container found on the page
table = driver.find_elements(By.CSS_SELECTOR, 'div.tableExContainer')[1]

# Locate the scroll bars (selected by their position among all bars on the page)
scrolls = driver.find_elements(By.CSS_SELECTOR, 'div.scroll-bar-part-bar')
h_scroll = scrolls[2]  # Horizontal scroll bar
v_scroll = scrolls[3]  # Vertical scroll bar

# Perform initial horizontal scrolling if necessary
ActionChains(driver).move_to_element(h_scroll).click_and_hold().move_by_offset(500, 0).release().perform()
time.sleep(1)  # Allow for loading

all_row_data = []  # List to store all rows data before creating DataFrame

# Begin vertical scrolling
previous_row_count = 0
flag = True
while flag:
    try:
        # Wait for rows to be visible and get the current count
        current_rows = wait.until(
            EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div[role='row']"))
        )
        # NOTE(review): this stop condition compares the number of row
        # elements in the DOM between passes. That number is measured here
        # before the header is popped but stored below after it is popped,
        # and it says nothing about *which* rows are rendered, so it is not
        # a reliable "new rows were loaded" signal.
        if len(current_rows) == previous_row_count:
            raise TimeoutException("No new rows loaded after scroll.")

        # Skip the header row and capture data
        current_rows.pop(0)  # Assuming the first row is the header
        for row in current_rows:
            cells = row.find_elements(By.CSS_SELECTOR, "div[role='gridcell']")
            # Cells with empty text are dropped, rows without any text are skipped
            row_data = [cell.text for cell in cells if cell.text]
            if row_data:
                all_row_data.append(row_data)

        # Update row count after processing
        previous_row_count = len(current_rows)

        # Scroll down
        ActionChains(driver).move_to_element(v_scroll).click_and_hold().move_by_offset(0, 100).release().perform()
        time.sleep(5)  # Allow time for new rows to load
    except TimeoutException as e:
        print(e)
        flag = False  # Exit loop if no new rows load
    except MoveTargetOutOfBoundsException:
        print("Reached the end of the table or cannot scroll further.")
        flag = False
    except Exception as e:
        print(f"Encountered an exception: {e}")
        flag = False

# Create DataFrame from collected data; headers with empty text are left out
df = pd.DataFrame(
    all_row_data,
    columns=[
        header.text.strip()
        for header in table.find_elements(By.CSS_SELECTOR, "div[role='columnheader']")
        if header.text.strip()
    ],
)
driver.quit()
print(df)
以下代码可以解决该问题:页面打开后先等待报表渲染完成,并且不再比较 DOM 中的行数,而是通过比较已加载行的 row-index 属性来判断滚动之后是否出现了新的行。
# Initialize the driver, replace with your own setup
#
# Scrape the second table of a public Power BI report.
#
# The script drags the table's vertical scroll bar step by step and, after
# every step, stores the rows it finds in the DOM keyed by their "row-index"
# attribute. Keying by row-index means rows that are still rendered from the
# previous step are stored only once, and scrolling stops as soon as the
# highest row-index no longer changes.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    MoveTargetOutOfBoundsException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
import pandas as pd
import time

REPORT_URL = "https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection"

ROW_SELECTOR = "div[role='row']"
CELL_SELECTOR = "div[role='gridcell']"
HEADER_SELECTOR = "div[role='columnheader']"

# Fixed delay after opening the report. The scroll bars are picked by their
# position among *all* bars on the page, so the lookup is only meaningful once
# every visual has rendered.
PAGE_SETTLE_SECONDS = 30
SCROLL_SETTLE_SECONDS = 5   # time given to the table to load rows after a drag
HORIZONTAL_OFFSET_PX = 500  # initial horizontal drag
SCROLL_STEP_PX = 100        # vertical drag per pass
STALE_RETRIES = 3           # re-reads of a batch whose rows were re-rendered


def drag(driver, handle, dx: int, dy: int) -> None:
    """Press *handle*, move the pointer by (dx, dy) pixels and release it."""
    ActionChains(driver).move_to_element(handle).click_and_hold().move_by_offset(dx, dy).release().perform()


def _read_batch_once(table, rows_by_index: dict):
    """Store the rows currently found in *table*; return the highest row-index.

    Only rows carrying a numeric ``row-index`` attribute are considered, and a
    row is stored only if at least one of its grid cells has text. Rows whose
    index is already in *rows_by_index* are not read again. Returns ``None``
    when no row with a numeric ``row-index`` was found.
    """
    highest = None
    for row in table.find_elements(By.CSS_SELECTOR, ROW_SELECTOR):
        raw_index = row.get_attribute("row-index")
        if not raw_index or not raw_index.isdigit():
            # e.g. a header row; previously int(None) ended the whole scrape
            continue
        index = int(raw_index)
        highest = index if highest is None else max(highest, index)
        if index in rows_by_index:
            continue  # still rendered from an earlier pass
        # Read each cell's text once: every .text access is a browser round-trip.
        texts = (cell.text for cell in row.find_elements(By.CSS_SELECTOR, CELL_SELECTOR))
        # NOTE(review): cells with empty text are dropped (matching how empty
        # headers are dropped below), so a row containing a blank value ends
        # up shorter than the header - confirm the table has no blanks.
        row_data = [text for text in texts if text]
        if row_data:
            rows_by_index[index] = row_data
    return highest


def read_batch(table, rows_by_index: dict):
    """Read the current batch, retrying if rows go stale while being read.

    Re-reading is safe because rows are keyed by row-index. The last
    ``StaleElementReferenceException`` is re-raised once the retries are used up.
    """
    for attempt in range(STALE_RETRIES):
        try:
            return _read_batch_once(table, rows_by_index)
        except StaleElementReferenceException:
            if attempt == STALE_RETRIES - 1:
                raise
    return None


def main() -> None:
    """Open the report, scroll through the second table and print it as a DataFrame."""
    # Initialize the WebDriver and navigate to the URL
    driver = webdriver.Chrome()
    try:
        driver.get(REPORT_URL)
        wait = WebDriverWait(driver, 20)
        time.sleep(PAGE_SETTLE_SECONDS)

        # Second table container on the page
        table = driver.find_elements(By.CSS_SELECTOR, 'div.tableExContainer')[1]

        # Locate the scroll bars
        scrolls = driver.find_elements(By.CSS_SELECTOR, 'div.scroll-bar-part-bar')
        h_scroll = scrolls[2]  # Horizontal scroll bar
        v_scroll = scrolls[3]  # Vertical scroll bar

        # Perform initial horizontal scrolling if necessary
        drag(driver, h_scroll, HORIZONTAL_OFFSET_PX, 0)
        time.sleep(1)  # Allow for loading

        rows_by_index = {}          # row-index -> list of cell texts
        previous_last_index = None  # None so a first batch ending at index 0 is not mistaken for "no progress"

        # Begin vertical scrolling
        while True:
            try:
                # Wait until the rows on the page are visible
                wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ROW_SELECTOR)))

                # Rows are read from the target table only, so rows of other
                # visuals cannot be mixed in or collide on row-index.
                last_index = read_batch(table, rows_by_index)
                if last_index is None or last_index == previous_last_index:
                    print("No new rows loaded after scroll.")
                    break
                previous_last_index = last_index

                # Scroll down
                drag(driver, v_scroll, 0, SCROLL_STEP_PX)
                time.sleep(SCROLL_SETTLE_SECONDS)  # Allow time for new rows to load
            except TimeoutException as e:
                print(e)
                break
            except MoveTargetOutOfBoundsException:
                print("Reached the end of the table or cannot scroll further.")
                break
            except WebDriverException as e:
                # Best effort: keep what was collected so far on browser errors.
                print(f"Encountered an exception: {e}")
                break

        # Create DataFrame from collected data, ordered by row-index
        header_texts = (
            header.text.strip()
            for header in table.find_elements(By.CSS_SELECTOR, HEADER_SELECTOR)
        )
        columns = [text for text in header_texts if text]
        ordered_rows = [rows_by_index[index] for index in sorted(rows_by_index)]
        df = pd.DataFrame(ordered_rows, columns=columns)
    finally:
        # Always close the browser, even when the scrape fails half-way.
        driver.quit()
    print(df)


if __name__ == "__main__":
    main()