抓取动态表后丢失数据

问题描述 投票:0回答:1

我正在尝试抓取 Power BI 仪表板上的一个动态表,具体来说是页面上的第 2 个表。我遇到了两个问题。主要问题是垂直滚动抓取时没有收集到所有行:大约 2000 行中我只能抓到 248 行左右。第二个问题是,水平滚动之后只能收集到 9 列,因为其余列已被移出 DOM。对此我打算把代码运行两次,其中第二次不做水平滚动。

如果有人能帮我弄清楚如何抓取所有行,我将不胜感激。我看到表格确实滚动到了底部,所以问题似乎出在如何等待数据加载完成之后再进行抓取。

 # Initialize the driver, replace with your own setup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    MoveTargetOutOfBoundsException,
)
import pandas as pd
import time

# Initialize the WebDriver and navigate to the URL
driver = webdriver.Chrome()
driver.get("https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection")
wait = WebDriverWait(driver, 20)

# Select the second table container found on the page
table = driver.find_elements(By.CSS_SELECTOR, 'div.tableExContainer')[1]

# Locate the scroll bars
scrolls = driver.find_elements(By.CSS_SELECTOR, 'div.scroll-bar-part-bar')
h_scroll = scrolls[2]  # Horizontal scroll bar
v_scroll = scrolls[3]  # Vertical scroll bar

# Perform initial horizontal scrolling if necessary
ActionChains(driver).move_to_element(h_scroll).click_and_hold() \
    .move_by_offset(500, 0).release().perform()
time.sleep(1)  # Allow for loading

all_row_data = []  # List to store all rows data before creating DataFrame

# Begin vertical scrolling
previous_row_count = 0
flag = True
while flag:
    try:
        # Wait for rows to be visible and get the current count
        current_rows = wait.until(
            EC.visibility_of_all_elements_located(
                (By.CSS_SELECTOR, "div[role='row']")
            )
        )
        # NOTE(review): the count is compared here before the header row is
        # popped but stored below after it, so the two values differ by one
        # whenever the number of rows in the DOM is unchanged.
        if len(current_rows) == previous_row_count:
            raise TimeoutException("No new rows loaded after scroll.")

        # Skip the header row and capture data
        current_rows.pop(0)  # Assuming the first row is the header
        for row in current_rows:
            cells = row.find_elements(By.CSS_SELECTOR, "div[role='gridcell']")
            row_data = [cell.text for cell in cells if cell.text]
            if row_data:
                all_row_data.append(row_data)

        # Update row count after processing
        previous_row_count = len(current_rows)

        # Scroll down by dragging the vertical scroll bar
        ActionChains(driver).move_to_element(v_scroll).click_and_hold() \
            .move_by_offset(0, 100).release().perform()
        time.sleep(5)  # Allow time for new rows to load

    except TimeoutException as e:
        print(e)
        flag = False  # Exit loop if no new rows load
    except MoveTargetOutOfBoundsException:
        print("Reached the end of the table or cannot scroll further.")
        flag = False
    except Exception as e:
        print(f"Encountered an exception: {e}")
        flag = False

# Create DataFrame from collected data, using the non-empty header texts
# of the selected table as column names
df = pd.DataFrame(
    all_row_data,
    columns=[
        header.text.strip()
        for header in table.find_elements(
            By.CSS_SELECTOR, "div[role='columnheader']"
        )
        if header.text.strip()
    ],
)

driver.quit()
print(df)
selenium-webdriver web-scraping powerbi
1个回答
0
投票

以下代码可以解决该问题:它先等待 30 秒让报表加载完成,并且不再比较 DOM 中的行数,而是读取每一行的 row-index 属性来判断滚动之后是否加载了新的行。

# Scrape the second table of a public Power BI report by dragging its
# vertical scroll bar and collecting the rows rendered after each drag.
# Initialize the driver, replace with your own setup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, MoveTargetOutOfBoundsException
import pandas as pd
import time

# Vertical distance, in pixels, that the scroll bar is dragged on each pass.
# NOTE(review): a scroll-bar drag presumably moves the table proportionally
# to its length; if rows are skipped between passes, reduce this value.
SCROLL_STEP_PX = 100

# Initialize the WebDriver and navigate to the URL
driver = webdriver.Chrome()
try:
    driver.get("https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection")
    wait = WebDriverWait(driver, 20)
    time.sleep(30)  # Give the report time to render before locating elements

    # Select the second table container found on the page
    table = driver.find_elements(By.CSS_SELECTOR, 'div.tableExContainer')[1]

    # Locate the scroll bars
    scrolls = driver.find_elements(By.CSS_SELECTOR, 'div.scroll-bar-part-bar')
    h_scroll = scrolls[2]  # Horizontal scroll bar
    v_scroll = scrolls[3]  # Vertical scroll bar

    # Perform initial horizontal scrolling if necessary
    ActionChains(driver).move_to_element(h_scroll).click_and_hold().move_by_offset(500, 0).release().perform()
    time.sleep(1)  # Allow for loading

    # Row data keyed by the row's 'row-index' attribute. Consecutive scroll
    # positions show overlapping rows; keying by index stores each row once.
    rows_by_index = {}
    previous_last_index = None

    # Begin vertical scrolling
    while True:
        try:
            # Wait until the selected table exposes at least one row. Rows are
            # looked up inside `table` so other tables on the page are ignored.
            current_rows = wait.until(
                lambda _driver: table.find_elements(By.CSS_SELECTOR, "div[role='row']")
            )

            # Keep only rows that carry a 'row-index' attribute; rows without
            # one cannot be tracked and are skipped instead of aborting the run.
            indexed_rows = []
            for row in current_rows:
                raw_index = row.get_attribute('row-index')
                if raw_index is not None:
                    indexed_rows.append((int(raw_index), row))

            if not indexed_rows:
                print("No new rows loaded after scroll.")
                break

            # The highest visible index not advancing means the last drag
            # brought no further rows into view.
            last_index = max(index for index, _row in indexed_rows)
            if last_index == previous_last_index:
                print("No new rows loaded after scroll.")
                break

            for index, row in indexed_rows:
                if index in rows_by_index:
                    continue  # Already captured on an earlier pass
                cells = row.find_elements(By.CSS_SELECTOR, "div[role='gridcell']")
                row_data = [cell.text for cell in cells if cell.text]
                # Rows without grid-cell text (e.g. a header row made of
                # column-header cells) are not stored.
                if row_data:
                    rows_by_index[index] = row_data

            previous_last_index = last_index

            # Scroll down
            ActionChains(driver).move_to_element(v_scroll).click_and_hold().move_by_offset(0, SCROLL_STEP_PX).release().perform()
            time.sleep(5)  # Allow time for new rows to load

        except TimeoutException as e:
            print(e)
            break  # Exit loop if no rows become available
        except MoveTargetOutOfBoundsException:
            print("Reached the end of the table or cannot scroll further.")
            break
        except Exception as e:
            # Best effort: keep whatever was collected so far.
            print(f"Encountered an exception: {e}")
            break

    all_row_data = [rows_by_index[index] for index in sorted(rows_by_index)]

    # Create DataFrame from collected data, using the non-empty header texts
    # of the selected table as column names
    headers = [
        header.text.strip()
        for header in table.find_elements(By.CSS_SELECTOR, "div[role='columnheader']")
        if header.text.strip()
    ]
    df = pd.DataFrame(all_row_data, columns=headers)
finally:
    # Always close the browser, even when scraping fails part-way.
    driver.quit()

print(df)
© www.soinside.com 2019 - 2024. All rights reserved.