I'm looking for help extracting data from a website with multiple tabs using Python and Selenium, and saving it in .csv format. The website in question is: https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details.

The page has five different tabs, but my focus is on extracting data from the first three.

In addition, there are two more tabs, one for "ALL" and one for the date. I need to retrieve the data for every combination of the first three tabs while keeping "ALL" selected and the date set to the current date.

I attempted this with Selenium, but given my limited experience with the tool I was unable to achieve the expected result, so I'm looking for guidance on how to proceed.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import random

def wait_for_element(driver, by, value, timeout=10):
    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, value)))

def scrape_and_save(driver, end_type, equity_type, cap_type, all_type, filename):
    # Select options from the dropdowns
    Select(wait_for_element(driver, By.ID, "end-type")).select_by_value(end_type)
    time.sleep(random.uniform(1, 2))
    Select(wait_for_element(driver, By.ID, "equity-type")).select_by_value(equity_type)
    time.sleep(random.uniform(1, 2))
    Select(wait_for_element(driver, By.ID, "cap-type")).select_by_value(cap_type)
    time.sleep(random.uniform(1, 2))
    Select(wait_for_element(driver, By.ID, "all-type")).select_by_value(all_type)
    time.sleep(random.uniform(1, 2))

    # Click the "Go" button
    wait_for_element(driver, By.ID, "go-button").click()

    # Wait for the results table to load
    table = wait_for_element(driver, By.ID, "fund-table", timeout=15)

    # Extract the table data
    df = pd.read_html(table.get_attribute('outerHTML'))[0]

    # Save to CSV
    df.to_csv(filename, index=False)
    print(f"Saved data to {filename}")

# Set up the Selenium WebDriver
driver = webdriver.Chrome()  # Make sure chromedriver is installed and on PATH
driver.get("https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details")

# Wait for the initial page load
wait_for_element(driver, By.ID, "end-type", timeout=30)
print("Page loaded successfully")

# Define the options for each dropdown
end_types = ["1", "2"]  # Open-ended, Close-ended
equity_types = ["1", "2", "3", "4", "5", "6"]  # Replace with actual values
cap_types = ["1", "2", "3", "4"]  # Replace with actual values
all_types = ["1", "2", "3", "4"]  # Replace with actual values

# Iterate through all combinations
for end in end_types:
    for equity in equity_types:
        for cap in cap_types:
            for all_type in all_types:
                filename = f"fund_data_{end}_{equity}_{cap}_{all_type}.csv"
                try:
                    scrape_and_save(driver, end, equity, cap, all_type, filename)
                    time.sleep(random.uniform(3, 5))  # Random wait between 3 and 5 seconds
                except Exception as e:
                    print(f"Error scraping combination {end}_{equity}_{cap}_{all_type}: {str(e)}")

driver.quit()
Your target application loads the table page in an iframe from https://www.valueresearchonline.com/amfi, so we can pull this data straight from those iframe pages with bs4, which is much faster than Selenium in this case.
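If you want to confirm the iframe source yourself, here is a minimal sketch that prints the src of every iframe on the AMFI page (this assumes the iframe appears in the static HTML rather than being injected by JavaScript):

import requests
from bs4 import BeautifulSoup

# Fetch the AMFI page and print each embedded iframe's src; one of them
# should point at valueresearchonline.com/amfi
resp = requests.get(
    "https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details",
    headers={"User-Agent": "Mozilla/5.0"},
).text
soup = BeautifulSoup(resp, "lxml")
for frame in soup.find_all("iframe"):
    print(frame.get("src"))

Here is the sample code with bs4: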
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetchValue(primary_category, category, file):
    fund_name = []
    fund_benchmark = []
    riskometer_scheme = []
    riskometer_benchmark = []
    latest_nav_regular = []
    latest_nav_direct = []
    five_year_return_regular = []
    five_year_return_direct = []
    five_year_return_benchmark = []
    daily_aum_cr = []

    url = f'https://www.valueresearchonline.com/amfi/fund-performance-data/?end-type=1&primary-category={primary_category}&category={category}&amc=ALL&nav-date=25-Oct-2024'
    resp = requests.get(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0",
            "Referer": "https://www.valueresearchonline.com/amfi/fund-performance",
        },
    ).text
    soup = BeautifulSoup(resp, 'lxml')

    # Walk every table row; rows without the expected cells (headers,
    # separators) raise an exception and are skipped
    for row in soup.findAll('tr'):
        try:
            tds = row.findAll('td')
            fund_name.append(tds[0].text.strip())
            fund_benchmark.append(tds[1].text)
            riskometer_scheme.append(tds[2].text)
            riskometer_benchmark.append(tds[3].text)
            latest_nav_regular.append(tds[4].text.strip())
            latest_nav_direct.append(tds[5].text.strip())
            five_year_return_regular.append(tds[6].text.strip())
            five_year_return_direct.append(tds[7].text.strip())
            five_year_return_benchmark.append(tds[8].text.strip())
            daily_aum_cr.append(tds[10].text.strip())
        except Exception:
            pass

    df = pd.DataFrame({
        "Scheme": fund_name,
        "Benchmark": fund_benchmark,
        "Riskometer_Scheme": riskometer_scheme,
        "Riskometer_Benchmark": riskometer_benchmark,
        "Latest_Nav_Regular": latest_nav_regular,
        "Latest_Nav_Direct": latest_nav_direct,
        "Five_Year_Return_Regular": five_year_return_regular,
        "Five_Year_Return_Direct": five_year_return_direct,
        "Five_Year_Return_Benchmark": five_year_return_benchmark,
        "Daily_AUM": daily_aum_cr,
    })
    df.to_csv(file, index=False)

# Fetch the landing page to read the category <select> options
url = "https://www.valueresearchonline.com/amfi/fund-performance"
resp = requests.get(
    url,
    headers={
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0",
        "Referer": "https://www.amfiindia.com/",
    },
).text
soup = BeautifulSoup(resp, 'lxml')
category_list = soup.find('select', id='category')

for i in range(40):  # the table has 40 category combinations for tabs 2-3
    category = category_list.findAll('option')[i]['value']
    primary_category = category.split('_')[0]
    fetchValue(primary_category, category, f'{category}.csv')
I tried to keep my code as basic as possible to make it easier to follow.
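Note that the sample code hardcodes nav-date=25-Oct-2024, while the question asks for the current date. You can build that parameter at runtime; a minimal sketch (SEQ / SEQ_LC are illustrative placeholders for a real primary-category / category pair, and %b assumes an English locale):

from datetime import date

# Today's date in the DD-Mon-YYYY form the query string expects,
# e.g. "25-Oct-2024"
nav_date = date.today().strftime("%d-%b-%Y")

# SEQ / SEQ_LC stand in for a primary-category / category pair read
# from the <select> options on the landing page
url = (
    "https://www.valueresearchonline.com/amfi/fund-performance-data/"
    f"?end-type=1&primary-category=SEQ&category=SEQ_LC&amc=ALL&nav-date={nav_date}"
)
print(url)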