我想从 AMFI 网站下载所有方案的每日 NAV(资产净值)并将其全部存储在 MongoDB 中。但使用我当前的代码,由于我试图更改数据的结构,下载所有数据并将其推送到数据库需要花费近 5 天的时间。我希望有人可以帮助我优化代码,以便更快地完成。
我知道在我的代码中,占用时间的是我试图将每个日期的每个 NAV 数据推送到数据库中。逐个。我想将它分组并将其推送到数据库中,但要做到这一切,我想我需要一台更好的笔记本电脑。因为如果我将数据存储在数组中,会占用大量空间。
请在下面找到我的代码
#https://portal.amfiindia.com/DownloadNAVHistoryReport_Po.aspx?&frmdt=14-Aug-2023&todt=16-Aug-2023
import requests
from pytz import utc
from datetime import datetime
import pymongo # Import the pymongo library for MongoDB operations
# Initialize MongoDB client and database
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/") # Replace with your MongoDB connection string
mydb = mongo_client["M_F"] # Replace with your database name
mycollection = mydb["MyNAV"] # Replace with your collection name
def convert_date_to_utc_datetime(date_string):
date_format = "%d-%b-%Y"
date_object = datetime.strptime(date_string, date_format)
return date_object.replace(tzinfo=utc)
from datetime import datetime, timedelta
def split_date_range(start_date_str, end_date_str, max_duration=90):
# Convert input date strings to datetime objects
start_date = datetime.strptime(start_date_str, "%d-%b-%Y")
end_date = datetime.strptime(end_date_str, "%d-%b-%Y")
date_ranges = []
current_date = start_date
while current_date <= end_date:
# Calculate the end of the current sub-range
sub_range_end = current_date + timedelta(days=max_duration - 1)
# Make sure the sub-range end is not greater than the end_date
if sub_range_end > end_date:
sub_range_end = end_date
# Append the current sub-range as a tuple to the list
date_ranges.append((current_date, sub_range_end))
# Move the current_date to the day after the sub-range end
current_date = sub_range_end + timedelta(days=1)
return date_ranges
def nav_data(start,end):
"""Put the date in DD-Mmm-YYYY that too in a string format"""
url = f"https://portal.amfiindia.com/DownloadNAVHistoryReport_Po.aspx?&frmdt={start}&todt={end}"
response = requests.session().get(url)
print("Got the data form connection")
data = response.text.split("\r\n")
Structure = ""
Category = ""
Sub_Category = ""
amc = ""
code = int()
name = str()
nav = float()
date = ""
inv_src = ""
dg = ""
i = 0
j = 1
for lines in data[1:]:
split = lines.split(";")
if j == len(data)-1:
break
if split[0] == "":
# To check the Scheme [Structure, Category, Sub-Category]
if data[j] == data[j+1]:
sch_cat = data[j-1].split("(")
sch_cat[-1]=sch_cat[-1][:-2].strip()
sch_cat = [i.strip() for i in sch_cat]
if "-" in sch_cat[1]:
sch_sub_cat = sch_cat[1].split("-")
sch_sub_cat = [i.strip() for i in sch_sub_cat]
sch_cat.pop(-1)
sch_cat = sch_cat+sch_sub_cat
else:
sch_sub_cat = ["",sch_cat[1]]
sch_cat.pop(-1)
sch_cat = sch_cat+sch_sub_cat
Structure = sch_cat[0]
Category = sch_cat[1]
Sub_Category = sch_cat[2]
#print(sch_cat)
# to check the AMC name
elif "Mutual Fund" in data[j+1]:
amc = data[j+1]
elif len(split)>1:
code = int(split[0].strip())
name = str(split[1].strip())
if "growth" in name.lower():
dg = "Growth"
elif "idcw" or "dividend" in name.lower():
dg = "IDCW"
else:
dg = ""
if "direct" in name.lower():
inv_src = "Direct"
elif "regular" in name.lower():
inv_src = "Regular"
else:
inv_src = ""
try:
nav = float(split[4].strip())
except:
nav = split[4].strip()
date = convert_date_to_utc_datetime(split[7].strip())
print(type(date),date)
existing_data = mycollection.find_one({"meta.Code": code})
if existing_data:
# If data with the code already exists in MongoDB, update it
mycollection.update_one({"_id": existing_data["_id"]}, {
"$push": {"data": {"date": date, "nav": nav}}})
print("Another one bites the dust")
else:
new_record = {
"meta": {
"Structure": Structure,
"Category": Category,
"Sub-Category": Sub_Category,
"AMC": amc,
"Code": code,
"Name": name,
"Source": inv_src,
"Option" : dg
},
"data": [{"date":date, "nav": nav }]
}
mycollection.insert_one(new_record)
print("Data data data")
j = j+1
return
start_date_str = "04-Apr-2023"
end_date_str = "31-Aug-2023"
max_duration = 90
date_ranges = split_date_range(start_date_str, end_date_str, max_duration)
for start, end in date_ranges:
print(f"Start Date: {start.strftime('%d-%b-%Y')}, End Date: {end.strftime('%d-%b-%Y')}")
nav_data(start.strftime('%d-%b-%Y'),end.strftime('%d-%b-%Y'))
input("press any key to confirm")
我可以推荐两件事。
使用会话对象来满足您的请求。每次您发出
GET
请求时,requests
模块都会创建一个新连接,这确实需要时间。
def nav_data(start,end, req):
url = f"https://..."
response = req.get(url)
...
with requests.Session() as req:
for start, end in date_ranges:
print(f"Start Date: {start.strftime('%d-%b-%Y')}, End Date: {end.strftime('%d-%b-%Y')}")
nav_data(start.strftime('%d-%b-%Y'),end.strftime('%d-%b-%Y'), req)
对 mongodb 使用批量插入。您说将数据存储在数组中需要大量空间,但是您测试过吗?我创建了一个包含 10000 个
new_record
的数组,它仅使用 ~85Kb。如果您能够提出请求并使用浏览器,我怀疑 85Kb 对您的笔记本电脑来说应该是一个问题。