python-2.7.15、pymssql-2.1.4、SQL_Server-2018、Windows 10 专业版、MS-Office-2016
import time
import csv
import pymssql
# Bulk-load rows from base.csv into the SQL Server table [Base] with a single
# executemany() call, and print the wall-clock minute:second stamps taken
# before and after the load.

# Connection parameters passed straight to pymssql.connect().
db_settings = {
    "host": "127.0.0.1",
    "port": "1433",
    "user": "sa",
    "password": "********",
    "database": "testdb",
    "charset": "utf8",
}

# One %s placeholder per target column; executemany() fills them per row.
sql = """
BEGIN
INSERT INTO Base([name], [year], [update], [status],
[timeline], [language], [pic]) VALUES (%s, %s, %s, %s, %s, %s, %s)
END
"""

conn = pymssql.connect(**db_settings)
try:
    cursor = conn.cursor()

    # `with` guarantees the CSV handle is closed even if parsing fails
    # (the bare open() used previously leaked it).
    with open('base.csv', 'r') as csv_file:
        ff = csv.reader(csv_file)
        now = time.strftime("%M:%S")
        t = []
        for row in ff:
            row = row[1:]  # drop the first CSV column
            if "year" in row:
                # Skip any row that has a field exactly equal to "year"
                # (presumably the header row -- verify against the CSV).
                continue
            # Field order follows the column list in `sql`; note that
            # [timeline] takes field 6 and [language] takes field 5.
            t.append((row[0], row[1], row[3], row[4], row[6], row[5], row[8]))

    cursor.executemany(sql, t)
    conn.commit()
    end = time.strftime("%M:%S")
finally:
    # Always release the server connection, including on failure.
    conn.close()

print(now + "," + end)
“base.csv”文件大小为 21.7 MB,30374 行。当我执行上面的代码时,需要 929 秒才能完成。这意味着每秒只有 32.7 行,太慢了。 谁能帮我看看原因?万分感谢。 :-)
我把 pymssql 中 executemany 的耗时从 30 分钟缩短到了 30 秒,做法如下。
在 SQL 中,可以用一条 INSERT 语句一次插入多行(SQL Server 的单个 VALUES 列表最多允许 1000 行,所以下面的代码按 999 行分块),如下所示:
INSERT INTO table_name
(col_name1, col_name2)
VALUES
(row1_val1, row1_val2),
(row2_val1, row2_val2) ...
(row1000_val1, row1000_val2)
我实现了插入函数,该函数获取数据块并修改查询以通过一次执行插入多个值。
def insert(query, data, chunk=999):
    """Insert *data* with multi-row ``INSERT ... VALUES`` statements.

    The single-row placeholder group that follows the VALUES keyword in
    *query* is repeated once per row, so each chunk of rows is sent with one
    ``cursor.execute()`` call instead of one statement per row.

    Args:
        query: Single-row parameterized INSERT, e.g.
            ``"INSERT INTO users (id, name) VALUES (%s, %s)"``.
        data: Sequence of row tuples, one value per placeholder.
        chunk: Maximum rows per statement. The default of 999 stays under
            the 1000-row limit SQL Server places on a VALUES list.

    Raises:
        ValueError: If *query* contains no VALUES keyword.
    """
    import re  # local import keeps this snippet self-contained

    # Split on the LAST whole-word VALUES, case-insensitively, and keep the
    # query's original case. Lower-casing the whole query (as before) changes
    # identifiers and literals, which breaks case-sensitive collations, and a
    # plain split('values') fails when the word appears in a column name.
    keyword_hits = list(re.finditer(r'\bvalues\b', query, re.IGNORECASE))
    if not keyword_hits:
        raise ValueError("query must contain a VALUES clause")
    split_at = keyword_hits[-1].end()
    insert_q = query[:split_at]
    # A trailing ';' must not be repeated between the row groups.
    values_q = query[split_at:].strip().rstrip(';').rstrip()

    # NOTE(review): the connection is not closed here because it is unclear
    # whether get_connection() returns a shared/pooled connection -- confirm.
    conn = get_connection()
    cursor = conn.cursor()
    for chunk_data in chunks(data, chunk):
        # Flatten the chunk's rows into one parameter tuple that lines up
        # with the repeated placeholder groups.
        flat_list = [item for sublist in chunk_data for item in sublist]
        chunk_query = insert_q + ' ' + ', '.join([values_q] * len(chunk_data))
        cursor.execute(chunk_query, tuple(flat_list))
    conn.commit()
chunks 函数可以这样实现(感谢这个论坛上的精彩回答):
def chunks(lst, n):
    """Yield successive slices of *lst*, each at most *n* items long.

    Args:
        lst: A sliceable sequence with a length (list, tuple, str, ...).
        n: Chunk size; must be a positive integer.

    Yields:
        Consecutive slices ``lst[i:i + n]``; the last one may be shorter.

    Raises:
        ValueError: If *n* is less than 1 (raised when iteration starts).
    """
    if n < 1:
        # range() only rejects a step of 0; a negative step would yield
        # nothing at all and silently drop every item.
        raise ValueError("chunk size n must be a positive integer")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]
使用示例
insert('INSERT INTO users (user_id, name, surname) VALUES (%s, %s, %s)',
[(1, 'Jack', 'Kcaj'), (2, 'Andrew', 'Golara')]