我是 pandas / Python 的新手,写了下面这段代码,用来从工作表的特定区域提取数据。
import openpyxl as xl
import pandas as pd
# Extract selected rows of the 'Summary' worksheet from one .xlsm workbook,
# let the user look up a single value interactively, then export the
# extracted table to 'values.xlsx' in the folder the user entered.

# 1-based worksheet rows that are read from the sheet.
rows_with_data = [34, 37, 38, 39, 44, 45, 46, 47, 48, 49, 50, 54, 55, 57, 58,
                  59, 60, 62, 63, 64, 65, 66, 70, 71, 72, 76, 77, 78, 79, 80,
                  81, 82, 83, 84, 88, 89, 90, 91, 92]

# Fixed layout of the sheet (1-based openpyxl coordinates).
LABEL_COLUMN = 13           # column M: one label per extracted row
PERIOD_HEADER_ROW = 20      # row holding the period headers
PERIOD_COLUMNS = range(17, 35)  # columns Q..AH, one per period

# NOTE(review): 'path' is never used below; the folder comes from 'xpath'.
path = r'XXX'
xpath = input('XXX')
file = r'**.xlsm'
xfile = input('Change file name, current is ' + file + ' :')
# The original read this answer and then ignored it. Use the typed name;
# an empty answer keeps the default above.
if xfile.strip():
    file = xfile.strip()
sheetname = r'Summary'

# data_only=True returns the cached results of formulas, not the formulas.
wb = xl.load_workbook(filename=xpath + '\\' + file, data_only=True)
# Index access replaces the deprecated Workbook.get_sheet_by_name().
sheet = wb[sheetname]
rows = len(rows_with_data)

# Row labels, in the same order as rows_with_data.
line_items = [
    sheet.cell(row=row_number, column=LABEL_COLUMN).value
    for row_number in rows_with_data
]

# Period headers, one per data column.
period = [
    sheet.cell(row=PERIOD_HEADER_ROW, column=col).value
    for col in PERIOD_COLUMNS
]
print(line_items)

# vals[i] holds the values of rows_with_data[i] across all period columns.
vals = [
    [sheet.cell(row=row_number, column=col).value for col in PERIOD_COLUMNS]
    for row_number in rows_with_data
]

# Column-oriented mapping: 'Period' plus one entry per row label.
all_values = {'Period': period}
for label, row_values in zip(line_items, vals):
    print(label)
    all_values[label] = row_values
print(all_values)

period_review = input('Enter a period (i.e. 2002): ').strip()
item = input('Enter a line item (i.e. XXX): ').strip()
# input() always returns text, while header cells may come back as numbers,
# so compare on the string form; otherwise index() raises ValueError.
period_labels = [str(header) for header in period]
time = period_labels.index(period_review)
display_item = str(all_values[item][time])
print(item + ' for ' + period_review + " is " + display_item)

Summary_Dataframe = pd.DataFrame(all_values)
# The context manager saves and closes the file; ExcelWriter.save() was
# removed in pandas 2.0.
with pd.ExcelWriter(xpath + '\\' + 'values.xlsx') as writer:
    Summary_Dataframe.to_excel(writer, sheet_name='Sheet1')
我有一个包含 60 个 xlsm 文件的文件夹,每个文件里都有相同的工作表(摘要结果),但我很难弄清楚如何遍历整个文件夹来处理它们。另外,我还希望把代码从“提取特定行”改为“获取整个‘摘要’工作表”,将其粘贴到一个新的 Excel 文件中,并在粘贴时用源文件名(例如“Experiment_A”)为工作表命名。有什么建议吗?
我很难读懂您的代码,不太清楚您最终想要做什么,所以这只是一个建议,而不是完整的解决方案。您可以使用 os 模块
遍历文件夹中的所有文件,把每个文件读入一个数据帧(DataFrame),再把它们合并成一个大的数据帧并保存为 csv。我通常避免使用 Excel,但我猜您需要转换成 Excel 格式。在下面的例子中,我读取了一个目录中的所有 txt 文件,把它们放入一个数据帧列表,然后将合并后的大数据帧存储为 json。您也可以将其存储为 Excel / csv。
import os
import pandas as pd
def process_data(input_dir=None, output_dir=None, output_name='xyz.json',
                 field_names=('country', 'ticket_id', 'sent_date')):
    """Merge every tab-separated ``.txt`` file in a folder into one JSON file.

    Each file is read with pandas, restricted to ``field_names``, and its
    ``sent_date`` column (when selected) is truncated to the first day of
    the month. All frames are concatenated, stamped with today's date in
    ``etl_run_time`` and written as a list of JSON records.

    Args:
        input_dir: Folder to scan for ``.txt`` files. Defaults to the
            placeholder path assembled below.
        output_dir: Folder that receives the JSON file. Defaults to the
            placeholder path assembled below.
        output_name: File name of the JSON output.
        field_names: Columns to load from each file. ``sent_date`` is
            included by default because the date handling below needs it;
            the original column list omitted it, which caused a KeyError.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        ValueError: If ``input_dir`` contains no ``.txt`` files.
    """
    if input_dir is None:
        # input file path in 2 parts in case it is very long
        input_path_1 = r'\\path\to\the\folder'
        input_path_2 = r'\second\part\of\the\path'
        input_dir = input_path_1 + input_path_2
    if output_dir is None:
        output_path_1 = r'\\path\to\output'
        output_path_2 = r'\path\to\output'
        output_dir = output_path_1 + output_path_2

    columns = list(field_names)

    # Keep only real .txt files (a substring test would also match names
    # such as 'x.txt.bak'); sort so the row order of the output is stable.
    txt_files = sorted(
        name for name in os.listdir(input_dir) if name.endswith('.txt')
    )
    if not txt_files:
        # pd.concat([]) would fail with a far less helpful message.
        raise ValueError('No .txt files found in ' + str(input_dir))

    frames = []
    for file_name in txt_files:
        frame = pd.read_csv(
            os.path.join(input_dir, file_name), sep='\t', usecols=columns
        )
        if 'sent_date' in frame.columns:
            # Parse the dates, then truncate each one to the first day of
            # its month.
            frame['sent_date'] = pd.to_datetime(frame['sent_date'])
            frame['sent_date'] = (
                frame['sent_date'].dt.to_period('M').dt.to_timestamp()
            )
        frames.append(frame)
    print(txt_files)

    # SQL-like UNION ALL of the per-file frames into a single data source.
    combined = pd.concat(frames)
    # Record the day the files were processed.
    combined['etl_run_time'] = pd.to_datetime('today').strftime('%Y-%m-%d')
    combined.to_json(os.path.join(output_dir, output_name), orient='records')
    print('Data Stored as json successfully')
# Run the read / merge / export pipeline once, using the default paths
# defined inside process_data.
process_data()