有人遇到过这个 KeyError(键错误)吗?我正在尝试用 pandas 读取 Excel 文件。我找到了下面这个解答,但它看起来是用俄语写的:
我目前知道的唯一解决办法是手动打开文件,然后用另一个名称另存。我希望能够下载后直接运行,而不必每天对一组 5–6 个文件手动重复这一操作 10 次以上。
这是一个完整的解决方案,可解压缩 xlsx 文件,将
SharedStrings.xml
重命名为 sharedStrings.xml
并重新压缩。
如果 xlsx 文件根本没有
sharedStrings.xml
(这正是我遇到的问题),则可以改用代码中被注释掉的函数调用,根据 xlsx 工作簿中各工作表的内容生成它。
只需注释掉这一行即可
correct_file_name(extracted_dir)
并取消注释紧随其后的两行(同时还要取消注释前面定义 shared_strings_file 的那一行,否则会出现 NameError):
# strings = extract_strings_from_xlsx(zf)
# generate_shared_strings(strings, shared_strings_file)
这样,代码就会从"重命名现有文件"改为"生成新文件"。
这是代码:
import xml.etree.ElementTree as ET
import os
import pandas as pd
import shutil
import zipfile
def extract_strings_from_xlsx(zf):
    """Collect the cell values stored in every worksheet of an xlsx archive.

    Args:
        zf: An open ``zipfile.ZipFile`` for the workbook.

    Returns:
        A list with the non-empty ``<v>`` text of each ``<c>`` element,
        in archive order, taken from members whose name starts with
        ``xl/worksheets/sheet``.
    """
    ns = '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}'
    cell_tag = ns + 'c'
    value_tag = ns + 'v'

    sheet_names = [
        member for member in zf.namelist()
        if member.startswith('xl/worksheets/sheet')
    ]

    collected = []
    for sheet_name in sheet_names:
        with zf.open(sheet_name) as handle:
            worksheet = ET.parse(handle).getroot()
        values = (cell.find(value_tag) for cell in worksheet.iter(cell_tag))
        # Cells without a <v> child, or with an empty one, contribute nothing.
        collected.extend(v.text for v in values if v is not None and v.text)
    return collected
def generate_shared_strings(strings, destination='xl/sharedStrings.xml'):
    """Write a sharedStrings.xml part containing the given strings.

    Args:
        strings: Iterable of text values; each becomes one ``<si><t>``
            entry, in iteration order.
        destination: Path of the XML file to write.
    """
    sst = ET.Element(
        'sst',
        {'xmlns': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'},
    )
    for text in strings:
        item = ET.SubElement(sst, 'si')
        ET.SubElement(item, 't').text = text
    ET.ElementTree(sst).write(destination, encoding='utf-8', xml_declaration=True)
def correct_file_name(dir):
    """Rename xl/SharedStrings.xml to xl/sharedStrings.xml inside an extracted workbook.

    The call is a no-op when the directory already holds a correctly named
    ``sharedStrings.xml`` and no ``SharedStrings.xml``, so an already valid
    workbook can pass through unchanged.

    Args:
        dir: Root directory of the extracted xlsx archive. The name shadows
            the builtin ``dir`` but is kept because it is part of the
            existing signature.

    Raises:
        FileNotFoundError: If the ``xl`` directory is missing, or if it
            contains neither spelling of the shared-strings file.
    """
    xl_dir = os.path.join(dir, 'xl')
    wrong_file_path = os.path.join(xl_dir, 'SharedStrings.xml')
    correct_file_path = os.path.join(xl_dir, 'sharedStrings.xml')

    # Compare against the real directory entries rather than os.path.exists():
    # on a case-insensitive filesystem both spellings "exist" at once.
    entries = os.listdir(xl_dir)
    if 'SharedStrings.xml' not in entries and 'sharedStrings.xml' in entries:
        return

    os.rename(wrong_file_path, correct_file_path)
def add_shared_strings(file_path, destination):
    """Repair the shared-strings part of an xlsx workbook and return the fixed file.

    The workbook is unpacked into ``<destination>/<stem>``, its
    ``xl/SharedStrings.xml`` is renamed to ``xl/sharedStrings.xml``, and the
    tree is re-zipped as ``<destination>/<stem>.xlsx``. The scratch directory
    is always removed, and the original ``file_path`` is deleted unless the
    repaired workbook was written over it.

    Credit to https://ru.stackoverflow.com/a/1329702 for the general solution.

    Args:
        file_path: Path to the broken xlsx file.
        destination: Existing directory that receives the repaired workbook.

    Returns:
        Path of the repaired ``.xlsx`` file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist or the archive has
            no shared-strings part to rename.
        zipfile.BadZipFile: If ``file_path`` is not a valid zip archive.
    """
    # basename/splitext keep dotted stems intact ("report.2024.xlsx" ->
    # "report.2024") and honour the platform's path separator.
    file_name_with_ext = os.path.basename(file_path)
    file_name = os.path.splitext(file_name_with_ext)[0]
    extracted_dir = os.path.join(destination, file_name)
    # shared_strings_file = f'{extracted_dir}/xl/sharedStrings.xml'
    zip_file = os.path.join(destination, file_name)

    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            zf.extractall(extracted_dir)
            # The commented lines below are the alternative "generate instead
            # of rename" path; they are kept on purpose as a manual switch.
            correct_file_name(extracted_dir)
            # strings = extract_strings_from_xlsx(zf)
            # generate_shared_strings(strings, shared_strings_file)
        zipped_file = shutil.make_archive(
            base_name=zip_file, format='zip', root_dir=extracted_dir
        )
    finally:
        # Never leave the scratch tree behind, even when a step above failed.
        shutil.rmtree(extracted_dir, ignore_errors=True)

    # Swap only the final extension; str.replace would also rewrite any
    # ".zip" occurring earlier in the path.
    xlsx_file = os.path.splitext(zipped_file)[0] + '.xlsx'
    # os.replace overwrites a fixed file left by a previous run on every
    # platform, whereas os.rename raises on Windows when the target exists.
    os.replace(zipped_file, xlsx_file)

    # When destination is the input's own directory the repaired workbook has
    # just replaced the input; removing file_path would delete the result.
    if not os.path.samefile(file_path, xlsx_file):
        os.remove(file_path)
    return xlsx_file
# Example usage: repair the workbook into a scratch folder, then load it.
tmp_folder = '/tmp'
broken_xlsx_file = 'DischargeReport.xlsx'
# NOTE: add_shared_strings calls os.remove() on the input path, so the broken
# file is consumed by this call; keep a copy if the original is still needed.
fixed_xlsx_file = add_shared_strings(broken_xlsx_file, tmp_folder)
# Read the repaired workbook into a DataFrame.
df = pd.read_excel(fixed_xlsx_file)