我不断遇到这样的错误:
引发 IllegalCharacterError(f"{value} 不能在工作表中使用。") openpyxl.utils.exceptions.IllegalCharacterError: uP j AS ▼ V 6 Q H F O Q x o 6 PK PK LnX 不能在工作表中使用。
import pandas as pd
from PyPDF2 import PdfReader
import difflib
import re
# Define the PDF file path
pdf_path = "./pdf_name.pdf"
excel_path = 'excel_name.xlsx'
# Function to extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
with open(pdf_path, "rb") as file:
pdf_reader = PdfReader(file)
text = ""
for page in pdf_reader.pages:
text += page.extract_text()
return text
# Function to read entire content of Excel file as a string
def read_excel_as_string(excel_path):
try:
with open(excel_path, "r", encoding="latin1") as file:
return file.read()
except FileNotFoundError:
print("Excel file not found.")
return ""
# Function to clean text data
def clean_text(text):
# Remove any characters that are not printable ASCII or illegal in Excel
cleaned_text = ''.join(filter(lambda x: x.isprintable() or x.isspace() or ord(x) > 31, text))
# Replace any problematic characters with a space
cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', cleaned_text)
return cleaned_text
# Function to find differences between two texts
def find_differences(text1, text2):
diff = difflib.ndiff(text1.splitlines(), text2.splitlines())
diff_data = []
for line in diff:
if line.startswith('+'):
diff_data.append(["", line[2:], "Added"])
elif line.startswith('-'):
diff_data.append([line[2:], "", "Deleted"])
elif line.startswith(' '):
diff_data.append([line[2:], line[2:], "Matched"])
return diff_data
# Extract text from Excel
excel_text = read_excel_as_string(excel_path)
# Extract text from PDF
pdf_text = extract_text_from_pdf(pdf_path)
# Clean the PDF and Excel text
pdf_text_cleaned = clean_text(pdf_text)
excel_text_cleaned = clean_text(excel_text)
# Find differences between PDF and Excel files
diff_data = find_differences(pdf_text_cleaned, excel_text_cleaned)
# Create a DataFrame to store the differences
diff_df = pd.DataFrame(diff_data, columns=["PDF", "Excel", "Status"])
# Write the DataFrame to an Excel file
diff_df.to_excel("./differences.xlsx", index=False)
您的问题可能来自于您读取非纯文本的 Excel 文件的方式。
你尝试过使用pandas吗? 尝试导入一个字符串中的所有文本,以便与 pdf 中的字符串进行比较。
def read_excel(excel_path):
try:
df = pd.read_excel(excel_path)
return '\n'.join(df.astype(str).apply(lambda x: ' '.join(x), axis=1))
except FileNotFoundError:
print("Excel file not found.")
return ""