使用 pdfplumber 提取表的问题 - 大表中的空字段

问题描述 投票:0回答:1

下面的脚本使用 pdfplumber 提取 PDF 中的表格。该表格总是出现在“Quadro de Definições”一词第二次出现之后,并且长达 12 页。该脚本可以运行,但某些字段缺少内容,例如:“Custodiante”、“Fundo”、“Gestora”、“Escriturador”...

这里是原始 PDF(内容为巴西葡萄牙语)。这是我生成的 CSV。

请问,有人知道我可以使用什么策略来捕获所有信息吗?

import logging
import pdfplumber
import pandas as pd
import os
import re

# Emit INFO-level (and above) records prefixed with a timestamp and the level
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Location of the PDF this script processes
PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'


def extract_tables_from_pdf(pdf_path):
    """
    Collect the tables that follow the second "Quadro de Definições" page.

    Pages are scanned in order; the second page whose text contains the
    heading becomes the first page of the extraction window, which then
    extends 12 pages further (clamped to the end of the document).

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: Every table found in the window, in page order, or None when
            the file is missing, the heading is not found on two pages, or
            any error is raised while reading the PDF.
    """
    if not os.path.exists(pdf_path):
        logging.error(f"The file {pdf_path} does not exist.")
        return None

    try:
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            logging.info(f"PDF loaded successfully. Total pages: {total_pages}")

            # Walk the pages (1-based) until the heading has been seen on two of them
            hits = 0
            second_hit_page = None
            for number, page in enumerate(pdf.pages, start=1):
                if "Quadro de Definições" not in page.extract_text():
                    continue
                hits += 1
                if hits == 2:
                    second_hit_page = number
                    break

            if second_hit_page is None:
                logging.warning("Could not find two occurrences of 'Quadro de Definições'.")
                return None

            logging.info(f"The second 'Quadro de Definições' appears on page: {second_hit_page}")

            # Window: the heading page plus the 12 pages after it
            first_page = second_hit_page
            last_page = min(first_page + 12, total_pages)
            logging.info(f"Extracting tables from pages {first_page} to {last_page}")

            # Flatten the per-page results; pages without tables contribute nothing
            collected = [
                found
                for page in pdf.pages[first_page - 1:last_page]
                for found in (page.extract_tables() or [])
            ]

            logging.info(f"Number of tables extracted: {len(collected)}")
            return collected

    except Exception as error:
        logging.error(f"An error occurred while processing the PDF: {str(error)}")
        return None


def safe_strip(cell):
    """
    Return the cell's text without surrounding whitespace.

    Args:
        cell: Raw cell content; may be None.

    Returns:
        str: ``str(cell).strip()``, or an empty string when cell is None.
    """
    return '' if cell is None else str(cell).strip()


def process_and_combine_tables(tables):
    """
    Flatten the extracted tables into one Term/Definition DataFrame.

    Args:
        tables (list): Tables extracted from the PDF (table -> row -> cell).

    Returns:
        pandas.DataFrame: One row per distinct, non-empty term, indexed by
            'Term', with a single 'Definition' column.
    """
    two_column_rows = []

    for table in tables:
        for row in table:
            # Ignore rows that are empty or hold only blank cells
            if not (row and any(safe_strip(cell) != '' for cell in row)):
                continue

            if len(row) == 1:
                # Single-cell row: split term from definition at the first run of 2+ whitespace
                pieces = re.split(r'\s{2,}', safe_strip(row[0]), maxsplit=1)
                if len(pieces) != 2:
                    pieces = [pieces[0], '']
                two_column_rows.append(pieces)
            else:
                # Wider row: only the first two cells are kept
                two_column_rows.append([safe_strip(cell) for cell in row[:2]])

    frame = pd.DataFrame(two_column_rows, columns=['Term', 'Definition'])

    # Keep rows that have a term, keep the first row per term, then index by term
    has_term = frame['Term'] != ''
    return (
        frame[has_term]
        .drop_duplicates(subset='Term', keep='first')
        .set_index('Term')
    )


if __name__ == "__main__":
    logging.info(f"Starting table extraction from file: {PDF_PATH}")

    extracted_tables = extract_tables_from_pdf(PDF_PATH)

    if not extracted_tables:
        logging.warning("Could not extract tables from the PDF.")
    else:
        try:
            df_combined = process_and_combine_tables(extracted_tables)
            print(df_combined)

            # Persist the combined table next to the script
            csv_path = 'combined definitions framework.csv'
            df_combined.to_csv(csv_path)
            logging.info(f"Combined DataFrame saved to '{csv_path}'")
        except Exception as error:
            logging.error(f"Error processing and combining tables: {str(error)}")

    logging.info("Table extraction and combination process completed.")
    
regex pdf extract pdfplumber
1个回答
0
投票
解决方案是调整 pdfplumber 的表格提取设置(`table_settings`),从而改进表格的提取效果。

这是脚本的修订版本:

import logging
import os

import pandas as pd
import pdfplumber

# Configure logging to display informational messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set the fixed path for the PDF file
PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'

# Heading whose second page-level occurrence marks the start of the table
SECTION_TITLE = "Quadro de Definições"

# How many pages after the heading page are scanned for tables
PAGES_AFTER_TITLE = 12

# Default settings for table extraction, kept here for reference, from:
# https://pypi.org/project/pdfplumber/#Table-extraction%20settings
# settings = {
#     "vertical_strategy": "lines",
#     "horizontal_strategy": "lines",
#     "explicit_vertical_lines": [],
#     "explicit_horizontal_lines": [],
#     "snap_tolerance": 3,
#     "snap_x_tolerance": 3,
#     "snap_y_tolerance": 3,
#     "join_tolerance": 3,
#     "join_x_tolerance": 3,
#     "join_y_tolerance": 3,
#     "edge_min_length": 3,
#     "min_words_vertical": 3,
#     "min_words_horizontal": 3,
#     "intersection_tolerance": 3,
#     "intersection_x_tolerance": 3,
#     "intersection_y_tolerance": 3,
#     "text_tolerance": 3,
#     "text_x_tolerance": 3,
#     "text_y_tolerance": 3
# }

# Custom settings for table extraction
settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines",
    "snap_x_tolerance": 20,  # impacts number of columns
    "snap_y_tolerance": 2.5,  # impacts rows alignment and number of rows
    "intersection_x_tolerance": 1,  # impacts row sep
}


def _find_page_of_occurrence(pages, text, occurrence):
    """Return the 1-based number of the `occurrence`-th page containing `text`, or None.

    Counting is per page: several hits on one page count once.
    """
    hits = 0
    for page_num, page in enumerate(pages, start=1):
        # Guard against a None result so the `in` test cannot raise TypeError.
        # NOTE(review): some pdfplumber releases appear to return None for
        # pages without text - confirm against the installed version.
        if text in (page.extract_text() or ""):
            hits += 1
            if hits == occurrence:
                return page_num
    return None


def extract_tables_from_pdf(pdf_path):
    """
    Extract tables from the specified PDF file.

    The extraction window starts on the second page that contains
    "Quadro de Definições" and covers that page plus the following
    12 pages (clamped to the end of the document). Tables are extracted
    with the module-level custom `settings`.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: A list of extracted tables (table -> row -> cell), or None if
            the file does not exist, the heading is not found on two pages,
            or extraction fails.
    """
    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        logging.error(f"The file {pdf_path} does not exist.")
        return None

    try:
        # Open the PDF file using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            logging.info(f"PDF loaded successfully. Total pages: {total_pages}")

            # Find the second occurrence of the section heading
            start_page = _find_page_of_occurrence(pdf.pages, SECTION_TITLE, 2)
            if start_page is None:
                logging.warning(f"Could not find two occurrences of '{SECTION_TITLE}'.")
                return None

            logging.info(f"The second '{SECTION_TITLE}' appears on page: {start_page}")

            # Heading page plus the pages after it; page numbers are 1-based, inclusive
            end_page = min(start_page + PAGES_AFTER_TITLE, total_pages)
            logging.info(f"Extracting tables from pages {start_page} to {end_page}")

            # Extract tables from the specified range of pages
            tables = []
            for page in pdf.pages[start_page - 1:end_page]:
                page_tables = page.extract_tables(table_settings=settings)
                if page_tables:
                    tables.extend(page_tables)

            logging.info(f"Number of tables extracted: {len(tables)}")
            return tables

    except Exception as e:
        # Script-level boundary: report the failure and signal it with None
        logging.error(f"An error occurred while processing the PDF: {e}")
        return None


def _split_row(row):
    """Return (term, definition) strings for one raw table row.

    None cells become empty strings. The first cell is the term; any
    remaining non-empty cells are joined with a space so that a row wider
    than two columns does not lose text.
    """
    cells = ['' if cell is None else str(cell) for cell in row]
    term, *rest = cells
    return term, " ".join(filter(None, rest))


def process_and_combine_tables(tables):
    """
    Process and combine all extracted tables into a single DataFrame.

    Rows of all tables are concatenated in order. A row whose term cell is
    empty is treated as the continuation of the previous row (a definition
    split by a page break) and its text is appended to that row's definition
    on a new line. Rows without any content are skipped. The input `tables`
    are not modified.

    Args:
        tables (list): List of tables extracted from the PDF.

    Returns:
        pandas.DataFrame: A DataFrame indexed by 'Term' with a single
            'Definition' column.
    """
    merged_rows = []

    for table in tables:
        for row in table:
            if not row:
                continue  # nothing to read from an empty row

            term, definition = _split_row(row)
            term_is_blank = term.strip() == ''

            if term_is_blank and definition.strip() == '':
                # A fully blank row would only add a stray newline
                continue

            if term_is_blank and merged_rows:
                # Continuation of the previous definition (split by a page break)
                merged_rows[-1][1] += f"\n{definition}"
            else:
                # A leading continuation row has nothing to attach to and is
                # kept as its own row with an empty term
                merged_rows.append([term, definition])

    # Create a DataFrame with all processed rows, indexed by term
    df = pd.DataFrame(merged_rows, columns=['Term', 'Definition'])
    return df.set_index('Term')


def main():
    """Extract the definitions table from PDF_PATH and save it as a CSV file."""
    # Log the start of the table extraction process
    logging.info(f"Starting table extraction from file: {PDF_PATH}")

    # Extract tables from the PDF
    extracted_tables = extract_tables_from_pdf(PDF_PATH)

    if extracted_tables:
        try:
            # Process and combine the extracted tables
            df_combined = process_and_combine_tables(extracted_tables)
            print(df_combined)

            # Save the combined DataFrame to a CSV file
            csv_path = 'combined definitions framework.csv'
            df_combined.to_csv(csv_path)
            logging.info(f"Combined DataFrame saved to '{csv_path}'")
        except Exception as e:
            logging.error(f"Error processing and combining tables: {e}")
    else:
        logging.warning("Could not extract tables from the PDF.")

    # Log the completion of the extraction and combination process
    logging.info("Table extraction and combination process completed.")


if __name__ == "__main__":
    main()
    
© www.soinside.com 2019 - 2024. All rights reserved.