使用 pdfplumber 提取表的问题 - 大表中的空字段

Question

下面的脚本使用 pdfpumbler 提取 PDF 表。该表格总是出现在“Quadro de Definições”一词第二次出现之后，并且该表格长达 12 页。该脚本有效，但某些字段缺少内容，例如：“Custodiante”、“Fundo”、“Gestora”、“Escriturador”...

这里是原始PDF。它是巴西葡萄牙语这是我生成的CSV

请问，有人知道我可以使用什么策略来捕获所有信息吗？

import logging
import pdfplumber
import pandas as pd
import os
import re

# Configure logging to display informational messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set the fixed path for the PDF file
PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'

def extract_tables_from_pdf(pdf_path):
    """
    Extract tables from the specified PDF file.
    
    Args:
    pdf_path (str): Path to the PDF file.
    
    Returns:
    list: A list of extracted tables, or None if extraction fails.
    """
    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        logging.error(f"The file {pdf_path} does not exist.")
        return None

    try:
        # Open the PDF file using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            logging.info(f"PDF loaded successfully. Total pages: {len(pdf.pages)}")

            # Find the second occurrence of "Quadro de Definições"
            page_with_second_occurrence = None
            occurrences = 0
            for page_num, page in enumerate(pdf.pages, start=1):
                if "Quadro de Definições" in page.extract_text():
                    occurrences += 1
                    if occurrences == 2:
                        page_with_second_occurrence = page_num
                        break

            # Check if we found the second occurrence
            if page_with_second_occurrence is None:
                logging.warning("Could not find two occurrences of 'Quadro de Definições'.")
                return None
            
            logging.info(f"The second 'Quadro de Definições' appears on page: {page_with_second_occurrence}")

            # Define the range of pages to extract (12 pages after the second occurrence)
            start_page = page_with_second_occurrence
            end_page = min(start_page + 12, len(pdf.pages))

            logging.info(f"Extracting tables from pages {start_page} to {end_page}")

            # Extract tables from the specified range of pages
            tables = []
            for page in pdf.pages[start_page-1:end_page]:
                page_tables = page.extract_tables()
                if page_tables:
                    tables.extend(page_tables)

            logging.info(f"Number of tables extracted: {len(tables)}")

            return tables

    except Exception as e:
        logging.error(f"An error occurred while processing the PDF: {str(e)}")
        return None

def safe_strip(cell):
    """
    Safely strip whitespace from a cell, handling None values.
    
    Args:
    cell: The cell content to strip.
    
    Returns:
    str: The stripped string or an empty string if cell is None.
    """
    if cell is None:
        return ''
    return str(cell).strip()

def process_and_combine_tables(tables):
    """
    Process and combine all extracted tables into a single DataFrame.
    
    Args:
    tables (list): List of tables extracted from the PDF.
    
    Returns:
    pandas.DataFrame: A DataFrame containing all processed and combined table data.
    """
    processed_tables = []
    for table_index, table in enumerate(tables):
        # Remove empty rows
        table = [row for row in table if row and any(safe_strip(cell) != '' for cell in row)]
        
        # Process each row
        processed_rows = []
        for row in table:
            if len(row) == 1:
                # If the row has only one column, split it into two at the first double space
                content = safe_strip(row[0])
                split_row = re.split(r'\s{2,}', content, maxsplit=1)
                processed_rows.append(split_row if len(split_row) == 2 else [split_row[0], ''])
            else:
                # If the row has multiple columns, take the first two
                processed_rows.append([safe_strip(cell) for cell in row[:2]])
        
        processed_tables.extend(processed_rows)
    
    # Create a DataFrame with all processed rows
    df = pd.DataFrame(processed_tables, columns=['Term', 'Definition'])
    
    # Remove rows where the Term is empty
    df = df[df['Term'] != '']
    
    # Remove duplicates, keeping the first occurrence
    df = df.drop_duplicates(subset='Term', keep='first')
    
    # Set 'Term' as the index of the DataFrame
    df.set_index('Term', inplace=True)
    
    return df

if __name__ == "__main__":
    # Log the start of the table extraction process
    logging.info(f"Starting table extraction from file: {PDF_PATH}")
    
    # Extract tables from the PDF
    extracted_tables = extract_tables_from_pdf(PDF_PATH)
    
    if extracted_tables:
        try:
            # Process and combine the extracted tables
            df_combined = process_and_combine_tables(extracted_tables)
            print(df_combined)
            
            # Save the combined DataFrame to a CSV file
            csv_path = 'combined definitions framework.csv'
            df_combined.to_csv(csv_path)
            logging.info(f"Combined DataFrame saved to '{csv_path}'")
        except Exception as e:
            logging.error(f"Error processing and combining tables: {str(e)}")
    else:
        logging.warning("Could not extract tables from the PDF.")
    
    # Log the completion of the extraction and combination process
    logging.info("Table extraction and combination process completed.")

Answer 1

解决方案是通过调整表的设置来改进表提取。

这是脚本的修订版本：

import logging
import pdfplumber
import pandas as pd
import os

# Configure logging to display informational messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set the fixed path for the PDF file
PDF_PATH = 'data/prospectos/52670402000105-opd08122023v01-000566736.pdf'

# default settings for table extraction from: https://pypi.org/project/pdfplumber/#Table-extraction%20settings
# settings = {
#     "vertical_strategy": "lines", 
#     "horizontal_strategy": "lines",
#     "explicit_vertical_lines": [],
#     "explicit_horizontal_lines": [],
#     "snap_tolerance": 3,
#     "snap_x_tolerance": 3,
#     "snap_y_tolerance": 3,
#     "join_tolerance": 3,
#     "join_x_tolerance": 3,
#     "join_y_tolerance": 3,
#     "edge_min_length": 3,
#     "min_words_vertical": 3,
#     "min_words_horizontal": 3,
#     "intersection_tolerance": 3,
#     "intersection_x_tolerance": 3,
#     "intersection_y_tolerance": 3,
#     "text_tolerance": 3,
#     "text_x_tolerance": 3,
#     "text_y_tolerance": 3
# }

# Custom settings for table extraction 
settings = {
    "vertical_strategy": "lines", 
    "horizontal_strategy": "lines",
    "snap_x_tolerance": 20,          # impacts number of columns
    "snap_y_tolerance": 2.5,         # impacts rows alignment and number or rows
    "intersection_x_tolerance": 1    # impacts row sep
}

def extract_tables_from_pdf(pdf_path):
    """
    Extract tables from the specified PDF file.
    
    Args:
    pdf_path (str): Path to the PDF file.
    
    Returns:
    list: A list of extracted tables, or None if extraction fails.
    """
    # Check if the PDF file exists
    if not os.path.exists(pdf_path):
        logging.error(f"The file {pdf_path} does not exist.")
        return None

    try:
        # Open the PDF file using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            logging.info(f"PDF loaded successfully. Total pages: {len(pdf.pages)}")

            # Find the second occurrence of "Quadro de Definições"
            page_with_second_occurrence = None
            occurrences = 0
            for page_num, page in enumerate(pdf.pages, start=1):
                if "Quadro de Definições" in page.extract_text():
                    occurrences += 1
                    if occurrences == 2:
                        page_with_second_occurrence = page_num
                        break

            # Check if we found the second occurrence
            if page_with_second_occurrence is None:
                logging.warning("Could not find two occurrences of 'Quadro de Definições'.")
                return None
            
            logging.info(f"The second 'Quadro de Definições' appears on page: {page_with_second_occurrence}")

            # Define the range of pages to extract (12 pages after the second occurrence)
            start_page = page_with_second_occurrence
            end_page = min(start_page + 12, len(pdf.pages))

            logging.info(f"Extracting tables from pages {start_page} to {end_page}")

            # Extract tables from the specified range of pages
            tables = []
            for page in pdf.pages[start_page-1:end_page]:
                page_tables = page.extract_tables(table_settings=settings)
                if page_tables:
                    tables.extend(page_tables)

            logging.info(f"Number of tables extracted: {len(tables)}")

            return tables

    except Exception as e:
        logging.error(f"An error occurred while processing the PDF: {str(e)}")
        return None


def process_and_combine_tables(tables):
    """
    Process and combine all extracted tables into a single DataFrame.
    
    Args:
    tables (list): List of tables extracted from the PDF.
    
    Returns:
    pandas.DataFrame: A DataFrame containing all processed and combined table data.
    """
    processed_tables = []

    # regroup tables
    regr_tables = []
    for table in tables:
        for row in table:
            regr_tables.append(row)
    table = regr_tables
        
    # regroup definition splitted by page break
    for i in range(len(table)-2, -1, -1):        # iterate from last to first to avoid index offset when deleting list'elements.
        if table[i+1][0]=='':
            table[i][1]+= f"\n{table[i+1][1]}"
            del table[i+1]                        # deleting list's element

    processed_tables = table
    
    # Create a DataFrame with all processed rows
    df = pd.DataFrame(processed_tables, columns=['Term', 'Definition'])
    
    # Set 'Term' as the index of the DataFrame
    df.set_index('Term', inplace=True)
    
    return df


if __name__ == "__main__":
    # Log the start of the table extraction process
    logging.info(f"Starting table extraction from file: {PDF_PATH}")
    
    # Extract tables from the PDF
    extracted_tables = extract_tables_from_pdf(PDF_PATH)
    
    if extracted_tables:
        try:
            # Process and combine the extracted tables
            df_combined = process_and_combine_tables(extracted_tables)
            print(df_combined)
            
            # Save the combined DataFrame to a CSV file
            csv_path = 'combined definitions framework.csv'
            df_combined.to_csv(csv_path)
            logging.info(f"Combined DataFrame saved to '{csv_path}'")
        except Exception as e:
            logging.error(f"Error processing and combining tables: {str(e)}")
    else:
        logging.warning("Could not extract tables from the PDF.")
    
    # Log the completion of the extraction and combination process
    logging.info("Table extraction and combination process completed.")

使用 pdfplumber 提取表的问题 - 大表中的空字段

问题描述投票：0回答：1

1个回答

最新问题

使用 pdfplumber 提取表的问题 - 大表中的空字段

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1