如何使用 Python 从 PDF 中删除文本层

问题描述 投票:0回答:1

我需要从 PDF 文件中删除所有文本信息。也就是说,我想得到的文件应该像扫描件一样:只有封装在 PDF 中的图像,没有可以复制或选中的文本。目前我使用的是 Ghostscript 命令:

import os
import subprocess
...
# Run Ghostscript with an explicit argument list instead of a shell command
# string, so paths containing spaces or shell metacharacters reach `gs`
# unchanged and cannot be interpreted by the shell.
# check=False keeps the original behaviour of ignoring a non-zero exit status.
# NOTE: unlike os.system, this raises FileNotFoundError if `gs` is not on PATH.
subprocess.run(
    ["gs", "-o", output_path, "-sDEVICE=pdfwrite", "-dFILTERTEXT", input_path],
    check=False,
)

不幸的是,对于某些文档,它不仅删除了文本层,还删除了字符的实际像素!有时页面上甚至完全看不到文字的图像,这不是我想要的结果。

是否有使用 Python 或可通过 pip 安装的工具的稳定且快速的解决方案?如果能用 PyMuPDF(fitz)解决这个问题就更好了,但我没有找到任何相关信息。

python pdf text ghostscript pymupdf
1个回答
0
投票

这是对我有用的代码。您需要先安装 PyPDF2:

pip install PyPDF2

import PyPDF2
import sys
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import DictionaryObject, NameObject, ArrayObject, NumberObject, DecodedStreamObject, IndirectObject

def remove_text_from_page(page, writer) -> PyPDF2._page.PageObject:
    """Build a copy of *page* whose content streams have text objects removed.

    A blank page with the same media-box width and height is created, each
    content stream of *page* is filtered through ``remove_text_operations``
    (only when it contains both ``BT`` and ``ET``) and registered with
    *writer*, and finally every page entry except ``/Contents`` is copied
    across unchanged.

    Args:
        page: The source page (presumably a PyPDF2 ``PageObject`` taken from
            a ``PdfReader`` -- confirm against the caller).
        writer: The ``PdfWriter`` that will own the new stream objects.

    Returns:
        The newly created page object.
    """
    stripped_page = PyPDF2._page.PageObject.create_blank_page(
        width=page.mediabox.width,
        height=page.mediabox.height,
    )

    original_contents = page.get_contents()
    if original_contents:
        # A page may carry a single stream or a list of streams; treat both
        # cases uniformly as a list.
        if isinstance(original_contents, list):
            stream_refs = original_contents
        else:
            stream_refs = [original_contents]

        for stream_ref in stream_refs:
            # Resolve a possible indirect reference before reading the bytes.
            raw = stream_ref.get_object().get_data()
            # Only streams that contain both text delimiters are filtered.
            if b'BT' in raw and b'ET' in raw:
                raw = remove_text_operations(raw)

            filtered_stream = DecodedStreamObject()
            filtered_stream.set_data(raw)
            # Register the stream with the writer so the page can refer to
            # it through the reference the writer hands back.
            filtered_ref = writer._add_object(filtered_stream)

            # Lazily create the /Contents array on first use.
            if stripped_page.get_contents() is None:
                stripped_page[NameObject("/Contents")] = ArrayObject()
            stripped_page.get_contents().append(filtered_ref)

    # Everything other than the content streams (resources, boxes, ...) is
    # taken over from the source page as-is.
    contents_key = NameObject('/Contents')
    for entry in page:
        if entry == contents_key:
            continue
        stripped_page[entry] = page[entry]

    return stripped_page

def remove_text_operations(data: bytes) -> bytes:
    """Strip every ``BT`` ... ``ET`` text object from a PDF content stream.

    The stream is scanned token by token rather than line by line, so that:

    * ``BT``/``ET`` are only recognised as standalone operators, not as
      substrings of names (``/XBT``), string literals (``(GET ET)``),
      comments or inline-image data;
    * graphics operators that share a line with a text object are kept
      (content streams are frequently written on a single line).

    Args:
        data: The decoded bytes of a content stream.

    Returns:
        The stream with all text objects removed.  Bytes outside text
        objects are returned unchanged.  If a ``BT`` is never closed by an
        ``ET``, everything from that ``BT`` to the end is dropped.
    """
    whitespace = b"\x00\t\n\x0c\r "
    delimiters = b"()<>[]{}/%"
    size = len(data)

    kept = []              # slices of ``data`` lying outside text objects
    keep_from = 0          # start of the slice currently being kept
    inside_text = False    # between a BT and its matching ET
    in_image_dict = False  # between BI and ID of an inline image
    pos = 0

    while pos < size:
        byte = data[pos]

        if byte == 0x28:  # "(" opens a literal string; parentheses may nest
            depth = 1
            pos += 1
            while pos < size and depth:
                ch = data[pos]
                if ch == 0x5C:  # a backslash escapes the following byte
                    pos += 2
                    continue
                if ch == 0x28:
                    depth += 1
                elif ch == 0x29:
                    depth -= 1
                pos += 1
            continue

        if byte == 0x25:  # "%" starts a comment running to end of line
            while pos < size and data[pos] not in b"\r\n":
                pos += 1
            continue

        if byte == 0x2F:  # "/" starts a name; its characters are not operators
            pos += 1
            while (pos < size and data[pos] not in whitespace
                   and data[pos] not in delimiters):
                pos += 1
            continue

        if byte in whitespace or byte in delimiters:
            pos += 1
            continue

        # A regular token: number, keyword or operator.
        end = pos
        while (end < size and data[end] not in whitespace
               and data[end] not in delimiters):
            end += 1
        token = data[pos:end]

        if token == b"BT":
            if not inside_text:
                kept.append(data[keep_from:pos])
                inside_text = True
        elif token == b"ET":
            if inside_text:
                inside_text = False
                keep_from = end
        elif token == b"BI":
            in_image_dict = True
        elif token == b"ID" and in_image_dict:
            # Inline image data is arbitrary binary; jump to the closing EI
            # operator (whitespace before, whitespace or end of data after)
            # so the payload is never tokenised.
            in_image_dict = False
            search = end
            end = size
            while True:
                hit = data.find(b"EI", search)
                if hit == -1:
                    break
                if data[hit - 1] in whitespace and (
                        hit + 2 >= size or data[hit + 2] in whitespace):
                    end = hit + 2
                    break
                search = hit + 2

        pos = end

    if not inside_text:
        kept.append(data[keep_from:])
    return b"".join(kept)

def remove_text_from_pdf(input_pdf_path: str, output_pdf_path: str):
    """Write a copy of a PDF in which every page has been text-stripped.

    Each page of the input document is passed through
    ``remove_text_from_page`` and appended to a fresh writer, which is then
    serialised to *output_pdf_path*.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the resulting PDF is written to (opened in
            binary write mode, so an existing file is overwritten).
    """
    source = PdfReader(input_pdf_path)
    target = PdfWriter()

    # Pages are processed in document order; the writer is handed to the
    # page helper because new stream objects are registered with it.
    for original_page in source.pages:
        target.add_page(remove_text_from_page(original_page, target))

    with open(output_pdf_path, 'wb') as sink:
        target.write(sink)

# Example usage: python script.py INPUT_PDF OUTPUT_PDF
if __name__ == "__main__":
    # Exit with a usage message rather than an IndexError traceback when the
    # two required paths are missing.  Extra arguments are still ignored.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} INPUT_PDF OUTPUT_PDF")
    input_pdf_path = sys.argv[1]
    output_pdf_path = sys.argv[2]
    remove_text_from_pdf(input_pdf_path, output_pdf_path)
© www.soinside.com 2019 - 2024. All rights reserved.