
问题描述 投票:0回答:1

我需要从 PDF 文件中删除所有文本信息。也就是说,我希望得到的文件像扫描件一样:只有封装在 PDF 中的图像,没有可以复制或选中的文本。目前我使用的是 Ghostscript 命令:

import os
import subprocess

# Run Ghostscript's pdfwrite device with -dFILTERTEXT to drop text from the PDF.
# The arguments are passed as a list (no shell involved), so paths containing
# spaces or shell metacharacters are handed to gs unchanged.
# check=False keeps the original behaviour of not raising on a non-zero exit.
subprocess.run(
    ["gs", "-o", output_path, "-sDEVICE=pdfwrite", "-dFILTERTEXT", input_path],
    check=False,
)

不幸的是,对于某些文档,它不仅删除了文本层,还把字符的实际像素也一并删除了!有时页面上甚至完全看不到任何文字图像,这不是我想要的结果。

是否有基于 Python(或可以通过 pip 安装的工具)的稳定且快速的解决方案?如果能用 PyMuPDF (fitz) 解决这个问题就太好了,但我找不到任何相关信息。

python pdf text ghostscript pymupdf


pip install PyPDF2

import PyPDF2
import sys
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import DictionaryObject, NameObject, ArrayObject, NumberObject, DecodedStreamObject, IndirectObject

def remove_text_from_page(page, writer) -> PyPDF2._page.PageObject:
    """Build a copy of *page* whose content streams have text objects removed.

    A blank page with the same media-box size is created, every content
    stream of the original page is filtered through
    ``remove_text_operations`` and registered with *writer*, and all other
    page entries (everything except ``/Contents``) are copied over.

    Args:
        page: The source page object read from a PDF.
        writer: The ``PdfWriter`` that will own the newly created streams.

    Returns:
        The new page object; the caller is responsible for adding it to
        the writer.
    """
    contents_key = NameObject("/Contents")

    # Create a new blank page object with the same size as the original page
    new_page = PyPDF2._page.PageObject.create_blank_page(
        width=page.mediabox.width,
        height=page.mediabox.height,
    )

    # Copy non-text content (images, graphics) from the original page
    content_streams = page.get_contents()
    # Compare with None explicitly: stream objects are dictionary-like, so a
    # stream without dictionary entries may evaluate as falsy even though it
    # carries data.
    if content_streams is not None:
        if not isinstance(content_streams, list):
            content_streams = [content_streams]
        for content_stream in content_streams:
            content_stream = content_stream.get_object()  # Dereference the IndirectObject
            data = content_stream.get_data()
            # Filter out text operations, which usually start with "BT" and end with "ET"
            if b'BT' in data and b'ET' in data:
                data = remove_text_operations(data)

            new_stream = DecodedStreamObject()
            # Store the filtered bytes; without this the new stream stays empty
            new_stream.set_data(data)
            # Ensure the new stream is added as an indirect object
            new_stream_indirect = writer._add_object(new_stream)

            # If the new page has no contents yet, create the contents array
            if contents_key not in new_page:
                new_page[contents_key] = ArrayObject()
            new_page[contents_key].append(new_stream_indirect)

    # Copy over all non-content entries (resources, boxes, ...) from the original page
    for key in page:
        if key != contents_key:
            new_page[key] = page[key]

    return new_page

def remove_text_operations(data: bytes) -> bytes:
    """Return *data* with every line belonging to a BT ... ET text object removed.

    The content stream is processed line by line. A line containing the
    ``BT`` operator starts a text object and a line containing ``ET`` ends
    it; both marker lines and everything between them are dropped.

    The operators are matched as whitespace-delimited tokens rather than as
    raw substrings, so resource names such as ``/ImBT1`` or string operands
    such as ``(GET)`` do not toggle the text state by accident.

    This is a line-based heuristic, not a full content-stream parser: a
    ``BT``/``ET`` token standing alone inside a string operand or inside
    inline image data would still be treated as an operator.

    Args:
        data: Raw (decoded) bytes of a page content stream.

    Returns:
        The bytes of all lines outside text objects, original line endings
        preserved.
    """
    kept = []
    inside_text = False
    for line in data.splitlines(True):
        tokens = line.split()
        if b"BT" in tokens:  # Begin Text Object
            inside_text = True
        if not inside_text:
            kept.append(line)
        if b"ET" in tokens:  # End Text Object
            inside_text = False
    # Join once at the end instead of repeated bytes concatenation
    return b"".join(kept)

def remove_text_from_pdf(input_pdf_path: str, output_pdf_path: str) -> None:
    """Write a copy of the input PDF in which each page has its text objects removed.

    Args:
        input_pdf_path: Path of the PDF file to read.
        output_pdf_path: Path of the PDF file to create (overwritten if present).
    """
    # Read the input PDF
    reader = PdfReader(input_pdf_path)
    writer = PdfWriter()

    # Process each page to remove text and add the result to the output document
    for page in reader.pages:
        new_page = remove_text_from_page(page, writer)
        writer.add_page(new_page)

    # Write the output PDF
    with open(output_pdf_path, 'wb') as output_pdf_file:
        writer.write(output_pdf_file)

# Example usage: python <script> INPUT_PDF OUTPUT_PDF
if __name__ == "__main__":
    # Fail with a readable usage message instead of an IndexError traceback
    # when the two required paths are not supplied.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} INPUT_PDF OUTPUT_PDF")
    input_pdf_path = sys.argv[1]
    output_pdf_path = sys.argv[2]
    remove_text_from_pdf(input_pdf_path, output_pdf_path)
© www.soinside.com 2019 - 2024. All rights reserved.