如何在 Python 中从已填写的表单中提取 PDF 字段?

问题描述 投票:0回答:7

我正在尝试使用 Python 处理一些使用 Adobe Acrobat Reader 填写和签名的 PDF 表单。

我已经尝试过:

  • pdfminer演示:它没有转储任何填写的数据。
  • pyPdf:当我尝试使用 PdfFileReader(f) 加载文件时,它让一个 CPU 核心满负荷运行了 2 分钟,我只好放弃并终止了进程。
  • Jython 和 PDFBox:工作得很好,但启动时间太长,如果这是我唯一的选择,我会直接用 Java 编写一个外部实用程序。

我可以继续寻找库并尝试它们,但我希望有人已经为此找到了有效的解决方案。


更新:根据史蒂文的回答,我研究了pdfminer,它做得很好。

from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1, PDFObjRef

def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples.

    :param filename: Path of the PDF file to read.
    :return: A list with one entry per top-level form field, each entry
        being whatever ``load_fields`` returns for that field. An empty
        list is returned when the document catalog has no ``AcroForm``
        entry or the form has no ``Fields`` array.
    """
    # Named ``fp`` rather than ``file`` so the handle is not mistaken for
    # any other object called ``file``.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # A PDF without an interactive form has no 'AcroForm' key in its
        # catalog; report "no fields" instead of raising KeyError.
        acro_form = resolve1(doc.catalog.get('AcroForm'))
        if not acro_form:
            return []
        fields = resolve1(acro_form.get('Fields')) or []
        # Fields are resolved while the file is still open, as before.
        return [load_fields(resolve1(f)) for f in fields]

def load_fields(field):
    """Recursively load form fields.

    :param field: An already-resolved field dictionary.
    :return: For a field that has ``Kids``, a list with one entry per kid
        (built recursively). Otherwise a ``(name, value)`` tuple, where
        ``name`` is the decoded ``T`` entry (``None`` if the field has no
        ``T``) and ``value`` is the resolved ``V`` entry.
    """
    # Imported here so the function is self-contained; pdfminer is already
    # a dependency of this script.
    from pdfminer.utils import decode_text

    kids = resolve1(field.get('Kids', None))
    if kids:
        return [load_fields(resolve1(kid)) for kid in kids]

    name = field.get('T')
    if name is not None:
        # PDF text strings are either UTF-16BE with a BOM or PDFDocEncoding.
        # decode_text handles both; a hard-coded 'utf-16' decode garbles or
        # rejects plain single-byte names.
        name = decode_text(name)
    # Some field types, like signatures, need extra resolving
    return (name, resolve1(field.get('V')))

def parse_cli():
    """Parse the command line and return the resulting namespace.

    The namespace carries ``file`` (the PDF to read), ``out`` (optional
    output path, ``None`` when omitted) and ``pickle`` (``True`` when
    ``-p``/``--pickle`` was given).
    """
    cli = ArgumentParser(description='Dump the form contents of a PDF.')
    cli.add_argument(
        'file',
        metavar='pdf_form',
        help='PDF Form to dump the contents of',
    )
    cli.add_argument(
        '-o', '--out',
        metavar='FILE',
        default=None,
        help='Write output to file',
    )
    cli.add_argument(
        '-p', '--pickle',
        action='store_true',
        default=False,
        help='Format output for python consumption',
    )
    return cli.parse_args()

def main():
    """Dump the form contents of the PDF named on the command line.

    With ``--out`` the result is written to that file, otherwise it goes
    to standard output. With ``--pickle`` the form is pickled; without it
    the form is pretty-printed.
    """
    args = parse_cli()
    form = load_form(args.file)
    if args.out:
        # pickle produces bytes, so the target must be opened in binary
        # mode when pickling; pretty-printed output is text.
        mode = 'wb' if args.pickle else 'w'
        with open(args.out, mode) as outfile:
            if args.pickle:
                pickle.dump(form, outfile)
            else:
                pp = pprint.PrettyPrinter(indent=2)
                # Write to the file just opened (the original referenced
                # an unrelated name, ``file``, here).
                outfile.write(pp.pformat(form))
    else:
        if args.pickle:
            print(pickle.dumps(form))
        else:
            pp = pprint.PrettyPrinter(indent=2)
            pp.pprint(form)

# Run the command-line tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()
python forms pdf
7个回答
50
投票

你应该能够使用 pdfminer 来做到这一点,但这需要深入研究 pdfminer 的内部结构,并具备一些关于 PDF 格式的知识(当然是与表单相关的知识,但也包括 PDF 的内部结构,如“字典”和“间接对象”)。

这个示例可能会对您有所帮助(我认为它仅适用于简单的情况,没有嵌套字段等...)

import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

# Path of the PDF to inspect, taken from the first command-line argument.
filename = sys.argv[1]

# Use a context manager so the file is closed even if parsing fails; the
# fields are resolved and printed while the file is still open.
with open(filename, 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for i in fields:
        field = resolve1(i)
        # 'T' is the field's name entry and 'V' its value entry.
        name, value = field.get('T'), field.get('V')
        print('{0}: {1}'.format(name, value))

编辑:忘记提及:如果您需要提供密码,请将其传递给

doc.initialize()


20
投票

Python 3.6+:

pip install PyPDF2

# -*- coding: utf-8 -*-


def get_fields(obj, tree=None, retval=None, fileobj=None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use.

    :param obj: A reader object exposing ``trailer``, ``_check_kids`` and
        ``_build_field`` (a PyPDF2 ``PdfReader`` in the usage below).
    :param tree: The form (sub)tree to walk; looked up from the document
        catalog's ``/AcroForm`` entry when *retval* is ``None``.
    :param retval: Mapping that collected fields are added to; a new
        ``OrderedDict`` is created when ``None``.
    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field<PyPDF2.generic.Field>` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    # Imported locally: this snippet has no module-level import of
    # OrderedDict, so the bare name would otherwise raise NameError.
    from collections import OrderedDict

    field_attributes = {
        '/FT': 'Field Type',
        '/Parent': 'Parent',
        '/T': 'Field Name',
        '/TU': 'Alternate Field Name',
        '/TM': 'Mapping Name',
        '/Ff': 'Field Flags',
        '/V': 'Value',
        '/DV': 'Default Value',
    }
    if retval is None:
        retval = OrderedDict()
        catalog = obj.trailer["/Root"]
        # get the AcroForm tree
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]
    if tree is None:
        return retval

    obj._check_kids(tree, retval, fileobj)
    # If the tree carries any field attribute it is itself a field.
    if any(attr in tree for attr in field_attributes):
        obj._build_field(tree, retval, fileobj, field_attributes)

    if "/Fields" in tree:
        for f in tree["/Fields"]:
            obj._build_field(f.get_object(), retval, fileobj, field_attributes)

    return retval


def get_form_fields(infile):
    """Return the form field values of a PDF as an ordered mapping.

    :param infile: Path of the PDF file to read.
    :return: ``OrderedDict`` mapping each field name to its ``'/V'`` entry
        (``''`` when a field has no ``'/V'``). Empty when ``get_fields``
        reports that the document has no form (returns ``None``).
    """
    # Imported locally: this snippet has no module-level imports for
    # these names.
    from collections import OrderedDict
    from PyPDF2 import PdfReader

    # Keep the file open until every value has been read, then close it
    # deterministically instead of leaking the handle.
    with open(infile, 'rb') as stream:
        fields = get_fields(PdfReader(stream))
        if fields is None:
            return OrderedDict()
        return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())



if __name__ == '__main__':
    from pprint import pprint

    # Pretty-print the name -> value mapping of the sample form.
    pprint(get_form_fields('FormExample.pdf'))

15
投票

Python PyPDF2 包(pyPdf 的后继者)非常方便:

import PyPDF2
# Open the PDF form ('form.pdf' is a placeholder path).
f = PyPDF2.PdfReader('form.pdf')
# Ask the reader for the document's form fields and keep them in ff.
ff = f.get_fields()

那么 ff 就是一个包含所有相关表单信息的 dict。


4
投票

一个只花了 2 分钟、简单粗糙的应急方案:只需使用 pdfminer 将 PDF 转换为 XML,然后提取所有字段。

from xml.etree import ElementTree
from pprint import pprint
import os

def main():
    """Dump FILE.pdf to XML with dumppdf.py and print its string fields.

    Steps:
      1. Run ``dumppdf.py -a FILE.pdf`` through the shell, redirecting its
         output to ``out.xml``.
      2. Copy ``out.xml`` to ``output.xml``, replacing every ``&#`` with
         ``Invalid_XML`` so the result can be parsed.
      3. Walk the parsed tree: the element following a ``<key>T</key>``
         supplies a field ID, and a ``<string>`` inside the element
         following a ``<key>V</key>`` supplies that field's value.
      4. Pretty-print the resulting ``{ID: value}`` dictionary.
    """
    print("Calling PDFDUMP.py")
    os.system("dumppdf.py -a FILE.pdf > out.xml")

    # Preprocess the file to eliminate bad XML.
    print("Screening the file")
    # "w" truncates output.xml (it is not an append); both files are closed
    # by the context manager even if an error occurs.
    with open("output.xml", "w") as cleaned, open("out.xml") as raw:
        for line in raw:
            # some bad data in xml for formatting info.
            cleaned.write(line.replace("&#", "Invalid_XML"))

    print("Opening XML output")
    tree = ElementTree.parse('output.xml')
    lastnode = ""
    lastnode2 = ""
    fields = {}  # field ID -> value (named so the builtin ``list`` is not shadowed)
    entry = {}

    for node in tree.iter():  # Run through the tree..
        # Check if New node
        if node.tag == "key" and node.text == "T":
            lastnode = node.tag + node.text
        elif lastnode == "keyT":
            # The element right after a "T" key: the text of its last
            # descendant becomes the entry ID.
            for child in node.iter():
                entry["ID"] = child.text
            lastnode = ""

        if node.tag == "key" and node.text == "V":
            lastnode2 = node.tag + node.text
        elif lastnode2 == "keyV":
            # The element right after a "V" key: record string values for
            # the pending ID, then start a fresh entry.
            for child in node.iter():
                if child.tag == "string" and "ID" in entry:
                    entry["Value"] = child.text
                    fields[entry["ID"]] = entry["Value"]
                    entry = {}
            lastnode2 = ""

    pprint(fields)

# Run the conversion only when executed as a script, not on import.
if __name__ == '__main__':
  main()

它并不漂亮,只是一个简单的概念证明。我需要为我正在开发的系统实现它,所以我会清理它,但我想我会发布它,以防有人发现它有用。


3
投票

针对最新版本的 pdfminer 的更新(更改了导入语句,以及第一个函数中的解析器/文档设置)

from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
from pdfminer.pdftypes import PDFObjRef

def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples.

    :param filename: Path of the PDF file to read.
    :return: A list with one entry per top-level form field, each entry
        being whatever ``load_fields`` returns for that field. An empty
        list is returned when the document catalog has no ``AcroForm``
        entry or the form has no ``Fields`` array.
    """
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        # NOTE(review): ``initialize()`` does not appear to exist on
        # PDFDocument in current pdfminer.six (confirm against the
        # installed version), so it is called only when the installed
        # release provides it instead of failing with AttributeError.
        initialize = getattr(doc, 'initialize', None)
        if callable(initialize):
            initialize()
        # A PDF without an interactive form has no 'AcroForm' key in its
        # catalog; report "no fields" instead of raising KeyError.
        acro_form = resolve1(doc.catalog.get('AcroForm'))
        if not acro_form:
            return []
        fields = resolve1(acro_form.get('Fields')) or []
        return [load_fields(resolve1(f)) for f in fields]

def load_fields(field):
    """Recursively load form fields.

    :param field: An already-resolved field dictionary.
    :return: For a field that has ``Kids``, a list with one entry per kid
        (built recursively). Otherwise a ``(name, value)`` tuple, where
        ``name`` is the decoded ``T`` entry (``None`` if the field has no
        ``T``) and ``value`` is the resolved ``V`` entry.
    """
    # Imported here so the function is self-contained; pdfminer is already
    # a dependency of this script.
    from pdfminer.utils import decode_text

    kids = resolve1(field.get('Kids', None))
    if kids:
        return [load_fields(resolve1(kid)) for kid in kids]

    name = field.get('T')
    if name is not None:
        # PDF text strings are either UTF-16BE with a BOM or PDFDocEncoding.
        # decode_text handles both; a hard-coded 'utf-8' decode fails on
        # UTF-16 names.
        name = decode_text(name)
    # Some field types, like signatures, need extra resolving
    return (name, resolve1(field.get('V')))

def parse_cli():
    """Build the argument parser, parse the command line, return the result.

    The returned namespace has ``file`` (the PDF to read), ``out``
    (optional output path, ``None`` when omitted) and ``pickle`` (``True``
    when ``-p``/``--pickle`` was given).
    """
    # Each entry: (positional name or option flags, keyword settings).
    option_specs = (
        (('file',), {
            'metavar': 'pdf_form',
            'help': 'PDF Form to dump the contents of',
        }),
        (('-o', '--out'), {
            'help': 'Write output to file',
            'default': None,
            'metavar': 'FILE',
        }),
        (('-p', '--pickle'), {
            'action': 'store_true',
            'default': False,
            'help': 'Format output for python consumption',
        }),
    )
    cli = ArgumentParser(description='Dump the form contents of a PDF.')
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()

def main():
    """Dump the form contents of the PDF named on the command line.

    With ``--out`` the result is written to that file, otherwise it goes
    to standard output. With ``--pickle`` the form is pickled; without it
    the form is pretty-printed.
    """
    args = parse_cli()
    form = load_form(args.file)
    if args.out:
        # pickle produces bytes, so the target must be opened in binary
        # mode when pickling; pretty-printed output is text.
        mode = 'wb' if args.pickle else 'w'
        with open(args.out, mode) as outfile:
            if args.pickle:
                pickle.dump(form, outfile)
            else:
                pp = pprint.PrettyPrinter(indent=2)
                # Write to the file just opened (the original referenced
                # an unrelated name, ``file``, here).
                outfile.write(pp.pformat(form))
    else:
        if args.pickle:
            # print() call form, valid on Python 3.
            print(pickle.dumps(form))
        else:
            pp = pprint.PrettyPrinter(indent=2)
            pp.pprint(form)

# Run the command-line tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()

3
投票

我创建了一个库来执行此操作:

pip install fillpdf

from fillpdf import fillpdfs
# Query the form fields of "ex.pdf"; the return value is not stored here.
fillpdfs.get_form_fields("ex.pdf")

感谢 dvska 的回答,作为库代码的基础。


0
投票

这些行有一个拼写错误:

file.write(pp.pformat(form))

应该是:

outfile.write(pp.pformat(form))
© www.soinside.com 2019 - 2024. All rights reserved.