我正在尝试使用 Python 处理一些使用 Adobe Acrobat Reader 填写和签名的 PDF 表单。
我已经尝试过:
我可以继续寻找库并尝试它们,但我希望有人已经为此找到了有效的解决方案。
更新:根据史蒂文的回答,我研究了pdfminer,它做得很好。
from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1, PDFObjRef
def load_form(filename):
    """Read a PDF's AcroForm and return its fields as nested name/value data.

    Args:
        filename: Path of the PDF form to open (read in binary mode).

    Returns:
        A list with one entry per top-level form field, each produced by
        ``load_fields``: either a ``(name, value)`` tuple or a nested list
        for fields that have children.
    """
    with open(filename, 'rb') as stream:
        document = PDFDocument(PDFParser(stream))
        # The catalog entry may be an indirect reference, hence resolve1().
        acro_form = resolve1(document.catalog['AcroForm'])
        loaded = []
        for field_ref in acro_form['Fields']:
            loaded.append(load_fields(resolve1(field_ref)))
        return loaded
def load_fields(field):
    """Recursively load form fields.

    Args:
        field: A resolved field dictionary taken from the AcroForm tree.

    Returns:
        A list of results (one per child) when the field has a non-empty
        ``Kids`` entry, otherwise a ``(name, value)`` tuple. ``name`` is
        ``None`` when the dictionary carries no ``T`` entry.
    """
    # Function-scope import: pdfminer is already a dependency of this file.
    from pdfminer.utils import decode_text

    kids = field.get('Kids', None)
    if kids:
        return [load_fields(resolve1(kid)) for kid in kids]

    # A hard-coded codec breaks on names stored in the other PDF text
    # encoding; decode_text() picks UTF-16BE (BOM) or PDFDocEncoding itself.
    raw_name = resolve1(field.get('T'))
    if isinstance(raw_name, bytes):
        name = decode_text(raw_name)
    else:
        # Missing 'T' (None) or an already-decoded value: pass through
        # instead of raising AttributeError on .decode().
        name = raw_name

    # Some field types, like signatures, need extra resolving
    return (name, resolve1(field.get('V')))
def parse_cli(argv=None):
    """Load command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) the arguments are taken from ``sys.argv[1:]``,
            which is the behaviour existing callers rely on.

    Returns:
        The parsed namespace with ``file`` (input PDF path), ``out``
        (output path or ``None``) and ``pickle`` (bool) attributes.
    """
    parser = ArgumentParser(description='Dump the form contents of a PDF.')
    parser.add_argument('file', metavar='pdf_form',
                        help='PDF Form to dump the contents of')
    parser.add_argument('-o', '--out', help='Write output to file',
                        default=None, metavar='FILE')
    parser.add_argument('-p', '--pickle', action='store_true', default=False,
                        help='Format output for python consumption')
    return parser.parse_args(argv)
def main():
    """Dump the form contents of the PDF named on the command line.

    The form is written either to the file given with ``-o/--out`` or to
    standard output, pickled when ``-p/--pickle`` is set and pretty-printed
    otherwise.
    """
    args = parse_cli()
    form = load_form(args.file)

    if args.out:
        # pickle.dump() writes bytes, so the target must be a binary file.
        mode = 'wb' if args.pickle else 'w'
        with open(args.out, mode) as outfile:
            if args.pickle:
                pickle.dump(form, outfile)
            else:
                pp = pprint.PrettyPrinter(indent=2)
                # Was ``file.write(...)``: ``file`` is not defined here.
                outfile.write(pp.pformat(form))
    elif args.pickle:
        print(pickle.dumps(form))
    else:
        pp = pprint.PrettyPrinter(indent=2)
        pp.pprint(form)


if __name__ == '__main__':
    main()
你应该能够使用 pdfminer 来做到这一点,但这需要深入研究 pdfminer 的内部结构,并具备一些 PDF 格式方面的知识(当然包括表单相关的知识,还需要了解 PDF 的内部结构,如“字典”和“间接对象”)。
这个示例可能会对您有所帮助(我认为它仅适用于简单的情况,没有嵌套字段等...)
import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
# Print "name: value" for every top-level AcroForm field of the PDF whose
# path is given as the first command line argument.
filename = sys.argv[1]
# The context manager closes the file even if parsing raises.
with open(filename, 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    # The AcroForm entry may be an indirect reference, hence resolve1().
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for i in fields:
        field = resolve1(i)
        name, value = field.get('T'), field.get('V')
        # Function-call form of print works on both Python 2 and Python 3.
        print('{0}: {1}'.format(name, value))
编辑:忘记提及:如果您需要提供密码,请将其传递给
doc.initialize()
Python 3.6+:
pip install PyPDF2
# -*- coding: utf-8 -*-
def get_fields(obj, tree=None, retval=None, fileobj=None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use.

    :param obj: The PDF reader object; its ``trailer``, ``_check_kids`` and
        ``_build_field`` members are used.
    :param tree: Form (sub)tree to walk. Looked up from the document
        catalog when *retval* is ``None``.
    :param retval: Mapping that collects the fields found so far.
    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field<PyPDF2.generic.Field>` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    # Function-scope import: the snippet uses OrderedDict without importing it.
    from collections import OrderedDict

    fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent',
                       '/T': 'Field Name', '/TU': 'Alternate Field Name',
                       '/TM': 'Mapping Name', '/Ff': 'Field Flags',
                       '/V': 'Value', '/DV': 'Default Value'}

    if retval is None:
        # Top-level call: start a fresh result and locate the form tree.
        retval = OrderedDict()
        catalog = obj.trailer["/Root"]
        # get the AcroForm tree
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]

    if tree is None:
        return retval

    obj._check_kids(tree, retval, fileobj)

    # Tree is a field as soon as it carries any known field attribute.
    if any(attr in tree for attr in fieldAttributes):
        obj._build_field(tree, retval, fileobj, fieldAttributes)

    if "/Fields" in tree:
        for f in tree["/Fields"]:
            field = f.get_object()
            obj._build_field(field, retval, fileobj, fieldAttributes)

    return retval
def get_form_fields(infile):
    """Return the form field values of a PDF file.

    :param infile: Path of the PDF file to read.
    :return: An ordered mapping of field name to the field's ``/V`` entry
        (``''`` when a field has no ``/V``). Empty when ``get_fields``
        reports that the document has no form data.
    :rtype: OrderedDict
    """
    # Function-scope imports: neither name is imported in this snippet.
    from collections import OrderedDict
    from PyPDF2 import PdfReader

    # Keep the file open while values are read and close it afterwards;
    # the original left the handle open.
    with open(infile, 'rb') as stream:
        fields = get_fields(PdfReader(stream))
        if fields is None:
            # No AcroForm: previously this crashed on ``None.items()``.
            return OrderedDict()
        return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())


if __name__ == '__main__':
    from pprint import pprint

    pdf_file_name = 'FormExample.pdf'
    pprint(get_form_fields(pdf_file_name))
Python PyPDF2 包(pyPdf 的后继者)非常方便:
import PyPDF2
# Open the PDF form with PyPDF2's reader (path is hard-coded here).
f = PyPDF2.PdfReader('form.pdf')
# Collect the form fields; the surrounding text describes the result as a
# dict holding all the relevant form information.
ff = f.get_fields()
那么 `ff` 就是一个包含所有相关表单信息的 dict。
快速而肮脏的2分钟工作;只需使用 PDFminer 将 PDF 转换为 xml,然后获取所有字段。
from xml.etree import ElementTree
from pprint import pprint
import os
def main():
    """Dump FILE.pdf to XML with dumppdf.py and print its form values.

    Steps: run ``dumppdf.py -a FILE.pdf`` into ``out.xml``, copy it to
    ``output.xml`` with every ``&#`` replaced so the XML parser accepts it,
    then walk the XML pairing each ``T`` key (field name) with the string
    found after the following ``V`` key (field value) and pretty-print the
    resulting mapping.

    NOTE(review): SOURCE lost its indentation; the loop nesting below is
    reconstructed from the statement order -- confirm against the original.
    """
    print("Calling PDFDUMP.py")
    os.system("dumppdf.py -a FILE.pdf > out.xml")

    # Preprocess the file to eliminate bad XML.
    print("Screening the file")
    # "w" truncates output.xml; both files are closed by the with-statement.
    with open("output.xml", "w") as cleaned, open("out.xml") as raw:
        for line in raw:
            # some bad data in xml for formatting info.
            cleaned.write(line.replace("&#", "Invalid_XML"))

    print("Opening XML output")
    tree = ElementTree.parse('output.xml')
    lastnode = ""
    lastnode2 = ""
    values = {}
    entry = {}

    for node in tree.iter():  # Run through the tree..
        # Check if New node
        if node.tag == "key" and node.text == "T":
            lastnode = node.tag + node.text
        elif lastnode == "keyT":
            # The element right after a "T" key holds the field name.
            for child in node.iter():
                entry["ID"] = child.text
            lastnode = ""

        if node.tag == "key" and node.text == "V":
            lastnode2 = node.tag + node.text
        elif lastnode2 == "keyV":
            # The element right after a "V" key holds the field value.
            for child in node.iter():
                if child.tag == "string":
                    # ``in`` replaces dict.has_key(), removed in Python 3.
                    if "ID" in entry:
                        entry["Value"] = child.text
                        values[entry["ID"]] = entry["Value"]
                        entry = {}
            lastnode2 = ""

    pprint(values)


if __name__ == '__main__':
    main()
它并不漂亮,只是一个简单的概念证明。我需要为我正在开发的系统实现它,所以我会清理它,但我想我会发布它,以防有人发现它有用。
针对最新版本 pdfminer 的更新(修改了导入语句,以及第一个函数中解析器/文档的设置方式)
from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
from pdfminer.pdftypes import PDFObjRef
def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples.

    Args:
        filename: Path of the PDF form to open (read in binary mode).

    Returns:
        A list with one ``load_fields`` result per top-level form field.
    """
    with open(filename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        # Newer pdfminer releases no longer provide PDFDocument.initialize()
        # (per the pdfminer changelog); call it only where it still exists
        # so the function works on both old and new versions.
        initialize = getattr(doc, 'initialize', None)
        if callable(initialize):
            initialize()
        return [load_fields(resolve1(f)) for f in
                resolve1(doc.catalog['AcroForm'])['Fields']]
def load_fields(field):
    """Recursively load form fields.

    Args:
        field: A resolved field dictionary taken from the AcroForm tree.

    Returns:
        A list of results (one per child) when the field has a non-empty
        ``Kids`` entry, otherwise a ``(name, value)`` tuple. ``name`` is
        ``None`` when the dictionary carries no ``T`` entry.
    """
    # Function-scope import: pdfminer is already a dependency of this file.
    from pdfminer.utils import decode_text

    kids = field.get('Kids', None)
    if kids:
        return [load_fields(resolve1(kid)) for kid in kids]

    # A hard-coded codec breaks on names stored in the other PDF text
    # encoding; decode_text() picks UTF-16BE (BOM) or PDFDocEncoding itself.
    raw_name = resolve1(field.get('T'))
    if isinstance(raw_name, bytes):
        name = decode_text(raw_name)
    else:
        # Missing 'T' (None) or an already-decoded value: pass through
        # instead of raising AttributeError on .decode().
        name = raw_name

    # Some field types, like signatures, need extra resolving
    return (name, resolve1(field.get('V')))
def parse_cli(argv=None):
    """Load command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) the arguments are taken from ``sys.argv[1:]``,
            which is the behaviour existing callers rely on.

    Returns:
        The parsed namespace with ``file`` (input PDF path), ``out``
        (output path or ``None``) and ``pickle`` (bool) attributes.
    """
    parser = ArgumentParser(description='Dump the form contents of a PDF.')
    parser.add_argument('file', metavar='pdf_form',
                        help='PDF Form to dump the contents of')
    parser.add_argument('-o', '--out', help='Write output to file',
                        default=None, metavar='FILE')
    parser.add_argument('-p', '--pickle', action='store_true', default=False,
                        help='Format output for python consumption')
    return parser.parse_args(argv)
def main():
    """Dump the form contents of the PDF named on the command line.

    The form is written either to the file given with ``-o/--out`` or to
    standard output, pickled when ``-p/--pickle`` is set and pretty-printed
    otherwise.
    """
    args = parse_cli()
    form = load_form(args.file)

    if args.out:
        # pickle.dump() writes bytes, so the target must be a binary file.
        mode = 'wb' if args.pickle else 'w'
        with open(args.out, mode) as outfile:
            if args.pickle:
                pickle.dump(form, outfile)
            else:
                pp = pprint.PrettyPrinter(indent=2)
                # Was ``file.write(...)``: the handle is named ``outfile``.
                outfile.write(pp.pformat(form))
    elif args.pickle:
        # Function-call form of print works on both Python 2 and Python 3.
        print(pickle.dumps(form))
    else:
        pp = pprint.PrettyPrinter(indent=2)
        pp.pprint(form)


if __name__ == '__main__':
    main()
我创建了一个库来执行此操作:
pip install fillpdf
from fillpdf import fillpdfs
# Ask fillpdf for the form fields of "ex.pdf". The result is not stored here
# (presumably a mapping of field names to values -- confirm in fillpdf docs).
fillpdfs.get_form_fields("ex.pdf")
感谢 dvska 的回答,作为库代码的基础。
这些行有一个拼写错误:
file.write(pp.pformat(form))
应该是:
outfile.write(pp.pformat(form))