我正在尝试使用 textract 从一些 PDF 文件中提取文本。但是,当我在代码末尾打印 text 时,输出的只有大量空白。有人能指出问题出在哪里吗?(顺便说一句,text 并不是空字符串,即 text != "" 成立。)
import os
import codecs
import PyPDF2
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Print the text of every PDF in the 'Harbour PDF' directory, falling back to
# OCR (textract + tesseract) when PyPDF2 finds no visible text.
for filename in os.listdir('Harbour PDF'):
    if '.DS_Store' == filename:
        continue
    filename = 'Harbour PDF/' + filename
    print(filename)
    # The context manager closes the PDF even if a page fails to parse.
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        num_pages = pdfReader.numPages
        text = "".join(
            pdfReader.getPage(page_index).extractText()
            for page_index in range(num_pages)
        )
    # extractText() may return only whitespace/newlines rather than "", so
    # check for visible characters instead of comparing with the empty string;
    # otherwise the OCR fallback below is never reached.
    if not text.strip():
        # textract.process expects a file path (not an open file object) and
        # returns bytes (UTF-8 by default per the textract docs), so decode
        # to keep `text` a str in both branches.
        text = textract.process(filename, method='tesseract', language='eng')
        text = text.decode('utf-8')
    print(text)
下面是我在 Python 中使用的两个函数(第二个需要 tesseract;注:下面的第二个函数实际调用的是 pdftotext 命令行工具)。其实相比 pdfminer 我更喜欢 tesseract,不过它们实际上做的是同一件事。我不确定你的代码哪里有问题,但我认为下面这些是等价的替代方案。
from PIL import Image
import pytesseract
import cv2
import os
import subprocess
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
# Converts a PDF and returns its text content as a string.
def convert(fname, pages=None):
    """Extract the text of a PDF file using pdfminer.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``. When falsy, an empty set is passed
            (pdfminer presumably treats that as "all pages" -- confirm
            against the pdfminer documentation).

    Returns:
        The text collected by the ``TextConverter`` as one string.
    """
    # Imported locally because neither name is among the module-level imports;
    # without them the function fails with NameError.
    from io import StringIO
    from pdfminer.layout import LAParams

    pagenums = set(pages) if pages else set()

    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # Close the input file even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the `with` block closes it.
        return output.getvalue()
def to_txt(pdf_path, output_dir, name=None):
    """Convert a PDF to a text file by running the ``pdftotext`` command.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the text file; created when it
            does not exist yet.
        name: Optional name of the output file; ``.txt`` is appended when the
            name does not already end with it. When None, the name is the
            PDF's base name with its extension replaced by ``.txt`` and
            spaces replaced by underscores.

    The function prints 'Converted' when ``pdftotext`` exits with status 0 and
    a failure message otherwise.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if name is None:
        # Accept both '\\' and '/' as separators so Windows-style and
        # POSIX-style paths yield the same base name on any platform.
        base = pdf_path.replace('\\', '/').rsplit('/', 1)[-1]
        # splitext drops whatever extension is present instead of blindly
        # cutting the last four characters.
        fname = os.path.splitext(base)[0] + '.txt'
        fname = fname.replace(" ", "_")
    else:
        fname = name if name.endswith('.txt') else name + '.txt'

    # os.path.join inserts the separator when output_dir lacks a trailing one.
    end = os.path.join(output_dir, fname)

    cmd = ['pdftotext', pdf_path, end]
    returncode = subprocess.call(cmd)
    if returncode == 0:
        print('Converted')
    else:
        print('pdftotext failed with exit code %d' % returncode)