尝试在 python 中实现来自 google cloud 的文档 OCR 时遇到此错误,如下所述:https://cloud.google.com/document-ai/docs/ocr#documentai_process_document-python。
当我运行以下代码时
operation.result(timeout=None)
我收到此错误
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "/Users/Niolo/Desktop/project/venv/lib/python3.8/site-packages/google/api_core/future/polling.py", line 134, in result
raise self._exception
google.api_core.exceptions.InternalServerError: 500 Failed to process all the documents
我的完整代码
import re
import os
from google.cloud import storage
from google.cloud import documentai_v1beta3 as documentai
from google.api_core.client_options import ClientOptions
# Module-level settings for the batch OCR run. Their names match the parameters of
# batch_process_documents below; presumably they are passed to it (the call is not shown).
project_id = 'my_project_id'
location = 'eu' # Format is 'us' or 'eu'
processor_id = 'my_processor_id' # Create processor in Cloud Console
# NOTE(review): this URI points at a .py file, while the request built below
# declares mime_type="application/pdf" -- confirm the input is a PDF/TIFF/GIF.
gcs_input_uri = "gs://my_bucket/toy1.py"
gcs_output_uri = "gs://my_bucket"
# NOTE(review): the function joins this after gcs_output_uri with "/", so the
# value "gs://" would produce the destination "gs://my_bucket/gs:///".
gcs_output_uri_prefix = "gs://"
# Sets the GOOGLE_APPLICATION_CREDENTIALS environment variable to a local key file path.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/Niolo/Desktop/Work/DocumentAI/OCR/key.json"
# Submits a Document AI batch-processing request for one GCS input, waits for the
# operation, then lists the blobs under the output prefix and prints form fields
# and text from each JSON result.
# NOTE(review): indentation was lost in this listing, so loop nesting is not visible here.
def batch_process_documents(
project_id,
location,
processor_id,
gcs_input_uri,
gcs_output_uri,
gcs_output_uri_prefix,
# NOTE(review): `timeout` is accepted but never read in this version;
# the wait below passes timeout=None instead.
timeout: int = 300,
):
# Set endpoint to EU
# NOTE(review): the endpoint is fixed to EU regardless of the `location` argument.
options = ClientOptions(api_endpoint="eu-documentai.googleapis.com:443")
# Instantiates a client
client = documentai.DocumentProcessorServiceClient(client_options=options)
# Output location is "<gcs_output_uri>/<gcs_output_uri_prefix>/".
destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"
# 'mime_type' can be 'application/pdf', 'image/tiff',
# and 'image/gif', or 'application/json'
input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
gcs_source=gcs_input_uri, mime_type="application/pdf"
)
# Where to write results
output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
gcs_destination=destination_uri
)
# Location can be 'us' or 'eu'
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
request = documentai.types.document_processor_service.BatchProcessRequest(
name=name,
input_configs=[input_config],
output_config=output_config,
)
operation = client.batch_process_documents(request)
# Wait for the operation to finish
# This is the call that raised the 500 InternalServerError shown in the traceback above.
operation.result(timeout=None)
# Results are written to GCS. Use a regex to find
# output files
# Group 1 is the bucket name, group 2 is everything after the first "/".
# NOTE(review): `match` is used without a None check; a destination_uri that
# does not fit the pattern would fail on .group().
match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
output_bucket = match.group(1)
prefix = match.group(2)
storage_client = storage.Client()
bucket = storage_client.get_bucket(output_bucket)
blob_list = list(bucket.list_blobs(prefix=prefix))
print("Output files:")
for i, blob in enumerate(blob_list):
# Download the contents of this blob as a bytes object.
if ".json" not in blob.name:
print(f"skipping non-supported file type {blob.name}")
# NOTE(review): `return` ends the whole function at the first non-JSON blob;
# `continue` may have been intended -- confirm.
return
# Only parses JSON files
blob_as_bytes = blob.download_as_bytes()
document = documentai.types.Document.from_json(blob_as_bytes)
print(f"Fetched file {i + 1}")
# For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
# Read the text recognition output from the processor
# NOTE(review): get_text is not defined anywhere in this listing; it must come from elsewhere.
for page in document.pages:
for form_field in page.form_fields:
field_name = get_text(form_field.field_name, document)
field_value = get_text(form_field.field_value, document)
print("Extracted key value pair:")
print(f"\t{field_name}, {field_value}")
# NOTE(review): this iterates document.pages again, so `paragraph` is a page
# object here and `paragraph.layout` is the page layout -- confirm intent.
for paragraph in document.pages:
paragraph_text = get_text(paragraph.layout, document)
print(f"Paragraph text:\n{paragraph_text}")
对于以下变量,您需要为其提供正确的值。
gcs_input_uri
您要处理的 pdf/tiff/gif 文件的完整路径,例如:gcs_input_uri = 'gs://cloud-samples-data/documentai/loan_form.pdf'
gcs_output_uri
用于存储输出的存储桶。注意:不要在存储桶名称末尾添加“/”,否则同样会导致错误 500!例如:gcs_output_uri = 'gs://samplebucket'
gcs_output_uri_prefix
这将用作您存储桶中的文件夹,例如:gcs_output_uri_prefix = 'test'
请在调用
operation.result()
时保留超时参数(timeout),因为 client.batch_process_documents(request) 返回的是一个长时间运行的操作。
其返回值是代表长时间运行的操作的对象。该操作的结果类型为 :class:`~.document_processor_service.BatchProcessResponse`,即批处理文档方法的响应消息。
# Wait for the operation to finish
operation.result(timeout=timeout)
这是工作代码:
import re
import os
from google.cloud import storage
from google.cloud import documentai_v1beta3 as documentai
from google.api_core.client_options import ClientOptions
# Module-level settings for the working example. Their names match the parameters of
# batch_process_documents below; presumably they are passed to it (the call is not shown).
# NOTE(review): this looks like a real project ID rather than a placeholder -- replace with your own.
project_id = 'tiph-ricconoel-batch8'
location = 'eu' # Format is 'us' or 'eu'
processor_id = 'your_processor_id' # Create processor in Cloud Console
# Input is a PDF, matching the mime_type="application/pdf" used when the request is built.
gcs_input_uri = 'gs://cloud-samples-data/documentai/loan_form.pdf'
# Bucket URI is written without a trailing "/".
gcs_output_uri = 'gs://samplebucket'
# Joined after the bucket URI with "/", giving the destination "gs://samplebucket/test/".
gcs_output_uri_prefix = 'test'
# Sets the GOOGLE_APPLICATION_CREDENTIALS environment variable to a local key file path.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/full_path/your_json_file.json'
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
    timeout: int = 300,
    mime_type: str = "application/pdf",
):
    """Run a Document AI batch job on one GCS file and print what it extracted.

    The request is sent to the regional endpoint for ``location``, the call
    blocks until the long-running operation finishes (or ``timeout`` seconds
    elapse), and every JSON result written under the output prefix is then
    downloaded and printed: form-field key/value pairs and paragraph text.

    Relies on a module-level ``get_text(element, document)`` helper, which is
    not defined in this listing and must be provided by the surrounding module.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor region, 'us' or 'eu'; also selects the endpoint.
        processor_id: ID of the processor created in Cloud Console.
        gcs_input_uri: ``gs://`` URI of the document to process.
        gcs_output_uri: ``gs://`` URI of the output bucket. A trailing "/" is
            tolerated and removed.
        gcs_output_uri_prefix: Folder-like prefix inside the output bucket.
        timeout: Seconds to wait for the operation to finish.
        mime_type: MIME type of the input, e.g. 'application/pdf',
            'image/tiff' or 'image/gif'.

    Raises:
        ValueError: If the output location is not of the form
            ``gs://<bucket>/<prefix>/``.
    """
    # Strip stray slashes so the destination never contains "//": a trailing
    # "/" on the bucket URI is a known way to get a 500 from the service.
    output_root = gcs_output_uri.rstrip("/")
    output_prefix = gcs_output_uri_prefix.strip("/")
    destination_uri = f"{output_root}/{output_prefix}/"

    # Validate the destination before submitting the job, so a malformed URI
    # fails fast instead of after the batch operation has already run.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            f"Output location must look like gs://<bucket>/<prefix>/, got {destination_uri!r}"
        )
    output_bucket = match.group(1)
    prefix = match.group(2)

    # The endpoint must be in the same region as the processor named below.
    options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com:443")
    # Instantiates a client
    client = documentai.DocumentProcessorServiceClient(client_options=options)

    batch_request_cls = documentai.types.document_processor_service.BatchProcessRequest
    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = batch_request_cls.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type=mime_type
    )
    # Where to write results
    output_config = batch_request_cls.BatchOutputConfig(
        gcs_destination=destination_uri
    )
    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = batch_request_cls(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )
    operation = client.batch_process_documents(request)
    # Wait for the operation to finish
    operation.result(timeout=timeout)

    # Results are written to GCS under the validated bucket/prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print("Output files:")
    for i, blob in enumerate(bucket.list_blobs(prefix=prefix)):
        # Only parses JSON files; skip anything else but keep going, so one
        # unrelated object under the prefix does not hide the real results.
        if ".json" not in blob.name:
            print(f"skipping non-supported file type {blob.name}")
            continue
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)
        print(f"Fetched file {i + 1}")
        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # Walk this page's paragraphs rather than the page list again.
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")
这将在
gs://samplebucket/test/xxxxx/x/output.json
中创建输出文件。请参阅下面的测试:
pip install --upgrade google-ai-generativelanguage
使用最新版本解决了我的问题~