使用 Python 从 Google 文档 AI 处理器数据集中删除文档

问题描述 投票:0回答:1

我目前正在开发一个涉及 Google Document AI 的项目,需要使用 Python 从处理器数据集中删除文档,希望能得到帮助。我尝试了各种方法,但一直没能找到解决方案。

这是我到目前为止所做的:

  • 我已在 Google Document AI 中设置了处理器。
  • 我已成功将文档上传到处理器数据集。
  • 现在,我需要使用 Python 以编程方式从数据集中删除特定文档。
  • 我已经查看了Google Cloud Document AI文档(特别是这部分),但我找不到任何关于如何实现删除的明确指导。

我们使用的代码如下:

import io

from google.cloud import documentai_v1beta3
from google.cloud.documentai_v1beta3 import DocumentId

# Location segment used when building the dataset resource name.
PROCESSOR_LOCATION = "eu"
# Project number per deployment environment, looked up with ENV_DEST.
PROJECT_NUMBER = {
    "DEV": "123456789",
    "PROD": "123456789",
}
# Processor ID per deployment environment, looked up with ENV_DEST.
PROCESSOR_ID_CDE = {
    "DEV": "abcdefghijk",
    "PROD": "abcdefghijk",
}

# DocumentId has no `document_id` field. A document is identified through one
# of its nested ID messages, so the Cloud Storage URI has to be wrapped in
# GCSManagedDocumentId and assigned to `gcs_managed_doc_id`.
# NOTE(review): this assumes the dataset uses Google-managed storage and that
# these URIs are the ones the dataset itself reports for the documents; for a
# dataset in unmanaged mode `unmanaged_doc_id` would be needed instead - confirm.
doc_id1 = DocumentId(
    gcs_managed_doc_id=DocumentId.GCSManagedDocumentId(
        gcs_uri="gs://test/raw_data/training/abc.pdf"
    )
)
doc_id2 = DocumentId(
    gcs_managed_doc_id=DocumentId.GCSManagedDocumentId(
        gcs_uri="gs://test/raw_data/training/xyz.pdf"
    )
)

# Environment key used to pick entries from the dictionaries above.
ENV_DEST = "DEV"


def sample_batch_delete_documents():
    """Delete ``doc_id1`` and ``doc_id2`` from the processor dataset.

    Builds a ``BatchDeleteDocumentsRequest`` for the dataset of the processor
    selected by ``ENV_DEST``, prints the request, starts the long-running
    delete operation, waits for it to finish and prints the response.

    Raises:
        concurrent.futures.TimeoutError: If the operation has not completed
            within 300 seconds.
    """
    # Derive the endpoint from PROCESSOR_LOCATION so the endpoint and the
    # location in the dataset resource name cannot drift apart.
    endpoint = f"{PROCESSOR_LOCATION}-documentai.googleapis.com"
    client = documentai_v1beta3.DocumentServiceClient(
        client_options={"api_endpoint": endpoint}
    )

    # Resource name of the dataset that belongs to the selected processor.
    dataset_name = (
        f"projects/{PROJECT_NUMBER[ENV_DEST]}"
        f"/locations/{PROCESSOR_LOCATION}"
        f"/processors/{PROCESSOR_ID_CDE[ENV_DEST]}/dataset"
    )

    # The request message only carries `dataset` and `dataset_documents`.
    # A timeout is not a request field, so it is applied to the wait below.
    selected_ids = documentai_v1beta3.BatchDatasetDocuments.IndividualDocumentIds(
        document_ids=[doc_id1, doc_id2]
    )
    request = documentai_v1beta3.BatchDeleteDocumentsRequest(
        dataset=dataset_name,
        dataset_documents=documentai_v1beta3.BatchDatasetDocuments(
            individual_document_ids=selected_ids
        ),
    )
    print(request)

    # Start the long-running operation.
    operation = client.batch_delete_documents(request=request)
    print("Waiting for operation to complete...")
    # Block for at most 300 seconds while the operation is polled.
    response = operation.result(timeout=300)

    # Show the final operation response.
    print(response)

我收到的错误消息如下:

Error Message
 
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/var/tmp/ipykernel_13116/441287119.py in <module>
----> 1 sample_batch_delete_documents()
 
/var/tmp/ipykernel_13116/2850527411.py in sample_batch_delete_documents()
    39     print("Waiting for operation to complete...")
    40
---> 41     response = operation.result()
    42 
    43     # Handle the response
 
/opt/conda/lib/python3.7/site-packages/google/api_core/future/polling.py in result(self, timeout, retry, polling)
   254         """
   255
--> 256         self._blocking_poll(timeout=timeout, retry=retry, polling=polling)
   257
   258         if self._exception is not None:
 
/opt/conda/lib/python3.7/site-packages/google/api_core/future/polling.py in _blocking_poll(self, timeout, retry, polling)
   135
   136         try:
--> 137             polling(self._done_or_raise)(retry=retry)
   138         except exceptions.RetryError:
   139             raise concurrent.futures.TimeoutError(
 
/opt/conda/lib/python3.7/site-packages/google/api_core/retry.py in retry_wrapped_func(*args, **kwargs)
   352                 sleep_generator,
   353                 self._timeout,
--> 354                 on_error=on_error,
   355             )
   356
 
/opt/conda/lib/python3.7/site-packages/google/api_core/retry.py in retry_target(target, predicate, sleep_generator, timeout, on_error, **kwargs)
   189     for sleep in sleep_generator:
   190         try:
--> 191             return target()
   192
   193         # pylint: disable=broad-except
 
/opt/conda/lib/python3.7/site-packages/google/api_core/future/polling.py in _done_or_raise(self, retry)
   117     def _done_or_raise(self, retry=None):
   118         """Check if the future is done and raise if it's not."""
--> 119         if not self.done(retry=retry):
   120             raise _OperationNotComplete()
   121
 
/opt/conda/lib/python3.7/site-packages/google/api_core/operation.py in done(self, retry)
   172             bool: True if the operation is complete, False otherwise.
   173         """
--> 174         self._refresh_and_update(retry)
   175         return self._operation.done
   176
 
/opt/conda/lib/python3.7/site-packages/google/api_core/operation.py in _refresh_and_update(self, retry)
   161         if not self._operation.done:
   162             self._operation = self._refresh(retry=retry) if retry else self._refresh()
--> 163             self._set_result_from_operation()
   164
   165     def done(self, retry=None):
 
/opt/conda/lib/python3.7/site-packages/google/api_core/operation.py in _set_result_from_operation(self)
   133             if self._operation.HasField("response"):
   134                 response = protobuf_helpers.from_any_pb(
--> 135                     self._result_type, self._operation.response
   136                 )
   137                 self.set_result(response)
 
/opt/conda/lib/python3.7/site-packages/google/api_core/protobuf_helpers.py in from_any_pb(pb_type, any_pb)
    65         raise TypeError(
    66             "Could not convert {} to {}".format(
---> 67                 any_pb.__class__.__name__, pb_type.__name__
    68             )
    69         )
 
TypeError: Could not convert Any to BatchDeleteDocumentsResponse

如果有人能为我提供如何解决此问题的指导,我将不胜感激。我愿意接受有关我已经尝试过的方法或任何其他方法的建议。

python-3.x google-cloud-platform cloud-document-ai
1个回答
0
投票

你解决这个问题了吗?如果解决了,能否分享一下你是怎么做到的?

谢谢!

© www.soinside.com 2019 - 2024. All rights reserved.