我正在参照这篇 SO 帖子,从纯文本的 PDF 中提取页码。我在索引中创建了一个
page_number
字段。
我还使用 azure 函数创建了自定义 Web API 技能,并在索引器定义中使用
outputFieldMappings
数组来映射技能增强(enrichment)后的输出。在下面的 Azure 函数定义中,我假设请求数据的格式如下所示,正如上述 SO 帖子中提到的那样。这个假设正确吗?开发人员应如何测试自己的技能增强步骤?
{
"data": "BASE64 ENCODED STRING OF A JPEG IMAGE",
"width": 500,
"height": 300,
"originalWidth": 5000,
"originalHeight": 3000,
"rotationFromOriginal": 90,
"contentOffset": 500,
"pageNumber": 2
}
运行索引器后,我通过为部署创建的 azure 函数应用程序资源中的日志流来监视日志,并且不会生成任何信息日志。我怀疑我的自定义 Web API 技能在技能组运行时没有被调用。因此,这就是为什么当我对搜索索引执行矢量搜索查询时,
page_number
字段始终为null
。我在下面列出了技能组、Azure 函数和索引器的定义,有人能指出我遗漏了什么吗?TIA
技能组定义
{
"@odata.context": "https://ai-studio-search-test.search.windows.net/$metadata#skillsets/$entity",
"@odata.etag": "\"0x8DC9745F955DC29\"",
"name": "test-skillset",
"description": "Skillset to chunk documents and generating embeddings",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
"name": "#1",
"description": "Split skill to chunk documents",
"context": "/document",
"defaultLanguageCode": "en",
"textSplitMode": "pages",
"maximumPageLength": 2000,
"pageOverlapLength": 500,
"maximumPagesToTake": 0,
"inputs": [
{
"name": "text",
"source": "/document/content"
}
],
"outputs": [
{
"name": "textItems",
"targetName": "pages"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
"name": "#2",
"description": "Skill to generate embeddings via Azure OpenAI",
"context": "/document/pages/*",
"resourceUri": "https://crowemind-non-prd-us-east-2.openai.azure.com",
"apiKey": "<redacted>",
"deploymentId": "text-embedding-ada-002",
"dimensions": 1536,
"modelName": "text-embedding-ada-002",
"inputs": [
{
"name": "text",
"source": "/document/pages/*"
}
],
"outputs": [
{
"name": "embedding",
"targetName": "vector"
}
],
"authIdentity": null
},
{
"@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
"name": "#3",
"description": "",
"context": "/document/normalized_images/*",
"uri": "https://azure-searchskill-apis.azurewebsites.net/api/pageno_skillapi_anonymous",
"httpMethod": "POST",
"timeout": "PT30S",
"batchSize": 1,
"degreeOfParallelism": 1,
"authResourceId": null,
"inputs": [
{
"name": "normalized_images",
"source": "/document/normalized_images/*"
}
],
"outputs": [
{
"name": "page_number",
"targetName": "page_number"
}
],
"httpHeaders": {},
"authIdentity": null
}
],
"cognitiveServices": null,
"knowledgeStore": null,
"indexProjections": {
"selectors": [
{
"targetIndexName": "test-index-secondary",
"parentKeyFieldName": "parent_id",
"sourceContext": "/document",
"mappings": [
{
"name": "chunk",
"source": "/document/pages/*",
"sourceContext": null,
"inputs": []
},
{
"name": "text_vector",
"source": "/document/pages/*/vector",
"sourceContext": null,
"inputs": []
},
{
"name": "title",
"source": "/document/metadata_storage_name",
"sourceContext": null,
"inputs": []
},
{
"name": "page_number",
"source": "/document/normalized_images/*/page_number",
"sourceContext": null,
"inputs": []
}
]
}
],
"parameters": {
"projectionMode": "skipIndexingParentDocuments"
}
},
"encryptionKey": null
}
已部署的 Azure 函数
@app.route(route="pageno_skillapi_anonymous", auth_level=func.AuthLevel.ANONYMOUS)
def pageno_skillapi_anonymous(req: func.HttpRequest) -> func.HttpResponse:
    """Custom Web API skill that returns the page number of each normalized image.

    A custom Web API skill is called with a JSON envelope of the form
    ``{"values": [{"recordId": ..., "data": {<input name>: ...}}]}`` and must
    answer with ``{"values": [{"recordId": ..., "data": {...}, "errors": ...,
    "warnings": ...}]}``.  The skillset declares the input as
    ``normalized_images`` and the output as ``page_number``, so those are the
    keys read and written here.

    Args:
        req: Incoming HTTP request sent by the skillset.

    Returns:
        A JSON response with one result per input record (HTTP 200), or a JSON
        error with HTTP 400 when the body is not a valid skill request.
    """
    import json  # local import: the module-level imports of this snippet are not shown

    try:
        payload = req.get_json()
    except ValueError:
        # get_json() raises ValueError when the body is missing or is not valid JSON.
        payload = None

    records = payload.get("values") if isinstance(payload, dict) else None
    if not isinstance(records, list):
        logging.info("request body is empty or has no 'values' array")
        return func.HttpResponse(
            json.dumps({"error": "request body must be a JSON object with a 'values' array"}),
            status_code=400,
            mimetype="application/json",
        )

    # Log only the record count: each record carries a base64-encoded image.
    logging.info(
        "received %d record(s) from /document/normalized_images/* context",
        len(records),
    )

    results = []
    for record in records:
        record = record if isinstance(record, dict) else {}
        data = record.get("data")
        image = data.get("normalized_images") if isinstance(data, dict) else None

        # recordId must be echoed back so the result can be matched to its input.
        result = {
            "recordId": record.get("recordId"),
            "data": {},
            "errors": None,
            "warnings": None,
        }
        if isinstance(image, dict) and "pageNumber" in image:
            result["data"]["page_number"] = image["pageNumber"]
        else:
            result["warnings"] = [
                {"message": "No pageNumber parameter in /document/normalized_images/* context"}
            ]
        results.append(result)

    return func.HttpResponse(
        json.dumps({"values": results}),
        status_code=200,
        mimetype="application/json",
    )
索引器定义
{
"@odata.context": "https://ai-studio-search-test.search.windows.net/$metadata#indexers/$entity",
"@odata.etag": "\"0x8DC976AFC2C0D01\"",
"name": "test-indexer",
"description": "Indexer to index documents and generate embeddings",
"dataSourceName": "test-blob",
"skillsetName": "test-skillset",
"targetIndexName": "test-index",
"disabled": false,
"schedule": null,
"parameters": {
"batchSize": null,
"maxFailedItems": null,
"maxFailedItemsPerBatch": null,
"base64EncodeKeys": null,
"configuration": {
"dataToExtract": "contentAndMetadata",
"parsingMode": "default",
"imageAction": "generateNormalizedImagePerPage"
}
},
"fieldMappings": [
{
"sourceFieldName": "metadata_storage_name",
"targetFieldName": "title",
"mappingFunction": null
}
],
"outputFieldMappings": [
{
"sourceFieldName": "/document/normalized_images/*/page_number",
"targetFieldName": "page_number"
}
],
"cache": null,
"encryptionKey": null
}
我尝试过更改自定义字段的上下文,但没有任何效果。
根据此文档,当您设置
imageAction
时,您将在/document/normalized_images/*
上下文中获得图像数组
因此,您不能在主索引中直接进行输出字段映射;如果一定要映射,则需要将其映射到一个集合类型(Collection,元素类型与结果数据类型一致)的字段,如下所示。
要单独获取数据,需要创建二级索引并对其进行投影。
您无需借助自定义 Web API 技能来生成
page_number
;当您将 imageAction
设置为 generateNormalizedImages
时,每个规范化图像中已经带有页码(pageNumber)。
因此,您可以使用
ocr
技能从页面中提取文本,并连同页码一起进行投影。
以下是技能组定义。
{
"@odata.context": "https://jgsai.search.windows.net/$metadata#skillsets/$entity",
"@odata.etag": "\"0x8DC9A7C02836ACA\"",
"name": "skillset1719835991688",
"description": "",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
"name": "#1",
"description": null,
"context": "/document/normalized_images/*",
"textExtractionAlgorithm": null,
"lineEnding": "Space",
"defaultLanguageCode": "en",
"detectOrientation": true,
"inputs": [
{
"name": "image",
"source": "/document/normalized_images/*"
}
],
"outputs": [
{
"name": "text",
"targetName": "text"
}
]
}
],
"cognitiveServices": {
"@odata.type": "#Microsoft.Azure.Search.DefaultCognitiveServices",
"description": null
},
"knowledgeStore": null,
"indexProjections": {
"selectors": [
{
"targetIndexName": "test-index",
"parentKeyFieldName": "parent_id",
"sourceContext": "/document/normalized_images/*",
"mappings": [
{
"name": "page_number",
"source": "/document/normalized_images/*/pageNumber",
"sourceContext": null,
"inputs": []
},
{
"name": "page_content",
"source": "/document/normalized_images/*/text",
"sourceContext": null,
"inputs": []
}
]
}
],
"parameters": {
"projectionMode": "skipIndexingParentDocuments"
}
},
"encryptionKey": null
}
索引器定义
{
"@odata.context": "https://jgsai.search.windows.net/$metadata#indexers/$entity",
"@odata.etag": "\"0x8DC9A7C1434746D\"",
"name": "azureblob-indexer",
"description": "",
"dataSourceName": "ds",
"skillsetName": "skillset1719835991688",
"targetIndexName": "azureblob-index",
"disabled": null,
"schedule": null,
"parameters": {
"batchSize": null,
"maxFailedItems": 0,
"maxFailedItemsPerBatch": 0,
"base64EncodeKeys": null,
"configuration": {
"dataToExtract": "contentAndMetadata",
"parsingMode": "default",
"imageAction": "generateNormalizedImages"
}
},
"fieldMappings": [
{
"sourceFieldName": "metadata_storage_path",
"targetFieldName": "metadata_storage_path",
"mappingFunction": {
"name": "base64Encode",
"parameters": null
}
}
],
"outputFieldMappings": [],
"cache": null,
"encryptionKey": null
}
以及目标索引定义。
{
"@odata.context": "https://jgsai.search.windows.net/$metadata#indexes/$entity",
"@odata.etag": "\"0x8DC9A73656E2423\"",
"name": "test-index",
"defaultScoringProfile": null,
"fields": [
{
"name": "id",
"type": "Edm.String",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": true,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "keyword",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "parent_id",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "page_number",
"type": "Edm.Int64",
"searchable": false,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "page_content",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
}
],
"scoringProfiles": [],
"corsOptions": null,
"suggesters": [],
"analyzers": [],
"normalizers": [],
"tokenizers": [],
"tokenFilters": [],
"charFilters": [],
"encryptionKey": null,
"similarity": {
"@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
"k1": null,
"b": null
},
"semantic": null,
"vectorSearch": null
}
输出:
如果您想使用自定义 Web API 对数据进行自定义处理,可以使用下面的函数代码。
import azure.functions as func
import logging
import json
import base64
# Function app instance; its HTTP auth level is set to anonymous.
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
@app.route(route="http_trigger1")
def http_trigger1(req: func.HttpRequest) -> func.HttpResponse:
    """Custom Web API skill that emits the page number of each normalized image.

    Reads the ``values`` array of the request body and, for every record,
    replaces its ``data`` with ``{"page_number": <pageNumber of the image>}``.
    All other keys of the record (such as ``recordId``) are passed through.

    Args:
        req: Incoming HTTP request whose JSON body has a ``values`` array; each
            record is expected to hold ``data.normalized_images`` with ``data``
            (base64 image) and ``pageNumber`` keys.

    Returns:
        A JSON response of the form ``{"values": [...]}``.
    """
    logging.info('Python HTTP trigger function processed a request.')
    values = req.get_json()['values']
    res = []
    for i in values:
        image = i['data']['normalized_images']
        # Decode inside the loop: the image belongs to the current record.
        content = base64.b64decode(image['data'])  # do whatever you want with this image.
        # Shallow copy so the incoming record is not mutated in place.
        tmp = dict(i)
        tmp['data'] = {
            "page_number": image['pageNumber']
        }
        res.append(tmp)
    return func.HttpResponse(json.dumps({"values": res}), mimetype="application/json")
有关结合标准化图像和自定义 Web API 的更多信息,请参阅此文档。