我有一个包含 pdf 文件的 blob 存储,我想在其中实施知识挖掘解决方案。我创建了数据源、索引、技能组和索引器。但是,当我运行所有内容时,我收到一条警告“无法执行技能,因为一个或多个技能输入无效。”。
技能组代码:
def create_skillset(search_service_endpoint, search_service_api_key, skillset_name):
credential = AzureKeyCredential(search_service_api_key)
indexer_client = SearchIndexerClient(endpoint=search_service_endpoint, credential=credential)
# Define skills
doc_extraction_skill = DocumentExtractionSkill(
name="documentExtractionSkill",
description="Extract text from documents",
context="/document",
configuration={"imageAction": "generateNormalizedImagePerPage"},
inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
outputs=[OutputFieldMappingEntry(name="content", target_name="/documents/content")]
)
# Create skillset
skillset = SearchIndexerSkillset(
name=skillset_name,
skills=[doc_extraction_skill]
)
# Create skillset in Azure Cognitive Search
indexer_client.create_skillset(skillset)
print(f"Skillset '{skillset_name}' created successfully.")
索引器:
# Function to create an indexer
def create_indexer(search_service_endpoint, search_service_api_key, indexer_name, data_source_name, index_name, skillset_name):
credential = AzureKeyCredential(search_service_api_key)
indexer_client = SearchIndexerClient(endpoint=search_service_endpoint, credential=credential)
field_mappings = [
FieldMapping(source_field_name="metadata_storage_path", target_field_name="metadata_storage_path"),
FieldMapping(source_field_name="metadata_storage_name", target_field_name="metadata_storage_name"),
FieldMapping(source_field_name="metadata_storage_last_modified", target_field_name="metadata_storage_last_modified"),
FieldMapping(source_field_name="metadata_content_type", target_field_name="metadata_content_type"),
]
output_field_mappings = [
FieldMapping(source_field_name="/document/content", target_field_name= "content"),
]
# Define indexing parameters
indexing_parameters = IndexingParameters(
configuration={
"indexStorageMetadataOnlyForOversizedDocuments": True,
"failOnUnsupportedContentType": False,
"indexedFileNameExtensions": ".pdf,.docx,.txt,.json",
"parseJson": True,
"parsingMode": "default",
"allowSkillsetToReadFileData": False
}
)
indexer = SearchIndexer(
name=indexer_name,
data_source_name=data_source_name,
target_index_name=index_name,
skillset_name=skillset_name,
field_mappings=field_mappings,
output_field_mappings=output_field_mappings,
schedule=IndexingSchedule(interval="PT15M"),
parameters=indexing_parameters
)
indexer_client.create_indexer(indexer)
print(f"Indexer '{indexer_name}' created.")
您尝试过为索引器设置
"allowSkillsetToReadFileData": True
吗?