我使用 Azure 门户中的 Skillset blade 更新了 TextSplitSkill,结果输出是一个空数组([])

问题描述 投票:0回答:1

我使用 Azure 门户中的 Skillset blade 更新了 TextSplitSkill,结果输出是一个空数组([])。使用索引器映射索引后,索引 JSON 中该技能组的输出字段显示为空,即 "mypages": []

下面是我的 Skillset.json:

{
  "@odata.context": "https://msserv.search.windows.net/$metadata#skillsets/$entity",
  "@odata.etag": "\"0x8DC9AA78A4F25CA\"",
  "name": "my-skillset",
  "description": "Skillset created from the portal. skillsetName:my-skillset; contentField: merged_content; enrichmentGranularity: document; knowledgeStoreStorageAccount: ;",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Text.V3.EntityRecognitionSkill",
      "name": "#1",
      "description": null,
      "context": "/document/merged_content",
      "categories": [
        "Location",
        "Organization",
        "DateTime",
        "Skill",
        "PersonType",
        "PhoneNumber",
        "IPAddress",
        "Email",
        "Address",
        "Product",
        "URL",
        "Event",
        "Person",
        "Quantity"
      ],
      "defaultLanguageCode": "en",
      "minimumPrecision": null,
      "modelVersion": null,
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_content"
        },
        {
          "name": "languageCode",
          "source": "/document/language"
        }
      ],
      "outputs": [
        {
          "name": "organizations",
          "targetName": "organizations"
        },
        {
          "name": "locations",
          "targetName": "locations"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.KeyPhraseExtractionSkill",
      "name": "#2",
      "description": null,
      "context": "/document/merged_content",
      "defaultLanguageCode": "en",
      "maxKeyPhraseCount": null,
      "modelVersion": null,
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_content"
        },
        {
          "name": "languageCode",
          "source": "/document/language"
        }
      ],
      "outputs": [
        {
          "name": "keyPhrases",
          "targetName": "keyphrases"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.LanguageDetectionSkill",
      "name": "#3",
      "description": null,
      "context": "/document",
      "defaultCountryHint": null,
      "modelVersion": null,
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_content"
        }
      ],
      "outputs": [
        {
          "name": "languageCode",
          "targetName": "language"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
      "name": "#4",
      "description": null,
      "context": "/document",
      "insertPreTag": " ",
      "insertPostTag": " ",
      "inputs": [
        {
          "name": "text",
          "source": "/document/content"
        },
        {
          "name": "itemsToInsert",
          "source": "/document/normalized_images/*/text"
        },
        {
          "name": "offsets",
          "source": "/document/normalized_images/*/contentOffset"
        }
      ],
      "outputs": [
        {
          "name": "mergedText",
          "targetName": "merged_content"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
      "name": "#5",
      "description": null,
      "context": "/document/normalized_images/*",
      "textExtractionAlgorithm": null,
      "lineEnding": "Space",
      "defaultLanguageCode": "en",
      "detectOrientation": true,
      "inputs": [
        {
          "name": "image",
          "source": "/document/normalized_images/*"
        }
      ],
      "outputs": [
        {
          "name": "text",
          "targetName": "text"
        },
        {
          "name": "layoutText",
          "targetName": "layoutText"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "Split Skill",
      "description": "Splits text into pages",
      "context": "/document/merged_content",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 1000,
      "pageOverlapLength": 100,
      "maximumPagesToTake": 0,
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_content"
        },
        {
          "name": "languageCode",
          "source": "/document/language"
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "mypages"
        }
      ]
    }
  ],
  "cognitiveServices": null,
  "knowledgeStore": null,
  "indexProjections": null,
  "encryptionKey": null
}

这是我的index.json

{
  "@odata.context": "https://msserv.search.windows.net/indexes('my-index')/$metadata#docs(*)",
  "value": [
    {
      "@search.score": 1,
      "content": "\t\n\t\n\t\n\n\n\nTECHNOLOGIES LIMITED                                              \t\t       \t\t              Employee Handbook \n\n\n\n\nEMPLOYEE HANDBOOK\n\n\n\n\n\n\n\nUNDERSTANDING ALL EMPLOYMENT \nISSUES AT  technologies limited\n\n\n\n\n\nPresented to:\n\nClick or tap here to enter text.\n\n\n\n\n          © Copyright Envision Corporation. 2002. All rights reserved. Protected by the copyright laws of the United States & Canada and by international treaties. IT IS ILLEGAL AND STRICTLY PROHIBITED TO DISTRIBUTE, PUBLISH, OFFER FOR SALE, LICENSE OR SUBLICENSE, GIVE OR DISCLOSE TO ANY OTHER PARTY, THIS PRODUCT IN HARD COPY OR DIGITAL FORM. ALL OFFENDERS WILL BE SUED IN A COURT OF LAW.\n\n          © Copyright Envision Corporation. 2002. All rights reserved. Protected by the copyright laws of the United States & Canada and by international treaties. IT IS ILLEGAL AND STRICTLY PROHIBITED TO DISTRIBUTE, PUBLISH, OFFER FOR SALE, LICENSE OR SUBLICENSE, GIVE OR DISCLOSE TO ANY OTHER PARTY, THIS PRODUCT IN HARD COPY OR DIGITAL FORM. 
ALL OFFENDERS WILL BE SUED IN A COURT OF LAW.\n\nJune 2001\n\n\n\n\n\n\nTABLE OF CONTENT\nWELCOME TO INFINION TECHNOLOGIES LIMITED\t4\n1.\tINTRODUCTION\t5\n1.1\tORGANIZATION DESCRIPTION\t5\n1.1.1\tFACILITIES AND LOCATION(S)\t5\n1.1.5\tGOALS\t6\n1.3\tCUSTOMER RELATIONS\t6\n2.1\tNATURE OF EMPLOYMENT\t7\n2.2\tEMPLOYEE RELATIONS\t7\n2.3\tEQUAL EMPLOYMENT OPPORTUNITY\t7\n2.5\tPERSONAL RELATIONSHIPS IN THE WORKPLACE\t8\n2.6\tCONFLICTS OF INTEREST\t8\n2.7\tNON-DISCLOSURE\t9\n2.9\tDISABILITY ACCOMMODATION\t9\n2.10\tJOB POSTING AND EMPLOYEE REFERRALS\t10\n3.\tEMPLOYMENT STATUS AND RECORDS\t12\n3.1\tEMPLOYMENT CATEGORIES\t12\n3.2\tACCESS TO PERSONNEL FILES\t12\n3.3\tPERSONNEL DATA CHANGES\t12\n3.4\tPROBATION PERIOD\t13\n3.5\tEMPLOYMENT APPLICATIONS\t13\n3.6\tPERFORMANCE EVALUATION\t13\n3.7\tJOB DESCRIPTIONS\t14\n3.8\tSALARY ADMINISTRATION\t14\nMARRIAGE, MATERNITY AND PARENTAL LEAVE\t14\n5.\tTIMEKEEPING / PAYROLL\t17\n5.1\tTIMEKEEPING\t17\n5.2\tPAYDAYS\t17\n5.3\tEMPLOYMENT TERMINATION\t17\n5.4\tADMINISTRATIVE PAY CORRECTIONS\t17\n6.\tWORK CONDITIONS AND HOURS\t18\n6.1\tWORK SCHEDULES\t18\n6.2\tUSE OF PHONE AND MAIL SYSTEMS\t18\n6.3\tSMOKING\t18\n6.4\tMEAL PERIODS\t18\n6.5\tOVERTIME\t18\n6.6\tUSE OF EQUIPMENT\t19\n6.7\tEMERGENCY CLOSING\t19\n6.8\tBUSINESS TRAVEL EXPENSES\t19\n6.9\tVISITORS IN THE WORKPLACE\t20\n6.10\tCOMPUTER AND E-MAIL USAGE\t20\n6.11\tINTERNET USAGE\t21\n6.12\tWORKPLACE MONITORING\t22\n6.13\tWORKPLACE VIOLENCE PREVENTION\t23\n7.\tEMPLOYEE CONDUCT & DISCIPLINARY ACTION\t24\n7.1\tEMPLOYEE CONDUCT AND WORK RULES\t24\n7.2\tSEXUAL AND OTHER UNLAWFUL HARASSMENT\t24\n7.3\tATTENDANCE AND PUNCTUALITY\t25\n7.4\tPERSONAL APPEARANCE\t25\n7.5\tRETURN OF PROPERTY\t26\n7.6\tRESIGNATION\t26\n7.7\tSECURITY INSPECTIONS\t26\n7.8\tPROGRESSIVE DISCIPLINE\t27\n7.9\tPROBLEM RESOLUTION\t27\n7.10\tWORKPLACE ETIQUETTE\t28\n7.11\tSUGGESTION PROGRAM\t29\n\n\n\nWELCOME TO  TECHNOLOGIES LIMITED\n\nDear Colleagues,\n\nWe welcome you to Infinion Technologies Limited and we wish you every 
success here.\n\nAt Technologies Limited We believe that each employee contributes directly to the growth and success of the company, and we hope you will take pride in being a member of our team.\n\nThis handbook was developed to describe some of the expectations of our employees and to outline the policies, proyment decisions. Access will be granted unless there is a legitimate business reason to protect confidentiality or an ongoing investigation.\n\nBecause the Company is sensitive to the legitimate privacy rights of employees, every effort will be made to guarantee that workplace monitoring is Fighting or threatening violence in the workplace \n· Boisterous or disruptive activity in the workplace \n· Negligence or improper conduct leading to damage of employer-owned or customer-owned property \n· Insubordination or other disrespectful conduct \n· Violation of safety or health rules against unnecessary disclosure. When the investigation is completed, you will be informed of the outcome of the investigation.\n\nAny supervisor or manager who becomes aware of possible sexual or other unlawful harassment must immediately advise the HUMAN RESOURCES DEPARTMENT or any member of management so it can be investigated in a timely and confidential manner. Anyone engaging in sexual or other unlawful  and",
      "metadata_storage_size": 157468,
      "metadata_storage_last_modified": "2024-06-28T18:37:47Z",
      "metadata_storage_name": " Employee Handbook.docx",
      "metadata_storage_path": "aHR0cHM6Ly9teWRhdGFzdG9yLmJsb2IuY29yZS53aW5kb3dzLm5ldC9pbmZpbmlvbi1kYXRhL0luZmluaW9uJTIwVGVjaG5vbG9naWVzJTIwRW1wbG95ZWUlMjBIYW5kYm9vay5kb2N40",
      "metadata_author": "makaroni",
      "organizations": [
        "Envision Corporation",
        "Cloud Customer",
        "Microsoft",
        "Infinion",
        "Head Customer Service Department",
        "Human Resources Department",
        "Head of Human Resources Department",
        "HUMAN RESOURCES DEPARTMENT",
        "Infinion Technologies",
        "accounting department",
        "Company"
      ],
      "locations": [
        "United States",
        "Canada",
        "Head Office",
        "Lagos",
        "Nigeria",
        "WORKPLACE",
        "platforms",
        "Meeting Rooms",
        "INFRASTRUCTURE",
        "Africa",
        "workplace",
        "offices",
        "Nigerian",
        "house",
        "property",
        "work facility",
        "airport",
        "hotels",
        "motels",
        "lodgings",
        "reception area",
        "premises"
      ],
      "keyphrases": [
        
        "ADMINISTRATIVE PAY CORRECTIONS",
        
        "3.3 PERSONNEL DATA CHANGES",
        "OTHER UNLAWFUL HARASSMENT",
        "8 BUSINESS TRAVEL EXPENSES",
        "WORKPLACE VIOLENCE PREVENTION",
        "OFFER",
        "LICENSE",
        "GIVE",
        "DISCLOSE",
        "PRODUCT",
        "OFFENDERS",
        "COURT",
        "June",
        "TABLE",
        "CONTENT",
        "WELCOME",
        "INTRODUCTION",
        "FACILITIES",
        "1.1.5 GOALS",
        "NATURE",
        "2.6 CONFLICTS",
        "INTEREST",
       
        "national origin",
        "other characteristic",
        "job assignment",
        "immediate supervisor",
        "2.4 BUSINESS ETHICS",
        "fair dealing",
        "careful observance",
        "applicable laws",
        "scrupulous regard",
        "highest standards",
        "continued success",
        "relevant laws",
        "illegal, dishonest",
        "good judgment",
        "proper course",
        "same area",
        "serious conflicts",
        "personal conflicts",
        "working relationships",
        "ethical conduct",
        "disciplinary action",
        "continued trust",
        "dating relationship",
        "sexual relationship",
        "unlawful discrimination",
        "personal integrity",
        "employee morale",
        "acceptable conduct",
        "possible termination",
        "employment opportunities",
        "customers' trust",
        "race",
        "color",
        "religion",
        "age",
        "aspects",
        "selection",
        "access",
        "training",
        "questions",
        "concerns",
        "type",
        "workplace",
        "issues",
        "Employees",
        "reports",
        "fear",
        "reprisal",
        "Anyone",
        "reputation",
        "excellence",
        "spirit",
        "letter",
        "regulations",
        "duty",
        "shareholders",
        "way",
        "confidence",
        "directors",
        "officers",
        "accordance",
        "intent",
        "use",
        "respect",
        "lines",
        "situation",
        "matter",
        "advice",
        "consultation",
        "Compliance",
        "relatives",
        "organization",
        "problems",
        "favoritism",
        "addition",
        "claims",
        "partiality",
        "treatment",
        "day",
        "purposes",
        "blood",
        "persons",
        "formation",
        "public workplace displays",
        "specific executive-level approval",
        "excessive personal conversation",
        "special fringe benefits",
        "Proprietary production processes",
        "Labor relations strategies",
        "unusual price breaks",
        "Such confidential information",
        "New materials research",
        "close personal relationship",
        "confidential business information",
        "Infinion business dealings",
        "personal gain",
        "special consideration",
        "Computer processes",
        "development strategies",
        "Marketing strategies",
        "unusual gains",
        "Financial information",
        "sexual orientation",
        "potential problems",
        "prompt action",
        "general direction",
        "acceptable standards",
        "executive level",
        "product bonuses",
        "other windfalls",
        "Promotional plans",
        "potential conflict",
        "significant ownership",
        "substantial gift",
        "trade secrets",
        "following examples",
        "Compensation data",
        "Pending projects",
        "Computer programs",
        "Customer lists",
        "Scientific data",
        "Customer preferences",
        "Scientific formulae",
        "Scientific prototypes",
        "Technological data",
        "Technological prototypes",
        "legal action",
        "outside firms",
        "mere existence",
        "non-disclosure agreement",
        "current employees",
        "2.7 NON-DISCLOSURE",
        "regard",
        "gender",
        "prohibition",
        "situations",
        "relationships",
        "case",
        "actual",
        "reassignment",
        "affection",
        "obligation",
        "guidelines",
        "framework",
        "purpose",
        "clarification",
        "subject",
        "operation",
        "Transactions",
        "destination",
        "conduct",
        "premises",
        "Computers"
      ],
      "language": "en",
      "merged_content": "\t\n\t\n\t\n\n\n\n TECHNOLOGIES LIMITED                                              \t\t       \t\t              Employee Handbook \n\n\n\n\nEMPLOYEE HANDBOOK\n\n\n\n\n\n\n\nUNDERSTANDING ALL EMPLOYMENT \nISSUES AT  technologies limited\n\n\n\n\n\nPresented to:\n\nClick or tap here to enter text.\n\n\n\n\n          © Copyright Envision Corporation. 2002. All rights reserved. Protected by the copyright laws of the United States & Canada and by international treaties. IT IS ILLEGAL AND STRICTLY PROHIBITED TO DISTRIBUTE, PUBLISH, OFFER FOR SALE, LICENSE OR SUBLICENSE, GIVE OR DISCLOSE TO ANY OTHER PARTY, THIS PRODUCT appearance. You should dress and",
      "text": [
        "Infini n",
        ""
      ],
      "layoutText": [
        "{\"language\":\"en\",\"text\":\"Infini n\",\"lines\":[{\"boundingBox\":[{\"x\":3,\"y\":42},{\"x\":247,\"y\":42},{\"x\":244,\"y\":125},{\"x\":9,\"y\":127}],\"text\":\"Infini\"},{\"boundingBox\":[{\"x\":326,\"y\":67},{\"x\":377,\"y\":66},{\"x\":376,\"y\":120},{\"x\":326,\"y\":120}],\"text\":\"n\"}],\"words\":[{\"boundingBox\":[{\"x\":3,\"y\":42},{\"x\":246,\"y\":42},{\"x\":247,\"y\":126},{\"x\":3,\"y\":127}],\"text\":\"Infini\"},{\"boundingBox\":[{\"x\":326,\"y\":66},{\"x\":356,\"y\":66},{\"x\":357,\"y\":120},{\"x\":326,\"y\":120}],\"text\":\"n\"}]}",
        "{\"language\":\"en\",\"text\":\"\",\"lines\":[],\"words\":[]}"
      ],
      "mypages": []
    }
  ]
}

我希望 mypages 字段也包含一些内容,但它是空的

这是我的indexer.json

{
  "@odata.context": "https://msserv.search.windows.net/$metadata#indexers/$entity",
  "@odata.etag": "\"0x8DC9AAD5237413E\"",
  "name": "my-indexer",
  "description": "",
  "dataSourceName": "my-data",
  "skillsetName": "my-skillset",
  "targetIndexName": "my-index",
  "disabled": null,
  "schedule": null,
  "parameters": {
    "batchSize": null,
    "maxFailedItems": 0,
    "maxFailedItemsPerBatch": 0,
    "base64EncodeKeys": null,
    "configuration": {
      "dataToExtract": "contentAndMetadata",
      "parsingMode": "default",
      "imageAction": "generateNormalizedImages"
    }
  },
  "fieldMappings": [
    {
      "sourceFieldName": "metadata_storage_path",
      "targetFieldName": "metadata_storage_path",
      "mappingFunction": {
        "name": "base64Encode",
        "parameters": null
      }
    }
  ],
  "outputFieldMappings": [
    {
      "sourceFieldName": "/document/merged_content/organizations",
      "targetFieldName": "organizations"
    },
    {
      "sourceFieldName": "/document/merged_content/locations",
      "targetFieldName": "locations"
    },
    {
      "sourceFieldName": "/document/merged_content/keyphrases",
      "targetFieldName": "keyphrases"
    },
    {
      "sourceFieldName": "/document/language",
      "targetFieldName": "language"
    },
    {
      "sourceFieldName": "/document/merged_content",
      "targetFieldName": "merged_content"
    },
    {
      "sourceFieldName": "/document/normalized_images/*/text",
      "targetFieldName": "text"
    },
    {
      "sourceFieldName": "/document/normalized_images/*/layoutText",
      "targetFieldName": "layoutText"
    },
    {
      "sourceFieldName": "/document/merged_content/textItems",
      "targetFieldName": "mypages"
    }
  ],
  "cache": null,
  "encryptionKey": null
}
azure search indexing azure-cognitive-search azure-ai-search
1个回答
0
投票

使用索引器映射索引后,索引 JSON 中该技能组的输出字段显示为空,即 "mypages": []

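mypages 字段为空的问题只是由 indexer.json 中的 fieldMappings 和 outputFieldMappings 造成的。Split Skill 的输出 textItems 指定的 targetName 是 mypages,而该技能的 context 是 /document/merged_content,因此它在富集树中的路径是 /document/merged_content/mypages;outputFieldMappings 中对应的 sourceFieldName 应使用这个路径,而不是 /document/merged_content/textItems。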

技能集 JSON 和索引器 JSON 看起来基本正确。将索引映射到索引器后,重新启动并再次运行它。

enter image description here

正如您在上面的索引器中看到的,我已按照这篇 Microsoft 文档的说明进行了更新。

技能执行:

2024-07-04T12:00:00Z    Info    Starting indexer 'Testindexer01'...
2024-07-04T12:00:00Z    Info    Fetching documents from data source 'my-data'...
2024-07-04T12:00:05Z    Info    Document fetched: '/path/to/document1.pdf'
2024-07-04T12:00:05Z    Info    Document fetched: '/path/to/document2.pdf'
2024-07-04T12:00:10Z    Info    Executing skill '#5' (OCR Skill) on '/path/to/document1.pdf'
2024-07-04T12:00:15Z    Info    OCR Skill completed for '/path/to/document1.pdf'. Extracted text: 'Sample extracted text from document1...'
2024-07-04T12:00:15Z    Info    Executing skill '#4' (Merge Skill) on '/path/to/document1.pdf'
2024-07-04T12:00:16Z    Info    Merge Skill completed for '/path/to/document1.pdf'. Merged content: 'Merged content of document1...'
2024-07-04T12:00:16Z    Info    Executing skill '#3' (Language Detection Skill) on '/path/to/document1.pdf'
2024-07-04T12:00:17Z    Info    Language Detection Skill completed for '/path/to/document1.pdf'. Detected language: 'en'
2024-07-04T12:00:17Z    Info    Executing skill '#2' (Key Phrase Extraction Skill) on '/path/to/document1.pdf'
2024-07-04T12:00:18Z    Info    Key Phrase Extraction Skill completed for '/path/to/document1.pdf'. Key phrases: ['keyphrase1', 'keyphrase2']
2024-07-04T12:00:18Z    Info    Executing skill '#1' (Entity Recognition Skill) on '/path/to/document1.pdf'
2024-07-04T12:00:20Z    Info    Entity Recognition Skill completed for '/path/to/document1.pdf'. Entities: ['Organization1', 'Location1']
2024-07-04T12:00:20Z    Info    Executing skill 'Split Skill' on '/path/to/document1.pdf'
2024-07-04T12:00:21Z    Warning TextSplitSkill: Input text is empty for '/path/to/document1.pdf'. Skipping skill.
2024-07-04T12:00:22Z    Info    Indexing document '/path/to/document1.pdf' into index 'my-index'.
2024-07-04T12:00:22Z    Info    Document '/path/to/document1.pdf' indexed successfully.
2024-07-04T12:00:25Z    Info    Executing skill '#5' (OCR Skill) on '/path/to/document2.pdf'
2024-07-04T12:00:30Z    Info    OCR Skill completed for '/path/to/document2.pdf'. Extracted text: 'Sample extracted text from document2...'
2024-07-04T12:00:30Z    Info    Executing skill '#4' (Merge Skill) on '/path/to/document2.pdf'
2024-07-04T12:00:31Z    Info    Merge Skill completed for '/path/to/document2.pdf'. Merged content: 'Merged content of document2...'
2024-07-04T12:00:31Z    Info    Executing skill '#3' (Language Detection Skill) on '/path/to/document2.pdf'
2024-07-04T12:00:32Z    Info    Language Detection Skill completed for '/path/to/document2.pdf'. Detected language: 'en'
2024-07-04T12:00:32Z    Info    Executing skill '#2' (Key Phrase Extraction Skill) on '/path/to/document2.pdf'
2024-07-04T12:00:33Z    Info    Key Phrase Extraction Skill completed for '/path/to/document2.pdf'. Key phrases: ['keyphrase1', 'keyphrase2']
2024-07-04T12:00:33Z    Info    Executing skill '#1' (Entity Recognition Skill) on '/path/to/document2.pdf'
2024-07-04T12:00:35Z    Info    Entity Recognition Skill completed for '/path/to/document2.pdf'. Entities: ['Organization1', 'Location1']
2024-07-04T12:00:35Z    Info    Executing skill 'Split Skill' on '/path/to/document2.pdf'
2024-07-04T12:00:36Z    Info    TextSplitSkill completed for '/path/to/document2.pdf'. Pages: ['page1', 'page2']
2024-07-04T12:00:40Z    Info    Indexing document '/path/to/document2.pdf' into index 'my-index'.
2024-07-04T12:00:40Z    Info    Document '/path/to/document2.pdf' indexed successfully.
2024-07-04T12:00:45Z    Info    Indexer 'Testindexer01' completed. Total documents processed: 2. Documents indexed successfully: 2.
2024-07-04T12:00:45Z    Info    Indexer run finished successfully.
© www.soinside.com 2019 - 2024. All rights reserved.