Azure 新手。尝试按照此处的代码为 Azure AI Search 创建嵌入:
在 Jupyter 笔记本上,我的 Python 代码如下
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
# Build a bearer-token provider from the default Azure credential chain.
openai_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    openai_credential,
    "https://cognitiveservices.azure.com/.default",
)
output_path = os.path.join('output', 'docVectors.json')

# The token provider is only handed to the client when no API key is set.
client = AzureOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    azure_deployment=azure_openai_embedding_deployment,
    azure_ad_token_provider=None if azure_openai_key else token_provider,
)
# Generate Document Embeddings using OpenAI Ada 002
# Read the input records from data/worthiness_with_result.json
path = os.path.join('data', 'worthiness_with_result.json')
with open(path, 'r', encoding='utf-8') as file:
    input_data = json.load(file)
def filter_empty_strings(data):
    """Return the truthy items of ``data``, in their original order.

    Despite the name, no type check is performed: every falsy value
    (``""``, ``None``, ``0``, ``0.0``, empty containers) is dropped, and
    any truthy non-string value such as a non-zero number is kept.
    """
    return [item for item in data if item]
results = filter_empty_strings([item['result'] for item in input_data])
result_response = client.embeddings.create(
    input=results,
    model=azure_openai_embedding_model,
    dimensions=azure_openai_embedding_dimensions,
)
# The response is an object, not a dict: read `.data` and `.embedding`
# as attributes instead of subscripting.
result_embeddings = [item.embedding for item in result_response.data]
.....
# Attach each embedding to its input record under 'resultVector'.
# NOTE(review): result_embeddings is built from the filtered `results` list,
# so index i lines up with input_data[i] only if nothing was filtered out
# -- confirm.
for i, item in enumerate(input_data):
    item['resultVector'] = result_embeddings[i]
....
# Output embeddings to docVectors.json file
output_directory = os.path.dirname(output_path)
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(output_directory, exist_ok=True)
with open(output_path, "w") as f:
    json.dump(input_data, f)
其中:
azure_openai_embedding_deployment= embedding_test
azure_openai_embedding_model= text-embedding-ada-002
azure_openai_api_version= 2023-03-15-preview
使用上面的代码,我收到错误:
BadRequestError: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
错误详情:
BadRequestError Traceback (most recent call last)
Cell In[44], line 28
25 return [item for item in data if item]
27 results = filter_empty_strings([item['result'] for item in input_data])
---> 28 result_response = client.embeddings.create(input=results, model=azure_openai_embedding_model, dimensions=azure_openai_embedding_dimensions)
File /opt/homebrew/anaconda3/lib/python3.12/site-packages/openai/resources/embeddings.py:114, in Embeddings.create(self, input, model, dimensions, encoding_format, user, extra_headers, extra_query, extra_body, timeout)
108 embedding.embedding = np.frombuffer( # type: ignore[no-untyped-call]
109 base64.b64decode(data), dtype="float32"
110 ).tolist()
112 return obj
--> 114 return self._post(
115 "/embeddings",
116 body=maybe_transform(params, embedding_create_params.EmbeddingCreateParams),
117 options=make_request_options(
118 extra_headers=extra_headers,
119 extra_query=extra_query,
120 extra_body=extra_body,
121 timeout=timeout,
122 post_parser=parser,
123 ),
124 cast_to=CreateEmbeddingResponse,
125 )
File /opt/homebrew/anaconda3/lib/python3.12/site-packages/openai/_base_client.py:1260, in SyncAPIClient.post(self, path, cast_to, body, options, files, stream, stream_cls)
1246 def post(
1247 self,
1248 path: str,
(...)
1255 stream_cls: type[_StreamT] | None = None,
1256 ) -> ResponseT | _StreamT:
1257 opts = FinalRequestOptions.construct(
1258 method="post", url=path, json_data=body, files=to_httpx_files(files), **options
1259 )
-> 1260 return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File /opt/homebrew/anaconda3/lib/python3.12/site-packages/openai/_base_client.py:937, in SyncAPIClient.request(self, cast_to, options, remaining_retries, stream, stream_cls)
928 def request(
929 self,
930 cast_to: Type[ResponseT],
(...)
935 stream_cls: type[_StreamT] | None = None,
936 ) -> ResponseT | _StreamT:
--> 937 return self._request(
938 cast_to=cast_to,
939 options=options,
940 stream=stream,
941 stream_cls=stream_cls,
942 remaining_retries=remaining_retries,
943 )
File /opt/homebrew/anaconda3/lib/python3.12/site-packages/openai/_base_client.py:1041, in SyncAPIClient._request(self, cast_to, options, remaining_retries, stream, stream_cls)
1038 err.response.read()
1040 log.debug("Re-raising status error")
-> 1041 raise self._make_status_error_from_response(err.response) from None
1043 return self._process_response(
1044 cast_to=cast_to,
1045 options=options,
(...)
1049 retries_taken=options.get_max_retries(self.max_retries) - retries,
1050 )
BadRequestError: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
我很难理解这个错误是什么。有线索吗?
如果我打印 results,输出如下:
[-5759.12, -2997.68, 3498.0748000000003, 4368.7612, 913.6112, 24922.718, 16120.722, 1560.622, 19102.6684, 7897.2512, 4.1262958737372343e-13, 41062.5401, 20809.3396, 861.7709, 5764.752, 2237.2804, 18778.1712, 779.211, 15279.093200000001, 5373.621, 2065.7306, 50715.734300000004, 2330.3348, 1066.9288, 1518.1474, 9842.0912, 7679.6024, 12653.202, 7892.9774, 805.6532, 8786.36, 26726.585600000002, 198.6692, 32071.5668, 203005.66, 17249.1272, 3572.5712000000003 ...... ]
错误信息表明输入无效:您传入的 results 输入
[-5759.12, -2997.68, 3498.0748000000003, 4368.7612, 913.6112, 24922.718, 16120.722, 1560.622, 19102.6684, 7897.2512, 4.1262958737372343e-13, 41062.5401, 20809.3396, 861.7709, 5764.752, 2237.2804, 18778.1712, 779.211, 15279.093200000001, 5373.621, 2065.7306, 50715.734300000004, 2330.3348, 1066.9288, 1518.1474, 9842.0912, 7679.6024, 12653.202, 7892.9774, 805.6532, 8786.36, 26726.585600000002, 198.6692, 32071.5668, 203005.66, 17249.1272, 3572.5712000000003 ...... ]
是数值列表,而不是嵌入模型所需的文本输入。
您需要传递文本输入列表来生成嵌入。
以下是示例。
# Embedding input must be text: a list of strings.
results = ["Hi how are you?"]
result_response = client.embeddings.create(
    model="text-embedding-ada-002",
    input=results,
)
# Collect the embedding vector from each returned item.
result_embeddings = [entry.embedding for entry in result_response.data]
result_embeddings
输出:
[[-0.014039497, 0.009754552, 0.0011114076, -0.04075739, -0.026491044, 0.022949664, -0.012149081, -0.012817028, -0.011304694, -0.013358947, 0.01926965, ....]]
因此,请修改您的 filter_empty_strings 函数,使其在过滤空项的同时检查每一项是否为字符串:
def filter_empty_strings(data):
    """Return only the non-empty ``str`` items of ``data``, in order.

    Non-string values (numbers, ``None``, ...) are dropped along with
    empty strings, so the result is a list of non-empty strings.
    """
    return [item for item in data if isinstance(item, str) and item]