Error while searching:- [{'code': 3, 'summary': 'Illegal query', 'message': "Could not set 'ranking.features.query(query_embedding)' to 'embed(e5, What is the Goodwill as of December 2023 in SUBSIDIARIES CONSOLIDATED BALANCE SHEETS?)': Multiple embedders are provided but no embedder id is given. Valid embedders are colbert,e5"}]
这个问题只在检索时出现:当查询涉及嵌入时,查询字符串无法被嵌入。
不过我用一个简单的查询检查了数据中是否确实存在嵌入,结果确实存在。
不知道为什么在索引文档时能够识别嵌入器,而在查询时却无法识别。
映射(schema 定义):
# Application package holding the schema, the rank profiles and the embedder components.
self.app_package = ApplicationPackage(name=self.app_name)
# Streaming mode is left disabled:
# self.app_package.schema.mode = "streaming"

# Names of the metadata attributes declared on the schema below (same order).
self.meta_variables = [
    'doc_id',
    'document_name',
    'type',
    'reportedTime',
    'period',
    'IsNro',
    'pageNumber',
    'language',
    'company_ID',
    'company_name',
    'company_ticker',
    'company_countryCode',
    'company_quantum',
    'company_currency',
    'company_fiscalYear',
    'company_fyAdjustment',
]


def _metadata_field(field_name, field_type, match_mode=None):
    """Build an attribute+summary field; a match setting is passed only when given."""
    extra = {} if match_mode is None else {"match": [match_mode]}
    return Field(
        name=field_name,
        type=field_type,
        indexing=["attribute", "summary"],
        **extra,
    )


# Raw text, indexed with BM25 enabled.
text_field = Field(
    name="text",
    type="string",
    indexing=["index", "summary"],
    index="enable-bm25",
)

# Synthetic (non-document) field: `text` run through the `e5` embedder at indexing time.
dense_field = Field(
    name="embedding",
    type="tensor<float>(x[1024])",
    indexing=["input text", "embed e5", "attribute", "summary", "index"],
    attribute=["distance-metric: angular"],
    is_document_field=False,
)

# Synthetic (non-document) field: `text` run through the `colbert` embedder at indexing time.
colbert_field = Field(
    name="colbert",
    type="tensor<float>(dt{}, x[128])",
    indexing=["input text", "embed colbert", "attribute", "summary", "index"],
    attribute=["distance-metric: angular"],
    is_document_field=False,
)

# (name, type, match mode) for every metadata attribute, in schema order.
metadata_specs = [
    ("doc_id", "int", None),
    ("document_name", "string", "word"),
    ("type", "string", "exact"),
    ("reportedTime", "string", "word"),
    ("period", "string", "exact"),
    ("IsNro", "bool", None),
    ("pageNumber", "int", None),
    ("language", "string", "exact"),
    ("company_ID", "int", None),
    ("company_name", "string", "exact"),
    ("company_ticker", "string", "exact"),
    ("company_countryCode", "string", "exact"),
    ("company_quantum", "string", "exact"),
    ("company_currency", "string", "exact"),
    ("company_fiscalYear", "string", "exact"),
    ("company_fyAdjustment", "bool", "exact"),
]
metadata_fields = [_metadata_field(*spec) for spec in metadata_specs]

schema = self.app_package.schema
schema.add_fields(text_field, dense_field, colbert_field, *metadata_fields)

# "default" profile: rank by closeness between the query embedding and `embedding`.
default_profile = RankProfile(
    name="default",
    first_phase="closeness(field, embedding)",
    inputs=[("query(query_embedding)", "tensor<float>(x[1024])")],
)
schema.add_rank_profile(default_profile)

# Rank functions used by the "combined_ranking" profile.
unpack_fn = Function(
    name="unpack",
    expression="cell_cast(attribute(colbert), float)",
)
cos_sim_fn = Function(
    name="cos_sim",
    expression="cosine_similarity(query(query_embedding), attribute(embedding),x)",
)
# For each query token: dot product over x, max over document tokens (dt);
# the per-token maxima are then summed over qt and divided by 32.0.
max_sim_fn = Function(
    name="max_sim",
    expression="""sum(
reduce(
sum(
query(qt) * attribute(colbert) , x
),
max, dt
),
qt
)/32.0
""",
)

# "combined_ranking": cosine similarity first, then a weighted blend of
# BM25, cosine similarity and max_sim over the top 10 hits.
# Alternative kept for reference:
# global_phase=GlobalPhaseRanking(expression="0.05 * bm25(text) + 0.25 * cos_sim + 0.7 * max_sim"),
blended_rerank = SecondPhaseRanking(
    expression="0.05 * bm25(text) + 0.15 * cos_sim + 0.8 * max_sim",
    rerank_count=10,
)
combined_profile = RankProfile(
    name="combined_ranking",
    first_phase="cos_sim",
    second_phase=blended_rerank,
    functions=[unpack_fn, cos_sim_fn, max_sim_fn],
    inputs=[
        ("query(query_embedding)", "tensor<float>(x[1024])"),
        ("query(qt)", "tensor<float>(qt{}, x[128])"),
    ],
    match_features=["max_sim", "cos_sim", "bm25(text)"],
)
schema.add_rank_profile(combined_profile)

# Two embedder components are registered: `colbert` and `e5`.
colbert_component = Component(
    id="colbert",
    type="colbert-embedder",
    parameters=[
        Parameter("transformer-model", {"url": "https://huggingface.co/mixedbread-ai/mxbai-colbert-large-v1/resolve/main/onnx/model.onnx?download=true"}),
        Parameter("tokenizer-model", {"url": "https://huggingface.co/mixedbread-ai/mxbai-colbert-Large-v1/raw/main/tokenizer.json"}),
    ],
)
# The component id is `e5`, but the model URLs point at BAAI/bge-large-en-v1.5.
e5_component = Component(
    id="e5",
    type="hugging-face-embedder",
    parameters=[
        Parameter("transformer-model", {"url": "https://huggingface.co/BAAI/bge-large-en-v1.5/resolve/main/onnx/model.onnx?download=true"}),
        Parameter("tokenizer-model", {"url": "https://huggingface.co/BAAI/bge-large-en-v1.5/resolve/main/tokenizer.json"}),
    ],
)
self.app_package.components = [colbert_component, e5_component]
出现此错误的查询代码:
def get_top_para_finance(self, query: str, doc_id: int):
    """Run a nearest-neighbor search for ``query`` restricted to one document.

    Args:
        query: Free-text question; sent as the query string and interpolated
            into the two ``embed(...)`` ranking inputs.
        doc_id: Value the ``doc_id`` field must equal in the YQL filter.

    Returns:
        A tuple ``(hits, seconds)``: the value returned by
        ``self.display_hits_as_df`` for the response, and the elapsed search
        time rounded to two decimals.

    Raises:
        AssertionError: If the response does not report success.
    """
    # print(self.vespa_app.application_package.get_model(model_id='colbert'))
    with self.vespa_app.syncio(connections=12) as session:
        search_started = time.time()
        print(f"Got the Query:- {query}")

        # The client-side embedding call is disabled, so this interval only
        # spans the two clock reads:
        # embeddings = self.vespa_obj.embedding_function.embed_query(query)
        embed_started = time.time()
        embed_elapsed = round(time.time() - embed_started, 2)
        print(f"Time to get the Embeddings:- {embed_elapsed}s")

        yql = (
            "select * from sources * where {targetHits: 10}nearestNeighbor(embedding, query_embedding) and doc_id = "
            + f"{doc_id}"
        )
        # NOTE: the query text is interpolated into embed(...) without
        # surrounding quotes. The discussion around this snippet identifies
        # that as the trigger for the "Illegal query" response.
        ranking_inputs = {
            "input.query(qt)": f"embed(colbert, {query})",
            "input.query(query_embedding)": f"embed(e5, {query})",
        }
        response = self.vespa_app.query(
            yql=yql,
            query=query,
            ranking="default",
            body=ranking_inputs,
            hits=1,
            # timeout = "1ms"
        )
        assert response.is_successfull()

        total_time = round(time.time() - search_started, 2)
        print(f"Search time:- {total_time}s")

        summary_columns = self.vespa_obj.meta_variables + ['text']
        return self.display_hits_as_df(response, summary_columns), total_time
您需要给要嵌入的文本加上引号:
body={
    # The interpolated text sits inside double quotes within each embed(...) call.
    "input.query(qt)": f'embed(colbert, "{query}")',
    "input.query(query_embedding)": f'embed(e5, "{query}")',
},
这里的错误消息确实应该更清楚一些——我会修复这一点。