def store_embeddings_in_astradb(embeddings,text_chunks, metadata):
vstore = AstraDBVectorStore(
collection_name="test",
embedding=embedding_model,
token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
)
print("after Vstore")
# Create documents with page content, embeddings, and metadata
documents = [
{
"page_content": chunk,
"metadata": metadata
}
for chunk in text_chunks
]
for doc in documents:
print(f"Document structure: {doc}")
print("after documents")
# Add documents to AstraDB vector store
inserted_ids = vstore.add_documents(documents)
return inserted_ids
# List of PDF files to process
pdf_files = ["WhatYouNeedToKnowAboutWOMENSHEALTH.pdf", "Womens-Health-Book.pdf"]
# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Process each PDF file
for pdf_file in pdf_files:
if not os.path.isfile(pdf_file):
raise ValueError(f"PDF file '{pdf_file}' not found.")
print(f"Processing file: {pdf_file}")
# Extract text from PDF
text = extract_text_from_pdf(pdf_file)
# Split text into chunks
text_chunks = split_text_into_chunks(text)
# Embed text chunks
embeddings = embed_text_chunks(text_chunks, embedding_model)
# Extract metadata
metadata = extract_metadata(pdf_file)
# Store embeddings in AstraDB
try:
inserted_ids = store_embeddings_in_astradb(embeddings,text_chunks, metadata)
print(f"Inserted {len(inserted_ids)} embeddings from '{pdf_file}' into AstraDB.")
except Exception as e:
print(f"Failed to insert embeddings for '{pdf_file}': {e}")
这是我用来将文本块转换为嵌入然后将它们存储在 AstraDB 中的代码。在插入时我收到错误
'dict' object has no attribute 'page_content'
。怎么解决?
我正在努力理解你的代码,但我怀疑问题是变量范围不正确。如果您包含最小的代码示例以及复制问题的步骤,我很乐意帮助您解决问题。
作为旁注,我建议不要在函数中创建
AstraDBVectorStore
对象,因为这是没有必要的。您应该只实例化一次并在应用程序的整个生命周期内共享它。
此外,当您调用
AstraDBVectorStore.add_documents()
时,它会自动为每个文档生成嵌入,然后将其存储在 Astra DB 中,因此无需多次调用 embed_text_chunks()
。事实上,我看不到 embeddings
变量在任何地方使用。干杯!