I have a Problem with following Code. The function should create a ChormaDB if it's not already existing, or retrieve it if it does exist. Then it should create an entry in a given collection / create said collection an add the entry.
import os
from sentence_transformers import SentenceTransformer
import chromadb
import config
def create_chroma_db(chromadb_path: str, chunk_folder: str, collection_name):
if chromadb_path and os.path.isdir(chromadb_path):
db_path = chromadb_path
print(f"Bestehende ChromaDB wird geladen: {db_path}")
else:
db_path = "./chroma_db/new_db"
if not os.path.exists(db_path):
os.makedirs(db_path)
print(f"Created directory: {db_path}")
else:
print(f"Directory exists: {db_path}")
# ChromaDB Client initialisieren
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(name=collection_name)
for filename in os.listdir(chunk_folder):
if filename.endswith(".txt"):
file_path = os.path.join(chunk_folder, filename)
with open(file_path, "r", encoding="utf-8") as file:
chunk_text = file.read().strip()
if chunk_text:
embedding = get_vector_embeddings(chunk_text)
# Check if ID exists
existing_ids = [doc["id"] for doc in collection.get(ids=[filename])["metadatas"]] if collection.count() > 0 else []
if filename not in existing_ids:
collection.add(
documents=[chunk_text],
embeddings=[embedding.tolist()],
ids=[filename]
)
print(f"Embedding für {filename} wurde in ChromaDB gespeichert.")
else:
print(f"Embedding für {filename} existiert bereits und wird nicht erneut hinzugefügt.")
def get_vector_embeddings(txt: str):
print("generating vektor embeddings with: " + config.embedding_model)
model = SentenceTransformer(config.embedding_model)
embeddings = model.encode(txt)
return embeddings
My Problem is if I try to retrieve the newly added chunk it doesnt exist.
Here the Code for retrieving. I tested with query by name and with query by embedding:
from utils import vektor_emb_to_chroma
import numpy as np
import chromadb
from chromadb.config import Settings
import os
def get_embeddings(question: str) -> np.ndarray:
embs = vektor_emb_to_chroma.get_vector_embeddings(question)
return embs
def fetch(question: str, k:int, db_path:str, collection_name:str) -> list[str]:
if os.path.isdir(db_path):
client = chromadb.PersistentClient(path=db_path)
collection = client.get_collection(collection_name)
# Create query embedding
embedding = get_embeddings(question)
# Query the Chroma collection
results = collection.query(
query_embeddings=[embedding], # Embedding must be in a list
n_results=k # Number of most similar documents/chunks to return
)
return results["documents"]
else:
print("ERROR: NO CHROMA DB PATH SELECTED")
def test(file):
db_path = "C:/Users/tberend/Documents/GitHub/RAG-ppx-plain_text/RAG-Framework/chroma_db"
collection = "chunk_embeddings"
query_by_chunk_name(db_path,file)
def query_by_chunk_name(chromadb_path: str, chunk_name: str):
# ChromaDB Client initialisieren
client = chromadb.PersistentClient(path=chromadb_path)
collection = client.get_or_create_collection(name="chunk_embeddings")
# Suche nach dem Chunk in der Collection
result = collection.get(ids=[chunk_name])
# Überprüfen, ob der Chunk existiert
if result and "documents" in result and result["documents"]:
print(f"Gefundener Chunk ({chunk_name}): {result['documents'][0]}")
return result['documents'][0] # Rückgabe des gefundenen Chunks
else:
print(f"Kein Chunk mit dem Namen {chunk_name} gefunden.")
return None