最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

database - ChromaDB add chunk to collection - Stack Overflow

programmeradmin8浏览0评论

I have a problem with the following code. The function should create a ChromaDB database if it does not already exist, or load it if it does. It should then get the given collection (creating it if necessary) and add an entry to it.

import os
from sentence_transformers import SentenceTransformer
import chromadb
import config



def create_chroma_db(chromadb_path: str, chunk_folder: str, collection_name):
    """Embed every ``.txt`` chunk in *chunk_folder* and store it in a Chroma collection.

    Args:
        chromadb_path: Directory of an existing persistent ChromaDB. If it is
            empty or is not an existing directory, the database at
            ``./chroma_db/new_db`` (relative to the current working directory)
            is used instead and created when missing. NOTE(review): readers
            must open that same fallback path, otherwise the stored chunks
            will not be found.
        chunk_folder: Directory whose ``.txt`` files are read as chunks.
        collection_name: Name of the collection to get or create.

    Each non-empty file is added with its filename as the record ID. Files
    whose ID is already present in the collection are skipped.
    """
    if chromadb_path and os.path.isdir(chromadb_path):
        db_path = chromadb_path
        print(f"Bestehende ChromaDB wird geladen: {db_path}")
    else:
        db_path = "./chroma_db/new_db"
        if not os.path.exists(db_path):
            os.makedirs(db_path)
            print(f"Created directory: {db_path}")
        else:
            print(f"Directory exists: {db_path}")

    # Initialise the ChromaDB client and open (or create) the collection.
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(name=collection_name)

    for filename in os.listdir(chunk_folder):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(chunk_folder, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            chunk_text = file.read().strip()

        if not chunk_text:
            continue

        # The IDs of matching records are reported under "ids"; "metadatas"
        # holds the user-supplied metadata (none is stored here), so it cannot
        # be used to detect an existing record.
        existing_ids = collection.get(ids=[filename])["ids"]
        if filename in existing_ids:
            print(f"Embedding für {filename} existiert bereits und wird nicht erneut hinzugefügt.")
            continue

        # Only embed chunks that actually need to be stored.
        embedding = get_vector_embeddings(chunk_text)
        collection.add(
            documents=[chunk_text],
            embeddings=[embedding.tolist()],
            ids=[filename],
        )
        print(f"Embedding für {filename} wurde in ChromaDB gespeichert.")

# Loaded SentenceTransformer instances keyed by model name, so the model is
# constructed once per process instead of once per embedded text.
_MODEL_CACHE = {}


def get_vector_embeddings(txt: str):
    """Return the embedding of *txt* produced by the configured model.

    The model name is read from ``config.embedding_model``. The corresponding
    ``SentenceTransformer`` is created on first use and reused afterwards.

    Args:
        txt: Text to embed.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for a single string
        (callers in this file treat it as an object with ``.tolist()``).
    """
    model_name = config.embedding_model
    print("generating vektor embeddings with: " + model_name)

    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _MODEL_CACHE[model_name] = model

    return model.encode(txt)

My problem is that when I try to retrieve the newly added chunk, it doesn't exist.

Here is the code for retrieving. I tested both querying by name and querying by embedding:

from utils import vektor_emb_to_chroma
import numpy as np
import chromadb
from chromadb.config import Settings
import os

def get_embeddings(question: str) -> np.ndarray:
    """Embed *question* by delegating to ``vektor_emb_to_chroma.get_vector_embeddings``."""
    return vektor_emb_to_chroma.get_vector_embeddings(question)

def fetch(question: str, k:int, db_path:str, collection_name:str) -> list[str]:
    """Return the documents of the *k* stored chunks most similar to *question*.

    Args:
        question: Text to embed and search for.
        k: Number of most similar chunks to request.
        db_path: Directory of the persistent ChromaDB to open.
        collection_name: Name of the collection to query. It is opened with
            ``get_collection``, so it is expected to exist already.

    Returns:
        The ``"documents"`` entry of the query result, unchanged. NOTE(review):
        per the Chroma docs this holds one inner list per query embedding, so
        for the single query sent here it is a one-element list of lists
        rather than the flat ``list[str]`` the annotation suggests; confirm
        before relying on it. ``None`` is returned when *db_path* is not an
        existing directory.
    """
    if not os.path.isdir(db_path):
        print("ERROR: NO CHROMA DB PATH SELECTED")
        return None

    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_collection(collection_name)

    # Convert to a plain list of floats, the same form used when the chunks
    # were stored; works whether the embedding is an ndarray or already a list.
    query_vector = np.asarray(get_embeddings(question)).tolist()

    results = collection.query(
        query_embeddings=[query_vector],  # one query embedding, wrapped in a list
        n_results=k,
    )
    return results["documents"]





def test(file):
    """Look up the chunk whose ID is *file* in a fixed local ChromaDB.

    Manual smoke check: the database path is hard-coded and the lookup is
    delegated to ``query_by_chunk_name``, which prints the outcome.
    """
    db_path = "C:/Users/tberend/Documents/GitHub/RAG-ppx-plain_text/RAG-Framework/chroma_db"
    query_by_chunk_name(db_path, file)


def query_by_chunk_name(chromadb_path: str, chunk_name: str, collection_name: str = "chunk_embeddings"):
    """Fetch the stored document whose record ID equals *chunk_name*.

    Args:
        chromadb_path: Directory of the persistent ChromaDB to open.
        chunk_name: Record ID to look up (the writer uses the chunk filename).
        collection_name: Collection to search; defaults to
            ``"chunk_embeddings"``. The collection is opened with
            ``get_or_create_collection``.

    Returns:
        The document text of the matching record, or ``None`` when no record
        with that ID is found.
    """
    # Initialise the ChromaDB client and open the collection.
    client = chromadb.PersistentClient(path=chromadb_path)
    collection = client.get_or_create_collection(name=collection_name)

    # Look the chunk up by its ID.
    result = collection.get(ids=[chunk_name])

    # An empty "documents" list means no record carries that ID.
    if result and "documents" in result and result["documents"]:
        print(f"Gefundener Chunk ({chunk_name}): {result['documents'][0]}")
        return result['documents'][0]

    print(f"Kein Chunk mit dem Namen {chunk_name} gefunden.")
    return None
发布评论

评论列表(0)

  1. 暂无评论