"""Load pre-embedded book passages from JSON files into a Chroma collection.

Every ``.json`` file in ``json_dir`` is read as a list of objects. Each
object must provide ``text``, ``embedding`` and ``entry_number``; the
metadata keys ``autor``, ``buch``, ``abschnitt`` and ``titel`` are optional.
"""

import json
import os

import chromadb

# Initialize a persistent Chroma client
client = chromadb.PersistentClient(path="/home/johannes/Desktop/proj/Datenbank/chroma")

# Create or retrieve a collection for the books
collection = client.get_or_create_collection(name="phil_en", metadata={"hnsw:space": "cosine"})

# Directory containing the JSON files with pre-computed embeddings
json_dir = "/home/johannes/Desktop/proj/Datenbank/bücher/en/verarbeitet/ready_for_chroma/"

# Upper bound on the number of entries sent per collection.add() call.
# NOTE(review): Chroma enforces a maximum batch size whose exact value depends
# on the installed version and SQLite build; 5000 is a conservative choice --
# confirm against the deployment and lower it if add() still complains.
BATCH_SIZE = 5000


def get_metadata(entry, key):
    """Return ``entry[key]``, or ``"Unknown"`` if it is missing or ``None``.

    Args:
        entry: One parsed JSON object (a dict) describing a passage.
        key: Name of the metadata field to read.

    Returns:
        The stored value, or the string ``"Unknown"`` as a placeholder.
    """
    # Single lookup; presumably None is replaced because Chroma does not
    # accept None as a metadata value -- verify against the Chroma version.
    value = entry.get(key)
    return "Unknown" if value is None else value


def load_json_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def _add_file_to_collection(file_path):
    """Add every entry of one JSON file to ``collection``.

    All entries are validated and converted before anything is written, so a
    malformed entry (e.g. a missing ``text`` key) aborts the file before the
    first ``add()`` call.

    Args:
        file_path: Path of the JSON file to ingest.

    Returns:
        The number of entries handed to the collection (0 for an empty file).

    Raises:
        KeyError: If an entry lacks ``text``, ``embedding`` or ``entry_number``.
        Exception: Whatever ``json.load`` or ``collection.add`` raise.
    """
    data = load_json_data(file_path)

    documents = []
    embeddings = []
    metadatas = []
    ids = []

    # Extract entry information and embeddings from each object in the JSON file
    for entry in data:
        documents.append(entry['text'])
        # Embeddings are expected under the key 'embedding'
        embeddings.append(entry['embedding'])
        metadatas.append({
            'author': get_metadata(entry, 'autor'),
            'book': get_metadata(entry, 'buch'),
            'section': get_metadata(entry, 'abschnitt'),
            'title': get_metadata(entry, 'titel'),
        })
        # The entry number serves as the ID; every JSON object must have one
        ids.append(str(entry['entry_number']))

    # Add the entries to the collection with pre-computed embeddings, in
    # bounded batches so a large file does not exceed Chroma's batch limit.
    # An empty file yields no batches, so add() is never called with empty lists.
    for start in range(0, len(ids), BATCH_SIZE):
        stop = start + BATCH_SIZE
        collection.add(
            documents=documents[start:stop],
            embeddings=embeddings[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )

    return len(ids)


# Get all JSON files in the directory (sorted so runs are reproducible)
json_files = sorted(
    os.path.join(json_dir, file)
    for file in os.listdir(json_dir)
    if file.endswith('.json')
)

# Loop through each file, read the data, and add it to the collection.
# Ingestion is best-effort per file: one bad file must not stop the others.
for file_path in json_files:
    try:
        added = _add_file_to_collection(file_path)
    except Exception as e:
        print(f"Failed to process {file_path}: {e}")
    else:
        if added:
            print(f"Added {added} documents from {os.path.basename(file_path)}")
        else:
            print(f"Skipped {os.path.basename(file_path)}: no entries")