# sorhwphuo / chroma / chroma_setup.py
# HonestAnnie's picture
# extended database
# 10e1fa1
import os
import json
import chromadb
# Open the on-disk Chroma database stored under the given path.
client = chromadb.PersistentClient(
    path="/home/johannes/Desktop/proj/Datenbank/chroma",
)
# Fetch the "phil_en" collection, creating it if it does not exist yet.
# The "hnsw:space" metadata entry requests cosine distance for the index.
collection = client.get_or_create_collection(
    name="phil_en",
    metadata={"hnsw:space": "cosine"},
)
def get_metadata(entry, key):
    """Return ``entry[key]``, or ``"Unknown"`` if it is missing or ``None``.

    Args:
        entry: Mapping (anything with a ``.get`` method) holding the raw
            fields of one JSON entry.
        key: Name of the field to read.

    Returns:
        The stored value unchanged when it is not ``None`` (falsy values such
        as ``0`` or ``""`` are kept), otherwise the string ``"Unknown"``.
    """
    # Look the key up once; .get() yields None for absent keys as well, so
    # a missing key and an explicit null are treated the same way.
    value = entry.get(key)
    return "Unknown" if value is None else value
# Directory holding the JSON files to ingest; each file is expected to
# contain entries with pre-computed embeddings.
json_dir = "/home/johannes/Desktop/proj/Datenbank/bücher/en/verarbeitet/ready_for_chroma/"
def load_json_data(file_path):
    """Read a UTF-8 encoded JSON file and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (list, dict, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the file even when parsing fails.
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
# Collect the full paths of all '.json' files directly inside json_dir
# (sub-directories are not searched). os.listdir returns names in arbitrary
# order, so the paths are sorted to make the ingestion order reproducible.
json_files = sorted(
    os.path.join(json_dir, name)
    for name in os.listdir(json_dir)
    if name.endswith('.json')
)
# Ingest every JSON file into the collection with one `collection.add` call
# per file. Any error while handling a file is reported and the loop moves on
# to the next file, so a single bad file does not abort the whole run.
for file_path in json_files:
    try:
        data = load_json_data(file_path)

        documents = []
        embeddings = []
        metadatas = []
        ids = []

        # Each JSON object must provide 'text', 'embedding' and
        # 'entry_number'; a missing key raises KeyError and fails the file.
        for entry in data:
            documents.append(entry['text'])
            # Embeddings are pre-computed and stored under 'embedding'.
            embeddings.append(entry['embedding'])
            # Optional descriptive fields use German keys in the input and
            # fall back to "Unknown" when absent or null.
            metadatas.append({
                'author': get_metadata(entry, 'autor'),
                'book': get_metadata(entry, 'buch'),
                'section': get_metadata(entry, 'abschnitt'),
                'title': get_metadata(entry, 'titel'),
            })
            # The ID is the entry number rendered as a string.
            # NOTE(review): IDs are only unique in the collection if entry
            # numbers are unique across all input files - confirm.
            ids.append(f"{entry['entry_number']}")

        # An empty file has nothing to add; report it as skipped instead of
        # sending empty lists to the collection and logging a failure.
        if not documents:
            print(f"Skipped {os.path.basename(file_path)}: no entries found")
            continue

        # Add the entries to the collection with their pre-computed embeddings.
        collection.add(
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
        )
        print(f"Added {len(documents)} documents from {os.path.basename(file_path)}")
    except Exception as e:
        # Deliberately broad: this is the top-level boundary of a batch job.
        print(f"Failed to process {file_path}: {e}")