class Chroma:
    def __init__(self) -> None:
        """
        Creates a persistent ChromaDB client and opens the existing collection,
        or creates a new one if the database directory does not exist yet.
        """
        import os

        import chromadb
        from chromadb.utils import embedding_functions

        self.DB_PATH = "./chromadb_linux/"
        self.MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~0.5 GB
        self.COLLECTION_NAME: str = "scheme"
        self.EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.MODEL_NAME
        )

        if os.path.exists(self.DB_PATH):
            # Reuse the on-disk database and its existing collection.
            self.client = chromadb.PersistentClient(path=self.DB_PATH)
            self.schemer = self.client.get_collection(
                name=self.COLLECTION_NAME,
                embedding_function=self.EMBEDDING_FUNC,
            )
        else:
            print("Collection missing, creating new collection")
            self.client = chromadb.PersistentClient(path=self.DB_PATH)
            self.schemer = self.client.create_collection(
                name=self.COLLECTION_NAME,
                embedding_function=self.EMBEDDING_FUNC,
            )

    def get_collection(self):
        return self.schemer
    def add_materials(self, file_path: str) -> None:
        """
        Reads a PDF, splits its text into fixed-size chunks, and upserts the
        chunks into the embedded collection.
        """
        from pypdf import PdfReader

        doc = PdfReader(file_path)

        # Concatenate the text of every page into a single string.
        text_content: str = ""
        for page in doc.pages:
            text_content += page.extract_text()
        text_content = text_content.replace("\n", " ")

        batch_size = 1024
        padding_element = "."
        batch_documents = []
        batch_ids = []
        batch_metadata = []

        # Split the text into fixed-size chunks, padding the final chunk so
        # every document has the same length.
        for i in range(0, len(text_content), batch_size):
            batch = text_content[i : min(i + batch_size, len(text_content))]
            if len(batch) < batch_size:
                padding_needed = batch_size - len(batch)
                batch = batch + padding_element * padding_needed
            print(f"Batch {i}/{len(text_content)}")
            batch_documents.append(batch)
            batch_ids.append(f"batch{i}{batch[0]}")
            batch_metadata.append({"length": len(batch)})

        print("Upserting into collection")
        self.schemer.upsert(
            ids=[str(id) for id in batch_ids],
            metadatas=batch_metadata,
            documents=batch_documents,
        )
    @staticmethod
    def encode_image(image) -> str:
        """
        Encodes a PIL image as a base64 JPEG string.
        """
        import base64
        import io

        byte_arr = io.BytesIO()
        image.save(byte_arr, format="JPEG")
        encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
        return encoded_image
    async def image_to_text(self, image) -> dict:
        """
        Sends a base64-encoded JPEG (e.g. the output of encode_image) to the
        OpenAI chat completions API and returns the parsed JSON transcription.
        """
        import json

        from openai import OpenAI

        client = OpenAI()
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return json.loads(response.choices[0].message.content)

if __name__ == "__main__":
    c = Chroma()
    c.add_materials("data/Essentials of Programming Languages 2001.pdf")
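    # Minimal query sketch: the query text and n_results are illustrative
    # values; Collection.query runs a nearest-neighbour search over the
    # chunks stored by add_materials.
    results = c.get_collection().query(
        query_texts=["What is an environment in an interpreter?"],
        n_results=3,
    )
    print(results["documents"])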