class Chroma:
    def __init__(self) -> None:
        """
        Creates a persistent ChromaDB client and opens the existing collection,
        or creates a new one if the database directory does not exist yet.
        """
        import os

        import chromadb
        from chromadb.utils import embedding_functions

        self.DB_PATH = "./chromadb_linux/"
        self.MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~0.5 GB
        self.COLLECTION_NAME: str = "scheme"
        self.EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.MODEL_NAME
        )

        if os.path.exists(self.DB_PATH):
            # Reuse the on-disk database and its existing collection.
            self.client = chromadb.PersistentClient(path=self.DB_PATH)
            self.schemer = self.client.get_collection(
                name=self.COLLECTION_NAME,
                embedding_function=self.EMBEDDING_FUNC,
            )
        else:
            print("Collection missing, creating new collection")
            self.client = chromadb.PersistentClient(path=self.DB_PATH)
            self.schemer = self.client.create_collection(
                name=self.COLLECTION_NAME,
                embedding_function=self.EMBEDDING_FUNC,
            )

    def get_collection(self):
        return self.schemer
    def add_materials(self, file_path: str) -> None:
        """
        Reads a PDF, splits its text into fixed-size chunks, and upserts the
        chunks into the embedded collection.
        """
        from pypdf import PdfReader

        doc = PdfReader(file_path)

        # Concatenate the text of every page into a single string.
        text_content: str = ""
        for page in doc.pages:
            text_content += page.extract_text()
        text_content = text_content.replace("\n", " ")

        batch_size = 1024
        padding_element = "."
        batch_documents = []
        batch_ids = []
        batch_metadata = []

        # Split the text into fixed-size chunks, padding the final chunk so
        # every document has the same length.
        for i in range(0, len(text_content), batch_size):
            batch = text_content[i : min(i + batch_size, len(text_content))]
            if len(batch) < batch_size:
                padding_needed = batch_size - len(batch)
                batch = batch + padding_element * padding_needed
            print(f"Batch {i}/{len(text_content)}")
            batch_documents.append(batch)
            batch_ids.append(f"batch{i}{batch[0]}")
            batch_metadata.append({"length": len(batch)})

        print("Upserting into collection")
        self.schemer.upsert(
            ids=[str(id) for id in batch_ids],
            metadatas=batch_metadata,
            documents=batch_documents,
        )
    @staticmethod
    def encode_image(image) -> str:
        """
        Encodes a PIL image as a base64 JPEG string.
        """
        import base64
        import io

        byte_arr = io.BytesIO()
        image.save(byte_arr, format="JPEG")
        encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
        return encoded_image
    async def image_to_text(self, image) -> dict:
        """
        Sends a base64-encoded JPEG (e.g. the output of encode_image) to the
        OpenAI chat completions API and returns the parsed JSON transcription.
        """
        import json

        from openai import OpenAI

        client = OpenAI()
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return json.loads(response.choices[0].message.content)

if __name__ == "__main__":
    c = Chroma()
    c.add_materials("data/Essentials of Programming Languages 2001.pdf")
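    # Minimal query sketch: the query text and n_results are illustrative
    # values; Collection.query runs a nearest-neighbour search over the
    # chunks stored by add_materials.
    results = c.get_collection().query(
        query_texts=["What is an environment in an interpreter?"],
        n_results=3,
    )
    print(results["documents"])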