Spaces:
Sleeping
Sleeping
LordFarquaad42
committed on
Commit
•
5cb3e42
1
Parent(s):
12c8868
wrapping all database related actions under Chroma class
Browse files- Chroma.py +123 -0
- add_data.py +0 -96
- database.py +0 -15
Chroma.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Chroma:
    """Wraps all database-related actions for the ChromaDB vector store:
    client/collection setup, PDF ingestion, and image transcription helpers."""

    def __init__(self) -> None:
        """
        Creates a persistent Chroma client and stores the collection handle.

        Reuses the on-disk database when it already exists; otherwise a new
        collection is created at the same path.
        """
        import chromadb
        from chromadb.utils import embedding_functions
        import os

        self.DB_PATH = "./chromadb_linux/"
        self.MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
        self.COLLECTION_NAME: str = "scheme"
        self.EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.MODEL_NAME
        )

        # Check before instantiating the client: PersistentClient creates
        # DB_PATH on construction, which would defeat the existence test.
        if not os.path.exists(self.DB_PATH):
            print("Collection Missing, Creating New Collection")

        # BUG FIX: the original else-branch stored the client in a local
        # variable, leaving self.client unset on the fresh-database path.
        # get_or_create_collection also removes the exists-check/create race.
        self.client = chromadb.PersistentClient(path=self.DB_PATH)
        self.schemer = self.client.get_or_create_collection(
            name=self.COLLECTION_NAME,
            embedding_function=self.EMBEDDING_FUNC,
        )

    def get_collection(self):
        """Returns the active Chroma collection handle."""
        return self.schemer

    def add_materials(self, file_path: str) -> None:
        """
        Extracts the text of the PDF at *file_path* and upserts it into the
        collection in fixed-size (padded) batches of 1024 characters.
        """
        from pypdf import PdfReader as reader

        doc = reader(file_path)
        text_content: str = ""

        for page in doc.pages:
            text_content += page.extract_text()

        # BUG FIX: str.replace returns a new string; the original discarded
        # the result, so newlines were never actually stripped.
        text_content = text_content.replace("\n", " ")

        # Robustness: upserting empty id/document lists is pointless and may
        # error; bail out early when the PDF yielded no text.
        if not text_content:
            print("No text extracted; nothing to upsert")
            return

        batch_size = 1024
        padding_element = "."
        batch_documents = []
        batch_ids = []
        batch_metadata = []

        for i in range(0, len(text_content), batch_size):
            batch = text_content[i : min(i + batch_size, len(text_content))]

            # Pad the final chunk so every stored document has equal length.
            if len(batch) < batch_size:
                padding_needed = batch_size - len(batch)
                batch = batch + str(padding_element * padding_needed)

            print(f"Batch {i}/{len(text_content)}")
            # BUG FIX: the original appended the whole text_content for every
            # batch, storing N full copies of the document instead of N chunks.
            batch_documents.append(batch)
            batch_ids.append(f"batch{i}{batch[0]}")
            batch_metadata.append({"length": len(batch)})

        print("Upserting into collection")
        self.schemer.upsert(
            ids=[str(id) for id in batch_ids],
            metadatas=batch_metadata,
            documents=batch_documents,
        )

    @staticmethod
    def encode_image(image) -> str:
        """
        Serializes a PIL-style image (anything exposing .save(buf, format=...))
        into a base64-encoded JPEG string.
        """
        # BUG FIX: originally declared without self, so instance calls bound
        # the instance to `image`; @staticmethod makes both call styles work.
        import io
        import base64

        byte_arr = io.BytesIO()
        image.save(byte_arr, format="JPEG")
        encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
        return encoded_image

    async def image_to_text(self, image) -> object:
        """
        Transcribes a base64-encoded JPEG via the OpenAI vision API and
        returns the parsed JSON object (keys: 'content' and 'text').
        """
        from openai import OpenAI
        import json

        client = OpenAI()

        response = client.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # BUG FIX: the data URI had a stray semicolon
                                # ("base64;,"), producing a malformed URL.
                                "url": f"data:image/jpeg;base64,{image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return json.loads(response.choices[0].message.content)
121 |
+
if __name__ == "__main__":
    # Manual smoke test: ingest a sample PDF into the persistent collection.
    chroma_store = Chroma()
    chroma_store.add_materials("data/Essentials of Programming Languages 2001.pdf")
|
add_data.py
DELETED
@@ -1,96 +0,0 @@
|
|
1 |
-
import chromadb
|
2 |
-
from chromadb.utils import embedding_functions
|
3 |
-
|
4 |
-
|
5 |
-
def create_client():
|
6 |
-
client = chromadb.PersistentClient(path="./chromadb_linux/")
|
7 |
-
MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1" # ~ 0.5 gb
|
8 |
-
COLLECTION_NAME: str = "schemer2"
|
9 |
-
EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
|
10 |
-
model_name=MODEL_NAME
|
11 |
-
)
|
12 |
-
schemer = client.create_collection(
|
13 |
-
name=COLLECTION_NAME,
|
14 |
-
embedding_function=EMBEDDING_FUNC,
|
15 |
-
)
|
16 |
-
return schemer
|
17 |
-
|
18 |
-
def get_client():
|
19 |
-
client = chromadb.PersistentClient(path="./chromadb_linux/")
|
20 |
-
MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1" # ~ 0.5 gb
|
21 |
-
COLLECTION_NAME: str = "scheme"
|
22 |
-
EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
|
23 |
-
model_name=MODEL_NAME
|
24 |
-
)
|
25 |
-
schemer = client.get_collection(
|
26 |
-
name=COLLECTION_NAME,
|
27 |
-
embedding_function=EMBEDDING_FUNC,
|
28 |
-
)
|
29 |
-
return schemer
|
30 |
-
|
31 |
-
|
32 |
-
def update_collection(iter: int, text: object, client: chromadb.Collection):
|
33 |
-
client.add(documents=[text["text"]], metadatas=[{"source": "pdf"}], ids=[text["content"] + str(iter)])
|
34 |
-
|
35 |
-
|
36 |
-
def encode_image(image) -> str:
|
37 |
-
import io
|
38 |
-
import base64
|
39 |
-
|
40 |
-
byte_arr = io.BytesIO()
|
41 |
-
image.save(byte_arr, format="JPEG")
|
42 |
-
encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
|
43 |
-
return encoded_image
|
44 |
-
|
45 |
-
|
46 |
-
async def image_to_text(image) -> object:
|
47 |
-
from openai import OpenAI
|
48 |
-
import json
|
49 |
-
|
50 |
-
client = OpenAI()
|
51 |
-
|
52 |
-
response = client.chat.completions.create(
|
53 |
-
model="gpt-4-turbo",
|
54 |
-
response_format={"type": "json_object"},
|
55 |
-
messages=[
|
56 |
-
{
|
57 |
-
"role": "user",
|
58 |
-
"content": [
|
59 |
-
{"type": "text", "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image"},
|
60 |
-
{
|
61 |
-
"type": "image_url",
|
62 |
-
"image_url": {
|
63 |
-
"url": f"data:image/jpeg;base64;,{image}",
|
64 |
-
"detail": "high",
|
65 |
-
},
|
66 |
-
},
|
67 |
-
],
|
68 |
-
}
|
69 |
-
],
|
70 |
-
)
|
71 |
-
return json.loads(response.choices[0].message.content)
|
72 |
-
|
73 |
-
|
74 |
-
async def start_troggin_off(dir: str, client):
|
75 |
-
# recursive
|
76 |
-
import os
|
77 |
-
from pdf2image import convert_from_path
|
78 |
-
|
79 |
-
dirs = os.listdir(dir)
|
80 |
-
for path in dirs:
|
81 |
-
if os.path.isdir(os.path.join(dir, path)):
|
82 |
-
await start_troggin_off(os.path.join(dir, path), client) # recursive call
|
83 |
-
|
84 |
-
if(os.path.join(dir, path).endswith(".pdf")):
|
85 |
-
images = convert_from_path(os.path.join(dir, path))
|
86 |
-
|
87 |
-
for i, image in enumerate(images):
|
88 |
-
encoded_image = encode_image(image)
|
89 |
-
text = await image_to_text(encoded_image)
|
90 |
-
update_collection(i, text, client)
|
91 |
-
|
92 |
-
if __name__ == "__main__":
|
93 |
-
import asyncio
|
94 |
-
client = create_client()
|
95 |
-
# client = None
|
96 |
-
asyncio.run(start_troggin_off("data/", client))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
database.py
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
import chromadb
|
2 |
-
from chromadb.utils import embedding_functions
|
3 |
-
|
4 |
-
def get_client():
|
5 |
-
client = chromadb.PersistentClient(path="./chromadb_linux/")
|
6 |
-
MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1" # ~ 0.5 gb
|
7 |
-
COLLECTION_NAME: str = "scheme"
|
8 |
-
EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
|
9 |
-
model_name=MODEL_NAME
|
10 |
-
)
|
11 |
-
schemer = client.get_collection(
|
12 |
-
name=COLLECTION_NAME,
|
13 |
-
embedding_function=EMBEDDING_FUNC,
|
14 |
-
)
|
15 |
-
return schemer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|