Spaces:
Running
Running
File size: 3,242 Bytes
09cb3f5 1612f56 c796de9 1612f56 09cb3f5 1612f56 09cb3f5 1612f56 09cb3f5 1612f56 09cb3f5 1612f56 09cb3f5 1612f56 09cb3f5 1612f56 09cb3f5 1612f56 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import chromadb
from chromadb.utils import embedding_functions
def create_client():
    """Create the ``schemer2`` collection in the on-disk Chroma store.

    Opens a persistent Chroma client rooted at ``./chromadb_linux/``, builds a
    SentenceTransformer embedding function for the mxbai model and creates the
    collection with it.

    Returns:
        The collection object returned by ``create_collection`` (note: a
        collection, not the Chroma client itself, despite the function name).
    """
    store = chromadb.PersistentClient(path="./chromadb_linux/")
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="mixedbread-ai/mxbai-embed-large-v1",  # ~ 0.5 gb
    )
    # NOTE(review): create_collection is expected to fail when "schemer2"
    # already exists, so this is for first-time setup - confirm against the
    # Chroma documentation for the installed version.
    return store.create_collection(
        name="schemer2",
        embedding_function=embedder,
    )
def get_client():
    """Open the existing ``scheme`` collection from the on-disk Chroma store.

    Opens a persistent Chroma client rooted at ``./chromadb_linux/`` and looks
    the collection up with a SentenceTransformer embedding function for the
    mxbai model.

    Returns:
        The collection object returned by ``get_collection`` (note: a
        collection, not the Chroma client itself, despite the function name).
    """
    store = chromadb.PersistentClient(path="./chromadb_linux/")
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="mixedbread-ai/mxbai-embed-large-v1",  # ~ 0.5 gb
    )
    # NOTE(review): create_client() in this file creates "schemer2", while this
    # lookup uses "scheme" - confirm the two names are meant to differ.
    return store.get_collection(
        name="scheme",
        embedding_function=embedder,
    )
def update_collection(iter: int, text: object, client: chromadb.Collection):
    """Add one transcribed page to the collection.

    Args:
        iter: Index appended to the document id (callers in this file pass
            the page index within a PDF).
        text: Mapping with a ``"text"`` entry (stored as the document) and a
            ``"content"`` entry (used as the id prefix).
        client: Collection that receives the document via ``add``.
    """
    document = text["text"]
    # The id is the content description followed by the index, e.g. "Intro0".
    document_id = text["content"] + str(iter)
    client.add(
        documents=[document],
        metadatas=[{"source": "pdf"}],
        ids=[document_id],
    )
def encode_image(image) -> str:
    """Serialise *image* as JPEG and return the bytes as base64 text.

    Args:
        image: Object exposing ``save(file_obj, format=...)``; presumably a
            PIL image - confirm against callers.

    Returns:
        The JPEG bytes, base64-encoded and decoded to a ``str``.
    """
    import base64
    import io

    # Render into memory instead of a temporary file.
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    jpeg_bytes = buffer.getvalue()
    return base64.b64encode(jpeg_bytes).decode("utf-8")
async def image_to_text(image) -> object:
    """Ask GPT-4 Turbo to transcribe an image and return the parsed JSON reply.

    Args:
        image: Base64 text of a JPEG image (for example the output of
            ``encode_image``); it is embedded into a ``data:`` URL.

    Returns:
        Whatever ``json.loads`` produces from the model's reply. The prompt
        asks for the keys ``'content'`` and ``'text'``, but the reply is not
        validated here.
    """
    from openai import OpenAI
    import json

    # NOTE: this is the synchronous OpenAI client, so the request is not
    # awaited and blocks the event loop until the reply arrives.
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image"},
                    {
                        "type": "image_url",
                        "image_url": {
                            # A base64 data URL is "data:<mime>;base64,<data>";
                            # the previous "base64;," form was malformed.
                            "url": f"data:image/jpeg;base64,{image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
    )
    return json.loads(response.choices[0].message.content)
async def start_troggin_off(dir: str, client):
    """Recursively walk *dir* and ingest every page of every PDF found.

    Each PDF is rasterised page by page; every page image is base64-encoded,
    transcribed with ``image_to_text`` and stored with ``update_collection``
    using the page index within that PDF.

    Args:
        dir: Directory to scan. Sub-directories are scanned recursively.
        client: Collection handed through to ``update_collection``.
    """
    import os

    for name in os.listdir(dir):
        full_path = os.path.join(dir, name)
        if os.path.isdir(full_path):
            # Directories are only descended into; a directory whose name
            # happens to end in ".pdf" must not be handed to the PDF converter.
            await start_troggin_off(full_path, client)  # recursive call
        elif full_path.endswith(".pdf"):
            # Imported next to its only use so that scanning a tree without
            # PDFs does not require pdf2image.
            from pdf2image import convert_from_path

            for i, image in enumerate(convert_from_path(full_path)):
                encoded_image = encode_image(image)
                text = await image_to_text(encoded_image)
                update_collection(i, text, client)
if __name__ == "__main__":
    import asyncio

    # Script entry point: create the collection, then ingest every PDF found
    # under data/ into it.
    client = create_client()
    ingestion = start_troggin_off("data/", client)
    asyncio.run(ingestion)
|