kayrakan committed
Commit 1da5cdb
1 Parent(s): c25e3ee

new version

Files changed (3)
  1. .gitignore +1 -0
  2. __pycache__/app.cpython-312.pyc +0 -0
  3. app.py +83 -47
.gitignore ADDED
@@ -0,0 +1 @@
+/chroma_db/*
__pycache__/app.cpython-312.pyc CHANGED
Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ
 
app.py CHANGED
@@ -1,68 +1,104 @@
+import asyncio
+import logging
 from fastapi import FastAPI, HTTPException, UploadFile, File, Form
-from pydantic import BaseModel
 from sentence_transformers import SentenceTransformer
 import chromadb
-import os
 import json
+from typing import List
+from functools import lru_cache
 
 app = FastAPI()
 
-# Load the model
-model_name = "all-MiniLM-L6-v2"  # A popular model for sentence embeddings
+# Load the multilingual model
+model_name = "paraphrase-multilingual-mpnet-base-v2"  # This model supports 50+ languages
 model = SentenceTransformer(model_name)
 
-# Initialize ChromaDB
-chroma_client = chromadb.Client()
-collection_name = "json_lines"
+# Initialize persistent ChromaDB
+chroma_client = chromadb.PersistentClient(path="./chroma_db")
+collection_name = "transcriptions"
 
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-def embed_text(text):
-    return model.encode(text).tolist()  # Generate embeddings
+@lru_cache(maxsize=1000)
+def embed_text(text: str):
+    return model.encode(text).tolist()
 
+async def process_batch(batch, collection, start_index):
+    embeddings = []
+    metadatas = []
+    ids = []
+
+    for i, item in enumerate(batch):
+        text = item['text']
+        embedding = embed_text(text)
+        embeddings.append(embedding)
+        metadatas.append({
+            "text": text,
+            "duration": item.get("duration"),
+            "offset": item.get("offset"),
+            "lang": item.get("lang")
+        })
+        # Create a unique ID using the start_index and the current item index
+        unique_id = f"{start_index + i}_{item.get('offset')}"
+        ids.append(unique_id)
+
+    collection.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
 
 @app.post("/generate")
-async def generate_text(sentence: str = Form(...), file: UploadFile = File(...)):
-    contents = await file.read()
-    lines = json.loads(contents)
-
-    # Check if the collection exists before attempting to delete it
-    try:
-        chroma_client.delete_collection(collection_name)
-    except ValueError as e:
-        if "does not exist" in str(e):
-            pass  # Ignore the error if the collection does not exist
-
-    # Recreate the collection
-    collection = chroma_client.get_or_create_collection(collection_name)
-
-    # Process each line and store the embeddings in ChromaDB
-    for i, line in enumerate(lines):
-        text = line['text']  # Adjust this according to your JSON structure
-        embedding = embed_text(text)
-        metadata = {
-            "id": i,
-            "text": text,
-            "duration": line.get("duration"),
-            "lang": line.get("lang"),
-            "offset": line.get("offset")
-        }
-        collection.add(embeddings=[embedding], metadatas=[metadata], ids=[str(i)])
-
-    # Embed the query sentence
-    query_embedding = embed_text(sentence)
-
-    # Perform search in ChromaDB
-    results = collection.query(query_embeddings=[query_embedding], n_results=5)  # Adjust n_results as needed
-
-    # Extract relevant lines from results
-    relevant_lines = results["metadatas"][0]
-
-    # Clear the collection after finding relevant lines
-    chroma_client.delete_collection(collection_name)
-
-    return {"relevant_lines": relevant_lines}
+async def generate_text(sentence: str = Form(...), file: UploadFile = File(...), delete_after: bool = True):
+    try:
+        contents = await file.read()
+        transcription = json.loads(contents)
+
+        # Get or create the collection
+        try:
+            collection = chroma_client.get_collection(collection_name)
+            # Clear existing data
+            collection.delete(where={})
+        except ValueError:
+            collection = chroma_client.create_collection(collection_name)
+
+        # Process in batches
+        batch_size = 100
+        tasks = []
+        for i in range(0, len(transcription), batch_size):
+            batch = transcription[i:i + batch_size]
+            task = asyncio.create_task(process_batch(batch, collection, i))
+            tasks.append(task)
+        await asyncio.gather(*tasks)
+
+        # Embed the query sentence
+        query_embedding = embed_text(sentence)
+
+        # Perform search in ChromaDB
+        results = collection.query(query_embeddings=[query_embedding], n_results=5)
+
+        # Extract relevant lines from results
+        relevant_lines = results["metadatas"][0]
+
+        logger.info(f"Query results: {relevant_lines}")
+
+        return {"relevant_lines": relevant_lines}
+
+    except Exception as e:
+        logger.error(f"Error during processing: {e}")
+        raise HTTPException(status_code=500, detail="Internal Server Error")
+    finally:
+        if delete_after:
+            # Clear the collection after finding relevant lines or if an error occurred
+            try:
+                chroma_client.delete_collection(collection_name)
+            except ValueError:
+                pass  # Collection might have already been deleted or doesn't exist
 
 @app.get("/")
 def greet_json():
     return {"Hello": "World!"}
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=8000)
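
Note for trying the new endpoint: the sketch below is a hypothetical client, not part of the commit. It assumes the app is running locally via python app.py (port 8000, per the uvicorn.run call above) and that the requests package is installed; the segment keys (text, duration, offset, lang) mirror what process_batch reads. Because delete_after is a plain bool parameter rather than a Form(...) field, FastAPI exposes it as a query parameter.

# sketch_client.py -- hypothetical; exercises POST /generate on a local instance
import json
import requests

# A small transcript in the shape process_batch expects: a JSON array of segments.
# Some chromadb releases raise when n_results (5 here) exceeds the collection
# size, so the sample provides five segments.
segments = [
    {"text": "Welcome to the show.", "duration": 2.0, "offset": 0.0, "lang": "en"},
    {"text": "Today we cover sentence embeddings.", "duration": 3.0, "offset": 2.0, "lang": "en"},
    {"text": "ChromaDB stores the vectors.", "duration": 2.5, "offset": 5.0, "lang": "en"},
    {"text": "FastAPI serves the search endpoint.", "duration": 2.5, "offset": 7.5, "lang": "en"},
    {"text": "Thanks for listening.", "duration": 1.5, "offset": 10.0, "lang": "en"},
]

response = requests.post(
    "http://localhost:8000/generate",
    params={"delete_after": "false"},  # plain bool argument -> query string
    data={"sentence": "What is this episode about?"},  # Form(...) field
    files={"file": ("transcript.json", json.dumps(segments), "application/json")},
)
print(response.json())  # {"relevant_lines": [...metadata of the top 5 matches...]}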
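
Two behaviors in the new version are worth verifying against the chromadb release in use. First, collection.delete(where={}) may be rejected: recent chromadb versions require a non-empty filter for delete, in which case the "clear existing data" branch raises instead of clearing. A version-agnostic sketch, reusing the chroma_client and collection_name defined in app.py, is to drop and recreate the collection:

# Sketch: clear the collection by dropping and recreating it, instead of
# delete(where={}), which some chromadb releases reject as an empty filter.
try:
    chroma_client.delete_collection(collection_name)
except ValueError:
    pass  # the collection did not exist yet
collection = chroma_client.get_or_create_collection(collection_name)

Second, process_batch awaits nothing: model.encode is synchronous, so wrapping the calls in asyncio.create_task and gathering them still runs the batches one after another and blocks the event loop while they run. If overlap is actually wanted, the blocking work has to leave the loop, for example via the default thread pool. A sketch of that pattern, where encode_and_store is a hypothetical synchronous stand-in for the per-batch embed-and-add work (whether SentenceTransformer.encode tolerates concurrent calls should be checked for the chosen model):

import asyncio

def encode_and_store(batch, collection, start_index):
    ...  # hypothetical stand-in: the synchronous embed + collection.add work

async def run_batches(transcription, collection, batch_size=100):
    # asyncio.to_thread (Python 3.9+) runs each blocking call in a worker
    # thread, so the event loop stays free to serve other requests.
    await asyncio.gather(*(
        asyncio.to_thread(encode_and_store, transcription[i:i + batch_size], collection, i)
        for i in range(0, len(transcription), batch_size)
    ))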