Marroco93 committed on
Commit
bcee5ff
1 Parent(s): 10d17a3

no message

Browse files
Files changed (1) hide show
  1. main.py +16 -54
main.py CHANGED
@@ -5,7 +5,7 @@ from fastapi.responses import JSONResponse
5
  from pydantic import BaseModel
6
  from huggingface_hub import InferenceClient
7
  import uvicorn
8
- from typing import Generator
9
  import json # Asegúrate de que esta línea esté al principio del archivo
10
  import nltk
11
  import os
@@ -81,68 +81,30 @@ async def generate_text(item: Item):
81
  return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
82
 
83
 
84
- # Load spaCy model
85
- nlp = spacy.load("en_core_web_sm")
86
 
87
- class TextRequest(BaseModel):
88
- text: str
89
-
90
- def preprocess_text(text: str) -> str:
91
- # Normalize whitespace and strip punctuation
92
- text = re.sub(r'\s+', ' ', text.strip())
93
- text = re.sub(r'[^\w\s]', '', text)
94
- return text
95
-
96
- def embed_text(text: str) -> np.ndarray:
97
- # Load the JinaAI/jina-embeddings-v2-base-en model
98
- model_name = "JinaAI/jina-embeddings-v2-base-en"
99
- tokenizer = AutoTokenizer.from_pretrained(model_name)
100
- model = AutoModel.from_pretrained(model_name)
101
-
102
- inputs = tokenizer(text, return_tensors='pt')
103
- embeddings = model(**inputs).pooler_output.numpy()
104
-
105
- return embeddings
106
 
107
- def semantic_matching(text, context):
108
- text_embeddings = embed_text(text)
109
- context_embeddings = [embed_text(ctx) for ctx in context]
110
-
111
- # Calculate cosine similarity between text and context embeddings
112
- similarities = np.dot(text_embeddings, context_embeddings.T)
113
-
114
- # Find the most similar sentence in the context
115
- most_similar_idx = np.argmax(similarities)
116
-
117
- return context[most_similar_idx]
118
-
119
- def handle_endpoint(text):
120
- # Define your large context here
121
- context = [
122
- "This is a sample context sentence 1.",
123
- "Another context sentence to provide additional information.",
124
- "This context sentence introduces a new topic.",
125
- "Some additional details about the new topic are provided here.",
126
- "Context sentences can be added or removed as needed.",
127
- "The context should cover a range of topics and provide relevant information.",
128
- "Make sure the context is diverse and representative of the domain.",
129
- ]
130
-
131
- # Perform semantic matching to retrieve the most relevant portion of the context
132
- relevant_context = semantic_matching(text, context)
133
 
134
- return relevant_context
 
 
135
 
 
136
  @app.post("/process_document")
137
  async def process_document(request: TextRequest):
138
  try:
139
- processed_text = preprocess_text(request.text)
140
- embedded_text = embed_text(processed_text)
141
- relevant_context = handle_endpoint(processed_text)
 
 
 
 
142
 
143
  return {
144
- "embedded_text": embedded_text.tolist(),
145
- "relevant_context": relevant_context
146
  }
147
  except Exception as e:
148
  print(f"Error during document processing: {e}")
 
5
  from pydantic import BaseModel
6
  from huggingface_hub import InferenceClient
7
  import uvicorn
8
+ from typing import Generator, List
9
  import json # Asegúrate de que esta línea esté al principio del archivo
10
  import nltk
11
  import os
 
81
  return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
82
 
83
 
 
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ # Define request model
87
+ class TextRequest(BaseModel):
88
+ text: List[str] # Expect a list of text segments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ # Load Longformer model and tokenizer
91
+ tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
92
+ model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096")
93
 
94
+ # Endpoint to process the document and return embeddings for each segment
95
  @app.post("/process_document")
96
  async def process_document(request: TextRequest):
97
  try:
98
+ embeddings_list = []
99
+ for text_segment in request.text:
100
+ # Process each segment individually
101
+ inputs = tokenizer(text_segment, return_tensors="pt", padding=True, truncation=True, max_length=4096)
102
+ outputs = model(**inputs)
103
+ embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
104
+ embeddings_list.append(embeddings.tolist()) # Store embeddings for each segment
105
 
106
  return {
107
+ "embeddings": embeddings_list
 
108
  }
109
  except Exception as e:
110
  print(f"Error during document processing: {e}")