Spaces:

Marroco93
/

PacmanAI-2

Sleeping

App Files Files Community

Marroco93 committed on Apr 24, 2024

Commit

10d17a3

1 Parent(s): abb61e1

no message

Browse files

Files changed (1) hide show

main.py +78 -134

main.py CHANGED Viewed

@@ -11,8 +11,9 @@ import nltk
 import os
 import google.protobuf  # This line should execute without errors if protobuf is installed correctly
 import sentencepiece
-from transformers import pipeline, AutoTokenizer,AutoModelForSequenceClassification
 import spacy
 nltk.data.path.append(os.getenv('NLTK_DATA'))
@@ -80,7 +81,6 @@ async def generate_text(item: Item):
     return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
 # Load spaCy model
 nlp = spacy.load("en_core_web_sm")
@@ -93,152 +93,96 @@ def preprocess_text(text: str) -> str:
     text = re.sub(r'[^\w\s]', '', text)
     return text
-def reduce_tokens(text: str):
-    # Process the text with spaCy
-    doc = nlp(text)
-    # Select sentences that might be more important - this is a simple heuristic
-    important_sentences = []
-    for sent in doc.sents:
-        if any(tok.dep_ == 'ROOT' for tok in sent):
-            important_sentences.append(sent.text)
-    # Join selected sentences to form the reduced text
-    reduced_text = ' '.join(important_sentences)
-    # Tokenize the reduced text to count the tokens
-    reduced_doc = nlp(reduced_text)  # Ensure this line is correctly aligned
-    token_count = len(reduced_doc)
-    return reduced_text, token_count
-def segment_text(text: str, max_tokens=500):  # Setting a conservative limit below 512
-    doc = nlp(text)
-    segments = []
-    current_segment = []
-    current_length = 0
-    for sent in doc.sents:
-        sentence = sent.text.strip()
-        sentence_length = len(sentence.split())  # Counting words for simplicity
-        if sentence_length > max_tokens:
-            # Split long sentences into smaller chunks if a single sentence exceeds max_tokens
-            words = sentence.split()
-            while words:
-                part = ' '.join(words[:max_tokens])
-                segments.append(part)
-                words = words[max_tokens:]
-        elif current_length + sentence_length > max_tokens:
-            segments.append(' '.join(current_segment))
-            current_segment = [sentence]
-            current_length = sentence_length
-        else:
-            current_segment.append(sentence)
-            current_length += sentence_length
-    if current_segment:  # Add the last segment
-        segments.append(' '.join(current_segment))
-    return segments
-# Load the tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
-model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
-# Set up the pipeline
-classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
-def robust_segment_text(text: str, max_tokens=510):
-    doc = nlp(text)
-    segments = []
-    current_segment = []
-    current_tokens = []
-    for sent in doc.sents:
-        words = sent.text.strip().split()
-        sentence_tokens = tokenizer.encode(' '.join(words), add_special_tokens=False)
-        if len(current_tokens) + len(sentence_tokens) > max_tokens:
-            segments.append(tokenizer.decode(current_tokens))
-            current_segment = words
-            current_tokens = sentence_tokens
-        else:
-            current_segment.extend(words)
-            current_tokens.extend(sentence_tokens)
-    if current_tokens:  # Add the last segment
-        segments.append(tokenizer.decode(current_tokens))
-    return segments
-def classify_segments(segments):
-    labels = [
-        "Coverage Details", "Exclusions", "Premiums", "Claims Process",
-        "Policy Limits", "Legal and Regulatory Information", "Renewals and Cancellations",
-        "Discounts and Incentives", "Duties and Responsibilities", "Contact Information"
-    ]
-    classified_segments = []
-    for segment in segments:
-        # Note: Adjust the input here based on how your model was trained
-        predictions = classifier(segment)
-        classified_segments.append(predictions)
-    return classified_segments
-class TextRequest(BaseModel):
-    text: str
 @app.post("/process_document")
 async def process_document(request: TextRequest):
     try:
-        processed_text = preprocess_text(request.text)  # Ensure preprocess_text is defined
-        segments = robust_segment_text(processed_text)
-        classified_segments = classify_segments(segments)
         return {
-            "classified_segments": classified_segments
         }
     except Exception as e:
         print(f"Error during document processing: {e}")
         raise HTTPException(status_code=500, detail=str(e))
-@app.post("/summarize")
-async def summarize(request: TextRequest):
-    try:
-        # Preprocess and segment the text
-        processed_text = preprocess_text(request.text)
-        segments = segment_text(processed_text)
-        # Classify each segment safely
-        classified_segments = []
-        for segment in segments:
-            try:
-                result = classifier(segment)
-                classified_segments.append(result)
-            except Exception as e:
-                print(f"Error classifying segment: {e}")
-                classified_segments.append({"error": str(e)})
-        # Optional: Reduce tokens or summarize
-        reduced_texts = []
-        for segment in segments:
-            try:
-                reduced_text, token_count = reduce_tokens(segment)
-                reduced_texts.append((reduced_text, token_count))
-            except Exception as e:
-                print(f"Error during token reduction: {e}")
-                reduced_texts.append(("Error", 0))
-        return {
-            "classified_segments": classified_segments,
-            "reduced_texts": reduced_texts
-        }
-    except Exception as e:
-        print(f"Error during token reduction: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 import os
 import google.protobuf  # This line should execute without errors if protobuf is installed correctly
 import sentencepiece
+from transformers import pipeline, AutoTokenizer,AutoModelForSequenceClassification,AutoModel
 import spacy
+import numpy as np
 nltk.data.path.append(os.getenv('NLTK_DATA'))
     return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
 # Load spaCy model
 nlp = spacy.load("en_core_web_sm")
     text = re.sub(r'[^\w\s]', '', text)
     return text
+def embed_text(text: str) -> np.ndarray:
+    """Embed ``text`` with the JinaAI/jina-embeddings-v2-base-en model.
+
+    The tokenizer and model are loaded on the first call and cached on the
+    function object, so later calls reuse them instead of reloading the
+    weights for every request.
+
+    Args:
+        text: The string to embed.
+
+    Returns:
+        The model's ``pooler_output`` converted to a NumPy array
+        (presumably shape ``(1, hidden_size)`` for a single string --
+        TODO confirm against the model).
+    """
+    # Local import: torch is already required by return_tensors='pt' below,
+    # but this file does not import it at module level.
+    import torch
+
+    # NOTE(review): the model card appears to publish this model as
+    # "jinaai/..." and to load it with trust_remote_code=True -- confirm
+    # before deploying.
+    model_name = "JinaAI/jina-embeddings-v2-base-en"
+    loaded = embed_text.__dict__.setdefault("_loaded", {})
+    if model_name not in loaded:
+        loaded[model_name] = (
+            AutoTokenizer.from_pretrained(model_name),
+            AutoModel.from_pretrained(model_name),
+        )
+    tokenizer, model = loaded[model_name]
+    # truncation=True keeps over-long inputs within the model's maximum length.
+    inputs = tokenizer(text, return_tensors='pt', truncation=True)
+    # Inference only: skip autograd bookkeeping. Without no_grad/detach the
+    # output requires grad and Tensor.numpy() raises RuntimeError.
+    with torch.no_grad():
+        pooled = model(**inputs).pooler_output
+    return pooled.detach().cpu().numpy()
+def semantic_matching(text, context):
+    """Return the entry of ``context`` that is most similar to ``text``.
+
+    Both ``text`` and every entry of ``context`` are embedded with
+    ``embed_text``; entries are ranked by cosine similarity to ``text``.
+
+    Args:
+        text: The query string.
+        context: A non-empty sequence of candidate strings.
+
+    Returns:
+        The element of ``context`` with the highest cosine similarity.
+
+    Raises:
+        ValueError: If ``context`` is empty.
+    """
+    if not context:
+        raise ValueError("context must contain at least one sentence")
+    # Flatten the query embedding to a 1-D vector.
+    query = np.asarray(embed_text(text), dtype=float).reshape(-1)
+    # Stack the per-sentence embeddings into one (n_sentences, dim) matrix;
+    # a plain Python list has no ``.T`` and cannot be used with np.dot here.
+    matrix = np.vstack(
+        [np.asarray(embed_text(ctx), dtype=float).reshape(1, -1) for ctx in context]
+    )
+    # Divide by the vector norms so the dot product is a true cosine similarity.
+    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
+    denom[denom == 0] = 1.0  # all-zero vectors score 0 instead of dividing by 0
+    similarities = (matrix @ query) / denom
+    # Find the most similar sentence in the context
+    most_similar_idx = int(np.argmax(similarities))
+    return context[most_similar_idx]
+def handle_endpoint(text):
+    """Pick the sentence from the built-in context that best matches ``text``.
+
+    The candidate sentences are hard-coded placeholders; the selection itself
+    is delegated to ``semantic_matching``.
+    """
+    # Placeholder knowledge base -- replace with the real context sentences.
+    candidate_sentences = [
+        "This is a sample context sentence 1.",
+        "Another context sentence to provide additional information.",
+        "This context sentence introduces a new topic.",
+        "Some additional details about the new topic are provided here.",
+        "Context sentences can be added or removed as needed.",
+        "The context should cover a range of topics and provide relevant information.",
+        "Make sure the context is diverse and representative of the domain.",
+    ]
+    # Return the single most relevant candidate for the given text.
+    return semantic_matching(text, candidate_sentences)
 @app.post("/process_document")
 async def process_document(request: TextRequest):
+    """Embed the submitted text and return the best-matching context sentence.
+
+    The response carries the embedding as a nested list under
+    "embedded_text" and the sentence chosen by handle_endpoint under
+    "relevant_context". Any exception is printed and re-raised as an
+    HTTP 500 whose detail is the error message.
+    """
     try:
+        processed_text = preprocess_text(request.text)
+        embedded_text = embed_text(processed_text)
+        # NOTE: handle_endpoint embeds processed_text again via semantic_matching.
+        relevant_context = handle_endpoint(processed_text)
         return {
+            "embedded_text": embedded_text.tolist(),
+            "relevant_context": relevant_context
         }
     except Exception as e:
         print(f"Error during document processing: {e}")
         raise HTTPException(status_code=500, detail=str(e))
+# @app.post("/summarize")
+# async def summarize(request: TextRequest):
+#     try:
+#         # Preprocess and segment the text
+#         processed_text = preprocess_text(request.text)
+#         segments = segment_text(processed_text)
+#         # Classify each segment safely
+#         classified_segments = []
+#         for segment in segments:
+#             try:
+#                 result = classifier(segment)
+#                 classified_segments.append(result)
+#             except Exception as e:
+#                 print(f"Error classifying segment: {e}")
+#                 classified_segments.append({"error": str(e)})
+#         # Optional: Reduce tokens or summarize
+#         reduced_texts = []
+#         for segment in segments:
+#             try:
+#                 reduced_text, token_count = reduce_tokens(segment)
+#                 reduced_texts.append((reduced_text, token_count))
+#             except Exception as e:
+#                 print(f"Error during token reduction: {e}")
+#                 reduced_texts.append(("Error", 0))
+#         return {
+#             "classified_segments": classified_segments,
+#             "reduced_texts": reduced_texts
+#         }
+#     except Exception as e:
+#         print(f"Error during token reduction: {e}")
+#         raise HTTPException(status_code=500, detail=str(e))
+# if __name__ == "__main__":
+#     uvicorn.run(app, host="0.0.0.0", port=8000)