# -*- coding: utf-8 -*-
"""gradio_bert.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly

# libraries
"""

import re

import gradio as gr
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

"""# data - text"""

splitted_df = pd.read_csv('/content/splitted_df_jo.csv')

"""# getting context"""

def remove_symbols(text: str) -> str:
    """
    Removes specified symbols and non-ASCII characters from the input text.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text with the specified symbols and non-ASCII characters removed.

    Example:
        >>> text = "This is a test string / with (some) symbols.\\nAnd some non-ASCII characters like é and ñ."
        >>> clean_text = remove_symbols(text)
        >>> print(clean_text)
        This is a test string  with some symbolsAnd some non-ASCII characters like  and
    """
    remove_list = ['/', '(', ')', '\n', '.']
    remove_chars = "".join(remove_list)
    cleaned_text = "".join(char for char in text if char not in remove_chars)

    # Remove non-ASCII characters
    pattern_ascii = r'[^\x00-\x7F]'  # Matches any character outside the ASCII range
    filtered_text = re.sub(pattern_ascii, '', cleaned_text)

    return filtered_text

def context_func(message: str) -> str:
    """
    Finds the most similar context from a collection of texts based on
    TF-IDF vectorization and cosine similarity.

    Args:
        message (str): The input message or question.

    Returns:
        str: The most similar context to the input message from the collection of texts.

    Example:
        >>> message = "What are the symptoms of breast cancer?"
        >>> similar_context = context_func(message)
        >>> print(similar_context)
        Breast cancer is the most common cancer among women worldwide...
    """
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Convert the section texts and the question to TF-IDF vectors
    text_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
    question_tfidf = vectorizer.transform([message])

    # Calculate cosine similarity between the question and each section text
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # Find the index of the most similar section text
    most_similar_index = similarities.argmax()

    # Get the most similar section text and clean it
    most_similar_context = splitted_df["section_text"][most_similar_index]
    most_similar_context = remove_symbols(most_similar_context)

    return most_similar_context

"""# the model"""

# In the notebook this cell ran the shell command `!huggingface-cli login`,
# which is not valid Python in a script. Authenticate beforehand instead,
# e.g. run `huggingface-cli login` in a terminal or call
# `huggingface_hub.login(token=...)`.

tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")

def answer_question(question: str) -> tuple[str, str]:
    """
    Generates an answer to the input question based on the retrieved context.

    Args:
        question (str): The input question.

    Returns:
        tuple: A tuple containing the generated answer and the context used for answering.

    Example:
        >>> question = "What is the capital of France?"
        >>> answer, context = answer_question(question)
        >>> print("Answer:", answer)
        >>> print("Context:", context)
    """
    context = context_func(question)

    # Tokenize the question together with its retrieved context
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Get the start/end span scores from the model (no gradients needed at inference)
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # Pick the most likely start and end token positions
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1

    # Decode the answer span back to a string
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])

    return answer, context

# answer_question returns two values, so the interface needs two output components
iface = gr.Interface(
    fn=answer_question,
    inputs=["text"],
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
    title="Women Cancer ChatBot",
    description="How can I help you?",
    examples=[
        ["What is breast cancer?"],
        ["What are treatments for cervical cancer?"],
    ],
)

iface.launch(debug=True)
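
"""# quick local check"""

# A minimal sketch for smoke-testing the retrieval + QA steps without the
# Gradio UI. The question below is illustrative; real answers depend on the
# contents of splitted_df_jo.csv. Uncomment and run instead of iface.launch().
#
# question = "What are the treatments for breast cancer?"
# answer, context = answer_question(question)
# print("Answer:", answer)
# print("Context snippet:", context[:200])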