# -*- coding: utf-8 -*-
"""gradio_sindi.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly

# libraries
"""

import gradio as gr
import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

"""# data - text"""

splitted_df = pd.read_csv("splitted_df_jo.csv")

# Load the QA model once at import time instead of on every question.
tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")

"""# getting context"""

def remove_symbols(text):
    """Strip characters that add noise to the retrieved context."""
    remove_chars = set("/()\n.")
    return "".join(char for char in text if char not in remove_chars)


def context_func(message):
    """Return the corpus section most similar to the question."""
    # Create a TF-IDF vectorizer and convert the sections and the
    # question to TF-IDF vectors.
    vectorizer = TfidfVectorizer()
    text_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
    question_tfidf = vectorizer.transform([message])

    # Cosine similarity between the question and each section.
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # Pick the most similar section and clean it up.
    most_similar_index = similarities.argmax()
    most_similar_context = splitted_df["section_text"][most_similar_index]
    return remove_symbols(most_similar_context)


def answer_question(question):
    """Run extractive QA over the retrieved context; return (answer, context)."""
    context = context_func(question)

    # Tokenize the question/context pair.
    inputs = tokenizer(question, context, return_tensors="pt",
                       max_length=512, truncation=True)

    # Get the most likely answer span from the model's start/end logits.
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end],
                              skip_special_tokens=True)
    return answer, context


def main():
    """
    Launches a Women Cancer ChatBot interface built on Hugging Face models
    for extractive question answering.

    Users can input questions related to women's cancer topics, and the
    ChatBot answers from the most relevant section of the corpus, which is
    shown alongside the answer.

    Returns:
        None

    Example:
        >>> main()
    """
    iface = gr.Interface(
        fn=answer_question,
        inputs=["text"],
        # answer_question returns two values, so the interface needs two outputs.
        outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
        title="Women Cancer ChatBot",
        description="How can I help you?",
        examples=[
            ["What is breast cancer?"],
            ["What are treatments for cervical cancer?"],
        ],
    )
    iface.launch(debug=True, share=True)


if __name__ == "__main__":
    main()
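# Quick smoke test without launching the Gradio UI (a minimal sketch; it
# assumes 'splitted_df_jo.csv' and the 'nlp-group/sindi-bert-final'
# checkpoint used above are both available locally):
#
#   >>> answer, context = answer_question("What is breast cancer?")
#   >>> print(answer)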