# -*- coding: utf-8 -*-
"""gradio_sindi.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly

# libraries
"""

import gradio as gr
import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

"""# data - text"""

splitted_df = pd.read_csv("splitted_df_jo.csv")

# Load the QA model once at import time instead of on every question.
tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")

"""# getting context"""

def remove_symbols(text):
    """Strip characters that add noise to the retrieved context."""
    remove_chars = set("/()\n.")
    return "".join(char for char in text if char not in remove_chars)


def context_func(message):
    """Return the corpus section most similar to the question."""
    # Create a TF-IDF vectorizer and convert the sections and the
    # question to TF-IDF vectors.
    vectorizer = TfidfVectorizer()
    text_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
    question_tfidf = vectorizer.transform([message])

    # Cosine similarity between the question and each section.
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # Pick the most similar section and clean it up.
    most_similar_index = similarities.argmax()
    most_similar_context = splitted_df["section_text"][most_similar_index]
    return remove_symbols(most_similar_context)


def answer_question(question):
    """Run extractive QA over the retrieved context; return (answer, context)."""
    context = context_func(question)

    # Tokenize the question/context pair.
    inputs = tokenizer(question, context, return_tensors="pt",
                       max_length=512, truncation=True)

    # Get the most likely answer span from the model's start/end logits.
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end],
                              skip_special_tokens=True)
    return answer, context


def main():
    """
    Launches a Women Cancer ChatBot interface built on Hugging Face models
    for extractive question answering.

    Users can input questions related to women's cancer topics, and the
    ChatBot answers from the most relevant section of the corpus, which is
    shown alongside the answer.

    Returns:
        None

    Example:
        >>> main()
    """
    iface = gr.Interface(
        fn=answer_question,
        inputs=["text"],
        # answer_question returns two values, so the interface needs two outputs.
        outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
        title="Women Cancer ChatBot",
        description="How can I help you?",
        examples=[
            ["What is breast cancer?"],
            ["What are treatments for cervical cancer?"],
        ],
    )
    iface.launch(debug=True, share=True)


if __name__ == "__main__":
    main()
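# Quick smoke test without launching the Gradio UI (a minimal sketch; it
# assumes 'splitted_df_jo.csv' and the 'nlp-group/sindi-bert-final'
# checkpoint used above are both available locally):
#
#   >>> answer, context = answer_question("What is breast cancer?")
#   >>> print(answer)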