# -*- coding: utf-8 -*-
"""gradio_sindi.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly

# libraries
"""

# Quote the version specifiers so the shell does not treat ">" as a redirect.
!pip install "gradio>=4.13.0"
!pip install accelerate
!pip install "transformers>=4.34"

import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

"""# data - text"""

splitted_df = pd.read_csv('/content/splitted_df_jo.csv')

"""# getting context"""

def remove_symbols(text):
    """Strip a fixed set of punctuation/newline characters and all non-ASCII characters."""
    remove_chars = "/()\n."
    cleaned_text = "".join(char for char in text if char not in remove_chars)

    # Remove non-ASCII characters
    pattern_ascii = r'[^\x00-\x7F]'  # matches any character outside the ASCII range
    filtered_text = re.sub(pattern_ascii, '', cleaned_text)

    return filtered_text

def context_func(message):
    """Return the corpus section most similar to the question, using TF-IDF + cosine similarity."""
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Convert the corpus sections and the question to TF-IDF vectors
    text_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
    question_tfidf = vectorizer.transform([message])

    # Calculate cosine similarity between the question and each section
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # Pick the most similar section and clean it up
    most_similar_index = similarities.argmax()
    most_similar_context = splitted_df["section_text"][most_similar_index]
    most_similar_context = remove_symbols(most_similar_context)

    return most_similar_context

"""# the model"""

!huggingface-cli login

tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")

def answer_question(question):
    context = context_func(question)

    # Tokenize the question/context pair, truncating to the model's 512-token limit
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Run the model; no gradients are needed at inference time
    with torch.no_grad():
        outputs = model(**inputs)

    # The answer span runs from the argmax of the start logits to the argmax of the end logits
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end],
                              skip_special_tokens=True)

    return answer, context

# answer_question returns (answer, context), so the interface needs two output components.
iface = gr.Interface(fn=answer_question,
                     inputs=["text"],
                     outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
                     title="Women Cancer ChatBot",
                     description="How can I help you?",
                     examples=[
                         ["What is breast cancer?"],
                         ["What are treatments for cervical cancer?"]
                     ])
iface.launch(debug=True)
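
"""# quick sanity check (optional)

A minimal sketch for exercising the retrieval + QA pipeline outside the Gradio
UI. It assumes the cells above have already run (CSV loaded, model downloaded);
the sample question is illustrative only.
"""

sample_question = "What is breast cancer?"  # hypothetical example query
sample_answer, sample_context = answer_question(sample_question)
print("Retrieved context:", sample_context[:200], "...")
print("Model answer:", sample_answer)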