# -*- coding: utf-8 -*-
"""gradio_bert.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly

# libraries
"""

import re

import gradio as gr
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

"""# data - text"""

splitted_df = pd.read_csv('/content/splitted_df_jo.csv')

"""# getting context"""

def remove_symbols(text: str) -> str:
    """
    Removes specified symbols and non-ASCII characters from the input text.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text with the specified symbols and non-ASCII characters removed.

    Example:
        >>> text = "This is a test string / with (some) symbols.\\nAnd some non-ASCII characters like é and ñ."
        >>> clean_text = remove_symbols(text)
        >>> print(clean_text)
        This is a test string  with some symbolsAnd some non-ASCII characters like  and
    """
    remove_list = ['/', '(', ')', '\n', '.']
    remove_chars = "".join(remove_list)
    cleaned_text = "".join(char for char in text if char not in remove_chars)

    # Remove non-ASCII characters
    pattern_ascii = r'[^\x00-\x7F]'  # Matches any character outside the ASCII range
    filtered_text = re.sub(pattern_ascii, '', cleaned_text)

    return filtered_text

def context_func(message: str) -> str:
    """
    Finds the most similar context from a collection of texts based on
    TF-IDF vectorization and cosine similarity.

    Args:
        message (str): The input message or question.

    Returns:
        str: The most similar context to the input message from the collection of texts.

    Example:
        >>> message = "What are the symptoms of breast cancer?"
        >>> similar_context = context_func(message)
        >>> print(similar_context)
        Breast cancer is the most common cancer among women worldwide...
    """
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Convert the section texts and the question to TF-IDF vectors
    text_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
    question_tfidf = vectorizer.transform([message])

    # Calculate cosine similarity between the question and each section text
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # Find the index of the most similar section text
    most_similar_index = similarities.argmax()

    # Get the most similar section text and clean it
    most_similar_context = splitted_df["section_text"][most_similar_index]
    most_similar_context = remove_symbols(most_similar_context)

    return most_similar_context

"""# the model"""

# In the notebook this cell ran the shell command `!huggingface-cli login`,
# which is not valid Python in a script. Authenticate beforehand instead,
# e.g. run `huggingface-cli login` in a terminal or call
# `huggingface_hub.login(token=...)`.

tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")

def answer_question(question: str) -> tuple[str, str]:
    """
    Generates an answer to the input question based on the retrieved context.

    Args:
        question (str): The input question.

    Returns:
        tuple: A tuple containing the generated answer and the context used for answering.

    Example:
        >>> question = "What is the capital of France?"
        >>> answer, context = answer_question(question)
        >>> print("Answer:", answer)
        >>> print("Context:", context)
    """
    context = context_func(question)

    # Tokenize the question together with its retrieved context
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Get the start/end span scores from the model (no gradients needed at inference)
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # Pick the most likely start and end token positions
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1

    # Decode the answer span back to a string
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])

    return answer, context

# answer_question returns two values, so the interface needs two output components
iface = gr.Interface(
    fn=answer_question,
    inputs=["text"],
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
    title="Women Cancer ChatBot",
    description="How can I help you?",
    examples=[
        ["What is breast cancer?"],
        ["What are treatments for cervical cancer?"],
    ],
)

iface.launch(debug=True)
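
"""# quick local check"""

# A minimal sketch for smoke-testing the retrieval + QA steps without the
# Gradio UI. The question below is illustrative; real answers depend on the
# contents of splitted_df_jo.csv. Uncomment and run instead of iface.launch().
#
# question = "What are the treatments for breast cancer?"
# answer, context = answer_question(question)
# print("Answer:", answer)
# print("Context snippet:", context[:200])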