# -*- coding: utf-8 -*-
"""gradio_bert.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly
# libraries
"""
import re

import gradio as gr
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
"""# data - text"""
# The pre-split document sections. The original Colab notebook read this from
# /content/splitted_df_jo.csv; a relative path is used here so the CSV can sit
# next to the script when it runs outside Colab.
splitted_df = pd.read_csv('splitted_df_jo.csv')
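# A minimal sanity check, assuming the CSV exposes a "section_text" column
# (the only column the functions below rely on):
assert "section_text" in splitted_df.columns, "CSV must contain a 'section_text' column"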
"""# getting context"""
def remove_symbols(text: str) -> str:
    """
    Removes specified symbols and non-ASCII characters from the input text.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text with the symbols '/', '(', ')', '\\n', '.' and
        all non-ASCII characters removed.

    Example:
        >>> text = "This is a test string / with (some) symbols.\nAnd some non-ASCII characters like é and ñ."
        >>> remove_symbols(text)
        'This is a test string  with some symbolsAnd some non-ASCII characters like  and '
    """
    remove_list = ['/', '(', ')', '\n', '.']
    remove_chars = "".join(remove_list)
    cleaned_text = "".join(char for char in text if char not in remove_chars)

    # Remove non-ASCII characters (anything outside the \x00-\x7F range)
    pattern_ascii = r'[^\x00-\x7F]'
    filtered_text = re.sub(pattern_ascii, '', cleaned_text)

    return filtered_text
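# Illustrative call (assumed input, not from the original notebook):
#   remove_symbols("Stage (II) tumors.\n")  ->  "Stage II tumors"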
def context_func(message: str) -> str:
    """
    Finds the most similar context from a collection of texts based on TF-IDF vectorization and cosine similarity.

    Args:
        message (str): The input message or question.

    Returns:
        str: The most similar context to the input message from the collection of texts.

    Example:
        >>> message = "What are the symptoms of breast cancer?"
        >>> similar_context = context_func(message)
        >>> print(similar_context)
        Breast cancer is the most common cancer among women worldwide...
    """
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Convert the document sections and the question to TF-IDF vectors
    text_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
    question_tfidf = vectorizer.transform([message])

    # Calculate cosine similarity between the question and each section
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # Find the index of the most similar section
    most_similar_index = similarities.argmax()

    # Get and clean the most similar section
    most_similar_context = splitted_df["section_text"][most_similar_index]
    most_similar_context = remove_symbols(most_similar_context)

    return most_similar_context
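# Note: context_func re-fits the TF-IDF vectorizer over the whole corpus on
# every call. A sketch of fitting once at import time instead (an optimization
# assumption, not part of the original notebook):
#
#   _vectorizer = TfidfVectorizer()
#   _corpus_tfidf = _vectorizer.fit_transform(splitted_df["section_text"])
#
#   def context_func(message: str) -> str:
#       question_tfidf = _vectorizer.transform([message])
#       idx = cosine_similarity(question_tfidf, _corpus_tfidf)[0].argmax()
#       return remove_symbols(splitted_df["section_text"][idx])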
"""# the model"""
# The original notebook ran `!huggingface-cli login` here; that shell magic is
# not valid Python in a plain script. If the model repo requires
# authentication, log in beforehand (e.g. `huggingface-cli login` in a
# terminal, or set the HF_TOKEN environment variable).
tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")
def answer_question(question: str) -> tuple[str, str]:
    """
    Generates an answer to the input question based on the most similar retrieved context.

    Args:
        question (str): The input question.

    Returns:
        tuple: A tuple containing the generated answer and the context used for answering.

    Example:
        >>> question = "What is the capital of France?"
        >>> answer, context = answer_question(question)
        >>> print("Answer:", answer)
        >>> print("Context:", context)
    """
    context = context_func(question)

    # Tokenize the question/context pair
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Run the model (no gradients needed at inference time)
    with torch.no_grad():
        outputs = model(**inputs)

    # Pick the most likely answer span from the start/end logits
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])

    return answer, context
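# Quick local smoke test (hypothetical question; requires the CSV and model
# to have loaded successfully):
#   answer, context = answer_question("What are common treatments for breast cancer?")
#   print("Answer:", answer)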
# answer_question returns two values, so the interface declares two outputs;
# with only one output Gradio would raise an error at inference time.
iface = gr.Interface(fn=answer_question,
                     inputs=[gr.Textbox(label="Question")],
                     outputs=[gr.Textbox(label="Answer"),
                              gr.Textbox(label="Context")],
                     title="Women Cancer ChatBot",
                     description="How can I help you?",
                     examples=[
                         ["What is breast cancer?"],
                         ["What are treatments for cervical cancer?"]
                     ])
iface.launch(debug=True)