# -*- coding: utf-8 -*-
"""gradio_bert.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly
# libraries
"""
import re

import gradio as gr
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

"""# data - text"""
splitted_df = pd.read_csv('/content/splitted_df_jo.csv')
"""# getting context"""
def remove_symbols(text: str) -> str:
    """
    Removes specified symbols and non-ASCII characters from the input text.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text with specified symbols and non-ASCII characters removed.

    Example:
        >>> text = "This is a test string / with (some) symbols.\nAnd some non-ASCII characters like é and ñ."
        >>> clean_text = remove_symbols(text)
        >>> print(clean_text)
        This is a test string  with some symbolsAnd some non-ASCII characters like  and
    """
    remove_list = ['/', '(', ')', '\n', '.']
    remove_chars = "".join(remove_list)
    cleaned_text = "".join([char for char in text if char not in remove_chars])

    # Remove non-ASCII characters
    pattern_ascii = r'[^\x00-\x7F]'  # Matches any character outside the ASCII range
    filtered_text = re.sub(pattern_ascii, '', cleaned_text)

    return filtered_text
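
# Equivalent one-pass sketch: the symbol list and the non-ASCII filter above
# can be folded into a single regex (same character set as remove_list):
#
#     re.sub(r"[/()\n.]|[^\x00-\x7F]", "", text)
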
def context_func(message: str) -> str:
    """
    Finds the most similar context from a collection of texts based on TF-IDF vectorization and cosine similarity.

    Args:
        message (str): The input message or question.

    Returns:
        str: The most similar context to the input message from the collection of texts.

    Example:
        >>> message = "What are the symptoms of breast cancer?"
        >>> similar_context = context_func(message)
        >>> print(similar_context)
        Breast cancer is the most common cancer among women worldwide...
    """
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Convert the section texts and the question to TF-IDF vectors
    text_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
    question_tfidf = vectorizer.transform([message])

    # Calculate cosine similarity between the question and each section text
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # Find the most similar section (argmax is positional, so index with .iloc)
    most_similar_index = similarities.argmax()
    most_similar_context = splitted_df["section_text"].iloc[most_similar_index]
    most_similar_context = remove_symbols(most_similar_context)

    return most_similar_context
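
# Performance note: context_func refits the TF-IDF vectorizer on every call.
# A minimal sketch of fitting once at load time instead (same results,
# assuming splitted_df does not change after loading):
#
#     _vectorizer = TfidfVectorizer()
#     _text_tfidf = _vectorizer.fit_transform(splitted_df["section_text"])
#
#     def context_func(message: str) -> str:
#         question_tfidf = _vectorizer.transform([message])
#         similarities = cosine_similarity(question_tfidf, _text_tfidf)[0]
#         return remove_symbols(splitted_df["section_text"].iloc[similarities.argmax()])
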
"""# the model"""
!huggingface-cli login
tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")
def answer_question(question: str) -> tuple[str, str]:
    """
    Generates an answer to the input question based on the retrieved context.

    Args:
        question (str): The input question.

    Returns:
        tuple: A tuple containing the generated answer and the context used for answering.

    Example:
        >>> question = "What is the capital of France?"
        >>> answer, context = answer_question(question)
        >>> print("Answer:", answer)
        >>> print("Context:", context)
    """
    context = context_func(question)

    # Tokenize the inputs
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Get the answer span from the model (no gradients needed at inference)
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])

    return answer, context
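
# Note: when the model finds no answer, the argmax can land on the [CLS]
# position and the decoded span may be empty or a special token. A minimal
# guard sketch (the fallback message is an assumption, not part of the app):
#
#     if not answer.strip() or answer in tokenizer.all_special_tokens:
#         answer = "Sorry, I couldn't find an answer in the retrieved context."
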
iface = gr.Interface(
    fn=answer_question,
    inputs=["text"],
    # answer_question returns two values, so the interface needs two outputs.
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
    title="Women Cancer ChatBot",
    description="How can I help you?",
    examples=[
        ["What is breast cancer?"],
        ["What are treatments for cervical cancer?"],
    ],
)

iface.launch(debug=True)
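
# A public share link can be requested instead when running from Colab:
#     iface.launch(share=True)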