File size: 3,006 Bytes
b942bef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# -*- coding: utf-8 -*-
"""gradio_sindi.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly
# libraries
"""
!pip install gradio>=4.13.0
!pip install accelerate
!pip install transformers>=4.34
import gradio as gr
import torch
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import re
"""# data - text"""
# Load the corpus used for retrieval; context_func reads its "section_text" column.
# NOTE(review): the absolute /content/ path looks like a Colab working
# directory - confirm before running this anywhere else.
splitted_df = pd.read_csv('/content/splitted_df_jo.csv')
"""# getting context"""
def remove_symbols(text):
    """Strip selected punctuation and all non-ASCII characters from *text*.

    The characters ``/``, ``(``, ``)``, newline and ``.`` are dropped first;
    whatever remains is then purged of every character outside the ASCII
    range.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    stripped_chars = '/()\n.'
    without_symbols = "".join(
        filter(lambda ch: ch not in stripped_chars, text)
    )
    # [^\x00-\x7F] matches any single character above code point 127.
    return re.sub(r'[^\x00-\x7F]', '', without_symbols)
# Single-slot cache holding (source DataFrame, fitted vectorizer, TF-IDF matrix).
_TFIDF_CACHE = {}


def _section_index(corpus_df):
    """Return a fitted vectorizer and TF-IDF matrix for ``corpus_df["section_text"]``.

    The vectorizer is fitted only once per DataFrame object and then reused,
    because the corpus does not depend on the question being asked.

    NOTE: the cache is keyed on object identity, so in-place edits to the
    DataFrame are not detected; rebinding the variable to a new DataFrame
    triggers a refit.
    """
    entry = _TFIDF_CACHE.get("entry")
    if entry is None or entry[0] is not corpus_df:
        vectorizer = TfidfVectorizer()
        section_matrix = vectorizer.fit_transform(corpus_df["section_text"])
        entry = (corpus_df, vectorizer, section_matrix)
        _TFIDF_CACHE["entry"] = entry
    return entry[1], entry[2]


def context_func(message):
    """Return the corpus section most similar to *message*, cleaned.

    Similarity is the cosine similarity between the TF-IDF vector of
    *message* and the TF-IDF vectors of ``splitted_df["section_text"]``.
    The best-scoring section is passed through ``remove_symbols`` before
    being returned.

    Args:
        message: The user's question.

    Returns:
        The cleaned text of the highest-scoring section.
    """
    vectorizer, section_matrix = _section_index(splitted_df)

    # Project the question into the vocabulary learned from the corpus.
    question_vector = vectorizer.transform([message])
    similarities = cosine_similarity(question_vector, section_matrix)[0]

    # argmax yields a row *position*; .iloc keeps the lookup positional even
    # if the DataFrame index is not the default 0..n-1 range.
    best_position = int(similarities.argmax())
    best_section = splitted_df["section_text"].iloc[best_position]
    return remove_symbols(best_section)
"""# the model"""
# Log in to the Hugging Face Hub through its CLI (IPython shell escape).
# NOTE(review): presumably required because the checkpoint below is private
# or gated - confirm; a public checkpoint would not need this step.
!huggingface-cli login
# Load the tokenizer and the question-answering model from the same
# checkpoint; answer_question uses both.
tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")
def answer_question(question):
    """Answer *question* using the best-matching corpus section as context.

    The context is retrieved with ``context_func``; question and context are
    tokenized as a pair (truncated to 512 tokens) and fed to the
    question-answering model. The answer is the token span between the
    highest-scoring start position and the highest-scoring end position.

    Args:
        question: The user's question.

    Returns:
        A tuple ``(answer, context)``: the decoded answer span (an empty
        string when the predicted span is empty or inverted) and the context
        it was extracted from.
    """
    context = context_func(question)

    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = int(torch.argmax(outputs.start_logits))
    # +1 because the end index is inclusive in the model's output but
    # exclusive in a Python slice.
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    if answer_end <= answer_start:
        # The model placed the end before the start: there is no valid span.
        return "", context

    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
    # Drop markers such as [CLS]/[SEP] so they never reach the user.
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True)
    return answer, context
# Build and launch the Gradio UI around answer_question.
iface = gr.Interface(
    fn=answer_question,
    inputs=["text"],
    # answer_question returns (answer, context), so declare one output
    # component per returned value.
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Context"),
    ],
    title="Women Cancer ChatBot",
    description="How can I help you?",
    examples=[
        ["What is breast cancer?"],
        ["What are treatments for cervical cancer?"],
    ],
)
iface.launch(debug=True)
|