Upload gradio_sindi.py with huggingface_hub
Browse files- gradio_sindi.py +98 -0
gradio_sindi.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""gradio_sindi.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colab.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly
|
8 |
+
|
9 |
+
# libraries
|
10 |
+
"""
|
11 |
+
|
12 |
+
# Dependencies. The notebook ran these through the "!" shell escape, which is
# not valid Python in a .py file; run them in a shell before starting the app.
# The version specifiers are quoted because an unquoted ">" is treated by the
# shell as an output redirection, which silently drops the version constraint.
#   pip install "gradio>=4.13.0"
#   pip install accelerate
#   pip install "transformers>=4.34"
|
17 |
+
|
18 |
+
import functools
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import pipeline
|
27 |
+
|
28 |
+
"""# data - text"""
|
29 |
+
|
30 |
+
# Load the text sections used for retrieval; context_func reads the
# "section_text" column of this frame.
# NOTE(review): '/content/...' looks like a Colab filesystem path - confirm the
# CSV exists at this location wherever the script is actually run.
splitted_df = pd.read_csv('/content/splitted_df_jo.csv')
|
31 |
+
|
32 |
+
"""# getting context"""
|
33 |
+
|
34 |
+
def remove_symbols(text):
    """Strip a fixed set of symbols and all non-ASCII characters from *text*.

    Every '/', '(', ')', newline and '.' character is removed first; then any
    character outside the ASCII range (code point above 0x7F) is dropped.
    The remaining characters keep their original order.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # One pass through a translation table instead of a per-character loop.
    without_symbols = text.translate(str.maketrans("", "", "/()\n."))

    # Remove non-ASCII characters.
    return re.sub(r"[^\x00-\x7F]", "", without_symbols)
|
44 |
+
|
45 |
+
@functools.lru_cache(maxsize=1)
def _section_tfidf():
    """Fit TF-IDF on the section texts once and reuse the result.

    The corpus (``splitted_df["section_text"]``) is loaded once at module
    level in this file, so refitting the vectorizer for every question is
    repeated, invariant work.

    Returns:
        A ``(vectorizer, matrix)`` pair: the fitted ``TfidfVectorizer`` and
        the TF-IDF matrix of the section texts.
    """
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(splitted_df["section_text"])
    return vectorizer, matrix


def context_func(message):
    """Return the section text most similar to *message*, cleaned of symbols.

    Similarity is the cosine similarity between the TF-IDF vector of the
    message and the TF-IDF vector of each row of
    ``splitted_df["section_text"]``.

    Args:
        message: The user's question text.

    Returns:
        The best-matching section text after ``remove_symbols`` has been
        applied to it.
    """
    vectorizer, text_tfidf = _section_tfidf()
    question_tfidf = vectorizer.transform([message])

    # Similarity of the question against every section (one row per section).
    similarities = cosine_similarity(question_tfidf, text_tfidf)[0]

    # NOTE: when the question shares no vocabulary with any section, every
    # score is 0 and argmax falls back to the first section.
    most_similar_index = similarities.argmax()

    # argmax yields a position, so index positionally rather than by label.
    most_similar_context = splitted_df["section_text"].iloc[most_similar_index]

    return remove_symbols(most_similar_context)
|
64 |
+
|
65 |
+
"""# the model"""
|
66 |
+
|
67 |
+
# The notebook ran `huggingface-cli login` here through the "!" shell escape,
# which is not valid Python in a .py file. If the model repository requires
# authentication, run `huggingface-cli login` in a shell before starting this
# script.
tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")
|
71 |
+
|
72 |
+
def answer_question(question):
    """Answer *question* by extracting a span from the retrieved context.

    The context passage is selected by ``context_func``; the question and
    context are then fed as a pair to the question-answering model, and the
    tokens between the highest-scoring start and end positions are decoded.

    Args:
        question: The user's question text.

    Returns:
        A ``(answer, context)`` tuple: the decoded answer span and the
        context passage it was taken from. The answer is an empty string
        when the predicted end position falls before the start position.
    """
    context = context_func(question)

    # Tokenize the question/context pair, truncated to 512 tokens.
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    # +1 because the slice end below is exclusive.
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Drop markers such as [CLS]/[SEP] so they never reach the user.
    answer = tokenizer.decode(
        inputs["input_ids"][0][answer_start:answer_end],
        skip_special_tokens=True,
    )

    return answer, context
|
86 |
+
|
87 |
+
# Web UI: one text input, two text outputs. answer_question returns an
# (answer, context) pair, so there must be one output component per value;
# with a single output the whole tuple would be rendered in the Answer box.
iface = gr.Interface(
    fn=answer_question,
    inputs=["text"],
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
    title="Women Cancer ChatBot",
    description="How can I help you?",
    examples=[
        ["What is breast cancer?"],
        ["What are treatments for cervical cancer?"],
    ],
)

iface.launch(debug=True)
|
98 |
+
|