jrocha committed on
Commit
b942bef
1 Parent(s): 64dfc10

Upload gradio_sindi.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. gradio_sindi.py +98 -0
gradio_sindi.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """gradio_sindi.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly
8
+
9
+ # libraries
10
+ """
11
+
12
+ !pip install gradio>=4.13.0
13
+
14
+ !pip install accelerate
15
+
16
+ !pip install transformers>=4.34
17
+
18
+ import gradio as gr
19
+ import torch
20
+ from transformers import pipeline
21
+ import numpy as np
22
+ import pandas as pd
23
+ from sklearn.feature_extraction.text import TfidfVectorizer
24
+ from sklearn.metrics.pairwise import cosine_similarity
25
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering
26
+ import re
27
+
28
+ """# data - text"""
29
+
30
import os

# Path of the CSV holding the pre-split corpus (must provide a "section_text"
# column, which is what the retrieval step reads). The default is the original
# Colab location; set SINDI_DATA_PATH to point somewhere else when the file is
# not under /content.
DATA_PATH = os.environ.get("SINDI_DATA_PATH", "/content/splitted_df_jo.csv")

splitted_df = pd.read_csv(DATA_PATH)
31
+
32
+ """# getting context"""
33
+
34
# Translation table that deletes the symbols stripped from retrieved passages:
# '/', '(', ')', newline and '.'.
_SYMBOLS_TO_REMOVE = str.maketrans("", "", "/()\n.")

# Matches any character outside the ASCII range.
_NON_ASCII_PATTERN = re.compile(r'[^\x00-\x7F]')


def remove_symbols(text):
    """Return *text* without selected symbols and without non-ASCII characters.

    The characters '/', '(', ')', newline and '.' are deleted (not replaced by
    a space, so neighbouring words are joined), then every character outside
    the ASCII range is dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string yields an empty string.
    """
    # One pass over the string in C instead of a Python-level character loop.
    without_symbols = text.translate(_SYMBOLS_TO_REMOVE)
    return _NON_ASCII_PATTERN.sub('', without_symbols)
44
+
45
from functools import lru_cache


@lru_cache(maxsize=1)
def _fitted_tfidf_index():
    """Fit TF-IDF on the corpus once and return (vectorizer, section matrix).

    The corpus does not depend on the question, so the fit is cached rather
    than being repeated for every incoming message. This assumes
    ``splitted_df`` is not modified after the first call.
    """
    vectorizer = TfidfVectorizer()
    section_matrix = vectorizer.fit_transform(splitted_df["section_text"])
    return vectorizer, section_matrix


def context_func(message):
    """Return the corpus section most similar to *message*, cleaned for QA.

    The question is projected into the TF-IDF space fitted on
    ``splitted_df["section_text"]`` and compared to every section by cosine
    similarity; the highest-scoring section is passed through
    ``remove_symbols`` and returned.

    Args:
        message: The user's question as a string.

    Returns:
        The cleaned text of the best-matching section.
    """
    vectorizer, section_matrix = _fitted_tfidf_index()

    # Vectorize the question with the vocabulary learned from the corpus.
    question_vector = vectorizer.transform([message])

    # One similarity score per section; [0] selects the row for the question.
    similarities = cosine_similarity(question_vector, section_matrix)[0]

    # argmax is a row position, so use positional indexing: a label lookup
    # would pick the wrong row (or fail) if the frame's index is not 0..n-1.
    best_position = int(similarities.argmax())
    best_section = splitted_df["section_text"].iloc[best_position]

    return remove_symbols(best_section)
64
+
65
+ """# the model"""
66
+
67
+ !huggingface-cli login
68
+
69
+ tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
70
+ model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")
71
+
72
def answer_question(question):
    """Answer *question* by extracting a span from the best-matching section.

    The context is retrieved with ``context_func``; question and context are
    tokenized as a pair (truncated to 512 tokens) and fed to the
    question-answering model. The answer is the token span between the
    highest-scoring start position and the highest-scoring end position.

    Args:
        question: The user's question as a string.

    Returns:
        A tuple ``(answer, context)``: the decoded answer text (empty when the
        predicted span is empty or inverted) and the context it was drawn from.
    """
    context = context_func(question)

    # Tokenize the question/context pair.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = int(torch.argmax(outputs.start_logits))
    # +1 because the end index is inclusive and slicing is exclusive.
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    # The start and end are predicted independently, so the end may precede
    # the start; that is an empty answer.
    if answer_end <= answer_start:
        return "", context

    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
    # Drop special tokens so markers such as [CLS]/[SEP] never reach the user.
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True)

    return answer, context
86
+
87
# Gradio UI around answer_question. The function returns two values
# (answer, context), so two output components are declared; with a single
# output component the pair is not split into separate fields.
iface = gr.Interface(
    fn=answer_question,
    inputs=["text"],
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Context"),
    ],
    title="Women Cancer ChatBot",
    description="How can I help you?",
    examples=[
        ["What is breast cancer?"],
        ["What are treatments for cervical cancer?"],
    ],
)

# debug=True keeps the process attached and prints errors to the console.
iface.launch(debug=True)
98
+