jrocha committed
Commit 5499fd6
1 Parent(s): b942bef

Rename gradio_sindi.py to gradio_bert.py

Files changed (1)
  1. gradio_sindi.py → gradio_bert.py +52 -11
gradio_sindi.py → gradio_bert.py RENAMED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""gradio_sindi.ipynb
+"""gradio_bert.ipynb
 
 Automatically generated by Colab.
 
@@ -9,11 +9,6 @@ Original file is located at
 # libraries
 """
 
-!pip install gradio>=4.13.0
-
-!pip install accelerate
-
-!pip install transformers>=4.34
 
 import gradio as gr
 import torch
@@ -31,7 +26,22 @@ splitted_df = pd.read_csv('/content/splitted_df_jo.csv')
 
 """# getting context"""
 
-def remove_symbols(text):
+def remove_symbols(text: str) -> str:
+    """
+    Removes specified symbols and non-ASCII characters from the input text.
+
+    Args:
+        text (str): The input text to be cleaned.
+
+    Returns:
+        str: The cleaned text with specified symbols and non-ASCII characters removed.
+
+    Example:
+        >>> text = "This is a test string / with (some) symbols.\nAnd some non-ASCII characters like é and ñ."
+        >>> clean_text = remove_symbols(text)
+        >>> print(clean_text)
+        This is a test string with some symbolsAnd some non-ASCII characters like and
+    """
     remove_list = ['/', '(', ')', '\n', '.']
     remove_chars = "".join(remove_list)
     cleaned_text = "".join([char for char in text if char not in remove_chars])
@@ -42,7 +52,23 @@ def remove_symbols(text):
 
     return filtered_text
 
-def context_func(message):
+
+def context_func(message: str) -> str:
+    """
+    Finds the most similar context from a collection of texts based on TF-IDF vectorization and cosine similarity.
+
+    Args:
+        message (str): The input message or question.
+
+    Returns:
+        str: The most similar context to the input message from the collection of texts.
+
+    Example:
+        >>> message = "What are the symptoms of breast cancer?"
+        >>> similar_context = context_func(message)
+        >>> print(similar_context)
+        Breast cancer is the most common cancer among women worldwide...
+    """
     # Create a TF-IDF vectorizer
     vectorizer = TfidfVectorizer()
 
@@ -69,11 +95,26 @@ def context_func(message):
 tokenizer = AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
 model = AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")
 
-def answer_question(question):
+def answer_question(question: str) -> tuple[str, str]:
+    """
+    Generates an answer to the input question based on the retrieved context.
+
+    Args:
+        question (str): The input question.
+
+    Returns:
+        tuple: A tuple containing the generated answer and the context used for answering.
+
+    Example:
+        >>> question = "What is the capital of France?"
+        >>> answer, context = answer_question(question)
+        >>> print("Answer:", answer)
+        >>> print("Context:", context)
+    """
     context = context_func(question)
     # Tokenize the inputs
     inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)
-
+
     # Get the answer from the model
     outputs = model(**inputs)
     answer_start_scores = outputs.start_logits
@@ -81,7 +122,7 @@ def answer_question(question):
     answer_start = torch.argmax(answer_start_scores)
     answer_end = torch.argmax(answer_end_scores) + 1
     answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
-
+
     return answer, context
 
 iface = gr.Interface(fn=answer_question,
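
The final hunk cuts off before the remaining gr.Interface arguments, so the interface wiring is not fully visible in this diff. A minimal sketch of how the call might be completed and the app launched, assuming one textbox input and two textbox outputs to match the (answer, context) pair returned by answer_question; the labels, title, and launch settings below are illustrative, not taken from the commit:

# Hypothetical completion of the truncated gr.Interface(...) call above.
# Labels, title, and launch settings are assumptions, not part of this commit.
iface = gr.Interface(
    fn=answer_question,                  # returns (answer, context)
    inputs=gr.Textbox(label="Question"),
    outputs=[
        gr.Textbox(label="Answer"),      # span decoded by the QA model
        gr.Textbox(label="Context"),     # passage retrieved via TF-IDF
    ],
    title="Sindi BERT Q&A",
)

if __name__ == "__main__":
    iface.launch()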