wissamantoun committed on
Commit
90afd57
1 Parent(s): c64d018

added sarcasm and qa with logging

Browse files
app.py CHANGED
@@ -1,22 +1,28 @@
1
  import awesome_streamlit as ast
2
  import streamlit as st
3
 
4
- from backend.utils import get_current_ram_usage
5
 
6
  import backend.aragpt
7
  import backend.home
8
  import backend.processor
9
  import backend.sa
 
 
10
 
11
  st.set_page_config(
12
  page_title="TEST", page_icon="📖", initial_sidebar_state="expanded", layout="wide"
13
  )
14
 
 
 
15
  PAGES = {
16
  "Home": backend.home,
17
  "Arabic Text Preprocessor": backend.processor,
18
  "Arabic Language Generation": backend.aragpt,
19
  "Arabic Sentiment Analysis": backend.sa,
 
 
20
  }
21
 
22
 
 
1
  import awesome_streamlit as ast
2
  import streamlit as st
3
 
4
+ from backend.utils import get_current_ram_usage, ga
5
 
6
  import backend.aragpt
7
  import backend.home
8
  import backend.processor
9
  import backend.sa
10
+ import backend.qa
11
+ import backend.sarcasm
12
 
13
  st.set_page_config(
14
  page_title="TEST", page_icon="📖", initial_sidebar_state="expanded", layout="wide"
15
  )
16
 
17
+ ga(st.__file__)
18
+
19
  PAGES = {
20
  "Home": backend.home,
21
  "Arabic Text Preprocessor": backend.processor,
22
  "Arabic Language Generation": backend.aragpt,
23
  "Arabic Sentiment Analysis": backend.sa,
24
+ "Arabic Sarcasm Detection": backend.sarcasm,
25
+ "Arabic Question Answering": backend.qa,
26
  }
27
 
28
 
backend/home.py CHANGED
@@ -14,7 +14,8 @@ def write():
14
  - Arabic Text Preprocessor: Test how text imput is treated by our preprocessor
15
  - Arabic Language Generation: Generate Arabic text using our AraGPT2 language models
16
  - Arabic Sentiment Analysis: Test the senitment analysis model that won the [Arabic Senitment Analysis competition @ KAUST](https://www.kaggle.com/c/arabic-sentiment-analysis-2021-kaust)
17
- - Arabic Masked Language Modeling: Test our AraBERT models MLM capabilities
 
18
  """
19
  )
20
  st.markdown("#")
 
14
  - Arabic Text Preprocessor: Test how text imput is treated by our preprocessor
15
  - Arabic Language Generation: Generate Arabic text using our AraGPT2 language models
16
  - Arabic Sentiment Analysis: Test the senitment analysis model that won the [Arabic Senitment Analysis competition @ KAUST](https://www.kaggle.com/c/arabic-sentiment-analysis-2021-kaust)
17
+ - Arabic Sarcasm Detection: Test MARBERT trained for sarcasm detection
18
+ - Arabic Question Answering: Test our AraELECTRA QA capabilities
19
  """
20
  )
21
  st.markdown("#")
backend/qa.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st

# annotate_answer lives in qa_utils; get_qa_answers is defined in services.
from .qa_utils import annotate_answer
from .services import get_qa_answers

# Right-align paragraphs, inputs and labels so the Arabic UI reads naturally.
_RTL_STYLE = """
<style>
p, div, input, label {
text-align: right;
}
</style>
"""


def write():
    """Render the Arabic question-answering page.

    Exposed as ``write()`` so the page can be registered in the app's PAGES
    mapping like the other backend pages, instead of drawing widgets as a
    side effect of importing this module.
    """
    _, col1, _ = st.beta_columns(3)

    with col1:
        # The logo is shipped under images/ in the repository.
        st.image("images/is2alni_logo.png", width=200)
        st.title("إسألني أي شيء")

    st.markdown(_RTL_STYLE, unsafe_allow_html=True)

    st.sidebar.header("Info")
    # NOTE(review): path is resolved relative to the working directory;
    # confirm where AraELECTRA.png actually lives.
    st.sidebar.image("AraELECTRA.png", width=150)
    st.sidebar.write("Powered by [AraELECTRA](https://github.com/aub-mind/arabert)")

    st.sidebar.write("\n")
    n_answers = st.sidebar.slider(
        "Max. number of answers", min_value=1, max_value=10, value=2, step=1
    )

    question = st.text_input("", value="من هو جو بايدن؟")
    # Make sure the query ends up containing an Arabic question mark.
    if "؟" not in question:
        question += "؟"

    run_query = st.button("أجب")
    if run_query:
        # https://discuss.streamlit.io/t/showing-a-gif-while-st-spinner-runs/5084
        with st.spinner("... جاري البحث "):
            results_dict = get_qa_answers(question)

        if len(results_dict) > 0:
            st.write("## :الأجابات هي")
            for result in results_dict["results"][:n_answers]:
                annotate_answer(result)
                # A bare string expression is only rendered by Streamlit
                # "magic" in the main script, so write the link explicitly.
                st.write(f"[**المصدر**](<{result['link']}>)")
        else:
            st.write("## 😞 ليس لدي جواب")
backend/qa_utils.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit.components.v1
2
+
3
+ from htbuilder import HtmlElement, div, span, styles
4
+ from htbuilder.units import px, rem, em
5
+
6
+
7
def annotation(body, label="", background="#ddd", color="#333", **style):
    """Return an HtmlElement "chip" showing ``body`` followed by a small label.

    Rendered, the element looks roughly like::

        [body | label]

    Parameters
    ----------
    body : string
        Main text of the chip.
    label : string
        Secondary text shown next to the body in a smaller, faded font.
    background : string
        CSS color used as the chip background.
    color : string
        CSS color used for both the body and the label text.
    **style : dict
        Extra CSS properties applied to the outer chip. ``font_family``
        defaults to ``"sans-serif"`` when not supplied.

    Examples
    --------

    >>> annotation("apple", "fruit")

    >>> annotation("apple", "fruit", background="#FF0", color="black")

    >>> annotation("apple", "fruit", background="#FF0", border="1px dashed red")

    """
    style.setdefault("font_family", "sans-serif")

    # Outer chip: colored, rounded, inline-flex container.
    chip_css = styles(
        background=background,
        border_radius=rem(0.33),
        color=color,
        padding=(rem(0.17), rem(0.67)),
        display="inline-flex",
        justify_content="center",
        align_items="center",
        **style,
    )

    # Inner label: smaller, half-transparent, upper-cased.
    label_css = styles(
        color=color,
        font_size=em(0.67),
        opacity=0.5,
        padding_left=rem(0.5),
        text_transform="uppercase",
        margin_bottom=px(-2),
    )

    label_element = span(style=label_css)(label)
    return span(style=chip_css)(body, label_element)
71
+
72
+
73
def annotated_text(*args, **kwargs):
    """Write text with annotations into your Streamlit app.

    Parameters
    ----------
    *args : str, tuple or htbuilder.HtmlElement
        Arguments can be:
        - strings, to draw the string as-is on the screen.
        - tuples of the form (main_text, annotation_text, background, color) where
          background and foreground colors are optional and should be a CSS-valid
          string such as "#aabbcc" or "rgb(10, 20, 30)"
        - HtmlElement objects in case you want to customize the annotations further.
          In particular, you can use the `annotation()` function from this module to
          easily produce annotations whose CSS you can customize via keyword arguments.
    **kwargs
        Forwarded unchanged to ``streamlit.components.v1.html``.

    Raises
    ------
    TypeError
        If an argument is not a str, tuple or HtmlElement.

    Examples
    --------

    >>> annotated_text(
    ...     "This ",
    ...     ("is", "verb", "#8ef"),
    ...     " some ",
    ...     ("annotated", "adj", "#faa"),
    ...     ("text", "noun", "#afa"),
    ...     " for those of ",
    ...     ("you", "pronoun", "#fea"),
    ...     " who ",
    ...     ("like", "verb", "#8ef"),
    ...     " this sort of ",
    ...     ("thing", "noun", "#afa"),
    ... )

    >>> annotated_text(
    ...     "Hello ",
    ...     annotation("world!", "noun", color="#8ef", border="1px dashed red"),
    ... )

    """
    out = div(
        style=styles(
            font_family="sans-serif",
            line_height="1.45",
            font_size=px(16),
            text_align="right",
        )
    )

    for arg in args:
        if isinstance(arg, (str, HtmlElement)):
            # NOTE(review): strings are inserted into the HTML as given; if they
            # can carry untrusted markup, confirm whether escaping is needed.
            out(arg)
        elif isinstance(arg, tuple):
            out(annotation(*arg))
        else:
            raise TypeError(
                "annotated_text() arguments must be str, tuple or HtmlElement, "
                f"got {type(arg).__name__}"
            )

    streamlit.components.v1.html(str(out), **kwargs)
134
+
135
+
136
def shorten_text(text, n, reverse=False):
    """Truncate ``text`` on word boundaries to roughly ``n`` characters.

    Words are taken greedily from the start of the text (or from the end when
    ``reverse`` is true) and joined with single spaces for as long as the
    joined length does not exceed ``n``. The first word is always kept, even
    if it alone is longer than ``n``.

    Parameters
    ----------
    text : str
        Text to shorten. Empty or whitespace-only text is returned unchanged.
    n : int
        Maximum length of the joined words.
    reverse : bool
        When true, keep the tail of the text instead of its head; the kept
        words are still returned in their original reading order.

    Returns
    -------
    str
        The kept words joined by single spaces (runs of whitespace collapse).
    """
    if text.isspace() or text == "":
        return text

    words = text.split()
    if reverse:
        # Walk the words from the end instead of reversing every character.
        words.reverse()

    kept = [words[0]]
    length = len(words[0])
    for word in words[1:]:
        length += 1 + len(word)  # +1 for the joining space
        if length > n:
            break
        kept.append(word)

    if reverse:
        kept.reverse()
    return " ".join(kept)
152
+
153
+
154
def annotate_answer(result):
    """Render one QA result as context text with the answer highlighted.

    Reads ``result["original"]``, ``result["new_start"]``, ``result["new_end"]``
    and ``result["new_answer"]``. Up to about 500 characters of context are
    shown on each side of the answer, and the trailing context is suffixed
    with an "etc." marker.
    """
    source_text = result["original"]

    # Context before the answer: keep the words closest to the answer.
    before = shorten_text(source_text[: result["new_start"]], 500, reverse=True)
    answer_chip = (result["new_answer"], "جواب", "#8ef")
    # Context after the answer, followed by the "etc." marker.
    after = shorten_text(source_text[result["new_end"] :], 500) + " ...... إلخ"

    annotated_text(before, answer_chip, after)
backend/sarcasm.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from .sa import predictor
3
+
4
+
5
def write():
    """Render the Arabic sarcasm-detection page.

    Shows a short description and a text box; when "Predict" is pressed, the
    shared ``predictor`` classifies the text and the predicted label and both
    class scores are written to the page.
    """
    st.markdown(
        """
        # Arabic Sarcasm Detection

        This is a simple sarcasm detection app that uses the [MARBERT](https://huggingface.co/UBC-NLP/MARBERT) model trained on [ArSarcasm](https://github.com/iabufarha/ArSarcasm)
        """
    )

    input_text = st.text_input(
        "Enter your text here:",
    )
    if not st.button("Predict"):
        return

    with st.spinner("Predicting..."):
        # The predictor works on batches, so wrap the single input in a list.
        labels, all_scores = predictor.get_preds_from_sarcasm([input_text])
        st.write(f"Result: {labels[0]}")
        first_scores = all_scores[0]
        detailed_score = {
            "Sarcastic": first_scores[0],
            "Not_Sarcastic": first_scores[1],
        }
        st.write("All scores:")
        st.write(detailed_score)
backend/services.py CHANGED
@@ -13,6 +13,17 @@ from .preprocess import ArabertPreprocessor
13
  from .sa_utils import *
14
  from .utils import download_models, softmax
15
 
 
 
 
 
 
 
 
 
 
 
 
16
  logger = logging.getLogger(__name__)
17
  # Taken and Modified from https://huggingface.co/spaces/flax-community/chef-transformer/blob/main/app.py
18
  class TextGeneration:
@@ -72,6 +83,7 @@ class TextGeneration:
72
  do_sample: bool,
73
  num_beams: int,
74
  ):
 
75
  prompt = self.preprocessor.preprocess(prompt)
76
  return_full_text = False
77
  return_text = True
@@ -127,6 +139,9 @@ class TextGeneration:
127
  return "Something happened 🤷‍♂️!!"
128
  else:
129
  generated_text = generated_text[0]["generated_text"]
 
 
 
130
  return self.preprocessor.unpreprocess(generated_text)
131
 
132
  def query(self, payload, model_name):
@@ -219,7 +234,7 @@ class SentimentAnalyzer:
219
  preds_df = pd.DataFrame([])
220
  for i in range(0, 5):
221
  preds = []
222
- for s in tqdm(more_itertools.chunked(list(prep_texts), 128)):
223
  preds.extend(self.pipelines["sar_trial10"][i](s))
224
  preds_df[f"model_{i}"] = preds
225
 
@@ -245,55 +260,63 @@ class SentimentAnalyzer:
245
  return final_labels, final_scores
246
 
247
  def get_preds_from_a_model(self, texts: List[str], model_name):
 
 
248
 
249
- prep = self.processors[model_name]
 
 
 
 
 
 
250
 
251
- prep_texts = [prep.preprocess(x) for x in texts]
252
- if model_name == "sa_sarcasm":
253
- sarcasm_label, _ = self.get_preds_from_sarcasm(texts)
254
- sarcastic_map = {"Not_Sarcastic": "غير ساخر", "Sarcastic": "ساخر"}
255
- labeled_prep_texts = []
256
- for t, l in zip(prep_texts, sarcasm_label):
257
- labeled_prep_texts.append(sarcastic_map[l] + " [SEP] " + t)
258
 
259
- preds_df = pd.DataFrame([])
260
- for i in range(0, 5):
261
- preds = []
262
- for s in more_itertools.chunked(list(prep_texts), 128):
263
- preds.extend(self.pipelines[model_name][i](s))
264
- preds_df[f"model_{i}"] = preds
 
 
 
 
 
265
 
266
- final_labels = []
267
- final_scores = []
268
- final_scores_list = []
269
- for id, row in preds_df.iterrows():
270
- pos_total = 0
271
- neg_total = 0
272
- neu_total = 0
273
- for pred in row[2:]:
274
- pos_total += pred[0]["score"]
275
- neu_total += pred[1]["score"]
276
- neg_total += pred[2]["score"]
277
 
278
- pos_avg = pos_total / 5
279
- neu_avg = neu_total / 5
280
- neg_avg = neg_total / 5
281
-
282
- if model_name == "sa_no_aoa_in_neutral":
283
- final_labels.append(
284
- self.pipelines[model_name][0].model.config.id2label[
285
- np.argmax([neu_avg, neg_avg, pos_avg])
286
- ]
 
 
 
 
 
 
 
 
 
 
 
287
  )
288
  else:
289
- final_labels.append(
290
- self.pipelines[model_name][0].model.config.id2label[
291
- np.argmax([pos_avg, neu_avg, neg_avg])
292
- ]
293
- )
294
- final_scores.append(np.max([pos_avg, neu_avg, neg_avg]))
295
- final_scores_list.append((pos_avg, neu_avg, neg_avg))
296
-
297
  return final_labels, final_scores, final_scores_list
298
 
299
  def predict(self, texts: List[str]):
@@ -355,3 +378,139 @@ class SentimentAnalyzer:
355
  logger.info(f"Score: {final_ensemble_score}")
356
  logger.info(f"All Scores: {final_ensemble_all_score}")
357
  return final_ensemble_prediction, final_ensemble_score, final_ensemble_all_score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  from .sa_utils import *
14
  from .utils import download_models, softmax
15
 
16
+ from functools import lru_cache
17
+ from urllib.parse import unquote
18
+
19
+ import streamlit as st
20
+ import wikipedia
21
+ from codetiming import Timer
22
+ from fuzzysearch import find_near_matches
23
+ from googleapi import google
24
+ from transformers import AutoTokenizer
25
+
26
+
27
  logger = logging.getLogger(__name__)
28
  # Taken and Modified from https://huggingface.co/spaces/flax-community/chef-transformer/blob/main/app.py
29
  class TextGeneration:
 
83
  do_sample: bool,
84
  num_beams: int,
85
  ):
86
+ logger.info(f"Generating with {model_name}")
87
  prompt = self.preprocessor.preprocess(prompt)
88
  return_full_text = False
89
  return_text = True
 
139
  return "Something happened 🤷‍♂️!!"
140
  else:
141
  generated_text = generated_text[0]["generated_text"]
142
+
143
+ logger.info(f"Prompt: {prompt}")
144
+ logger.info(f"Generated text: {generated_text}")
145
  return self.preprocessor.unpreprocess(generated_text)
146
 
147
  def query(self, payload, model_name):
 
234
  preds_df = pd.DataFrame([])
235
  for i in range(0, 5):
236
  preds = []
237
+ for s in more_itertools.chunked(list(prep_texts), 128):
238
  preds.extend(self.pipelines["sar_trial10"][i](s))
239
  preds_df[f"model_{i}"] = preds
240
 
 
260
  return final_labels, final_scores
261
 
262
  def get_preds_from_a_model(self, texts: List[str], model_name):
263
+ try:
264
+ prep = self.processors[model_name]
265
 
266
+ prep_texts = [prep.preprocess(x) for x in texts]
267
+ if model_name == "sa_sarcasm":
268
+ sarcasm_label, _ = self.get_preds_from_sarcasm(texts)
269
+ sarcastic_map = {"Not_Sarcastic": "غير ساخر", "Sarcastic": "ساخر"}
270
+ labeled_prep_texts = []
271
+ for t, l in zip(prep_texts, sarcasm_label):
272
+ labeled_prep_texts.append(sarcastic_map[l] + " [SEP] " + t)
273
 
274
+ preds_df = pd.DataFrame([])
275
+ for i in range(0, 5):
276
+ preds = []
277
+ for s in more_itertools.chunked(list(prep_texts), 128):
278
+ preds.extend(self.pipelines[model_name][i](s))
279
+ preds_df[f"model_{i}"] = preds
 
280
 
281
+ final_labels = []
282
+ final_scores = []
283
+ final_scores_list = []
284
+ for id, row in preds_df.iterrows():
285
+ pos_total = 0
286
+ neg_total = 0
287
+ neu_total = 0
288
+ for pred in row[2:]:
289
+ pos_total += pred[0]["score"]
290
+ neu_total += pred[1]["score"]
291
+ neg_total += pred[2]["score"]
292
 
293
+ pos_avg = pos_total / 5
294
+ neu_avg = neu_total / 5
295
+ neg_avg = neg_total / 5
 
 
 
 
 
 
 
 
296
 
297
+ if model_name == "sa_no_aoa_in_neutral":
298
+ final_labels.append(
299
+ self.pipelines[model_name][0].model.config.id2label[
300
+ np.argmax([neu_avg, neg_avg, pos_avg])
301
+ ]
302
+ )
303
+ else:
304
+ final_labels.append(
305
+ self.pipelines[model_name][0].model.config.id2label[
306
+ np.argmax([pos_avg, neu_avg, neg_avg])
307
+ ]
308
+ )
309
+ final_scores.append(np.max([pos_avg, neu_avg, neg_avg]))
310
+ final_scores_list.append((pos_avg, neu_avg, neg_avg))
311
+ except RuntimeError as e:
312
+ if model_name == "sa_cnnbert":
313
+ return (
314
+ ["Neutral"] * len(texts),
315
+ [0.0] * len(texts),
316
+ [(0.0, 0.0, 0.0)] * len(texts),
317
  )
318
  else:
319
+ raise RuntimeError(e)
 
 
 
 
 
 
 
320
  return final_labels, final_scores, final_scores_list
321
 
322
  def predict(self, texts: List[str]):
 
378
  logger.info(f"Score: {final_ensemble_score}")
379
  logger.info(f"All Scores: {final_ensemble_all_score}")
380
  return final_ensemble_prediction, final_ensemble_score, final_ensemble_all_score
381
+
382
+
383
# Module-level setup for the question-answering feature; runs once at import.

# Select Arabic for the wikipedia client used in get_qa_answers.
wikipedia.set_lang("ar")

# Sets the Hugging Face tokenizers parallelism flag to "false"
# (presumably to avoid fork-related warnings — TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Preprocessor, tokenizer and QA pipeline all use the same checkpoint name.
preprocessor = ArabertPreprocessor("wissamantoun/araelectra-base-artydiqa")
logger.info("Loading QA Pipeline...")
# The standalone tokenizer is used below to measure section lengths in tokens.
tokenizer = AutoTokenizer.from_pretrained("wissamantoun/araelectra-base-artydiqa")
qa_pipe = pipeline("question-answering", model="wissamantoun/araelectra-base-artydiqa")
logger.info("Finished loading QA Pipeline...")
392
+
393
+
394
def _count_qa_tokens(text):
    """Return the number of tokens in ``text`` after QA preprocessing."""
    return len(tokenizer.tokenize(preprocessor.preprocess(text)))


@lru_cache(maxsize=100)
def get_qa_answers(question):
    """Answer ``question`` from the top Arabic Wikipedia hit of a web search.

    The first search result's Wikipedia page is split into sections, the
    sections are packed into contexts of roughly 384 tokens, and the QA
    pipeline is run on every context.

    Parameters
    ----------
    question : str
        The user's question.

    Returns
    -------
    dict
        ``{}`` when the search returns nothing, the lookup fails, or the page
        yields no usable text. Otherwise ``{"title": ..., "results": [...]}``
        where ``results`` is sorted by descending ``score`` and every result
        carries ``original``, ``new_start``, ``new_end``, ``new_answer`` and
        ``link`` in addition to the fields produced by the pipeline.

    Notes
    -----
    Results are memoized per question, including empty results, and the same
    dict object is returned to every caller, so callers must not mutate it.
    """
    logger.info("\n=================================================================")
    logger.info(f"Question: {question}")

    if "وسام أنطون" in question or "wissam antoun" in question.lower():
        return {
            "title": "Creator",
            "results": [
                {
                    "score": 1.0,
                    "new_start": 0,
                    "new_end": 12,
                    "new_answer": "My Creator 😜",
                    "original": "My Creator 😜",
                    "link": "https://github.com/WissamAntoun/",
                }
            ],
        }

    search_timer = Timer(
        "search and wiki", text="Search and Wikipedia Time: {:.2f}", logger=logging.info
    )
    try:
        search_timer.start()
        search_results = google.search(
            question + " site:ar.wikipedia.org", lang="ar", area="ar"
        )
        if len(search_results) == 0:
            return {}

        page_name = search_results[0].link.split("wiki/")[-1]
        wiki_page = wikipedia.page(unquote(page_name))
        wiki_page_content = wiki_page.content
        search_timer.stop()
    except Exception:
        # Best effort: any search/Wikipedia failure means "no answer", but it
        # is logged instead of being swallowed silently.
        logger.exception("Search or Wikipedia lookup failed")
        return {}

    # Split the page on "== heading ==" markers; sections that are still too
    # long are split again on "=== sub-heading ===" markers.
    sections = []
    for section in re.split("== .+ ==[^=]", wiki_page_content):
        if section.isspace():
            continue
        if _count_qa_tokens(section) > 500:
            sections.extend(
                subsection
                for subsection in re.split("=== .+ ===", section)
                if not subsection.isspace()
            )
        else:
            sections.append(section)

    # Greedily pack consecutive sections into contexts of at most ~384 tokens.
    full_len_sections = []
    temp_section = ""
    for section in sections:
        if _count_qa_tokens(temp_section) + _count_qa_tokens(section) > 384:
            if temp_section != "":
                full_len_sections.append(temp_section)
            # Start the next context with the section that did not fit, so it
            # is not dropped.
            temp_section = section
        else:
            temp_section += " " + section + " "
    if temp_section != "":
        full_len_sections.append(temp_section)

    if not full_len_sections:
        # Nothing to read: report "no answer" rather than calling the
        # pipeline with empty inputs.
        return {}

    reader_time = Timer("electra", text="Reader Time: {:.2f}", logger=logging.info)
    reader_time.start()
    results = qa_pipe(
        question=[preprocessor.preprocess(question)] * len(full_len_sections),
        context=[preprocessor.preprocess(x) for x in full_len_sections],
    )

    # A single context yields a single dict instead of a list.
    if not isinstance(results, list):
        results = [results]

    logger.info(f"Wiki Title: {unquote(page_name)}")
    logger.info(f"Total Sections: {len(sections)}")
    logger.info(f"Total Full Sections: {len(full_len_sections)}")

    for result, section in zip(results, full_len_sections):
        result["original"] = section
        # The pipeline answered on preprocessed text; fuzzy-match the answer
        # back onto the raw section to get offsets usable for display.
        answer_text = preprocessor.unpreprocess(result["answer"])
        answer_match = find_near_matches(
            " " + answer_text + " ",
            result["original"],
            max_l_dist=min(5, len(answer_text) // 2),
            max_deletions=0,
        )
        if answer_match:
            best_match = answer_match[0]
            result["new_start"] = best_match.start
            result["new_end"] = best_match.end
            result["new_answer"] = best_match.matched
            result["link"] = (
                search_results[0].link + "#:~:text=" + result["new_answer"].strip()
            )
        else:
            # No match in the raw text: fall back to the pipeline's offsets,
            # which refer to the preprocessed context.
            result["new_start"] = result["start"]
            result["new_end"] = result["end"]
            result["new_answer"] = result["answer"]
            result["original"] = preprocessor.preprocess(result["original"])
            result["link"] = search_results[0].link
        logger.info(f"Answers: {preprocessor.preprocess(result['new_answer'])}")

    sorted_results = sorted(results, reverse=True, key=lambda x: x["score"])

    return_dict = {}
    return_dict["title"] = unquote(page_name)
    return_dict["results"] = sorted_results

    reader_time.stop()
    logger.info(f"Total time spent: {reader_time.last + search_timer.last}")
    return return_dict
backend/utils.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import numpy as np
2
  import psutil
3
  import os
@@ -40,3 +41,24 @@ def download_models(models):
40
 
41
  def softmax(x):
42
  return np.exp(x) / sum(np.exp(x))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
  import numpy as np
3
  import psutil
4
  import os
 
41
 
42
  def softmax(x):
43
  return np.exp(x) / sum(np.exp(x))
44
+
45
+
46
def ga(file):
    """Inject the Google Analytics tag into a package's ``static/index.html``.

    Parameters
    ----------
    file : str
        Path of a file inside the package directory (e.g. ``st.__file__``);
        ``static/index.html`` is looked up next to it.

    The page is rewritten only when it does not already contain ``"G-"``, so
    repeated calls leave an already-tagged page untouched. The snippet is
    inserted right after the first ``<head>`` tag.

    Raises
    ------
    OSError
        If the index file cannot be read or written.
    """
    code = """
    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=G-NH9HWCW08F"></script>
    <script>
    window.dataLayer = window.dataLayer || [];
    function gtag(){dataLayer.push(arguments);}
    gtag('js', new Date());
    gtag('config', 'G-NH9HWCW08F');
    </script>
    """

    # os.path.join keeps the path relative when dirname() is empty, instead
    # of producing "/static/index.html".
    index_path = os.path.join(os.path.dirname(file), "static", "index.html")
    with open(index_path, "r", encoding="utf-8") as f:
        data = f.read()

    if "G-" in data:
        return

    with open(index_path, "w", encoding="utf-8") as f:
        # Plain string replacement: the snippet is literal text, not a regex
        # replacement template, and it must be inserted only once.
        f.write(data.replace("<head>", "<head>" + code, 1))
images/is2alni_logo.png ADDED
requirements.txt CHANGED
@@ -10,4 +10,8 @@ transformers==4.10.0
10
  psutil==5.8.0
11
  fuzzysearch==0.7.3
12
  more-itertools==8.9.0
13
- cookiecutter
 
 
 
 
 
10
  psutil==5.8.0
11
  fuzzysearch==0.7.3
12
  more-itertools==8.9.0
13
+ cookiecutter
14
+ git+https://github.com/dantru7/Google-Search-API
15
+ codetiming==1.3.0
16
+ htbuilder
17
+ wikipedia==1.4.0