ola13 committed
Commit 7e70097 · 1 Parent(s): f06f0df

fix em highlights

Files changed (1)
1. app.py +48 -24
app.py CHANGED
@@ -12,7 +12,9 @@ from huggingface_hub import HfApi
 hf_api = HfApi()
 roots_datasets = {
     dset.id.split("/")[-1]: dset
-    for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))
+    for dset in hf_api.list_datasets(
+        author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
+    )
 }


@@ -64,7 +66,9 @@ def process_pii(text):
     for tag in PII_TAGS:
         text = text.replace(
             PII_PREFIX + tag,
-            """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
+            """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
+                tag
+            ),
         )
     return text

@@ -99,17 +103,11 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
         return ""

     if exact_search:
-        highlight_terms = normalize(highlight_terms).split()
-        print("highlight_terms", highlight_terms)
-        tokens = text.split()
-        tokens_html = []
-        for token in tokens:
-            norm_token = normalize(token)
-            if norm_token in highlight_terms:
-                tokens_html.append("<b>{}</b>".format(token))
-            else:
-                tokens_html.append(token)
-        tokens_html = " ".join(tokens_html)
+        query_start = text.find(highlight_terms)
+        query_end = query_start + len(highlight_terms)
+        tokens_html = text[0:query_start]
+        tokens_html += "<b>{}</b>".format(text[query_start:query_end])
+        tokens_html += text[query_end:]
     else:
         tokens = text.split()
         tokens_html = []
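This hunk carries the actual "em highlights" fix: the old exact-search path re-tokenized the text and bolded every token whose normalized form appeared in the query, which broke multi-word phrases apart, while the new path locates the query string once and wraps that single span in <b> tags. Below is a minimal standalone sketch of the new behaviour, using a hypothetical highlight_exact helper (the app builds the string inline inside format_result), with an extra guard that the app itself does not need for exact-search hits:

def highlight_exact(text: str, query: str) -> str:
    # Mirror of the new logic: bold the first verbatim occurrence of the query.
    start = text.find(query)
    if start == -1:  # guard added for this sketch only
        return text
    end = start + len(query)
    return text[:start] + "<b>{}</b>".format(text[start:end]) + text[end:]


print(highlight_exact("The ROOTS corpus was built during the BigScience workshop.", "ROOTS corpus"))
# -> The <b>ROOTS corpus</b> was built during the BigScience workshop.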
@@ -154,7 +152,9 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
     return "<p>" + result_html + "</p>"


-def format_result_page(language, results, highlight_terms, num_results, exact_search, datasets_filter=None) -> gr.HTML:
+def format_result_page(
+    language, results, highlight_terms, num_results, exact_search, datasets_filter=None
+) -> gr.HTML:
     filtered_num_results = 0
     header_html = ""

@@ -179,7 +179,9 @@ def format_result_page(language, results, highlight_terms, num_results, exact_se
             continue
         results_for_lang_html = ""
         for result in results_for_lang:
-            result_html = format_result(result, highlight_terms, exact_search, datasets_filter)
+            result_html = format_result(
+                result, highlight_terms, exact_search, datasets_filter
+            )
             if result_html != "":
                 filtered_num_results += 1
                 results_for_lang_html += result_html
@@ -221,7 +223,9 @@ def extract_results_from_payload(query, language, payload, exact_search):
         text = result["text"]
         url = (
             result["meta"]["url"]
-            if "meta" in result and result["meta"] is not None and "url" in result["meta"]
+            if "meta" in result
+            and result["meta"] is not None
+            and "url" in result["meta"]
             else None
         )
         docid = result["docid"]
@@ -259,7 +263,11 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
     post_data = {"query": query, "k": num_results, "received_results": received_results}
     if language != "detect_language":
         post_data["lang"] = language
-    address = os.environ.get("address_exact_search") if exact_search else os.environ.get("address")
+    address = (
+        os.environ.get("address_exact_search")
+        if exact_search
+        else os.environ.get("address")
+    )
     output = requests.post(
         address,
         headers={"Content-type": "application/json"},
@@ -270,7 +278,9 @@
     return payload


-title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
+title = (
+    """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
+)
 description = """

 The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
@@ -389,7 +399,9 @@ if __name__ == "__main__":
             payload,
             exact_search,
         )
-        result_page = format_result_page(lang, processed_results, highlight_terms, num_results, exact_search)
+        result_page = format_result_page(
+            lang, processed_results, highlight_terms, num_results, exact_search
+        )
         return (
             processed_results,
             highlight_terms,
@@ -410,13 +422,19 @@
             datasets,
         ) = run_query(query, lang, k, dropdown_input, 0)
         has_more_results = exact_search and (num_results > k)
-        current_results = len(next(iter(processed_results.values()))) if len(processed_results) > 0 else 0
+        current_results = (
+            len(next(iter(processed_results.values())))
+            if len(processed_results) > 0
+            else 0
+        )
         return [
             processed_results,
             highlight_terms,
             num_results,
             exact_search,
-            gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
+            gr.update(visible=True)
+            if current_results > 0
+            else gr.update(visible=False),
             gr.Dropdown.update(choices=datasets, value=datasets),
             gr.update(visible=has_more_results),
             current_results,
@@ -439,8 +457,12 @@
             result_page,
             datasets,
         ) = run_query(query, lang, k, dropdown_input, received_results)
-        current_results = sum(len(results) for results in processed_results.values())
-        has_more_results = exact_search and (received_results + current_results < num_results)
+        current_results = sum(
+            len(results) for results in processed_results.values()
+        )
+        has_more_results = exact_search and (
+            received_results + current_results < num_results
+        )
         print("received_results", received_results)
         print("current_results", current_results)
         print("has_more_results", has_more_results)
@@ -449,7 +471,9 @@
             highlight_terms,
             num_results,
             exact_search,
-            gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
+            gr.update(visible=True)
+            if current_results > 0
+            else gr.update(visible=False),
             gr.Dropdown.update(choices=datasets, value=datasets),
             gr.update(visible=current_results >= k and has_more_results),
             received_results + current_results,
 