Fix exact-match (EM) highlights
Browse files
app.py
CHANGED
@@ -12,7 +12,9 @@ from huggingface_hub import HfApi
|
|
12 |
hf_api = HfApi()
|
13 |
roots_datasets = {
|
14 |
dset.id.split("/")[-1]: dset
|
15 |
-
for dset in hf_api.list_datasets(
|
|
|
|
|
16 |
}
|
17 |
|
18 |
|
@@ -64,7 +66,9 @@ def process_pii(text):
|
|
64 |
for tag in PII_TAGS:
|
65 |
text = text.replace(
|
66 |
PII_PREFIX + tag,
|
67 |
-
"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
|
|
|
|
|
68 |
)
|
69 |
return text
|
70 |
|
@@ -99,17 +103,11 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
|
99 |
return ""
|
100 |
|
101 |
if exact_search:
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
tokens_html
|
106 |
-
|
107 |
-
norm_token = normalize(token)
|
108 |
-
if norm_token in highlight_terms:
|
109 |
-
tokens_html.append("<b>{}</b>".format(token))
|
110 |
-
else:
|
111 |
-
tokens_html.append(token)
|
112 |
-
tokens_html = " ".join(tokens_html)
|
113 |
else:
|
114 |
tokens = text.split()
|
115 |
tokens_html = []
|
@@ -154,7 +152,9 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
|
154 |
return "<p>" + result_html + "</p>"
|
155 |
|
156 |
|
157 |
-
def format_result_page(
|
|
|
|
|
158 |
filtered_num_results = 0
|
159 |
header_html = ""
|
160 |
|
@@ -179,7 +179,9 @@ def format_result_page(language, results, highlight_terms, num_results, exact_se
|
|
179 |
continue
|
180 |
results_for_lang_html = ""
|
181 |
for result in results_for_lang:
|
182 |
-
result_html = format_result(
|
|
|
|
|
183 |
if result_html != "":
|
184 |
filtered_num_results += 1
|
185 |
results_for_lang_html += result_html
|
@@ -221,7 +223,9 @@ def extract_results_from_payload(query, language, payload, exact_search):
|
|
221 |
text = result["text"]
|
222 |
url = (
|
223 |
result["meta"]["url"]
|
224 |
-
if "meta" in result
|
|
|
|
|
225 |
else None
|
226 |
)
|
227 |
docid = result["docid"]
|
@@ -259,7 +263,11 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
|
|
259 |
post_data = {"query": query, "k": num_results, "received_results": received_results}
|
260 |
if language != "detect_language":
|
261 |
post_data["lang"] = language
|
262 |
-
address =
|
|
|
|
|
|
|
|
|
263 |
output = requests.post(
|
264 |
address,
|
265 |
headers={"Content-type": "application/json"},
|
@@ -270,7 +278,9 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
|
|
270 |
return payload
|
271 |
|
272 |
|
273 |
-
title =
|
|
|
|
|
274 |
description = """
|
275 |
|
276 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
@@ -389,7 +399,9 @@ if __name__ == "__main__":
|
|
389 |
payload,
|
390 |
exact_search,
|
391 |
)
|
392 |
-
result_page = format_result_page(
|
|
|
|
|
393 |
return (
|
394 |
processed_results,
|
395 |
highlight_terms,
|
@@ -410,13 +422,19 @@ if __name__ == "__main__":
|
|
410 |
datasets,
|
411 |
) = run_query(query, lang, k, dropdown_input, 0)
|
412 |
has_more_results = exact_search and (num_results > k)
|
413 |
-
current_results =
|
|
|
|
|
|
|
|
|
414 |
return [
|
415 |
processed_results,
|
416 |
highlight_terms,
|
417 |
num_results,
|
418 |
exact_search,
|
419 |
-
gr.update(visible=True)
|
|
|
|
|
420 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
421 |
gr.update(visible=has_more_results),
|
422 |
current_results,
|
@@ -439,8 +457,12 @@ if __name__ == "__main__":
|
|
439 |
result_page,
|
440 |
datasets,
|
441 |
) = run_query(query, lang, k, dropdown_input, received_results)
|
442 |
-
current_results = sum(
|
443 |
-
|
|
|
|
|
|
|
|
|
444 |
print("received_results", received_results)
|
445 |
print("current_results", current_results)
|
446 |
print("has_more_results", has_more_results)
|
@@ -449,7 +471,9 @@ if __name__ == "__main__":
|
|
449 |
highlight_terms,
|
450 |
num_results,
|
451 |
exact_search,
|
452 |
-
gr.update(visible=True)
|
|
|
|
|
453 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
454 |
gr.update(visible=current_results >= k and has_more_results),
|
455 |
received_results + current_results,
|
|
|
12 |
hf_api = HfApi()
|
13 |
roots_datasets = {
|
14 |
dset.id.split("/")[-1]: dset
|
15 |
+
for dset in hf_api.list_datasets(
|
16 |
+
author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
|
17 |
+
)
|
18 |
}
|
19 |
|
20 |
|
|
|
66 |
for tag in PII_TAGS:
|
67 |
text = text.replace(
|
68 |
PII_PREFIX + tag,
|
69 |
+
"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
|
70 |
+
tag
|
71 |
+
),
|
72 |
)
|
73 |
return text
|
74 |
|
|
|
103 |
return ""
|
104 |
|
105 |
if exact_search:
|
106 |
+
query_start = text.find(highlight_terms)
|
107 |
+
query_end = query_start + len(highlight_terms)
|
108 |
+
tokens_html = text[0:query_start]
|
109 |
+
tokens_html += "<b>{}</b>".format(text[query_start:query_end])
|
110 |
+
tokens_html += text[query_end:]
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
else:
|
112 |
tokens = text.split()
|
113 |
tokens_html = []
|
|
|
152 |
return "<p>" + result_html + "</p>"
|
153 |
|
154 |
|
155 |
+
def format_result_page(
|
156 |
+
language, results, highlight_terms, num_results, exact_search, datasets_filter=None
|
157 |
+
) -> gr.HTML:
|
158 |
filtered_num_results = 0
|
159 |
header_html = ""
|
160 |
|
|
|
179 |
continue
|
180 |
results_for_lang_html = ""
|
181 |
for result in results_for_lang:
|
182 |
+
result_html = format_result(
|
183 |
+
result, highlight_terms, exact_search, datasets_filter
|
184 |
+
)
|
185 |
if result_html != "":
|
186 |
filtered_num_results += 1
|
187 |
results_for_lang_html += result_html
|
|
|
223 |
text = result["text"]
|
224 |
url = (
|
225 |
result["meta"]["url"]
|
226 |
+
if "meta" in result
|
227 |
+
and result["meta"] is not None
|
228 |
+
and "url" in result["meta"]
|
229 |
else None
|
230 |
)
|
231 |
docid = result["docid"]
|
|
|
263 |
post_data = {"query": query, "k": num_results, "received_results": received_results}
|
264 |
if language != "detect_language":
|
265 |
post_data["lang"] = language
|
266 |
+
address = (
|
267 |
+
os.environ.get("address_exact_search")
|
268 |
+
if exact_search
|
269 |
+
else os.environ.get("address")
|
270 |
+
)
|
271 |
output = requests.post(
|
272 |
address,
|
273 |
headers={"Content-type": "application/json"},
|
|
|
278 |
return payload
|
279 |
|
280 |
|
281 |
+
title = (
|
282 |
+
"""<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
|
283 |
+
)
|
284 |
description = """
|
285 |
|
286 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
|
|
399 |
payload,
|
400 |
exact_search,
|
401 |
)
|
402 |
+
result_page = format_result_page(
|
403 |
+
lang, processed_results, highlight_terms, num_results, exact_search
|
404 |
+
)
|
405 |
return (
|
406 |
processed_results,
|
407 |
highlight_terms,
|
|
|
422 |
datasets,
|
423 |
) = run_query(query, lang, k, dropdown_input, 0)
|
424 |
has_more_results = exact_search and (num_results > k)
|
425 |
+
current_results = (
|
426 |
+
len(next(iter(processed_results.values())))
|
427 |
+
if len(processed_results) > 0
|
428 |
+
else 0
|
429 |
+
)
|
430 |
return [
|
431 |
processed_results,
|
432 |
highlight_terms,
|
433 |
num_results,
|
434 |
exact_search,
|
435 |
+
gr.update(visible=True)
|
436 |
+
if current_results > 0
|
437 |
+
else gr.update(visible=False),
|
438 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
439 |
gr.update(visible=has_more_results),
|
440 |
current_results,
|
|
|
457 |
result_page,
|
458 |
datasets,
|
459 |
) = run_query(query, lang, k, dropdown_input, received_results)
|
460 |
+
current_results = sum(
|
461 |
+
len(results) for results in processed_results.values()
|
462 |
+
)
|
463 |
+
has_more_results = exact_search and (
|
464 |
+
received_results + current_results < num_results
|
465 |
+
)
|
466 |
print("received_results", received_results)
|
467 |
print("current_results", current_results)
|
468 |
print("has_more_results", has_more_results)
|
|
|
471 |
highlight_terms,
|
472 |
num_results,
|
473 |
exact_search,
|
474 |
+
gr.update(visible=True)
|
475 |
+
if current_results > 0
|
476 |
+
else gr.update(visible=False),
|
477 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
478 |
gr.update(visible=current_results >= k and has_more_results),
|
479 |
received_results + current_results,
|