ola13 committed on
Commit
5e38a42
1 Parent(s): 8e0107d
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +141 -79
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🌏🌖
4
  colorFrom: blue
5
  colorTo: red
6
  sdk: streamlit
7
- sdk_version: 1.17.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
4
  colorFrom: blue
5
  colorTo: red
6
  sdk: streamlit
7
+ sdk_version: 1.18.1
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py CHANGED
@@ -6,6 +6,7 @@ import streamlit as st
6
  import streamlit.components.v1 as components
7
  import requests
8
 
 
9
 
10
  pp = pprint.PrettyPrinter(indent=2)
11
 
@@ -20,6 +21,7 @@ with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
20
 
21
  corpus_name_map = {
22
  "LAION": "laion",
 
23
  "The Pile": "pile",
24
  "C4": "c4",
25
  }
@@ -64,11 +66,11 @@ st.sidebar.markdown(
64
  # </p>
65
 
66
 
67
- query = st.sidebar.text_input(label="Query", value="")
68
  corpus = st.sidebar.selectbox(
69
  "Corpus",
70
  tuple(corpus_name_map.keys()),
71
- index=0,
72
  )
73
  max_results = st.sidebar.slider(
74
  "Max Results",
@@ -127,10 +129,15 @@ def scisearch(query, corpus, num_results=10):
127
  if query == "" or query is None:
128
  return
129
 
130
- post_data = {"query": query, "corpus": corpus, "k": num_results}
 
 
 
 
 
131
 
132
  output = requests.post(
133
- os.environ.get("address"),
134
  headers={"Content-type": "application/json"},
135
  data=json.dumps(post_data),
136
  timeout=60,
@@ -170,9 +177,54 @@ def highlight_string(paragraph: str, highlight_terms: list) -> str:
170
  return process_pii(tokens_html)
171
 
172
 
173
- def process_results(corpus: str, hits: list, highlight_terms: list) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  hit_list = []
175
- for i, hit in enumerate(hits):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  res_head = f"""
177
  <p class="searchresult" style="color: #7978FF;">Document ID: {hit['docid']} | Score: {round(hit['score'], 2)}</p>
178
  """
@@ -200,76 +252,86 @@ def process_results(corpus: str, hits: list, highlight_terms: list) -> str:
200
  return " ".join(hit_list)
201
 
202
 
203
- if st.sidebar.button("Search", type="primary"):
204
- hits, highlight_terms = scisearch(query, corpus_name_map[corpus], max_results)
205
- html_results = process_results(corpus_name_map[corpus], hits, highlight_terms)
206
- rendered_results = f"""
207
- <div id="searchresultsarea">
208
- <br>
209
- <p id="searchresultsnumber">About {max_results} results</p>
210
- {html_results}
211
- # </div>"""
212
- # st.markdown(
213
- # """
214
- # <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
215
- # integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
216
- # """,
217
- # unsafe_allow_html=True,
218
- # )
219
- # st.markdown(
220
- # """
221
- # <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
222
- # """,
223
- # unsafe_allow_html=True,
224
- # )
225
- # st.markdown(
226
- # f"""
227
- # <div class="row no-gutters mt-3 align-items-center">
228
- # Gaia Search 🌖🌏
229
- # <div class="col col-md-4">
230
- # <input class="form-control border-secondary rounded-pill pr-5" type="search" value="{query}" id="example-search-input2">
231
- # </div>
232
- # <div class="col-auto">
233
- # <button class="btn btn-outline-light text-dark border-0 rounded-pill ml-n5" type="button">
234
- # <i class="fa fa-search"></i>
235
- # </button>
236
- # </div>
237
- # </div>
238
- # """,
239
- # unsafe_allow_html=True,
240
- # )
241
- # .bk-root{position:relative;width:auto;height:auto;box-sizing:border-box;font-family:Helvetica, Arial, sans-serif;font-size:13px;}.bk-root .bk,.bk-root .bk:before,.bk-root .bk:after{box-sizing:inherit;margin:0;border:0;padding:0;background-image:none;font-family:inherit;font-size:100%;line-height:1.42857143;}.bk-root pre.bk{font-family:Courier, monospace;}
242
- components.html(
243
- """
244
- <head>
245
- <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
246
- </head>
247
- <style>
248
- #searchresultsarea {
249
- font-family: "Source Sans Pro", sans-serif;
250
- }
251
- #searchresultsnumber {
252
- font-size: 0.8rem;
253
- color: gray;
254
- }
255
- .searchresult h2 {
256
- font-size: 19px;
257
- line-height: 18px;
258
- font-weight: normal;
259
- color: rgb(7, 111, 222);
260
- margin-bottom: 0px;
261
- margin-top: 25px;
262
- color: #7978FF;"
263
- }
264
- .searchresult a {
265
- font-size: 12px;
266
- line-height: 12px;
267
- color: green;
268
- margin-bottom: 0px;
269
- }
270
- </style>
271
- """
272
- + rendered_results,
273
- height=800,
274
- scrolling=True,
275
- )
 
 
 
 
 
 
 
 
 
 
 
6
  import streamlit.components.v1 as components
7
  import requests
8
 
9
+ from typing import Union
10
 
11
  pp = pprint.PrettyPrinter(indent=2)
12
 
 
21
 
22
  corpus_name_map = {
23
  "LAION": "laion",
24
+ "ROOTS": "roots",
25
  "The Pile": "pile",
26
  "C4": "c4",
27
  }
 
66
  # </p>
67
 
68
 
69
+ query = st.sidebar.text_input(label="Query", placeholder="Type your query here")
70
  corpus = st.sidebar.selectbox(
71
  "Corpus",
72
  tuple(corpus_name_map.keys()),
73
+ index=2,
74
  )
75
  max_results = st.sidebar.slider(
76
  "Max Results",
 
129
  if query == "" or query is None:
130
  return
131
 
132
+ post_data = {"query": query, "corpus": corpus, "k": num_results, "lang": "all"}
133
+ address = (
134
+ os.environ.get("address")
135
+ if corpus != "roots"
136
+ else "http://34.116.206.238:8080"
137
+ )
138
 
139
  output = requests.post(
140
+ address,
141
  headers={"Content-type": "application/json"},
142
  data=json.dumps(post_data),
143
  timeout=60,
 
177
  return process_pii(tokens_html)
178
 
179
 
180
def extract_lang_from_docid(docid):
    """Return the language tag embedded in a document id.

    The tag is the second ``_``-separated field of ``docid``
    (e.g. ``"roots_en_wikipedia/1"`` -> ``"en"``).

    Args:
        docid: Document identifier string.

    Returns:
        The second underscore-separated field, or ``"unknown"`` when
        ``docid`` contains no underscore, so that one malformed id cannot
        abort rendering of an entire results page.
    """
    # maxsplit=2 is enough to isolate field 1 and gives the same value as an
    # unbounded split for every id that has at least one underscore.
    fields = docid.split("_", 2)
    if len(fields) < 2:
        return "unknown"
    return fields[1]
182
+
183
+
184
def format_result(result, highlight_terms):
    """Render a single search hit as an HTML ``<p>`` fragment.

    Args:
        result: Mapping that provides the ``"text"`` and ``"docid"`` keys.
        highlight_terms: Terms forwarded to ``highlight_string`` together
            with the hit text.

    Returns:
        An HTML string containing the hit's language, its document id and
        the highlighted text, wrapped in ``<p>...</p>``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    text = result["text"]
    docid = result["docid"]
    tokens_html = highlight_string(text, highlight_terms)
    language = extract_lang_from_docid(docid)

    # docid and language originate from the remote search backend and are
    # shown as plain text, so they are escaped before being placed in markup.
    # tokens_html is inserted unchanged: it is the markup produced by
    # highlight_string (NOTE(review): its own escaping is not visible here).
    result_html = """
        <span style='font-size:14px; font-family: Arial; color:MediumAquaMarine'>Language: {} | </span>
        <span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {} | </span><br>
        <span style='font-family: Arial;'>{}</span><br>
        <br>
        """.format(
        escape(str(language)), escape(str(docid)), tokens_html
    )
    return "<p>" + result_html + "</p>"
198
+
199
+
200
+ def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list) -> str:
201
  hit_list = []
202
+
203
+ if corpus == "roots":
204
+ result_page_html = ""
205
+ for lang, results_for_lang in hits.items():
206
+ print("Processing language", lang)
207
+ if len(results_for_lang) == 0:
208
+ result_page_html += """<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
209
+ No results for language: <b>{}</b></div>""".format(
210
+ lang
211
+ )
212
+ continue
213
+ results_for_lang_html = ""
214
+ for result in results_for_lang:
215
+ result_html = format_result(result, highlight_terms)
216
+ results_for_lang_html += result_html
217
+ results_for_lang_html = f"""
218
+ <details>
219
+ <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
220
+ Results for language: <b>{lang}</b>
221
+ </summary>
222
+ {results_for_lang_html}
223
+ </details>"""
224
+ result_page_html += results_for_lang_html
225
+ return result_page_html
226
+
227
+ for hit in hits:
228
  res_head = f"""
229
  <p class="searchresult" style="color: #7978FF;">Document ID: {hit['docid']} | Score: {round(hit['score'], 2)}</p>
230
  """
 
252
  return " ".join(hit_list)
253
 
254
 
255
submit_button = st.sidebar.button("Search", type="primary")

# Run a search when the button was pressed or the text box holds a query.
if submit_button or query:
    query = query.strip()
    # After strip() the query is always a str, so an emptiness test suffices.
    if not query:
        components.html(
            """<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
            Please provide a non-empty query.
            </p><br><hr><br>"""
        )
    else:
        hits, highlight_terms = scisearch(query, corpus_name_map[corpus], max_results)
        html_results = process_results(corpus_name_map[corpus], hits, highlight_terms)
        rendered_results = f"""
        <div id="searchresultsarea">
            <br>
            <p id="searchresultsnumber">About {max_results} results</p>
            {html_results}
        </div>"""
        # st.markdown(
        #     """
        #     <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
        #     integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
        #     """,
        #     unsafe_allow_html=True,
        # )
        # st.markdown(
        #     """
        #     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
        #     """,
        #     unsafe_allow_html=True,
        # )
        # st.markdown(
        #     f"""
        #     <div class="row no-gutters mt-3 align-items-center">
        #         Gaia Search 🌖🌏
        #         <div class="col col-md-4">
        #             <input class="form-control border-secondary rounded-pill pr-5" type="search" value="{query}" id="example-search-input2">
        #         </div>
        #         <div class="col-auto">
        #             <button class="btn btn-outline-light text-dark border-0 rounded-pill ml-n5" type="button">
        #                 <i class="fa fa-search"></i>
        #             </button>
        #         </div>
        #     </div>
        #     """,
        #     unsafe_allow_html=True,
        # )
        # .bk-root{position:relative;width:auto;height:auto;box-sizing:border-box;font-family:Helvetica, Arial, sans-serif;font-size:13px;}.bk-root .bk,.bk-root .bk:before,.bk-root .bk:after{box-sizing:inherit;margin:0;border:0;padding:0;background-image:none;font-family:inherit;font-size:100%;line-height:1.42857143;}.bk-root pre.bk{font-family:Courier, monospace;}

        # The stylesheet is prepended to the rendered hits and shown in a
        # scrollable component. The stray '"' that followed
        # "color: #7978FF;" in the .searchresult h2 rule has been removed
        # because it made that rule malformed CSS.
        components.html(
            """
            <head>
                <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
            </head>
            <style>
                #searchresultsarea {
                    font-family: "Source Sans Pro", sans-serif;
                }
                #searchresultsnumber {
                    font-size: 0.8rem;
                    color: gray;
                }
                .searchresult h2 {
                    font-size: 19px;
                    line-height: 18px;
                    font-weight: normal;
                    color: rgb(7, 111, 222);
                    margin-bottom: 0px;
                    margin-top: 25px;
                    color: #7978FF;
                }
                .searchresult a {
                    font-size: 12px;
                    line-height: 12px;
                    color: green;
                    margin-bottom: 0px;
                }
            </style>
            """
            + rendered_results,
            height=800,
            scrolling=True,
        )