ola13 committed on
Commit
9291cc0
1 Parent(s): b6da1a8
Files changed (1) hide show
  1. app.py +175 -161
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import json
2
  import os
3
  import pprint
4
- import re
5
 
6
  import streamlit as st
7
  import streamlit.components.v1 as components
@@ -12,74 +11,111 @@ pp = pprint.PrettyPrinter(indent=2)
12
 
13
  os.environ["address"] = "http://34.79.83.149:8080"
14
 
15
- st.set_page_config(page_title="Gaia Search", layout="wide")
16
 
17
  os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
18
  with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
19
  file.write('[theme]\nbase="light"')
20
 
 
 
 
 
 
 
 
21
  st.sidebar.markdown(
22
  """
23
- <style>
24
- .aligncenter {
25
- text-align: center;
26
- font-weight: bold;
27
- font-size: 50px;
28
- }
29
- </style>
30
- <p class="aligncenter">Gaia Search 🌖🌏</p>
31
- <p style="text-align: center;"> A search engine for the LAION large scale image caption corpora</p>
32
- """,
 
 
 
 
33
  unsafe_allow_html=True,
34
  )
35
 
36
  st.sidebar.markdown(
37
  """
38
- <style>
39
- .aligncenter {
40
- text-align: center;
41
- }
42
- </style>
43
- <p style='text-align: center'>
44
- <a href="" >GitHub</a> | <a href="" >Project Report</a>
45
- </p>
46
- <p class="aligncenter">
47
- <a href="" target="_blank">
48
- <img src="https://colab.research.google.com/assets/colab-badge.svg"/>
49
- </a>
50
- </p>
51
- """,
52
  unsafe_allow_html=True,
53
  )
54
 
55
- query = st.sidebar.text_input(label="Search query", value="")
 
 
 
 
 
 
 
56
  corpus = st.sidebar.selectbox(
57
  "Corpus",
58
- ("laion", "pile", "c4"),
59
  index=0,
60
  )
61
  max_results = st.sidebar.slider(
62
- "Maximum Number of Results",
63
  min_value=1,
64
  max_value=100,
65
  step=1,
66
  value=10,
67
- help="Maximum Number of Documents to return",
68
  )
69
- footer = """<style>
70
- .footer {
71
- position: fixed;
72
- left: 0;
73
- bottom: 0;
74
- width: 100%;
75
- background-color: white;
76
- color: black;
77
- text-align: center;
78
- }
79
- </style>
80
- <div class="footer">
81
- <p>Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a></p>
82
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  """
84
  st.sidebar.markdown(footer, unsafe_allow_html=True)
85
 
@@ -101,23 +137,12 @@ def scisearch(query, corpus, num_results=10):
101
  )
102
 
103
  payload = json.loads(output.text)
104
- results = payload["results"]
105
- highlight_terms = payload["highlight_terms"]
106
- return results, highlight_terms
107
 
108
  except Exception as e:
109
- results_html = f"""
110
- <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
111
- Raised {type(e).__name__}</p>
112
- <p style='font-size:14px; font-family: Arial; '>
113
- Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
114
- </p>
115
- """
116
  print(e)
117
 
118
 
119
-
120
-
121
  PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
122
  PII_PREFIX = "PI:"
123
 
@@ -134,126 +159,115 @@ def process_pii(text):
134
 
135
 
136
  def highlight_string(paragraph: str, highlight_terms: list) -> str:
137
- # TODO:
138
- # for term in highlight_terms:
139
- # paragraph = re.sub(f"\\b{term}\\b", f"<b>{term}</b>", paragraph, flags=re.I)
140
- paragraph = process_pii(paragraph)
141
- return paragraph
 
 
 
 
142
 
143
 
144
- def process_results(hits: list, highlight_terms: list) -> str:
145
  hit_list = []
146
  for i, hit in enumerate(hits):
147
  res_head = f"""
148
- <div class="searchresult">
149
- <h2>{i+1}. Document ID: {hit['docid']}</h2>, Score: {round(hit['score'], 2)}</p>
150
- """
151
- if "meta" in hit:
152
- if hit["meta"] is not None and "docs" in hit["meta"]:
 
 
 
 
 
 
 
 
 
153
  for subhit in hit["meta"]["docs"]:
154
- res_head += f"""
155
- <button onclick="load_image({subhit['_id']})">Load Image</button><br>
156
- <p><img id='{subhit['_id']}' src='{subhit['URL']}' style="width:400px;height:auto;display:none;"></p>
157
- <a href='{subhit['URL']}'>{subhit['URL']}</a>
158
- <p>{highlight_string(subhit['TEXT'], highlight_terms)}</p>
159
- """
160
- res_head += f"""
161
- <p>{highlight_string(hit['text'], highlight_terms)}</p>
162
- </div>
163
- <hr>
164
- """
165
  hit_list.append(res_head)
166
  return " ".join(hit_list)
167
 
168
 
169
- if st.sidebar.button("Search"):
170
- hits, highlight_terms = scisearch(query, corpus, max_results)
171
- html_results = process_results(hits, highlight_terms)
172
  rendered_results = f"""
173
- <div id="searchresultsarea">
174
- <br>
175
- <p id="searchresultsnumber">About {max_results} results</p>
176
- {html_results}
177
- </div>
178
- """
179
- st.markdown(
180
- """
181
- <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
182
- integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
183
- """,
184
- unsafe_allow_html=True,
185
- )
186
- st.markdown(
187
- """
188
- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
189
- """,
190
- unsafe_allow_html=True,
191
- )
192
- st.markdown(
193
- f"""
194
- <div class="row no-gutters mt-3 align-items-center">
195
- Gaia Search 🌖🌏
196
- <div class="col col-md-4">
197
- <input class="form-control border-secondary rounded-pill pr-5" type="search" value="{query}" id="example-search-input2">
198
- </div>
199
- <div class="col-auto">
200
- <button class="btn btn-outline-light text-dark border-0 rounded-pill ml-n5" type="button">
201
- <i class="fa fa-search"></i>
202
- </button>
203
- </div>
204
- </div>
205
- """,
206
- unsafe_allow_html=True,
207
- )
208
  components.html(
209
  """
 
 
 
210
  <style>
211
- #searchresultsarea {
212
- font-family: 'Arial';
213
- }
214
-
215
- #searchresultsnumber {
216
- font-size: 0.8rem;
217
- color: gray;
218
- }
219
-
220
- .searchresult h2 {
221
- font-size: 19px;
222
- line-height: 18px;
223
- font-weight: normal;
224
- color: rgb(7, 111, 222);
225
- margin-bottom: 0px;
226
- margin-top: 25px;
227
- }
228
-
229
- .searchresult a {
230
- font-size: 12px;
231
- line-height: 12px;
232
- color: green;
233
- margin-bottom: 0px;
234
- }
235
-
236
- .dark-mode {
237
- color: white;
238
- }
239
- </style>
240
- <script>
241
- function load_image(id){
242
- console.log(id)
243
- var x = document.getElementById(id);
244
- console.log(x)
245
- if (x.style.display === "none") {
246
- x.style.display = "block";
247
- } else {
248
- x.style.display = "none";
249
  }
250
- };
251
- function myFunction() {
252
- var element = document.body;
253
- element.classList.toggle("dark-mode");
254
- }
255
- </script>
256
- <button onclick="myFunction()">Toggle dark mode</button>
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  """
258
  + rendered_results,
259
  height=800,
 
1
  import json
2
  import os
3
  import pprint
 
4
 
5
  import streamlit as st
6
  import streamlit.components.v1 as components
 
11
 
12
  os.environ["address"] = "http://34.79.83.149:8080"
13
 
14
+ st.set_page_config(page_title="Gaia Search 🌖🌏", layout="wide")
15
 
16
  os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
17
  with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
18
  file.write('[theme]\nbase="light"')
19
 
20
+
21
+ corpus_name_map = {
22
+ "LAION": "laion",
23
+ "The Pile": "pile",
24
+ "C4": "c4",
25
+ }
26
+
27
  st.sidebar.markdown(
28
  """
29
+ <style>
30
+ .aligncenter {
31
+ text-align: center;
32
+ font-weight: bold;
33
+ font-size: 36px;
34
+ }
35
+ </style>
36
+ <p class="aligncenter">Gaia Search 🌖🌏</p>
37
+ <p>A search engine for large scale texual
38
+ corpora. Most of the datasets included in the tool are based on Common
39
+ Crawl. By using the tool, you are also bound by the Common Crawl terms
40
+ of use in respect of the content contained in the datasets.
41
+ </p>
42
+ """,
43
  unsafe_allow_html=True,
44
  )
45
 
46
  st.sidebar.markdown(
47
  """
48
+ <style>
49
+ .aligncenter {
50
+ text-align: center;
51
+ }
52
+ </style>
53
+ <p style='text-align: center'>
54
+ <a href="" style="color:#7978FF;">GitHub</a> | <a href="" style="color:#7978FF;" >Project Report</a> | <a href="" style="color:#7978FF;" >Colab</a>
55
+ </p>
56
+ """,
 
 
 
 
 
57
  unsafe_allow_html=True,
58
  )
59
 
60
+ # <p class="aligncenter">
61
+ # <a href="" target="_blank">
62
+ # <img src="https://colab.research.google.com/assets/colab-badge.svg"/>
63
+ # </a>
64
+ # </p>
65
+
66
+
67
+ query = st.sidebar.text_input(label="Query", value="")
68
  corpus = st.sidebar.selectbox(
69
  "Corpus",
70
+ tuple(corpus_name_map.keys()),
71
  index=0,
72
  )
73
  max_results = st.sidebar.slider(
74
+ "Max Results",
75
  min_value=1,
76
  max_value=100,
77
  step=1,
78
  value=10,
79
+ help="Max Number of Documents to return",
80
  )
81
+
82
+ # dark_mode_toggle = """
83
+ # <script>
84
+ # function load_image(id){
85
+ # console.log(id)
86
+ # var x = document.getElementById(id);
87
+ # console.log(x)
88
+ # if (x.style.display === "none") {
89
+ # x.style.display = "block";
90
+ # } else {
91
+ # x.style.display = "none";
92
+ # }
93
+ # };
94
+ # function myFunction() {
95
+ # var element = document.body;
96
+ # element.classList.toggle("dark-mode");
97
+ # }
98
+ # </script>
99
+ # <button onclick="myFunction()">Toggle dark mode</button>
100
+ # """
101
+ # st.sidebar.markdown(dark_mode_toggle, unsafe_allow_html=True)
102
+
103
+
104
+ footer = """
105
+ <style>
106
+ .footer {
107
+ position: fixed;
108
+ left: 0;
109
+ bottom: 0;
110
+ width: 100%;
111
+ background-color: white;
112
+ color: black;
113
+ text-align: center;
114
+ }
115
+ </style>
116
+ <div class="footer">
117
+ <p>Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a></p>
118
+ </div>
119
  """
120
  st.sidebar.markdown(footer, unsafe_allow_html=True)
121
 
 
137
  )
138
 
139
  payload = json.loads(output.text)
140
+ return payload["results"], payload["highlight_terms"]
 
 
141
 
142
  except Exception as e:
 
 
 
 
 
 
 
143
  print(e)
144
 
145
 
 
 
146
  PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
147
  PII_PREFIX = "PI:"
148
 
 
159
 
160
 
161
  def highlight_string(paragraph: str, highlight_terms: list) -> str:
162
+ tokens = paragraph.split()
163
+ tokens_html = []
164
+ for token in tokens:
165
+ if token in highlight_terms:
166
+ tokens_html.append("<b>{}</b>".format(token))
167
+ else:
168
+ tokens_html.append(token)
169
+ tokens_html = " ".join(tokens_html)
170
+ return process_pii(tokens_html)
171
 
172
 
173
def process_results(corpus: str, hits: list, highlight_terms: list) -> str:
    """Render search hits as one HTML fragment.

    Args:
        corpus: Backend corpus identifier. ``"laion"`` selects the
            caption-plus-image-links layout; any other value selects the
            plain-text layout.
        hits: Hit dicts, each read for its ``docid``, ``score`` and ``text``
            keys. For ``"laion"``, ``hit["meta"]["docs"]`` may hold dicts
            with a ``URL`` key.
        highlight_terms: Terms forwarded to ``highlight_string``.

    Returns:
        The per-hit fragments joined with single spaces; ``""`` when *hits*
        is empty.
    """
    # Imported locally so this block does not depend on the file's import list.
    import html

    hit_list = []
    for hit in hits:
        res_head = f"""
            <p class="searchresult" style="color: #7978FF;">Document ID: {hit['docid']} | Score: {round(hit['score'], 2)}</p>
            """
        if corpus == "laion":
            res_head += f"""
            <p style="color: #7978FF;">Caption:</p>
            <p>{highlight_string(hit['text'], highlight_terms)}</p>
            """
            meta = hit.get("meta")
            docs = meta.get("docs") if meta is not None else None
            if docs:
                links = []
                for subhit in docs:
                    # URLs come from the index, not from us: escape them and
                    # quote the attribute so spaces, quotes or '>' cannot
                    # break out of the href.
                    url = html.escape(str(subhit["URL"]), quote=True)
                    links.append(
                        f"""<li><a href="{url}" target="_blank" style="color:#ffcdf8; ">{url}</a></li>"""
                    )
                res_head += (
                    """<p style="color: #7978FF;"> Image links:</p><ul>"""
                    + "".join(links)
                    + "</ul>"
                )
            res_head += "<hr>"
        else:
            # No closing </div> here: the hit header is a <p>, so a stray
            # </div> would close the caller's wrapping container early.
            res_head += (
                f"""<p>{highlight_string(hit['text'], highlight_terms)}</p><hr>"""
            )
        hit_list.append(res_head)
    return " ".join(hit_list)
201
 
202
 
203
+ if st.sidebar.button("Search", type="primary"):
204
+ hits, highlight_terms = scisearch(query, corpus_name_map[corpus], max_results)
205
+ html_results = process_results(corpus_name_map[corpus], hits, highlight_terms)
206
  rendered_results = f"""
207
+ <div id="searchresultsarea">
208
+ <br>
209
+ <p id="searchresultsnumber">About {max_results} results</p>
210
+ {html_results}
211
+ # </div>"""
212
+ # st.markdown(
213
+ # """
214
+ # <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
215
+ # integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
216
+ # """,
217
+ # unsafe_allow_html=True,
218
+ # )
219
+ # st.markdown(
220
+ # """
221
+ # <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
222
+ # """,
223
+ # unsafe_allow_html=True,
224
+ # )
225
+ # st.markdown(
226
+ # f"""
227
+ # <div class="row no-gutters mt-3 align-items-center">
228
+ # Gaia Search 🌖🌏
229
+ # <div class="col col-md-4">
230
+ # <input class="form-control border-secondary rounded-pill pr-5" type="search" value="{query}" id="example-search-input2">
231
+ # </div>
232
+ # <div class="col-auto">
233
+ # <button class="btn btn-outline-light text-dark border-0 rounded-pill ml-n5" type="button">
234
+ # <i class="fa fa-search"></i>
235
+ # </button>
236
+ # </div>
237
+ # </div>
238
+ # """,
239
+ # unsafe_allow_html=True,
240
+ # )
241
+ # .bk-root{position:relative;width:auto;height:auto;box-sizing:border-box;font-family:Helvetica, Arial, sans-serif;font-size:13px;}.bk-root .bk,.bk-root .bk:before,.bk-root .bk:after{box-sizing:inherit;margin:0;border:0;padding:0;background-image:none;font-family:inherit;font-size:100%;line-height:1.42857143;}.bk-root pre.bk{font-family:Courier, monospace;}
242
  components.html(
243
  """
244
+ <head>
245
+ <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
246
+ </head>
247
  <style>
248
+ #searchresultsarea {
249
+ font-family: "Source Sans Pro", sans-serif;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  }
251
+ #searchresultsnumber {
252
+ font-size: 0.8rem;
253
+ color: gray;
254
+ }
255
+ .searchresult h2 {
256
+ font-size: 19px;
257
+ line-height: 18px;
258
+ font-weight: normal;
259
+ color: rgb(7, 111, 222);
260
+ margin-bottom: 0px;
261
+ margin-top: 25px;
262
+ color: #7978FF;"
263
+ }
264
+ .searchresult a {
265
+ font-size: 12px;
266
+ line-height: 12px;
267
+ color: green;
268
+ margin-bottom: 0px;
269
+ }
270
+ </style>
271
  """
272
  + rendered_results,
273
  height=800,