ola13 committed on
Commit
dfc4a0d
1 Parent(s): 8a603fa

Revert changes

Browse files
Files changed (1) hide show
  1. app.py +123 -155
app.py CHANGED
@@ -1,100 +1,81 @@
 
1
  import json
 
2
  import os
3
  import pprint
4
  import re
 
5
 
6
- import requests
7
  import streamlit as st
8
  import streamlit.components.v1 as components
 
 
9
 
10
  pp = pprint.PrettyPrinter(indent=2)
11
  st.set_page_config(page_title="Gaia Search", layout="wide")
12
 
13
- os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
14
- with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
15
- file.write('[theme]\nbase="light"')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- LANG_MAPPING = {
18
- "Arabic": "ar",
19
- "Catalan": "ca",
20
- "Code": "code",
21
- "English": "en",
22
- "Spanish": "es",
23
- "French": "fr",
24
- "Indonesian": "id",
25
- "Indic": "indic",
26
- "Niger-Congo": "nigercongo",
27
- "Portuguese": "pt",
28
- "Vietnamese": "vi",
29
- "Chinese": "zh",
30
- "Detect Language": "detect_language",
31
- "All": "all",
32
- }
33
 
34
 
35
  st.sidebar.markdown(
36
- """
37
- <style>
38
- .aligncenter {
39
- text-align: center;
40
- font-weight: bold;
41
- font-size: 50px;
42
- }
43
- </style>
44
- <p class="aligncenter">Gaia Search 🌖🌏</p>
45
- <p style="text-align: center;"> A search engine for the LAION large scale image caption corpora</p>
46
- """,
47
- unsafe_allow_html=True,
48
  )
49
 
50
  st.sidebar.markdown(
51
- """
52
- <style>
53
- .aligncenter {
54
- text-align: center;
55
- }
56
- </style>
57
- <p style='text-align: center'>
58
- <a href="" >GitHub</a> | <a href="" >Project Report</a>
59
- </p>
60
- <p class="aligncenter">
61
- <a href="" target="_blank">
62
- <img src="https://colab.research.google.com/assets/colab-badge.svg"/>
63
- </a>
64
- </p>
65
- """,
66
- unsafe_allow_html=True,
67
  )
68
 
69
- corpus = st.sidebar.selectbox(
70
- "Corpus",
71
- (
72
- "LAION",
73
- "C4",
74
- ),
75
- index=3,
76
- )
77
- query = st.sidebar.text_input(label="Search query", value="")
78
  language = st.sidebar.selectbox(
79
- "Language",
80
- (
81
- "Arabic",
82
- "Catalan",
83
- "Code",
84
- "English",
85
- "Spanish",
86
- "French",
87
- "Indonesian",
88
- "Indic",
89
- "Niger-Congo",
90
- "Portuguese",
91
- "Vietnamese",
92
- "Chinese",
93
- "Detect Language",
94
- "All",
95
- ),
96
- index=3,
97
- )
98
  max_results = st.sidebar.slider(
99
  "Maximum Number of Results",
100
  min_value=1,
@@ -103,41 +84,36 @@ max_results = st.sidebar.slider(
103
  value=10,
104
  help="Maximum Number of Documents to return",
105
  )
106
- footer = """
107
- <style>
108
- .footer {
109
- position: fixed;
110
- left: 0;
111
- bottom: 0;
112
- width: 100%;
113
- background-color: white;
114
- color: black;
115
- text-align: center;
116
- }
117
- </style>
118
- <div class="footer">
119
- <p>
120
- Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a>
121
- </p>
122
- </div>
123
  """
124
- st.sidebar.markdown(footer, unsafe_allow_html=True)
125
 
126
 
127
- def scisearch(query, corpus, language, num_results=10):
128
  try:
129
  query = query.strip()
130
  if query == "" or query is None:
131
  return
132
 
133
- corpus = corpus.strip()
134
- address = os.environ.get("address") if corpus == "LAION" else os.environ.get("address")
135
  post_data = {"query": query, "k": num_results}
136
  if language != "detect_language":
137
  post_data["lang"] = language
138
 
139
  output = requests.post(
140
- address, # os.environ.get("address"),
141
  headers={"Content-type": "application/json"},
142
  data=json.dumps(post_data),
143
  timeout=60,
@@ -153,6 +129,7 @@ def scisearch(query, corpus, language, num_results=10):
153
  Detected language <b>{detected_lang}</b> is not supported.<br>
154
  Please choose a language from the dropdown or type another query.
155
  </p><br><hr><br>"""
 
156
  results = payload["results"]
157
  highlight_terms = payload["highlight_terms"]
158
  except Exception as e:
@@ -167,29 +144,23 @@ def scisearch(query, corpus, language, num_results=10):
167
 
168
  return results, highlight_terms
169
 
170
-
171
  PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
172
  PII_PREFIX = "PI:"
173
 
174
-
175
  def process_pii(text):
176
  for tag in PII_TAGS:
177
  text = text.replace(
178
  PII_PREFIX + tag,
179
- """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
180
- tag
181
- ),
182
- )
183
  return text
184
 
185
-
186
  def highlight_string(paragraph: str, highlight_terms: list) -> str:
187
  for term in highlight_terms:
188
  paragraph = re.sub(f"\\b{term}\\b", f"<b>{term}</b>", paragraph, flags=re.I)
189
  paragraph = process_pii(paragraph)
190
  return paragraph
191
 
192
-
193
  def process_results(hits: list, highlight_terms: list) -> str:
194
  hit_list = []
195
  for i, hit in enumerate(hits):
@@ -198,7 +169,7 @@ def process_results(hits: list, highlight_terms: list) -> str:
198
  <h2>{i+1}. Document ID: {hit['docid']}</h2>
199
  <p>Language: <string>{hit['lang']}</string>, Score: {round(hit['score'], 2)}</p>
200
  """
201
- for subhit in hit["meta"]["docs"]:
202
  res_head += f"""
203
  <button onclick="load_image({subhit['_id']})">Load Image</button><br>
204
  <p><img id='{subhit['_id']}' src='{subhit['URL']}' style="width:400px;height:auto;display:none;"></p>
@@ -215,28 +186,25 @@ def process_results(hits: list, highlight_terms: list) -> str:
215
 
216
 
217
  if st.sidebar.button("Search"):
218
- hits, highlight_terms = scisearch(query, corpus, LANG_MAPPING[language], max_results)
219
  html_results = process_results(hits, highlight_terms)
220
  rendered_results = f"""
221
- <div id="searchresultsarea">
222
- <br>
223
- <p id="searchresultsnumber">About {max_results} results</p>
224
- {html_results}
225
- </div>
226
- """
227
- st.markdown(
228
- """
229
- <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
230
- integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
231
- """,
232
- unsafe_allow_html=True,
233
- )
234
  st.markdown(
235
  """
236
- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
237
  """,
238
- unsafe_allow_html=True,
239
- )
240
  st.markdown(
241
  f"""
242
  <div class="row no-gutters mt-3 align-items-center">
@@ -251,35 +219,38 @@ if st.sidebar.button("Search"):
251
  </div>
252
  </div>
253
  """,
254
- unsafe_allow_html=True,
255
- )
256
  components.html(
257
  """
258
  <style>
259
- #searchresultsarea {
260
- font-family: 'Arial';
261
- }
262
- #searchresultsnumber {
263
- font-size: 0.8rem;
264
- color: gray;
265
- }
266
- .searchresult h2 {
267
- font-size: 19px;
268
- line-height: 18px;
269
- font-weight: normal;
270
- color: rgb(7, 111, 222);
271
- margin-bottom: 0px;
272
- margin-top: 25px;
273
- }
274
- .searchresult a {
275
- font-size: 12px;
276
- line-height: 12px;
277
- color: green;
278
- margin-bottom: 0px;
279
- }
280
- .dark-mode {
281
- color: white;
282
- }
 
 
 
 
283
  </style>
284
  <script>
285
  function load_image(id){
@@ -298,8 +269,5 @@ if st.sidebar.button("Search"):
298
  }
299
  </script>
300
  <button onclick="myFunction()">Toggle dark mode</button>
301
- """
302
- + rendered_results,
303
- height=800,
304
- scrolling=True,
305
- )
 
1
+ import http.client as http_client
2
  import json
3
+ import logging
4
  import os
5
  import pprint
6
  import re
7
+ import string
8
 
 
9
  import streamlit as st
10
  import streamlit.components.v1 as components
11
+ import requests
12
+
13
 
14
  pp = pprint.PrettyPrinter(indent=2)
15
  st.set_page_config(page_title="Gaia Search", layout="wide")
16
 
17
+ os.makedirs(os.path.join(os.getcwd(),".streamlit"), exist_ok = True)
18
+ with open(os.path.join(os.getcwd(),".streamlit/config.toml"), "w") as file:
19
+ file.write(
20
+ '[theme]\nbase="light"'
21
+ )
22
+
23
+ LANG_MAPPING = {'Arabic':'ar',
24
+ 'Catalan':'ca',
25
+ 'Code':'code',
26
+ 'English':'en',
27
+ 'Spanish':'es',
28
+ 'French':'fr',
29
+ 'Indonesian':'id',
30
+ 'Indic':'indic',
31
+ 'Niger-Congo':'nigercongo',
32
+ 'Portuguese': 'pt',
33
+ 'Vietnamese': 'vi',
34
+ 'Chinese': 'zh',
35
+ 'Detect Language':'detect_language',
36
+ 'All':'all'}
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
  st.sidebar.markdown(
41
+ """
42
+ <style>
43
+ .aligncenter {
44
+ text-align: center;
45
+ font-weight: bold;
46
+ font-size: 50px;
47
+ }
48
+ </style>
49
+ <p class="aligncenter">Gaia Search 🌖🌏</p>
50
+ <p style="text-align: center;"> A search engine for the LAION large scale image caption corpora</p>
51
+ """,
52
+ unsafe_allow_html=True,
53
  )
54
 
55
  st.sidebar.markdown(
56
+ """
57
+ <style>
58
+ .aligncenter {
59
+ text-align: center;
60
+ }
61
+ </style>
62
+ <p style='text-align: center'>
63
+ <a href="" >GitHub</a> | <a href="" >Project Report</a>
64
+ </p>
65
+ <p class="aligncenter">
66
+ <a href="" target="_blank">
67
+ <img src="https://colab.research.google.com/assets/colab-badge.svg"/>
68
+ </a>
69
+ </p>
70
+ """,
71
+ unsafe_allow_html=True,
72
  )
73
 
74
+ query = st.sidebar.text_input(label='Search query', value='')
 
 
 
 
 
 
 
 
75
  language = st.sidebar.selectbox(
76
+ 'Language',
77
+ ('Arabic', 'Catalan', 'Code', 'English', 'Spanish', 'French', 'Indonesian', 'Indic', 'Niger-Congo', 'Portuguese', 'Vietnamese', 'Chinese', 'Detect Language', 'All'),
78
+ index=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  max_results = st.sidebar.slider(
80
  "Maximum Number of Results",
81
  min_value=1,
 
84
  value=10,
85
  help="Maximum Number of Documents to return",
86
  )
87
+ footer="""<style>
88
+ .footer {
89
+ position: fixed;
90
+ left: 0;
91
+ bottom: 0;
92
+ width: 100%;
93
+ background-color: white;
94
+ color: black;
95
+ text-align: center;
96
+ }
97
+ </style>
98
+ <div class="footer">
99
+ <p>Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a></p>
100
+ </div>
 
 
 
101
  """
102
+ st.sidebar.markdown(footer,unsafe_allow_html=True)
103
 
104
 
105
+ def scisearch(query, language, num_results=10):
106
  try:
107
  query = query.strip()
108
  if query == "" or query is None:
109
  return
110
 
 
 
111
  post_data = {"query": query, "k": num_results}
112
  if language != "detect_language":
113
  post_data["lang"] = language
114
 
115
  output = requests.post(
116
+ os.environ.get("address"),
117
  headers={"Content-type": "application/json"},
118
  data=json.dumps(post_data),
119
  timeout=60,
 
129
  Detected language <b>{detected_lang}</b> is not supported.<br>
130
  Please choose a language from the dropdown or type another query.
131
  </p><br><hr><br>"""
132
+
133
  results = payload["results"]
134
  highlight_terms = payload["highlight_terms"]
135
  except Exception as e:
 
144
 
145
  return results, highlight_terms
146
 
 
PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"


def process_pii(text):
    """Replace PII placeholders in ``text`` with a highlighted REDACTED marker.

    Every occurrence of ``PII_PREFIX + tag`` (for example ``PI:EMAIL``) is
    swapped for an HTML ``<mark>`` element reading ``REDACTED <tag>``.

    Args:
        text: String that may contain PII placeholders.

    Returns:
        The string with every placeholder replaced.
    """
    # Longest tags first: set iteration order is arbitrary, and this keeps the
    # result order-independent even if a future tag is a prefix of another.
    for tag in sorted(PII_TAGS, key=len, reverse=True):
        text = text.replace(
            PII_PREFIX + tag,
            """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
        )
    return text


def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Wrap whole-word occurrences of the highlight terms in ``<b>`` tags.

    Matching is case-insensitive and the matched text keeps the casing it has
    in ``paragraph``. PII placeholders are converted by ``process_pii`` after
    highlighting, including when ``highlight_terms`` is empty.

    Args:
        paragraph: Text to decorate.
        highlight_terms: Literal strings to highlight; empty entries are ignored.

    Returns:
        ``paragraph`` with highlight and redaction markup inserted.
    """
    # Longest first so that a longer term wins over a term that is its prefix.
    terms = sorted({term for term in highlight_terms if term}, key=len, reverse=True)
    if terms:
        # Terms are literals, not patterns: escape them so characters such as
        # "(" or "+" can neither break the regex nor change its meaning.
        alternation = "|".join(re.escape(term) for term in terms)
        # Lookarounds behave like \b for word-like terms and still work for
        # terms that start or end with punctuation (e.g. "c++").
        pattern = re.compile(rf"(?<!\w)(?:{alternation})(?!\w)", flags=re.I)
        # One pass over the text, so a later term can never match inside the
        # <b> markup inserted for an earlier one. The callable replacement
        # keeps the original casing and avoids template-escape errors.
        paragraph = pattern.sub(lambda match: f"<b>{match.group(0)}</b>", paragraph)
    return process_pii(paragraph)
163
 
 
164
  def process_results(hits: list, highlight_terms: list) -> str:
165
  hit_list = []
166
  for i, hit in enumerate(hits):
 
169
  <h2>{i+1}. Document ID: {hit['docid']}</h2>
170
  <p>Language: <string>{hit['lang']}</string>, Score: {round(hit['score'], 2)}</p>
171
  """
172
+ for subhit in hit['meta']['docs']:
173
  res_head += f"""
174
  <button onclick="load_image({subhit['_id']})">Load Image</button><br>
175
  <p><img id='{subhit['_id']}' src='{subhit['URL']}' style="width:400px;height:auto;display:none;"></p>
 
186
 
187
 
188
  if st.sidebar.button("Search"):
189
+ hits, highlight_terms = scisearch(query, LANG_MAPPING[language], max_results)
190
  html_results = process_results(hits, highlight_terms)
191
  rendered_results = f"""
192
+ <div id="searchresultsarea">
193
+ <br>
194
+ <p id="searchresultsnumber">About {max_results} results</p>
195
+ {html_results}
196
+ </div>
197
+ """
198
+ st.markdown("""
199
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
200
+ integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
201
+ """,
202
+ unsafe_allow_html=True)
 
 
203
  st.markdown(
204
  """
205
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
206
  """,
207
+ unsafe_allow_html=True)
 
208
  st.markdown(
209
  f"""
210
  <div class="row no-gutters mt-3 align-items-center">
 
219
  </div>
220
  </div>
221
  """,
222
+ unsafe_allow_html=True)
 
223
  components.html(
224
  """
225
  <style>
226
+ #searchresultsarea {
227
+ font-family: 'Arial';
228
+ }
229
+
230
+ #searchresultsnumber {
231
+ font-size: 0.8rem;
232
+ color: gray;
233
+ }
234
+
235
+ .searchresult h2 {
236
+ font-size: 19px;
237
+ line-height: 18px;
238
+ font-weight: normal;
239
+ color: rgb(7, 111, 222);
240
+ margin-bottom: 0px;
241
+ margin-top: 25px;
242
+ }
243
+
244
+ .searchresult a {
245
+ font-size: 12px;
246
+ line-height: 12px;
247
+ color: green;
248
+ margin-bottom: 0px;
249
+ }
250
+
251
+ .dark-mode {
252
+ color: white;
253
+ }
254
  </style>
255
  <script>
256
  function load_image(id){
 
269
  }
270
  </script>
271
  <button onclick="myFunction()">Toggle dark mode</button>
272
+ """ + rendered_results, height=800, scrolling=True
273
+ )