"""Gaia Search — a Streamlit front end for searching large-scale text corpora.

The sidebar collects a corpus, a query, a language and a result count; the
actual search is performed by an HTTP backend whose address is read from the
environment (``address``).
"""

import json
import os
import pprint
import re

import requests
import streamlit as st
import streamlit.components.v1 as components

pp = pprint.PrettyPrinter(indent=2)

st.set_page_config(page_title="Gaia Search", layout="wide")

# Force the light theme by (re)writing Streamlit's local config file.
os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
    file.write('[theme]\nbase="light"')

# Human-readable language name -> backend language code.
LANG_MAPPING = {
    "Arabic": "ar",
    "Catalan": "ca",
    "Code": "code",
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "Indonesian": "id",
    "Indic": "indic",
    "Niger-Congo": "nigercongo",
    "Portuguese": "pt",
    "Vietnamese": "vi",
    "Chinese": "zh",
    "Detect Language": "detect_language",
    "All": "all",
}

st.sidebar.markdown(
    """

Gaia Search 🌖🌏

A search engine for the LAION large scale image caption corpora

""",
    unsafe_allow_html=True,
)
st.sidebar.markdown(
    """

GitHub | Project Report

""",
    unsafe_allow_html=True,
)

corpus = st.sidebar.selectbox(
    "Corpus",
    (
        "LAION",
        "C4",
    ),
    # BUG FIX: the original used index=3, which is out of range for a
    # two-element option tuple and makes Streamlit raise at startup.
    index=0,
)
query = st.sidebar.text_input(label="Search query", value="")
language = st.sidebar.selectbox(
    "Language",
    (
        "Arabic",
        "Catalan",
        "Code",
        "English",
        "Spanish",
        "French",
        "Indonesian",
        "Indic",
        "Niger-Congo",
        "Portuguese",
        "Vietnamese",
        "Chinese",
        "Detect Language",
        "All",
    ),
    index=3,
)
max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=100,
    step=1,
    value=10,
    help="Maximum Number of Documents to return",
)
footer = """ """
st.sidebar.markdown(footer, unsafe_allow_html=True)


def scisearch(query, corpus, language, num_results=10):
    """POST the query to the search backend and return its hits.

    Parameters
    ----------
    query : str or None
        Raw user query; surrounding whitespace is stripped.
    corpus : str
        Corpus name selected in the sidebar ("LAION" or "C4").
    language : str
        Backend language code (a value of ``LANG_MAPPING``).
    num_results : int
        Maximum number of documents to request.

    Returns
    -------
    tuple
        ``(results, highlight_terms)``.  On success ``results`` is the
        backend's hit list.  On any failure (empty query, unsupported
        detected language, network/JSON error) ``results`` is an HTML
        error string and ``highlight_terms`` is an empty list, so the
        caller can always unpack two values.
    """
    try:
        # BUG FIX: test for None before calling .strip(); the original
        # stripped first, so a None query raised AttributeError.
        if query is None or query.strip() == "":
            return "", []
        query = query.strip()
        corpus = corpus.strip()
        # NOTE(review): both branches of this ternary read the same env
        # var; presumably each corpus should have its own backend
        # address — confirm against the deployment config.
        address = os.environ.get("address") if corpus == "LAION" else os.environ.get("address")
        post_data = {"query": query, "k": num_results}
        if language != "detect_language":
            post_data["lang"] = language
        output = requests.post(
            address,
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )
        payload = json.loads(output.text)
        if "err" in payload:
            if payload["err"]["type"] == "unsupported_lang":
                detected_lang = payload["err"]["meta"]["detected_lang"]
                # BUG FIX: return a two-tuple so the caller's unpacking
                # (hits, highlight_terms = scisearch(...)) keeps working;
                # the original returned a bare string here.
                return (
                    f"""

Detected language {detected_lang} is not supported.
Please choose a language from the dropdown or type another query.




""",
                    [],
                )
        results = payload["results"]
        highlight_terms = payload["highlight_terms"]
    except Exception as e:
        results_html = f"""

Raised {type(e).__name__}

Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.

"""
        print(e)
        # BUG FIX: the original returned the names ``results`` and
        # ``highlight_terms`` here, which are undefined on this path and
        # raised NameError; return the prepared error markup instead.
        return results_html, []
    return results, highlight_terms


# Personally-identifiable-information markers emitted by the corpus
# pre-processing, e.g. "PI:EMAIL", and the prefix they all share.
PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"


def process_pii(text):
    """Replace every ``PI:<TAG>`` marker in *text* with a redaction label."""
    for tag in PII_TAGS:
        text = text.replace(
            PII_PREFIX + tag,
            """REDACTED {}""".format(
                tag
            ),
        )
    return text


def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Case-insensitively normalise each highlight term in *paragraph*,
    then redact PII markers.

    Each whole-word, case-insensitive occurrence of a term is replaced by
    the term as spelled by the backend.
    """
    for term in highlight_terms:
        # BUG FIX: escape the term — unescaped regex metacharacters in a
        # term (e.g. "c++") made re.sub raise re.error; the callable
        # replacement also keeps a backslash in the term from being
        # interpreted as a group reference.
        paragraph = re.sub(
            rf"\b{re.escape(term)}\b",
            lambda _match: term,
            paragraph,
            flags=re.I,
        )
    paragraph = process_pii(paragraph)
    return paragraph


def process_results(hits: list, highlight_terms: list) -> str:
    """Render the backend hit list as an HTML fragment.

    An empty *hits* list yields an empty string.
    """
    hit_list = []
    for i, hit in enumerate(hits):
        res_head = f"""

{i+1}. Document ID: {hit['docid']}

Language: {hit['lang']}, Score: {round(hit['score'], 2)}

"""
        for subhit in hit["meta"]["docs"]:
            res_head += f"""

{subhit['URL']}

{highlight_string(subhit['TEXT'], highlight_terms)}

"""
        res_head += f"""

{highlight_string(hit['text'], highlight_terms)}


"""
        hit_list.append(res_head)
    return " ".join(hit_list)


if st.sidebar.button("Search"):
    hits, highlight_terms = scisearch(query, corpus, LANG_MAPPING[language], max_results)
    # Error paths return ready-made HTML (a str) instead of a hit list;
    # render it directly rather than feeding it to process_results.
    if isinstance(hits, str):
        html_results = hits
    else:
        html_results = process_results(hits, highlight_terms)
    rendered_results = f"""

About {max_results} results

{html_results}
"""
    st.markdown(
        """ """,
        unsafe_allow_html=True,
    )
    st.markdown(
        """ """,
        unsafe_allow_html=True,
    )
    st.markdown(
        f"""
Gaia Search 🌖🌏
""",
        unsafe_allow_html=True,
    )
    components.html(
        """ """
        + rendered_results,
        height=800,
        scrolling=True,
    )