"""Streamlit app for browsing the sources of the BigScience training corpus.

The sidebar lets the user pick a language and then a source available for
that language; the main page shows the information cards stored for the
selected source.
"""

import json

import streamlit as st

st.set_page_config(
    page_title="BigScience Training Corpus",
    page_icon="https://avatars.githubusercontent.com/u/82455566",
    layout="wide",
    initial_sidebar_state="auto",
)

# Read once per script run; used below to pre-select a source from the URL.
query_params = st.experimental_get_query_params()

# (key in the source record, expander title prefix) for the optional cards,
# in display order.
_OPTIONAL_SECTIONS = (
    ("catalogue_info", "Catalogue Information"),
    ("seed_info", "Pseudocrawl Seed Information"),
    ("hf_info", "HF Dataset Information"),
)


def _normalize_ln_code(ln_code):
    """Fold every language code starting with "zh" into the single code "zh"."""
    return "zh" if ln_code.startswith("zh") else ln_code


def _size_for_language(source, ln):
    """Return the "size" of the first language entry of *source* matching *ln*.

    Entries are matched on their normalized code, i.e. with the same rule that
    is used to group sources by language, so a source that was grouped under
    "zh" through a variant code is still found here. Returns 0 when no entry
    matches (not expected for sources grouped under *ln*).
    """
    return next(
        (
            ln_dct["size"]
            for ln_dct in source["languages"]
            if _normalize_ln_code(ln_dct["ln_code"]) == ln
        ),
        0,
    )


@st.cache_data
def load_catalogue():
    """Load the source catalogue and index it by language code.

    Reads ``resources/sources_with_info_cards.json``, which is iterated as
    ``(source_name, source)`` pairs, and drops the entry named "aggregated".

    Returns:
        A dict sorted by key that maps each language code to a
        ``{source_name: source}`` dict. The extra key "all" maps to the full
        catalogue in file order; every other language maps to its sources
        ordered by decreasing size for that language.
    """
    # Context manager so the handle is closed; JSON is read as UTF-8 rather
    # than with the platform's default encoding.
    with open("resources/sources_with_info_cards.json", encoding="utf-8") as f:
        entries = json.load(f)
    full_catalogue = {
        source_name: source
        for source_name, source in entries
        if source_name != "aggregated"
    }

    language_catalogues = {"all": full_catalogue}
    for source_name, source in full_catalogue.items():
        for ln_dct in source["languages"]:
            ln_code = _normalize_ln_code(ln_dct["ln_code"])
            language_catalogues.setdefault(ln_code, {})[source_name] = source

    for ln in language_catalogues:
        if ln == "all":
            continue
        language_catalogues[ln] = dict(
            sorted(
                language_catalogues[ln].items(),
                # Bind ``ln`` as a default so the key never depends on the
                # loop variable's later values.
                key=lambda item, ln=ln: _size_for_language(item[1], ln),
                reverse=True,
            )
        )
    return dict(sorted(language_catalogues.items()))


catalogue_by_ln = load_catalogue()

with st.sidebar:
    ln_select = st.selectbox(
        "Show source list for language:",
        catalogue_by_ln,
    )

    source_names = list(catalogue_by_ln[ln_select])
    # The ``source`` query parameter only pre-selects an entry in the "all"
    # list. It comes from the URL, so an unknown name falls back to the first
    # entry instead of raising ValueError.
    default_index = 0
    if ln_select == "all":
        requested_source = query_params.get("source", [None])[0]
        if requested_source in source_names:
            default_index = source_names.index(requested_source)

    source_select = st.selectbox(
        "Show information for source:",
        source_names,
        index=default_index,
    )

# Mirror the current selection in the URL.
st.experimental_set_query_params(**{"source": source_select})

source_info = catalogue_by_ln["all"][source_select]

with st.expander(f"Dataset Card for {source_select}"):
    st.markdown(source_info["data_card"])

# The remaining cards are optional; show only those present for this source.
for section_key, section_title in _OPTIONAL_SECTIONS:
    if section_key in source_info:
        with st.expander(f"{section_title} for {source_select}"):
            st.write(source_info[section_key])