"""Streamlit app for browsing datasets from a local clone of huggingface/datasets."""

import json
import os
import sys
import textwrap
import time
from dataclasses import asdict

import datasets
import numpy as np
import pandas as pd
import streamlit as st
import tornado
import yaml
from git import Repo
from PIL import Image

# Clone the dataset scripts once; later runs reuse the existing checkout.
if not os.path.exists('datasets_clone'):
    Repo.clone_from('https://github.com/huggingface/datasets.git', 'datasets_clone')

MAX_SIZE = 40000000000

# Directory holding one sub-directory per dataset script. When set to None the
# helpers below fall back to resolving datasets by bare name.
path_to_datasets = 'datasets_clone/datasets/'


## Hack to extend the width of the main pane.
def _max_width_():
    """Emit a raw-HTML markdown element intended to widen the main pane."""
    # NOTE(review): the markup payload below is blank, so ``max_width_str`` is
    # never applied -- presumably a <style> block was lost; restore it from
    # version control before relying on this hack.
    max_width_str = "max-width: 1000px;"
    st.markdown(
        f"""
""",
        unsafe_allow_html=True,
    )


_max_width_()


def render_features(features):
    """Convert a (possibly nested) feature spec into plain displayable values.

    Dicts are rendered recursively, ``ClassLabel`` becomes its list of names,
    ``Value`` becomes its dtype, and ``Sequence`` becomes ``{"[]": <inner>}``.
    Anything else is returned unchanged.
    """
    if isinstance(features, dict):
        return {k: render_features(v) for k, v in features.items()}
    if isinstance(features, datasets.features.ClassLabel):
        return features.names
    if isinstance(features, datasets.features.Value):
        return features.dtype
    if isinstance(features, datasets.features.Sequence):
        return {"[]": render_features(features.feature)}
    return features


def _unwrap_single(value):
    """Return the sole element of a one-item list, otherwise *value* unchanged."""
    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return value


app_state = st.experimental_get_query_params()
INITIAL_SELECTION = ""

# The default is stored as a one-item list, the same shape the code below
# unwraps for real query parameters. A bare string default would fail the
# single-element check and the default dataset would never be pre-selected.
app_state.setdefault("dataset", ["glue"])
app_state["dataset"] = _unwrap_single(app_state["dataset"])
if isinstance(app_state["dataset"], str):
    INITIAL_SELECTION = app_state["dataset"]
if "config" in app_state:
    app_state["config"] = _unwrap_single(app_state["config"])
print(INITIAL_SELECTION)

## Logo and sidebar decoration.
# NOTE(review): the three markdown payloads below contain only plain text even
# though raw HTML is enabled -- presumably link/centering markup was lost;
# confirm against version control.
st.sidebar.markdown(
    """
""",
    unsafe_allow_html=True,
)
st.sidebar.image("datasets_logo_name.png", width=300)
st.sidebar.markdown(
    "\n\ngithub/huggingface/datasets\n\n",
    unsafe_allow_html=True,
)
st.sidebar.markdown(
    """
Docs | Browse | Add Dataset
""",
    unsafe_allow_html=True,
)
st.sidebar.subheader("")


## Interaction with the datasets library.
# @st.cache
def get_confs(opt):
    """Get the list of confs for a dataset.

    Returns the builder class's ``BUILDER_CONFIGS`` when it declares more than
    one config, otherwise an empty list (nothing for the user to choose from).
    """
    if path_to_datasets is not None and opt is not None:
        path = path_to_datasets + opt
    else:
        path = opt

    module_path = datasets.load.prepare_module(path, dataset=True)
    # Get dataset builder class from the processing script
    builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
    confs = builder_cls.BUILDER_CONFIGS
    if confs and len(confs) > 1:
        return confs
    return []


# @st.cache(allow_output_mutation=True)
def get(opt, conf=None):
    """Get a dataset from name and conf.

    Args:
        opt: Dataset name (a directory name under ``path_to_datasets`` when
            that is set).
        conf: Optional config name. When omitted, the first declared builder
            config (if any) is used for local-path loading.

    Returns:
        ``(dataset, fail)``. On success ``dataset`` is the loaded dataset and
        ``fail`` is False. When the dataset needs manual download or its size
        is unknown or not below ``MAX_SIZE``, ``dataset`` is the unprepared
        builder instance and ``fail`` is True.
    """
    if path_to_datasets is not None:
        path = path_to_datasets + opt
    else:
        path = opt

    module_path = datasets.load.prepare_module(path, dataset=True)
    builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)

    if path_to_datasets is not None:
        # Honour the requested config; only fall back to the first declared
        # one when the caller did not pick any.
        name = conf
        if not name and builder_cls.BUILDER_CONFIGS:
            name = builder_cls.BUILDER_CONFIGS[0].name
        return datasets.load_dataset(path, name=name), False

    # The builder instance is only needed when loading by bare name.
    if conf:
        builder_instance = builder_cls(name=conf, cache_dir=None)
    else:
        builder_instance = builder_cls(cache_dir=None)

    size_in_bytes = builder_instance.info.size_in_bytes
    if (
        builder_instance.manual_download_instructions is None
        and size_in_bytes is not None
        and size_in_bytes < MAX_SIZE
    ):
        builder_instance.download_and_prepare()
        return builder_instance.as_dataset(), False

    return builder_instance, True


# Dataset select box.
import glob

dataset_names = []
selection = None

if path_to_datasets is None:
    list_of_datasets = datasets.list_datasets(with_community_datasets=False)
else:
    list_of_datasets = sorted(glob.glob(path_to_datasets + "*"))
print(list_of_datasets)

for i, dataset_path in enumerate(list_of_datasets):
    # Entries may be filesystem paths; keep only the last path component.
    dataset_name = dataset_path.split("/")[-1]
    if INITIAL_SELECTION and dataset_name == INITIAL_SELECTION:
        selection = i
    dataset_names.append(dataset_name)

# Index 0 is the selectbox default, so it doubles as the "no match" case.
option = st.sidebar.selectbox(
    "Dataset",
    dataset_names,
    index=selection if selection is not None else 0,
    format_func=lambda a: a,
)
print(option)
app_state["dataset"] = option
st.experimental_set_query_params(**app_state)

# Side bar Configurations.
configs = get_confs(option)
conf_option = None
if len(configs) > 0:
    conf_index = 0
    for i, conf in enumerate(configs):
        if conf.name == app_state.get("config", None):
            conf_index = i
    conf_option = st.sidebar.selectbox(
        "Subset", configs, index=conf_index, format_func=lambda a: a.name
    )
    app_state["config"] = conf_option.name
else:
    # Drop a stale config from the URL when the dataset has no subsets.
    app_state.pop("config", None)
st.experimental_set_query_params(**app_state)

dts, fail = get(str(option), str(conf_option.name) if conf_option else None)

# Main panel setup.
if fail:
    st.markdown(
        "Dataset is too large to browse or requires manual download. Check it out in the datasets library! \n\n Size: "
        + str(dts.info.size_in_bytes)
        + "\n\n Instructions: "
        + str(dts.manual_download_instructions)
    )
else:
    split_names = list(dts.keys())
    split_index = split_names.index("train") if "train" in split_names else 0
    split = st.sidebar.selectbox("Split", split_names, index=split_index)

    d = dts[split]
    total = len(d)
    # An empty split has no first row to inspect; use the declared features.
    keys = list(d[0].keys()) if total else list(d.features.keys())

    st.header(
        "Dataset: "
        + option
        + " "
        + (("/ " + conf_option.name) if conf_option else "")
    )

    st.markdown(
        "*Homepage*: "
        + (d.info.homepage or "")
        + "\n\n*Dataset*: https://huggingface.co/datasets/%s" % (option)
    )

    description = (d.info.description or "").replace("\\", "") if option else ""
    st.markdown("\n%s\n" % description)

    step = 50
    offset = st.sidebar.number_input(
        "Offset (Size: %d)" % total,
        min_value=0,
        # Clamp so splits shorter than one page do not yield max < min.
        max_value=max(0, total - step),
        value=0,
        step=step,
    )
    # Never index past the end of the split on a short (or last) page.
    stop = min(offset + step, total)

    image_classification, gallary = False, False
    if d.info.task_templates:
        for task_template in d.info.task_templates:
            if task_template.task == 'image-classification':
                image_classification = True
                st.sidebar.markdown('\n---\n')
                gallary = st.sidebar.checkbox("Show Image Gallary 🖼️", False)
                break

    citation = st.sidebar.checkbox("Show Citations 📎", False)
    # Short-circuit on purpose: the list-view toggle is not offered for
    # image-classification datasets.
    table = image_classification or not st.sidebar.checkbox("Show List View 📋", False)
    show_features = st.sidebar.checkbox("Show Features 🧐", True)

    if citation:
        citation_text = (
            (d.info.citation or "")
            .replace("\\", "")
            .replace("}", " }")
            .replace("{", "{ ")
        )
        st.markdown("\n```\n%s\n```\n" % citation_text)

    # Columns to display; narrowed by the multiselect when it is shown.
    on_keys = keys
    if show_features:
        if not gallary:
            on_keys = st.multiselect("Features", keys, keys)
        st.write(render_features(d.features))

    if image_classification and gallary:
        # Image Gallary View.
        d = d.prepare_for_task('image-classification')
        n_cols = 5
        images = []
        labels = []
        for item in range(offset, stop):
            row = d[item]
            # convert() returns a loaded copy, so the file handle can be
            # released as soon as the conversion is done.
            with Image.open(row['image_file_path']) as img:
                images.append(img.convert("RGB"))
            label_str = d.features['labels'].int2str(row['labels'])
            labels.append(f"#{item} | {label_str}")

        n_rows = 1 + len(images) // n_cols
        cols_per_row = [st.beta_columns(n_cols) for _ in range(n_rows)]
        cols = [column for row_cols in cols_per_row for column in row_cols]
        for column, image, label in zip(cols, images, labels):
            column.image(image, caption=label)
    elif not table:
        # Full view.
        for item in range(offset, stop):
            row = d[item]  # fetch the row once, not once per column
            st.text(" ")
            st.text(" ---- #" + str(item))
            st.text(" ")
            # Use st to write out.
            for key in on_keys:
                value = row[key]
                st.subheader(key)
                if isinstance(value, str):
                    st.text(textwrap.fill(value, width=120))
                elif isinstance(value, (bool, int, float)):
                    st.text(value)
                else:
                    st.write(value)
    else:
        # Table view. Use Pandas.
        records = []
        for item in range(offset, stop):
            row = d[item]  # fetch the row once, not once per column
            record = {"_number": item}
            for key in on_keys:
                value = row[key]
                if isinstance(value, str):
                    record[key] = textwrap.fill(value, width=50)
                elif isinstance(value, (bool, int, float)):
                    record[key] = value
                else:
                    record[key] = json.dumps(value, indent=2, sort_keys=True)
            records.append(record)

        # Explicit columns keep set_index working when there are no rows.
        df = pd.DataFrame(records, columns=["_number", *on_keys]).set_index("_number")

        def hover(hover_color="#ffff99"):
            """Build the table-style entry that highlights the hovered row."""
            return dict(
                selector="tr:hover",
                props=[("background-color", "%s" % hover_color)],
            )

        styles = [
            hover(),
            dict(
                selector="th",
                props=[("font-size", "150%"), ("text-align", "center")],
            ),
            dict(selector="caption", props=[("caption-side", "bottom")]),
        ]

        # Table view. Use pandas styling.
        style = df.style.set_properties(
            **{"text-align": "left", "white-space": "pre"}
        ).set_table_styles(styles)
        st.table(style)

# Additional dataset installation and sidebar properties.
# Usage snippet for the selected dataset (and subset, when one is chosen).
# The fenced block needs its own lines to render as code in markdown.
md = """
### Code

```python
!pip install datasets
from datasets import load_dataset
dataset = load_dataset(
   '%s'%s)
```

""" % (
    option,
    (", '" + conf_option.name + "'") if conf_option else "",
)
st.sidebar.markdown(md)