Spaces:
Build error
Build error
import streamlit as st | |
import datasets | |
import os | |
import json | |
from transformers import AutoTokenizer | |
import ast | |
import re | |
# Sidebar switch selecting which on-disk dataset directory to browse.
version = st.sidebar.selectbox("Choose a version", ["init", "local_dedup"])
if version == "init":
    CACHE_DIR = "cache_ds/"  # Use this to build the dataset
else:
    CACHE_DIR = "local_dedup/"

# Mapping consulted by load_page to list the contributors of a split.
contribution_json = "contributors.json"
# Use a context manager so the file handle is closed as soon as the JSON is
# parsed instead of being left open until garbage collection.
with open(contribution_json, "r") as contribution_file:
    contribution_dict = json.load(contribution_file)

# Entries of CACHE_DIR that must not be offered in the split picker.
IGNORE_LIST = ["Bible", "Tanzil", "GNOME"]

# List the directory once; `splits` is the same listing minus the ignored names.
cached_ds = os.listdir(CACHE_DIR)
splits = [entry for entry in cached_ds if entry not in IGNORE_LIST]

# Tokenizer used by load_page for the token-count metric.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
def load_page(split):
    """Render the explorer page for one dataset split.

    The split is read from ``CACHE_DIR`` when its name appears in
    ``cached_ds``; otherwise it is loaded from the
    ``CarperAI/pile-v2-small-filtered`` repository. The page shows the split
    name, its contributors (when listed in ``contribution_dict``) and a form
    that lets the user pick a row and inspect its text, metadata, token
    count and word count.

    Args:
        split: Name of the split (sub-directory / data folder) to display.
    """
    # Reference length for the token-count metric. A single constant feeds
    # both the label and the delta so the two cannot disagree.
    # NOTE(review): the original label said 2048 while the delta used 4096;
    # 2048 was kept to match the label - confirm the intended limit.
    max_tokens = 2048

    with st.spinner('Downloading and buidling dataset...'):
        if split not in cached_ds:
            # `split="train"` makes load_dataset return a Dataset (row
            # indexable) instead of a DatasetDict keyed by split name; the
            # original passed "train" positionally, which is the config name.
            ds = datasets.load_dataset(
                'CarperAI/pile-v2-small-filtered',
                data_files="data/" + split + "/data.json",
                split="train",
            )
        else:
            ds = datasets.load_from_disk(CACHE_DIR + split)
    print("Sucessfully loaded " + split)

    st.title("Dataset Explorer")
    st.write(f"# {split}")
    if split in contribution_dict:
        st.caption(f"Contributors: {','.join(contribution_dict[split])}")
    else:
        st.caption("Needs to be updated....")

    row_count = len(ds)
    if row_count == 0:
        # A slider from 0 to -1 is invalid, so stop before building the form.
        st.warning("This dataset contains no rows.")
        return

    with st.form("dataset_form"):
        index = st.slider('Select a row', 0, row_count - 1, 0)
        if st.form_submit_button("Load"):
            st.write(f"Row {index}")
            data = ds[index]
            content = data["text"]
            meta = data["meta"]
            with st.expander("Render Content"):
                st.write(content)
            with st.expander("Raw Content"):
                st.text(content)
            with st.expander("Metadata and Metrics"):
                st.write("### Meta:")
                # Best effort: show the parsed literal when `meta` is a
                # Python-literal string, otherwise show it unchanged. Only
                # the errors literal_eval documents are caught.
                try:
                    parsed_meta = ast.literal_eval(meta)
                except (ValueError, TypeError, SyntaxError,
                        MemoryError, RecursionError):
                    st.write(meta)
                else:
                    st.write(parsed_meta)
                # Tokenizer-related count
                tokenized = tokenizer(content, return_length=True)['length'][0]
                st.metric(
                    f"Token Count(compared to {max_tokens})",
                    value=tokenized,
                    delta=max_tokens - tokenized,
                )
                # Word-related count
                split_words = re.findall(r'\w+', content)
                st.metric("Word Count", value=len(split_words))
# Sidebar picker for the split to explore.
demo_name = st.sidebar.selectbox("Choose a demo", splits)
if demo_name is None:
    # selectbox yields None when there is nothing to choose from; calling
    # load_page(None) would fail while building the data path.
    st.warning(f"No datasets found in {CACHE_DIR}")
else:
    load_page(demo_name)