Spaces:

CarperAI
/

pile-v2-eda

Build error

File size: 2,483 Bytes

d79b272
 
 
 
 
 
9c88e2b
 
212fefd
d1b0126
 
212fefd
d1b0126
212fefd
 
d79b272
 
 
a9a3b2f
d79b272
9c88e2b
d79b272
 
 
 
 
 
 
 
 
 
 
 
 
 
fe813bd
 
 
 
d79b272
a4b7ec2
d79b272
 
 
 
 
 
 
8e50efb
f9173b6
8e50efb
 
1b6aa17
 
 
 
8e50efb
 
 
9c88e2b
8e50efb
 
5a14023
d79b272
 
 
fe813bd

import streamlit as st
import datasets
import os
import json
from transformers import AutoTokenizer
import ast
import re

# Maps each selectable dataset version to the directory holding its
# saved splits.
_VERSION_CACHE_DIRS = {
    "init": "cache_ds/",  # Use this to build the dataset
    "local_dedup": "local_dedup/",
    "reformatted": "reformatted/",
}

# Sidebar control choosing which version (and therefore which directory)
# the rest of the script reads from.
version = st.sidebar.selectbox("Choose a version", list(_VERSION_CACHE_DIRS))
CACHE_DIR = _VERSION_CACHE_DIRS[version]
contribution_json = "contributors.json"

# Split name -> list of contributor names, shown as a caption on each page.
# Opened through a context manager so the file handle is closed promptly.
with open(contribution_json, "r", encoding="utf-8") as contribution_file:
    contribution_dict = json.load(contribution_file)
IGNORE_LIST = ["Bible", "Tanzil", "GNOME"]

# List the directory once: `cached_ds` is every entry available on disk,
# `splits` is the subset offered to the user.
cached_ds = os.listdir(CACHE_DIR)
splits = [entry for entry in cached_ds if entry not in IGNORE_LIST]

# NOTE(review): this runs at module top level, so the tokenizer is loaded
# every time the script executes; consider caching it if reruns are slow.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')


def load_page(split, context_length=4096):
    """Render the explorer page for one dataset split.

    The split is read from ``CACHE_DIR`` when its name appears in
    ``cached_ds``; otherwise it is fetched with ``datasets.load_dataset``.
    The page shows a title, the contributors caption, and a form with a row
    slider. Submitting the form displays the selected row's text (rendered
    and raw), its metadata, and token / word counts.

    Args:
        split: Name of the split. Used as the sub-directory under
            ``CACHE_DIR`` and in the remote ``data/<split>/data.json`` path.
        context_length: Token budget the row's token count is compared
            against. Defaults to 4096, the value the metric delta has always
            used; the metric label is now derived from it so the two cannot
            disagree.
    """
    with st.spinner('Downloading and buidling dataset...'):
        if split not in cached_ds:
            # NOTE(review): "train" is passed positionally, i.e. as the
            # config name rather than as `split=` - confirm this is intended.
            ds = datasets.load_dataset('CarperAI/pile-v2-small-filtered',"train", data_files="data/"+split+"/data.json")
        else:
            ds = datasets.load_from_disk(CACHE_DIR+split)
    # A DatasetDict maps split names to datasets, so len() and integer
    # indexing would act on split names instead of rows. Unwrap it to a
    # single Dataset, preferring "train".
    if isinstance(ds, datasets.DatasetDict) and len(ds) > 0:
        ds = ds["train"] if "train" in ds else next(iter(ds.values()))
    print("Sucessfully loaded "+split)
    st.title("Dataset Explorer")
    st.write(f"# {split}")
    if split in contribution_dict:
        st.caption(f"Contributors: {','.join(contribution_dict[split])}")
    else:
        st.caption("Needs to be updated....")

    row_count = len(ds)
    if row_count == 0:
        # A slider over the range 0..-1 is invalid; report and stop instead.
        st.warning(f"{split} contains no rows.")
        return

    with st.form("dataset_form"):
        index = st.slider('Select a row', 0, row_count - 1, 0)
        if st.form_submit_button("Load"):
            st.write(f"Row {index}")
            data = ds[index]
            content = data["text"]
            meta = data["meta"]
            with st.expander("Render Content"):
                st.write(content)
            with st.expander("Raw Content"):
                st.text(content)
            with st.expander("Metadata and Metrics"):
                st.write("### Meta:")
                # Try to parse the metadata as a Python literal; fall back to
                # showing it as-is when it cannot be parsed.
                try:
                    st.write(ast.literal_eval(meta))
                except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                    st.write(meta)
                # Tokenizer-related count
                tokenized = tokenizer(content, return_length=True)['length'][0]
                st.metric(
                    f"Token Count(compared to {context_length})",
                    value=tokenized,
                    delta=context_length - tokenized,
                )
                # Word-related count: runs of word characters.
                split_words = re.findall(r'\w+', content)
                st.metric("Word Count", value=len(split_words))


# Sidebar picker for the split to explore, then render that split's page.
demo_name = st.sidebar.selectbox("Choose a demo", splits)
if demo_name is None:
    # With no options the selectbox yields None, which load_page cannot
    # turn into a path; show a message instead of crashing.
    st.info("No datasets found for the selected version.")
else:
    load_page(demo_name)