Reshinth Adithyan committed on
Commit 9c88e2b
1 Parent(s): 1e714f6

Remove single datapoint datasets

Files changed (2)
  1. app.py +10 -4
  2. load_dataset.py +18 -13
app.py CHANGED
@@ -4,14 +4,16 @@ import os
 import json
 from transformers import AutoTokenizer
 import ast
+import re
+
 
 CACHE_DIR = "cache_ds/" #Use this to build the dataset
 contribution_json = "contributors.json"
 
 contribution_dict = json.load(open(contribution_json,"r"))
+IGNORE_LIST = ["Bible","Tanzil",""]
 
-
-splits = os.listdir(CACHE_DIR)#['EuroParliamentProceedings', 'TED2020', 'PileOfLaw', 'StackExchange_ver2', 'GithubIssues', 'Opensubtitles', 'USPTO', 'S2ORC', 'DevDocs', 'CodePileReddit2022', 'DMMath', 'Gutenberg', 'USENET', 'GithubDiff', 'Enwiki', 'GNOME', 'ASFPublicMail', 'PileV2Reddit2020', 'CodePilePosts', 'Discourse', 'Tanzil', 'arXiv', 'UbuntuIRC', 'PubMed', 'CodePileReddit2020', 'CodePileReddit2021', 'GlobalVoices', 'FreeLaw_Options', 'PileV2Posts','Bible']
+splits = [split for split in os.listdir(CACHE_DIR) if split not in IGNORE_LIST]
 
 cached_ds = os.listdir(CACHE_DIR)
 tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
@@ -43,9 +45,13 @@ def load_page(split):
     st.text(content)
     st.write("### Meta:")
     st.write(ast.literal_eval(meta))
+    #Tokenizer related count
     tokenized = tokenizer(content, return_length=True)['length'][0]
-    token_count_metric = st.metric("Token Count",value=tokenized,delta=2048-tokenized)
-
+    token_count_metric = st.metric("Token Count(compared to 2048)",value=tokenized,delta=4096-tokenized)
+    #Word related count
+    split_words = re.findall(r'\w+', content)
+    word_count_metric = st.metric("Word Count",value=len(split_words))
+
 
 
 demo_name = st.sidebar.selectbox("Choose a demo", splits)
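The new splits list filters out the single-datapoint subsets named in IGNORE_LIST before they reach the sidebar, and the page now reports both a tokenizer-based count (delta computed against 4096) and a regex-based word count. A minimal sketch of that logic outside Streamlit, assuming the cache_ds/ directory from this repo exists locally and using a placeholder document string (not part of the commit):

import os
import re
from transformers import AutoTokenizer

CACHE_DIR = "cache_ds/"
IGNORE_LIST = ["Bible", "Tanzil", ""]

# Drop the single-datapoint subsets before they are offered as demos.
splits = [split for split in os.listdir(CACHE_DIR) if split not in IGNORE_LIST]

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
content = "example document text"  # placeholder; the app reads this from the cached dataset

# Token count as computed in load_page; the app shows delta = 4096 - token_count via st.metric.
token_count = tokenizer(content, return_length=True)["length"][0]
# Word count via the same regex the app uses.
word_count = len(re.findall(r"\w+", content))
print(splits, token_count, word_count)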
load_dataset.py CHANGED
@@ -1,17 +1,22 @@
-import datasets
-import logging
+# import datasets
+# import logging
 import os
-from tqdm import tqdm
-PATH = "/Users/reshinthadithyan/master/research/code-research/carperai/pile-v2-small-filtered/data"
-dataset_subs = os.listdir(PATH)
+import json
+# from tqdm import tqdm
+# dataset_subs = os.listdir(PATH)
 
-print(dataset_subs)
+# print(dataset_subs)
 
 
-for ds in tqdm(dataset_subs):
-    try:
-        print(ds)
-        dataset = datasets.load_dataset("CarperAI/pile-v2-small-filtered",data_files=f"data/{ds}/data.json", split="train")
-        dataset.save_to_disk(f"cache_ds/{ds}")
-    except:
-        print(f"Error at {ds}")
+# for ds in tqdm(dataset_subs):
+#     try:
+#         print(ds)
+#         dataset = datasets.load_dataset("CarperAI/pile-v2-small-filtered",data_files=f"data/{ds}/data.json", split="train")
+#         dataset.save_to_disk(f"cache_ds/{ds}")
+#     except:
+#         print(f"Error at {ds}")
+
+ds_subsets = os.listdir("cache_ds")
+
+with open("documentation.json","w") as f:
+    json.dump(ds_subsets,f)
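As a usage note, subsets written by the (now commented-out) caching loop via save_to_disk are read back with datasets.load_from_disk, and documentation.json simply lists the cached subset names. A minimal sketch under those assumptions; the subset picked below is arbitrary for illustration, not something the commit specifies:

import json
import datasets

# The subset names dumped by load_dataset.py above.
with open("documentation.json", "r") as f:
    ds_subsets = json.load(f)

# save_to_disk / load_from_disk are the paired datasets APIs for the cached copies.
subset = ds_subsets[0]  # arbitrary pick for illustration
dataset = datasets.load_from_disk(f"cache_ds/{subset}")
print(subset, dataset)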