JonasGeiping committed on
Commit
77c5c17
1 Parent(s): a61dd96

Upload data_budget_hours_24.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_budget_hours_24.json +67 -0
data_budget_hours_24.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sources": {
3
+ "the_pile": {
4
+ "provider": "local",
5
+ "file_type": "json",
6
+ "files": [
7
+ "/fs/cml-datasets/Pile/train/00.jsonl.zst",
8
+ "/fs/cml-datasets/Pile/train/01.jsonl.zst",
9
+ "/fs/cml-datasets/Pile/train/02.jsonl.zst",
10
+ "/fs/cml-datasets/Pile/train/03.jsonl.zst",
11
+ "/fs/cml-datasets/Pile/train/04.jsonl.zst",
12
+ "/fs/cml-datasets/Pile/train/05.jsonl.zst",
13
+ "/fs/cml-datasets/Pile/train/06.jsonl.zst",
14
+ "/fs/cml-datasets/Pile/train/07.jsonl.zst",
15
+ "/fs/cml-datasets/Pile/train/08.jsonl.zst",
16
+ "/fs/cml-datasets/Pile/train/09.jsonl.zst",
17
+ "/fs/cml-datasets/Pile/train/10.jsonl.zst",
18
+ "/fs/cml-datasets/Pile/train/11.jsonl.zst",
19
+ "/fs/cml-datasets/Pile/train/12.jsonl.zst",
20
+ "/fs/cml-datasets/Pile/train/13.jsonl.zst",
21
+ "/fs/cml-datasets/Pile/train/14.jsonl.zst",
22
+ "/fs/cml-datasets/Pile/train/15.jsonl.zst",
23
+ "/fs/cml-datasets/Pile/train/16.jsonl.zst",
24
+ "/fs/cml-datasets/Pile/train/17.jsonl.zst",
25
+ "/fs/cml-datasets/Pile/train/18.jsonl.zst",
26
+ "/fs/cml-datasets/Pile/train/19.jsonl.zst",
27
+ "/fs/cml-datasets/Pile/train/20.jsonl.zst",
28
+ "/fs/cml-datasets/Pile/train/21.jsonl.zst",
29
+ "/fs/cml-datasets/Pile/train/22.jsonl.zst",
30
+ "/fs/cml-datasets/Pile/train/23.jsonl.zst",
31
+ "/fs/cml-datasets/Pile/train/24.jsonl.zst",
32
+ "/fs/cml-datasets/Pile/train/25.jsonl.zst",
33
+ "/fs/cml-datasets/Pile/train/26.jsonl.zst",
34
+ "/fs/cml-datasets/Pile/train/27.jsonl.zst",
35
+ "/fs/cml-datasets/Pile/train/28.jsonl.zst",
36
+ "/fs/cml-datasets/Pile/train/29.jsonl.zst"
37
+ ],
38
+ "filter": null,
39
+ "split": "train",
40
+ "streaming": true,
41
+ "remove_columns": null,
42
+ "concatenate_successive_entries": 0
43
+ }
44
+ },
45
+ "name": "the_pile",
46
+ "normalizer": {
47
+ "force_lowercase": true,
48
+ "strip_accents": true,
49
+ "force_english_keyboard": true,
50
+ "whitespace_escape": false
51
+ },
52
+ "tokenizer": "WordPiece",
53
+ "vocab_size": 32768,
54
+ "seq_length": 128,
55
+ "include_cls_token_in_corpus": false,
56
+ "include_sep_token_in_corpus": true,
57
+ "use_type_ids": false,
58
+ "max_entries_in_raw_dataset": 8000000.0,
59
+ "max_seq_in_tokenized_dataset": 85000000.0,
60
+ "named_entity_simplification": false,
61
+ "remove_whitespaces": false,
62
+ "remove_trash": true,
63
+ "trash_cutoff": 0.25,
64
+ "deduplicate_entries": false,
65
+ "deduplication_threshold": 75,
66
+ "ordering": "sentence-length-curriculum"
67
+ }