sanchit-gandhi HF staff ArthurZ HF staff committed on
Commit
e37978b
1 Parent(s): 8c1db9b

Upload tokenizer (#28)

Browse files

- Upload tokenizer (2ca60a10925b961f7b3f37ed3646fd8991bac3f6)


Co-authored-by: Arthur Zucker <ArthurZ@users.noreply.huggingface.co>

merges.txt CHANGED
@@ -1,4 +1,5 @@
1
  #version: 0.2
 
2
  Ġ a
3
  Ġt h
4
  i n
 
1
  #version: 0.2
2
+ Ġ t
3
  Ġ a
4
  Ġt h
5
  i n
special_tokens_map.json CHANGED
@@ -111,22 +111,28 @@
111
  "bos_token": {
112
  "content": "<|endoftext|>",
113
  "lstrip": false,
114
- "normalized": true,
115
  "rstrip": false,
116
  "single_word": false
117
  },
118
  "eos_token": {
119
  "content": "<|endoftext|>",
120
  "lstrip": false,
121
- "normalized": true,
 
 
 
 
 
 
 
122
  "rstrip": false,
123
  "single_word": false
124
  },
125
- "pad_token": "<|endoftext|>",
126
  "unk_token": {
127
  "content": "<|endoftext|>",
128
  "lstrip": false,
129
- "normalized": true,
130
  "rstrip": false,
131
  "single_word": false
132
  }
 
111
  "bos_token": {
112
  "content": "<|endoftext|>",
113
  "lstrip": false,
114
+ "normalized": false,
115
  "rstrip": false,
116
  "single_word": false
117
  },
118
  "eos_token": {
119
  "content": "<|endoftext|>",
120
  "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ },
125
+ "pad_token": {
126
+ "content": "<|endoftext|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
  "rstrip": false,
130
  "single_word": false
131
  },
 
132
  "unk_token": {
133
  "content": "<|endoftext|>",
134
  "lstrip": false,
135
+ "normalized": false,
136
  "rstrip": false,
137
  "single_word": false
138
  }
tokenizer.json CHANGED
@@ -64848,6 +64848,7 @@
64848
  "<|endoftext|>": 50257
64849
  },
64850
  "merges": [
 
64851
  "Ġ a",
64852
  "Ġt h",
64853
  "i n",
 
64848
  "<|endoftext|>": 50257
64849
  },
64850
  "merges": [
64851
+ "Ġ t",
64852
  "Ġ a",
64853
  "Ġt h",
64854
  "i n",
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff