Upload tokenizer.json

The persisted `tokenizer.json` is missing the `TemplateProcessing` post-processor that adds special tokens; it only has a `ByteLevel` post-processor. `transformers` overrides the post-processor on load, but when `tokenizer.json` is loaded directly with the Rust `tokenizers` library, the template processor needs to be present in the file already (loading directly this way has worked so far for other models). This change re-saves the tokenizer so that the file matches exactly what `transformers` loads.

---

Generated with:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base")
assert tokenizer.is_fast
tokenizer.save_pretrained("...")
```

Files changed (1)

tokenizer.json +52 -4

tokenizer.json (changed)

@@ -254,10 +254,58 @@
     ]
   },
   "post_processor": {
-    "type": "ByteLevel",
-    "add_prefix_space": true,
-    "trim_offsets": false,
-    "use_regex": true
   },
   "decoder": {
     "type": "ByteLevel",

     ]
   },
   "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "<｜begin▁of▁sentence｜>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "<｜begin▁of▁sentence｜>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<｜begin▁of▁sentence｜>",
+          "type_id": 1
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "<｜begin▁of▁sentence｜>": {
+        "id": "<｜begin▁of▁sentence｜>",
+        "ids": [
+          32013
+        ],
+        "tokens": [
+          "<｜begin▁of▁sentence｜>"
+        ]
+      }
+    }
   },
   "decoder": {
     "type": "ByteLevel",