---
library_name: transformers
license: apache-2.0
tags:
- tokenizer
- claude3
- t5
---

# claude3 tokenizer for T5

Vocabulary size: 65103

- relevant special tokens for T5 training have been added
- the post-processor has been updated to follow T5's tokenizer (see below)

## usage

```py
from transformers import AutoTokenizer

tk = AutoTokenizer.from_pretrained("BEE-spoke-data/claude-tokenizer-forT5")

# the post-processor appends </s> to encoded sequences
inputs = tk("here are some words", return_tensors="pt")
```
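
As a quick check that the post-processor is active (a minimal sketch; it assumes `</s>` is registered as the end-of-sequence token with id 65001, per the config below):

```py
from transformers import AutoTokenizer

tk = AutoTokenizer.from_pretrained("BEE-spoke-data/claude-tokenizer-forT5")

ids = tk("here are some words").input_ids
# the template post-processor should append </s> (id 65001) at the end
assert ids[-1] == 65001
```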

## post processor

```json
"post_processor": {
  "type": "TemplateProcessing",
  "single": [
    { "Sequence": { "id": "A", "type_id": 0 } },
    { "SpecialToken": { "id": "</s>", "type_id": 0 } }
  ],
  "pair": [
    { "Sequence": { "id": "A", "type_id": 0 } },
    { "SpecialToken": { "id": "</s>", "type_id": 0 } },
    { "Sequence": { "id": "B", "type_id": 0 } },
    { "SpecialToken": { "id": "</s>", "type_id": 0 } }
  ],
  "special_tokens": {
    "</s>": {
      "id": "</s>",
      "ids": [65001],
      "tokens": ["</s>"]
    }
  }
},
```
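
The `pair` template lays a two-sequence input out as `A </s> B </s>`, with every `type_id` set to 0. A minimal sketch of what that implies when encoding a sentence pair (it assumes neither input contains the special token itself):

```py
from transformers import AutoTokenizer

tk = AutoTokenizer.from_pretrained("BEE-spoke-data/claude-tokenizer-forT5")

pair = tk("first sequence", "second sequence")
# one </s> (id 65001) closes each sequence: A </s> B </s>
assert pair.input_ids.count(65001) == 2
assert pair.input_ids[-1] == 65001
```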