Spaces:
Runtime error
Runtime error
saicharan2804
committed on
Commit
·
0cee7ca
1
Parent(s):
719a68e
Updated training
Browse files - trainBpeTokenizer.py +11 -2
trainBpeTokenizer.py
CHANGED
@@ -3,6 +3,15 @@ from tokenizers.models import BPE
|
|
3 |
from tokenizers.trainers import BpeTrainer
|
4 |
from tokenizers.pre_tokenizers import ByteLevel
|
5 |
from tokenizers.processors import TemplateProcessing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
# Initialize a tokenizer
|
8 |
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
|
@@ -14,7 +23,7 @@ tokenizer.pre_tokenizer = ByteLevel()
|
|
14 |
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
15 |
|
16 |
# Path to the file(s) for training the tokenizer
|
17 |
-
files = [
|
18 |
|
19 |
# Train the tokenizer
|
20 |
tokenizer.train(files, trainer)
|
@@ -30,4 +39,4 @@ tokenizer.post_processor = TemplateProcessing(
|
|
30 |
)
|
31 |
|
32 |
# Save the tokenizer
|
33 |
-
tokenizer.save(
|
|
|
3 |
from tokenizers.trainers import BpeTrainer
|
4 |
from tokenizers.pre_tokenizers import ByteLevel
|
5 |
from tokenizers.processors import TemplateProcessing
|
6 |
+
import argparse
|
7 |
+
|
8 |
+
|
9 |
+
parser = argparse.ArgumentParser(description='Train BPE Tokenizer.')
|
10 |
+
parser.add_argument('dataset_file_path', type=str, help='Path to the dataset file')
|
11 |
+
parser.add_argument('output_file_path', type=str, help='Path to file containing trained tokenizer weights')
|
12 |
+
|
13 |
+
# Parse the arguments
|
14 |
+
args = parser.parse_args()
|
15 |
|
16 |
# Initialize a tokenizer
|
17 |
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
|
|
|
23 |
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
24 |
|
25 |
# Path to the file(s) for training the tokenizer
|
26 |
+
files = [args.dataset_file_path]
|
27 |
|
28 |
# Train the tokenizer
|
29 |
tokenizer.train(files, trainer)
|
|
|
39 |
)
|
40 |
|
41 |
# Save the tokenizer
|
42 |
+
tokenizer.save(args.output_file_path)
|