saicharan2804 committed on
Commit
0cee7ca
·
1 Parent(s): 719a68e

Updated training

Browse files
Files changed (1) hide show
  1. trainBpeTokenizer.py +11 -2
trainBpeTokenizer.py CHANGED
@@ -3,6 +3,15 @@ from tokenizers.models import BPE
3
  from tokenizers.trainers import BpeTrainer
4
  from tokenizers.pre_tokenizers import ByteLevel
5
  from tokenizers.processors import TemplateProcessing
 
 
 
 
 
 
 
 
 
6
 
7
  # Initialize a tokenizer
8
  tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
@@ -14,7 +23,7 @@ tokenizer.pre_tokenizer = ByteLevel()
14
  trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
15
 
16
  # Path to the file(s) for training the tokenizer
17
- files = ["/home/saicharan/Downloads/chembl.csv"]
18
 
19
  # Train the tokenizer
20
  tokenizer.train(files, trainer)
@@ -30,4 +39,4 @@ tokenizer.post_processor = TemplateProcessing(
30
  )
31
 
32
  # Save the tokenizer
33
- tokenizer.save("/home/saicharan/Downloads/chembl_bpe_tokenizer.json")
 
3
  from tokenizers.trainers import BpeTrainer
4
  from tokenizers.pre_tokenizers import ByteLevel
5
  from tokenizers.processors import TemplateProcessing
6
+ import argparse
7
+
8
+
9
+ parser = argparse.ArgumentParser(description='Train BPE Tokenizer.')
10
+ parser.add_argument('dataset_file_path', type=str, help='Path to the dataset file')
11
+ parser.add_argument('output_file_path', type=str, help='Path to file containing trained tokenizer weights')
12
+
13
+ # Parse the arguments
14
+ args = parser.parse_args()
15
 
16
  # Initialize a tokenizer
17
  tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
 
23
  trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
24
 
25
  # Path to the file(s) for training the tokenizer
26
+ files = [args.dataset_file_path]
27
 
28
  # Train the tokenizer
29
  tokenizer.train(files, trainer)
 
39
  )
40
 
41
  # Save the tokenizer
42
+ tokenizer.save(args.output_file_path)