# NOTE: removed non-Python residue from a web file viewer (status text,
# file size, commit hash, and line-number gutter) that made this file
# unparseable as Python.
"""
This script is used to train the tokenizer model using the combined content of the dataset.
The trained model is saved in the tokenizer_model directory.
"""
import os
from tokenizer.basic_bpe import BasicTokenizer
# Read the combined training corpus from data/hindi_combined.txt.
# The corpus is Hindi (Devanagari) text, so force UTF-8 explicitly:
# relying on the platform default encoding (e.g. cp1252 on Windows)
# would raise UnicodeDecodeError or silently corrupt the text.
combined_file_path = os.path.join(os.getcwd(), "data", "hindi_combined.txt")
with open(combined_file_path, "r", encoding="utf-8") as file:
    combined_content = file.read()

# Train a byte-pair-encoding tokenizer on the full corpus.
basic_tokenizer = BasicTokenizer()
print("Training the tokenizer model...")
basic_tokenizer.train(combined_content, vocab_size=5000, verbose=True)

# Persist the trained model under tokenizer_model/ using the
# "hindi_sentiments_basic" file prefix expected by downstream loaders.
print("Saving the model...")
model_path = os.path.join(os.getcwd(), "tokenizer_model")
os.makedirs(model_path, exist_ok=True)
prefix = os.path.join(model_path, "hindi_sentiments_basic")
basic_tokenizer.save(prefix)
print("Model saved at:", prefix)