""" | |
This script is used to train the tokenizer model using the combined content of the dataset. | |
The trained model is saved in the tokenizer_model directory. | |
""" | |
import os

from tokenizer.basic_bpe import BasicTokenizer
# Read the combined Hindi corpus used for training
combined_file_path = os.path.join(os.getcwd(), "data", "hindi_combined.txt")
with open(combined_file_path, "r", encoding="utf-8") as file:
    combined_content = file.read()
# Train a byte-pair-encoding tokenizer with a 5000-token vocabulary
basic_tokenizer = BasicTokenizer()
print("Training the tokenizer model...")
basic_tokenizer.train(combined_content, vocab_size=5000, verbose=True)
print("Saving the model...") | |
model_path = os.path.join(os.getcwd(), "tokenizer_model") | |
os.makedirs(model_path, exist_ok=True) | |
prefix = os.path.join(model_path, "hindi_sentiments_basic") | |
basic_tokenizer.save(prefix) | |
print("Model saved at:", prefix) | |