soutrik's picture
added: trained tokenizer plus gradio app
254cbbb
raw
history blame
838 Bytes
"""
Train a BasicTokenizer on the combined dataset text read from data/hindi_combined.txt.
The trained model is saved in the tokenizer_model directory; both paths are resolved
against the current working directory.
"""
import os
from tokenizer.basic_bpe import BasicTokenizer
# Read the training corpus. The path is resolved against the current working
# directory: <cwd>/data/hindi_combined.txt.
combined_file_path = "hindi_combined.txt"
combined_file_path = os.path.join(os.getcwd(), "data", combined_file_path)

# Decode explicitly as UTF-8. Without an encoding argument open() falls back to
# the locale's preferred encoding, which on some platforms cannot decode Hindi
# (Devanagari) text and would either raise UnicodeDecodeError or silently feed
# mis-decoded characters to the tokenizer.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
with open(combined_file_path, "r", encoding="utf-8") as file:
    combined_content = file.read()

# Train on the whole corpus as a single string, requesting vocab_size=5000
# with verbose output enabled.
basic_tokenizer = BasicTokenizer()
print("Training the tokenizer model...")
basic_tokenizer.train(combined_content, vocab_size=5000, verbose=True)

# Persist the trained model under <cwd>/tokenizer_model, creating the
# directory first so save() does not fail on a fresh checkout.
print("Saving the model...")
model_path = os.path.join(os.getcwd(), "tokenizer_model")
os.makedirs(model_path, exist_ok=True)

# `prefix` is passed to save() as-is; presumably save() appends its own file
# extension(s) to it - verify against BasicTokenizer.save.
prefix = os.path.join(model_path, "hindi_sentiments_basic")
basic_tokenizer.save(prefix)
print("Model saved at:", prefix)