Spaces:

soutrik
/

EraV2_S20_Tokenization

Sleeping

File size: 838 Bytes

254cbbb

"""
This script is used to train the tokenizer model using the combined content of the dataset.
The trained model is saved in the tokenizer_model directory.
"""

import os
from tokenizer.basic_bpe import BasicTokenizer

# Load the full training corpus into memory.
# The path is resolved against the current working directory, so the script
# has to be launched from the directory that contains the "data" folder.
combined_file_path = "hindi_combined.txt"
combined_file_path = os.path.join(os.getcwd(), "data", combined_file_path)
# The corpus is Hindi text, so decode it explicitly as UTF-8 rather than
# relying on the platform's default encoding (e.g. cp1252 on Windows), which
# would either raise UnicodeDecodeError or silently corrupt the characters
# the tokenizer is trained on.
# NOTE(review): assumes the corpus file is stored as UTF-8 - confirm.
with open(combined_file_path, "r", encoding="utf-8") as file:
    combined_content = file.read()

# Train a BasicTokenizer on the corpus, requesting a vocabulary size of 5000
# and passing verbose=True to the trainer.
basic_tokenizer = BasicTokenizer()
print("Training the tokenizer model...")
basic_tokenizer.train(combined_content, vocab_size=5000, verbose=True)

# Persist the trained tokenizer under <cwd>/tokenizer_model, creating the
# directory if it does not exist yet.
print("Saving the model...")
model_path = os.path.join(os.getcwd(), "tokenizer_model")
os.makedirs(model_path, exist_ok=True)
# `prefix` is a path stem, not a full filename; presumably BasicTokenizer.save
# appends its own extension(s) to it - verify against tokenizer/basic_bpe.py.
prefix = os.path.join(model_path, "hindi_sentiments_basic")
basic_tokenizer.save(prefix)
print("Model saved at:", prefix)