Spaces:

soutrik
/

EraV2_S20_Tokenization

Running

EraV2_S20_Tokenization / train.py

added: trained tokenizer plus gradio app

254cbbb 6 months ago

838 Bytes

	"""
	This script is used to train the tokenizer model using the combined content of the dataset.
	The trained model is saved in the tokenizer_model directory.
	"""

	import os
	from tokenizer.basic_bpe import BasicTokenizer

	# Load the entire training corpus into memory as one string.
	# NOTE: every path below is resolved against the current working directory,
	# so the script has to be launched from the directory that contains "data/".
	combined_file_path = "hindi_combined.txt"
	combined_file_path = os.path.join(os.getcwd(), "data", combined_file_path)
	# Decode explicitly as UTF-8 instead of relying on the platform default
	# encoding, which is not UTF-8 everywhere (e.g. cp1252 on Windows) and would
	# fail or mis-decode non-ASCII text.
	# NOTE(review): assumes the corpus file is stored as UTF-8 - confirm.
	with open(combined_file_path, "r", encoding="utf-8") as file:
		combined_content = file.read()

	# Train on the full corpus text; vocab_size and verbose are passed straight
	# through to BasicTokenizer.train.
	basic_tokenizer = BasicTokenizer()
	print("Training the tokenizer model...")
	basic_tokenizer.train(combined_content, vocab_size=5000, verbose=True)

	# Persist the trained tokenizer under <cwd>/tokenizer_model, creating the
	# directory first so that save() does not depend on it already existing.
	print("Saving the model...")
	model_path = os.path.join(os.getcwd(), "tokenizer_model")
	os.makedirs(model_path, exist_ok=True)
	# save() receives a path prefix rather than a full file name; presumably the
	# tokenizer derives its output file name(s) from it - verify in basic_bpe.
	prefix = os.path.join(model_path, "hindi_sentiments_basic")
	basic_tokenizer.save(prefix)
	print("Model saved at:", prefix)