mms-1b-all / create_vocab.py
patrickvonplaten's picture
up
0e87828
raw
history blame
880 Bytes
#!/usr/bin/env python3
import os
import json
# Directory whose entries are scanned for vocabulary files (one file per
# language key).
folder_path = "./vocabs"
# Result accumulator: maps the key derived from each file name to the
# token-to-id dict built from that file.
all_dict = {}
def parse_file(filename):
    """Build a token-to-id vocabulary from a whitespace-delimited text file.

    The first whitespace-separated field of every non-blank line is taken as
    a token; any further fields on that line are ignored.  Ids 0-3 are
    reserved for the special tokens ``<pad>``, ``<s>``, ``</s>`` and
    ``<unk>``; tokens read from the file are numbered from 4 upwards in the
    order they appear.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        filename: Path of the vocabulary file to read.

    Returns:
        A dict mapping each token to its integer id.  A token that occurs on
        several lines keeps the id of its last occurrence (every occurrence
        still consumes an id), and a file token equal to one of the special
        tokens replaces that special token's id.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    dictionary = {
        "</s>": 2,
        "<pad>": 0,
        "<s>": 1,
        "<unk>": 3,
    }
    with open(filename, "r", encoding="utf-8") as file:
        # Keep only non-blank lines, each split into whitespace-separated
        # fields; str.split() with no argument already ignores surrounding
        # whitespace, so no separate strip() is needed.
        rows = (fields for fields in map(str.split, file) if fields)
        # Ids continue right after the four reserved special-token ids.
        for value, fields in enumerate(rows, start=4):
            dictionary[fields[0]] = value
    return dictionary
# Build one vocabulary per regular file found directly inside folder_path.
# The key is the part of the file name before the first "."; entries sharing
# that prefix overwrite each other in whatever order os.listdir() yields them.
for entry in os.listdir(folder_path):
    candidate = os.path.join(folder_path, entry)
    if not os.path.isfile(candidate):
        # Anything that is not a regular file is ignored.
        continue
    language = entry.partition(".")[0]
    all_dict[language] = parse_file(candidate)

# Destination of the combined {key: vocabulary} JSON document.
output_path = "vocab_1.json"
with open(output_path, "w") as sink:
    # sort_keys=True makes the written key order independent of the order in
    # which files and tokens were encountered.
    json.dump(all_dict, sink, sort_keys=True, indent=4)