"""Build vocab_1.json: one token-to-ID mapping per vocabulary file in ./vocabs."""

import json
import os

# Directory that is scanned for vocabulary files (one file per language).
folder_path = "./vocabs"

# Accumulates the result: language key -> {token: integer ID}.
all_dict = {}


def parse_file(filename):
    """Build a token-to-ID mapping from a vocabulary file.

    The first whitespace-separated field of every non-blank line is taken
    as the token; any remaining fields on the line are ignored. IDs 0-3 are
    reserved for the special tokens ``<pad>``, ``<s>``, ``</s>`` and
    ``<unk>``; tokens from the file receive consecutive IDs starting at 4,
    in order of first appearance.

    Args:
        filename: Path of the vocabulary file. It is read as UTF-8.

    Returns:
        A dict mapping each token (str) to its integer ID.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    dictionary = {
        "<pad>": 0,
        "<s>": 1,
        "</s>": 2,
        "<unk>": 3,
    }

    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.split()
            if not fields:
                continue  # blank or whitespace-only line

            token = fields[0]
            # Keep the first ID a token was given. Re-assigning would move a
            # reserved special token listed in the file (e.g. "<unk>") away
            # from its fixed ID and leave holes in the ID range.
            if token not in dictionary:
                # The mapping is dense from 0, so its size is the next free ID.
                dictionary[token] = len(dictionary)

    return dictionary


# Parse every regular file in the vocabulary folder. The key is the part of
# the file name before its first dot (e.g. "en.txt" -> "en").
# os.listdir() returns entries in arbitrary order, so iterate in sorted order:
# if two files share a key, which one ends up in all_dict is then repeatable.
for filename in sorted(os.listdir(folder_path)):
    filepath = os.path.join(folder_path, filename)
    if not os.path.isfile(filepath):
        continue  # skip sub-directories and other non-file entries
    lang = filename.split(".")[0]
    all_dict[lang] = parse_file(filepath)


output_path = "vocab_1.json"

# Write all vocabularies as one pretty-printed JSON object with sorted keys.
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(all_dict, output_file, indent=4, sort_keys=True)