import os


def _read_label_set(path, *, split_tokens=False):
    """Return the set of distinct labels found in the text file at *path*.

    Args:
        path: File to read (UTF-8).
        split_tokens: When False, each non-empty stripped line is one label.
            When True, every whitespace-separated token on a line is a label.

    Returns:
        A set of label strings. Blank lines never contribute a label.
    """
    labels = set()
    with open(path, 'r', encoding='utf-8') as f_r:
        for line in f_r:
            if split_tokens:
                # str.split() with no argument already drops empty tokens.
                labels.update(line.split())
            else:
                label = line.strip()
                # Skip blank lines so no empty-string label enters the vocab.
                if label:
                    labels.add(label)
    return labels


def _write_vocab(path, special_tokens, labels):
    """Write *special_tokens* followed by *labels* to *path*, one per line.

    Labels equal to one of the special tokens are not written a second time,
    so every entry in the resulting file is unique.
    """
    with open(path, 'w', encoding='utf-8') as f_w:
        f_w.writelines(token + '\n' for token in special_tokens)
        f_w.writelines(
            label + '\n' for label in labels if label not in special_tokens
        )


def vocab_process(data_dir):
    """Build the intent and slot label vocabularies for one dataset.

    Reads ``<data_dir>/train/label`` (one intent per line) and
    ``<data_dir>/train/seq.out`` (whitespace-separated slot tags per line),
    then writes:

    * ``<data_dir>/intent_label.txt``: ``UNK`` followed by the distinct
      intents in sorted order.
    * ``<data_dir>/slot_label.txt``: ``PAD`` and ``UNK`` followed by the
      distinct slot tags, ordered by the tag text after its first two
      characters and then by those first two characters.

    Args:
        data_dir: Directory containing the ``train`` sub-directory; the
            vocabulary files are written directly into it.

    Raises:
        OSError: If an input file cannot be read or an output file cannot
            be written (e.g. ``FileNotFoundError`` for a missing input).
    """
    slot_label_vocab = 'slot_label.txt'
    intent_label_vocab = 'intent_label.txt'

    train_dir = os.path.join(data_dir, 'train')

    # intent
    intent_vocab = sorted(_read_label_set(os.path.join(train_dir, 'label')))
    _write_vocab(os.path.join(data_dir, intent_label_vocab), ["UNK"], intent_vocab)

    # slot
    slot_vocab = sorted(
        _read_label_set(os.path.join(train_dir, 'seq.out'), split_tokens=True),
        # Sort on (rest of tag, first two chars): a one-character tag such
        # as "O" has an empty remainder and therefore sorts first, and tags
        # sharing a remainder are ordered by their two-character prefix.
        key=lambda x: (x[2:], x[:2]),
    )
    _write_vocab(os.path.join(data_dir, slot_label_vocab), ["PAD", "UNK"], slot_vocab)


if __name__ == "__main__":
    # Dataset directories are resolved relative to the current working directory.
    vocab_process('atis')
    vocab_process('snips')