DeepMoji / examples /vocab_extension.py
Pendrokar's picture
torchmoji code
86a83a2
raw
history blame
954 Bytes
"""
Extend the given vocabulary using dataset-specific words.
1. First create a vocabulary for the specific dataset.
2. Find all words not in our vocabulary, but in the dataset vocabulary.
3. Take top X (default=1000) of these words and add them to the vocabulary.
4. Save this combined vocabulary and embedding matrix, which can now be used.
"""
from __future__ import print_function, unicode_literals
import example_helper
import json
from torchmoji.create_vocab import extend_vocab, VocabBuilder
from torchmoji.word_generator import WordGenerator
# Toy "dataset" used to build the dataset-specific vocabulary:
# 'newword' occurs twice, '#zzzzaaazzz' occurs once.
new_words = ['#zzzzaaazzz', 'newword', 'newword']
word_gen = WordGenerator(new_words)
vb = VocabBuilder(word_gen)
vb.count_all_words()

# NOTE: the path is relative, so it is resolved against the current working
# directory -- run this script from the directory it lives in.
with open('../model/vocabulary.json') as f:
    vocab = json.load(f)

# Size of the base vocabulary before extension, and the counts collected
# from the toy dataset above.
print(len(vocab))
print(vb.word_counts)

# Allow at most one new token. The return value is ignored; the lookups
# below rely on extend_vocab updating `vocab` in place.
extend_vocab(vocab, vb, max_tokens=1)

# 'newword' should be the token that was added: it is the more frequent of
# the two candidate words in `new_words` (2 occurrences vs. 1), and only the
# top `max_tokens` words are taken. A KeyError here means it was not added.
print(vocab['newword'])
# Size of the vocabulary after extension, for comparison with the first print.
print(len(vocab))