DeepMoji / examples /tokenize_dataset.py
Pendrokar's picture
torchmoji code
86a83a2
raw
history blame
670 Bytes
"""
Take a given list of sentences and turn it into a numpy array, where each
number corresponds to a word. Padding (the number 0) is used so that every
sentence has the same fixed length.
"""
from __future__ import print_function, unicode_literals
import example_helper
import json
from torchmoji.sentence_tokenizer import SentenceTokenizer
# Load the vocabulary used by the tokenizer.
# NOTE: the path is relative to the current working directory, so the script
# has to be launched from a directory where '../model/vocabulary.json'
# resolves (presumably the examples/ directory -- confirm against repo layout).
with open('../model/vocabulary.json', 'r') as f:
    vocabulary = json.load(f)

# The second positional argument is presumably the fixed sentence length
# that shorter sentences are padded to -- TODO confirm against
# SentenceTokenizer's signature.
st = SentenceTokenizer(vocabulary, 30)

# Sample inputs: one made of punctuation, special characters and an emoji,
# one plain sentence, and one tweet-like sentence with a hashtag.
test_sentences = [
    '\u2014 -- \u203c !!\U0001F602',
    'Hello world!',
    'This is a sample tweet #example',
]

# tokenize_sentences returns three values; print each of them for inspection.
tokens, infos, stats = st.tokenize_sentences(test_sentences)

print(tokens)
print(infos)
print(stats)