Spaces:

Pendrokar
/

DeepMoji

Running

App Files Files Community

Pendrokar commited on Jan 20, 2024

Commit

86a83a2

1 Parent(s): 0a214bf

torchmoji code

Browse files

Files changed (41) hide show

.travis.yml +1 -1
LICENSE +21 -0
data/.gitkeep +1 -0
data/Olympic/raw.pickle +3 -0
data/PsychExp/raw.pickle +3 -0
data/SCv1/raw.pickle +3 -0
data/SCv2-GEN/raw.pickle +3 -0
data/SE0714/raw.pickle +3 -0
data/SS-Twitter/raw.pickle +3 -0
data/SS-Youtube/raw.pickle +3 -0
data/emoji_codes.json +67 -0
data/kaggle-insults/raw.pickle +3 -0
emoji_overview.png +0 -0
environment.yml +41 -0
examples/.gitkeep +1 -0
examples/README.md +39 -0
examples/__init__.py +0 -0
examples/create_twitter_vocab.py +13 -0
examples/dataset_split.py +59 -0
examples/encode_texts.py +41 -0
examples/example_helper.py +6 -0
examples/finetune_insults_chain-thaw.py +44 -0
examples/finetune_semeval_class-avg_f1.py +50 -0
examples/finetune_youtube_last.py +35 -0
examples/score_texts_emojis.py +85 -0
examples/text_emojize.py +63 -0
examples/tokenize_dataset.py +26 -0
examples/vocab_extension.py +30 -0
scripts/analyze_all_results.py +40 -0
scripts/analyze_results.py +39 -0
scripts/calculate_coverages.py +90 -0
scripts/convert_all_datasets.py +110 -0
scripts/download_weights.py +65 -0
scripts/finetune_dataset.py +109 -0
scripts/results/.gitkeep +1 -0
setup.py +16 -0
tests/test_finetuning.py +235 -0
tests/test_helper.py +6 -0
tests/test_sentence_tokenizer.py +113 -0
tests/test_tokenizer.py +167 -0
tests/test_word_generator.py +73 -0

.travis.yml CHANGED Viewed

@@ -24,4 +24,4 @@ script:
     - true  # pytest --capture=sys  # add other tests here
 notifications:
     on_success: change
-    on_failure: change  # `always` will be the setting once code changes slow down

     - true  # pytest --capture=sys  # add other tests here
 notifications:
     on_success: change
+    on_failure: change  # `always` will be the setting once code changes slow down

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2017 Bjarke Felbo, Han Thi Nguyen, Thomas Wolf
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@


1	+

data/Olympic/raw.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:398d394ac1d7c2116166ca968bae9b1f9fd049f9e9281f05c94ae7b2ea97d427
+size 227301

data/PsychExp/raw.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc7d710f2ccd7e9d8e620be703a446ce7ec05818d5ce6afe43d1e6aa9ff4a8aa
+size 3492229

data/SCv1/raw.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a65db490451dada57b88918a951d04082a51599d2cde24914f8c713312de89f5
+size 868931

data/SCv2-GEN/raw.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43ae3ea310130c2ca2089d60876ba6b08006d7f2e018a0519c4fdb7b166f992f
+size 883467

data/SE0714/raw.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66f0ecf48affe92bdacdeb64ab20c1c84b9990a3ac7b659a1a98aa29c9c4a064
+size 126311

data/SS-Twitter/raw.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ef34a4f0fe39b1bb45fcb72026bbf3b82ce2e2a14c13d39610b3b41f18fc98e
+size 413660

data/SS-Youtube/raw.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83ec15e393fb4f0dbb524946480de50e9baf9fef83a3e9eaf95caa3c425b87aa
+size 396130

data/emoji_codes.json ADDED Viewed

	@@ -0,0 +1,67 @@

+{
+  "0": ":joy:",
+  "1": ":unamused:",
+  "2": ":weary:",
+  "3": ":sob:",
+  "4": ":heart_eyes:",
+  "5": ":pensive:",
+  "6": ":ok_hand:",
+  "7": ":blush:",
+  "8": ":heart:",
+  "9": ":smirk:",
+  "10":":grin:",
+  "11":":notes:",
+  "12":":flushed:",
+  "13":":100:",
+  "14":":sleeping:",
+  "15":":relieved:",
+  "16":":relaxed:",
+  "17":":raised_hands:",
+  "18":":two_hearts:",
+  "19":":expressionless:",
+  "20":":sweat_smile:",
+  "21":":pray:",
+  "22":":confused:",
+  "23":":kissing_heart:",
+  "24":":hearts:",
+  "25":":neutral_face:",
+  "26":":information_desk_person:",
+  "27":":disappointed:",
+  "28":":see_no_evil:",
+  "29":":tired_face:",
+  "30":":v:",
+  "31":":sunglasses:",
+  "32":":rage:",
+  "33":":thumbsup:",
+  "34":":cry:",
+  "35":":sleepy:",
+  "36":":stuck_out_tongue_winking_eye:",
+  "37":":triumph:",
+  "38":":raised_hand:",
+  "39":":mask:",
+  "40":":clap:",
+  "41":":eyes:",
+  "42":":gun:",
+  "43":":persevere:",
+  "44":":imp:",
+  "45":":sweat:",
+  "46":":broken_heart:",
+  "47":":blue_heart:",
+  "48":":headphones:",
+  "49":":speak_no_evil:",
+  "50":":wink:",
+  "51":":skull:",
+  "52":":confounded:",
+  "53":":smile:",
+  "54":":stuck_out_tongue_winking_eye:",
+  "55":":angry:",
+  "56":":no_good:",
+  "57":":muscle:",
+  "58":":punch:",
+  "59":":purple_heart:",
+  "60":":sparkling_heart:",
+  "61":":blue_heart:",
+  "62":":grimacing:",
+  "63":":sparkles:"
+}

data/kaggle-insults/raw.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2fbeca5470209163e04b6975fc5fb91889e79583fe6ff499f83966e36392fcda
+size 1338159

emoji_overview.png ADDED Viewed

environment.yml ADDED Viewed

	@@ -0,0 +1,41 @@

+name: torchMoji
+channels:
+  - pytorch
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1
+  - blas=1.0
+  - ca-certificates=2019.11.27
+  - certifi=2019.11.28
+  - cffi=1.13.2
+  - cudatoolkit=10.1.243
+  - intel-openmp=2019.4
+  - libedit=3.1.20181209
+  - libffi=3.2.1
+  - libgcc-ng=9.1.0
+  - libgfortran-ng=7.3.0
+  - libstdcxx-ng=9.1.0
+  - mkl=2018.0.3
+  - ncurses=6.1
+  - ninja=1.9.0
+  - nose=1.3.7
+  - numpy=1.13.1
+  - openssl=1.1.1d
+  - pip=19.3.1
+  - pycparser=2.19
+  - python=3.6.9
+  - pytorch=1.3.1
+  - readline=7.0
+  - scikit-learn=0.19.0
+  - scipy=0.19.1
+  - setuptools=42.0.2
+  - sqlite=3.30.1
+  - text-unidecode=1.0
+  - tk=8.6.8
+  - wheel=0.33.6
+  - xz=5.2.4
+  - zlib=1.2.11
+  - pip:
+    - emoji==0.4.5
+prefix: /home/cbowdon/miniconda3/envs/torchMoji

examples/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@


1	+

examples/README.md ADDED Viewed

	@@ -0,0 +1,39 @@

+# torchMoji examples
+## Initialization
+[create_twitter_vocab.py](create_twitter_vocab.py)
+Create a new vocabulary from a tsv file.
+[tokenize_dataset.py](tokenize_dataset.py)
+Tokenize a given dataset using the prebuilt vocabulary.
+[vocab_extension.py](vocab_extension.py)
+Extend the given vocabulary using dataset-specific words.
+[dataset_split.py](dataset_split.py)
+Split a given dataset into training, validation and testing.
+## Use pretrained model/architecture
+[score_texts_emojis.py](score_texts_emojis.py)
+Use torchMoji to score texts for emoji distribution.
+[text_emojize.py](text_emojize.py)
+Use torchMoji to output emoji visualization from a single text input (mapped from `emoji_overview.png`)
+```sh
+python examples/text_emojize.py --text "I love mom's cooking\!"
+# => I love mom's cooking! 😋 😍 💓 💛 ❤
+```
+[encode_texts.py](encode_texts.py)
+Use torchMoji to encode the text into 2304-dimensional feature vectors for further modeling/analysis.
+## Transfer learning
+[finetune_youtube_last.py](finetune_youtube_last.py)
+Finetune the model on the SS-Youtube dataset using the 'last' method.
+[finetune_insults_chain-thaw.py](finetune_insults_chain-thaw.py)
+Finetune the model on the Kaggle insults dataset (from blog post) using the 'chain-thaw' method.
+[finetune_semeval_class-avg_f1.py](finetune_semeval_class-avg_f1.py)
+Finetune the model on the SemEval emotion dataset using the 'last' method and evaluate using the class average F1 metric.

examples/__init__.py ADDED Viewed

File without changes

examples/create_twitter_vocab.py ADDED Viewed

	@@ -0,0 +1,13 @@

+""" Creates a vocabulary from a tsv file.
+"""
+import codecs
+import example_helper
+from torchmoji.create_vocab import VocabBuilder
+from torchmoji.word_generator import TweetWordGenerator
+with codecs.open('../../twitterdata/tweets.2016-09-01', 'rU', 'utf-8') as stream:
+    wg = TweetWordGenerator(stream)
+    vb = VocabBuilder(wg)
+    vb.count_all_words()
+    vb.save_vocab()

examples/dataset_split.py ADDED Viewed

	@@ -0,0 +1,59 @@

+'''
+Split a given dataset into three different datasets: training, validation and
+testing.
+This is achieved by splitting the given list of sentences into three separate
+lists according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or by an
+explicit enumeration. The sentences are also tokenised using the given
+vocabulary.
+Also splits a given list of dictionaries containing information about
+each sentence.
+An additional parameter can be set 'extend_with', which will extend the given
+vocabulary with up to 'extend_with' tokens, taken from the training dataset.
+'''
+from __future__ import print_function, unicode_literals
+import example_helper
+import json
+from torchmoji.sentence_tokenizer import SentenceTokenizer
+DATASET = [
+    'I am sentence 0',
+    'I am sentence 1',
+    'I am sentence 2',
+    'I am sentence 3',
+    'I am sentence 4',
+    'I am sentence 5',
+    'I am sentence 6',
+    'I am sentence 7',
+    'I am sentence 8',
+    'I am sentence 9 newword',
+    ]
+INFO_DICTS = [
+    {'label': 'sentence 0'},
+    {'label': 'sentence 1'},
+    {'label': 'sentence 2'},
+    {'label': 'sentence 3'},
+    {'label': 'sentence 4'},
+    {'label': 'sentence 5'},
+    {'label': 'sentence 6'},
+    {'label': 'sentence 7'},
+    {'label': 'sentence 8'},
+    {'label': 'sentence 9'},
+    ]
+with open('../model/vocabulary.json', 'r') as f:
+    vocab = json.load(f)
+st = SentenceTokenizer(vocab, 30)
+# Split using the default split ratio
+print(st.split_train_val_test(DATASET, INFO_DICTS))
+# Split explicitly
+print(st.split_train_val_test(DATASET,
+                              INFO_DICTS,
+                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
+                              extend_with=1))

examples/encode_texts.py ADDED Viewed

	@@ -0,0 +1,41 @@

+# -*- coding: utf-8 -*-
+""" Use torchMoji to encode texts into emotional feature vectors.
+"""
+from __future__ import print_function, division, unicode_literals
+import json
+from torchmoji.sentence_tokenizer import SentenceTokenizer
+from torchmoji.model_def import torchmoji_feature_encoding
+from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
+TEST_SENTENCES = ['I love mom\'s cooking',
+                  'I love how you never reply back..',
+                  'I love cruising with my homies',
+                  'I love messing with yo mind!!',
+                  'I love you and now you\'re just gone..',
+                  'This is shit',
+                  'This is the shit']
+maxlen = 30
+batch_size = 32
+print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
+with open(VOCAB_PATH, 'r') as f:
+    vocabulary = json.load(f)
+st = SentenceTokenizer(vocabulary, maxlen)
+tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
+print('Loading model from {}.'.format(PRETRAINED_PATH))
+model = torchmoji_feature_encoding(PRETRAINED_PATH)
+print(model)
+print('Encoding texts..')
+encoding = model(tokenized)
+print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
+print(encoding[0,:5])
+# Now you could visualize the encodings to see differences,
+# run a logistic regression classifier on top,
+# or basically anything you'd like to do.

examples/example_helper.py ADDED Viewed

	@@ -0,0 +1,6 @@

+""" Module import helper.
+Modifies PATH in order to allow us to import the torchmoji directory.
+"""
+import sys
+from os.path import abspath, dirname
+sys.path.insert(0, dirname(dirname(abspath(__file__))))

examples/finetune_insults_chain-thaw.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""Finetuning example.
+Trains the torchMoji model on the kaggle insults dataset, using the 'chain-thaw'
+finetuning method and the accuracy metric. See the blog post at
+https://medium.com/@bjarkefelbo/what-can-we-learn-from-emojis-6beb165a5ea0
+for more information. Note that results may differ a bit due to slight
+changes in preprocessing and train/val/test split.
+The 'chain-thaw' method does the following:
+0) Load all weights except for the softmax layer. Extend the embedding layer if
+   necessary, initialising the new weights with random values.
+1) Freeze every layer except the last (softmax) layer and train it.
+2) Freeze every layer except the first layer and train it.
+3) Freeze every layer except the second etc., until the second last layer.
+4) Unfreeze all layers and train entire model.
+"""
+from __future__ import print_function
+import example_helper
+import json
+from torchmoji.model_def import torchmoji_transfer
+from torchmoji.global_variables import PRETRAINED_PATH
+from torchmoji.finetuning import (
+     load_benchmark,
+     finetune)
+DATASET_PATH = '../data/kaggle-insults/raw.pickle'
+nb_classes = 2
+with open('../model/vocabulary.json', 'r') as f:
+    vocab = json.load(f)
+# Load dataset. Extend the existing vocabulary with up to 10000 tokens from
+# the training dataset.
+data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
+# Set up model and finetune. Note that we have to extend the embedding layer
+# with the number of tokens added to the vocabulary.
+model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added'])
+print(model)
+model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
+                      data['batch_size'], method='chain-thaw')
+print('Acc: {}'.format(acc))

examples/finetune_semeval_class-avg_f1.py ADDED Viewed

	@@ -0,0 +1,50 @@

+"""Finetuning example.
+Trains the torchMoji model on the SemEval emotion dataset, using the 'last'
+finetuning method and the class average F1 metric.
+The 'last' method does the following:
+0) Load all weights except for the softmax layer. Do not add tokens to the
+   vocabulary and do not extend the embedding layer.
+1) Freeze all layers except for the softmax layer.
+2) Train.
+The class average F1 metric does the following:
+1) For each class, relabel the dataset into binary classification
+   (belongs to/does not belong to this class).
+2) Calculate F1 score for each class.
+3) Compute the average of all F1 scores.
+"""
+from __future__ import print_function
+import example_helper
+import json
+from torchmoji.finetuning import load_benchmark
+from torchmoji.class_avg_finetuning import class_avg_finetune
+from torchmoji.model_def import torchmoji_transfer
+from torchmoji.global_variables import PRETRAINED_PATH
+DATASET_PATH = '../data/SE0714/raw.pickle'
+nb_classes = 3
+with open('../model/vocabulary.json', 'r') as f:
+    vocab = json.load(f)
+# Load dataset. Extend the existing vocabulary with up to 10000 tokens from
+# the training dataset.
+data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
+# Set up model and finetune. Note that we have to extend the embedding layer
+# with the number of tokens added to the vocabulary.
+#
+# Also note that when using class average F1 to evaluate, the model has to be
+# defined with two classes, since the model will be trained for each class
+# separately.
+model = torchmoji_transfer(2, PRETRAINED_PATH, extend_embedding=data['added'])
+print(model)
+# For finetuning however, pass in the actual number of classes.
+model, f1 = class_avg_finetune(model, data['texts'], data['labels'],
+                                nb_classes, data['batch_size'], method='last')
+print('F1: {}'.format(f1))

examples/finetune_youtube_last.py ADDED Viewed

	@@ -0,0 +1,35 @@

+"""Finetuning example.
+Trains the torchMoji model on the SS-Youtube dataset, using the 'last'
+finetuning method and the accuracy metric.
+The 'last' method does the following:
+0) Load all weights except for the softmax layer. Do not add tokens to the
+   vocabulary and do not extend the embedding layer.
+1) Freeze all layers except for the softmax layer.
+2) Train.
+"""
+from __future__ import print_function
+import example_helper
+import json
+from torchmoji.model_def import torchmoji_transfer
+from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH, ROOT_PATH
+from torchmoji.finetuning import (
+     load_benchmark,
+     finetune)
+DATASET_PATH = '{}/data/SS-Youtube/raw.pickle'.format(ROOT_PATH)
+nb_classes = 2
+with open(VOCAB_PATH, 'r') as f:
+    vocab = json.load(f)
+# Load dataset.
+data = load_benchmark(DATASET_PATH, vocab)
+# Set up model and finetune
+model = torchmoji_transfer(nb_classes, PRETRAINED_PATH)
+print(model)
+model, acc = finetune(model, data['texts'], data['labels'], nb_classes, data['batch_size'], method='last')
+print('Acc: {}'.format(acc))

examples/score_texts_emojis.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# -*- coding: utf-8 -*-
+""" Use torchMoji to score texts for emoji distribution.
+The resulting emoji ids (0-63) correspond to the mapping
+in emoji_overview.png file at the root of the torchMoji repo.
+Writes the result to a csv file.
+"""
+from __future__ import print_function, division, unicode_literals
+import sys
+from os.path import abspath, dirname
+import json
+import csv
+import numpy as np
+from torchmoji.sentence_tokenizer import SentenceTokenizer
+from torchmoji.model_def import torchmoji_emojis
+from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
+sys.path.insert(0, dirname(dirname(abspath(__file__))))
+OUTPUT_PATH = 'test_sentences.csv'
+TEST_SENTENCES = ['I love mom\'s cooking',
+                  'I love how you never reply back..',
+                  'I love cruising with my homies',
+                  'I love messing with yo mind!!',
+                  'I love you and now you\'re just gone..',
+                  'This is shit',
+                  'This is the shit']
+def top_elements(array, k):
+    ind = np.argpartition(array, -k)[-k:]
+    return ind[np.argsort(array[ind])][::-1]
+# Module-level setup: everything below runs at import time, so st and model
+# already exist when doImportableFunction() is called.
+# Second argument handed to SentenceTokenizer.
+maxlen = 30
+print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
+with open(VOCAB_PATH, 'r') as f:
+    vocabulary = json.load(f)
+st = SentenceTokenizer(vocabulary, maxlen)
+print('Loading model from {}.'.format(PRETRAINED_PATH))
+model = torchmoji_emojis(PRETRAINED_PATH)
+print(model)
+def doImportableFunction():
+    print('Running predictions.')
+    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
+    prob = model(tokenized)
+    for prob in [prob]:
+        # Find top emojis for each sentence. Emoji ids (0-63)
+        # correspond to the mapping in emoji_overview.png
+        # at the root of the torchMoji repo.
+        print('Writing results to {}'.format(OUTPUT_PATH))
+        scores = []
+        for i, t in enumerate(TEST_SENTENCES):
+            t_tokens = tokenized[i]
+            t_score = [t]
+            t_prob = prob[i]
+            ind_top = top_elements(t_prob, 5)
+            t_score.append(sum(t_prob[ind_top]))
+            t_score.extend(ind_top)
+            t_score.extend([t_prob[ind] for ind in ind_top])
+            scores.append(t_score)
+            print(t_score)
+        with open(OUTPUT_PATH, 'w') as csvfile:
+            writer = csv.writer(csvfile, delimiter=str(','), lineterminator='\n')
+            writer.writerow(['Text', 'Top5%',
+                            'Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4', 'Emoji_5',
+                            'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5'])
+            for i, row in enumerate(scores):
+                try:
+                    writer.writerow(row)
+                except:
+                    print("Exception at row {}!".format(i))
+    return

examples/text_emojize.py ADDED Viewed

	@@ -0,0 +1,63 @@

+# -*- coding: utf-8 -*-
+""" Use torchMoji to predict emojis from a single text input
+"""
+from __future__ import print_function, division, unicode_literals
+import example_helper
+import json
+import csv
+import argparse
+import numpy as np
+import emoji
+from torchmoji.sentence_tokenizer import SentenceTokenizer
+from torchmoji.model_def import torchmoji_emojis
+from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
+# Emoji map in emoji_overview.png
+# List of 64 emoji aliases, indexed by emoji id. The literal is one long
+# string split on single spaces, so every continuation line has to end with
+# exactly one space before the backslash.
+EMOJIS = ":joy: :unamused: :weary: :sob: :heart_eyes: \
+:pensive: :ok_hand: :blush: :heart: :smirk: \
+:grin: :notes: :flushed: :100: :sleeping: \
+:relieved: :relaxed: :raised_hands: :two_hearts: :expressionless: \
+:sweat_smile: :pray: :confused: :kissing_heart: :heartbeat: \
+:neutral_face: :information_desk_person: :disappointed: :see_no_evil: :tired_face: \
+:v: :sunglasses: :rage: :thumbsup: :cry: \
+:sleepy: :yum: :triumph: :hand: :mask: \
+:clap: :eyes: :gun: :persevere: :smiling_imp: \
+:sweat: :broken_heart: :yellow_heart: :musical_note: :speak_no_evil: \
+:wink: :skull: :confounded: :smile: :stuck_out_tongue_winking_eye: \
+:angry: :no_good: :muscle: :facepunch: :purple_heart: \
+:sparkling_heart: :blue_heart: :grimacing: :sparkles:".split(' ')
+def top_elements(array, k):
+    ind = np.argpartition(array, -k)[-k:]
+    return ind[np.argsort(array[ind])][::-1]
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
+    argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
+    args = argparser.parse_args()
+    # Tokenizing using dictionary
+    with open(VOCAB_PATH, 'r') as f:
+        vocabulary = json.load(f)
+    st = SentenceTokenizer(vocabulary, args.maxlen)
+    # Loading model
+    model = torchmoji_emojis(PRETRAINED_PATH)
+    # Running predictions
+    tokenized, _, _ = st.tokenize_sentences([args.text])
+    # Get sentence probability
+    prob = model(tokenized)[0]
+    # Top emoji id
+    emoji_ids = top_elements(prob, 5)
+    # map to emojis
+    emojis = map(lambda x: EMOJIS[x], emoji_ids)
+    print(emoji.emojize("{} {}".format(args.text,' '.join(emojis)), use_aliases=True))

examples/tokenize_dataset.py ADDED Viewed

	@@ -0,0 +1,26 @@

+"""
+Take a given list of sentences and turn it into a numpy array, where each
+number corresponds to a word. Padding is used (number 0) to ensure fixed length
+of sentences.
+"""
+from __future__ import print_function, unicode_literals
+import example_helper
+import json
+from torchmoji.sentence_tokenizer import SentenceTokenizer
+# The vocabulary path is relative to the current working directory.
+with open('../model/vocabulary.json', 'r') as f:
+    vocabulary = json.load(f)
+st = SentenceTokenizer(vocabulary, 30)
+# The first sentence mixes non-ASCII punctuation (U+2014, U+203C) and a
+# character outside the Basic Multilingual Plane (U+1F602) with ASCII text.
+test_sentences = [
+    '\u2014 -- \u203c !!\U0001F602',
+    'Hello world!',
+    'This is a sample tweet #example',
+    ]
+# tokenize_sentences returns three values, each printed below.
+tokens, infos, stats = st.tokenize_sentences(test_sentences)
+print(tokens)
+print(infos)
+print(stats)

examples/vocab_extension.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""
+Extend the given vocabulary using dataset-specific words.
+1. First create a vocabulary for the specific dataset.
+2. Find all words not in our vocabulary, but in the dataset vocabulary.
+3. Take top X (default=1000) of these words and add them to the vocabulary.
+4. Save this combined vocabulary and embedding matrix, which can now be used.
+"""
+from __future__ import print_function, unicode_literals
+import example_helper
+import json
+from torchmoji.create_vocab import extend_vocab, VocabBuilder
+from torchmoji.word_generator import WordGenerator
+# Toy dataset: 'newword' occurs twice, '#zzzzaaazzz' once.
+new_words = ['#zzzzaaazzz', 'newword', 'newword']
+word_gen = WordGenerator(new_words)
+vb = VocabBuilder(word_gen)
+vb.count_all_words()
+with open('../model/vocabulary.json') as f:
+    vocab = json.load(f)
+print(len(vocab))
+print(vb.word_counts)
+# At most one token may be added (max_tokens=1).
+extend_vocab(vocab, vb, max_tokens=1)
+# 'newword' is expected to be the token that was added, because it is the
+# more frequent of the two words in new_words.
+print(vocab['newword'])
+print(len(vocab))

scripts/analyze_all_results.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from __future__ import print_function
+# allow us to import the codebase directory
+import sys
+import glob
+import numpy as np
+from os.path import dirname, abspath
+sys.path.insert(0, dirname(dirname(abspath(__file__))))
+DATASETS = ['SE0714', 'Olympic', 'PsychExp', 'SS-Twitter', 'SS-Youtube',
+            'SCv1', 'SV2-GEN'] # 'SE1604' excluded due to Twitter's ToS
+def get_results(dset):
+    METHOD = 'last'
+    RESULTS_DIR = 'results/'
+    RESULT_PATHS = glob.glob('{}/{}_{}_*_results.txt'.format(RESULTS_DIR, dset, METHOD))
+    assert len(RESULT_PATHS)
+    scores = []
+    for path in RESULT_PATHS:
+        with open(path) as f:
+            score = f.readline().split(':')[1]
+        scores.append(float(score))
+    average = np.mean(scores)
+    maximum = max(scores)
+    minimum = min(scores)
+    std = np.std(scores)
+    print('Dataset: {}'.format(dset))
+    print('Method:  {}'.format(METHOD))
+    print('Number of results: {}'.format(len(scores)))
+    print('--------------------------')
+    print('Average: {}'.format(average))
+    print('Maximum: {}'.format(maximum))
+    print('Minimum: {}'.format(minimum))
+    print('Standard deviaton: {}'.format(std))
+# Print a summary for every dataset in turn. get_results asserts that result
+# files exist, so the first dataset without results aborts the run.
+for dset in DATASETS:
+    get_results(dset)

scripts/analyze_results.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from __future__ import print_function
+import sys
+import glob
+import numpy as np
+DATASET = 'SS-Twitter' # 'SE1604' excluded due to Twitter's ToS
+METHOD = 'new'
+# Optional usage: analyze_results.py <dataset> <method>
+if len(sys.argv) == 3:
+    DATASET = sys.argv[1]
+    METHOD = sys.argv[2]
+RESULTS_DIR = 'results/'
+RESULT_PATHS = glob.glob('{}/{}_{}_*_results.txt'.format(RESULTS_DIR, DATASET, METHOD))
+if not RESULT_PATHS:
+    print('Could not find results for \'{}\' using \'{}\' in directory \'{}\'.'.format(DATASET, METHOD, RESULTS_DIR))
+else:
+    scores = []
+    for path in RESULT_PATHS:
+        with open(path) as f:
+            score = f.readline().split(':')[1]
+        scores.append(float(score))
+    average = np.mean(scores)
+    maximum = max(scores)
+    minimum = min(scores)
+    std = np.std(scores)
+    print('Dataset: {}'.format(DATASET))
+    print('Method:  {}'.format(METHOD))
+    print('Number of results: {}'.format(len(scores)))
+    print('--------------------------')
+    print('Average: {}'.format(average))
+    print('Maximum: {}'.format(maximum))
+    print('Minimum: {}'.format(minimum))
+    print('Standard deviaton: {}'.format(std))

scripts/calculate_coverages.py ADDED Viewed

	@@ -0,0 +1,90 @@

+from __future__ import print_function
+import pickle
+import json
+import csv
+import sys
+from io import open
+# Allow us to import the torchmoji directory
+from os.path import dirname, abspath
+sys.path.insert(0, dirname(dirname(abspath(__file__))))
+from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage
+# Make the name `unicode` usable on both major Python versions.
+try:
+    unicode        # Python 2
+except NameError:
+    unicode = str  # Python 3
+# True when the interpreter's version string starts with "2".
+IS_PYTHON2 = int(sys.version[0]) == 2
+# Tab-separated report, written relative to the current working directory.
+OUTPUT_PATH = 'coverage.csv'
+# Benchmark pickles to evaluate, relative to the current working directory.
+DATASET_PATHS = [
+    '../data/Olympic/raw.pickle',
+    '../data/PsychExp/raw.pickle',
+    '../data/SCv1/raw.pickle',
+    '../data/SCv2-GEN/raw.pickle',
+    '../data/SE0714/raw.pickle',
+    #'../data/SE1604/raw.pickle', # Excluded due to Twitter's ToS
+    '../data/SS-Twitter/raw.pickle',
+    '../data/SS-Youtube/raw.pickle',
+    ]
+with open('../model/vocabulary.json', 'r') as f:
+    vocab = json.load(f)
+results = []
+for p in DATASET_PATHS:
+    coverage_result = [p]
+    print('Calculating coverage for {}'.format(p))
+    with open(p, 'rb') as f:
+        if IS_PYTHON2:
+            s = pickle.load(f)
+        else:
+            s = pickle.load(f, fix_imports=True)
+    # Decode data
+    try:
+        s['texts'] = [unicode(x) for x in s['texts']]
+    except UnicodeDecodeError:
+        s['texts'] = [x.decode('utf-8') for x in s['texts']]
+    # Own
+    st = SentenceTokenizer({}, 30)
+    tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
+                                              [s['train_ind'],
+                                               s['val_ind'],
+                                               s['test_ind']],
+                                              extend_with=10000)
+    coverage_result.append(coverage(tests[2]))
+    # Last
+    st = SentenceTokenizer(vocab, 30)
+    tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
+                                              [s['train_ind'],
+                                               s['val_ind'],
+                                               s['test_ind']],
+                                              extend_with=0)
+    coverage_result.append(coverage(tests[2]))
+    # Full
+    st = SentenceTokenizer(vocab, 30)
+    tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
+                                              [s['train_ind'],
+                                               s['val_ind'],
+                                               s['test_ind']],
+                                              extend_with=10000)
+    coverage_result.append(coverage(tests[2]))
+    results.append(coverage_result)
+with open(OUTPUT_PATH, 'wb') as csvfile:
+    writer = csv.writer(csvfile, delimiter='\t', lineterminator='\n')
+    writer.writerow(['Dataset', 'Own', 'Last', 'Full'])
+    for i, row in enumerate(results):
+        try:
+            writer.writerow(row)
+        except:
+            print("Exception at row {}!".format(i))
+print('Saved to {}'.format(OUTPUT_PATH))

scripts/convert_all_datasets.py ADDED Viewed

	@@ -0,0 +1,110 @@

+from __future__ import print_function
+import json
+import math
+import pickle
+import sys
+from io import open
+import numpy as np
+from os.path import abspath, dirname
+sys.path.insert(0, dirname(dirname(abspath(__file__))))
+from torchmoji.word_generator import WordGenerator
+from torchmoji.create_vocab import VocabBuilder
+from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
+from torchmoji.tokenizer import tokenize
+# Make the name `unicode` usable on both major Python versions.
+try:
+    unicode        # Python 2
+except NameError:
+    unicode = str  # Python 3
+# True when the interpreter's version string starts with "2".
+IS_PYTHON2 = int(sys.version[0]) == 2
+# Names of the dataset sub-directories below DIR that get converted.
+DATASETS = [
+    'Olympic',
+    'PsychExp',
+    'SCv1',
+    'SCv2-GEN',
+    'SE0714',
+    #'SE1604', # Excluded due to Twitter's ToS
+    'SS-Twitter',
+    'SS-Youtube',
+    ]
+# Data directory, relative to the current working directory.
+DIR = '../data'
+# Input file name, followed by the three output file names written per
+# dataset (one for each vocabulary configuration).
+FILENAME_RAW = 'raw.pickle'
+FILENAME_OWN = 'own_vocab.pickle'
+FILENAME_OUR = 'twitter_vocab.pickle'
+FILENAME_COMBINED = 'combined_vocab.pickle'
+def roundup(x):
+    return int(math.ceil(x / 10.0)) * 10
+def format_pickle(dset, train_texts, val_texts, test_texts, train_labels, val_labels, test_labels):
+    return {'dataset': dset,
+            'train_texts': train_texts,
+            'val_texts': val_texts,
+            'test_texts': test_texts,
+            'train_labels': train_labels,
+            'val_labels': val_labels,
+            'test_labels': test_labels}
+def convert_dataset(filepath, extend_with, vocab):
+    print('-- Generating {} '.format(filepath))
+    sys.stdout.flush()
+    st = SentenceTokenizer(vocab, maxlen)
+    tokenized, dicts, _ = st.split_train_val_test(texts,
+                                                  labels,
+                                                  [data['train_ind'],
+                                                   data['val_ind'],
+                                                   data['test_ind']],
+                                                  extend_with=extend_with)
+    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
+                        dicts[0], dicts[1], dicts[2])
+    with open(filepath, 'w') as f:
+        pickle.dump(pick, f)
+    cover = coverage(tokenized[2])
+    print('     done. Coverage: {}'.format(cover))
+with open('../model/vocabulary.json', 'r') as f:
+    vocab = json.load(f)
+for dset in DATASETS:
+    print('Converting {}'.format(dset))
+    PATH_RAW = '{}/{}/{}'.format(DIR, dset, FILENAME_RAW)
+    PATH_OWN = '{}/{}/{}'.format(DIR, dset, FILENAME_OWN)
+    PATH_OUR = '{}/{}/{}'.format(DIR, dset, FILENAME_OUR)
+    PATH_COMBINED = '{}/{}/{}'.format(DIR, dset, FILENAME_COMBINED)
+    with open(PATH_RAW, 'rb') as dataset:
+        if IS_PYTHON2:
+            data = pickle.load(dataset)
+        else:
+            data = pickle.load(dataset, fix_imports=True)
+    # Decode data
+    try:
+        texts = [unicode(x) for x in data['texts']]
+    except UnicodeDecodeError:
+        texts = [x.decode('utf-8') for x in data['texts']]
+    wg = WordGenerator(texts)
+    vb = VocabBuilder(wg)
+    vb.count_all_words()
+    # Calculate max length of sequences considered
+    # Adjust batch_size accordingly to prevent GPU overflow
+    lengths = [len(tokenize(t)) for t in texts]
+    maxlen = roundup(np.percentile(lengths, 80.0))
+    # Extract labels
+    labels = [x['label'] for x in data['info']]
+    convert_dataset(PATH_OWN, 50000, {})
+    convert_dataset(PATH_OUR, 0, vocab)
+    convert_dataset(PATH_COMBINED, 10000, vocab)

scripts/download_weights.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from __future__ import print_function
+import os
+from subprocess import call
+from builtins import input
+# Name of the directory the script is started from (last path component).
+curr_folder = os.path.basename(os.path.normpath(os.getcwd()))
+weights_filename = 'pytorch_model.bin'
+weights_folder = 'model'
+weights_path = '{}/{}'.format(weights_folder, weights_filename)
+# When started from inside scripts/, the model folder is one level up.
+if curr_folder == 'scripts':
+    weights_path = '../' + weights_path
+weights_download_link = 'https://www.dropbox.com/s/q8lax9ary32c7t9/pytorch_model.bin?dl=0#'
+# Number of bytes in one megabyte (2**20), as a float.
+MB_FACTOR = float(1<<20)
+def prompt():
+    while True:
+        valid = {
+            'y': True,
+            'ye': True,
+            'yes': True,
+            'n': False,
+            'no': False,
+        }
+        choice = input().lower()
+        if choice in valid:
+            return valid[choice]
+        else:
+            print('Please respond with \'y\' or \'n\' (or \'yes\' or \'no\')')
+download = True
+if os.path.exists(weights_path):
+    print('Weight file already exists at {}. Would you like to redownload it anyway? [y/n]'.format(weights_path))
+    download = prompt()
+    already_exists = True
+else:
+    already_exists = False
+if download:
+    print('About to download the pretrained weights file from {}'.format(weights_download_link))
+    if already_exists == False:
+        print('The size of the file is roughly 85MB. Continue? [y/n]')
+    else:
+        os.unlink(weights_path)
+    if already_exists or prompt():
+        print('Downloading...')
+        #urllib.urlretrieve(weights_download_link, weights_path)
+        #with open(weights_path,'wb') as f:
+        #    f.write(requests.get(weights_download_link).content)
+        # downloading using wget due to issues with urlretrieve and requests
+        sys_call = 'wget {} -O {}'.format(weights_download_link, os.path.abspath(weights_path))
+        print("Running system call: {}".format(sys_call))
+        call(sys_call, shell=True)
+        if os.path.getsize(weights_path) / MB_FACTOR < 80:
+            raise ValueError("Download finished, but the resulting file is too small! " +
+                             "It\'s only {} bytes.".format(os.path.getsize(weights_path)))
+        print('Downloaded weights to {}'.format(weights_path))
+else:
+    print('Exiting.')

scripts/finetune_dataset.py ADDED Viewed

	@@ -0,0 +1,109 @@

+""" Finetuning example.
+"""
+from __future__ import print_function
+import sys
+import numpy as np
+from os.path import abspath, dirname
+sys.path.insert(0, dirname(dirname(abspath(__file__))))
+import json
+import math
+from torchmoji.model_def import torchmoji_transfer
+from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
+from torchmoji.finetuning import (
+     load_benchmark,
+     finetune)
+from torchmoji.class_avg_finetuning import class_avg_finetune
+def roundup(x):
+    return int(math.ceil(x / 10.0)) * 10
+# Format: (dataset_name,
+#          path_to_dataset,
+#          nb_classes,
+#          use_f1_score)
+# Only the uncommented entries are finetuned; dataset paths are relative
+# to the current working directory.
+DATASETS = [
+     #('SE0714', '../data/SE0714/raw.pickle', 3, True),
+     #('Olympic', '../data/Olympic/raw.pickle', 4, True),
+     #('PsychExp', '../data/PsychExp/raw.pickle', 7, True),
+     #('SS-Twitter', '../data/SS-Twitter/raw.pickle', 2, False),
+     ('SS-Youtube', '../data/SS-Youtube/raw.pickle', 2, False),
+     #('SE1604', '../data/SE1604/raw.pickle', 3, False), # Excluded due to Twitter's ToS
+     #('SCv1', '../data/SCv1/raw.pickle', 2, True),
+     #('SCv2-GEN', '../data/SCv2-GEN/raw.pickle', 2, True)
+      ]
+# Directory the per-run result files are written to (relative path).
+RESULTS_DIR = 'results'
+# 'new' | 'last' | 'full' | 'chain-thaw'
+FINETUNE_METHOD = 'last'
+VERBOSE = 1
+# Expected vocabulary size; asserted against the loaded vocabulary below.
+nb_tokens = 50000
+# NOTE(review): nb_epochs and epoch_size are not referenced by the finetuning
+# loop in this script - confirm whether they are still needed.
+nb_epochs = 1000
+epoch_size = 1000
+with open(VOCAB_PATH, 'r') as f:
+    vocab = json.load(f)
+for rerun_iter in range(5):
+    for p in DATASETS:
+        # debugging
+        assert len(vocab) == nb_tokens
+        dset = p[0]
+        path = p[1]
+        nb_classes = p[2]
+        use_f1_score = p[3]
+        if FINETUNE_METHOD == 'last':
+            extend_with = 0
+        elif FINETUNE_METHOD in ['new', 'full', 'chain-thaw']:
+            extend_with = 10000
+        else:
+            raise ValueError('Finetuning method not recognised!')
+        # Load dataset.
+        data = load_benchmark(path, vocab, extend_with=extend_with)
+        (X_train, y_train) = (data['texts'][0], data['labels'][0])
+        (X_val, y_val) = (data['texts'][1], data['labels'][1])
+        (X_test, y_test) = (data['texts'][2], data['labels'][2])
+        weight_path = PRETRAINED_PATH if FINETUNE_METHOD != 'new' else None
+        nb_model_classes = 2 if use_f1_score else nb_classes
+        model = torchmoji_transfer(
+                    nb_model_classes,
+                    weight_path,
+                    extend_embedding=data['added'])
+        print(model)
+        # Training
+        print('Training: {}'.format(path))
+        if use_f1_score:
+            model, result = class_avg_finetune(model, data['texts'],
+                                               data['labels'],
+                                               nb_classes, data['batch_size'],
+                                               FINETUNE_METHOD,
+                                               verbose=VERBOSE)
+        else:
+            model, result = finetune(model, data['texts'], data['labels'],
+                                     nb_classes, data['batch_size'],
+                                     FINETUNE_METHOD, metric='acc',
+                                     verbose=VERBOSE)
+        # Write results
+        if use_f1_score:
+            print('Overall F1 score (dset = {}): {}'.format(dset, result))
+            with open('{}/{}_{}_{}_results.txt'.
+                      format(RESULTS_DIR, dset, FINETUNE_METHOD, rerun_iter),
+                      "w") as f:
+                f.write("F1: {}\n".format(result))
+        else:
+            print('Test accuracy (dset = {}): {}'.format(dset, result))
+            with open('{}/{}_{}_{}_results.txt'.
+                      format(RESULTS_DIR, dset, FINETUNE_METHOD, rerun_iter),
+                      "w") as f:
+                f.write("Acc: {}\n".format(result))

scripts/results/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@


1	+

setup.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from setuptools import setup
+# Package metadata for the torchmoji distribution.
+setup(
+    name='torchmoji',
+    version='1.0',
+    packages=['torchmoji'],
+    description='torchMoji',
+    include_package_data=True,
+    # Runtime dependencies, pinned to exact versions.
+    install_requires=[
+        'emoji==0.4.5',
+        'numpy==1.13.1',
+        'scipy==0.19.1',
+        'scikit-learn==0.19.0',
+        'text-unidecode==1.0',
+    ],
+)

tests/test_finetuning.py ADDED Viewed

	@@ -0,0 +1,235 @@

+from __future__ import absolute_import, print_function, division, unicode_literals
+import test_helper
+from nose.plugins.attrib import attr
+import json
+import numpy as np
+from torchmoji.class_avg_finetuning import relabel
+from torchmoji.sentence_tokenizer import SentenceTokenizer
+from torchmoji.finetuning import (
+    calculate_batchsize_maxlen,
+    freeze_layers,
+    change_trainable,
+    finetune,
+    load_benchmark
+    )
+from torchmoji.model_def import (
+    torchmoji_transfer,
+    torchmoji_feature_encoding,
+    torchmoji_emojis
+    )
+from torchmoji.global_variables import (
+    PRETRAINED_PATH,
+    NB_TOKENS,
+    VOCAB_PATH,
+    ROOT_PATH
+    )
+def test_calculate_batchsize_maxlen():
+    """ Batch size and max length are calculated properly.
+    """
+    texts = ['a b c d',
+             'e f g h i']
+    batch_size, maxlen = calculate_batchsize_maxlen(texts)
+    assert batch_size == 250
+    assert maxlen == 10, maxlen
+def test_freeze_layers():
+    """ Correct layers are frozen.
+    """
+    model = torchmoji_transfer(5)
+    keyword = 'output_layer'
+    model = freeze_layers(model, unfrozen_keyword=keyword)
+    for name, module in model.named_children():
+        trainable = keyword.lower() in name.lower()
+        assert all(p.requires_grad == trainable for p in module.parameters())
+def test_change_trainable():
+    """ change_trainable() changes trainability of layers.
+    """
+    model = torchmoji_transfer(5)
+    change_trainable(model.embed, False)
+    assert not any(p.requires_grad for p in model.embed.parameters())
+    change_trainable(model.embed, True)
+    assert all(p.requires_grad for p in model.embed.parameters())
+def test_torchmoji_transfer_extend_embedding():
+    """ Defining torchmoji with extension.
+    """
+    extend_with = 50
+    model = torchmoji_transfer(5, weight_path=PRETRAINED_PATH,
+                              extend_embedding=extend_with)
+    embedding_layer = model.embed
+    assert embedding_layer.weight.size()[0] == NB_TOKENS + extend_with
+def test_torchmoji_return_attention():
+    seq_tensor = np.array([[1]])
+    # test the output of the normal model
+    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
+    # check correct number of outputs
+    assert len(model(seq_tensor)) == 1
+    # repeat above described tests when returning attention weights
+    model = torchmoji_emojis(weight_path=PRETRAINED_PATH, return_attention=True)
+    assert len(model(seq_tensor)) == 2
+def test_relabel():
+    """ relabel() works with multi-class labels.
+    """
+    nb_classes = 3
+    inputs = np.array([
+        [True, False, False],
+        [False, True, False],
+        [True, False, True],
+    ])
+    expected_0 = np.array([True, False, True])
+    expected_1 = np.array([False, True, False])
+    expected_2 = np.array([False, False, True])
+    assert np.array_equal(relabel(inputs, 0, nb_classes), expected_0)
+    assert np.array_equal(relabel(inputs, 1, nb_classes), expected_1)
+    assert np.array_equal(relabel(inputs, 2, nb_classes), expected_2)
+def test_relabel_binary():
+    """ relabel() works with binary classification (no changes to labels)
+    """
+    nb_classes = 2
+    inputs = np.array([True, False, False])
+    assert np.array_equal(relabel(inputs, 0, nb_classes), inputs)
+@attr('slow')
+def test_finetune_full():
+    """ finetuning using 'full'.
+    """
+    DATASET_PATH = ROOT_PATH+'/data/SS-Youtube/raw.pickle'
+    nb_classes = 2
+    # Keras and pyTorch implementation of the Adam optimizer are slightly different and change a bit the results
+    # We reduce the min accuracy needed here to pass the test
+    # See e.g. https://discuss.pytorch.org/t/suboptimal-convergence-when-compared-with-tensorflow-model/5099/11
+    min_acc = 0.68
+    with open(VOCAB_PATH, 'r') as f:
+        vocab = json.load(f)
+    data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
+    print('Loading pyTorch model from {}.'.format(PRETRAINED_PATH))
+    model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added'])
+    print(model)
+    model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
+                          data['batch_size'], method='full', nb_epochs=1)
+    print("Finetune full SS-Youtube 1 epoch acc: {}".format(acc))
+    assert acc >= min_acc
+@attr('slow')
+def test_finetune_last():
+    """ finetuning using 'last'.
+    """
+    dataset_path = ROOT_PATH + '/data/SS-Youtube/raw.pickle'
+    nb_classes = 2
+    min_acc = 0.68
+    with open(VOCAB_PATH, 'r') as f:
+        vocab = json.load(f)
+    data = load_benchmark(dataset_path, vocab)
+    print('Loading model from {}.'.format(PRETRAINED_PATH))
+    model = torchmoji_transfer(nb_classes, PRETRAINED_PATH)
+    print(model)
+    model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
+                          data['batch_size'], method='last', nb_epochs=1)
+    print("Finetune last SS-Youtube 1 epoch acc: {}".format(acc))
+    assert acc >= min_acc
+def test_score_emoji():
+    """ Emoji predictions make sense.
+    """
+    test_sentences = [
+        'I love mom\'s cooking',
+        'I love how you never reply back..',
+        'I love cruising with my homies',
+        'I love messing with yo mind!!',
+        'I love you and now you\'re just gone..',
+        'This is shit',
+        'This is the shit'
+    ]
+    expected = [
+        np.array([36,  4,  8, 16, 47]),
+        np.array([1, 19, 55, 25, 46]),
+        np.array([31,  6, 30, 15, 13]),
+        np.array([54, 44,  9, 50, 49]),
+        np.array([46,  5, 27, 35, 34]),
+        np.array([55, 32, 27,  1, 37]),
+        np.array([48, 11,  6, 31,  9])
+    ]
+    def top_elements(array, k):
+        ind = np.argpartition(array, -k)[-k:]
+        return ind[np.argsort(array[ind])][::-1]
+    # Initialize by loading dictionary and tokenize texts
+    with open(VOCAB_PATH, 'r') as f:
+        vocabulary = json.load(f)
+    st = SentenceTokenizer(vocabulary, 30)
+    tokens, _, _ = st.tokenize_sentences(test_sentences)
+    # Load model and run
+    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
+    prob = model(tokens)
+    # Find top emojis for each sentence
+    for i, t_prob in enumerate(list(prob)):
+        assert np.array_equal(top_elements(t_prob, 5), expected[i])
+def test_encode_texts():
+    """ Text encoding is stable.
+    """
+    TEST_SENTENCES = ['I love mom\'s cooking',
+                      'I love how you never reply back..',
+                      'I love cruising with my homies',
+                      'I love messing with yo mind!!',
+                      'I love you and now you\'re just gone..',
+                      'This is shit',
+                      'This is the shit']
+    maxlen = 30
+    batch_size = 32
+    with open(VOCAB_PATH, 'r') as f:
+        vocabulary = json.load(f)
+    st = SentenceTokenizer(vocabulary, maxlen)
+    print('Loading model from {}.'.format(PRETRAINED_PATH))
+    model = torchmoji_feature_encoding(PRETRAINED_PATH)
+    print(model)
+    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
+    encoding = model(tokenized)
+    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
+    assert np.allclose(avg_across_sentences, np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
+test_encode_texts()

tests/test_helper.py ADDED Viewed

	@@ -0,0 +1,6 @@

+""" Module import helper.
+Modifies sys.path in order to allow us to import the torchmoji directory.
+"""
+import sys
+from os.path import abspath, dirname
+sys.path.insert(0, dirname(dirname(abspath(__file__))))

tests/test_sentence_tokenizer.py ADDED Viewed

	@@ -0,0 +1,113 @@

+from __future__ import absolute_import, print_function, division, unicode_literals
+import test_helper
+import json
+from torchmoji.sentence_tokenizer import SentenceTokenizer
+# Ten one-letter sentences and one info dict per sentence (same order).
+sentences = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+dicts = [
+    {'label': 0},
+    {'label': 1},
+    {'label': 2},
+    {'label': 3},
+    {'label': 4},
+    {'label': 5},
+    {'label': 6},
+    {'label': 7},
+    {'label': 8},
+    {'label': 9},
+    ]
+# Explicit index split used by test_dataset_split_explicit; together the
+# three lists cover every index 0-9 exactly once.
+train_ind = [0, 5, 3, 6, 8]
+val_ind = [9, 2, 1]
+test_ind = [4, 7]
+with open('../model/vocabulary.json', 'r') as f:
+    vocab = json.load(f)
+def test_dataset_split_parameter():
+    """ Dataset is split in the desired ratios
+    """
+    split_parameter = [0.7, 0.1, 0.2]
+    st = SentenceTokenizer(vocab, 30)
+    result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
+                                               split_parameter, extend_with=0)
+    train = result[0]
+    val = result[1]
+    test = result[2]
+    train_dicts = result_dicts[0]
+    val_dicts = result_dicts[1]
+    test_dicts = result_dicts[2]
+    assert len(train) == len(sentences) * split_parameter[0]
+    assert len(val) == len(sentences) * split_parameter[1]
+    assert len(test) == len(sentences) * split_parameter[2]
+    assert len(train_dicts) == len(dicts) * split_parameter[0]
+    assert len(val_dicts) == len(dicts) * split_parameter[1]
+    assert len(test_dicts) == len(dicts) * split_parameter[2]
+def test_dataset_split_explicit():
+    """ Dataset is split according to given indices
+    """
+    split_parameter = [train_ind, val_ind, test_ind]
+    st = SentenceTokenizer(vocab, 30)
+    tokenized, _, _ = st.tokenize_sentences(sentences)
+    result, result_dicts, added = st.split_train_val_test(sentences, dicts, split_parameter, extend_with=0)
+    train = result[0]
+    val = result[1]
+    test = result[2]
+    train_dicts = result_dicts[0]
+    val_dicts = result_dicts[1]
+    test_dicts = result_dicts[2]
+    tokenized = tokenized
+    for i, sentence in enumerate(sentences):
+        if i in train_ind:
+            assert tokenized[i] in train
+            assert dicts[i] in train_dicts
+        elif i in val_ind:
+            assert tokenized[i] in val
+            assert dicts[i] in val_dicts
+        elif i in test_ind:
+            assert tokenized[i] in test
+            assert dicts[i] in test_dicts
+    assert len(train) == len(train_ind)
+    assert len(val) == len(val_ind)
+    assert len(test) == len(test_ind)
+    assert len(train_dicts) == len(train_ind)
+    assert len(val_dicts) == len(val_ind)
+    assert len(test_dicts) == len(test_ind)
+def test_id_to_sentence():
+    """Tokenizing and converting back preserves the input.
+    """
+    vb = {'CUSTOM_MASK': 0,
+          'aasdf': 1000,
+          'basdf': 2000}
+    sentence = 'aasdf basdf basdf basdf'
+    st = SentenceTokenizer(vb, 30)
+    token, _, _ = st.tokenize_sentences([sentence])
+    assert st.to_sentence(token[0]) == sentence
+def test_id_to_sentence_with_unknown():
+    """Tokenizing and converting back preserves the input, except for unknowns.
+    """
+    vb = {'CUSTOM_MASK': 0,
+          'CUSTOM_UNKNOWN': 1,
+          'aasdf': 1000,
+          'basdf': 2000}
+    sentence = 'aasdf basdf ccc'
+    expected = 'aasdf basdf CUSTOM_UNKNOWN'
+    st = SentenceTokenizer(vb, 30)
+    token, _, _ = st.tokenize_sentences([sentence])
+    assert st.to_sentence(token[0]) == expected

tests/test_tokenizer.py ADDED Viewed

	@@ -0,0 +1,167 @@

+# -*- coding: utf-8 -*-
+""" Tokenization tests.
+"""
+from __future__ import absolute_import, print_function, division, unicode_literals
+import sys
+from nose.tools import nottest
+from os.path import dirname, abspath
+sys.path.append(dirname(dirname(abspath(__file__))))
+from torchmoji.tokenizer import tokenize
+# Every table below is a list of (input text, expected token list) pairs
+# that test_base() feeds through tokenize().
+TESTS_NORMAL = [
+    ('200K words!', ['200', 'K', 'words', '!']),
+]
+# Emojis, emoticons and decorative symbols.
+TESTS_EMOJIS = [
+    ('i \U0001f496 you to the moon and back',
+     ['i', '\U0001f496', 'you', 'to', 'the', 'moon', 'and', 'back']),
+    ("i\U0001f496you to the \u2605's and back",
+     ['i', '\U0001f496', 'you', 'to', 'the',
+      '\u2605', "'", 's', 'and', 'back']),
+    ('~<3~', ['~', '<3', '~']),
+    ('<333', ['<333']),
+    (':-)', [':-)']),
+    ('>:-(', ['>:-(']),
+    ('\u266b\u266a\u2605\u2606\u2665\u2764\u2661',
+     ['\u266b', '\u266a', '\u2605', '\u2606',
+      '\u2665', '\u2764', '\u2661']),
+]
+# URLs are expected to stay a single token.
+TESTS_URLS = [
+    ('www.sample.com', ['www.sample.com']),
+    ('http://endless.horse', ['http://endless.horse']),
+    ('https://github.mit.ed', ['https://github.mit.ed']),
+]
+# Hashtags, mentions and e-mail addresses.
+TESTS_TWITTER = [
+    ('#blacklivesmatter', ['#blacklivesmatter']),
+    ('#99_percent.', ['#99_percent', '.']),
+    ('the#99%', ['the', '#99', '%']),
+    ('@golden_zenith', ['@golden_zenith']),
+    ('@99_percent', ['@99_percent']),
+    ('latte-express@mit.ed', ['latte-express@mit.ed']),
+]
+# Phone numbers are expected to split into digit groups and separators.
+TESTS_PHONE_NUMS = [
+    ('518)528-0252', ['518', ')', '528', '-', '0252']),
+    ('1200-0221-0234', ['1200', '-', '0221', '-', '0234']),
+    ('1200.0221.0234', ['1200', '.', '0221', '.', '0234']),
+]
+# Dates and times.
+TESTS_DATETIME = [
+    ('15:00', ['15', ':', '00']),
+    ('2:00pm', ['2', ':', '00', 'pm']),
+    ('9/14/16', ['9', '/', '14', '/', '16']),
+]
+# Amounts with currency symbols, decimal and thousands separators.
+TESTS_CURRENCIES = [
+    ('517.933\xa3', ['517', '.', '933', '\xa3']),
+    ('$517.87', ['$', '517', '.', '87']),
+    ('1201.6598', ['1201', '.', '6598']),
+    ('120,6', ['120', ',', '6']),
+    ('10,00\u20ac', ['10', ',', '00', '\u20ac']),
+    ('1,000', ['1', ',', '000']),
+    ('1200pesos', ['1200', 'pesos']),
+]
+# Numbers mixed with letters and symbols.
+TESTS_NUM_SYM = [
+    ('5162f', ['5162', 'f']),
+    ('f5162', ['f', '5162']),
+    ('1203(', ['1203', '(']),
+    ('(1203)', ['(', '1203', ')']),
+    ('1200/', ['1200', '/']),
+    ('1200+', ['1200', '+']),
+    ('1202o-east', ['1202', 'o-east']),
+    ('1200r', ['1200', 'r']),
+    ('1200-1400', ['1200', '-', '1400']),
+    ('120/today', ['120', '/', 'today']),
+    ('today/120', ['today', '/', '120']),
+    ('120/5', ['120', '/', '5']),
+    ("120'/5", ['120', "'", '/', '5']),
+    ('120/5pro', ['120', '/', '5', 'pro']),
+    ("1200's,)", ['1200', "'", 's', ',', ')']),
+    ('120.76.218.207', ['120', '.', '76', '.', '218', '.', '207']),
+]
+# Punctuation runs, contractions and abbreviations.
+TESTS_PUNCTUATION = [
+    ("don''t", ['don', "''", 't']),
+    ("don'tcha", ["don'tcha"]),
+    ('no?!?!;', ['no', '?', '!', '?', '!', ';']),
+    ('no??!!..', ['no', '??', '!!', '..']),
+    ('a.m.', ['a.m.']),
+    ('.s.u', ['.', 's', '.', 'u']),
+    ('!!i..n__', ['!!', 'i', '..', 'n', '__']),
+    ('lv(<3)w(3>)u Mr.!', ['lv', '(', '<3', ')', 'w', '(', '3',
+                            '>', ')', 'u', 'Mr.', '!']),
+    ('-->', ['--', '>']),
+    ('->', ['-', '>']),
+    ('<-', ['<', '-']),
+    ('<--', ['<', '--']),
+    ('hello (@person)', ['hello', '(', '@person', ')']),
+]
+def test_normal():
+    """ Normal/combined usage.
+    """
+    test_base(TESTS_NORMAL)
+def test_emojis():
+    """ Tokenizing emojis/emoticons/decorations.
+    """
+    test_base(TESTS_EMOJIS)
+def test_urls():
+    """ Tokenizing URLs.
+    """
+    test_base(TESTS_URLS)
+def test_twitter():
+    """ Tokenizing hashtags, mentions and emails.
+    """
+    test_base(TESTS_TWITTER)
+def test_phone_nums():
+    """ Tokenizing phone numbers.
+    """
+    test_base(TESTS_PHONE_NUMS)
+def test_datetime():
+    """ Tokenizing dates and times.
+    """
+    test_base(TESTS_DATETIME)
+def test_currencies():
+    """ Tokenizing currencies.
+    """
+    test_base(TESTS_CURRENCIES)
+def test_num_sym():
+    """ Tokenizing combinations of numbers and symbols.
+    """
+    test_base(TESTS_NUM_SYM)
+def test_punctuation():
+    """ Tokenizing punctuation and contractions.
+    """
+    test_base(TESTS_PUNCTUATION)
+@nottest
+def test_base(tests):
+    """ Base function for running tests.
+    """
+    for (test, expected) in tests:
+        actual = tokenize(test)
+        assert actual == expected, \
+            "Tokenization of \'{}\' failed, expected: {}, actual: {}"\
+            .format(test, expected, actual)

tests/test_word_generator.py ADDED Viewed

	@@ -0,0 +1,73 @@

+# -*- coding: utf-8 -*-
+import sys
+from os.path import dirname, abspath
+sys.path.append(dirname(dirname(abspath(__file__))))
+from nose.tools import raises
+from torchmoji.word_generator import WordGenerator
+# True when running under a Python 2 interpreter; several tests below only
+# exercise their checks in that case.
+IS_PYTHON2 = int(sys.version[0]) == 2
+@raises(ValueError)
+def test_only_unicode_accepted():
+    """ Non-Unicode strings raise a ValueError.
+        In Python 3 all string are Unicode
+    """
+    if not IS_PYTHON2:
+        raise ValueError("You are using python 3 so this test should always pass")
+    sentences = [
+        u'Hello world',
+        u'I am unicode',
+        'I am not unicode',
+        ]
+    wg = WordGenerator(sentences)
+    for w in wg:
+        pass
+def test_unicode_sentences_ignored_if_set():
+    """ Strings with Unicode characters tokenize to empty array if they're not allowed.
+    """
+    sentence = [u'Dobrý den, jak se máš?']
+    wg = WordGenerator(sentence, allow_unicode_text=False)
+    assert wg.get_words(sentence[0]) == []
+def test_check_ascii():
+    """ check_ascii recognises ASCII words properly.
+        In Python 3 all string are Unicode
+    """
+    if not IS_PYTHON2:
+        return
+    wg = WordGenerator([])
+    assert wg.check_ascii('ASCII')
+    assert not wg.check_ascii('ščřžýá')
+    assert not wg.check_ascii('❤ ☀ ☆ ☂ ☻ ♞ ☯ ☭ ☢')
+def test_convert_unicode_word():
+    """ convert_unicode_word converts Unicode words correctly.
+    """
+    wg = WordGenerator([], allow_unicode_text=True)
+    result = wg.convert_unicode_word(u'č')
+    assert result == (True, u'\u010d'), '{}'.format(result)
+def test_convert_unicode_word_ignores_if_set():
+    """ convert_unicode_word ignores Unicode words if set.
+    """
+    wg = WordGenerator([], allow_unicode_text=False)
+    result = wg.convert_unicode_word(u'č')
+    assert result == (False, ''), '{}'.format(result)
+def test_convert_unicode_chars():
+    """ convert_unicode_word correctly converts accented characters.
+    """
+    wg = WordGenerator([], allow_unicode_text=True)
+    result = wg.convert_unicode_word(u'ěščřžýáíé')
+    assert result == (True, u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9'), '{}'.format(result)