Pendrokar committed
Commit
86a83a2
1 Parent(s): 0a214bf

torchmoji code

.travis.yml CHANGED
@@ -24,4 +24,4 @@ script:
  - true # pytest --capture=sys # add other tests here
 notifications:
   on_success: change
- on_failure: change # `always` will be the setting once code changes slow down
+ on_failure: change # `always` will be the setting once code changes slow down
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2017 Bjarke Felbo, Han Thi Nguyen, Thomas Wolf
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/.gitkeep ADDED
@@ -0,0 +1 @@
+
data/Olympic/raw.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:398d394ac1d7c2116166ca968bae9b1f9fd049f9e9281f05c94ae7b2ea97d427
+ size 227301
data/PsychExp/raw.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc7d710f2ccd7e9d8e620be703a446ce7ec05818d5ce6afe43d1e6aa9ff4a8aa
+ size 3492229
data/SCv1/raw.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a65db490451dada57b88918a951d04082a51599d2cde24914f8c713312de89f5
+ size 868931
data/SCv2-GEN/raw.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43ae3ea310130c2ca2089d60876ba6b08006d7f2e018a0519c4fdb7b166f992f
+ size 883467
data/SE0714/raw.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66f0ecf48affe92bdacdeb64ab20c1c84b9990a3ac7b659a1a98aa29c9c4a064
+ size 126311
data/SS-Twitter/raw.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ef34a4f0fe39b1bb45fcb72026bbf3b82ce2e2a14c13d39610b3b41f18fc98e
+ size 413660
data/SS-Youtube/raw.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83ec15e393fb4f0dbb524946480de50e9baf9fef83a3e9eaf95caa3c425b87aa
+ size 396130
data/emoji_codes.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "0": ":joy:",
3
+ "1": ":unamused:",
4
+ "2": ":weary:",
5
+ "3": ":sob:",
6
+ "4": ":heart_eyes:",
7
+ "5": ":pensive:",
8
+ "6": ":ok_hand:",
9
+ "7": ":blush:",
10
+ "8": ":heart:",
11
+ "9": ":smirk:",
12
+ "10":":grin:",
13
+ "11":":notes:",
14
+ "12":":flushed:",
15
+ "13":":100:",
16
+ "14":":sleeping:",
17
+ "15":":relieved:",
18
+ "16":":relaxed:",
19
+ "17":":raised_hands:",
20
+ "18":":two_hearts:",
21
+ "19":":expressionless:",
22
+ "20":":sweat_smile:",
23
+ "21":":pray:",
24
+ "22":":confused:",
25
+ "23":":kissing_heart:",
26
+ "24":":hearts:",
27
+ "25":":neutral_face:",
28
+ "26":":information_desk_person:",
29
+ "27":":disappointed:",
30
+ "28":":see_no_evil:",
31
+ "29":":tired_face:",
32
+ "30":":v:",
33
+ "31":":sunglasses:",
34
+ "32":":rage:",
35
+ "33":":thumbsup:",
36
+ "34":":cry:",
37
+ "35":":sleepy:",
38
+ "36":":stuck_out_tongue_winking_eye:",
39
+ "37":":triumph:",
40
+ "38":":raised_hand:",
41
+ "39":":mask:",
42
+ "40":":clap:",
43
+ "41":":eyes:",
44
+ "42":":gun:",
45
+ "43":":persevere:",
46
+ "44":":imp:",
47
+ "45":":sweat:",
48
+ "46":":broken_heart:",
49
+ "47":":blue_heart:",
50
+ "48":":headphones:",
51
+ "49":":speak_no_evil:",
52
+ "50":":wink:",
53
+ "51":":skull:",
54
+ "52":":confounded:",
55
+ "53":":smile:",
56
+ "54":":stuck_out_tongue_winking_eye:",
57
+ "55":":angry:",
58
+ "56":":no_good:",
59
+ "57":":muscle:",
60
+ "58":":punch:",
61
+ "59":":purple_heart:",
62
+ "60":":sparkling_heart:",
63
+ "61":":blue_heart:",
64
+ "62":":grimacing:",
65
+ "63":":sparkles:"
66
+ }
67
+
data/kaggle-insults/raw.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2fbeca5470209163e04b6975fc5fb91889e79583fe6ff499f83966e36392fcda
+ size 1338159
emoji_overview.png ADDED
environment.yml ADDED
@@ -0,0 +1,41 @@
+ name: torchMoji
+ channels:
+ - pytorch
+ - defaults
+ dependencies:
+ - _libgcc_mutex=0.1
+ - blas=1.0
+ - ca-certificates=2019.11.27
+ - certifi=2019.11.28
+ - cffi=1.13.2
+ - cudatoolkit=10.1.243
+ - intel-openmp=2019.4
+ - libedit=3.1.20181209
+ - libffi=3.2.1
+ - libgcc-ng=9.1.0
+ - libgfortran-ng=7.3.0
+ - libstdcxx-ng=9.1.0
+ - mkl=2018.0.3
+ - ncurses=6.1
+ - ninja=1.9.0
+ - nose=1.3.7
+ - numpy=1.13.1
+ - openssl=1.1.1d
+ - pip=19.3.1
+ - pycparser=2.19
+ - python=3.6.9
+ - pytorch=1.3.1
+ - readline=7.0
+ - scikit-learn=0.19.0
+ - scipy=0.19.1
+ - setuptools=42.0.2
+ - sqlite=3.30.1
+ - text-unidecode=1.0
+ - tk=8.6.8
+ - wheel=0.33.6
+ - xz=5.2.4
+ - zlib=1.2.11
+ - pip:
+ - emoji==0.4.5
+ prefix: /home/cbowdon/miniconda3/envs/torchMoji
+
examples/.gitkeep ADDED
@@ -0,0 +1 @@
+
examples/README.md ADDED
@@ -0,0 +1,39 @@
+ # torchMoji examples
+
+ ## Initialization
+ [create_twitter_vocab.py](create_twitter_vocab.py)
+ Create a new vocabulary from a tsv file.
+
+ [tokenize_dataset.py](tokenize_dataset.py)
+ Tokenize a given dataset using the prebuilt vocabulary.
+
+ [vocab_extension.py](vocab_extension.py)
+ Extend the given vocabulary using dataset-specific words.
+
+ [dataset_split.py](dataset_split.py)
+ Split a given dataset into training, validation and testing.
+
+ ## Use pretrained model/architecture
+ [score_texts_emojis.py](score_texts_emojis.py)
+ Use torchMoji to score texts for emoji distribution.
+
+ [text_emojize.py](text_emojize.py)
+ Use torchMoji to output emoji visualization from a single text input (mapped from `emoji_overview.png`).
+
+ ```sh
+ python examples/text_emojize.py --text "I love mom's cooking\!"
+ # => I love mom's cooking! 😋 😍 💓 💛 ❤
+ ```
+
+ [encode_texts.py](encode_texts.py)
+ Use torchMoji to encode the text into 2304-dimensional feature vectors for further modeling/analysis.
+
+ ## Transfer learning
+ [finetune_youtube_last.py](finetune_youtube_last.py)
+ Finetune the model on the SS-Youtube dataset using the 'last' method.
+
+ [finetune_insults_chain-thaw.py](finetune_insults_chain-thaw.py)
+ Finetune the model on the Kaggle insults dataset (from blog post) using the 'chain-thaw' method.
+
+ [finetune_semeval_class-avg_f1.py](finetune_semeval_class-avg_f1.py)
+ Finetune the model on the SemEval emotion dataset using the 'full' method and evaluate using the class average F1 metric.
examples/__init__.py ADDED
File without changes
examples/create_twitter_vocab.py ADDED
@@ -0,0 +1,13 @@
+ """ Creates a vocabulary from a tsv file.
+ """
+
+ import codecs
+ import example_helper
+ from torchmoji.create_vocab import VocabBuilder
+ from torchmoji.word_generator import TweetWordGenerator
+
+ with codecs.open('../../twitterdata/tweets.2016-09-01', 'rU', 'utf-8') as stream:
+ wg = TweetWordGenerator(stream)
+ vb = VocabBuilder(wg)
+ vb.count_all_words()
+ vb.save_vocab()
examples/dataset_split.py ADDED
@@ -0,0 +1,59 @@
1
+ '''
2
+ Split a given dataset into three different datasets: training, validation and
3
+ testing.
4
+
5
+ This is achieved by splitting the given list of sentences into three separate
6
+ lists according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or by an
7
+ explicit enumeration. The sentences are also tokenised using the given
8
+ vocabulary.
9
+
10
+ Also splits a given list of dictionaries containing information about
11
+ each sentence.
12
+
13
+ An additional parameter can be set 'extend_with', which will extend the given
14
+ vocabulary with up to 'extend_with' tokens, taken from the training dataset.
15
+ '''
16
+ from __future__ import print_function, unicode_literals
17
+ import example_helper
18
+ import json
19
+
20
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
21
+
22
+ DATASET = [
23
+ 'I am sentence 0',
24
+ 'I am sentence 1',
25
+ 'I am sentence 2',
26
+ 'I am sentence 3',
27
+ 'I am sentence 4',
28
+ 'I am sentence 5',
29
+ 'I am sentence 6',
30
+ 'I am sentence 7',
31
+ 'I am sentence 8',
32
+ 'I am sentence 9 newword',
33
+ ]
34
+
35
+ INFO_DICTS = [
36
+ {'label': 'sentence 0'},
37
+ {'label': 'sentence 1'},
38
+ {'label': 'sentence 2'},
39
+ {'label': 'sentence 3'},
40
+ {'label': 'sentence 4'},
41
+ {'label': 'sentence 5'},
42
+ {'label': 'sentence 6'},
43
+ {'label': 'sentence 7'},
44
+ {'label': 'sentence 8'},
45
+ {'label': 'sentence 9'},
46
+ ]
47
+
48
+ with open('../model/vocabulary.json', 'r') as f:
49
+ vocab = json.load(f)
50
+ st = SentenceTokenizer(vocab, 30)
51
+
52
+ # Split using the default split ratio
53
+ print(st.split_train_val_test(DATASET, INFO_DICTS))
54
+
55
+ # Split explicitly
56
+ print(st.split_train_val_test(DATASET,
57
+ INFO_DICTS,
58
+ [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
59
+ extend_with=1))
examples/encode_texts.py ADDED
@@ -0,0 +1,41 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """ Use torchMoji to encode texts into emotional feature vectors.
4
+ """
5
+ from __future__ import print_function, division, unicode_literals
6
+ import json
7
+
8
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
9
+ from torchmoji.model_def import torchmoji_feature_encoding
10
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
11
+
12
+ TEST_SENTENCES = ['I love mom\'s cooking',
13
+ 'I love how you never reply back..',
14
+ 'I love cruising with my homies',
15
+ 'I love messing with yo mind!!',
16
+ 'I love you and now you\'re just gone..',
17
+ 'This is shit',
18
+ 'This is the shit']
19
+
20
+ maxlen = 30
21
+ batch_size = 32
22
+
23
+ print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
24
+ with open(VOCAB_PATH, 'r') as f:
25
+ vocabulary = json.load(f)
26
+ st = SentenceTokenizer(vocabulary, maxlen)
27
+ tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
28
+
29
+ print('Loading model from {}.'.format(PRETRAINED_PATH))
30
+ model = torchmoji_feature_encoding(PRETRAINED_PATH)
31
+ print(model)
32
+
33
+ print('Encoding texts..')
34
+ encoding = model(tokenized)
35
+
36
+ print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
37
+ print(encoding[0,:5])
38
+
39
+ # Now you could visualize the encodings to see differences,
40
+ # run a logistic regression classifier on top,
41
+ # or basically anything you'd like to do.
examples/example_helper.py ADDED
@@ -0,0 +1,6 @@
+ """ Module import helper.
+ Modifies PATH in order to allow us to import the torchmoji directory.
+ """
+ import sys
+ from os.path import abspath, dirname
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
examples/finetune_insults_chain-thaw.py ADDED
@@ -0,0 +1,44 @@
1
+ """Finetuning example.
2
+
3
+ Trains the torchMoji model on the kaggle insults dataset, using the 'chain-thaw'
4
+ finetuning method and the accuracy metric. See the blog post at
5
+ https://medium.com/@bjarkefelbo/what-can-we-learn-from-emojis-6beb165a5ea0
6
+ for more information. Note that results may differ a bit due to slight
7
+ changes in preprocessing and train/val/test split.
8
+
9
+ The 'chain-thaw' method does the following:
10
+ 0) Load all weights except for the softmax layer. Extend the embedding layer if
11
+ necessary, initialising the new weights with random values.
12
+ 1) Freeze every layer except the last (softmax) layer and train it.
13
+ 2) Freeze every layer except the first layer and train it.
14
+ 3) Freeze every layer except the second etc., until the second last layer.
15
+ 4) Unfreeze all layers and train entire model.
16
+ """
17
+
18
+ from __future__ import print_function
19
+ import example_helper
20
+ import json
21
+ from torchmoji.model_def import torchmoji_transfer
22
+ from torchmoji.global_variables import PRETRAINED_PATH
23
+ from torchmoji.finetuning import (
24
+ load_benchmark,
25
+ finetune)
26
+
27
+
28
+ DATASET_PATH = '../data/kaggle-insults/raw.pickle'
29
+ nb_classes = 2
30
+
31
+ with open('../model/vocabulary.json', 'r') as f:
32
+ vocab = json.load(f)
33
+
34
+ # Load dataset. Extend the existing vocabulary with up to 10000 tokens from
35
+ # the training dataset.
36
+ data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
37
+
38
+ # Set up model and finetune. Note that we have to extend the embedding layer
39
+ # with the number of tokens added to the vocabulary.
40
+ model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added'])
41
+ print(model)
42
+ model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
43
+ data['batch_size'], method='chain-thaw')
44
+ print('Acc: {}'.format(acc))
examples/finetune_semeval_class-avg_f1.py ADDED
@@ -0,0 +1,50 @@
1
+ """Finetuning example.
2
+
3
+ Trains the torchMoji model on the SemEval emotion dataset, using the 'last'
4
+ finetuning method and the class average F1 metric.
5
+
6
+ The 'last' method does the following:
7
+ 0) Load all weights except for the softmax layer. Do not add tokens to the
8
+ vocabulary and do not extend the embedding layer.
9
+ 1) Freeze all layers except for the softmax layer.
10
+ 2) Train.
11
+
12
+ The class average F1 metric does the following:
13
+ 1) For each class, relabel the dataset into binary classification
14
+ (belongs to/does not belong to this class).
15
+ 2) Calculate F1 score for each class.
16
+ 3) Compute the average of all F1 scores.
17
+ """
18
+
19
+ from __future__ import print_function
20
+ import example_helper
21
+ import json
22
+ from torchmoji.finetuning import load_benchmark
23
+ from torchmoji.class_avg_finetuning import class_avg_finetune
24
+ from torchmoji.model_def import torchmoji_transfer
25
+ from torchmoji.global_variables import PRETRAINED_PATH
26
+
27
+ DATASET_PATH = '../data/SE0714/raw.pickle'
28
+ nb_classes = 3
29
+
30
+ with open('../model/vocabulary.json', 'r') as f:
31
+ vocab = json.load(f)
32
+
33
+
34
+ # Load dataset. Extend the existing vocabulary with up to 10000 tokens from
35
+ # the training dataset.
36
+ data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
37
+
38
+ # Set up model and finetune. Note that we have to extend the embedding layer
39
+ # with the number of tokens added to the vocabulary.
40
+ #
41
+ # Also note that when using class average F1 to evaluate, the model has to be
42
+ # defined with two classes, since the model will be trained for each class
43
+ # separately.
44
+ model = torchmoji_transfer(2, PRETRAINED_PATH, extend_embedding=data['added'])
45
+ print(model)
46
+
47
+ # For finetuning however, pass in the actual number of classes.
48
+ model, f1 = class_avg_finetune(model, data['texts'], data['labels'],
49
+ nb_classes, data['batch_size'], method='last')
50
+ print('F1: {}'.format(f1))
examples/finetune_youtube_last.py ADDED
@@ -0,0 +1,35 @@
1
+ """Finetuning example.
2
+
3
+ Trains the torchMoji model on the SS-Youtube dataset, using the 'last'
4
+ finetuning method and the accuracy metric.
5
+
6
+ The 'last' method does the following:
7
+ 0) Load all weights except for the softmax layer. Do not add tokens to the
8
+ vocabulary and do not extend the embedding layer.
9
+ 1) Freeze all layers except for the softmax layer.
10
+ 2) Train.
11
+ """
12
+
13
+ from __future__ import print_function
14
+ import example_helper
15
+ import json
16
+ from torchmoji.model_def import torchmoji_transfer
17
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH, ROOT_PATH
18
+ from torchmoji.finetuning import (
19
+ load_benchmark,
20
+ finetune)
21
+
22
+ DATASET_PATH = '{}/data/SS-Youtube/raw.pickle'.format(ROOT_PATH)
23
+ nb_classes = 2
24
+
25
+ with open(VOCAB_PATH, 'r') as f:
26
+ vocab = json.load(f)
27
+
28
+ # Load dataset.
29
+ data = load_benchmark(DATASET_PATH, vocab)
30
+
31
+ # Set up model and finetune
32
+ model = torchmoji_transfer(nb_classes, PRETRAINED_PATH)
33
+ print(model)
34
+ model, acc = finetune(model, data['texts'], data['labels'], nb_classes, data['batch_size'], method='last')
35
+ print('Acc: {}'.format(acc))
examples/score_texts_emojis.py ADDED
@@ -0,0 +1,85 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """ Use torchMoji to score texts for emoji distribution.
4
+
5
+ The resulting emoji ids (0-63) correspond to the mapping
6
+ in emoji_overview.png file at the root of the torchMoji repo.
7
+
8
+ Writes the result to a csv file.
9
+ """
10
+
11
+ from __future__ import print_function, division, unicode_literals
12
+
13
+ import sys
14
+ from os.path import abspath, dirname
15
+
16
+ import json
17
+ import csv
18
+ import numpy as np
19
+
20
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
21
+ from torchmoji.model_def import torchmoji_emojis
22
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
23
+
24
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
25
+
26
+ OUTPUT_PATH = 'test_sentences.csv'
27
+
28
+ TEST_SENTENCES = ['I love mom\'s cooking',
29
+ 'I love how you never reply back..',
30
+ 'I love cruising with my homies',
31
+ 'I love messing with yo mind!!',
32
+ 'I love you and now you\'re just gone..',
33
+ 'This is shit',
34
+ 'This is the shit']
35
+
36
+
37
+ def top_elements(array, k):
38
+ ind = np.argpartition(array, -k)[-k:]
39
+ return ind[np.argsort(array[ind])][::-1]
40
+
41
+ maxlen = 30
42
+
43
+ print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
44
+ with open(VOCAB_PATH, 'r') as f:
45
+ vocabulary = json.load(f)
46
+
47
+ st = SentenceTokenizer(vocabulary, maxlen)
48
+
49
+ print('Loading model from {}.'.format(PRETRAINED_PATH))
50
+ model = torchmoji_emojis(PRETRAINED_PATH)
51
+ print(model)
52
+
53
+ def doImportableFunction():
54
+ print('Running predictions.')
55
+ tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
56
+ prob = model(tokenized)
57
+
58
+ for prob in [prob]:
59
+ # Find top emojis for each sentence. Emoji ids (0-63)
60
+ # correspond to the mapping in emoji_overview.png
61
+ # at the root of the torchMoji repo.
62
+ print('Writing results to {}'.format(OUTPUT_PATH))
63
+ scores = []
64
+ for i, t in enumerate(TEST_SENTENCES):
65
+ t_tokens = tokenized[i]
66
+ t_score = [t]
67
+ t_prob = prob[i]
68
+ ind_top = top_elements(t_prob, 5)
69
+ t_score.append(sum(t_prob[ind_top]))
70
+ t_score.extend(ind_top)
71
+ t_score.extend([t_prob[ind] for ind in ind_top])
72
+ scores.append(t_score)
73
+ print(t_score)
74
+
75
+ with open(OUTPUT_PATH, 'w') as csvfile:
76
+ writer = csv.writer(csvfile, delimiter=str(','), lineterminator='\n')
77
+ writer.writerow(['Text', 'Top5%',
78
+ 'Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4', 'Emoji_5',
79
+ 'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5'])
80
+ for i, row in enumerate(scores):
81
+ try:
82
+ writer.writerow(row)
83
+ except:
84
+ print("Exception at row {}!".format(i))
85
+ return
examples/text_emojize.py ADDED
@@ -0,0 +1,63 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """ Use torchMoji to predict emojis from a single text input
4
+ """
5
+
6
+ from __future__ import print_function, division, unicode_literals
7
+ import example_helper
8
+ import json
9
+ import csv
10
+ import argparse
11
+
12
+ import numpy as np
13
+ import emoji
14
+
15
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
16
+ from torchmoji.model_def import torchmoji_emojis
17
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
18
+
19
+ # Emoji map in emoji_overview.png
20
+ EMOJIS = ":joy: :unamused: :weary: :sob: :heart_eyes: \
21
+ :pensive: :ok_hand: :blush: :heart: :smirk: \
22
+ :grin: :notes: :flushed: :100: :sleeping: \
23
+ :relieved: :relaxed: :raised_hands: :two_hearts: :expressionless: \
24
+ :sweat_smile: :pray: :confused: :kissing_heart: :heartbeat: \
25
+ :neutral_face: :information_desk_person: :disappointed: :see_no_evil: :tired_face: \
26
+ :v: :sunglasses: :rage: :thumbsup: :cry: \
27
+ :sleepy: :yum: :triumph: :hand: :mask: \
28
+ :clap: :eyes: :gun: :persevere: :smiling_imp: \
29
+ :sweat: :broken_heart: :yellow_heart: :musical_note: :speak_no_evil: \
30
+ :wink: :skull: :confounded: :smile: :stuck_out_tongue_winking_eye: \
31
+ :angry: :no_good: :muscle: :facepunch: :purple_heart: \
32
+ :sparkling_heart: :blue_heart: :grimacing: :sparkles:".split(' ')
33
+
34
+ def top_elements(array, k):
35
+ ind = np.argpartition(array, -k)[-k:]
36
+ return ind[np.argsort(array[ind])][::-1]
37
+
38
+ if __name__ == "__main__":
39
+ argparser = argparse.ArgumentParser()
40
+ argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
41
+ argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
42
+ args = argparser.parse_args()
43
+
44
+ # Tokenizing using dictionary
45
+ with open(VOCAB_PATH, 'r') as f:
46
+ vocabulary = json.load(f)
47
+
48
+ st = SentenceTokenizer(vocabulary, args.maxlen)
49
+
50
+ # Loading model
51
+ model = torchmoji_emojis(PRETRAINED_PATH)
52
+ # Running predictions
53
+ tokenized, _, _ = st.tokenize_sentences([args.text])
54
+ # Get sentence probability
55
+ prob = model(tokenized)[0]
56
+
57
+ # Top emoji id
58
+ emoji_ids = top_elements(prob, 5)
59
+
60
+ # map to emojis
61
+ emojis = map(lambda x: EMOJIS[x], emoji_ids)
62
+
63
+ print(emoji.emojize("{} {}".format(args.text,' '.join(emojis)), use_aliases=True))
examples/tokenize_dataset.py ADDED
@@ -0,0 +1,26 @@
1
+ """
2
+ Take a given list of sentences and turn it into a numpy array, where each
3
+ number corresponds to a word. Padding is used (number 0) to ensure fixed length
4
+ of sentences.
5
+ """
6
+
7
+ from __future__ import print_function, unicode_literals
8
+ import example_helper
9
+ import json
10
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
11
+
12
+ with open('../model/vocabulary.json', 'r') as f:
13
+ vocabulary = json.load(f)
14
+
15
+ st = SentenceTokenizer(vocabulary, 30)
16
+ test_sentences = [
17
+ '\u2014 -- \u203c !!\U0001F602',
18
+ 'Hello world!',
19
+ 'This is a sample tweet #example',
20
+ ]
21
+
22
+ tokens, infos, stats = st.tokenize_sentences(test_sentences)
23
+
24
+ print(tokens)
25
+ print(infos)
26
+ print(stats)
examples/vocab_extension.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ Extend the given vocabulary using dataset-specific words.
3
+
4
+ 1. First create a vocabulary for the specific dataset.
5
+ 2. Find all words not in our vocabulary, but in the dataset vocabulary.
6
+ 3. Take top X (default=1000) of these words and add them to the vocabulary.
7
+ 4. Save this combined vocabulary and embedding matrix, which can now be used.
8
+ """
9
+
10
+ from __future__ import print_function, unicode_literals
11
+ import example_helper
12
+ import json
13
+ from torchmoji.create_vocab import extend_vocab, VocabBuilder
14
+ from torchmoji.word_generator import WordGenerator
15
+
16
+ new_words = ['#zzzzaaazzz', 'newword', 'newword']
17
+ word_gen = WordGenerator(new_words)
18
+ vb = VocabBuilder(word_gen)
19
+ vb.count_all_words()
20
+
21
+ with open('../model/vocabulary.json') as f:
22
+ vocab = json.load(f)
23
+
24
+ print(len(vocab))
25
+ print(vb.word_counts)
26
+ extend_vocab(vocab, vb, max_tokens=1)
27
+
28
+ # 'newword' should be added because it's more frequent in the given vocab
29
+ print(vocab['newword'])
30
+ print(len(vocab))
scripts/analyze_all_results.py ADDED
@@ -0,0 +1,40 @@
1
+ from __future__ import print_function
2
+
3
+ # allow us to import the codebase directory
4
+ import sys
5
+ import glob
6
+ import numpy as np
7
+ from os.path import dirname, abspath
8
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
9
+
10
+ DATASETS = ['SE0714', 'Olympic', 'PsychExp', 'SS-Twitter', 'SS-Youtube',
11
+ 'SCv1', 'SV2-GEN'] # 'SE1604' excluded due to Twitter's ToS
12
+
13
+ def get_results(dset):
14
+ METHOD = 'last'
15
+ RESULTS_DIR = 'results/'
16
+ RESULT_PATHS = glob.glob('{}/{}_{}_*_results.txt'.format(RESULTS_DIR, dset, METHOD))
17
+ assert len(RESULT_PATHS)
18
+
19
+ scores = []
20
+ for path in RESULT_PATHS:
21
+ with open(path) as f:
22
+ score = f.readline().split(':')[1]
23
+ scores.append(float(score))
24
+
25
+ average = np.mean(scores)
26
+ maximum = max(scores)
27
+ minimum = min(scores)
28
+ std = np.std(scores)
29
+
30
+ print('Dataset: {}'.format(dset))
31
+ print('Method: {}'.format(METHOD))
32
+ print('Number of results: {}'.format(len(scores)))
33
+ print('--------------------------')
34
+ print('Average: {}'.format(average))
35
+ print('Maximum: {}'.format(maximum))
36
+ print('Minimum: {}'.format(minimum))
37
+ print('Standard deviaton: {}'.format(std))
38
+
39
+ for dset in DATASETS:
40
+ get_results(dset)
scripts/analyze_results.py ADDED
@@ -0,0 +1,39 @@
1
+ from __future__ import print_function
2
+
3
+ import sys
4
+ import glob
5
+ import numpy as np
6
+
7
+ DATASET = 'SS-Twitter' # 'SE1604' excluded due to Twitter's ToS
8
+ METHOD = 'new'
9
+
10
+ # Optional usage: analyze_results.py <dataset> <method>
11
+ if len(sys.argv) == 3:
12
+ DATASET = sys.argv[1]
13
+ METHOD = sys.argv[2]
14
+
15
+ RESULTS_DIR = 'results/'
16
+ RESULT_PATHS = glob.glob('{}/{}_{}_*_results.txt'.format(RESULTS_DIR, DATASET, METHOD))
17
+
18
+ if not RESULT_PATHS:
19
+ print('Could not find results for \'{}\' using \'{}\' in directory \'{}\'.'.format(DATASET, METHOD, RESULTS_DIR))
20
+ else:
21
+ scores = []
22
+ for path in RESULT_PATHS:
23
+ with open(path) as f:
24
+ score = f.readline().split(':')[1]
25
+ scores.append(float(score))
26
+
27
+ average = np.mean(scores)
28
+ maximum = max(scores)
29
+ minimum = min(scores)
30
+ std = np.std(scores)
31
+
32
+ print('Dataset: {}'.format(DATASET))
33
+ print('Method: {}'.format(METHOD))
34
+ print('Number of results: {}'.format(len(scores)))
35
+ print('--------------------------')
36
+ print('Average: {}'.format(average))
37
+ print('Maximum: {}'.format(maximum))
38
+ print('Minimum: {}'.format(minimum))
39
+ print('Standard deviaton: {}'.format(std))
scripts/calculate_coverages.py ADDED
@@ -0,0 +1,90 @@
1
+ from __future__ import print_function
2
+ import pickle
3
+ import json
4
+ import csv
5
+ import sys
6
+ from io import open
7
+
8
+ # Allow us to import the torchmoji directory
9
+ from os.path import dirname, abspath
10
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
11
+
12
+ from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage
13
+
14
+ try:
15
+ unicode # Python 2
16
+ except NameError:
17
+ unicode = str # Python 3
18
+
19
+ IS_PYTHON2 = int(sys.version[0]) == 2
20
+
21
+ OUTPUT_PATH = 'coverage.csv'
22
+ DATASET_PATHS = [
23
+ '../data/Olympic/raw.pickle',
24
+ '../data/PsychExp/raw.pickle',
25
+ '../data/SCv1/raw.pickle',
26
+ '../data/SCv2-GEN/raw.pickle',
27
+ '../data/SE0714/raw.pickle',
28
+ #'../data/SE1604/raw.pickle', # Excluded due to Twitter's ToS
29
+ '../data/SS-Twitter/raw.pickle',
30
+ '../data/SS-Youtube/raw.pickle',
31
+ ]
32
+
33
+ with open('../model/vocabulary.json', 'r') as f:
34
+ vocab = json.load(f)
35
+
36
+ results = []
37
+ for p in DATASET_PATHS:
38
+ coverage_result = [p]
39
+ print('Calculating coverage for {}'.format(p))
40
+ with open(p, 'rb') as f:
41
+ if IS_PYTHON2:
42
+ s = pickle.load(f)
43
+ else:
44
+ s = pickle.load(f, fix_imports=True)
45
+
46
+ # Decode data
47
+ try:
48
+ s['texts'] = [unicode(x) for x in s['texts']]
49
+ except UnicodeDecodeError:
50
+ s['texts'] = [x.decode('utf-8') for x in s['texts']]
51
+
52
+ # Own
53
+ st = SentenceTokenizer({}, 30)
54
+ tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
55
+ [s['train_ind'],
56
+ s['val_ind'],
57
+ s['test_ind']],
58
+ extend_with=10000)
59
+ coverage_result.append(coverage(tests[2]))
60
+
61
+ # Last
62
+ st = SentenceTokenizer(vocab, 30)
63
+ tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
64
+ [s['train_ind'],
65
+ s['val_ind'],
66
+ s['test_ind']],
67
+ extend_with=0)
68
+ coverage_result.append(coverage(tests[2]))
69
+
70
+ # Full
71
+ st = SentenceTokenizer(vocab, 30)
72
+ tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
73
+ [s['train_ind'],
74
+ s['val_ind'],
75
+ s['test_ind']],
76
+ extend_with=10000)
77
+ coverage_result.append(coverage(tests[2]))
78
+
79
+ results.append(coverage_result)
80
+
81
+ with open(OUTPUT_PATH, 'wb') as csvfile:
82
+ writer = csv.writer(csvfile, delimiter='\t', lineterminator='\n')
83
+ writer.writerow(['Dataset', 'Own', 'Last', 'Full'])
84
+ for i, row in enumerate(results):
85
+ try:
86
+ writer.writerow(row)
87
+ except:
88
+ print("Exception at row {}!".format(i))
89
+
90
+ print('Saved to {}'.format(OUTPUT_PATH))
scripts/convert_all_datasets.py ADDED
@@ -0,0 +1,110 @@
1
+ from __future__ import print_function
2
+
3
+ import json
4
+ import math
5
+ import pickle
6
+ import sys
7
+ from io import open
8
+ import numpy as np
9
+ from os.path import abspath, dirname
10
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
11
+
12
+ from torchmoji.word_generator import WordGenerator
13
+ from torchmoji.create_vocab import VocabBuilder
14
+ from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
15
+ from torchmoji.tokenizer import tokenize
16
+
17
+ try:
18
+ unicode # Python 2
19
+ except NameError:
20
+ unicode = str # Python 3
21
+
22
+ IS_PYTHON2 = int(sys.version[0]) == 2
23
+
24
+ DATASETS = [
25
+ 'Olympic',
26
+ 'PsychExp',
27
+ 'SCv1',
28
+ 'SCv2-GEN',
29
+ 'SE0714',
30
+ #'SE1604', # Excluded due to Twitter's ToS
31
+ 'SS-Twitter',
32
+ 'SS-Youtube',
33
+ ]
34
+
35
+ DIR = '../data'
36
+ FILENAME_RAW = 'raw.pickle'
37
+ FILENAME_OWN = 'own_vocab.pickle'
38
+ FILENAME_OUR = 'twitter_vocab.pickle'
39
+ FILENAME_COMBINED = 'combined_vocab.pickle'
40
+
41
+
42
+ def roundup(x):
43
+ return int(math.ceil(x / 10.0)) * 10
44
+
45
+
46
+ def format_pickle(dset, train_texts, val_texts, test_texts, train_labels, val_labels, test_labels):
47
+ return {'dataset': dset,
48
+ 'train_texts': train_texts,
49
+ 'val_texts': val_texts,
50
+ 'test_texts': test_texts,
51
+ 'train_labels': train_labels,
52
+ 'val_labels': val_labels,
53
+ 'test_labels': test_labels}
54
+
55
+ def convert_dataset(filepath, extend_with, vocab):
56
+ print('-- Generating {} '.format(filepath))
57
+ sys.stdout.flush()
58
+ st = SentenceTokenizer(vocab, maxlen)
59
+ tokenized, dicts, _ = st.split_train_val_test(texts,
60
+ labels,
61
+ [data['train_ind'],
62
+ data['val_ind'],
63
+ data['test_ind']],
64
+ extend_with=extend_with)
65
+ pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
66
+ dicts[0], dicts[1], dicts[2])
67
+ with open(filepath, 'w') as f:
68
+ pickle.dump(pick, f)
69
+ cover = coverage(tokenized[2])
70
+
71
+ print(' done. Coverage: {}'.format(cover))
72
+
73
+ with open('../model/vocabulary.json', 'r') as f:
74
+ vocab = json.load(f)
75
+
76
+ for dset in DATASETS:
77
+ print('Converting {}'.format(dset))
78
+
79
+ PATH_RAW = '{}/{}/{}'.format(DIR, dset, FILENAME_RAW)
80
+ PATH_OWN = '{}/{}/{}'.format(DIR, dset, FILENAME_OWN)
81
+ PATH_OUR = '{}/{}/{}'.format(DIR, dset, FILENAME_OUR)
82
+ PATH_COMBINED = '{}/{}/{}'.format(DIR, dset, FILENAME_COMBINED)
83
+
84
+ with open(PATH_RAW, 'rb') as dataset:
85
+ if IS_PYTHON2:
86
+ data = pickle.load(dataset)
87
+ else:
88
+ data = pickle.load(dataset, fix_imports=True)
89
+
90
+ # Decode data
91
+ try:
92
+ texts = [unicode(x) for x in data['texts']]
93
+ except UnicodeDecodeError:
94
+ texts = [x.decode('utf-8') for x in data['texts']]
95
+
96
+ wg = WordGenerator(texts)
97
+ vb = VocabBuilder(wg)
98
+ vb.count_all_words()
99
+
100
+ # Calculate max length of sequences considered
101
+ # Adjust batch_size accordingly to prevent GPU overflow
102
+ lengths = [len(tokenize(t)) for t in texts]
103
+ maxlen = roundup(np.percentile(lengths, 80.0))
104
+
105
+ # Extract labels
106
+ labels = [x['label'] for x in data['info']]
107
+
108
+ convert_dataset(PATH_OWN, 50000, {})
109
+ convert_dataset(PATH_OUR, 0, vocab)
110
+ convert_dataset(PATH_COMBINED, 10000, vocab)
scripts/download_weights.py ADDED
@@ -0,0 +1,65 @@
1
+ from __future__ import print_function
2
+ import os
3
+ from subprocess import call
4
+ from builtins import input
5
+
6
+ curr_folder = os.path.basename(os.path.normpath(os.getcwd()))
7
+
8
+ weights_filename = 'pytorch_model.bin'
9
+ weights_folder = 'model'
10
+ weights_path = '{}/{}'.format(weights_folder, weights_filename)
11
+ if curr_folder == 'scripts':
12
+ weights_path = '../' + weights_path
13
+ weights_download_link = 'https://www.dropbox.com/s/q8lax9ary32c7t9/pytorch_model.bin?dl=0#'
14
+
15
+
16
+ MB_FACTOR = float(1<<20)
17
+
18
+ def prompt():
19
+ while True:
20
+ valid = {
21
+ 'y': True,
22
+ 'ye': True,
23
+ 'yes': True,
24
+ 'n': False,
25
+ 'no': False,
26
+ }
27
+ choice = input().lower()
28
+ if choice in valid:
29
+ return valid[choice]
30
+ else:
31
+ print('Please respond with \'y\' or \'n\' (or \'yes\' or \'no\')')
32
+
33
+ download = True
34
+ if os.path.exists(weights_path):
35
+ print('Weight file already exists at {}. Would you like to redownload it anyway? [y/n]'.format(weights_path))
36
+ download = prompt()
37
+ already_exists = True
38
+ else:
39
+ already_exists = False
40
+
41
+ if download:
42
+ print('About to download the pretrained weights file from {}'.format(weights_download_link))
43
+ if already_exists == False:
44
+ print('The size of the file is roughly 85MB. Continue? [y/n]')
45
+ else:
46
+ os.unlink(weights_path)
47
+
48
+ if already_exists or prompt():
49
+ print('Downloading...')
50
+
51
+ #urllib.urlretrieve(weights_download_link, weights_path)
52
+ #with open(weights_path,'wb') as f:
53
+ # f.write(requests.get(weights_download_link).content)
54
+
55
+ # downloading using wget due to issues with urlretrieve and requests
56
+ sys_call = 'wget {} -O {}'.format(weights_download_link, os.path.abspath(weights_path))
57
+ print("Running system call: {}".format(sys_call))
58
+ call(sys_call, shell=True)
59
+
60
+ if os.path.getsize(weights_path) / MB_FACTOR < 80:
61
+ raise ValueError("Download finished, but the resulting file is too small! " +
62
+ "It\'s only {} bytes.".format(os.path.getsize(weights_path)))
63
+ print('Downloaded weights to {}'.format(weights_path))
64
+ else:
65
+ print('Exiting.')
scripts/finetune_dataset.py ADDED
@@ -0,0 +1,109 @@
1
+ """ Finetuning example.
2
+ """
3
+ from __future__ import print_function
4
+ import sys
5
+ import numpy as np
6
+ from os.path import abspath, dirname
7
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
8
+
9
+ import json
10
+ import math
11
+ from torchmoji.model_def import torchmoji_transfer
12
+ from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
13
+ from torchmoji.finetuning import (
14
+ load_benchmark,
15
+ finetune)
16
+ from torchmoji.class_avg_finetuning import class_avg_finetune
17
+
18
+ def roundup(x):
19
+ return int(math.ceil(x / 10.0)) * 10
20
+
21
+
22
+ # Format: (dataset_name,
23
+ # path_to_dataset,
24
+ # nb_classes,
25
+ # use_f1_score)
26
+ DATASETS = [
27
+ #('SE0714', '../data/SE0714/raw.pickle', 3, True),
28
+ #('Olympic', '../data/Olympic/raw.pickle', 4, True),
29
+ #('PsychExp', '../data/PsychExp/raw.pickle', 7, True),
30
+ #('SS-Twitter', '../data/SS-Twitter/raw.pickle', 2, False),
31
+ ('SS-Youtube', '../data/SS-Youtube/raw.pickle', 2, False),
32
+ #('SE1604', '../data/SE1604/raw.pickle', 3, False), # Excluded due to Twitter's ToS
33
+ #('SCv1', '../data/SCv1/raw.pickle', 2, True),
34
+ #('SCv2-GEN', '../data/SCv2-GEN/raw.pickle', 2, True)
35
+ ]
36
+
37
+ RESULTS_DIR = 'results'
38
+
39
+ # 'new' | 'last' | 'full' | 'chain-thaw'
40
+ FINETUNE_METHOD = 'last'
41
+ VERBOSE = 1
42
+
43
+ nb_tokens = 50000
44
+ nb_epochs = 1000
45
+ epoch_size = 1000
46
+
47
+ with open(VOCAB_PATH, 'r') as f:
48
+ vocab = json.load(f)
49
+
50
+ for rerun_iter in range(5):
51
+ for p in DATASETS:
52
+
53
+ # debugging
54
+ assert len(vocab) == nb_tokens
55
+
56
+ dset = p[0]
57
+ path = p[1]
58
+ nb_classes = p[2]
59
+ use_f1_score = p[3]
60
+
61
+ if FINETUNE_METHOD == 'last':
62
+ extend_with = 0
63
+ elif FINETUNE_METHOD in ['new', 'full', 'chain-thaw']:
64
+ extend_with = 10000
65
+ else:
66
+ raise ValueError('Finetuning method not recognised!')
67
+
68
+ # Load dataset.
69
+ data = load_benchmark(path, vocab, extend_with=extend_with)
70
+
71
+ (X_train, y_train) = (data['texts'][0], data['labels'][0])
72
+ (X_val, y_val) = (data['texts'][1], data['labels'][1])
73
+ (X_test, y_test) = (data['texts'][2], data['labels'][2])
74
+
75
+ weight_path = PRETRAINED_PATH if FINETUNE_METHOD != 'new' else None
76
+ nb_model_classes = 2 if use_f1_score else nb_classes
77
+ model = torchmoji_transfer(
78
+ nb_model_classes,
79
+ weight_path,
80
+ extend_embedding=data['added'])
81
+ print(model)
82
+
83
+ # Training
84
+ print('Training: {}'.format(path))
85
+ if use_f1_score:
86
+ model, result = class_avg_finetune(model, data['texts'],
87
+ data['labels'],
88
+ nb_classes, data['batch_size'],
89
+ FINETUNE_METHOD,
90
+ verbose=VERBOSE)
91
+ else:
92
+ model, result = finetune(model, data['texts'], data['labels'],
93
+ nb_classes, data['batch_size'],
94
+ FINETUNE_METHOD, metric='acc',
95
+ verbose=VERBOSE)
96
+
97
+ # Write results
98
+ if use_f1_score:
99
+ print('Overall F1 score (dset = {}): {}'.format(dset, result))
100
+ with open('{}/{}_{}_{}_results.txt'.
101
+ format(RESULTS_DIR, dset, FINETUNE_METHOD, rerun_iter),
102
+ "w") as f:
103
+ f.write("F1: {}\n".format(result))
104
+ else:
105
+ print('Test accuracy (dset = {}): {}'.format(dset, result))
106
+ with open('{}/{}_{}_{}_results.txt'.
107
+ format(RESULTS_DIR, dset, FINETUNE_METHOD, rerun_iter),
108
+ "w") as f:
109
+ f.write("Acc: {}\n".format(result))
scripts/results/.gitkeep ADDED
@@ -0,0 +1 @@
+
setup.py ADDED
@@ -0,0 +1,16 @@
+ from setuptools import setup
+
+ setup(
+ name='torchmoji',
+ version='1.0',
+ packages=['torchmoji'],
+ description='torchMoji',
+ include_package_data=True,
+ install_requires=[
+ 'emoji==0.4.5',
+ 'numpy==1.13.1',
+ 'scipy==0.19.1',
+ 'scikit-learn==0.19.0',
+ 'text-unidecode==1.0',
+ ],
+ )
tests/test_finetuning.py ADDED
@@ -0,0 +1,235 @@
1
+ from __future__ import absolute_import, print_function, division, unicode_literals
2
+
3
+ import test_helper
4
+
5
+ from nose.plugins.attrib import attr
6
+ import json
7
+ import numpy as np
8
+
9
+ from torchmoji.class_avg_finetuning import relabel
10
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
11
+
12
+ from torchmoji.finetuning import (
13
+ calculate_batchsize_maxlen,
14
+ freeze_layers,
15
+ change_trainable,
16
+ finetune,
17
+ load_benchmark
18
+ )
19
+ from torchmoji.model_def import (
20
+ torchmoji_transfer,
21
+ torchmoji_feature_encoding,
22
+ torchmoji_emojis
23
+ )
24
+ from torchmoji.global_variables import (
25
+ PRETRAINED_PATH,
26
+ NB_TOKENS,
27
+ VOCAB_PATH,
28
+ ROOT_PATH
29
+ )
30
+
31
+
32
+ def test_calculate_batchsize_maxlen():
33
+ """ Batch size and max length are calculated properly.
34
+ """
35
+ texts = ['a b c d',
36
+ 'e f g h i']
37
+ batch_size, maxlen = calculate_batchsize_maxlen(texts)
38
+
39
+ assert batch_size == 250
40
+ assert maxlen == 10, maxlen
41
+
42
+
43
+ def test_freeze_layers():
44
+ """ Correct layers are frozen.
45
+ """
46
+ model = torchmoji_transfer(5)
47
+ keyword = 'output_layer'
48
+
49
+ model = freeze_layers(model, unfrozen_keyword=keyword)
50
+
51
+ for name, module in model.named_children():
52
+ trainable = keyword.lower() in name.lower()
53
+ assert all(p.requires_grad == trainable for p in module.parameters())
54
+
55
+
56
+ def test_change_trainable():
57
+ """ change_trainable() changes trainability of layers.
58
+ """
59
+ model = torchmoji_transfer(5)
60
+ change_trainable(model.embed, False)
61
+ assert not any(p.requires_grad for p in model.embed.parameters())
62
+ change_trainable(model.embed, True)
63
+ assert all(p.requires_grad for p in model.embed.parameters())
64
+
65
+
66
+ def test_torchmoji_transfer_extend_embedding():
67
+ """ Defining torchmoji with extension.
68
+ """
69
+ extend_with = 50
70
+ model = torchmoji_transfer(5, weight_path=PRETRAINED_PATH,
71
+ extend_embedding=extend_with)
72
+ embedding_layer = model.embed
73
+ assert embedding_layer.weight.size()[0] == NB_TOKENS + extend_with
74
+
75
+
76
+ def test_torchmoji_return_attention():
77
+ seq_tensor = np.array([[1]])
78
+ # test the output of the normal model
79
+ model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
80
+ # check correct number of outputs
81
+ assert len(model(seq_tensor)) == 1
82
+ # repeat above described tests when returning attention weights
83
+ model = torchmoji_emojis(weight_path=PRETRAINED_PATH, return_attention=True)
84
+ assert len(model(seq_tensor)) == 2
85
+
86
+
87
+ def test_relabel():
88
+ """ relabel() works with multi-class labels.
89
+ """
90
+ nb_classes = 3
91
+ inputs = np.array([
92
+ [True, False, False],
93
+ [False, True, False],
94
+ [True, False, True],
95
+ ])
96
+ expected_0 = np.array([True, False, True])
97
+ expected_1 = np.array([False, True, False])
98
+ expected_2 = np.array([False, False, True])
99
+
100
+ assert np.array_equal(relabel(inputs, 0, nb_classes), expected_0)
101
+ assert np.array_equal(relabel(inputs, 1, nb_classes), expected_1)
102
+ assert np.array_equal(relabel(inputs, 2, nb_classes), expected_2)
103
+
104
+
105
+ def test_relabel_binary():
106
+ """ relabel() works with binary classification (no changes to labels)
107
+ """
108
+ nb_classes = 2
109
+ inputs = np.array([True, False, False])
110
+
111
+ assert np.array_equal(relabel(inputs, 0, nb_classes), inputs)
112
+
113
+
114
+ @attr('slow')
115
+ def test_finetune_full():
116
+ """ finetuning using 'full'.
117
+ """
118
+ DATASET_PATH = ROOT_PATH+'/data/SS-Youtube/raw.pickle'
119
+ nb_classes = 2
120
+ # Keras and pyTorch implementation of the Adam optimizer are slightly different and change a bit the results
121
+ # We reduce the min accuracy needed here to pass the test
122
+ # See e.g. https://discuss.pytorch.org/t/suboptimal-convergence-when-compared-with-tensorflow-model/5099/11
123
+ min_acc = 0.68
124
+
125
+ with open(VOCAB_PATH, 'r') as f:
126
+ vocab = json.load(f)
127
+
128
+ data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
129
+ print('Loading pyTorch model from {}.'.format(PRETRAINED_PATH))
130
+ model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added'])
131
+ print(model)
132
+ model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
133
+ data['batch_size'], method='full', nb_epochs=1)
134
+
135
+ print("Finetune full SS-Youtube 1 epoch acc: {}".format(acc))
136
+ assert acc >= min_acc
137
+
138
+
139
+ @attr('slow')
140
+ def test_finetune_last():
141
+ """ finetuning using 'last'.
142
+ """
143
+ dataset_path = ROOT_PATH + '/data/SS-Youtube/raw.pickle'
144
+ nb_classes = 2
145
+ min_acc = 0.68
146
+
147
+ with open(VOCAB_PATH, 'r') as f:
148
+ vocab = json.load(f)
149
+
150
+ data = load_benchmark(dataset_path, vocab)
151
+ print('Loading model from {}.'.format(PRETRAINED_PATH))
152
+ model = torchmoji_transfer(nb_classes, PRETRAINED_PATH)
153
+ print(model)
154
+ model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
155
+ data['batch_size'], method='last', nb_epochs=1)
156
+
157
+ print("Finetune last SS-Youtube 1 epoch acc: {}".format(acc))
158
+
159
+ assert acc >= min_acc
160
+
161
+
162
+ def test_score_emoji():
163
+ """ Emoji predictions make sense.
164
+ """
165
+ test_sentences = [
166
+ 'I love mom\'s cooking',
167
+ 'I love how you never reply back..',
168
+ 'I love cruising with my homies',
169
+ 'I love messing with yo mind!!',
170
+ 'I love you and now you\'re just gone..',
171
+ 'This is shit',
172
+ 'This is the shit'
173
+ ]
174
+
175
+ expected = [
176
+ np.array([36, 4, 8, 16, 47]),
177
+ np.array([1, 19, 55, 25, 46]),
178
+ np.array([31, 6, 30, 15, 13]),
179
+ np.array([54, 44, 9, 50, 49]),
180
+ np.array([46, 5, 27, 35, 34]),
181
+ np.array([55, 32, 27, 1, 37]),
182
+ np.array([48, 11, 6, 31, 9])
183
+ ]
184
+
185
+ def top_elements(array, k):
186
+ ind = np.argpartition(array, -k)[-k:]
187
+ return ind[np.argsort(array[ind])][::-1]
188
+
189
+ # Initialize by loading dictionary and tokenize texts
190
+ with open(VOCAB_PATH, 'r') as f:
191
+ vocabulary = json.load(f)
192
+
193
+ st = SentenceTokenizer(vocabulary, 30)
194
+ tokens, _, _ = st.tokenize_sentences(test_sentences)
195
+
196
+ # Load model and run
197
+ model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
198
+ prob = model(tokens)
199
+
200
+ # Find top emojis for each sentence
201
+ for i, t_prob in enumerate(list(prob)):
202
+ assert np.array_equal(top_elements(t_prob, 5), expected[i])
203
+
204
+
205
+ def test_encode_texts():
206
+ """ Text encoding is stable.
207
+ """
208
+
209
+ TEST_SENTENCES = ['I love mom\'s cooking',
210
+ 'I love how you never reply back..',
211
+ 'I love cruising with my homies',
212
+ 'I love messing with yo mind!!',
213
+ 'I love you and now you\'re just gone..',
214
+ 'This is shit',
215
+ 'This is the shit']
216
+
217
+
218
+ maxlen = 30
219
+ batch_size = 32
220
+
221
+ with open(VOCAB_PATH, 'r') as f:
222
+ vocabulary = json.load(f)
223
+
224
+ st = SentenceTokenizer(vocabulary, maxlen)
225
+
226
+ print('Loading model from {}.'.format(PRETRAINED_PATH))
227
+ model = torchmoji_feature_encoding(PRETRAINED_PATH)
228
+ print(model)
229
+ tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
230
+ encoding = model(tokenized)
231
+
232
+ avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
233
+ assert np.allclose(avg_across_sentences, np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
234
+
235
+ test_encode_texts()
tests/test_helper.py ADDED
@@ -0,0 +1,6 @@
+ """ Module import helper.
+ Modifies PATH in order to allow us to import the torchmoji directory.
+ """
+ import sys
+ from os.path import abspath, dirname
+ sys.path.insert(0, dirname(dirname(abspath(__file__))))
tests/test_sentence_tokenizer.py ADDED
@@ -0,0 +1,113 @@
1
+ from __future__ import absolute_import, print_function, division, unicode_literals
2
+ import test_helper
3
+ import json
4
+
5
+ from torchmoji.sentence_tokenizer import SentenceTokenizer
6
+
7
+ sentences = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
8
+
9
+ dicts = [
10
+ {'label': 0},
11
+ {'label': 1},
12
+ {'label': 2},
13
+ {'label': 3},
14
+ {'label': 4},
15
+ {'label': 5},
16
+ {'label': 6},
17
+ {'label': 7},
18
+ {'label': 8},
19
+ {'label': 9},
20
+ ]
21
+
22
+ train_ind = [0, 5, 3, 6, 8]
23
+ val_ind = [9, 2, 1]
24
+ test_ind = [4, 7]
25
+
26
+ with open('../model/vocabulary.json', 'r') as f:
27
+ vocab = json.load(f)
28
+
29
+ def test_dataset_split_parameter():
30
+ """ Dataset is split in the desired ratios
31
+ """
32
+ split_parameter = [0.7, 0.1, 0.2]
33
+ st = SentenceTokenizer(vocab, 30)
34
+
35
+ result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
36
+ split_parameter, extend_with=0)
37
+ train = result[0]
38
+ val = result[1]
39
+ test = result[2]
40
+
41
+ train_dicts = result_dicts[0]
42
+ val_dicts = result_dicts[1]
43
+ test_dicts = result_dicts[2]
44
+
45
+ assert len(train) == len(sentences) * split_parameter[0]
46
+ assert len(val) == len(sentences) * split_parameter[1]
47
+ assert len(test) == len(sentences) * split_parameter[2]
48
+
49
+ assert len(train_dicts) == len(dicts) * split_parameter[0]
50
+ assert len(val_dicts) == len(dicts) * split_parameter[1]
51
+ assert len(test_dicts) == len(dicts) * split_parameter[2]
52
+
53
+ def test_dataset_split_explicit():
54
+ """ Dataset is split according to given indices
55
+ """
56
+ split_parameter = [train_ind, val_ind, test_ind]
57
+ st = SentenceTokenizer(vocab, 30)
58
+ tokenized, _, _ = st.tokenize_sentences(sentences)
59
+
60
+ result, result_dicts, added = st.split_train_val_test(sentences, dicts, split_parameter, extend_with=0)
61
+ train = result[0]
62
+ val = result[1]
63
+ test = result[2]
64
+
65
+ train_dicts = result_dicts[0]
66
+ val_dicts = result_dicts[1]
67
+ test_dicts = result_dicts[2]
68
+
69
+ tokenized = tokenized
70
+
71
+ for i, sentence in enumerate(sentences):
72
+ if i in train_ind:
73
+ assert tokenized[i] in train
74
+ assert dicts[i] in train_dicts
75
+ elif i in val_ind:
76
+ assert tokenized[i] in val
77
+ assert dicts[i] in val_dicts
78
+ elif i in test_ind:
79
+ assert tokenized[i] in test
80
+ assert dicts[i] in test_dicts
81
+
82
+ assert len(train) == len(train_ind)
83
+ assert len(val) == len(val_ind)
84
+ assert len(test) == len(test_ind)
85
+ assert len(train_dicts) == len(train_ind)
86
+ assert len(val_dicts) == len(val_ind)
87
+ assert len(test_dicts) == len(test_ind)
88
+
89
+ def test_id_to_sentence():
90
+ """Tokenizing and converting back preserves the input.
91
+ """
92
+ vb = {'CUSTOM_MASK': 0,
93
+ 'aasdf': 1000,
94
+ 'basdf': 2000}
95
+
96
+ sentence = 'aasdf basdf basdf basdf'
97
+ st = SentenceTokenizer(vb, 30)
98
+ token, _, _ = st.tokenize_sentences([sentence])
99
+ assert st.to_sentence(token[0]) == sentence
100
+
101
+ def test_id_to_sentence_with_unknown():
102
+ """Tokenizing and converting back preserves the input, except for unknowns.
103
+ """
104
+ vb = {'CUSTOM_MASK': 0,
105
+ 'CUSTOM_UNKNOWN': 1,
106
+ 'aasdf': 1000,
107
+ 'basdf': 2000}
108
+
109
+ sentence = 'aasdf basdf ccc'
110
+ expected = 'aasdf basdf CUSTOM_UNKNOWN'
111
+ st = SentenceTokenizer(vb, 30)
112
+ token, _, _ = st.tokenize_sentences([sentence])
113
+ assert st.to_sentence(token[0]) == expected
tests/test_tokenizer.py ADDED
@@ -0,0 +1,167 @@
1
+ # -*- coding: utf-8 -*-
2
+ """ Tokenization tests.
3
+ """
4
+ from __future__ import absolute_import, print_function, division, unicode_literals
5
+
6
+ import sys
7
+ from nose.tools import nottest
8
+ from os.path import dirname, abspath
9
+ sys.path.append(dirname(dirname(abspath(__file__))))
10
+ from torchmoji.tokenizer import tokenize
11
+
12
+ TESTS_NORMAL = [
13
+ ('200K words!', ['200', 'K', 'words', '!']),
14
+ ]
15
+
16
+ TESTS_EMOJIS = [
17
+ ('i \U0001f496 you to the moon and back',
18
+ ['i', '\U0001f496', 'you', 'to', 'the', 'moon', 'and', 'back']),
19
+ ("i\U0001f496you to the \u2605's and back",
20
+ ['i', '\U0001f496', 'you', 'to', 'the',
21
+ '\u2605', "'", 's', 'and', 'back']),
22
+ ('~<3~', ['~', '<3', '~']),
23
+ ('<333', ['<333']),
24
+ (':-)', [':-)']),
25
+ ('>:-(', ['>:-(']),
26
+ ('\u266b\u266a\u2605\u2606\u2665\u2764\u2661',
27
+ ['\u266b', '\u266a', '\u2605', '\u2606',
28
+ '\u2665', '\u2764', '\u2661']),
29
+ ]
30
+
31
+ TESTS_URLS = [
32
+ ('www.sample.com', ['www.sample.com']),
33
+ ('http://endless.horse', ['http://endless.horse']),
34
+ ('https://github.mit.ed', ['https://github.mit.ed']),
35
+ ]
36
+
37
+ TESTS_TWITTER = [
38
+ ('#blacklivesmatter', ['#blacklivesmatter']),
39
+ ('#99_percent.', ['#99_percent', '.']),
40
+ ('the#99%', ['the', '#99', '%']),
41
+ ('@golden_zenith', ['@golden_zenith']),
42
+ ('@99_percent', ['@99_percent']),
43
+ ('latte-express@mit.ed', ['latte-express@mit.ed']),
44
+ ]
45
+
46
+ TESTS_PHONE_NUMS = [
47
+ ('518)528-0252', ['518', ')', '528', '-', '0252']),
48
+ ('1200-0221-0234', ['1200', '-', '0221', '-', '0234']),
49
+ ('1200.0221.0234', ['1200', '.', '0221', '.', '0234']),
50
+ ]
51
+
52
+ TESTS_DATETIME = [
53
+ ('15:00', ['15', ':', '00']),
54
+ ('2:00pm', ['2', ':', '00', 'pm']),
55
+ ('9/14/16', ['9', '/', '14', '/', '16']),
56
+ ]
57
+
58
+ TESTS_CURRENCIES = [
59
+ ('517.933\xa3', ['517', '.', '933', '\xa3']),
60
+ ('$517.87', ['$', '517', '.', '87']),
61
+ ('1201.6598', ['1201', '.', '6598']),
62
+ ('120,6', ['120', ',', '6']),
63
+ ('10,00\u20ac', ['10', ',', '00', '\u20ac']),
64
+ ('1,000', ['1', ',', '000']),
65
+ ('1200pesos', ['1200', 'pesos']),
66
+ ]
67
+
68
+ TESTS_NUM_SYM = [
69
+ ('5162f', ['5162', 'f']),
70
+ ('f5162', ['f', '5162']),
71
+ ('1203(', ['1203', '(']),
72
+ ('(1203)', ['(', '1203', ')']),
73
+ ('1200/', ['1200', '/']),
74
+ ('1200+', ['1200', '+']),
75
+ ('1202o-east', ['1202', 'o-east']),
76
+ ('1200r', ['1200', 'r']),
77
+ ('1200-1400', ['1200', '-', '1400']),
78
+ ('120/today', ['120', '/', 'today']),
79
+ ('today/120', ['today', '/', '120']),
80
+ ('120/5', ['120', '/', '5']),
81
+ ("120'/5", ['120', "'", '/', '5']),
82
+ ('120/5pro', ['120', '/', '5', 'pro']),
83
+ ("1200's,)", ['1200', "'", 's', ',', ')']),
84
+ ('120.76.218.207', ['120', '.', '76', '.', '218', '.', '207']),
85
+ ]
86
+
87
+ TESTS_PUNCTUATION = [
88
+ ("don''t", ['don', "''", 't']),
89
+ ("don'tcha", ["don'tcha"]),
90
+ ('no?!?!;', ['no', '?', '!', '?', '!', ';']),
91
+ ('no??!!..', ['no', '??', '!!', '..']),
92
+ ('a.m.', ['a.m.']),
93
+ ('.s.u', ['.', 's', '.', 'u']),
94
+ ('!!i..n__', ['!!', 'i', '..', 'n', '__']),
95
+ ('lv(<3)w(3>)u Mr.!', ['lv', '(', '<3', ')', 'w', '(', '3',
96
+ '>', ')', 'u', 'Mr.', '!']),
97
+ ('-->', ['--', '>']),
98
+ ('->', ['-', '>']),
99
+ ('<-', ['<', '-']),
100
+ ('<--', ['<', '--']),
101
+ ('hello (@person)', ['hello', '(', '@person', ')']),
102
+ ]
103
+
104
+
105
+ def test_normal():
106
+ """ Normal/combined usage.
107
+ """
108
+ test_base(TESTS_NORMAL)
109
+
110
+
111
+ def test_emojis():
112
+ """ Tokenizing emojis/emoticons/decorations.
113
+ """
114
+ test_base(TESTS_EMOJIS)
115
+
116
+
117
+ def test_urls():
118
+ """ Tokenizing URLs.
119
+ """
120
+ test_base(TESTS_URLS)
121
+
122
+
123
+ def test_twitter():
124
+ """ Tokenizing hashtags, mentions and emails.
125
+ """
126
+ test_base(TESTS_TWITTER)
127
+
128
+
129
+ def test_phone_nums():
130
+ """ Tokenizing phone numbers.
131
+ """
132
+ test_base(TESTS_PHONE_NUMS)
133
+
134
+
135
+ def test_datetime():
136
+ """ Tokenizing dates and times.
137
+ """
138
+ test_base(TESTS_DATETIME)
139
+
140
+
141
+ def test_currencies():
142
+ """ Tokenizing currencies.
143
+ """
144
+ test_base(TESTS_CURRENCIES)
145
+
146
+
147
+ def test_num_sym():
148
+ """ Tokenizing combinations of numbers and symbols.
149
+ """
150
+ test_base(TESTS_NUM_SYM)
151
+
152
+
153
+ def test_punctuation():
154
+ """ Tokenizing punctuation and contractions.
155
+ """
156
+ test_base(TESTS_PUNCTUATION)
157
+
158
+
159
+ @nottest
160
+ def test_base(tests):
161
+ """ Base function for running tests.
162
+ """
163
+ for (test, expected) in tests:
164
+ actual = tokenize(test)
165
+ assert actual == expected, \
166
+ "Tokenization of \'{}\' failed, expected: {}, actual: {}"\
167
+ .format(test, expected, actual)
tests/test_word_generator.py ADDED
@@ -0,0 +1,73 @@
1
+ # -*- coding: utf-8 -*-
2
+ import sys
3
+ from os.path import dirname, abspath
4
+ sys.path.append(dirname(dirname(abspath(__file__))))
5
+ from nose.tools import raises
6
+ from torchmoji.word_generator import WordGenerator
7
+
8
+ IS_PYTHON2 = int(sys.version[0]) == 2
9
+
10
+ @raises(ValueError)
11
+ def test_only_unicode_accepted():
12
+ """ Non-Unicode strings raise a ValueError.
13
+ In Python 3 all string are Unicode
14
+ """
15
+ if not IS_PYTHON2:
16
+ raise ValueError("You are using python 3 so this test should always pass")
17
+
18
+ sentences = [
19
+ u'Hello world',
20
+ u'I am unicode',
21
+ 'I am not unicode',
22
+ ]
23
+
24
+ wg = WordGenerator(sentences)
25
+ for w in wg:
26
+ pass
27
+
28
+
29
+ def test_unicode_sentences_ignored_if_set():
30
+ """ Strings with Unicode characters tokenize to empty array if they're not allowed.
31
+ """
32
+ sentence = [u'Dobrý den, jak se máš?']
33
+ wg = WordGenerator(sentence, allow_unicode_text=False)
34
+ assert wg.get_words(sentence[0]) == []
35
+
36
+
37
+ def test_check_ascii():
38
+ """ check_ascii recognises ASCII words properly.
39
+ In Python 3 all string are Unicode
40
+ """
41
+ if not IS_PYTHON2:
42
+ return
43
+
44
+ wg = WordGenerator([])
45
+ assert wg.check_ascii('ASCII')
46
+ assert not wg.check_ascii('ščřžýá')
47
+ assert not wg.check_ascii('❤ ☀ ☆ ☂ ☻ ♞ ☯ ☭ ☢')
48
+
49
+
50
+ def test_convert_unicode_word():
51
+ """ convert_unicode_word converts Unicode words correctly.
52
+ """
53
+ wg = WordGenerator([], allow_unicode_text=True)
54
+
55
+ result = wg.convert_unicode_word(u'č')
56
+ assert result == (True, u'\u010d'), '{}'.format(result)
57
+
58
+
59
+ def test_convert_unicode_word_ignores_if_set():
60
+ """ convert_unicode_word ignores Unicode words if set.
61
+ """
62
+ wg = WordGenerator([], allow_unicode_text=False)
63
+
64
+ result = wg.convert_unicode_word(u'č')
65
+ assert result == (False, ''), '{}'.format(result)
66
+
67
+
68
+ def test_convert_unicode_chars():
69
+ """ convert_unicode_word correctly converts accented characters.
70
+ """
71
+ wg = WordGenerator([], allow_unicode_text=True)
72
+ result = wg.convert_unicode_word(u'ěščřžýáíé')
73
+ assert result == (True, u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9'), '{}'.format(result)