LuisV committed
Commit dfd271a · 1 Parent(s): 4859d06

adding artemis package

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. imageprocessing/artemis/LICENSE +23 -0
  2. imageprocessing/artemis/README.md +160 -0
  3. imageprocessing/artemis/artemis/__init__.py +6 -0
  4. imageprocessing/artemis/artemis/analysis/__init__.py +4 -0
  5. imageprocessing/artemis/artemis/analysis/emotion_centric.py +72 -0
  6. imageprocessing/artemis/artemis/analysis/feature_extraction.py +84 -0
  7. imageprocessing/artemis/artemis/analysis/paintings_meta_data.py +26 -0
  8. imageprocessing/artemis/artemis/analysis/utils.py +80 -0
  9. imageprocessing/artemis/artemis/captioning/__init__.py +4 -0
  10. imageprocessing/artemis/artemis/captioning/sample_captions.py +78 -0
  11. imageprocessing/artemis/artemis/captioning/senti_cap_anps.py +111 -0
  12. imageprocessing/artemis/artemis/data/glove.6B.100d.vocabulary.txt +0 -0
  13. imageprocessing/artemis/artemis/data/image-emotion-histogram.csv +0 -0
  14. imageprocessing/artemis/artemis/data/speaker_sampling_configs/full_hyper_param_ablation.json.txt +182 -0
  15. imageprocessing/artemis/artemis/data/speaker_sampling_configs/mini_hyper_param_ablation.json.txt +12 -0
  16. imageprocessing/artemis/artemis/data/speaker_sampling_configs/selected_hyper_params.json.txt +7 -0
  17. imageprocessing/artemis/artemis/data/symspell_frequency_dictionary_en_82_765.txt +0 -0
  18. imageprocessing/artemis/artemis/data/wiki_art_duplicate_paintings.pkl +3 -0
  19. imageprocessing/artemis/artemis/data/wiki_art_genre_class.csv +0 -0
  20. imageprocessing/artemis/artemis/emotions.py +79 -0
  21. imageprocessing/artemis/artemis/evaluation/__init__.py +7 -0
  22. imageprocessing/artemis/artemis/evaluation/bleu.py +34 -0
  23. imageprocessing/artemis/artemis/evaluation/emotion_alignment.py +87 -0
  24. imageprocessing/artemis/artemis/evaluation/longest_common_subseq.py +63 -0
  25. imageprocessing/artemis/artemis/evaluation/metaphors.py +42 -0
  26. imageprocessing/artemis/artemis/evaluation/single_caption_per_image.py +214 -0
  27. imageprocessing/artemis/artemis/in_out/__init__.py +4 -0
  28. imageprocessing/artemis/artemis/in_out/arguments.py +199 -0
  29. imageprocessing/artemis/artemis/in_out/basics.py +230 -0
  30. imageprocessing/artemis/artemis/in_out/cleaning.py +87 -0
  31. imageprocessing/artemis/artemis/in_out/coco.py +30 -0
  32. imageprocessing/artemis/artemis/in_out/datasets.py +224 -0
  33. imageprocessing/artemis/artemis/in_out/neural_net_oriented.py +336 -0
  34. imageprocessing/artemis/artemis/language/__init__.py +4 -0
  35. imageprocessing/artemis/artemis/language/adjective_noun_pairs.py +44 -0
  36. imageprocessing/artemis/artemis/language/basics.py +132 -0
  37. imageprocessing/artemis/artemis/language/language_preprocessing.py +224 -0
  38. imageprocessing/artemis/artemis/language/part_of_speech.py +40 -0
  39. imageprocessing/artemis/artemis/language/spelling.py +634 -0
  40. imageprocessing/artemis/artemis/neural_models/__init__.py +4 -0
  41. imageprocessing/artemis/artemis/neural_models/attention.py +45 -0
  42. imageprocessing/artemis/artemis/neural_models/attentive_decoder.py +696 -0
  43. imageprocessing/artemis/artemis/neural_models/distances.py +67 -0
  44. imageprocessing/artemis/artemis/neural_models/image_emotion_clf.py +75 -0
  45. imageprocessing/artemis/artemis/neural_models/lstm_encoder.py +95 -0
  46. imageprocessing/artemis/artemis/neural_models/mlp.py +78 -0
  47. imageprocessing/artemis/artemis/neural_models/resnet_encoder.py +103 -0
  48. imageprocessing/artemis/artemis/neural_models/show_attend_tell.py +45 -0
  49. imageprocessing/artemis/artemis/neural_models/text_emotional_clf.py +94 -0
  50. imageprocessing/artemis/artemis/neural_models/word_embeddings.py +123 -0
imageprocessing/artemis/LICENSE ADDED
@@ -0,0 +1,23 @@
+ ArtEmis: Affective Language for Art
+
+ The MIT License (MIT)
+
+ Copyright (c) 2021 Panos Achlioptas
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
imageprocessing/artemis/README.md ADDED
@@ -0,0 +1,160 @@
+ ## ArtEmis: Affective Language for Visual Art
+ A codebase created and maintained by <a href="https://ai.stanford.edu/~optas" target="_blank">Panos Achlioptas</a>.
+
+ ![representative](https://github.com/optas/artemis/blob/master/doc/images/speaker_productions_teaser.png)
+
+
+ ### Introduction
+ This work is based on the [arXiv tech report](https://arxiv.org/abs/2101.07396), which has been __provisionally__ accepted to [CVPR-2021](http://cvpr2021.thecvf.com/) for an <b>Oral</b> presentation.
+
+ ### Citation
+ If you find this work useful in your research, please consider citing:
+
+     @article{achlioptas2021artemis,
+         title={ArtEmis: Affective Language for Visual Art},
+         author={Achlioptas, Panos and Ovsjanikov, Maks and Haydarov, Kilichbek and
+                 Elhoseiny, Mohamed and Guibas, Leonidas},
+         journal = {CoRR},
+         volume = {abs/2101.07396},
+         year={2021}
+     }
+
+ ### Dataset
+ To get the most out of this repo, please __download__ the data associated with ArtEmis by filling in this [form](https://forms.gle/7eqiRgb764uTuexd7).
+
+ ### Installation
+ This code has been tested with Python 3.6.9, PyTorch 1.3.1, and CUDA 10.0 on Ubuntu 16.04.
+
+ Assuming some (potentially virtual) environment with __Python 3.x__:
+ ```Console
+ git clone https://github.com/optas/artemis.git
+ cd artemis
+ pip install -e .
+ ```
+ This will install the repo with all its dependencies (listed in setup.py) and will enable you to do things like:
+ ```
+ from artemis.models import xx
+ ```
+ (provided you add this artemis repo to your PYTHONPATH)
+
+ ### Playing with ArtEmis
+
+ #### Step-1 (important &nbsp; :pushpin:)
+
+ __Preprocess the provided annotations__ (spell-check, patch, tokenize, make train/val/test splits, etc.).
+ ```Console
+ artemis/scripts/preprocess_artemis_data.py
+ ```
+ This script allows you to preprocess ArtEmis according to your needs. The __default__ arguments will do __minimal__
+ preprocessing, so the resulting output can be used to _fairly_ compare ArtEmis with other datasets and to derive the most _faithful_ statistics
+ about ArtEmis's nature. That is what we used in our __analysis__ and what you should use in "Step-2" below. With this in mind do:
+ ```Console
+ python artemis/scripts/preprocess_artemis_data.py -save-out-dir <ADD_YOURS> -raw-artemis-data-csv <ADD_YOURS>
+ ```
+
+ If you wish to train __deep-nets__ (speakers, emotion-classifiers etc.) *exactly* as we did in our paper, then you need to rerun this script
+ by providing only a single extra optional argument ("__--preprocess-for-deep-nets True__"). This will do more aggressive filtering and you should use its output for
+ "Step-3" and "Step-4" below. Use a different save-out-dir to avoid overwriting the output of previous runs.
+ ```Console
+ python artemis/scripts/preprocess_artemis_data.py -save-out-dir <ADD_YOURS> -raw-artemis-data-csv <ADD_YOURS> --preprocess-for-deep-nets True
+ ```
+ To understand and customize the different hyper-parameters, please read the details in the provided _help_ messages of the used argparse.
+
+ #### Step-2
+ __Analyze & explore the dataset__. :microscope:
+
+ Using the _minimally_ preprocessed version of ArtEmis, which includes __all__ (454,684) collected annotations:
+
+ 1. This is a great place to __start__ :checkered_flag:. Run this [notebook](artemis/notebooks/analysis/analyzing_artemis.ipynb) to do basic _linguistic_, _emotion_ & _art-oriented_ __analysis__ of the ArtEmis dataset.
+ 2. Run this [notebook](artemis/notebooks/analysis/concreteness_subjectivity_sentiment_and_POS.ipynb) to analyze ArtEmis in terms of its: _concreteness_, _subjectivity_, _sentiment_ and _Parts-of-Speech_. Optionally, contrast these values
+ with other common datasets like COCO.
+ 3. Run this [notebook](artemis/notebooks/analysis/extract_emotion_histogram_per_image.ipynb) to extract the _emotion histograms_ (empirical distributions) of each artwork. This is __necessary__ for Step-3 (1).
+ 4. Run this [notebook](artemis/notebooks/analysis/emotion_entropy_per_genre_or_artstyle.ipynb) to analyze the extracted emotion histograms (previous step) per art genre and style.
+
+ #### Step-3
+
+ __Train and evaluate emotion-centric image & text classifiers__. :hearts:
+
+ Using the preprocessed version of ArtEmis for __deep-nets__, which includes 429,431 annotations.
+ (Training on a single GPU from scratch is a matter of __minutes__ for these classifiers!)
+
+ 1. Run this [notebook](artemis/notebooks/deep_nets/emotions/image_to_emotion_classifier.ipynb) to train an __image-to-emotion__ classifier.
+ 2. Run this [notebook](artemis/notebooks/deep_nets/emotions/utterance_to_emotion_classifier.ipynb) to train an LSTM-based __utterance-to-emotion__ classifier. Or, this [notebook](artemis/notebooks/deep_nets/emotions/utterance_to_emotion_with_transformer.ipynb) to train a BERT-based one.
+
+
+ #### Step-4
+ __Train & evaluate neural-speakers.__ :bomb:
+
+ - To __train__ our customized SAT model on ArtEmis (__~2 hours__ to train on a single GPU!) do:
+ ```Console
+ python artemis/scripts/train_speaker.py -log-dir <ADD_YOURS> -data-dir <ADD_YOURS> -img-dir <ADD_YOURS>
+
+ log-dir:  where to save the output of the training process, models etc.
+ data-dir: directory that contains the _input_ data, i.e.,
+           the directory that contains the output of preprocess_artemis_data.py: e.g.,
+           the artemis_preprocessed.csv, the vocabulary.pkl
+ img-dir:  the top folder containing the WikiArt image dataset in its "standard" format:
+           img-dir/art_style/painting_xx.jpg
+ ```
+
+ Note: the default optional arguments will create the same vanilla-speaker variant we used in the CVPR21 paper.
+
+ - To __train__ the __emotionally-grounded__ variant of SAT, add an extra parameter to the above call:
+ ```Console
+ python artemis/scripts/train_speaker.py -log-dir <ADD_YOURS> -data-dir <ADD_YOURS> -img-dir <ADD_YOURS>
+ --use-emo-grounding True
+ ```
+ - To __sample__ utterances from a trained speaker:
+ ```Console
+ python artemis/scripts/sample_speaker.py -arguments
+ ```
+ For an explanation of the arguments, see the argparse help messages. It is worth noting that when you
+ want to sample from an emotionally-grounded variant you need to provide a pretrained image2emotion
+ classifier. The image2emotion classifier will be used to deduce _the most likely_ emotion of an image, and input this emotion to
+ the speaker. See Step-3 (1) for how to train such a net.
+
+ - To __evaluate__ the quality of the sampled captions (e.g., per BLEU, emotional alignment, metaphors etc.) use this
+ [notebook](artemis/notebooks/deep_nets/speakers/evaluate_sampled_captions.ipynb). As a bonus you can use it to inspect the _neural attention_ placed on
+ the different tokens/images.
+
+ ### MISC
+ - You can make a _pseudo_ "neural speaker" by copying training sentences to the test split according to __Nearest-Neighbors__ in a pretrained
+ network feature space by running this 5 min. [notebook](artemis/notebooks/deep_nets/speakers/nearest_neighbor_speaker.ipynb).
+
+
+ ### Pretrained Models (used in CVPR21-paper)
+ * [Image-To-Emotion classifier (81MB)](https://www.dropbox.com/s/8dfj3b36q15iieo/best_model.pt?dl=0)
+   - use it within the notebook of Step-3.1 or to _sample_ an emotionally grounded speaker (Step-4, sampling).
+
+ * [LSTM-based Text-To-Emotion classifier (8MB)](https://www.dropbox.com/s/ruczzggqu1i6nof/best_model.pt?dl=0)
+   - use it within the notebook of Step-3.2 or to _evaluate_ the samples of a speaker (Step-4, evaluation) | e.g., needed for emotional-alignment.
+
+ * [SAT-Speaker (434MB)](https://www.dropbox.com/s/tnbfws0m3yi06ge/vanilla_sat_speaker_cvpr21.zip?dl=0)
+ * [SAT-Speaker-with-emotion-grounding (431MB)](https://www.dropbox.com/s/0erh464wag8ods1/emo_grounded_sat_speaker_cvpr21.zip?dl=0)
+
+ + The above two links also include our _sampled captions_ for the test-split. You can use them to evaluate the speakers without resampling them. Please read the included README.txt.
+
+ + __Caveats__: ArtEmis is a real-world dataset containing the opinions and sentiments of thousands of people. It is thus expected to contain text with biases, factual inaccuracies, and perhaps foul language. Please use responsibly.
+ The provided models are likely to be biased and/or inaccurate in ways reflected in the training data.
+
+ ### News
+
+ - :champagne: &nbsp; ArtEmis has already attracted some notable media coverage. E.g., @ [New-Scientist](https://www.newscientist.com/article/2266240-ai-art-critic-can-predict-which-emotions-a-painting-will-evoke),
+ [HAI](https://hai.stanford.edu/news/artists-intent-ai-recognizes-emotions-visual-art),
+ [MarkTechPost](https://www.marktechpost.com/2021/01/30/stanford-researchers-introduces-artemis-a-dataset-containing-439k-emotion-attributions),
+ [KCBS-Radio](https://ai.stanford.edu/~optas/data/interviews/artemis/kcbs/SAT-AI-ART_2_2-6-21(disco_mix).mp3),
+ [Communications of ACM](https://cacm.acm.org/news/250312-ai-art-critic-can-predict-which-emotions-a-painting-will-evoke/fulltext),
+ [Synced Review](https://medium.com/@Synced/ai-art-critic-new-dataset-and-models-make-emotional-sense-of-visual-artworks-2289c6c71299),
+ [École Polytechnique](https://www.polytechnique.edu/fr/content/des-algorithmes-emotifs-face-des-oeuvres-dart),
+ [Forbes Science](https://www.forbes.com/sites/evaamsen/2021/03/30/artificial-intelligence-is-learning-to-categorize-and-talk-about-art/).
+
+ - :telephone_receiver: &nbsp; __Important:__ more code will be added in April, namely for the ANP-baseline and the comparisons of ArtEmis with other datasets; please do a git-pull at that time. The update will be _seamless_! During these first months, if you have _ANY_ question feel free to send me an email at __optas@stanford.edu__.
+
+ - :trophy: &nbsp; If you are developing more models with ArtEmis and you want to incorporate them here, please talk to me or simply do a pull-request.
+
+
+ #### License
+ This code is released under the MIT License (see the LICENSE file for details).
+ _In simple words, if you copy/use parts of this code please __keep the copyright note__ in place._
+
+
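As a quick sanity check that the editable install above is visible to Python, a minimal sketch (it only touches constants defined in `artemis/emotions.py`, shown further down in this commit):
```Python
# Minimal sketch: verify the install and inspect the nine ArtEmis emotion labels.
from artemis.emotions import ARTEMIS_EMOTIONS, emotion_to_int

print(len(ARTEMIS_EMOTIONS))   # 9
print(emotion_to_int('awe'))   # 1, the integer encoding used throughout the repo
```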
imageprocessing/artemis/artemis/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """
+ The MIT License (MIT)
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
+ from .in_out.basics import files_in_subdirs
+ from .in_out.basics import pickle_data, unpickle_data
imageprocessing/artemis/artemis/analysis/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ The MIT License (MIT)
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
imageprocessing/artemis/artemis/analysis/emotion_centric.py ADDED
@@ -0,0 +1,72 @@
+ """
+ Utilities for emotion-centric analysis.
+
+ The MIT License (MIT)
+ Originally created at 10/22/20, for Python 3.x
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
+
+ import pandas as pd
+ import matplotlib.pylab as plt
+
+ from ..emotions import ARTEMIS_EMOTIONS, positive_negative_else
+
+
+ def df_to_emotion_histogram(df, palette=plt.cm.Pastel1, emotion_column='emotion', verbose=False):
+     """ Take a dataset like ArtEmis and return a histogram over the emotion choices made by the annotators.
+     :param df: dataframe carrying the dataset
+     :param palette: matplotlib color palette, e.g., plt.cm.jet
+     :param emotion_column: (str) indicate which column of the dataframe carries the emotion
+     :return: a list carrying the resulting histogram figures.
+     """
+     hist_vals = []
+     for emotion in ARTEMIS_EMOTIONS:
+         hist_vals.append(sum(df[emotion_column] == emotion) / len(df))
+
+     norm = plt.Normalize(min(hist_vals), max(hist_vals))
+     colors = palette(norm(hist_vals))
+
+     s = pd.DataFrame({"emotions": ARTEMIS_EMOTIONS, "vals": hist_vals})
+     s.set_index("emotions", drop=True, inplace=True)
+     plt.figure()
+     s.index.name = None
+     ax = s.plot.bar(grid=True, figsize=(12, 4), color=colors, fontsize=16, rot=45, legend=False, ec="k")
+     ax.set_ylabel('Percentage of data', fontsize=15)
+
+     for rec, col in zip(ax.patches, colors):
+         rec.set_color(col)
+
+     plt.tight_layout()
+     res = [plt.gcf()]
+
+     plt.figure()
+     s = df[emotion_column].apply(positive_negative_else).value_counts() / len(df)
+
+     if verbose:
+         print('Pos-Neg-Else, percents:', s.round(3))
+
+     ax = s.plot.bar(grid=True, figsize=(8, 4), fontsize=16, rot=45, legend=False, color='gray')
+     ax.set_xticklabels(['positive', 'negative', 'else'])
+     plt.tight_layout()
+     res.append(plt.gcf())
+
+     return res
+
+
+ def has_emotion_max_dominance(grouped_df, exclude_se=False, return_max=False):
+     """ I.e., the same emotion was selected (among all nine emotions) by at least half of the annotators.
+     :param grouped_df: dataframe of the dataset grouped by stimuli, e.g., images.
+     :param exclude_se: if True, ignore the groups where the maximizer is the something-else category
+     :param return_max: for each group that has dominance, also return the emotion type that gathered the maximum number of annotations.
+     :return:
+     """
+     vals = grouped_df.emotion.value_counts()
+     maxim = vals.max()
+     threshold = vals.sum() / 2
+     res = maxim >= threshold
+     if exclude_se:
+         res &= vals.idxmax() != 'something else'
+     if return_max:
+         return res, vals.idxmax()
+     else:
+         return res
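A minimal usage sketch for `has_emotion_max_dominance` on a hypothetical toy dataframe (the `painting`/`emotion` column names follow the repo's conventions):
```Python
import pandas as pd
from artemis.analysis.emotion_centric import has_emotion_max_dominance

toy = pd.DataFrame({'painting': ['a', 'a', 'a', 'b', 'b', 'b'],
                    'emotion':  ['awe', 'awe', 'fear', 'awe', 'fear', 'sadness']})
# One boolean per painting: does a single emotion gather at least half of its annotations?
print(toy.groupby('painting').apply(has_emotion_max_dominance))  # a -> True, b -> False
```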
imageprocessing/artemis/artemis/analysis/feature_extraction.py ADDED
@@ -0,0 +1,84 @@
1
+ """
2
+ Routines to extract features from images.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 6/14/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import torchvision.transforms as transforms
11
+ import numpy as np
12
+ from PIL import Image
13
+ from torchvision import models
14
+
15
+ from ..in_out.datasets import ImageClassificationDataset
16
+ from ..in_out.neural_net_oriented import image_net_mean, image_net_std
17
+ from ..neural_models.resnet_encoder import ResnetEncoder
18
+
19
+
20
+ @torch.no_grad()
21
+ def get_forward_features_of_dataset(encoder, dataloader, device, data_in_batch='image'):
22
+ b_size = dataloader.batch_size
23
+ for i, batch in enumerate(dataloader):
24
+ feats = encoder(batch[data_in_batch].to(device))
25
+ feats = feats.cpu().numpy().astype('float32')
26
+
27
+ if i == 0:
28
+ features = np.zeros((len(dataloader.dataset), feats.shape[1]), dtype='float32')
29
+
30
+ if i < len(dataloader) - 1:
31
+ features[i * b_size: (i + 1) * b_size] = feats
32
+ else:
33
+ # special treatment for final batch
34
+ features[i * b_size:] = feats
35
+ return features
36
+
37
+
38
+ def image_transformation(img_dim, pretraining='image_net'):
39
+ if pretraining == 'image_net':
40
+ normalize = transforms.Normalize(mean=image_net_mean, std=image_net_std)
41
+ else:
42
+ raise NotImplementedError('')
43
+
44
+ res = transforms.Compose([transforms.Resize((img_dim, img_dim), Image.LANCZOS),
45
+ transforms.ToTensor(), normalize])
46
+
47
+ return res
48
+
49
+
50
+ def vgg_encoder(device):
51
+ vgg = models.vgg16_bn(pretrained=True).to(device).eval()
52
+ feature_storage = []
53
+ def hook(module, hook_input, hook_output):
54
+ feature_storage.append(hook_output.detach_().cpu().numpy())
55
+ vgg.classifier[4].register_forward_hook(hook) # last relu layer before classification.
56
+ return vgg, feature_storage
57
+
58
+
59
+ @torch.no_grad()
60
+ def extract_visual_features(image_files, img_dim, method='resnet18',
61
+ batch_size=128, n_workers=12, device='cuda'):
62
+
63
+
64
+ img_transform = image_transformation(img_dim)
65
+ dataset = ImageClassificationDataset(image_files, img_transform=img_transform)
66
+
67
+ loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size,
68
+ shuffle=False, num_workers=n_workers)
69
+
70
+ if method.startswith('resnet'):
71
+ vis_encoder = ResnetEncoder(method, 1).to(device).eval()
72
+ features = get_forward_features_of_dataset(vis_encoder, loader, device)
73
+
74
+ elif method.startswith('vgg'):
75
+ vis_encoder, features = vgg_encoder(device)
76
+ for batch in loader:
77
+ vis_encoder(batch['image'].to(device))
78
+ features = np.vstack(features)
79
+
80
+ elif method.startswith('random'):
81
+ vis_encoder = ResnetEncoder('resnet18', 1, pretrained=False).to(device).eval()
82
+ features = get_forward_features_of_dataset(vis_encoder, loader, device)
83
+
84
+ return features
imageprocessing/artemis/artemis/analysis/paintings_meta_data.py ADDED
@@ -0,0 +1,26 @@
+ """
+ Manually selected famous paintings that can be optionally put in a test-set.
+
+ The MIT License (MIT)
+ Originally created at 6/23/20, for Python 3.x
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
+
+ masterpieces_for_test = [
+     'leonardo-da-vinci_mona-lisa',
+     'vincent-van-gogh_the-starry-night-1889(1)',
+     'vincent-van-gogh_the-starry-night-1888-1',
+     'vincent-van-gogh_the-starry-night-1889-1',
+     'vincent-van-gogh_the-starry-night-1888-2',
+     'vincent-van-gogh_the-starry-night-1888',
+     'johannes-vermeer_the-girl-with-a-pearl-earring',
+     'robert-silvers_girl-with-the-pearl-earring-2008',
+     'robert-silvers_guernica-photomosaic-mounted-on-aluminum',
+     'gustav-klimt_the-kiss-1908(1)',
+     'leonardo-da-vinci_the-lady-with-the-ermine-cecilia-gallerani-1496',
+     'vincent-van-gogh_cafe-terrace-on-the-place-du-forum-1888(1)',
+     'vincent-van-gogh_the-cafe-terrace-on-the-place-du-forum-arles-at-night-1888',
+     'vincent-van-gogh_cafe-terrace-place-du-forum-arles-1888(1)',
+     'eugene-delacroix_the-liberty-leading-the-people-1830',
+     'claude-monet_impression-sunrise',
+     'james-mcneill-whistler_arrangement-in-grey-and-black-no-1-portrait-of-the-artist-s-mother-1871']
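A sketch of how this list could be used, assuming an ArtEmis dataframe with a `painting` column (the column name used elsewhere in the repo); the toy dataframe below is made up for illustration:
```Python
import pandas as pd
from artemis.analysis.paintings_meta_data import masterpieces_for_test

# Toy stand-in for the ArtEmis dataframe; the real one also carries a 'painting' column.
df = pd.DataFrame({'painting': ['leonardo-da-vinci_mona-lisa', 'some-artist_some-work'],
                   'utterance': ['it feels serene', 'the colors are harsh']})
held_out = df[df.painting.isin(masterpieces_for_test)]  # optionally kept for the test split
print(len(held_out))  # 1
```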
imageprocessing/artemis/artemis/analysis/utils.py ADDED
@@ -0,0 +1,80 @@
1
+ """
2
+ Auxiliary routines to be used when analyzing/comparing ArtEmis in terms of its subjectivity, abstractness etc.
3
+ See also notebooks/analysis/concreteness_subjectivity_sentiment.ipynb
4
+
5
+ The MIT License (MIT)
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+ import numpy as np
9
+ from collections import defaultdict
10
+ from tqdm.notebook import tqdm as tqdm_notebook
11
+
12
+ from collections import Counter
13
+ from ..language.basics import ngrams
14
+
15
+ def contains_word(tokenized_sentences, word_set):
16
+ boolean_mask = tokenized_sentences.apply(lambda x: len(set(x).intersection(word_set)) >= 1)
17
+ return boolean_mask
18
+
19
+ def contains_bigrams(tokens, bigram_set):
20
+ token_bigrams = set([' '.join(b) for b in ngrams(tokens, 2)])
21
+ return any(x in bigram_set for x in token_bigrams)
22
+
23
+
24
+ def concreteness_of_sentence(tokens, word_to_concreteness, count_bigrams=True):
25
+ "Sorry, will add add explanation in April..."
26
+
27
+ bigram_vals = [] # concreteness values of found bigrams
28
+ if count_bigrams:
29
+ # find bigrams that occur and their multiplicity
30
+ bigrams = Counter(ngrams(tokens, 2))
31
+ utterance = ' '.join(tokens)
32
+ for bigram, cnt in bigrams.items():
33
+ bigram = ' '.join(bigram)
34
+ if bigram in word_to_concreteness:
35
+ for _ in range(cnt):
36
+ bigram_vals.append(word_to_concreteness[bigram])
37
+ utterance = utterance.replace(bigram, '') # remove bigrams from the utterance
38
+ # to not double-count/score them
39
+ tokens = utterance.split()
40
+
41
+ unigram_vals = [word_to_concreteness[t] for t in tokens if t in word_to_concreteness]
42
+ conc_vals = unigram_vals + bigram_vals
43
+
44
+ if len(conc_vals) == 0:
45
+ return None
46
+ return sum(conc_vals) / len(conc_vals)
47
+
48
+
49
+ def pos_analysis(df, group_cols=None, round_decimal=1):
50
+ # Assumes nltk universal pos-tagging
51
+ # & df['pos'] has the part-of-speech tags
52
+ # analysis along the POS used in the paper
53
+
54
+ pos_syms = ['NOUN', 'PRON', 'ADJ', 'ADP', 'VERB']
55
+ pos_names = ['Nouns', 'Pronouns', 'Adjectives', 'Adpositions', 'Verbs']
56
+
57
+ if group_cols is not None:
58
+ groups = df.groupby(group_cols)
59
+ group_stats = []
60
+ group_lens = []
61
+ for n, gg in tqdm_notebook(groups):
62
+ g_stats = defaultdict(set)
63
+ group_lens.append(len(gg))
64
+ for t, p in zip(gg.tokens, gg.pos):
65
+ for x, y in zip(t, p):
66
+ g_stats[y[1]].add(x)
67
+ group_stats.append(g_stats)
68
+
69
+ for ps, pn in zip(pos_syms, pos_names):
70
+ u_pos = []
71
+ u_pos_norm = []
72
+ for i, s in enumerate(group_stats):
73
+ u_pos.append(len(s[ps]))
74
+ u_pos_norm.append(u_pos[-1] / group_lens[i])
75
+ print(pn, '{:.{}f}'.format(np.mean(u_pos), round_decimal), '{:.{}f}'.format(np.mean(u_pos_norm), round_decimal))
76
+ else:
77
+ for ps, pn in zip(pos_syms, pos_names):
78
+ print(pn, df.pos.apply(lambda x: len([i[0] for i in x if i[1] == ps])).mean().round(round_decimal))
79
+
80
+
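A usage sketch for `concreteness_of_sentence` above, with a tiny made-up concreteness lexicon (a real one maps thousands of unigrams/bigrams to ratings):
```Python
from artemis.analysis.utils import concreteness_of_sentence

word_to_concreteness = {'dog': 4.9, 'freedom': 1.5, 'red balloon': 4.5}  # toy lexicon
tokens = 'a red balloon and a dog symbolize freedom'.split()
print(concreteness_of_sentence(tokens, word_to_concreteness, count_bigrams=True))
# (4.5 + 4.9 + 1.5) / 3 — the bigram 'red balloon' is scored once and its unigrams are then skipped
```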
imageprocessing/artemis/artemis/captioning/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ The MIT License (MIT)
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
imageprocessing/artemis/artemis/captioning/sample_captions.py ADDED
@@ -0,0 +1,78 @@
1
+ """
2
+ Helper functions for sampling (@test -- inference-time) a neural-speaker.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 20/1/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ from torch.utils.data import DataLoader
12
+
13
+ from ..neural_models.attentive_decoder import sample_captions, sample_captions_beam_search, properize_captions
14
+ from ..in_out.basics import wikiart_file_name_to_style_and_painting
15
+ from ..emotions import IDX_TO_EMOTION
16
+ from ..utils.vocabulary import UNK
17
+
18
+
19
+ def versatile_caption_sampler(speaker, data_loader, device, max_utterance_len, sampling_rule='beam',
20
+ beam_size=None, topk=None, temperature=1, drop_unk=True, use_bert_unk=False,
21
+ drop_bigrams=False):
22
+ """Provides all implemented sampling methods according to the sampling_rule input parameter.
23
+ """
24
+ vocab = speaker.decoder.vocab
25
+
26
+ if sampling_rule == 'beam':
27
+ dset = data_loader.dataset
28
+ loader = DataLoader(dset, num_workers=data_loader.num_workers) # batch-size=1
29
+
30
+ max_iter = 8 * max_utterance_len # should be large enough
31
+ beam_captions, alphas, beam_scores = sample_captions_beam_search(speaker, loader, beam_size,
32
+ device, max_iter=max_iter,
33
+ temperature=temperature,
34
+ drop_unk=drop_unk,
35
+ drop_bigrams=drop_bigrams)
36
+ # first is highest scoring caption which is the only we keep here
37
+ captions = [c[0] for c in beam_captions]
38
+ alphas = [np.array(a[0]) for a in alphas] # each alpha covers all tokens: <sos>, token1, ..., <eos>
39
+ else:
40
+ captions, alphas = sample_captions(speaker, data_loader, max_utterance_len=max_utterance_len,
41
+ sampling_rule=sampling_rule, device=device, temperature=temperature,
42
+ topk=topk, drop_unk=drop_unk, drop_bigrams=drop_bigrams)
43
+
44
+ captions = properize_captions(captions, vocab).tolist()
45
+ captions = tokens_to_strings(captions, vocab, bert_unk=use_bert_unk)
46
+ return captions, alphas
47
+
48
+
49
+ def captions_as_dataframe(captions_dataset, captions_predicted, wiki_art_data=True):
50
+ """convert the dataset/predicted-utterances (captions) to a pandas dataframe."""
51
+ if wiki_art_data:
52
+ temp = captions_dataset.image_files.apply(wikiart_file_name_to_style_and_painting)
53
+ art_style, painting = zip(*temp)
54
+ grounding_emotion = [IDX_TO_EMOTION.get(x, None) for x in captions_dataset.emotions.tolist()]
55
+ df = pd.DataFrame([art_style, painting, grounding_emotion, captions_predicted]).transpose()
56
+ column_names = ['art_style', 'painting', 'grounding_emotion', 'caption']
57
+ df.columns = column_names
58
+ else:
59
+ image_files = captions_dataset.image_files.tolist()
60
+ grounding_emotion = [IDX_TO_EMOTION.get(x, None) for x in captions_dataset.emotions.tolist()]
61
+ df = pd.DataFrame([image_files, grounding_emotion, captions_predicted]).transpose()
62
+ column_names = ['image_file', 'grounding_emotion', 'caption']
63
+ df.columns = column_names
64
+ return df
65
+
66
+
67
+ def tokens_to_strings(token_list, vocab, bert_unk=True):
68
+ """ Bert uses [UNK] to represent the unknown symbol.
69
+ :param token_list:
70
+ :param vocab:
71
+ :param bert_unk:
72
+ :return:
73
+ """
74
+ res = [vocab.decode_print(c) for c in token_list]
75
+ if bert_unk:
76
+ res = [c.replace(UNK, '[UNK]') for c in res]
77
+ return res
78
+
imageprocessing/artemis/artemis/captioning/senti_cap_anps.py ADDED
@@ -0,0 +1,111 @@
1
+ """
2
+ Handling ANP-data // injection of sentiment according to SentiCap: https://arxiv.org/pdf/1510.01431.pdf
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 10/19/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+
8
+ Note:
9
+ Given the lack of time to add comments: PLEASE SEE directly notebook "sentimentalize_utterances_with_anps"
10
+ for use-case.
11
+ """
12
+
13
+ import nltk
14
+ import numpy.random as random
15
+ from collections import defaultdict
16
+
17
+ def read_senticap_anps(senticap_anp_file):
18
+ """
19
+ :param senticap_anp_file:
20
+ :return: two lists; the first has the positive ANPs [beautiful dog, nice person], the second the negative ones.
21
+ """
22
+ positive_anps = []
23
+ negative_anps = []
24
+ current_sentiment = 'positive' # the file lists first the positives, then all the negatives
25
+ with open(senticap_anp_file) as fin:
26
+ for i, line in enumerate(fin):
27
+ if i == 0:
28
+ continue
29
+
30
+ if "Negative ANPs:" in line:
31
+ current_sentiment = 'negative'
32
+ continue
33
+
34
+ anp = line.rstrip()
35
+
36
+ if len(anp) == 0:
37
+ continue
38
+
39
+ if current_sentiment == 'negative':
40
+ negative_anps.append(anp)
41
+ else:
42
+ positive_anps.append(anp)
43
+ return positive_anps, negative_anps
44
+
45
+
46
+ def build_senticap_noun_to_ajectives(pos_anps, neg_anps):
47
+ res = dict()
48
+ for tag, anps in zip(['positive', 'negative'], [pos_anps, neg_anps]):
49
+ res[tag] = defaultdict(list)
50
+ for anp in anps:
51
+ adjective, noun = anp.split()
52
+ res[tag][noun].append(adjective)
53
+ return res
54
+
55
+
56
+ def nouns_and_adjectives_of_senticap(pos_sent_anp, neg_sent_anp):
57
+ all_nouns = set()
58
+ all_adjectives = set()
59
+ for catalogue in [pos_sent_anp, neg_sent_anp]:
60
+ for item in catalogue:
61
+ adjective, noun = item.split()
62
+ all_nouns.add(noun)
63
+ all_adjectives.add(adjective)
64
+ return all_nouns, all_adjectives
65
+
66
+
67
+ def add_anp_to_sentence(sentence_tokenized, noun_to_adj, rule='random_adjective'):
68
+ """ Pick a noun of the sentence at that is a key of the noun_to_adj dictionary at random. Given the rule
69
+ pick the corresponding adjective from the noun_to_adj and add it before the noun. Return the new sentence.
70
+ If such a noun does not exist, apply no changes and return None.
71
+ :param sentence_tokenized: ['a', 'running', 'dog']
72
+ :param noun_to_adj: e.g., dog -> {happy, sad}, cat -> {funny, happy} etc.
73
+ :param rule: if "most_frequent_adjective" the noun_to_adj also includes frequencies:
74
+ e.g., dog -> {(happy 5), (sad, 1)}
75
+ :return:
76
+ """
77
+ sentence_tokenized = sentence_tokenized.copy()
78
+ pos = nltk.pos_tag(sentence_tokenized)
79
+ noun_pos = [i for i, x in enumerate(pos) if x[1][0] == 'N'] # all noun locations
80
+
81
+ valid_noun_pos = []
82
+ # Drop nouns that do not have adjective ANP.
83
+ for p in noun_pos:
84
+ if sentence_tokenized[p] in noun_to_adj:
85
+ valid_noun_pos.append(p)
86
+
87
+ if len(valid_noun_pos) == 0:
88
+ return None
89
+
90
+
91
+ valid_noun_pos = sorted(valid_noun_pos) # sort for reproducibility
92
+ random.shuffle(valid_noun_pos)
93
+ picked_noun_pos = valid_noun_pos[0] # pick a noun at random
94
+ picked_noun = sentence_tokenized[picked_noun_pos]
95
+
96
+ if rule == 'random_adjective':
97
+ valid_adjectives = sorted(noun_to_adj[picked_noun]) # sort for reproducibility
98
+ random.shuffle(valid_adjectives)
99
+ picked_adjective = valid_adjectives[0]
100
+
101
+ elif rule == 'most_frequent_adjective':
102
+ most_freq_adjective_with_freq = sorted(noun_to_adj[picked_noun], key=lambda x: x[1])[-1]
103
+ picked_adjective = most_freq_adjective_with_freq[0]
104
+
105
+ ## Avoid adding an existing adjective (e.g., happy happy man)
106
+ if picked_noun_pos > 0 and sentence_tokenized[picked_noun_pos-1] == picked_adjective:
107
+ pass
108
+ else:
109
+ sentence_tokenized.insert(picked_noun_pos, picked_adjective)
110
+
111
+ return ' '.join(sentence_tokenized)
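A usage sketch for `add_anp_to_sentence` with a toy noun-to-adjective map (normally built from the SentiCap ANP file via `read_senticap_anps` and `build_senticap_noun_to_ajectives`); note that `nltk.pos_tag` needs the `averaged_perceptron_tagger` data to be available:
```Python
from artemis.captioning.senti_cap_anps import add_anp_to_sentence

noun_to_adj = {'dog': ['happy', 'playful'], 'sky': ['gloomy']}  # toy map
sentence = ['a', 'dog', 'under', 'a', 'clear', 'sky']
print(add_anp_to_sentence(sentence, noun_to_adj, rule='random_adjective'))
# e.g., 'a playful dog under a clear sky' — one randomly picked noun gets one adjective
```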
imageprocessing/artemis/artemis/data/glove.6B.100d.vocabulary.txt ADDED
The diff for this file is too large to render. See raw diff
 
imageprocessing/artemis/artemis/data/image-emotion-histogram.csv ADDED
The diff for this file is too large to render. See raw diff
 
imageprocessing/artemis/artemis/data/speaker_sampling_configs/full_hyper_param_ablation.json.txt ADDED
@@ -0,0 +1,182 @@
1
+ [
2
+ {
3
+ "sampling_rule": "topk",
4
+ "temperature": 1.0,
5
+ "topk": 10
6
+ },
7
+ {
8
+ "sampling_rule": "topk",
9
+ "temperature": 0.8,
10
+ "topk": 10
11
+ },
12
+ {
13
+ "sampling_rule": "topk",
14
+ "temperature": 0.5,
15
+ "topk": 10
16
+ },
17
+ {
18
+ "sampling_rule": "topk",
19
+ "temperature": 0.3,
20
+ "topk": 10
21
+ },
22
+ {
23
+ "sampling_rule": "topk",
24
+ "temperature": 0.2,
25
+ "topk": 10
26
+ },
27
+ {
28
+ "sampling_rule": "topk",
29
+ "temperature": 1.0,
30
+ "topk": 15
31
+ },
32
+ {
33
+ "sampling_rule": "topk",
34
+ "temperature": 0.8,
35
+ "topk": 15
36
+ },
37
+ {
38
+ "sampling_rule": "topk",
39
+ "temperature": 0.5,
40
+ "topk": 15
41
+ },
42
+ {
43
+ "sampling_rule": "topk",
44
+ "temperature": 0.3,
45
+ "topk": 15
46
+ },
47
+ {
48
+ "sampling_rule": "topk",
49
+ "temperature": 0.2,
50
+ "topk": 15
51
+ },
52
+ {
53
+ "sampling_rule": "topk",
54
+ "temperature": 1.0,
55
+ "topk": 20
56
+ },
57
+ {
58
+ "sampling_rule": "topk",
59
+ "temperature": 0.8,
60
+ "topk": 20
61
+ },
62
+ {
63
+ "sampling_rule": "topk",
64
+ "temperature": 0.5,
65
+ "topk": 20
66
+ },
67
+ {
68
+ "sampling_rule": "topk",
69
+ "temperature": 0.3,
70
+ "topk": 20
71
+ },
72
+ {
73
+ "sampling_rule": "topk",
74
+ "temperature": 0.2,
75
+ "topk": 20
76
+ },
77
+ {
78
+ "sampling_rule": "topk",
79
+ "temperature": 1.0,
80
+ "topk": 5
81
+ },
82
+ {
83
+ "sampling_rule": "topk",
84
+ "temperature": 0.8,
85
+ "topk": 5
86
+ },
87
+ {
88
+ "sampling_rule": "topk",
89
+ "temperature": 0.5,
90
+ "topk": 5
91
+ },
92
+ {
93
+ "sampling_rule": "topk",
94
+ "temperature": 0.3,
95
+ "topk": 5
96
+ },
97
+ {
98
+ "sampling_rule": "topk",
99
+ "temperature": 0.2,
100
+ "topk": 5
101
+ },
102
+ {
103
+ "sampling_rule": "topk",
104
+ "temperature": 1.0,
105
+ "topk": 3
106
+ },
107
+ {
108
+ "sampling_rule": "topk",
109
+ "temperature": 0.8,
110
+ "topk": 3
111
+ },
112
+ {
113
+ "sampling_rule": "topk",
114
+ "temperature": 0.5,
115
+ "topk": 3
116
+ },
117
+ {
118
+ "sampling_rule": "topk",
119
+ "temperature": 0.3,
120
+ "topk": 3
121
+ },
122
+ {
123
+ "sampling_rule": "topk",
124
+ "temperature": 0.2,
125
+ "topk": 3
126
+ },
127
+ {
128
+ "sampling_rule": "topk",
129
+ "temperature": 0.2,
130
+ "topk": 1
131
+ },
132
+ {
133
+ "sampling_rule": "beam",
134
+ "temperature": 1.0,
135
+ "beam_size": 5
136
+ },
137
+ {
138
+ "sampling_rule": "beam",
139
+ "temperature": 0.8,
140
+ "beam_size": 5
141
+ },
142
+ {
143
+ "sampling_rule": "beam",
144
+ "temperature": 0.5,
145
+ "beam_size": 5
146
+ },
147
+ {
148
+ "sampling_rule": "beam",
149
+ "temperature": 0.3,
150
+ "beam_size": 5
151
+ },
152
+ {
153
+ "sampling_rule": "beam",
154
+ "temperature": 0.2,
155
+ "beam_size": 5
156
+ },
157
+ {
158
+ "sampling_rule": "beam",
159
+ "temperature": 1.0,
160
+ "beam_size": 10
161
+ },
162
+ {
163
+ "sampling_rule": "beam",
164
+ "temperature": 0.8,
165
+ "beam_size": 10
166
+ },
167
+ {
168
+ "sampling_rule": "beam",
169
+ "temperature": 0.5,
170
+ "beam_size": 10
171
+ },
172
+ {
173
+ "sampling_rule": "beam",
174
+ "temperature": 0.3,
175
+ "beam_size": 10
176
+ },
177
+ {
178
+ "sampling_rule": "beam",
179
+ "temperature": 0.2,
180
+ "beam_size": 10
181
+ }
182
+ ]
imageprocessing/artemis/artemis/data/speaker_sampling_configs/mini_hyper_param_ablation.json.txt ADDED
@@ -0,0 +1,12 @@
+ [
+     {
+         "sampling_rule": "beam",
+         "temperature": 0.5,
+         "beam_size": 5
+     },
+     {
+         "sampling_rule": "beam",
+         "temperature": 0.3,
+         "beam_size": 5
+     }
+ ]
imageprocessing/artemis/artemis/data/speaker_sampling_configs/selected_hyper_params.json.txt ADDED
@@ -0,0 +1,7 @@
+ [
+     {
+         "sampling_rule": "beam",
+         "temperature": 0.3,
+         "beam_size": 5
+     }
+ ]
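These `*.json.txt` files are plain JSON; a sketch of loading one (the keys mirror the sampling arguments used by the speaker-sampling code, e.g., `sampling_rule`, `temperature`, `beam_size`/`topk`):
```Python
import json

with open('artemis/data/speaker_sampling_configs/selected_hyper_params.json.txt') as fin:
    configs = json.load(fin)  # a list of sampling configurations

print(configs[0])  # {'sampling_rule': 'beam', 'temperature': 0.3, 'beam_size': 5}
```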
imageprocessing/artemis/artemis/data/symspell_frequency_dictionary_en_82_765.txt ADDED
The diff for this file is too large to render. See raw diff
 
imageprocessing/artemis/artemis/data/wiki_art_duplicate_paintings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:487d4325d3a75f86c7a1f5fd05fc424924c182c391f8a645e81f1c0dd58e4a27
+ size 233854
imageprocessing/artemis/artemis/data/wiki_art_genre_class.csv ADDED
The diff for this file is too large to render. See raw diff
 
imageprocessing/artemis/artemis/emotions.py ADDED
@@ -0,0 +1,79 @@
1
+ """
2
+ Mostly some constants & very simple function to encode/handle the emotion attributes of ArtEmis.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 02/11/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+
10
+ ARTEMIS_EMOTIONS = ['amusement', 'awe', 'contentment', 'excitement',
11
+ 'anger', 'disgust', 'fear', 'sadness', 'something else']
12
+
13
+ EMOTION_TO_IDX = {e: i for i, e in enumerate(ARTEMIS_EMOTIONS)}
14
+
15
+
16
+ IDX_TO_EMOTION = {EMOTION_TO_IDX[e]: e for e in EMOTION_TO_IDX}
17
+
18
+
19
+ POS_NEG_ELSE = {'amusement': 0, 'awe': 0, 'contentment': 0, 'excitement': 0,
20
+ 'anger': 1, 'disgust': 1, 'fear': 1, 'sadness': 1,
21
+ 'something else': 2}
22
+
23
+
24
+ COLORS = {'amusement': '#EE82EE',
25
+ 'awe': '#FFFF00',
26
+ 'contentment': '#87CEEB',
27
+ 'excitement': '#DC143C',
28
+ 'anger': '#000080',
29
+ 'disgust': '#F0E68C',
30
+ 'fear': '#C0C0C0',
31
+ 'sadness': '#696969',
32
+ 'something else': '#228B22'}
33
+
34
+
35
+ LARGER_EMOTION_VOCAB = {('bored', 'boring', 'apathy', 'boredom', 'indifferent', 'dull', 'uninteresting', 'uninterested'),
36
+ ('shock', 'shocked'),
37
+ ('confused', 'confusion', 'confuses', 'puzzled', 'puzzling',
38
+ 'perplexed', 'perplexing', 'confusing', 'odd', 'weird'),
39
+ ('surprised',),
40
+ ('anticipation',),
41
+ ('empowerment',),
42
+ ('hope', 'hopeful', 'optimistic'),
43
+ ('neutral',),
44
+ ('rage',),
45
+ ('happy', 'happiness'),
46
+ ('grief',),
47
+ ('shame',),
48
+ ('resent',),
49
+ ('creepy',),
50
+ ('disappointment',),
51
+ ('depressing', 'depressed'),
52
+ ('bothered', 'disturbed', 'bothersome'),
53
+ ('overwhelmed',),
54
+ ('anxiety', 'anxious'),
55
+ ('thrilled',),
56
+ ('surprised', 'surprising'),
57
+ ('uncomfortable',),
58
+ ('curious', 'curiosity', 'wonder', 'intrigued', 'interested', 'interests', 'interesting', 'intriguing'),
59
+ ('alerted', 'alert'),
60
+ ('insult', 'insulted'),
61
+ ('shy',),
62
+ ('nostalgia', 'nostalgic'),
63
+ ('exhilarating', 'exhilarated')}
64
+
65
+
66
+ def positive_negative_else(emotion):
67
+ """ Map a feeling string (e.g. 'awe') to an integer indicating if it is a positive, negative, or else.
68
+ :param emotion: (string)
69
+ :return: int
70
+ """
71
+ return POS_NEG_ELSE[emotion]
72
+
73
+
74
+ def emotion_to_int(emotion):
75
+ """ Map a feeling string (e.g. 'awe') to a unique integer.
76
+ :param emotion: (string)
77
+ :return: int
78
+ """
79
+ return EMOTION_TO_IDX[emotion]
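A minimal sketch of the emotion encodings defined in this module:
```Python
from artemis.emotions import EMOTION_TO_IDX, IDX_TO_EMOTION, positive_negative_else

print(EMOTION_TO_IDX['awe'])           # 1
print(IDX_TO_EMOTION[8])               # 'something else'
print(positive_negative_else('fear'))  # 1, i.e., a negative emotion
```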
imageprocessing/artemis/artemis/evaluation/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """
+ TODO: add description
+
+ The MIT License (MIT)
+ Originally created at 8/29/20, for Python 3.x
+ Copyright (c) 2020 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
+ """
imageprocessing/artemis/artemis/evaluation/bleu.py ADDED
@@ -0,0 +1,34 @@
1
+ """
2
+ BLEU via NLTK
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 8/31/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+ import pandas as pd
9
+ from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
10
+
11
+ cc = SmoothingFunction()
12
+
13
+ def sentence_bleu_for_hypotheses(references, hypothesis, max_grams=4, smoothing_function=None):
14
+ """ Compute the BLEU score for the hypothesis (e.g., generated captions) against given references acting
15
+ as ground-truth.
16
+ :param references: (list of lists of lists) of len M. Each sublist contains strings. [['a', 'boy'], ['rock', 'music']]
17
+ :param hypothesis: (list of lists)
18
+ :param max_grams: int, bleu-max_grams i.e., when 4, computes bleu-4
19
+ :param smoothing_function:
20
+ :return: a Series containing the scores in the same order as the input
21
+ Note: see nltk.bleu_score.sentence_bleu
22
+ """
23
+ if len(references) != len(hypothesis):
24
+ raise ValueError('Each reference (set) comes with a single hypothesis')
25
+ if type(references[0]) != list or type(hypothesis[0]) != list:
26
+ raise ValueError('Bad input types: use tokenized strings, and lists of tokens.')
27
+
28
+ scores = []
29
+ weights = (1.0 / max_grams, ) * max_grams
30
+
31
+ for i in range(len(references)):
32
+ scores.append(sentence_bleu(references[i], hypothesis[i], weights=weights,
33
+ smoothing_function=smoothing_function))
34
+ return pd.Series(scores)
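A small usage sketch for `sentence_bleu_for_hypotheses` (everything pre-tokenized, one hypothesis per reference set; the sentences are made up):
```Python
from artemis.evaluation.bleu import cc, sentence_bleu_for_hypotheses

references = [[['a', 'sad', 'man'], ['a', 'man', 'crying']],  # references for hypothesis 0
              [['a', 'calm', 'sea']]]                         # references for hypothesis 1
hypothesis = [['a', 'sad', 'man'], ['a', 'stormy', 'sea']]
scores = sentence_bleu_for_hypotheses(references, hypothesis, max_grams=2,
                                      smoothing_function=cc.method1)
print(scores.round(3).tolist())  # a pandas Series, one BLEU-2 score per hypothesis
```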
imageprocessing/artemis/artemis/evaluation/emotion_alignment.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ Measuring the emotion-alignment between a generation and the ground-truth (emotion).
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 8/31/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import numpy as np
11
+ from ..utils.basic import iterate_in_chunks
12
+
13
+
14
+ @torch.no_grad()
15
+ def image_to_emotion(img2emo_clf, data_loader, device):
16
+ """ For each image of the underlying dataset predict an emotion
17
+ :param img2emo_clf: nn.Module
18
+ :param data_loader: torch loader of dataset to iterate
19
+ :param device: gpu placement
20
+ :return:
21
+ """
22
+ img2emo_clf.eval()
23
+ emo_of_img_preds = []
24
+ for batch in data_loader:
25
+ predictions = img2emo_clf(batch['image'].to(device)).cpu()
26
+ emo_of_img_preds.append(predictions)
27
+ emo_of_img_preds = torch.cat(emo_of_img_preds)
28
+ return emo_of_img_preds
29
+
30
+
31
+ @torch.no_grad()
32
+ def text_to_emotion(txt2em_clf, encoded_tokens, device, batch_size=1000):
33
+ """
34
+ :param txt2em_clf:
35
+ :param encoded_tokens: Tensor carrying the text encoded
36
+ :param device:
37
+ :param batch_size:
38
+ :return:
39
+ """
40
+ txt2em_clf.eval()
41
+ emotion_txt_preds = []
42
+ for chunk in iterate_in_chunks(encoded_tokens, batch_size):
43
+ emotion_txt_preds.append(txt2em_clf(chunk.to(device)).cpu())
44
+
45
+ emotion_txt_preds = torch.cat(emotion_txt_preds)
46
+ maximizers = torch.argmax(emotion_txt_preds, -1)
47
+ return emotion_txt_preds, maximizers
48
+
49
+
50
+ def unique_maximizer(a_list):
51
+ """ if there is an element of the input list that appears
52
+ strictly more frequent than any other element
53
+ :param a_list:
54
+ :return:
55
+ """
56
+ u_elements, u_cnt = np.unique(a_list, return_counts=True)
57
+ has_umax = sum(u_cnt == u_cnt.max()) == 1
58
+ umax = u_elements[u_cnt.argmax()]
59
+ return has_umax, umax
60
+
61
+
62
+ def dominant_maximizer(a_list):
63
+ """ if there is an element of the input list that appears
64
+ at least half the time
65
+ :param a_list:
66
+ :return:
67
+ """
68
+ u_elements, u_cnt = np.unique(a_list, return_counts=True)
69
+
70
+ has_umax = u_cnt.max() >= len(a_list) / 2
71
+
72
+ if len(u_cnt) >= 2: # make sure the second most frequent does not match the first.
73
+ a, b = sorted(u_cnt)[-2:]
74
+ if a == b:
75
+ has_umax = False
76
+
77
+ umax = u_elements[u_cnt.argmax()]
78
+ return has_umax, umax
79
+
80
+
81
+ def occurrence_list_to_distribution(list_of_ints, n_support):
82
+ """e.g., [0, 8, 8, 8] -> [1/4, 0, ..., 3/4, 0, ...]"""
83
+ distribution = np.zeros(n_support, dtype=np.float32)
84
+ for i in list_of_ints:
85
+ distribution[i] += 1
86
+ distribution /= sum(distribution)
87
+ return distribution
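The two small helpers at the end of this module are easy to sanity-check in isolation; a sketch:
```Python
from artemis.evaluation.emotion_alignment import dominant_maximizer, occurrence_list_to_distribution

print(dominant_maximizer([1, 1, 1, 4, 7]))               # (True, 1): emotion 1 covers at least half the votes
print(dominant_maximizer([1, 1, 4, 4]))                  # (False, ...): tie between the two most frequent
print(occurrence_list_to_distribution([0, 8, 8, 8], 9))  # [0.25, 0., ..., 0.75]
```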
imageprocessing/artemis/artemis/evaluation/longest_common_subseq.py ADDED
@@ -0,0 +1,63 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Originally created at 10/5/20, for Python 3.x
4
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
5
+ """
6
+
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+
10
+ def lcs(s1, s2):
11
+ """
12
+ Longest common subsequence of two iterables. A subsequence is a
13
+ sequence that appears in the same relative order, but not necessarily contiguous.
14
+ :param s1: first iterable
15
+ :param s2: second iterable
16
+ :return: (list) the lcs
17
+ """
18
+ matrix = [[[] for _ in range(len(s2))] for _ in range(len(s1))]
19
+ for i in range(len(s1)):
20
+ for j in range(len(s2)):
21
+ if s1[i] == s2[j]:
22
+ if i == 0 or j == 0:
23
+ matrix[i][j] = [s1[i]]
24
+ else:
25
+ matrix[i][j] = matrix[i-1][j-1] + [s1[i]]
26
+ else:
27
+ matrix[i][j] = max(matrix[i-1][j], matrix[i][j-1], key=len)
28
+ cs = matrix[-1][-1]
29
+ return cs
30
+
31
+
32
+ def captions_lcs_from_training_utterances(captions_tokenized, train_utters_tokenized):
33
+ maximizers = np.zeros(len(captions_tokenized), dtype=int)
34
+ max_lcs = np.zeros(len(captions_tokenized))
35
+ averages = np.zeros(len(captions_tokenized))
36
+ for i, caption in enumerate(tqdm(captions_tokenized)):
37
+ caption_res = [len(lcs(caption, tr_example)) for tr_example in train_utters_tokenized]
38
+ max_loc = np.argmax(caption_res)
39
+ maximizers[i] = max_loc
40
+ max_lcs[i] = caption_res[max_loc]
41
+ averages[i] = np.mean(caption_res)
42
+ return max_lcs, averages, maximizers
43
+
44
+
45
+ ###
46
+ # Panos Note:
47
+ # a) '[the] contours shadowing [and] details make this painting [look like a] photograph the way the hair is
48
+ # layered and [the eyes] gazing off to space are fantastic'
49
+ # b) '[the] red [and] black paint strokes [look like a] bunch on [the eyes]'
50
+ # (a), (b) have lcs = 7
51
+ # but,
52
+ # a) '[the woman] is pretty nice and [has a] welcoming [facial expression]'
53
+ # b) '[the woman] looks very elegant since she [has] such [a] beautiful [facial expression]'
54
+ # (a), (b) have lcs = 6
55
+ # implying that removing stop-word articles "a", "the" could make this more realistic, since the first pair is way more
56
+ # dissimilar than the second.
57
+ # also if you use this to compare to systems; the length of the utterance could be used to normalize the bias the length
58
+ # brings in.
59
+ ###
60
+
61
+
62
+
63
+
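A quick sketch of `lcs` on two tokenized utterances (cf. the note in the comments above about stop-words inflating the score):
```Python
from artemis.evaluation.longest_common_subseq import lcs

a = 'the red black paint strokes look like a bunch'.split()
b = 'the contours look like a photograph'.split()
print(lcs(a, b))  # ['the', 'look', 'like', 'a']
```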
imageprocessing/artemis/artemis/evaluation/metaphors.py ADDED
@@ -0,0 +1,42 @@
+ """
+ Greedy-approximate counting of similes/metaphors present in a set of sentences.
+
+ The MIT License (MIT)
+ Originally created at 9/1/20, for Python 3.x
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
+ """
+
+ metaphorical_substrings = {'could be',
+                            'appears to be',
+                            'appear to be',
+                            'reminds me',
+                            'remind me',
+                            'seems like',
+                            'looks like',
+                            'look like',
+                            'is like',
+                            'are like',
+                            'think of',
+                            'resembles',
+                            'resembling'
+                            }
+
+
+ def makes_metaphor_via_substring_matching(sentences, substrings=None):
+     """
+     :param sentences: list of strings
+     :param substrings: iterable with substrings whose occurrence implies a metaphor is made
+     :return: list with booleans
+     """
+     if substrings is None:
+         substrings = metaphorical_substrings
+
+     makes_metaphor = []
+     for s in sentences:
+         yes = False
+         for m in substrings:
+             if m in s:
+                 yes = True
+                 break
+         makes_metaphor.append(yes)
+     return makes_metaphor
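A usage sketch (the sentences are made up):
```Python
from artemis.evaluation.metaphors import makes_metaphor_via_substring_matching

sentences = ['the swirling sky looks like a stormy sea',
             'a portrait of a woman in a dark dress']
print(makes_metaphor_via_substring_matching(sentences))  # [True, False]
```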
imageprocessing/artemis/artemis/evaluation/single_caption_per_image.py ADDED
@@ -0,0 +1,214 @@
1
+ """
2
+ Some grouping of various evaluation routines that assume that for a given set of reference
3
+ sentences there is a _single_ caption (sample) generated.
4
+
5
+ The MIT License (MIT)
6
+ Originally created at 9/1/20, for Python 3.x
7
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
8
+ """
9
+
10
+ import torch
11
+ import warnings
12
+ import pandas as pd
13
+ import numpy as np
14
+
15
+
16
+ from .bleu import sentence_bleu_for_hypotheses, cc
17
+ from .metaphors import makes_metaphor_via_substring_matching
18
+ from .emotion_alignment import text_to_emotion
19
+ from .pycocoevalcap import Bleu, Cider, Meteor, Spice, Rouge
20
+ from .emotion_alignment import dominant_maximizer, occurrence_list_to_distribution
21
+ from .longest_common_subseq import captions_lcs_from_training_utterances
22
+ from ..utils.basic import cross_entropy
23
+
24
+ ALL_METRICS = {'bleu', 'cider', 'spice', 'meteor', 'rouge', 'emo_alignment', 'metaphor', 'lcs'}
25
+
26
+
27
+ def emotional_alignment(hypothesis, emotions, vocab, txt2em_clf, device):
28
+ """ text 2 emotion, then compare with ground-truth.
29
+ :param hypothesis:
30
+ :param emotions: (list of list of int) human emotion-annotations (ground-truth) e.g., [[0, 1] [1]]
31
+ :param vocab:
32
+ :param txt2em_clf:
33
+ :param device:
34
+ :return:
35
+ """
36
+
37
+ # from text to emotion
38
+ hypothesis_tokenized = hypothesis.apply(lambda x: x.split())
39
+ max_len = hypothesis_tokenized.apply(lambda x: len(x)).max()
40
+ hypothesis = hypothesis_tokenized.apply(lambda x: np.array(vocab.encode(x, max_len=max_len)))
41
+ hypothesis = torch.from_numpy(np.vstack(hypothesis))
42
+ pred_logits, pred_maximizer = text_to_emotion(txt2em_clf, hypothesis, device)
43
+
44
+ # convert emotion lists to distributions to measure cross-entropy
45
+ n_emotions = 9
46
+ emo_dists = torch.from_numpy(np.vstack(emotions.apply(lambda x: occurrence_list_to_distribution(x, n_emotions))))
47
+ x_entropy = cross_entropy(pred_logits, emo_dists).item()
48
+
49
+ # constrain predictions to those of images with dominant maximizer of emotion
50
+ has_max, maximizer = zip(*emotions.apply(dominant_maximizer))
51
+ emotion_mask = np.array(has_max)
52
+ masked_emotion = np.array(maximizer)[emotion_mask]
53
+
54
+ guess_correct = masked_emotion == pred_maximizer[emotion_mask].cpu().numpy()
55
+ accuracy = guess_correct.mean()
56
+
57
+ return accuracy, x_entropy
58
+
59
+
60
+ def bleu_scores_via_nltk(hypothesis, references, smoothing_function=cc.method1):
61
+ """
62
+ :param hypothesis: dataframe of strings
63
+ :param references: dataframe of list of strings
64
+ :param smoothing_function:
65
+ :return:
66
+ """
67
+
68
+ # first tokenize
69
+ hypothesis_tokenized = hypothesis.apply(lambda x: x.split())
70
+ references_tokenized = references.apply(lambda x: [i.split() for i in x])
71
+
72
+ results = dict()
73
+ for max_grams in range(1, 5):
74
+ with warnings.catch_warnings():
75
+ warnings.simplefilter("ignore")
76
+ scores = sentence_bleu_for_hypotheses(references_tokenized,
77
+ hypothesis_tokenized,
78
+ max_grams,
79
+ smoothing_function)
80
+ results['BLEU-{}'.format(max_grams)] = scores
81
+ return results
82
+
83
+
84
+ def dataframes_to_coco_eval_format(references, hypothesis):
85
+ references = {i: [k for k in x] for i, x in enumerate(references)}
86
+ hypothesis = {i: [x] for i, x in enumerate(hypothesis)}
87
+ return references, hypothesis
88
+
89
+
90
+ def pycoco_bleu_scores(hypothesis, references):
91
+ references, hypothesis = dataframes_to_coco_eval_format(references, hypothesis)
92
+ scorer = Bleu()
93
+ average_score, all_scores = scorer.compute_score(references, hypothesis)
94
+ # Note: average_score takes into account the tiny/small epsilons of the BLEU implementation;
+ # this is not reflected if you take the direct average of all_scores.
96
+ return average_score, all_scores
97
+
98
+
99
+ def pycoco_eval_scores(hypothesis, references, metric):
100
+ references, hypothesis = dataframes_to_coco_eval_format(references, hypothesis)
101
+ if metric == 'cider':
102
+ scorer = Cider()
103
+ elif metric == 'meteor':
104
+ scorer = Meteor()
105
+ elif metric == 'spice':
106
+ scorer = Spice()
107
+ elif metric == 'rouge':
108
+ scorer = Rouge()
109
+ else:
110
+ raise ValueError
111
+ avg, all_scores = scorer.compute_score(references, hypothesis)
112
+ return pd.Series(all_scores)
113
+
114
+
115
+ def apply_basic_evaluations(hypothesis, references, ref_emotions, txt2emo_clf, text2emo_vocab,
116
+ lcs_sample=None, train_utterances=None, nltk_bleu=False, smoothing_function=cc.method1,
117
+ device="cuda", random_seed=2021,
118
+ methods_to_do=ALL_METRICS):
119
+ """
120
+ :param hypothesis: (pd.Series of str) e.g., ['a man', 'a woman']
+ :param references: (pd.Series of lists of str) e.g., [['a man', 'a tall man'], ['a woman']]
+ :param ref_emotions: emotions corresponding to the references, list of lists of integers, e.g., [[0, 1], [1]]
+
+ :param text2emo_vocab: Vocabulary of the text-to-emotion classifier
+ :param txt2emo_clf: trained text-to-emotion classifier (used for the emotional-alignment metric)
+ :param device: torch device on which the classifier is run
+ :param smoothing_function: nltk smoothing function (only used if nltk_bleu is True)
+ :return: list of pd.Series, one per computed metric, carrying its mean/std
+ """
130
+ results = []
131
+ stat_track = ['mean', 'std']
132
+
133
+ ##
134
+ ## BLEU:1-4
135
+ ##
136
+ if 'bleu' in methods_to_do:
137
+ if nltk_bleu:
138
+ res = bleu_scores_via_nltk(hypothesis, references, smoothing_function=smoothing_function)
139
+ for metric, scores in res.items():
140
+ stats = scores.describe()[stat_track]
141
+ stats = pd.concat([pd.Series({'metric': metric}), stats])
142
+ results.append(stats)
143
+ else:
144
+ #py-coco based
145
+ b_scores = pycoco_bleu_scores(hypothesis, references)
146
+ for i in range(4):
147
+ metric = f'BLEU-{i + 1}'  # label BLEU-1..4, consistent with the nltk path
148
+ mu = b_scores[0][i]
149
+ # note the std below reflects the values without the 'tiny' adaptation (unlike the mu)
150
+ # avg_dummy = np.mean(b_scores[1][i]) # this is the average without the tiny adaptation.
151
+ std = np.std(b_scores[1][i])
152
+ stats = pd.concat([pd.Series({'metric': metric}), pd.Series({'mean': mu, 'std':std})])
153
+ results.append(stats)
154
+ print('BLEU: done')
155
+
156
+ ##
157
+ ## CIDER, SPICE, METEOR, ROUGE-L
158
+ ##
159
+ coco_requested = False
160
+ for metric in ['cider', 'spice', 'meteor', 'rouge']:
161
+ if metric in methods_to_do:
162
+ stats = pycoco_eval_scores(hypothesis, references, metric).describe()[stat_track]
163
+ stats = pd.concat([pd.Series({'metric': metric.upper()}), stats])
164
+ results.append(stats)
165
+ coco_requested = True
166
+ if coco_requested:
167
+ print('COCO-based-metrics: done')
168
+
169
+ ##
170
+ ## Emotional-Alignment
171
+ ##
172
+ if 'emo_alignment' in methods_to_do:
173
+ emo_accuracy, emo_xentropy = emotional_alignment(hypothesis, ref_emotions, text2emo_vocab, txt2emo_clf, device)
174
+ stats = pd.Series(emo_accuracy, dtype=float)
175
+ stats = stats.describe()[stat_track]
176
+ stats = pd.concat([pd.Series({'metric': 'Emo-Alignment-ACC'}), stats])
177
+ results.append(stats)
178
+
179
+ stats = pd.Series(emo_xentropy, dtype=float)
180
+ stats = stats.describe()[stat_track]
181
+ stats = pd.concat([pd.Series({'metric': 'Emo-Alignment-XENT'}), stats])
182
+ results.append(stats)
183
+ print('EMO-ALIGN: done')
184
+
185
+ ##
186
+ ## Metaphor-like expressions
187
+ ##
188
+ if 'metaphor' in methods_to_do:
189
+ met_mask = makes_metaphor_via_substring_matching(hypothesis)
190
+ stats = pd.Series(met_mask, dtype=float)
191
+ stats = stats.describe()[stat_track]
192
+ stats = pd.concat([pd.Series({'metric': 'Metaphors'}), stats])
193
+ results.append(stats)
194
+ print('Metaphor-like expressions: Done')
195
+
196
+ ##
197
+ ## Novelty via Longest Common Subsequence
198
+ ##
199
+ if 'lcs' in methods_to_do:
200
+ np.random.seed(random_seed) # since you will (normally) sub-sample
201
+ train_utters_tokenized = [u.split() for u in train_utterances]
202
+ uts = pd.Series(train_utters_tokenized).sample(lcs_sample[0]).to_list()
203
+ hypo_token = hypothesis.apply(lambda x: x.split()).sample(lcs_sample[1]).to_list()
204
+
205
+ max_lcs, mean_lcs, _ = captions_lcs_from_training_utterances(hypo_token, uts)
206
+ stats = pd.Series(max_lcs).describe()[stat_track]
207
+ stats = pd.concat([pd.Series({'metric': 'max-LCS'}), stats])
208
+ results.append(stats)
209
+ stats = pd.Series(mean_lcs).describe()[stat_track]
210
+ stats = pd.concat([pd.Series({'metric': 'mean-LCS'}), stats])
211
+ results.append(stats)
212
+ print('Novelty via Longest Common Subsequence: Done')
213
+
214
+ return results
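A hedged usage sketch (illustrative only, not part of the file above); it assumes the functions defined above are in scope and skips the metrics that need extra models or data:

import pandas as pd

hypothesis = pd.Series(['a man stands alone in the dark'])             # one generated caption per image
references = pd.Series([['a man in a dark room', 'a lonely figure']])  # the human captions of that image
ref_emotions = pd.Series([[4, 4, 6]])                                  # their (hypothetical) emotion ids in 0..8

# Only BLEU and the metaphor check are requested here, so the text-to-emotion
# classifier/vocabulary (needed for 'emo_alignment') can be left as None.
results = apply_basic_evaluations(hypothesis, references, ref_emotions,
                                  txt2emo_clf=None, text2emo_vocab=None,
                                  methods_to_do={'bleu', 'metaphor'})
print(pd.concat(results, axis=1).T)   # one row per metric with its mean/std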
imageprocessing/artemis/artemis/in_out/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
4
+ """
imageprocessing/artemis/artemis/in_out/arguments.py ADDED
@@ -0,0 +1,199 @@
1
+ """
2
+ Argument handling.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at early 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+ import argparse
9
+ import json
10
+ import pprint
11
+ import pathlib
12
+ import os.path as osp
13
+ from datetime import datetime
14
+ from .basics import create_dir
15
+
16
+
17
+ def str2bool(v):
18
+ """ boolean values for argparse
19
+ """
20
+ if isinstance(v, bool):
21
+ return v
22
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
23
+ return True
24
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
25
+ return False
26
+ else:
27
+ raise argparse.ArgumentTypeError('Boolean value expected.')
28
+
29
+
30
+ def parse_train_speaker_arguments(notebook_options=None, save_args=False):
31
+ """ Default/Main arguments for training a SAT neural-speaker (via ArtEmis).
32
+ :param notebook_options: list, if you are using this via a jupyter notebook
33
+ :return: argparse.ArgumentParser
34
+ """
35
+
36
+ parser = argparse.ArgumentParser(description='training-a-neural-speaker')
37
+
38
+ ## Non-optional arguments
39
+ parser.add_argument('-log-dir', type=str, required=True, help='where to save training-progress, model, etc.')
40
+ parser.add_argument('-data-dir', type=str, required=True, help='path to ArtEmis/COCO preprocessed data')
41
+ parser.add_argument('-img-dir', type=str, required=True, help='path to top image (e.g., WikiArt) dir')
42
+
43
+ # Model parameters
44
+ parser.add_argument('--img-dim', type=int, default=256, help='images will be resized to a square with this many pixels per side')
45
+ parser.add_argument('--lanczos', type=str2bool, default=True, help='apply lanczos resampling when resizing')
46
+ parser.add_argument('--atn-spatial-img-size', type=int, help='optional; if provided, the spatial output of the '
+ 'visual encoder is average-pooled to this many x this many "pixels".')
48
+
49
+ parser.add_argument('--atn-cover-img-alpha', type=float, default=1, help='weight encouraging the attention to cover the '
+ 'entire image when marginalized over the tokens')
51
+ parser.add_argument('--attention-dim', type=int, default=512)
52
+ parser.add_argument('--rnn-hidden-dim', type=int, default=512)
53
+ parser.add_argument('--word-embedding-dim', type=int, default=128)
54
+ parser.add_argument('--vis-encoder', type=str, default='resnet34', choices=['resnet18',
55
+ 'resnet34',
56
+ 'resnet50',
57
+ 'resnet101'], help='visual-encoder backbone')
58
+ parser.add_argument('--dropout-rate', type=float, default=0.1)
59
+ parser.add_argument('--teacher-forcing-ratio', type=int, default=1)
60
+
61
+ parser.add_argument('--use-emo-grounding', type=str2bool, default=False)
62
+ parser.add_argument('--emo-grounding-dims', nargs=2, type=int, default=[9, 9], help='[input] number of emotions x the '
+ 'size of the projection layer that '
+ 'will be used to transform the one-hot emotion '
+ 'to a grounding vector.')
66
+
67
+
68
+ # Training parameters
69
+ parser.add_argument('--resume-path', type=str, help='model-path to resume from')
70
+ parser.add_argument('--fine-tune-data', type=str)
71
+ parser.add_argument('--batch-size', type=int, default=128)
72
+ parser.add_argument('--num-workers', type=int, default=6)
73
+ parser.add_argument('--gpu', type=str, default='0')
74
+ parser.add_argument('--encoder-lr', type=float, default=1e-4)
75
+ parser.add_argument('--decoder-lr', type=float, default=5e-4)
76
+ parser.add_argument('--max-train-epochs', type=int, default=50)
77
+ parser.add_argument('--train-patience', type=int, default=5, help='maximum consecutive epochs where the validation '
78
+ 'Neg-LL does not improve before we stop training.')
79
+ parser.add_argument('--lr-patience', type=int, default=2, help='maximum number of epochs to wait while the validation '
+ 'Neg-LL does not improve before we reduce the '
+ 'learning-rate.')
82
+ parser.add_argument('--save-each-epoch', type=str2bool, default=True, help='save the model at each epoch; else only save '
+ 'the one that achieved the minimal '
+ 'Negative-Log-Likelihood on the validation split.')
85
+
86
+ # Misc
87
+ parser.add_argument('--dataset', type=str, default='artemis')
88
+ parser.add_argument('--random-seed', type=int, default=2021)
89
+ parser.add_argument('--debug', default=False, type=str2bool)
90
+ parser.add_argument('--use-timestamp', default=True, type=str2bool)
91
+
92
+ # Parse arguments
93
+ if notebook_options is not None: # Pass options directly
94
+ args = parser.parse_args(notebook_options)
95
+ else:
96
+ args = parser.parse_args() # Read from command line.
97
+
98
+ if args.use_timestamp:
99
+ timestamp = datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
100
+ args.log_dir = create_dir(osp.join(args.log_dir, timestamp))
101
+
102
+ # pprint them
103
+ args_string = pprint.pformat(vars(args))
104
+ print(args_string)
105
+
106
+ if save_args:
107
+ out = osp.join(args.log_dir, 'config.json.txt')
108
+ with open(out, 'w') as f_out:
109
+ json.dump(vars(args), f_out, indent=4, sort_keys=True)
110
+
111
+ return args
112
+
113
+
114
+ def parse_test_speaker_arguments(notebook_options=None):
115
+ """ Parameters for testing (sampling) a neural-speaker.
116
+ :param notebook_options: list, if you are using this via a jupyter notebook
117
+ :return: argparse.ArgumentParser
118
+ """
119
+ parser = argparse.ArgumentParser(description='testing-a-neural-speaker')
120
+
121
+ ## Basic required arguments
122
+ parser.add_argument('-speaker-saved-args', type=str, required=True, help='config.json.txt file for saved speaker model (output of train_speaker.py)')
123
+ parser.add_argument('-speaker-checkpoint', type=str, required=True, help='saved model checkpoint ("best_model.pt" (output of train_speaker.py)')
124
+ parser.add_argument('-img-dir', type=str, required=True, help='path to top image dir (typically that\'s the WikiArt top-dir)')
125
+ parser.add_argument('-out-file', type=str, required=True, help='file to save the sampled utterances, their attention etc. as a pkl')
126
+
127
+ ## Basic optional arguments
128
+ parser.add_argument('--split', type=str, default='test', choices=['train', 'test', 'val', 'rest'], help='the split of the dataset you want to annotate; '
+ 'the code will load the dataset based on the dir-location marked '
+ 'in the input config.json.txt file. '
+ 'This param has no effect if a custom-data-csv is passed.')
132
+
133
+ parser.add_argument('--custom-data-csv', type=str, help='if you want to annotate your own set of images. Please '
134
+ 'see the code for what this csv should look like. ')
135
+
136
+ parser.add_argument('--subsample-data', type=int, default=-1, help='if not -1, will subsample the underlying dataset '
+ 'and annotate only this many images.')
138
+
139
+
140
+ ## Optional arguments controlling the generation/sampling process
141
+ parser.add_argument('--max-utterance-len', type=int, help='maximum allowed length for any sampled utterance. If not given, '
+ 'the maximum found in the underlying dataset split will be used. '
+ 'For the official ArtEmis split for deep-nets that is 30 tokens.')
144
+
145
+ parser.add_argument('--drop-unk', type=str2bool, default=True, help='if True, do not create samples that contain the '
146
+ 'unknown token')
147
+
148
+ parser.add_argument('--drop-bigrams', type=str2bool, default=True, help='if True, prevent the same bigram from occurring '
+ 'twice in a sampled utterance')
150
+
151
+
152
+ ## To pass multiple configurations for the sampler at once, i.e., so you can try many
+ ## sampling temperatures, sampling methods (beam-search vs. top-k), beam-sizes (or more),
+ ## you can provide a simple .json that specifies the values you want to try.
+ ## See >> data/speaker_sampling_configs << for examples.
+ ## Note: if you pass nothing, >> data/speaker_sampling_configs/selected_hyper_params.json.txt << will be used;
+ ## these are the parameters used in the paper.
158
+ parser.add_argument('--sampling-config-file', type=str, help='Note: if max-len, drop-unk '
+ 'and drop-bigrams are not specified in the json, '
+ 'the directly provided values of these parameters '
+ 'will be used.')
162
+
163
+
164
+ parser.add_argument('--random-seed', type=int, default=2021, help='if -1 it won\'t have an effect; else the sampler '
165
+ 'becomes deterministic')
166
+
167
+ parser.add_argument('--img2emo-checkpoint', type=str, help='checkpoint file of an image-to-emotion classifier that will '
+ 'be used to sample the grounding emotion consumed '
+ 'by the speaker, if you pass an emotionally-grounded speaker. '
+ 'Note: if you use an emo-grounded speaker this argument '
+ 'becomes required, unless you are using your own custom-data-csv '
+ 'where you can specify the grounding emotion manually.')
173
+
174
+ parser.add_argument('--gpu', type=str, default='0')
175
+ parser.add_argument('--n-workers', type=int)
176
+
177
+ parser.add_argument('--compute-nll', type=str2bool, default=False, help='Compute the negative-log-likelihood of '
+ 'the dataset under the saved speaker model.')
179
+
180
+
181
+
182
+ # Parse arguments
183
+ if notebook_options is not None: # Pass options directly
184
+ args = parser.parse_args(notebook_options)
185
+ else:
186
+ args = parser.parse_args() # Read from command line.
187
+
188
+ # load "default"
189
+ if args.sampling_config_file is None:
190
+ up_dir = osp.split(pathlib.Path(__file__).parent.absolute())[0]
191
+ args.sampling_config_file = osp.join(up_dir, 'data/speaker_sampling_configs/selected_hyper_params.json.txt')
192
+
193
+ # pprint them
194
+ print('\nParameters Specified:')
195
+ args_string = pprint.pformat(vars(args))
196
+ print(args_string)
197
+ print('\n')
198
+
199
+ return args
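An illustrative way of calling the test-time parser from a notebook (a sketch, not part of the file above); all paths are placeholders:

opts = ['-speaker-saved-args', '/path/to/config.json.txt',
        '-speaker-checkpoint', '/path/to/best_model.pt',
        '-img-dir', '/path/to/wikiart',
        '-out-file', '/tmp/samples.pkl',
        '--split', 'val',
        '--drop-bigrams', 'True']
args = parse_test_speaker_arguments(notebook_options=opts)
# args.sampling_config_file now points to the bundled selected_hyper_params.json.txt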
imageprocessing/artemis/artemis/in_out/basics.py ADDED
@@ -0,0 +1,230 @@
1
+ """
2
+ Basic (simple) I/O Utilities.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2019, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import re
10
+ import os
11
+ import json
12
+ import sys
13
+ import numpy as np
14
+ import pandas as pd
15
+ import os.path as osp
16
+ import pprint
17
+ import logging
18
+ from argparse import ArgumentParser
19
+ from IPython.display import display
20
+ from PIL import Image
21
+ from six.moves import cPickle, range
22
+ from ..emotions import ARTEMIS_EMOTIONS
23
+
24
+
25
+ def files_in_subdirs(top_dir, search_pattern):
26
+ join = osp.join
27
+ regex = re.compile(search_pattern)
28
+ for path, _, files in os.walk(top_dir):
29
+ for name in files:
30
+ full_name = join(path, name)
31
+ if regex.search(full_name):
32
+ yield full_name
33
+
34
+
35
+ def create_dir(dir_path):
36
+ """ Creates a directory (or nested directories) if they don't exist.
37
+ """
38
+ if not osp.exists(dir_path):
39
+ os.makedirs(dir_path)
40
+
41
+ return dir_path
42
+
43
+
44
+ def pickle_data(file_name, *args):
45
+ """Using (c)Pickle to save multiple python objects in a single file.
46
+ """
47
+ out_file = open(file_name, 'wb')
48
+ cPickle.dump(len(args), out_file, protocol=2)
49
+ for item in args:
50
+ cPickle.dump(item, out_file, protocol=2)
51
+ out_file.close()
52
+
53
+
54
+ def unpickle_data(file_name, python2_to_3=False):
55
+ """ Restore data previously saved with pickle_data().
56
+ :param file_name: file holding the pickled data.
57
+ :param python2_to_3: (boolean), if True, pickle happened under python2x, unpickling under python3x.
58
+ :return: an generator over the un-pickled items.
59
+ Note, about implementing the python2_to_3 see
60
+ https://stackoverflow.com/questions/28218466/unpickling-a-python-2-object-with-python-3
61
+ """
62
+ in_file = open(file_name, 'rb')
63
+ if python2_to_3:
64
+ size = cPickle.load(in_file, encoding='latin1')
65
+ else:
66
+ size = cPickle.load(in_file)
67
+
68
+ for _ in range(size):
69
+ if python2_to_3:
70
+ yield cPickle.load(in_file, encoding='latin1')
71
+ else:
72
+ yield cPickle.load(in_file)
73
+ in_file.close()
74
+
75
+
76
+ def load_raw_amt_csv_hit_responses(top_csv_folder, verbose=True, only_approved=True,
77
+ keep_cols=None, drop_rorschach=True, has_emotions=True):
78
+ """
79
+ :param top_csv_folder:
80
+ :param verbose:
81
+ :param only_approved:
82
+ :param keep_cols:
83
+ :param drop_rorschach:
84
+ :param has_emotions: set to False to load wiki-art annotations that are objective (OLA-dataset)
85
+ :return:
86
+ """
87
+
88
+ all_collected_csv = [f for f in files_in_subdirs(top_csv_folder, '.csv$')]
89
+
90
+ if verbose:
91
+ print('{} files loaded'.format(len(all_collected_csv)))
92
+
93
+ all_csv_names = [osp.basename(f) for f in all_collected_csv]
94
+ assert len(all_csv_names) == len(set(all_csv_names)) # unique names
95
+
96
+ all_dfs = []
97
+ for f in all_collected_csv: # load each .csv
98
+ df = pd.read_csv(f)
99
+ # print(df['AssignmentStatus'].unique())
100
+ in_submission_mode = (df['AssignmentStatus'] == 'Submitted').sum()
101
+ if in_submission_mode > 0:
102
+ print('In {}, {} examples are still in submitted mode.'.format(osp.basename(f), in_submission_mode))
103
+ if only_approved:
104
+ df = df[df['AssignmentStatus'] == 'Approved']
105
+ all_dfs.append(df)
106
+ df = pd.concat(all_dfs)
107
+
108
+ # Rename columns
109
+ new_cols = [c.replace('choice.', '') for c in [c.replace('Answer.', '') for c in df.columns]]
110
+ new_cols = [c.lower() for c in new_cols]
111
+ df.columns = new_cols
112
+ df = df.reset_index()
113
+
114
+ # Keep ML-related columns
115
+ ml_related_cols = ['workerid', 'input.image_url', 'utterance']
116
+ # Add potential extras requested at the input
117
+ if keep_cols is not None:
118
+ ml_related_cols += keep_cols
119
+
120
+ if has_emotions:
121
+ _, x = np.where(df[ARTEMIS_EMOTIONS])
122
+ emotion_chosen = pd.Series(np.array(ARTEMIS_EMOTIONS)[x], name='emotion')
123
+ df = pd.concat([df[ml_related_cols], emotion_chosen], axis=1)
124
+ else:
125
+ df = df[ml_related_cols]
126
+
127
+ # Derivative columns
128
+ def url_to_painting_name(x):
129
+ tokens = x.split('/')
130
+ return tokens[-1][:-len('.jpg')]
131
+
132
+ def url_to_art_style(x):
133
+ tokens = x.split('/')
134
+ return tokens[-2]
135
+
136
+ df['painting'] = df['input.image_url'].apply(url_to_painting_name)
137
+ df['art_style'] = df['input.image_url'].apply(url_to_art_style)
138
+ df = df.drop(['input.image_url'], axis=1)
139
+
140
+ if drop_rorschach:
141
+ df = df[df['art_style'] != 'test']
142
+ df.reset_index(inplace=True, drop=True)
143
+
144
+ if verbose:
145
+ print('Loading responses:', len(df))
146
+ print('Column Names:', [c for c in df.columns])
147
+
148
+ return df
149
+
150
+
151
+ def splitall(path):
152
+ """
153
+ Examples:
154
+ splitall('a/b/c') -> ['a', 'b', 'c']
155
+ splitall('/a/b/c/') -> ['/', 'a', 'b', 'c', '']
156
+
157
+ NOTE: https://www.oreilly.com/library/view/python-cookbook/0596001673/ch04s16.html
158
+ """
159
+ allparts = []
160
+ while 1:
161
+ parts = osp.split(path)
162
+ if parts[0] == path: # Sentinel for absolute paths.
163
+ allparts.insert(0, parts[0])
164
+ break
165
+ elif parts[1] == path: # Sentinel for relative paths.
166
+ allparts.insert(0, parts[1])
167
+ break
168
+ else:
169
+ path = parts[0]
170
+ allparts.insert(0, parts[1])
171
+ return allparts
172
+
173
+
174
+ def wikiart_file_name_to_style_and_painting(filename):
175
+ """
176
+ Assumes a filename of a painting of wiki-art.
177
+ :param filename:
178
+ :return:
179
+ """
180
+ s = splitall(filename)
181
+ return s[-2], s[-1][:-len('.jpg')]
182
+
183
+
184
+ def show_random_captions(df, top_img_dir):
185
+ painting, art_style = df.sample(1)[['painting', 'art_style']].iloc[0]
186
+ print(art_style, painting)
187
+ display(Image.open(osp.join(top_img_dir, art_style, painting + '.jpg')))
188
+ s = df[(df.painting == painting) & (df.art_style == art_style)]
189
+ for e, u in zip(s['emotion'], s['utterance']):
190
+ print('{}:\t{}'.format(e.upper(), u))
191
+
192
+
193
+ def read_saved_args(config_file, override_args=None, verbose=False):
194
+ """
195
+ :param config_file: json file containing arguments
196
+ :param override_args: dict e.g., {'gpu': '0'}
197
+ :param verbose:
198
+ :return:
199
+ """
200
+ parser = ArgumentParser()
201
+ args = parser.parse_args([])
202
+ with open(config_file, 'r') as f_in:
203
+ args.__dict__ = json.load(f_in)
204
+
205
+ if override_args is not None:
206
+ for key, val in override_args.items():
207
+ args.__setattr__(key, val)
208
+
209
+ if verbose:
210
+ args_string = pprint.pformat(vars(args))
211
+ print(args_string)
212
+
213
+ return args
214
+
215
+
216
+ def create_logger(log_dir, std_out=True):
217
+ logger = logging.getLogger()
218
+ logger.setLevel(logging.INFO)
219
+ formatter = logging.Formatter('%(asctime)s - %(message)s')
220
+
221
+ # Add logging to file handler
222
+ file_handler = logging.FileHandler(osp.join(log_dir, 'log.txt'))
223
+ file_handler.setLevel(logging.INFO)
224
+ file_handler.setFormatter(formatter)
225
+ logger.addHandler(file_handler)
226
+
227
+ # Add stdout to also print statements there
228
+ if std_out:
229
+ logger.addHandler(logging.StreamHandler(sys.stdout))
230
+ return logger
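An illustrative round-trip for pickle_data/unpickle_data (a sketch, not part of the file above); it assumes the package imports as artemis and that /tmp is writable:

from artemis.in_out.basics import pickle_data, unpickle_data

stats = {'n_annotations': 5}
tokens = ['a', 'calm', 'sea']
pickle_data('/tmp/demo.pkl', stats, tokens)                    # stores the object count, then each object
loaded_stats, loaded_tokens = unpickle_data('/tmp/demo.pkl')   # the generator yields them back in order
assert loaded_stats == stats and loaded_tokens == tokens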
imageprocessing/artemis/artemis/in_out/cleaning.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ Data Cleaning Utilities.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import pathlib
10
+ import os.path as osp
11
+ from tqdm import tqdm_notebook as tqdm
12
+ from ..in_out.basics import unpickle_data, splitall
13
+
14
+
15
+ def load_duplicate_paintings_of_wikiart(duplicates_pkl_file=None, verbose=True):
16
+ """ Return a list containing wikiArt paintings that are double-listed.
17
+ :param duplicates_pkl_file: (opt) pkl file containing the duplicate groups.
18
+ :return: (list of list) each sublist contains tuples like (art_style, painting) that are duplicates.
19
+
20
+ Note. If duplicates_pkl_file==None, the stored inside the repo .pkl file will be used. The duplicates indicated in
21
+ the .pkl were found by a combination of running the `fdupes' program and a manual check on Nearest-Neighbors of a
22
+ pretrained ResNet on ImageNet that had very small distances.
23
+ """
24
+ if duplicates_pkl_file is None:
25
+ up_dir = osp.split(pathlib.Path(__file__).parent.absolute())[0]
26
+ duplicates_pkl_file = osp.join(up_dir, 'data/wiki_art_duplicate_paintings.pkl')
27
+ # Note. This file contains the duplicate groups found as described in the docstring above.
28
+ duplicates_as_list = next(unpickle_data(duplicates_pkl_file))
29
+ if verbose:
30
+ print("Using {} groups of paintings that are visually identical (duplicates).".format(len(duplicates_as_list)))
31
+ return duplicates_as_list
32
+
33
+
34
+ def drop_duplicate_paintings(wiki_art_image_files, duplicate_groups=None):
35
+ """
36
+ :param wiki_art_image_files: (list) with filenames of the form xx/xx/art_style/painting.jpg
37
+ :param duplicate_groups: list of list, each item is a collection of (art_style, painting) tuples that are duplicates.
38
+ :return: a new list where from each duplicate group only one (the first) painting is kept.
39
+ """
40
+ if duplicate_groups is None:
41
+ duplicate_groups = load_duplicate_paintings_of_wikiart()
42
+
43
+ drop_these = set()
44
+ for dup_g in duplicate_groups:
45
+ drop_these.update(dup_g[1:]) # drop all but first
46
+
47
+ clean_img_files = []
48
+ dropped = 0
49
+ for img_file in wiki_art_image_files:
50
+ tokens = splitall(img_file)
51
+ painting = tokens[-1][:-len('.jpg')]
52
+ art_style = tokens[-2]
53
+ key = (art_style, painting)
54
+ if key in drop_these:
55
+ dropped += 1
56
+ else:
57
+ clean_img_files.append(img_file)
58
+ print('Dropping {} from {} paintings that are duplicates of one painting that is kept.'.format(dropped,
59
+ len(wiki_art_image_files)))
60
+ return clean_img_files
61
+
62
+
63
+ def merge_artemis_annotations_on_wikiart_duplicates(dataset_df, duplicate_groups=None, verbose=True):
64
+ """
65
+ :param dataset_df:
66
+ :param duplicate_groups:
67
+ :return:
68
+ """
69
+
70
+ if duplicate_groups is None:
71
+ duplicate_groups = load_duplicate_paintings_of_wikiart()
72
+
73
+ n_merged_stimuli = 0
74
+ for dup_g in tqdm(duplicate_groups):
75
+ keep_this = dup_g[0]
76
+ drop_these = dup_g[1:] # drop all but first
77
+ for stimulus in drop_these:
78
+ mask = (dataset_df['art_style'] == stimulus[0]) & (dataset_df['painting'] == stimulus[1])
79
+ n_merged_stimuli += sum(mask)
80
+ dataset_df.loc[mask, ['art_style']] = keep_this[0]
81
+ dataset_df.loc[mask, ['painting']] = keep_this[1]
82
+ if verbose:
83
+ print('{} stimuli were merged.'.format(n_merged_stimuli))
84
+ return dataset_df
85
+
86
+
87
+
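A hedged sketch of combining the de-duplication helpers (not part of the file above); the WikiArt directory is a placeholder:

from artemis.in_out.basics import files_in_subdirs
from artemis.in_out.cleaning import load_duplicate_paintings_of_wikiart, drop_duplicate_paintings

wiki_art_files = list(files_in_subdirs('/path/to/wikiart', '.jpg$'))
groups = load_duplicate_paintings_of_wikiart()          # uses the .pkl bundled with the repo
unique_files = drop_duplicate_paintings(wiki_art_files, duplicate_groups=groups)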
imageprocessing/artemis/artemis/in_out/coco.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ COCO related I/O operations
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 10/18/20, for Python 3.x
6
+ Copyright (c) 2020 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import os.path as osp
10
+
11
+ def coco_image_name_to_image_file(image_name, top_img_dir, year=2014):
12
+ if image_name.startswith('COCO_val'):
13
+ return osp.join(top_img_dir, 'val' + str(year), image_name)
14
+ elif image_name.startswith('COCO_train'):
15
+ return osp.join(top_img_dir, 'train' + str(year), image_name)
16
+ else:
17
+ raise ValueError
18
+
19
+
20
+ def karpathize(df):
21
+ ## Per Karpathy's tweet: restval is actually train.
22
+ df.loc[df.split == 'restval', 'split'] = 'train'  # avoid chained assignment; modifies df in place
23
+
24
+
25
+ def prepare_coco_dataframe_for_training(df, top_img_dir):
26
+ # assign file-names to each image
27
+ df = df.assign(image_files = df.image.apply(lambda x: coco_image_name_to_image_file(x, top_img_dir)))
28
+ # fix splits
29
+ karpathize(df)
30
+ return df
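Illustrative only (a sketch, not part of the file above): chaining the helpers on a preprocessed COCO dataframe; the csv path and image directory are placeholders:

import pandas as pd

df = pd.read_csv('/path/to/coco_preprocessed.csv')        # must carry 'image' and 'split' columns
df = prepare_coco_dataframe_for_training(df, '/path/to/coco/images')
print(df.split.value_counts())                            # former 'restval' rows now count as 'train'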
imageprocessing/artemis/artemis/in_out/datasets.py ADDED
@@ -0,0 +1,224 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Originally in 2020, for Python 3.x
4
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
5
+ """
6
+
7
+ import torch
8
+ import numpy as np
9
+ import pandas as pd
10
+ from PIL import Image
11
+ from torch.utils.data import Dataset, DataLoader
12
+ from ..evaluation.emotion_alignment import image_to_emotion
13
+ from ..emotions import emotion_to_int
14
+
15
+
16
+ class AffectiveCaptionDataset(Dataset):
17
+ """ Basically, an image, with a caption, and an indicated emotion.
18
+ """
19
+ def __init__(self, image_files, tokens, emotions, n_emotions=9, img_transform=None, one_hot_emo=True):
20
+ super(AffectiveCaptionDataset, self).__init__()
21
+ self.image_files = image_files
22
+ self.tokens = tokens
23
+ self.emotions = emotions
24
+ self.n_emotions = n_emotions
25
+ self.img_transform = img_transform
26
+ self.one_hot_emo = one_hot_emo
27
+
28
+ def __getitem__(self, index):
29
+ text = np.array(self.tokens[index]).astype(np.int64)
30
+
31
+ if self.image_files is not None:
32
+ img = Image.open(self.image_files[index])
33
+
34
+ if img.mode != 'RGB':
35
+ img = img.convert('RGB')
36
+
37
+ if self.img_transform is not None:
38
+ img = self.img_transform(img)
39
+ else:
40
+ img = []
41
+
42
+ if self.n_emotions > 0:
43
+ if self.one_hot_emo:
44
+ emotion = np.zeros(self.n_emotions, dtype=np.float32)
45
+ emotion[self.emotions[index]] = 1
46
+ else:
47
+ emotion = self.emotions[index]
48
+ else:
49
+ emotion = []
50
+
51
+ res = {'image': img, 'emotion': emotion, 'tokens': text, 'index': index}
52
+ return res
53
+
54
+ def __len__(self):
55
+ return len(self.tokens)
56
+
57
+
58
+ class ImageClassificationDataset(Dataset):
59
+ def __init__(self, image_files, labels=None, img_transform=None, rgb_only=True):
60
+ super(ImageClassificationDataset, self).__init__()
61
+ self.image_files = image_files
62
+ self.labels = labels
63
+ self.img_transform = img_transform
64
+ self.rgb_only = rgb_only
65
+
66
+ def __getitem__(self, index):
67
+ img = Image.open(self.image_files[index])
68
+
69
+ if self.rgb_only and img.mode is not 'RGB':
70
+ img = img.convert('RGB')
71
+
72
+ if self.img_transform is not None:
73
+ img = self.img_transform(img)
74
+
75
+ label = []
76
+ if self.labels is not None:
77
+ label = self.labels[index]
78
+
79
+ res = {'image': img, 'label': label, 'index': index}
80
+ return res
81
+
82
+ def __len__(self):
83
+ return len(self.image_files)
84
+
85
+
86
+ def sub_sample_dataloader(dataloader, sample_size, seed=None, shuffle=False):
87
+ """ Given any torch dataloader create a sub-sampled version of it.
88
+ :param dataloader:
89
+ :param sample_size:
90
+ :param seed:
91
+ :param shuffle:
92
+ :return: dataloader of Subset
93
+ """
94
+
95
+ dataset = dataloader.dataset
96
+ n_total = len(dataset)
97
+
98
+ if sample_size > n_total:
99
+ raise ValueError
100
+
101
+ if seed is not None:
102
+ torch.manual_seed(seed)
103
+
104
+ sb_dataset = torch.utils.data.random_split(dataset, [sample_size, n_total-sample_size])[0]
105
+ bsize = min(dataloader.batch_size, sample_size)
106
+ sample_loader = torch.utils.data.DataLoader(dataset=sb_dataset,
107
+ batch_size=bsize,
108
+ shuffle=shuffle,
109
+ num_workers=dataloader.num_workers)
110
+ return sample_loader
111
+
112
+
113
+
114
+ def sub_index_affective_dataloader(affective_dataloader, indices, shuffle=False):
115
+ """ Given a torch dataloader and a sequence of integers; extract the corresponding items of the
116
+ carried dataset on the specific indices and make a new dataloader with them.
117
+ :param affective_dataloader: torch.utils.data.DataLoader for AffectiveCaptionDataset
118
+ :param indices: sequence of integers indexing the underlying dataset (dataframe).
119
+ :param shuffle: shuffle the data of the resulting dataloader
120
+ :return: dataloader of AffectiveCaptionDataset
121
+ """
122
+ dataset = affective_dataloader.dataset
123
+ r_img_files = dataset.image_files.iloc[indices].copy()
124
+ r_tokens = dataset.tokens.iloc[indices].copy()
125
+ r_emotions = dataset.emotions.iloc[indices].copy()
126
+
127
+ r_img_files.reset_index(inplace=True, drop=True)
128
+ r_tokens.reset_index(inplace=True, drop=True)
129
+ r_emotions.reset_index(inplace=True, drop=True)
130
+
131
+ r_dset = AffectiveCaptionDataset(image_files=r_img_files, tokens=r_tokens,
132
+ emotions=r_emotions, img_transform=dataset.img_transform)
133
+
134
+ batch_size = min(len(indices), affective_dataloader.batch_size)
135
+
136
+ r_loader = torch.utils.data.DataLoader(r_dset,
137
+ shuffle=shuffle,
138
+ batch_size=batch_size,
139
+ num_workers=affective_dataloader.num_workers)
140
+ return r_loader
141
+
142
+
143
+ def group_annotations_per_image(affective_dataset):
144
+ """ Group the annotations per image.
145
+ :param affective_dataset: an AffectiveCaptionDataset
146
+ :return: for each image its tokens/emotions as pandas Dataframes
147
+ """
148
+ df = pd.concat([affective_dataset.image_files, affective_dataset.tokens, affective_dataset.emotions], axis=1)
149
+ tokens_grouped = df.groupby('image_files')['tokens_encoded'].apply(list).reset_index(name='tokens_encoded')
150
+ emotion_grouped = df.groupby('image_files')['emotion_label'].apply(list).reset_index(name='emotion')
151
+ assert all(tokens_grouped['image_files'] == emotion_grouped['image_files'])
152
+ return tokens_grouped['image_files'], tokens_grouped, emotion_grouped
153
+
154
+
155
+ def default_grounding_dataset_from_affective_loader(loader, img2emo_clf=None, device=None, n_workers=None):
156
+ """
157
+ Convenience function. Given a loader carrying an affective dataset, make a new loader only w.r.t.
158
+ unique images of the dataset, & optionally add to each image the emotion predicted by the img2emo_clf.
159
+ The new loader can be used to sample utterances over the unique images.
160
+ :param loader:
161
+ :param img2emo_clf:
162
+ :param device:
163
+ :return:
164
+ """
165
+ affective_dataset = loader.dataset
166
+ img_files, tokens, emotions = group_annotations_per_image(affective_dataset)
167
+
168
+ img_trans = affective_dataset.img_transform
169
+ batch_size = loader.batch_size
170
+
171
+ if n_workers is None:
172
+ n_workers = loader.num_workers
173
+
174
+ dummy = pd.Series(np.ones(len(img_files), dtype=int) * -1)
175
+
176
+ # possibly predict grounding emotions
177
+ if img2emo_clf is not None:
178
+ temp_dataset = ImageClassificationDataset(image_files=img_files,
179
+ img_transform=img_trans)
180
+ img_dataloader = DataLoader(temp_dataset, batch_size, num_workers=n_workers)
181
+ emo_pred_distribution = image_to_emotion(img2emo_clf, img_dataloader, device)
182
+
183
+ grounding_emo = pd.Series(emo_pred_distribution.argmax(-1).tolist()) # use maximizer of emotions.
184
+ else:
185
+ grounding_emo = dummy
186
+
187
+ new_dataset = AffectiveCaptionDataset(img_files, tokens=dummy, emotions=grounding_emo,
188
+ img_transform=img_trans)
189
+
190
+ new_loader = DataLoader(dataset=new_dataset, batch_size=batch_size, num_workers=n_workers)
191
+ return new_loader
192
+
193
+
194
+ def custom_grounding_dataset_similar_to_affective_loader(grounding_data_csv, loader, n_workers=None):
195
+ """
196
+ Convenience function. Given a csv indicating (grounding) images on the hard-drive and a loader carrying an affective
197
+ dataset, make a new loader with the csv images using the same configuration (e.g., img_transform) as the loader.
198
+ :param grounding_data_csv: (csv filename)
199
+ - has to have one column named "image_file" that corresponds to the file-names of the images.
200
+ - (optionally) can have also a "grounding_emotion" column with values like "contentment"
201
+ :param loader:
202
+ :return:
203
+ """
204
+ df = pd.read_csv(grounding_data_csv)
205
+ image_files = df['image_file']
206
+ dummy = pd.Series(np.ones(len(image_files), dtype=int) * -1)
207
+ if 'grounding_emotion' in df.columns:
208
+ emotions = df.grounding_emotion.apply(emotion_to_int)
209
+ else:
210
+ emotions = dummy
211
+
212
+ standard_dset = loader.dataset
213
+ custom_dataset = AffectiveCaptionDataset(image_files, dummy, emotions=emotions,
214
+ n_emotions=standard_dset.n_emotions,
215
+ img_transform=standard_dset.img_transform,
216
+ one_hot_emo=standard_dset.one_hot_emo)
217
+ if n_workers is None:
218
+ n_workers = loader.num_workers
219
+
220
+ custom_data_loader = torch.utils.data.DataLoader(dataset=custom_dataset,
221
+ batch_size=min(loader.batch_size, len(custom_dataset)),
222
+ num_workers=n_workers)
223
+ return custom_data_loader
224
+
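A hedged sketch (not part of the file above) of the minimal csv that custom_grounding_dataset_similar_to_affective_loader expects; 'loader' is assumed to be an existing DataLoader over an AffectiveCaptionDataset:

import pandas as pd

# one image per row, optionally with a grounding emotion (e.g., 'awe', 'sadness')
pd.DataFrame({'image_file': ['/path/to/painting_1.jpg', '/path/to/painting_2.jpg'],
              'grounding_emotion': ['awe', 'sadness']}).to_csv('/tmp/custom_grounding.csv', index=False)

custom_loader = custom_grounding_dataset_similar_to_affective_loader('/tmp/custom_grounding.csv', loader)
# each batch carries the keys 'image', 'emotion', 'tokens' and 'index'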
imageprocessing/artemis/artemis/in_out/neural_net_oriented.py ADDED
@@ -0,0 +1,336 @@
1
+ """
2
+ I/O routines directly related to torch-based neural-models & their (training etc.) dataset processing.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 10/2/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import random
11
+ import warnings
12
+ import numpy as np
13
+ import pandas as pd
14
+ import os.path as osp
15
+ import multiprocessing as mp
16
+ import torchvision.transforms as transforms
17
+
18
+ from ast import literal_eval
19
+ from PIL import Image
20
+
21
+ from .basics import read_saved_args
22
+ from .datasets import AffectiveCaptionDataset, ImageClassificationDataset
23
+ from ..utils.vocabulary import Vocabulary
24
+ from ..neural_models.show_attend_tell import describe_model as describe_sat
25
+
26
+
27
+ image_net_mean = [0.485, 0.456, 0.406]
28
+ image_net_std = [0.229, 0.224, 0.225]
29
+
30
+
31
+ def max_io_workers():
32
+ """Return the maximum number of I/O workers to use (all CPUs of the machine minus one, at least one)."""
33
+ return max(mp.cpu_count() - 1, 1)
34
+
35
+
36
+ def image_transformation(img_dim, lanczos=True):
37
+ """simple transformation/pre-processing of image data."""
38
+
39
+ if lanczos:
40
+ resample_method = Image.LANCZOS
41
+ else:
42
+ resample_method = Image.BILINEAR
43
+
44
+ normalize = transforms.Normalize(mean=image_net_mean, std=image_net_std)
45
+ img_transforms = dict()
46
+ img_transforms['train'] = transforms.Compose([transforms.Resize((img_dim, img_dim), resample_method),
47
+ transforms.ToTensor(),
48
+ normalize])
49
+
50
+ # Use same transformations as in train (since no data-augmentation is applied in train)
51
+ img_transforms['test'] = img_transforms['train']
52
+ img_transforms['val'] = img_transforms['train']
53
+ img_transforms['rest'] = img_transforms['train']
54
+ return img_transforms
55
+
56
+
57
+ def df_to_pytorch_dataset(df, args):
58
+ if args.num_workers == -1:
59
+ n_workers = max_io_workers()
60
+ else:
61
+ n_workers = args.num_workers
62
+
63
+ load_imgs = True
64
+ if hasattr(args, 'use_imgs') and not args.use_imgs: # build a dataset without images (e.g., text/emotion only)
65
+ load_imgs = False
66
+
67
+ one_hot_emo = True
68
+ if hasattr(args, 'one_hot_emo') and not args.one_hot_emo: # turn off the one-hot, keep the integer (e.g., when using xentropy)
69
+ one_hot_emo = False
70
+
71
+ img_transforms = None
72
+ if load_imgs:
73
+ img_transforms = image_transformation(args.img_dim, lanczos=args.lanczos)
74
+
75
+ if args.dataset == 'artemis':
76
+ datasets = pass_artemis_splits_to_datasets(df, load_imgs, img_transforms, args.img_dir, one_hot_emo=one_hot_emo)
77
+ elif args.dataset == 'ola': # Objective Language for Art.
78
+ datasets = pass_artemis_splits_to_datasets(df, load_imgs, img_transforms, args.img_dir, n_emotions=0)
79
+ elif args.dataset == 'coco':
80
+ datasets = pass_coco_splits_to_datasets(df, load_imgs, img_transforms)
81
+ else:
82
+ raise ValueError('training dataset not recognized.')
83
+
84
+ dataloaders = dict()
85
+ for split in datasets:
86
+ b_size = args.batch_size if split=='train' else args.batch_size * 2
87
+ dataloaders[split] = torch.utils.data.DataLoader(dataset=datasets[split],
88
+ batch_size=b_size,
89
+ shuffle=split=='train',
90
+ num_workers=n_workers)
91
+ return dataloaders, datasets
92
+
93
+
94
+ def pass_coco_splits_to_datasets(df, load_imgs, img_transforms, n_emotions=0):
95
+ datasets = dict()
96
+ for split, g in df.groupby('split'):
97
+ g.reset_index(inplace=True, drop=True) # so that direct ([]) indexing in get_item works
98
+ img_files = None
99
+ img_trans = None
100
+
101
+ if load_imgs:
102
+ img_files = g['image_files']
103
+ img_trans = img_transforms[split]
104
+
105
+ dataset = AffectiveCaptionDataset(img_files, g.tokens_encoded, g.emotion_label, img_transform=img_trans,
106
+ n_emotions=n_emotions)
107
+ datasets[split] = dataset
108
+ return datasets
109
+
110
+
111
+ def pass_artemis_splits_to_datasets(df, load_imgs, img_transforms, top_img_dir, n_emotions=9, one_hot_emo=True):
112
+ datasets = dict()
113
+ for split, g in df.groupby('split'):
114
+ g.reset_index(inplace=True, drop=True) # so that direct ([]) indexing in get_item works
115
+ img_files = None
116
+ img_trans = None
117
+
118
+ if load_imgs:
119
+ img_files = g.apply(lambda x : osp.join(top_img_dir, x.art_style, x.painting + '.jpg'), axis=1)
120
+ img_files.name = 'image_files'
121
+ img_trans = img_transforms[split]
122
+
123
+ dataset = AffectiveCaptionDataset(img_files, g.tokens_encoded, g.emotion_label, n_emotions=n_emotions,
124
+ img_transform=img_trans, one_hot_emo=one_hot_emo)
125
+
126
+ datasets[split] = dataset
127
+ return datasets
128
+
129
+
130
+ def read_preprocessed_data_df(args, verbose=False):
131
+ if args.dataset == 'artemis':
132
+ file_name = 'artemis_preprocessed.csv'
133
+ elif args.dataset == 'coco':
134
+ file_name = 'coco_preprocessed.csv'
135
+ else:
136
+ raise ValueError('Unknown Dataset.')
137
+
138
+ if hasattr(args, 'fine_tune_data') and args.fine_tune_data:
139
+ df = pd.read_csv(args.fine_tune_data) # allow explicit data passing
140
+ else:
141
+ df = pd.read_csv(osp.join(args.data_dir, file_name))
142
+
143
+ df.tokens_encoded = df.tokens_encoded.apply(literal_eval)
144
+
145
+ if verbose:
146
+ print('Loaded {} utterances'.format(len(df)))
147
+ return df
148
+
149
+
150
+ def image_emotion_distribution_df_to_pytorch_dataset(df, args, drop_thres=None):
151
+ """ Convert the pandas dataframe that carries information about images and emotion (distributions) to a
152
+ dataset that is amenable to deep-learning (e.g., for an image2emotion classifier).
153
+ :param df:
154
+ :param args:
155
+ :param drop_thres: (optional, float) if provided each distribution of the training will only consist of examples
156
+ for which the maximizing emotion aggregates more than this (drop_thres) mass.
157
+ :return: pytorch dataloaders & datasets
158
+ """
159
+ dataloaders = dict()
160
+ datasets = dict()
161
+ img_transforms = image_transformation(args.img_dim, lanczos=args.lanczos)
162
+
163
+ if args.num_workers == -1:
164
+ n_workers = max_io_workers()
165
+ else:
166
+ n_workers = args.num_workers
167
+
168
+ for split, g in df.groupby('split'):
169
+ g.reset_index(inplace=True, drop=True)
170
+
171
+ if split == 'train' and drop_thres is not None:
172
+ noise_mask = g['emotion_distribution'].apply(lambda x: max(x) > drop_thres)
173
+ print('Keeping {} of the training data, since for the rest the emotion-maximizer is too low.'.format(noise_mask.mean()))
174
+ g = g[noise_mask]
175
+ g.reset_index(inplace=True, drop=True)
176
+
177
+
178
+ img_files = g.apply(lambda x : osp.join(args.img_dir, x.art_style, x.painting + '.jpg'), axis=1)
179
+ img_files.name = 'image_files'
180
+
181
+ dataset = ImageClassificationDataset(img_files, g.emotion_distribution,
182
+ img_transform=img_transforms[split])
183
+
184
+ datasets[split] = dataset
185
+ b_size = args.batch_size if split=='train' else args.batch_size * 2
186
+ dataloaders[split] = torch.utils.data.DataLoader(dataset=dataset,
187
+ batch_size=b_size,
188
+ shuffle=split=='train',
189
+ num_workers=n_workers)
190
+ return dataloaders, datasets
191
+
192
+
193
+ def seed_torch_code(seed, strict=False):
194
+ """Control pseudo-randomness for reproducibility.
195
+ :param seed: (int) random-seed
196
+ :param strict: (boolean) if True, cudnn operates in a deterministic manner
197
+ """
198
+ random.seed(seed)
199
+ np.random.seed(seed)
200
+ torch.manual_seed(seed)
201
+ torch.cuda.manual_seed_all(seed)
202
+ if strict:
203
+ torch.backends.cudnn.deterministic = True
204
+ torch.backends.cudnn.benchmark = False
205
+
206
+
207
+ def save_state_dicts(checkpoint_file, epoch=None, **kwargs):
208
+ """ Save torch items with a state_dict
209
+ """
210
+ checkpoint = dict()
211
+
212
+ if epoch is not None:
213
+ checkpoint['epoch'] = epoch
214
+
215
+ for key, value in kwargs.items():
216
+ checkpoint[key] = value.state_dict()
217
+
218
+ torch.save(checkpoint, checkpoint_file)
219
+
220
+
221
+ def load_state_dicts(checkpoint_file, map_location=None, **kwargs):
222
+ """ Load torch items from saved state_dictionaries
223
+ """
224
+ if map_location is None:
225
+ checkpoint = torch.load(checkpoint_file)
226
+ else:
227
+ checkpoint = torch.load(checkpoint_file, map_location=map_location)
228
+
229
+ for key, value in kwargs.items():
230
+ value.load_state_dict(checkpoint[key])
231
+
232
+ epoch = checkpoint.get('epoch')
233
+ if epoch:
234
+ return epoch
235
+
236
+
237
+ def torch_save_model(model, path):
238
+ """ Wrap torch.save to catch standard warning of not finding the nested implementations.
239
+ :param model:
240
+ :param path:
241
+ :return:
242
+ """
243
+ with warnings.catch_warnings():
244
+ warnings.simplefilter("ignore")
245
+ return torch.save(model, path)
246
+
247
+
248
+ def torch_load_model(checkpoint_file, map_location=None):
249
+ """ Wrap torch.load to catch standard warning of not finding the nested implementations.
250
+ :param checkpoint_file:
251
+ :param map_location:
252
+ :return:
253
+ """
254
+ with warnings.catch_warnings():
255
+ warnings.simplefilter("ignore")
256
+ model = torch.load(checkpoint_file, map_location=map_location)
257
+ return model
258
+
259
+
260
+ def load_saved_speaker(args_file, model_ckp, with_data=False, override_args=None, verbose=False):
261
+ """
262
+ :param args_file: saved argparse arguments with model's description (and location of used data)
263
+ :param model_ckp: saved checkpoint with model's parameters.
264
+ :param with_data:
265
+ :param override_args:
266
+ :return:
267
+ Note, the model is loaded and returned in cpu.
268
+ """
269
+ if verbose:
270
+ print('Loading saved speaker trained with parameters:')
271
+ args = read_saved_args(args_file, override_args=override_args, verbose=verbose)
272
+
273
+ # Prepare empty model
274
+ vocab = Vocabulary.load(osp.join(args.data_dir, 'vocabulary.pkl'))
275
+ print('Using a vocabulary of size', len(vocab))
276
+ model = describe_sat(vocab, args)
277
+
278
+ # Load saved weights
279
+ epoch = load_state_dicts(model_ckp, model=model, map_location='cpu')
280
+ print('Loading speaker model at epoch {}.'.format(epoch))
281
+
282
+ # Load data
283
+ if with_data:
284
+ df = read_preprocessed_data_df(args, verbose=True)
285
+ data_loaders, _ = df_to_pytorch_dataset(df, args)
286
+ else:
287
+ data_loaders = None
288
+
289
+ return model, epoch, data_loaders
290
+
291
+
292
+ def deprocess_img(img, std=None, mean=None, clamp=None, inplace=False):
293
+ if not inplace:
294
+ img = img.clone()
295
+
296
+ if img.ndimension() == 4: # batch of images
297
+ pass
298
+ # single_img = False
299
+ elif img.ndimension() == 3: # single image
300
+ img = img.view([1] + list(img.shape))
301
+ # single_img = True
302
+ else:
303
+ raise ValueError()
304
+
305
+ dtype = img.dtype
306
+ n_channels = img.size(1)
307
+
308
+ if std is not None:
309
+ std = torch.as_tensor(std, dtype=dtype, device=img.device)
310
+ img.mul_(std.view([1, n_channels, 1, 1]))
311
+
312
+ if mean is not None:
313
+ mean = torch.as_tensor(mean, dtype=dtype, device=img.device)
314
+ img.add_(mean.view([1, n_channels, 1, 1]))
315
+
316
+ if clamp is not None:
317
+ img.clamp_(clamp[0], clamp[1])
318
+
319
+ return img
320
+
321
+
322
+ def to_img(tensor, mean=None, std=None):
323
+ """ Convert tensor object to PIL.Image(s)
324
+ :param tensor:
325
+ :param mean:
326
+ :param std:
327
+ :return:
328
+ """
329
+ image = tensor.clone().detach()
330
+ image = deprocess_img(image, mean=mean, std=std)
331
+ # Add 0.5 after un-normalizing to [0, 255] to round to nearest integer
332
+ array = image.mul_(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to('cpu', torch.uint8).numpy()
333
+ image = []
334
+ for im in array:
335
+ image.append(Image.fromarray(im))
336
+ return image
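An illustrative checkpoint round-trip with save_state_dicts/load_state_dicts (a sketch, not part of the file above), using a toy torch model:

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters())
save_state_dicts('/tmp/toy_ckpt.pt', epoch=3, model=model, optimizer=optimizer)

# later: restore into freshly built objects of the same shapes
model_2 = nn.Linear(4, 2)
optimizer_2 = torch.optim.Adam(model_2.parameters())
resumed_epoch = load_state_dicts('/tmp/toy_ckpt.pt', map_location='cpu', model=model_2, optimizer=optimizer_2)
assert resumed_epoch == 3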
imageprocessing/artemis/artemis/language/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
4
+ """
imageprocessing/artemis/artemis/language/adjective_noun_pairs.py ADDED
@@ -0,0 +1,44 @@
1
+ """
2
+ Some operations to handle Adjective-Noun Pairs. E.g., useful for sentiment injection
3
+
4
+ The MIT License (MIT)
5
+ Originally created mid 2020, for Python 3.x
6
+ Copyright (c) 2020 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ from collections import Counter
10
+ from .part_of_speech import nltk_parallel_tagging_of_tokens
11
+
12
+ def collect_anps_of_sentence(tokenized_pos_tagged_sentence, tagset='universal'):
13
+ """ return all ANPs that occur in consecutive positions.
14
+ tokenized_pos_tagged_sentence: list, containing the result of calling nltk.pos_tag on a tokenized sentence.
15
+ E.g., [('a', 'DT'), ('big', 'JJ'), ('man', 'NN')]
16
+ """
17
+ n_tokens = len(tokenized_pos_tagged_sentence)
18
+ collected = []
19
+
20
+ if tagset == 'universal':
21
+ for i, p in enumerate(tokenized_pos_tagged_sentence):
22
+ if p[1] == 'ADJ' and i < n_tokens -1:
23
+ if tokenized_pos_tagged_sentence[i+1][1] == 'NOUN':
24
+ collected.append(p[0] + ' ' + tokenized_pos_tagged_sentence[i+1][0])
25
+ elif tagset == 'penn':
26
+ for i, p in enumerate(tokenized_pos_tagged_sentence):
27
+ if p[1].startswith('J') and i < n_tokens -1:
28
+ if tokenized_pos_tagged_sentence[i+1][1].startswith('N'):
29
+ collected.append(p[0] + ' ' + tokenized_pos_tagged_sentence[i+1][0])
30
+ else:
31
+ raise ValueError()
32
+ return collected
33
+
34
+
35
+ def collect_anp_statistics_of_collection(token_series):
36
+ """ Compute ANP statistics, e.g., how frequent the ANP "happy man" is in the token_series.
37
+ :param token_series: pd.Series, each row is a tokenized sentence
38
+ :return:
39
+ """
40
+ part_of_s = nltk_parallel_tagging_of_tokens(token_series)
41
+ anps = part_of_s.apply(collect_anps_of_sentence)
42
+ anp_counter = Counter()
43
+ anps.apply(anp_counter.update)
44
+ return anp_counter, anps, part_of_s
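A small illustration (a sketch, not part of the file above) of the ANP extraction on an already POS-tagged sentence:

tagged = [('a', 'DET'), ('very', 'ADV'), ('happy', 'ADJ'), ('man', 'NOUN'), ('smiles', 'VERB')]
print(collect_anps_of_sentence(tagged, tagset='universal'))   # -> ['happy man']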
imageprocessing/artemis/artemis/language/basics.py ADDED
@@ -0,0 +1,132 @@
1
+ """
2
+ A set of functions that are useful for processing textual data.
3
+
4
+ The MIT License (MIT)
5
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
6
+ """
7
+
8
+ import pandas as pd
9
+ import multiprocessing as mp
10
+ from multiprocessing import Pool
11
+ from collections import defaultdict
12
+ from itertools import tee, islice
13
+ from symspellpy.symspellpy import SymSpell
14
+
15
+ from .language_preprocessing import unquote_words, expand_contractions
16
+ from .language_preprocessing import manual_sentence_spelling, manual_tokenized_sentence_spelling
17
+ from ..language.spelling import sentence_spelling_dictionary as artemis_sentence_spelling_dictionary
18
+ from ..language.spelling import token_spelling_dictionary as artemis_token_spelling_dictionary
19
+ from ..language.spelling import missing_from_glove_but_are_actual_words
20
+ from ..neural_models.word_embeddings import load_glove_pretrained_embedding
21
+
22
+
23
+
24
+ def ngrams(lst, n):
25
+ """ Return the ngrams of a list of tokens.
26
+ :param lst: the tokens
27
+ :param n: n of n-grams
28
+ :return:
29
+ """
30
+ tlst = lst
31
+ while True:
32
+ a, b = tee(tlst)
33
+ l = tuple(islice(a, n))
34
+ if len(l) == n:
35
+ yield l
36
+ next(b)
37
+ tlst = b
38
+ else:
39
+ break
40
+
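Illustrative only (not part of the file above), the bigrams produced by the generator defined above:

tokens = ['a', 'sad', 'lonely', 'figure']
print(list(ngrams(tokens, 2)))   # -> [('a', 'sad'), ('sad', 'lonely'), ('lonely', 'figure')]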
41
+
42
+ def parallel_apply(iterable, func, n_processes=None):
43
+ """ Apply func in parallel to chunks of the iterable based on multiple processes.
44
+ :param iterable:
45
+ :param func: simple function that does not change the state of global variables.
46
+ :param n_processes: (int) how many processes to split the data over
47
+ :return:
48
+ """
49
+ n_items = len(iterable)
50
+ if n_processes is None:
51
+ n_processes = min(4 * mp.cpu_count(), n_items)
52
+ pool = Pool(n_processes)
53
+ chunks = int(n_items / n_processes)
54
+ res = []
55
+ for data in pool.imap(func, iterable, chunksize=chunks):
56
+ res.append(data)
57
+ pool.close()
58
+ pool.join()
59
+ return res
60
+
61
+
62
+ def tokenize_and_spell(df, glove_file, freq_file, tokenizer, parallel=True, inplace=True, spell_check=True):
63
+ speller = SymSpell()
64
+ loaded = speller.load_dictionary(freq_file, term_index=0, count_index=1)
65
+ print('SymSpell spell-checker loaded:', loaded)
66
+ golden_vocabulary = load_glove_pretrained_embedding(glove_file, only_words=True, verbose=True)
67
+ golden_vocabulary = golden_vocabulary.union(missing_from_glove_but_are_actual_words)
68
+ print('Updating Glove vocabulary with *valid* ArtEmis words that are missing from it.')
69
+ missed_tokens = defaultdict(list)
70
+
71
+ def automatic_token_speller(token_list, max_edit_distance=1):
72
+ new_tokens = []
73
+ for token in token_list:
74
+ if token in golden_vocabulary:
75
+ new_tokens.append(token) # no spell check
76
+ else:
77
+ spells = speller.lookup(token, max_edit_distance)
78
+ if len(spells) > 0: # found a spelled checked version
79
+ new_tokens.append(spells[0].term)
80
+ else: # spell checking failed
81
+ context = " ".join(token_list)
82
+ missed_tokens[token].append(context)
83
+ new_tokens.append(token)
84
+ return new_tokens
85
+
86
+ if not spell_check:
87
+ automatic_token_speller = None
88
+
89
+ clean_text, tokens, spelled_tokens = pre_process_text(df.utterance,
90
+ artemis_sentence_spelling_dictionary,
91
+ artemis_token_spelling_dictionary,
92
+ tokenizer,
93
+ token_speller=automatic_token_speller,
94
+ parallel=parallel)
95
+
96
+ if inplace:
97
+ df['tokens'] = spelled_tokens
98
+ df['tokens_len'] = df.tokens.apply(lambda x : len(x))
99
+ df['utterance_spelled'] = df.tokens.apply(lambda x : ' '.join(x))
100
+ return missed_tokens
101
+ else:
102
+ return missed_tokens, spelled_tokens
103
+
104
+
105
+ def pre_process_text(text, manual_sentence_speller, manual_token_speller,
106
+ tokenizer, token_speller=None, parallel=True):
107
+
108
+ clean_text = text.apply(lambda x: manual_sentence_spelling(x, manual_sentence_speller)) # sentence-to-sentence map
109
+ clean_text = clean_text.apply(lambda x: x.lower())
110
+ clean_text = clean_text.apply(unquote_words)
111
+
112
+ if parallel:
113
+ clean_text = pd.Series(parallel_apply(clean_text, expand_contractions))
114
+ else:
115
+ clean_text = clean_text.apply(expand_contractions)
116
+
117
+ basic_punct = '.?!,:;/\-~*_=[–]{}$^@|%#<—>'
118
+ punct_to_space = str.maketrans(basic_punct, ' ' * len(basic_punct)) # map punctuation to space
119
+ clean_text = clean_text.apply(lambda x: x.translate(punct_to_space))
120
+
121
+ if parallel:
122
+ tokens = pd.Series(parallel_apply(clean_text, tokenizer))
123
+ else:
124
+ tokens = clean_text.apply(tokenizer)
125
+
126
+ spelled_tokens = tokens.apply(lambda x: manual_tokenized_sentence_spelling(x,
127
+ spelling_dictionary=manual_token_speller)
128
+ )
129
+ if token_speller is not None:
130
+ spelled_tokens = spelled_tokens.apply(token_speller)
131
+
132
+ return clean_text, tokens, spelled_tokens
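To make the intended entry point concrete, here is a minimal usage sketch of tokenize_and_spell. It assumes a pandas DataFrame with an 'utterance' column, NLTK's word_tokenize as the tokenizer (any callable mapping a string to a token list works), and local copies of a GloVe embedding file and the bundled SymSpell frequency dictionary; the paths are illustrative. parallel=False keeps the example free of multiprocessing concerns.

    import pandas as pd
    from nltk.tokenize import word_tokenize

    glove_file = 'glove.6B.100d.txt'  # illustrative path
    freq_file = 'artemis/data/symspell_frequency_dictionary_en_82_765.txt'  # illustrative path

    df = pd.DataFrame({'utterance': ['The colours make me feel calm.',
                                     'thisbpainting looks unfinished']})
    missed = tokenize_and_spell(df, glove_file, freq_file, word_tokenize, parallel=False)

    # df now carries 'tokens', 'tokens_len' and 'utterance_spelled' columns.
    print(df[['tokens_len', 'utterance_spelled']])
    print('tokens with no spelling suggestion:', dict(missed))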
imageprocessing/artemis/artemis/language/language_preprocessing.py ADDED
@@ -0,0 +1,224 @@
1
+ """
2
+ A set of functions useful for pre-processing textual data: normalizing words, spelling correction, etc.
3
+
4
+ The MIT License (MIT)
5
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
6
+ """
7
+
8
+ import re
9
+
10
+ contractions_dict = {
11
+ "ain't": "am not",
12
+ "aren't": "are not",
13
+ "can't": "cannot",
14
+ "can't've": "cannot have",
15
+ "'cause": "because",
16
+ "could've": "could have",
17
+ "couldn't": "could not",
18
+ "couldn't've": "could not have",
19
+ "didn't": "did not",
20
+ "doesn't": "does not",
21
+ "don't": "do not",
22
+ "hadn't": "had not",
23
+ "hadn't've": "had not have",
24
+ "hasn't": "has not",
25
+ "haven't": "have not",
26
+ "he'd": "he had",
27
+ "he'd've": "he would have",
28
+ "he'll": "he will",
29
+ "he'll've": "he will have",
30
+ "he's": "he is",
31
+ "how'd": "how did",
32
+ "how'd'y": "how do you",
33
+ "how'll": "how will",
34
+ "how's": "how is",
35
+ "i'd": "I had",
36
+ "i'd've": "I would have",
37
+ "i'll": "I will",
38
+ "i'll've": "I will have",
39
+ "i'm": "I am",
40
+ "i've": "I have",
41
+ "isn't": "is not",
42
+ "it'd": "it had",
43
+ "it'd've": "it would have",
44
+ "it'll": "it will",
45
+ "it'll've": "iit will have",
46
+ "it's": "it is",
47
+ "let's": "let us",
48
+ "ma'am": "madam",
49
+ "mayn't": "may not",
50
+ "might've": "might have",
51
+ "mightn't": "might not",
52
+ "mightn't've": "might not have",
53
+ "must've": "must have",
54
+ "mustn't": "must not",
55
+ "mustn't've": "must not have",
56
+ "needn't": "need not",
57
+ "needn't've": "need not have",
58
+ "o'clock": "of the clock",
59
+ "oughtn't": "ought not",
60
+ "oughtn't've": "ought not have",
61
+ "shan't": "shall not",
62
+ "sha'n't": "shall not",
63
+ "shan't've": "shall not have",
64
+ "she'd": "she had",
65
+ "she'd've": "she would have",
66
+ "she'll": "she will",
67
+ "she'll've": "she will have",
68
+ "she's": "she is",
69
+ "should've": "should have",
70
+ "shouldn't": "should not",
71
+ "shouldn't've": "should not have",
72
+ "so've": "so have",
73
+ "so's": "so is",
74
+ "that'd": "that had",
75
+ "that'd've": "that would have",
76
+ "that's": "that is",
77
+ "there'd": "there had",
78
+ "there'd've": "there would have",
79
+ "there's": "there is",
80
+ "they'd": "they had",
81
+ "they'd've": "they would have",
82
+ "they'll": "they will",
83
+ "they'll've": "they will have",
84
+ "they're": "they are",
85
+ "they've": "they have",
86
+ "to've": "to have",
87
+ "wasn't": "was not",
88
+ "we'd": "we had",
89
+ "we'd've": "we would have",
90
+ "we'll": "we will",
91
+ "we'll've": "we will have",
92
+ "we're": "we are",
93
+ "we've": "we have",
94
+ "weren't": "were not",
95
+ "what'll": "what will",
96
+ "what'll've": "what will have",
97
+ "what're": "what are",
98
+ "what's": "what is",
99
+ "what've": "what have",
100
+ "when's": "when is",
101
+ "when've": "when have",
102
+ "where'd": "where did",
103
+ "where's": "where is",
104
+ "where've": "where have",
105
+ "who'll": "who will",
106
+ "who'll've": "who will have",
107
+ "who's": "who is",
108
+ "who've": "who have",
109
+ "why's": "why is",
110
+ "why've": "why have",
111
+ "will've": "will have",
112
+ "won't": "will not",
113
+ "won't've": "will not have",
114
+ "would've": "would have",
115
+ "wouldn't": "would not",
116
+ "wouldn't've": "would not have",
117
+ "y'all": "you all",
118
+ "y'all'd": "you all would",
119
+ "y'all'd've": "you all would have",
120
+ "y'all're": "you all are",
121
+ "y'all've": "you all have",
122
+ "you'd": "you had",
123
+ "you'd've": "you would have",
124
+ "you'll": "you will",
125
+ "you'll've": "you will have",
126
+ "you're": "you are",
127
+ "you've": "you have",
128
+ "do'nt": "do not",
129
+ "does\'nt": "does not"
130
+ }
131
+
132
+ CONTRACTION_RE = re.compile('({})'.format('|'.join(contractions_dict.keys())),
133
+ flags=re.IGNORECASE | re.DOTALL)
134
+
135
+
136
+ def expand_contractions(text, contractions=None, lower_i=True):
137
+ """ Expand the contractions of the text (if any).
138
+ Example: You're a good father. -> you are a good father.
139
+ :param text: (string)
140
+ :param contractions: (dict)
141
+ :param lower_i: boolean, if True (I'm -> 'i am' not 'I am')
142
+ :return: (string)
143
+
144
+ Note:
145
+ Side-effect: lower-casing. E.g., You're -> you are.
146
+ """
147
+ if contractions is None:
148
+ contractions = contractions_dict  # use the dictionary defined in this module
149
+
150
+ def expand_match(contraction):
151
+ match = contraction.group(0)
152
+ expanded_contraction = contractions.get(match)
153
+ if expanded_contraction is None:
154
+ expanded_contraction = contractions.get(match.lower())
155
+ if lower_i:
156
+ expanded_contraction = expanded_contraction.lower()
157
+ return expanded_contraction
158
+
159
+ expanded_text = CONTRACTION_RE.sub(expand_match, text)
160
+ return expanded_text
161
+
162
+
163
+ QUOTES_RE_STR = r"""(?:['|"][\w]+['|"])""" # Words wrapped in single or double quotes.
164
+ QUOTES_RE = re.compile(r"(%s)" % QUOTES_RE_STR, flags=re.VERBOSE | re.IGNORECASE | re.UNICODE)
165
+
166
+
167
+ def unquote_words(s):
168
+ """ 'king' - > king, "queen" -> queen """
169
+ iterator = QUOTES_RE.finditer(s)
170
+ new_sentence = list(s)
171
+ for match in iterator:
172
+ start, end = match.span()
173
+ new_sentence[start] = ' '
174
+ new_sentence[end-1] = ' '
175
+ new_sentence = "".join(new_sentence)
176
+ return new_sentence
177
+
178
+
179
+ def manual_sentence_spelling(x, spelling_dictionary):
180
+ """
181
+ Applies a spelling correction to an entire sentence, if x is a key of the spelling_dictionary.
182
+ :param x: (string) sentence to potentially be corrected
183
+ :param spelling_dictionary: correction map
184
+ :return: the sentence corrected
185
+ """
186
+ if x in spelling_dictionary:
187
+ return spelling_dictionary[x]
188
+ else:
189
+ return x
190
+
191
+
192
+ def manual_tokenized_sentence_spelling(tokens, spelling_dictionary):
193
+ """
194
+ :param tokens: (list of tokens) to potentially be corrected
195
+ :param spelling_dictionary: correction map
196
+ :return: a list of corrected tokens
197
+ """
198
+ new_tokens = []
199
+ for token in tokens:
200
+ if token in spelling_dictionary:
201
+ res = spelling_dictionary[token]
202
+ if type(res) == list:
203
+ new_tokens.extend(res)
204
+ else:
205
+ new_tokens.append(res)
206
+ else:
207
+ new_tokens.append(token)
208
+ return new_tokens
209
+
210
+
211
+ # noinspection PyInterpreter
212
+ if __name__ == "__main__":
213
+ import pandas as pd
214
+ text = pd.DataFrame({'data': ["I'm a 'good' MAN", "You can't be likee this."]})
215
+ print("Original Text:")
216
+ print(text.data)
217
+
218
+ manual_speller = {'You can\'t be likee this.': 'You can\'t be like this.'}
219
+ text.data = text.data.apply(lambda x: manual_sentence_spelling(x, manual_speller))
220
+ text.data = text.data.apply(lambda x: x.lower())
221
+ text.data = text.data.apply(unquote_words)
222
+ text.data = text.data.apply(expand_contractions)
223
+ print("Corrected Text:")
224
+ print(text.data)
imageprocessing/artemis/artemis/language/part_of_speech.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ Fast part-of-speech tagging via two libraries (NLTK and spaCy).
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x - last updated in early 2021.
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+ import dask.dataframe as dd
9
+ import multiprocessing as mp
10
+ from nltk.tag import pos_tag
11
+
12
+ try:
13
+ import spacy
14
+ except ImportError:  # spaCy is optional; only needed by spacy_pos_tagging
15
+ pass
16
+
17
+
18
+ def nltk_parallel_tagging_of_tokens(tokens, n_partitions=None, tagset='universal'):
19
+ """ pos-tagging
20
+ :param tokens: pd.Series with tokenized utterances as rows. e.g., [['a', 'man'], ['a', 'big', 'man'], ...]
21
+ :return: a pd.Series with the result of applying pos_tag in each row. e.g.,
22
+ [[('a', 'DET'), ('man', 'NOUN')], [('a', 'DET'), ('big', 'ADJ'), ('man', 'NOUN')]] (with the default 'universal' tagset)
23
+ """
24
+ if n_partitions is None:
25
+ n_partitions = mp.cpu_count() * 4
26
+ ddata = dd.from_pandas(tokens, npartitions=n_partitions)
27
+ tagged_tokens =\
28
+ ddata.map_partitions(lambda x: x.apply((lambda y: pos_tag(y, tagset=tagset)))).compute(scheduler='processes')
29
+
30
+ return tagged_tokens
31
+
32
+
33
+ def spacy_pos_tagging(utterances, nlp=None):
34
+ if nlp is None:
35
+ nlp = spacy.load('en_core_web_sm')
36
+
37
+ utters = utterances.astype('unicode').values
38
+ docs = nlp.pipe(utters, batch_size=1000, n_threads=-1)
39
+ pos = [[t.pos_ for t in d if not t.is_space] for d in docs]
40
+ return pos
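A quick usage sketch for the NLTK-based tagger above (it assumes the NLTK 'averaged_perceptron_tagger' and 'universal_tagset' resources have been downloaded):

    import pandas as pd

    tokens = pd.Series([['a', 'man'], ['a', 'big', 'man']])
    tagged = nltk_parallel_tagging_of_tokens(tokens, n_partitions=2)
    print(tagged.tolist())
    # e.g., [[('a', 'DET'), ('man', 'NOUN')], [('a', 'DET'), ('big', 'ADJ'), ('man', 'NOUN')]]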
imageprocessing/artemis/artemis/language/spelling.py ADDED
@@ -0,0 +1,634 @@
1
+ """
2
+ Auxiliary spelling utilities.
3
+
4
+ It's called [may-rah-kee]: https://travelwithmeraki.com/meaning-of-meraki/
5
+
6
+ The MIT License (MIT)
7
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab.
8
+ """
9
+
10
+
11
+ ##
12
+ ## Dictionary, mapping entire "raw" collected sentences to new sentences. These were emailed separately from the AMT submissions
13
+ ## by meticulous (and good!) annotators who wanted to correct their original submissions.
14
+ ##
15
+ sentence_spelling_dictionary = {
16
+ 'thewaytheshapeschangethespace': 'the way the shapes change the space',
17
+ 'brightcolorsandanimals': 'bright colors and animals',
18
+ 'calmweatherandpeopleworkingtogether': 'calm weather and people working together',
19
+ 'thesoftcolors': 'the soft colors',
20
+ 'itlookslikeapeacefullocation': 'it looks like a peaceful location',
21
+ 'Iliketoplayinthesnow': 'I like to play in the snow',
22
+ 'thecalmwaters': 'the calm waters',
23
+ 'itseemsincompletesomehow': 'it seems incomplete somehow',
24
+ 'thefiguresseemsomehowawkward': 'the figures seem somehow awkward',
25
+ 'thegreenandlackofpeople': 'the green and lack of people',
26
+ 'boatingisanadventure': 'boating is an adventure',
27
+ 'thereseemstobetoomuchsky': 'there seems to be too much sky',
28
+ 'onthewholeitlookslikeawallpaperswatch': 'on the whole it looks like a wallpapers watch',
29
+ 'thecolorofthewater': 'the color of the water',
30
+ 'alloftheplantsandthewatershowingthroughthem': 'all of the plants and the water showing through them',
31
+ 'thecolorcombination': 'the color combination',
32
+ 'theslashesacrosstheworkandthedarkcolors': 'the slashes across the work and the dark colors',
33
+ 'thesimplicityandopenness': 'the simplicity and openness',
34
+ 'theproportionbetweenheadandbody': 'the proportion between head and body',
35
+ 'thebrightcolorsandthefeelingofmotion': 'the bright colors and the feeling of motion',
36
+ 'thedetailinthehands': 'the detail in the hands',
37
+ 'thelengthofthewoman\'sneck': 'the length of the woman\'s neck',
38
+ 'thedifferentflocksofbirdsinthesky': 'the different flocks of birds in the sky',
39
+ 'theskillshownindrawingthefigures': 'the skill shown in drawing the figures',
40
+ 'thenaturallookingskintone': 'the natural looking skin tone',
41
+ 'theyeadsthatdon\'tseemtobeattatchedtoanything': 'the yeads that don\'t seem to be attatched to anything',
42
+ 'themasksheisholding': 'the mask she is holding',
43
+ 'theapparantageofthepiece': 'the apparent age of the piece',
44
+ 'thepinkbows': 'the pink bows',
45
+ 'theintensityonthefaceofthemaninfrontofthewoman.': 'the intensity on the face of the man in front of the woman',
46
+ 'theshapesandcolors-itlookshard,painful': 'the shapes and colors - it looks hard, painful',
47
+ 'thewaytheguitarisbrokenupandmagnifiedbutstillidentifiable.': 'the way the guitar is broken up and magnified but still identifiable',
48
+ 'veryimpressedwiththewaytheartistcreatedlight': 'very impressed with the way the artist created light',
49
+ 'itlookslikepeoplearewaitingforsomeeventtohappenlikeaboatraceorsomething': 'it looks like people are waiting for some event to happen like a boat race or something',
50
+ 'wonderingifthisisapaintingoratextile': 'wondering if this is a painting or a textile',
51
+ 'itseemstoodarkfortheactivity': 'it seems too dark for the activity',
52
+ 'Ithinkitskindofweirdhowherhipsmakealmostacircle': 'I think its kind of weird how her hips make almost a circle',
53
+ 'thesoftnessofthefiguremakesitfeellikeiamintrudingonanintimatemoment': 'the softness of the figure makes it feel like i am intruding on an intimate moment',
54
+ 'Idon\'tseepieceslikethisas\'art\'itcouldbethewallinsomeone\'shouse': 'I don\'t see pieces like this as \'art\' it could be the wall in someone\'s house',
55
+ 'thereflectionofthetreesinthepool': 'the reflection of the trees in the pool',
56
+ 'thepainonhisface': 'the pain on his face',
57
+ 'itremindsmeofastringofbeadsasmallchildwouldmake': 'it reminds me of a string of beads a small child would make',
58
+ 'thefigurebeneaththetreeappearsveryrelaxed': 'the figure beneath the tree appears very relaxed',
59
+ 'thefacialexpressionisveryloving': 'the facial expression is very loving',
60
+ 'thecolorsandactivitymakeitlooklikeafunplacetobe': 'the colors and activity make it look like a fun place to be',
61
+ 'itlookslikeaniceareatogoforawalk': 'it looks like an ice area to go for a walk',
62
+ 'theyappeartobeayoungcoupleinlove': 'they appear to be a young couple in love',
63
+ 'allofthelittledetailsareamazing': 'all of the little details are amazing',
64
+ 'knowledgeofthehistoryassociatedwiththeperson': 'knowledge of the history associated with the person',
65
+ 'itlookslikeaveryoldpaintedtextile': 'it looks like a very old painted textile',
66
+ 'allofthebrightcolorsjustmakemehappy': 'all of the bright colors just make me happy',
67
+ 'dificultysortingoutwhetherthefigureismaleorfemale': 'dificulty sorting out whether the figure is male or female',
68
+ 'Itseemskindoflikeaposteryou\'dputinaclassroom.': 'It seems kind of like a poster you\'d put in a classroom',
69
+ 'Theexpressionontheman\'sfaceappearsangry.': 'The expression on the man\'s face appears angry.',
70
+ 'it\'d dark and creepy and weirdly sexual in a bad way.': 'it\'s dark and creepy and weirdly sexual in a bad way.',
71
+ 'looks disgusting looks like a cross desser disgusting': 'looks disgusting looks like a cross dresser disgusting',
72
+ 'Big skirts and bloomers on dancing ladies definitelymake the mood excitement.': 'big skirts and bloomers on dancing ladies definitely make an exciting mood',
73
+ 'The rays eminntating from sum over the town os exhilarating': 'the rays emanating from sun over the town are exhilarating',
74
+ 'the way the artist uses black and white really gives this a different feel tp the painting': 'the way the artist uses black and white really gives this a different feel to the painting',
75
+ 'This reminds me of jumbled graffiti I saw and had tp clean up every once in a while when I was younger': 'This reminds me of jumbled graffiti I saw and had to clean up every once in a while when I was younger',
76
+ 'looks like the cabins i stayed in on my trip tp a dude ranch': 'looks like the cabins i stayed in on my trip to a dude ranch',
77
+ 'A young woman applies make-up tp her face as she sits in a pretty robe. A pleasant but unfished work of art.': 'A young woman applies make-up to her face as she sits in a pretty robe. A pleasant but unfinished work of art.',
78
+ 'old and not kept up well, bug nice scene of a man at work': 'old and not kept up well, but nice scene of a man at work',
79
+ 'Again this painting is dill its lifeless and no color.': 'Again this painting is dull, it is lifeless and has no color.',
80
+ 'she smile smartly': 'she smiles smartly',
81
+ 'The mountain makes me think of a strong safehold, and a feeling of shelter.': 'The mountain makes me think of a stronghold giving me a feeling of shelter',
82
+ 'The detail in the surundsing like the clock tower and statue make it more inspiering': 'The detail in the surroundings like the clock tower and statue make it more inspiring',
83
+ 'The man is chained down and left to be attacked by a bird of prey while another man non chalantly watches what is taking place.': 'The man is chained down and left to be attacked by a bird of prey while another man nonchalantly watches what is taking place.',
84
+ 'This is a beautiful scene with the mosques in the background and the vegetation in the front.': 'This is a beautiful scene with the onion dome churches in the background and the vegetation in the front.',
85
+ 'The colors make me feel like I\'m looking at someone important. I feel a since of awe over them because of their attire.': 'The colors make me feel like I\'m looking at someone important. I feel a sense of awe over them because of their attire.',
86
+ 'the detective ihas found the diar everyone knew she kept and now hopefully he wi findout what led up to her breakdown ': 'the detective has found the diary everyone knew she kept and now hopefully he will find out what led up to her breakdown',
87
+ 'The VanGoghishness of this makes me smile and wonder how they do it.': 'The Van Gogh like quality of this makes me smile and wonder how they do it.',
88
+ 'The colors and shapes compliment each otherakes look like a adult childs painting.': 'The colors and shapes compliment each other and looks like an adult child\'s painting.',
89
+ 'I prefer more realistic still ifes.': 'I prefer more realistic still lifes.'
90
+ }
91
+
92
+
93
+
94
+
95
+ ##
96
+ ## Dictionary, mapping words to replacement words to densify the dataset (e.g., colour -> color), or more likely to spell
97
+ ## check them. Curated manually by Panos circa 2020.
98
+ ##
99
+ token_spelling_dictionary = {'colour': 'color',
100
+ 'colours': 'colors',
101
+ 'thecountry': ['the', 'country'],
102
+ 'minimamistic': 'minimalistic',
103
+ 'littlefinger': ['little', 'finger'],
104
+ 'im': ['i', 'am'],
105
+ 'greatfull': 'grateful',
106
+ 'skinnydippers': ['skinny', 'dippers'],
107
+ 'goingnon': ['going', 'on'],
108
+ 'rainclouds': ['rain', 'clouds'],
109
+ 'lillypads': ['lily', 'pads'],
110
+ 'paintinglike': ['painting', 'like'],
111
+ 'somekind': ['some', 'kind'],
112
+ 'overexaggerated': ['over', 'exaggerated'],
113
+ 'smokeshop': ['smoke', 'shop'],
114
+ 'fearinspiring': ['fear', 'inspiring'],
115
+ 'thebackround': ['the', 'background'],
116
+ 'raincloud': ['rain', 'cloud'],
117
+ 'wideopen': ['wide', 'open'],
118
+ 'crusifiction': 'crucifixion',
119
+ 'tablesetting': ['table', 'setting'],
120
+ 'vividcolors': ['vivid', 'colors'],
121
+ 'willhave': ['will', 'have'],
122
+ 'thisbpainting': ['this', 'painting'],
123
+ 'alongthis': ['along', 'this'],
124
+ 'crucifications': 'crucifixion',
125
+ 'overexaggeration': ['over', 'exaggeration'],
126
+ 'snacktime': ['snack', 'time'],
127
+ 'beaurocratic': 'bureaucratic',
128
+ 'nonsensicalness': ['nonsensical', 'ness'],
129
+ 'chubbyness': 'chubbiness',
130
+ 'distatestful': 'distasteful',
131
+ 'disapportioned': 'disproportionate',
132
+ 'becauseofthe': ['because', 'of', 'the'],
133
+ 'hahahahaha': 'haha',
134
+ 'hahahahaa': 'haha',
135
+ 'annoniminity': 'anonymity',
136
+ 'realisticand': ['realistic', 'and'],
137
+ 'feellike': ['feel', 'like'],
138
+ 'clostiphobic': 'claustrophobic',
139
+ 'thegolden': ['the', 'golden'],
140
+ 'minimalstic': 'minimalistic',
141
+ 'artdeco': ['art', 'deco'],
142
+ 'paddleboards': ['paddle', 'boards'],
143
+ 'fitbtogetherv': ['fit', 'together'],
144
+ 'doingthat': ['doing', 'that'],
145
+ 'stormclouds': ['storm', 'clouds'],
146
+ 'feelanxious': ['feel', 'anxious'],
147
+ 'withpeople':['with', 'people'],
148
+ 'nuditythen': ['nudity', 'then'],
149
+ 'whatbappears': ['what', 'appears'],
150
+ 'womenafter': ['women', 'after'],
151
+ 'funerallike': ['funeral', 'like'],
152
+ 'thebridge': ['the', 'bridge'],
153
+ 'focalpoint': ['focal', 'point'],
154
+ 'crussifiction': 'crucifixion',
155
+ 'extrocinary': 'extraordinary',
156
+ 'adrodgenous': 'androgynous',
157
+ 'whimsacle': 'whimsical',
158
+ 'nonabrasive': ['non', 'abrasive'],
159
+ 'alienlike': ['alien', 'like'],
160
+ 'intricitally': 'intricately',
161
+ 'straightlines': ['straight', 'lines'],
162
+ 'shouldnt': ['should', 'not'],
163
+ 'favortire': 'favorite',
164
+ 'downsyndrome': ['down', 'syndrome'],
165
+ 'silluete': 'silhouette',
166
+ 'provideenough': ['provide', 'enough'],
167
+ 'waterpainting':['water', 'painting'],
168
+ 'the19th': ['the', 'nineteenth'],
169
+ 'oldfashoned': ['old', 'fashioned'],
170
+ 'colorblocking': ['color', 'blocking'],
171
+ 'testiculates': 'gesticulates',
172
+ 'notknow': ['not', 'know'],
173
+ 'crucifixiction': 'crucifixion',
174
+ 'cruxifiction': 'crucifixion',
175
+ 'contementent': 'contentment',
176
+ 'underconstruction': ['under', 'construction'],
177
+ 'cartoonfrom': ['cartoon', 'from'],
178
+ 'downwardlooks': ['downward', 'looks'],
179
+ 'unrelateable': 'unrelatable',
180
+ 'ofvthose': ['of', 'those'],
181
+ 'rainbowlike': ['rainbow', 'like'],
182
+ 'thegesture': ['the', 'gesture'],
183
+ 'pencilwork': ['pencil', 'work'],
184
+ 'perfectlycovered': ['perfectly', 'covered'],
185
+ 'eitherway': ['either', 'way'],
186
+ 'andpeaceful': ['and', 'peaceful'],
187
+ 'cloudforms': ['cloud', 'forms'],
188
+ 'peoplejust': ['people', 'just'],
189
+ 'pyscadellic': 'psychedelic',
190
+ 'maybepreparing': ['maybe', 'preparing'],
191
+ 'thisbmakes': ['this', 'makes'],
192
+ 'thispainting': ['this', 'painting'],
193
+ 'combinationmakes': ['combination', 'makes'],
194
+ 'rightside': ['right', 'side'],
195
+ 'saysnothing': ['says', 'nothing'],
196
+ 'individualness': 'individualism',
197
+ 'verynostalgic': ['very', 'nostalgic'],
198
+ 'hyperrealistic': ['hyper', 'realistic'],
199
+ 'wimsicle': 'whimsical',
200
+ 'aweinspiring': ['awe', 'inspiring'],
201
+ 'resturarunt': 'restaurant',
202
+ 'cruxification': 'crucifixion',
203
+ 'mistiruous': 'mysterious',
204
+ 'streetlamp': ['street', 'lamp'],
205
+ 'sadnessand': ['sadness', 'and'],
206
+ 'republicancult': ['republican', 'cult'],
207
+ 'mogilianni': 'modigliani',
208
+ 'raphealite': 'raphaelite',
209
+ 'immeidtaley': 'immediately',
210
+ 'duckface': ['duck', 'face'],
211
+ 'kinglike': ['king', 'like'],
212
+ 'monaleesa': ['mona', 'lisa'],
213
+ 'antispication': 'anticipation',
214
+ 'womendid': ['women', 'did'],
215
+ 'jailcell': ['jail', 'cell'],
216
+ 'thispeicemakes': ['this', 'piece', 'makes'],
217
+ 'pceaful': 'peaceful',
218
+ 'showpeople': ['show', 'people'],
219
+ 'colorsand': ['colors', 'and'],
220
+ 'lovevthe': ['love', 'the'],
221
+ 'mewithoutyou': ['me', 'without', 'you'],
222
+ 'microexpression': ['micro', 'expression'],
223
+ 'doesnnt': ['does', 'not'],
224
+ 'airfilter': ['air', 'filter'],
225
+ 'appostols': 'apostles',
226
+ 'acrossthe': ['across', 'the'],
227
+ 'andaroused': ['and', 'aroused'],
228
+ 'bluecolor': ['blue', 'color'],
229
+ 'broadstrokes': ['broad', 'strokes'],
230
+ 'bullethole': ['bullet', 'hole'],
231
+ 'shadowlike': ['shadow', 'like'],
232
+ 'shepardplaying': ['shepard', 'playing'],
233
+ 'siporportioned': 'disproportionate',
234
+ 'skyremind': ['sky', 'remind'],
235
+ 'theblending': ['the', 'blending'],
236
+ 'thoughtfuland': ['thoughtful', 'and'],
237
+ 'yellowbrowns': ['yellow', 'browns'],
238
+ 'creeeppyyy': 'creepy',
239
+ 'crosslegged': ['cross', 'legged'],
240
+ 'cupshave': ['cups', 'have'],
241
+ 'dissapoinment': 'disappointment',
242
+ 'drinkbest': ['drink', 'best'],
243
+ 'dragonlike': ['dragon', 'like'],
244
+ 'dressform': ['dress', 'form'],
245
+ 'farmlife': ['farm', 'life'],
246
+ 'inbtween': ['in', 'between'],
247
+ 'averageparentproblems': ['average', 'parent', 'problems'],
248
+ 'aroundthe': ['around', 'the'],
249
+ 'anythingabout': ['anything', 'about'],
250
+ 'bootylicous': 'bootylicious',
251
+ 'andwhat': ['and', 'what'],
252
+ 'applestore': ['apple', 'store'],
253
+ 'archioligist': 'archaeologist',
254
+ 'archtypical': 'archetypal',
255
+ 'armorwear': ['armor', 'wear'],
256
+ 'assumingely': 'assumingly',
257
+ 'beachtown': ['beach', 'town'],
258
+ 'beenshot': ['been', 'shot'],
259
+ 'bluemountains': ['blue', 'mountains'],
260
+ 'boldcolors': ['bold', 'colors'],
261
+ 'buddawhateverhisname': ['buddha', 'whatever', 'his', 'name'],
262
+ 'buttcrack': ['butt', 'crack'],
263
+ 'candytown': 'candytown',
264
+ 'colorsare': ['colors', 'are'],
265
+ 'colorscale': ['color', 'scale'],
266
+ 'cominginto': ['coming', 'into'],
267
+ 'commonfolk': ['common', 'folk'],
268
+ 'cottonballs': ['cotton', 'balls'],
269
+ 'excuuuuse': 'excuse',
270
+ 'eyesockets': ['eye', 'sockets'],
271
+ 'facelooking': ['face', 'looking'],
272
+ 'fromthis': ['from', 'this'],
273
+ 'pokerface': ['poker', 'face'],
274
+ 'thefountain': ['the', 'fountain'],
275
+ 'thinkpeople': ['think', 'people'],
276
+ 'uncomfomfortable': 'uncomfortable',
277
+ 'upsidedown': ['upside', 'down'],
278
+ 'vangough': ['van', 'gogh'],
279
+ 'vangogh': ['van', 'gogh'],
280
+ 'yaaaay': 'yay',
281
+ 'uhhhh': 'uhh',
282
+ 'thedark': ['the', 'dark'],
283
+ 'tallships': ['tall', 'ships'],
284
+ 'stilllife': ['still', 'life'],
285
+ 'stillframe': ['still', 'frame'],
286
+ 'mmmmmm': 'mmm',
287
+ 'marvelone': ['marvel', 'one'],
288
+ 'lookhomeless': ['look', 'homeless'],
289
+ 'likealot': ['like', 'a', 'lot'],
290
+ 'interestesting': 'interesting',
291
+ 'intriuiging': 'intriguing',
292
+ 'icecreams': ['ice', 'creams'],
293
+ 'awwwww': 'aww',
294
+ 'slavemaster': ['slave', 'master'],
295
+ 'pictureshould': ['picture', 'should'],
296
+ 'onhisface': ['on', 'his', 'face'],
297
+ 'likethis': ['like', 'this'],
298
+ 'inkwork': ['ink', 'work'],
299
+ 'grapejuice': ['grape', 'juice'],
300
+ 'flowerlike': ['flower', 'like'],
301
+ 'understandthe': ['understand', 'the'],
302
+ 'welldressed': ['well', 'dressed'],
303
+ 'wouldlove': ['would', 'love'],
304
+ 'blendedinto': ['blended', 'into'],
305
+ 'buttcheeks': ['butt', 'cheeks'],
306
+ 'clownlike':['clown', 'like'],
307
+ 'davinchi': ['da', 'vinci'],
308
+ 'veryperfect': ['very', 'perfect'],
309
+ 'supervillian': 'supervillain',
310
+ 'simpleand': ['simple', 'and'],
311
+ 'seemsout': ['seems', 'out'],
312
+ 'rainbowmeeting': ['rainbow', 'meeting'],
313
+ 'strobelights': ['strobe', 'lights'],
314
+ 'subltness': 'subtleness',
315
+ 'throughthe': ['through', 'the'],
316
+ 'paintingfreaks': ['painting', 'freaks'],
317
+ 'muchgoing': ['much', 'going'],
318
+ 'meditterean': 'mediterranean',
319
+ 'instaneous': 'instantaneous',
320
+ 'helpthe': ['help', 'the'],
321
+ 'bizzarly': 'bizarrely',
322
+ 'crimescene': ['crime', 'scene'],
323
+ 'deathlife': ['death', 'life'],
324
+ 'dancefight': ['dance', 'fight'],
325
+ 'blahblahblah': ['blah', 'blah', 'blah'],
326
+ 'disporportioned': 'disproportionate',
327
+ 'dreamstate': ['dream', 'state'],
328
+ 'eithermight': ['either', 'might'],
329
+ 'enviornemt': 'environment',
330
+ 'greenbackground': ['green', 'background'],
331
+ 'greybackground': ['grey', 'background'],
332
+ 'handrwawing': ['hand', 'drawing'],
333
+ 'happycause': ['happy', 'cause'],
334
+ 'thelayout': ['the', 'layout'],
335
+ 'greatgrandparent': ['great', 'grand', 'parent'],
336
+ 'greatgrandparents': ['great', 'grand', 'parents'],
337
+ 'likesomething': ['like', 'something'],
338
+ 'likethey': ['like', 'they'],
339
+ 'makingthe': ['making', 'the'],
340
+ 'mideviltimes': ['medieval', 'times'],
341
+ 'moviestar': ['movie', 'star'],
342
+ 'shroudlike': ['shroud', 'like'],
343
+ 'blackscale': ['black', 'scale'],
344
+ 'bothsides': ['both', 'sides'],
345
+ 'fallevening': ['fall', 'evening'],
346
+ 'breaklight': ['break', 'light'],
347
+ 'springgarden': ['spring', 'garden'],
348
+ 'pointalist': 'pointillism',
349
+ 'hemmeroid': 'hemorrhoid',
350
+ 'bonaroo': 'bonnaroo',
351
+ 'boardshorts': ['board', 'shorts'],
352
+ 'luminousand': ['luminous', 'and'],
353
+ 'iceskating': ['ice', 'skating'],
354
+ 'ewwww' :'ew',
355
+ 'bloodsplatter': ['blood', 'splatter'],
356
+ 'beastlike': ['beast', 'like'],
357
+ 'entendra': 'entendre',
358
+ 'dollbaby': ['doll', 'baby'],
359
+ 'eachothers': ['each', 'others'],
360
+ 'backlooking': ['back', 'looking'],
361
+ 'enjoynthe': ['enjoy', 'the'],
362
+ 'stormcloud': ['storm', 'cloud'],
363
+ 'playwriter': ['play', 'writer'],
364
+ 'hyroglifics': 'hieroglyphics',
365
+ 'lilypads': ['lily', 'pads'],
366
+ 'ivreqlly': ['i', 'really'],
367
+ 'kindnof': ['kind', 'of'],
368
+ 'selfconcious': ['self', 'conscious'],
369
+ 'reprensation': 'representation',
370
+ 'eerieness' : 'eeriness',
371
+ 'paining': 'painting',
372
+ 'thats': ['that', 'is'],
373
+ 'xmas': 'christmas',
374
+ 'swordbearer' : ['sword', 'bearer'],
375
+ 'outcseeing': ['out', 'seeing'],
376
+ 'gatheredaround': ['gathered', 'around'],
377
+ 'lockeroom': ['locker', 'room'],
378
+ 'adrogonius': 'androgynous',
379
+ 'mezmesring': 'mesmerising',
380
+ 'powderoom': ['powder', 'room'],
381
+ 'tenalady': ['tena', 'lady', 'pads'],
382
+ 'storytale': ['story', 'tale'],
383
+ 'dipropratnly': 'disproportionately',
384
+ 'clotheless': 'clothless',
385
+ 'maculopothy': 'maculopathy',
386
+ 'meanmugging': ['mean', 'mugging'],
387
+ 'shadowwork': ['shadow', 'work'],
388
+ 'paintstrokes': ['paint', 'strokes'],
389
+ 'makenit': ['make', 'it'],
390
+ 'ofcolors': ['of', 'colors'],
391
+ 'thevdevilish': ['the', 'devilish'],
392
+ 'lilipads': ['lily', 'pads'],
393
+ 'lilypad': ['lily', 'pad'],
394
+ 'prusinors': 'prisoners',
395
+ 'thebattle': ['the', 'battle'],
396
+ 'bathingsuit': ['bathing', 'suit'],
397
+ 'thencolors': ['the', 'colors'],
398
+ 'morexcitingand': ['more', 'exciting', 'and'],
399
+ 'thebeggining': ['the', 'beginning'],
400
+ 'imageryand': ['imagery', 'and'],
401
+ 'contentness': 'contentedness',
402
+ 'oversimplicity': ['over', 'simplicity'],
403
+ 'overexausted': ['over', 'exhausted'],
404
+ 'uninterst': 'uninterest',
405
+ 'theanfels': ['the', 'angels'],
406
+ 'bittypenis': ['bitty', 'penis'],
407
+ 'intellegiant': 'intelligent',
408
+ 'fauxfur': ['faux', 'fur'],
409
+ 'togetherther': 'together',
410
+ 'murakmi': 'murakami',
411
+ 'diffinterate': 'different',
412
+ 'deatheater': ['death', 'eater'],
413
+ 'grafittied': 'graffitied',
414
+ 'colortheme': ['color', 'theme'],
415
+ 'herevening': ['her', 'evening'],
416
+ 'comradarie': 'camaraderie',
417
+ 'gradeintly': 'gradiently',
418
+ 'womenreally': ['woman', 'really'],
419
+ 'renduveousing': 'rendezvousing',
420
+ 'unsettleness': 'unsettledness',
421
+ 'desolutioned': 'disillusioned',
422
+ 'bucketlist': ['bucket', 'list'],
423
+ 'contrastful': 'contrasting',
424
+ 'snailshell': ['snail', 'shell'],
425
+ 'figureswithin': ['figures', 'within'],
426
+ 'semitrical': 'symmetrical',
427
+ 'happinessand': ['happiness', 'and'],
428
+ 'firepit':['fire', 'pit'],
429
+ 'firepits':['fire', 'pits'],
430
+ 'spectrumand': ['spectrum', 'and'],
431
+ 'skyblue': ['sky', 'blue'],
432
+ 'duststorm': ['dust', 'storm'],
433
+ 'ultrawide': ['ultra', 'wide'],
434
+ 'containmatated': 'contaminated',
435
+ 'dressesbis': ['dresses', 'is'],
436
+ 'underdetailed': ['under', 'detailed'],
437
+ 'pitchblack': ['pitch', 'black'],
438
+ 'andvserious': ['and', 'serious'],
439
+ 'peaceand': ['peace', 'and'],
440
+ 'drawingnif': 'drawing',
441
+ 'patternsmake': ['patterns', 'make'],
442
+ 'andvwilling': ['and', 'willing'],
443
+ 'thecdeeamy': ['the', 'dreamy'],
444
+ 'puntilism': 'pointillism',
445
+ 'thecangel': ['the', 'angel'],
446
+ 'awestriking': ['awe', 'striking'],
447
+ 'awestrucking': ['awe', 'striking'],
448
+ 'awestrikng': ['awe', 'striking'],
449
+ 'ofvthe': ['of', 'the'],
450
+ 'desaturatation': 'desaturation',
451
+ 'colrscare': ['colors', 'are'],
452
+ 'looksmessy': ['looks', 'messy'],
453
+ 'thecfeelingvthis': ['the', 'feeling', 'for', 'this'],
454
+ 'manyngood': ['many', 'and', 'good'],
455
+ 'mandsface': ['man\'s', 'face'],
456
+ 'essencence': 'essence',
457
+ 'confuseable': 'confusing',
458
+ 'frizzyness': 'frizziness',
459
+ 'waterbuffalo': ['water', 'buffalo'],
460
+ 'cinaplex' :'cineplex',
461
+ 'clocktowers': ['clock', 'towers'],
462
+ 'aysterutym': 'austerity',
463
+ 'conthemporan': 'contemporary',
464
+ 'coldsore': ['cold', 'sore'],
465
+ 'redflas': ['red', 'flash'],
466
+ 'pompnceremony': ['pomp', 'and', 'ceremony'],
467
+ 'etchisketch': ['etch', 'a', 'sketch'],
468
+ 'durdledoor': ['durdle', 'door'],
469
+ 'eyessquinted': ['eyes', 'squinted'],
470
+ 'colorfullness': 'colorfulness',
471
+ 'christchild': ['christ', 'child'],
472
+ 'wispyness': 'wispiness',
473
+ 'whispiness': 'wispiness',
474
+ 'imaturebut': ['immature', 'but'],
475
+ 'raphealites': 'raphaelites',
476
+ 'late1700': ['late', '1700'],
477
+ 'remnicient': 'reminiscent',
478
+ 'twonsubjecta': ['two', 'subjects'],
479
+ 'awestricken': 'awestruck',
480
+ 'withnumerous': ['with', 'numerous'],
481
+ 'colorsmake': ['colors', 'make'],
482
+ 'vmcolors': ['colors'],
483
+ 'roseyness': 'rosiness',
484
+ 'holdingthe': ['holding', 'the'],
485
+ 'gruesomness': 'gruesomeness',
486
+ 'linedrawing': ['line', 'drawing'],
487
+ 'orangatange': 'orangutan',
488
+ 'naaahhhh': 'nah',
489
+ 'micropattern': ['micro', 'pattern'],
490
+ 'nephilims': 'nephilim',
491
+ 'middleaged': ['middle', 'aged'],
492
+ 'thevnanyvdifferent': ['the', 'many', 'different'],
493
+ 'flirtatously': 'flirtatiously',
494
+ 'nitemare': 'nightmare',
495
+ 'okaaaay': 'ok',
496
+ 'crucufication': 'crucifixion',
497
+ 'manywindow': ['many', 'windows'],
498
+ 'panaroema': 'panorama',
499
+ 'wowwwwwww': 'wow',
500
+ 'theaqua': ['the', 'aqua'],
501
+ 'andexcited': ['and', 'excited'],
502
+ 'frommthe': ['from', 'the'],
503
+ 'thecanal': ['the', 'canal'],
504
+ 'focalpointcof': ['focal', 'point', 'of'],
505
+ 'silouhete': 'silhouette',
506
+ 'physcadelic': 'psychedelic',
507
+ 'tonesmakes': ['tones', 'make'],
508
+ 'reallyenjoying': ['really', 'enjoying'],
509
+ 'disportionate': 'disproportionate',
510
+ 'spidermonkey': ['spider', 'monkey'],
511
+ 'lookswise': ['looks', 'wise'],
512
+ 'wasewas': 'was',
513
+ 'inbthe': ['in', 'the'],
514
+ 'coronvirus': 'coronavirus',
515
+ 'overdramtic': 'overdramatic',
516
+ 'favarouite': 'favorite',
517
+ 'reallyike': ['really', 'like'],
518
+ 'thesoldier': ['the', 'soldier'],
519
+ 'flowerboxes': ['flower', 'boxes'],
520
+ 'envoirment': 'environment',
521
+ 'theirfaces': ['their', 'faces'],
522
+ 'neccasssary': 'necessary',
523
+ 'ghostlyness': 'ghostliness',
524
+ 'trytophobia': 'trypophobia',
525
+ 'tripophobia': 'trypophobia',
526
+ 'woodprinting': ['wood', 'printing'],
527
+ 'roomoom': 'room',
528
+ 'outmyself': ['out', 'myself'],
529
+ 'evildoing': ['evil', 'doing'],
530
+ 'deliousious': 'delicious',
531
+ 'thebfigure': ['the', 'figure'],
532
+ 'sleeptime': ['sleep', 'time'],
533
+ 'isnspooky': ['is', 'spooky'],
534
+ 'comtempory': 'contemporary',
535
+ 'smilingred': ['smiling', 'red'],
536
+ 'ooranateness': 'ornateness',
537
+ 'joyfilled': ['joy', 'filled'],
538
+ 'ghosttown': ['ghost', 'town'],
539
+ 'obvious–that': ['obvious', 'that'],
540
+ 'photobooth': ['photo', 'booth'],
541
+ 'thinknof': ['think', 'of'],
542
+ 'extrodianary': 'extraordinary',
543
+ 'thewindow': ['the', 'window'],
544
+ "'indistinctive": 'indistinctive',
545
+ 'vilianouis': 'villainous',
546
+ 'farmtown': ['farm', 'town'],
547
+ 'handdrawing': ['hand', 'drawing'],
548
+ 'sophisticatedcriminal': ['sophisticated', 'criminal'],
549
+ 'beautifuldepiction': ['beautiful', 'depiction'],
550
+ 'plantetscolliding': ['planets', 'colliding'],
551
+ 'greytones': ['grey', 'tones'],
552
+ 'likepaint': ['like', 'paint'],
553
+ 'leatherworker': ['leather', 'worker'],
554
+ 'cobrownand': ['brown', 'and'],
555
+ 'bluegreens': ['blue', 'greens'],
556
+ 'polkadots': ['polka', 'dots'],
557
+ 'attirewear': ['attire', 'wear'],
558
+ 'disssary': 'disarray',
559
+ 'simplictiness': 'simplicity',
560
+ 'likelord': ['like', 'a', 'lord'],
561
+ 'orbtalking': ['or', 'talking'],
562
+ 'colorscheme': ['color', 'scheme'],
563
+ 'grouchypants': ['grouchy', 'pants'],
564
+ 'renosanse': 'renaissance',
565
+ 'renessciance': 'renaissance',
566
+ 'impaitily': 'impatiently',
567
+ 'hyrogliphic': 'hieroglyphic',
568
+ 'enduresess': 'endures',
569
+ 'orangecand': ['orange', 'and'],
570
+ 'emiotnals': 'emotion',
571
+ 'countryclub': ['country', 'club'],
572
+ 'branchhill': ['branch', 'hill'],
573
+ 'homospiens': ['homo', 'sapiens'],
574
+ 'beautifuland': ['beautiful', 'and'],
575
+ 'birchtree': ['birch', 'tree'],
576
+ 'seemslike': ['seems', 'like'],
577
+ 'beuaktufl': 'beautiful',
578
+ 'appearlike': ['appear', 'like'],
579
+ 'browngrounds': ['brown', 'grounds'],
580
+ 'morningtime': ['morning', 'time'],
581
+ 'jerrsaic': 'jurassic',
582
+ 'feelabout': ['feel','about'],
583
+ 'linestrokes': ['line', 'strokes'],
584
+ 'lifesized': ['life', 'sized'],
585
+ 'thevlower': ['the', 'lower'],
586
+ 'paitngig': 'painting',
587
+ 'handdrawn': ['hand', 'drawn'],
588
+ 'facefrom': ['face', 'from'],
589
+ 'treesmake': ['trees', 'make'],
590
+ 'chesspiece': ['chess', 'piece'],
591
+ 'balletdancer': ['ballet', 'dancer'],
592
+ 'motionblurr': ['motion', 'blur'],
593
+ 'varietyframes': ['variety', 'frames'],
594
+ 'nondetailed': ['non', 'detailed'],
595
+ 'shadowsplus': ['shadows', 'plus'],
596
+ 'bellpeppers': ['bell', 'peppers'],
597
+ 'thebackground': ['the', 'background'],
598
+ 'playwith': ['play', 'with'],
599
+ 'facialmexpression': ['facial', 'expression'],
600
+ 'compositionntells': ['composition', 'tells'],
601
+ 'playfulexpression': ['playful', 'expression'],
602
+ 'somethingforeboding': ['something', 'foreboding'],
603
+ 'everythingnbeing': ['everything', 'being'],
604
+ 'beingbsomseperated': ['being', 'separated'],
605
+ 'nececassry': 'necessary',
606
+ 'oppurnity': 'opportunity',
607
+ 'undistinguishable': 'indistinguishable'
608
+ }
609
+
610
+
611
+ ##
612
+ ## Set, containing words found in ArtEmis but not in Glove. (for the curious reader...)
613
+ ## Curated manually by Panos circa 2020.
614
+ ##
615
+ missing_from_glove_but_are_actual_words = {
616
+ 'agfacolor', 'photobomb', 'modernness', 'altamouras',
617
+ 'invitingness', 'kinkadian', 'unfinishedness',
618
+ 'gainsboro', 'normalness', 'harmoniousness', 'tenebrism',
619
+ 'neckpiece', 'immenseness', 'distastefulness', 'delicateness',
620
+ 'disjointedness', 'midground', 'pulchritudinously', 'maculopathy',
621
+ 'ornateness', 'alienesque', 'bemedaled', 'mundaneness', 'ghoulishness',
622
+ 'hecticness', 'comfortability', 'deathscape', 'snowpiercer', 'acuarela',
623
+ 'pedophillic', 'monochromatically', 'futuristically', 'remnicient',
624
+ 'sereneness', 'tenebrism', 'midground', 'delicateness', 'ornateness',
625
+ 'neckpiece', 'pompousness', 'comfortability', 'contentful', 'disjointedness',
626
+ 'delicateness', 'suitcoat', 'slenderman', 'wispiness', 'realisticness',
627
+ 'splotchiness', 'chubbiness', 'respectfulness', 'chemtrail', 'ominousness',
628
+ 'douchebag', 'naturescape', 'indistinctive', 'hellscape', 'blobbiness',
629
+ 'mountainscape', 'exoticness', 'colorscape', 'overdramatic', 'snowscape',
630
+ 'oceanscape', 'stunningness', 'hyperreligiosity', 'trypophobia', 'treescape',
631
+ 'prayerfulness', 'slothlike', 'tablescape', 'indistinctive', 'imaginativeness',
632
+ 'sincereness', 'rejoicement', 'loyalness', 'hypersexualization', 'solemnness',
633
+ 'boringness', 'hypersexualizing', 'centermost'
634
+ }
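These dictionaries are consumed by the pre-processing utilities; as a sketch, this is how token_spelling_dictionary interacts with manual_tokenized_sentence_spelling (the import path assumes the package is importable as `artemis`):

    from artemis.language.language_preprocessing import manual_tokenized_sentence_spelling

    tokens = ['im', 'in', 'awe', 'of', 'the', 'colour']
    print(manual_tokenized_sentence_spelling(tokens, token_spelling_dictionary))
    # ['i', 'am', 'in', 'awe', 'of', 'the', 'color']  -- list-valued entries are expanded in place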
imageprocessing/artemis/artemis/neural_models/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
4
+ """
imageprocessing/artemis/artemis/neural_models/attention.py ADDED
@@ -0,0 +1,45 @@
1
+ """
2
+ Language-Vision Attention Utilities.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+
10
+ from torch import nn
11
+
12
+
13
+ class AdditiveVisioLinguistic(nn.Module):
14
+ """
15
+ Given a vector summarizing the linguistic information processed by a pipeline
16
+ (e.g. k-th output of RNN) attend to a 2D grid (e.g., image pixels).
17
+ This mechanism *adds* the two sources of information to compute the attention (hence the name additive).
18
+ """
19
+
20
+ def __init__(self, encoder_dim, decoder_dim, attention_dim):
21
+ """
22
+ :param encoder_dim: (int) feature size (last dimension) of encoded images (e.g., [B x H x W] x encoder_dim)
23
+ :param decoder_dim: (int) feature size of decoder's output (summarizing linguistic information)
24
+ :param attention_dim: (int) feature size of the attention space
25
+ """
26
+ super(AdditiveVisioLinguistic, self).__init__()
27
+ self.encoder_att = nn.Linear(encoder_dim, attention_dim) # linear layer to transform encoded image
28
+ self.decoder_att = nn.Linear(decoder_dim, attention_dim) # linear layer to transform decoder's output
29
+ self.full_att = nn.Linear(attention_dim, 1) # linear layer to calculate values to be softmax-ed
30
+ self.relu = nn.ReLU(inplace=True)
31
+ self.softmax = nn.Softmax(dim=1) # softmax layer to calculate weights
32
+
33
+ def __call__(self, encoder_out, decoder_hidden):
34
+ """
35
+ Forward propagation.
36
+ :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
37
+ :param decoder_hidden: previous decoder output, a tensor of dimension (batch_size, decoder_dim)
38
+ :return: attention weighted encoding, weights
39
+ """
40
+ att1 = self.encoder_att(encoder_out) # (batch_size, num_pixels, attention_dim)
41
+ att2 = self.decoder_att(decoder_hidden) # (batch_size, attention_dim)
42
+ att = self.full_att(self.relu(att1 + att2.unsqueeze(1))).squeeze(2) # (batch_size, num_pixels)
43
+ alpha = self.softmax(att) # (batch_size, num_pixels)
44
+ attention_weighted_encoding = (encoder_out * alpha.unsqueeze(2)).sum(dim=1) # (batch_size, encoder_dim)
45
+ return attention_weighted_encoding, alpha
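A small shape sanity-check for the module above (the dimensions are arbitrary and chosen only for illustration):

    import torch

    B, P, E, D, A = 4, 49, 512, 256, 128   # batch, pixels, encoder, decoder, attention dims
    att = AdditiveVisioLinguistic(encoder_dim=E, decoder_dim=D, attention_dim=A)

    encoder_out = torch.rand(B, P, E)       # e.g., a 7x7 ResNet grid flattened to 49 "pixels"
    decoder_hidden = torch.rand(B, D)
    context, alpha = att(encoder_out, decoder_hidden)
    print(context.shape, alpha.shape)       # torch.Size([4, 512]) torch.Size([4, 49])
    assert torch.allclose(alpha.sum(dim=1), torch.ones(B))  # attention weights sum to 1 per image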
imageprocessing/artemis/artemis/neural_models/attentive_decoder.py ADDED
@@ -0,0 +1,696 @@
1
+ """
2
+ Decoding module for a neural speaker (with attention capabilities).
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 06/15/19, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import random
11
+ import time
12
+ import warnings
13
+ import tqdm
14
+ import math
15
+ import numpy as np
16
+ import torch.nn.functional as F
17
+ from torch import nn
18
+ from torch.nn.utils.rnn import pack_padded_sequence
19
+ from torch.nn.utils import clip_grad_norm_
20
+
21
+ from .attention import AdditiveVisioLinguistic
22
+ from ..utils.stats import AverageMeter
23
+
24
+
25
+ class AttentiveDecoder(nn.Module):
26
+ """
27
+ Note: code adapted from: https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning
28
+ implementing a solid version of Show, Attend, and Tell. Many thanks to Sagar and the team.
29
+
30
+ Special (optional) features:
31
+ - use stochastic teacher forcing
32
+ - add auxiliary input data at each decoding step (besides each 'previous' token).
33
+ - tie the weights of the encoder/decoder weight matrices
34
+ """
35
+ def __init__(self, word_embedding, rnn_hidden_dim, encoder_dim, attention_dim,
36
+ vocab, dropout_rate=0, tie_weights=False, teacher_forcing_ratio=1,
37
+ auxiliary_net=None, auxiliary_dim=0):
38
+ """
39
+ :param word_embedding: nn.Embedding
40
+ :param rnn_hidden_dim: hidden (and thus output) dimension of the decoding rnn
41
+ :param encoder_dim: feature dimension of encoded stimulus
42
+ :param attention_dim: feature dimension over which attention is computed
43
+ :param vocab: artemis.utils.vocabulary instance
44
+ :param dropout_rate: dropout rate
45
+ :param tie_weights: (opt, boolean) if True, the hidden-to-word weights are equal (tied) to the word-embeddings,
46
+ see https://arxiv.org/abs/1611.01462 for explanation of why this might be a good idea.
47
+ :param teacher_forcing_ratio:
48
+ :param auxiliary_net: (optional) nn.Module that will be feeding the decoder at each time step
49
+ with some "auxiliary" information (say an emotion label). Obviously, this information is separate than the
50
+ output of the typically used image-encoder.
51
+ :param auxiliary_dim: (int, optional) the output feature-dimension of the auxiliary net.
52
+ """
53
+ super(AttentiveDecoder, self).__init__()
54
+ self.vocab = vocab
55
+ self.vocab_size = len(vocab)
56
+ self.word_embedding = word_embedding
57
+ self.auxiliary_net = auxiliary_net
58
+ self.uses_aux_data = False
59
+
60
+ if auxiliary_dim > 0:
61
+ self.uses_aux_data = True
62
+
63
+ self.decode_step = nn.LSTMCell(word_embedding.embedding_dim + encoder_dim + auxiliary_dim, rnn_hidden_dim)
64
+ self.attention = AdditiveVisioLinguistic(encoder_dim, rnn_hidden_dim, attention_dim)
65
+
66
+ if dropout_rate > 0:
67
+ self.dropout = nn.Dropout(p=dropout_rate, inplace=True)
68
+ else:
69
+ self.dropout = nn.Identity()
70
+
71
+ self.init_h = nn.Linear(encoder_dim, rnn_hidden_dim) # linear layer to find initial hidden state of LSTMCell
72
+ self.init_c = nn.Linear(encoder_dim, rnn_hidden_dim) # linear layer to find initial cell state of LSTMCell
73
+ self.f_beta = nn.Linear(rnn_hidden_dim, encoder_dim) # linear layer to create a sigmoid-activated gate
74
+ self.sigmoid = nn.Sigmoid()
75
+ self.next_word = nn.Linear(rnn_hidden_dim, self.vocab_size) # linear layer to find scores over vocabulary
76
+ self.init_weights()
77
+ self.teacher_forcing_ratio = teacher_forcing_ratio
78
+
79
+ if tie_weights:
80
+ if self.word_embedding.embedding_dim != rnn_hidden_dim:
81
+ raise ValueError('when tying weights, the word-embedding dimension must equal rnn_hidden_dim')
82
+ print('tying weights of encoder/decoder')
83
+ self.next_word.weight = self.word_embedding.weight
84
+
85
+ def init_hidden_state(self, encoder_out):
86
+ """
87
+ Creates the initial hidden and cell states for the decoder's LSTM based on the encoded images.
88
+ :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
89
+ :return: hidden state, cell state
90
+ """
91
+ mean_encoder_out = encoder_out.mean(dim=1)
92
+ h = self.init_h(mean_encoder_out) # (batch_size, decoder_dim)
93
+ c = self.init_c(mean_encoder_out)
94
+ return h, c
95
+
96
+ def init_weights(self, init_range=0.1):
97
+ """ Better initialization """
98
+ self.word_embedding.weight.data.uniform_(-init_range, init_range) # remove if pre-trained model comes up
99
+ self.next_word.bias.data.zero_()
100
+ self.next_word.weight.data.uniform_(-init_range, init_range)
101
+
102
+ def __call__(self, encoder_out, captions, auxiliary_data=None):
103
+ """ Forward propagation.
104
+ :param encoder_out: encoded images, a tensor of dimension (batch_size, enc_image_size, enc_image_size, encoder_dim)
105
+ :param captions: encoded captions, a tensor of dimension (batch_size, max_caption_length)
106
+ :param auxiliary_data: extra information associated with the images (batch_size, some_dim)
107
+ :return: scores for vocabulary, sorted encoded captions, decode lengths, weights, sort indices
108
+ """
109
+ return self.sort_captions_and_forward(encoder_out, captions, auxiliary_data=auxiliary_data)
110
+
111
+ def sort_captions_and_forward(self, encoder_out, captions, auxiliary_data=None):
112
+ """ Feed forward that ...
113
+ :param encoder_out:
114
+ :param captions:
115
+ :return:
116
+ """
117
+ batch_size = encoder_out.size(0)
118
+ encoder_dim = encoder_out.size(-1)
119
+
120
+ # Flatten image
121
+ encoder_out = encoder_out.view(batch_size, -1, encoder_dim) # (batch_size, num_pixels, encoder_dim)
122
+ num_pixels = encoder_out.size(1)
123
+
124
+ decode_lengths = torch.where(captions == self.vocab.eos)[1] # "<sos> I am <eos>" => decode_length = 3
125
+ # we do not feed <eos> as input to generate
126
+ # something after it
127
+
128
+ # Sort input data by decreasing lengths to reduce compute below
129
+ decode_lengths, sort_ind = decode_lengths.sort(dim=0, descending=True)
130
+ encoder_out = encoder_out[sort_ind]
131
+ captions = captions[sort_ind]
132
+
133
+ if auxiliary_data is not None:
134
+ auxiliary_data = auxiliary_data[sort_ind]
135
+ auxiliary_data = self.auxiliary_net(auxiliary_data)
136
+
137
+ # prepare for unravelling
138
+ embeddings = self.word_embedding(captions) # (batch_size, max_caption_length, embed_dim)
139
+ h, c = self.init_hidden_state(encoder_out) # (batch_size, decoder_dim)
140
+ decode_lengths = decode_lengths.tolist()
141
+ device = embeddings.device
142
+
143
+ # Create tensors to hold word prediction logits and attention maps (alphas)
144
+ predictions = torch.zeros(batch_size, max(decode_lengths), self.vocab_size).to(device)
145
+ alphas = torch.zeros(batch_size, max(decode_lengths), num_pixels).to(device)
146
+
147
+ # At each time-step, decode by
148
+ # attention-weighing the encoder's output based on the decoder's previous hidden state output
149
+ # then generate a new word in the decoder with the previous word and the attention weighted encoding
150
+ for t in range(max(decode_lengths)):
151
+ batch_size_t = sum([l > t for l in decode_lengths])
152
+ h = h[:batch_size_t] # effective h
153
+ attention_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_t], h)
154
+ gate = self.sigmoid(self.f_beta(h)) # gating scalar, (batch_size_t, encoder_dim)
155
+ attention_weighted_encoding = gate * attention_weighted_encoding
156
+
157
+ use_teacher_forcing = random.random() < self.teacher_forcing_ratio  # stochastic teacher forcing
158
+
159
+ if use_teacher_forcing or t == 0:
160
+ decoder_lang_input = embeddings[:batch_size_t, t]
161
+ else:
162
+ _, top_pred = preds[:batch_size_t].topk(1)
163
+ top_pred = top_pred.squeeze(-1).detach() # detach from history as input
164
+ decoder_lang_input = self.word_embedding(top_pred)
165
+
166
+ if auxiliary_data is not None:
167
+ auxiliary_data_t = auxiliary_data[:batch_size_t]
168
+ decoder_in = torch.cat([decoder_lang_input, attention_weighted_encoding, auxiliary_data_t], dim=1)
169
+ else:
170
+ decoder_in = torch.cat([decoder_lang_input, attention_weighted_encoding], dim=1)
171
+
172
+ h, c = self.decode_step(decoder_in, (h, c[:batch_size_t])) # (batch_size_t, decoder_dim)
173
+
174
+ preds = self.next_word(self.dropout(h)) # (batch_size_t, vocab_size)
175
+ predictions[:batch_size_t, t] = preds
176
+ alphas[:batch_size_t, t] = alpha
177
+ return predictions, captions, decode_lengths, alphas, sort_ind
178
+
179
+ def attend_and_predict_next_word(self, encoder_out, h, c, tokens, aux_data=None):
180
+ """Given current hidden/memory state of the decoder and the input tokens, guess the next tokens
181
+ and update the hidden/memory states.
182
+ :param encoder_out: the grounding
183
+ :param h: current hidden state
184
+ :param c: current memory state
185
+ :param tokens: current token input to the decoder
186
+ :return: logits over vocabulary distribution, updated h/c
187
+ """
188
+ attention_weighted_encoding, alpha = self.attention(encoder_out, h)
189
+ gate = self.sigmoid(self.f_beta(h)) # gating scalar, (batch_size_t, encoder_dim)
190
+ attention_weighted_encoding = gate * attention_weighted_encoding
191
+ embeddings = self.word_embedding(tokens) # (batch_size, embed_dim)
192
+
193
+ decoder_input = torch.cat([embeddings, attention_weighted_encoding], dim=1)
194
+
195
+ if aux_data is not None:
196
+ aux_feat = self.auxiliary_net(aux_data)
197
+ decoder_input = torch.cat([decoder_input, aux_feat], dim=1)
198
+
199
+ h, c = self.decode_step(decoder_input, (h, c)) # (batch_size_t, decoder_dim)
200
+ logits = self.next_word(h) # (batch_size_t, vocab_size)
201
+ return h, c, logits, alpha
202
+
203
+
204
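For intuition, the step-wise attend_and_predict_next_word API above can drive a very simple greedy sampler. The sketch below is illustrative only: it assumes the vocabulary exposes a `sos` index (mirroring the `eos` used in the forward pass), ignores emotion grounding, and does not stop early at `<eos>`; it is not the repository's actual sampling code.

    @torch.no_grad()
    def greedy_decode_sketch(decoder, encoder_out, max_len=30):
        """Greedily pick the arg-max word at every step, starting from <sos> (illustrative helper)."""
        b = encoder_out.size(0)
        encoder_out = encoder_out.view(b, -1, encoder_out.size(-1))  # flatten spatial grid: (B, P, E)
        h, c = decoder.init_hidden_state(encoder_out)
        tokens = torch.full((b,), decoder.vocab.sos, dtype=torch.long, device=encoder_out.device)
        out = []
        for _ in range(max_len):
            h, c, logits, _ = decoder.attend_and_predict_next_word(encoder_out, h, c, tokens)
            tokens = logits.argmax(dim=-1)  # most likely next word per image
            out.append(tokens)
        return torch.stack(out, dim=1)  # (B, max_len) token ids (may run past <eos>)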
+ def single_epoch_train(train_loader, model, criterion, optimizer, epoch, device, tb_writer=None, **kwargs):
205
+ """ Perform training for one epoch.
206
+ :param train_loader: DataLoader for training data
207
+ :param model: nn.ModuleDict with 'encoder', 'decoder' keys
208
+ :param criterion: loss layer
209
+ :param optimizer: optimizer
210
+ :param epoch: epoch number
211
+ :param device:
212
+ """
213
+ alpha_c = kwargs.get('alpha_c', 1.0) # Weight of doubly stochastic (attention) regularization.
214
+ grad_clip = kwargs.get('grad_clip', 5.0) # Gradient clipping (norm magnitude)
215
+ print_freq = kwargs.get('print_freq', 100)
216
+ use_emotion = kwargs.get('use_emotion', False)
217
+
218
+ batch_time = AverageMeter() # forward prop. + back prop. time
219
+ data_time = AverageMeter() # data loading time
220
+ entropy_loss_meter = AverageMeter() # entropy loss (per word decoded)
221
+ total_loss_meter = AverageMeter()
222
+ start = time.time()
223
+ steps_taken = (epoch-1) * len(train_loader.dataset)
224
+ model.train()
225
+
226
+ for i, batch in enumerate(train_loader):
227
+ imgs = batch['image'].to(device)
228
+ caps = batch['tokens'].to(device)
229
+ b_size = len(imgs)
230
+ data_time.update(time.time() - start)
231
+
232
+ if use_emotion:
233
+ emotion = batch['emotion'].to(device)
234
+ res = model.decoder(model.encoder(imgs), caps, emotion)
235
+ else:
236
+ res = model.decoder(model.encoder(imgs), caps)
237
+ logits, caps_sorted, decode_lengths, alphas, sort_ind = res
238
+
239
+ # Since we decoded starting with <sos>, the targets are all words after <sos>, up to <eos>
240
+ targets = caps_sorted[:, 1:]
241
+
242
+ # Remove time-steps that we didn't decode at, or are pads
243
+ # pack_padded_sequence is an easy trick to do this
244
+ logits = pack_padded_sequence(logits, decode_lengths, batch_first=True)
245
+ targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)
246
+
247
+ # Calculate loss
248
+ ent_loss = criterion(logits.data, targets.data)
249
+ total_loss = ent_loss
250
+
251
+ # Add doubly stochastic attention regularization
252
+ # Note: some implementations simply compute this as: d_atn_loss = alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
253
+ # here we account for the fact that some samples in the same batch have more/fewer tokens than others.
254
+ if alpha_c > 0:
255
+ total_energy = torch.from_numpy(np.array(decode_lengths)) / alphas.shape[-1] # n_tokens / num_pixels
256
+ total_energy.unsqueeze_(-1) # B x 1
257
+ total_energy = total_energy.to(device)
258
+ d_atn_loss = alpha_c * ((total_energy - alphas.sum(dim=1)) ** 2).mean()
259
+ total_loss += d_atn_loss
260
+
261
+ # Back prop.
262
+ optimizer.zero_grad()
263
+ total_loss.backward()
264
+ if grad_clip is not None:
265
+ clip_grad_norm_(model.parameters(), grad_clip)
266
+
267
+ # Update weights
268
+ optimizer.step()
269
+
270
+ # Keep track of metrics
271
+ entropy_loss_meter.update(ent_loss.item(), sum(decode_lengths))
272
+ total_loss_meter.update(total_loss.item(), sum(decode_lengths))
273
+ batch_time.update(time.time() - start)
274
+ start = time.time()
275
+ steps_taken += b_size
276
+
277
+ # Print status
278
+ if print_freq is not None and i % print_freq == 0:
279
+ print('Epoch: [{0}][{1}/{2}]\t'
280
+ 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
281
+ 'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
282
+ 'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(epoch, i, len(train_loader),
283
+ batch_time=batch_time,
284
+ data_time=data_time,
285
+ loss=total_loss_meter))
286
+ if tb_writer is not None:
287
+ tb_writer.add_scalar('training-entropy-loss-with-batch-granularity', entropy_loss_meter.avg, steps_taken)
288
+
289
+ return total_loss_meter.avg
290
+
291
+
292
+ @torch.no_grad()
293
+ def negative_log_likelihood(model, data_loader, device):
294
+ """
295
+ :param model:
296
+ :param data_loader:
297
+ :param device:
298
299
+ :return: the average per-token negative log-likelihood (cross-entropy) over the dataset
300
+ """
301
+ model.eval()
302
+ nll = AverageMeter()
303
+
304
+ aux_data = None
305
+ for batch in data_loader:
306
+ imgs = batch['image'].to(device)
307
+ caps = batch['tokens'].to(device)
308
+
309
+ # TODO Refactor
310
+ if model.decoder.uses_aux_data:
311
+ aux_data = batch['emotion'].to(device)
312
+
313
+ logits, caps_sorted, decode_lengths, alphas, sort_ind = model.decoder(model.encoder(imgs), caps, aux_data)
314
+
315
+ # Since we decoded starting with <sos>, the targets are all words after <sos>, up to <eos>
316
+ targets = caps_sorted[:, 1:]
317
+
318
+ # Remove time-steps that we didn't decode at, or are pads
319
+ # pack_padded_sequence is an easy trick to do this
320
+ logits = pack_padded_sequence(logits, decode_lengths, batch_first=True)
321
+ targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)
322
+
323
+ # Calculate loss
324
+ loss = F.cross_entropy(logits.data, targets.data)
325
+ nll.update(loss.item(), sum(decode_lengths))
326
+ return nll.avg
327
+
328
+
329
+ @torch.no_grad()
330
+ def log_prob_of_caption(model, img, tokens, temperature=1):
331
+ """Given a captioning model, return the log-probability of a caption given an image.
332
+ This version expects a batch of images, each associated with a single caption.
333
+ :param model: encoder/decoder speaker
334
+ :param img: Tensor B x channels x spatial-dims
335
+ :param tokens: Tensor B x max-n-tokens
336
+ :return log_probs: Tensor of size B x max-n-tokens holding the log-probs of each token of each caption
337
+ """
338
+
339
+ encoder = model.encoder
340
+ decoder = model.decoder
341
+
342
+ assert all(tokens[:, 0] == decoder.vocab.sos)
343
+
344
+ max_steps = tokens.shape[1]
345
+ encoder_out = encoder(img)
346
+ batch_size = encoder_out.size(0)
347
+ encoder_dim = encoder_out.size(-1)
348
+ encoder_out = encoder_out.view(batch_size, -1, encoder_dim)
349
+
350
+ # Create tensors to hold log-probs
351
+ log_probs = torch.zeros(batch_size, max_steps).to(tokens.device)
352
+ h, c = decoder.init_hidden_state(encoder_out)
353
+
354
+ for t in range(max_steps - 1):
355
+ h, c, pred_t, _ = decoder.attend_and_predict_next_word(encoder_out, h, c, tokens[:, t])
356
+
357
+ if temperature != 1:
358
+ pred_t /= temperature
359
+
360
+ pred_t = F.log_softmax(pred_t, dim=1)
361
+ log_probs[:, t] = pred_t[torch.arange(batch_size), tokens[:, t+1]] # prob. of guessing next token
362
+
363
+ lens = torch.where(tokens == decoder.vocab.eos)[1] # true tokens + 1 for <eos>
364
+ mask = torch.zeros_like(log_probs)
365
+ mask[torch.arange(mask.shape[0]), lens] = 1
366
+ mask = mask.cumsum(dim=1).to(torch.bool)
367
+ log_probs.masked_fill_(mask, 0) # set to zero all positions after the true size of the caption
368
+ return log_probs, lens
369
+
370
+
371
+ @torch.no_grad()
372
+ def sample_captions(model, loader, max_utterance_len, sampling_rule, device, temperature=1,
373
+ topk=None, drop_unk=True, drop_bigrams=False):
374
+ """
375
+ :param model:
376
+ :param loader:
377
+ :param max_utterance_len: maximum allowed length of captions
378
+ :param sampling_rule: (str) 'argmax' or 'multinomial', or 'topk'
379
+ :return:
380
+ attention_weights: (torch cpu Tensor) N-images x encoded_image_size (e.g., 7 x 7) x max_utterance_len
381
+ attention_weights[:,0] corresponds to the attention map over the <SOS> symbol
382
+ """
383
+ if sampling_rule not in ['argmax', 'multinomial', 'topk']:
384
+ raise ValueError('Unknown sampling rule.')
385
+
386
+ model.eval()
387
+ all_predictions = []
388
+ attention_weights = []
389
+ unk = model.decoder.vocab.unk
390
+
391
+ use_aux_data = model.decoder.uses_aux_data
392
+ aux_data = None
393
+
394
+ for batch in loader:
395
+ imgs = batch['image'].to(device)
396
+
397
+ if use_aux_data:
398
+ aux_data = batch['emotion'].to(device)
399
+
400
+ encoder_out = model.encoder(imgs)
401
+ enc_image_size = encoder_out.size(1)
402
+ batch_size = encoder_out.size(0)
403
+ encoder_dim = encoder_out.size(-1)
404
+
405
+ # Flatten image
406
+ encoder_out = encoder_out.view(batch_size, -1, encoder_dim) # (batch_size, num_pixels, encoder_dim)
407
+
408
+ # Create tensors to hold word predictions
409
+ max_steps = max_utterance_len + 1 # one extra step for EOS marker
410
+ predictions = torch.zeros(batch_size, max_steps).to(device)
411
+
412
+ # Initialize decoder state
413
+ decoder = model.decoder
414
+ h, c = decoder.init_hidden_state(encoder_out) # (batch_size, decoder_dim)
415
+
416
+ # Tensor to store previous words at each step; now they're just <sos>
417
+ prev_words = torch.LongTensor([decoder.vocab.sos] * batch_size).to(device)
418
+
419
+ for t in range(max_steps):
420
+ h, c, pred_t, alpha = decoder.attend_and_predict_next_word(encoder_out, h, c, prev_words, aux_data=aux_data)
421
+ if t > 0: # at t=1 it sees <sos> as the previous word
422
+ alpha = alpha.view(-1, enc_image_size, enc_image_size) # (bsize, enc_image_size, enc_image_size)
423
+ attention_weights.append(alpha.cpu())
424
+
425
+ pred_t /= temperature
426
+
427
+ if drop_unk:
428
+ pred_t[:, unk] = -math.inf
429
+
430
+ if t > 0:
431
+ pred_t[:, prev_words] = -math.inf # avoid repeating the same word twice
432
+
433
+ if t > 1:
434
+ pred_t[:, predictions[:,t-2].long()] = -math.inf # avoid repeating the prev-prev word
435
+
436
+ if drop_bigrams and t > 1:
437
+ prev_usage = predictions[:, :t-1] # earlier occurrences of the previous word (e.g., the first xx in "xx yy xx")
438
+ x, y = torch.where(prev_usage == torch.unsqueeze(prev_words, -1))
439
+ y += 1 # word-after-last-in-prev-usage (e.g., yy in above)
440
+ y = prev_usage[x, y].long()
441
+ pred_t[x, y] = -math.inf
442
+
443
+ if sampling_rule == 'argmax':
444
+ prev_words = torch.argmax(pred_t, 1)
445
+ elif sampling_rule == 'multinomial':
446
+ probability = torch.softmax(pred_t, 1)
447
+ prev_words = torch.multinomial(probability, 1).squeeze_(-1)
448
+ elif sampling_rule == 'topk':
449
+ row_idx = torch.arange(batch_size)
450
+ row_idx = row_idx.view([1, -1]).repeat(topk, 1).t()
451
+ # apply soft-max only over the top-k logits; all other words get zero probability
452
+ val, ind = pred_t.topk(topk, dim=1)
453
+ val = torch.softmax(val, 1)
454
+ probability = torch.zeros_like(pred_t) # only the top-k logits will have non-zero prob.
455
+ probability[row_idx, ind] = val
456
+ prev_words = torch.multinomial(probability, 1).squeeze_(-1)
457
+
458
+ predictions[:, t] = prev_words
459
+ all_predictions.append(predictions.cpu().long())
460
+ all_predictions = torch.cat(all_predictions)
461
+ attention_weights = torch.stack(attention_weights, 1)
462
+ return all_predictions, attention_weights
463
+
464
+
465
+ @torch.no_grad()
466
+ def sample_captions_beam_search(model, data_loader, beam_size, device, temperature=1, max_iter=500,
467
+ drop_unk=True, drop_bigrams=False):
468
+ """
469
+ :param model: nn.ModuleDict with 'encoder' and 'decoder' keys
470
+ :param data_loader:
471
+ :param beam_size:
472
+ :param drop_unk:
473
+ :return:
474
+
475
+ hypotheses_alphas: list carrying the attention maps over the encoded-pixel space for each produced token.
476
+ Note: batch size must be one.
477
+ """
478
+
479
+ if data_loader.batch_size != 1:
480
+ raise ValueError('not implemented for bigger batch-sizes')
481
+
482
+ model.eval()
483
+ decoder = model.decoder
484
+ vocab = model.decoder.vocab
485
+
486
+ captions = list()
487
+ hypotheses_alphas = list()
488
+ caption_log_prob = list()
489
+
490
+ aux_feat = None
491
+ for batch in tqdm.tqdm(data_loader): # For each image (batch-size = 1)
492
+ image = batch['image'].to(device) # (1, 3, H, W)
493
+
494
+ if model.decoder.uses_aux_data:
495
+ aux_data = batch['emotion'].to(device)
496
+ aux_feat = model.decoder.auxiliary_net(aux_data)
497
+
498
+ k = beam_size
499
+ encoder_out = model.encoder(image) # (1, enc_image_size, enc_image_size, encoder_dim)
500
+ enc_image_size = encoder_out.size(1)
501
+ encoder_dim = encoder_out.size(3)
502
+
503
+ # Flatten encoding
504
+ encoder_out = encoder_out.view(1, -1, encoder_dim) # (1, num_pixels, encoder_dim)
505
+ num_pixels = encoder_out.size(1)
506
+
507
+ # We'll treat the problem as having a batch size of k
508
+ encoder_out = encoder_out.expand(k, num_pixels, encoder_dim) # (k, num_pixels, encoder_dim)
509
+
510
+ # Tensor to store top k previous words at each step; now they're just <sos>
511
+ k_prev_words = torch.LongTensor([[vocab.sos]] * k).to(device) # (k, 1)
512
+
513
+ # Tensor to store top k sequences; now they're just <sos>
514
+ seqs = k_prev_words # (k, 1)
515
+
516
+ # Tensor to store top k sequences' scores; now they're just 0
517
+ top_k_scores = torch.zeros(k, 1).to(device) # (k, 1)
518
+
519
+ # Tensor to store top k sequences' alphas; now they're just 1s
520
+ seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(device) # (k, 1, enc_image_size, enc_image_size)
521
+
522
+ # Lists to store completed sequences and scores
523
+ complete_seqs = list()
524
+ complete_seqs_alpha = list()
525
+ complete_seqs_scores = list()
526
+
527
+ # Start decoding
528
+ step = 1
529
+ h, c = decoder.init_hidden_state(encoder_out)
530
+
531
+ # s (below) is a number less than or equal to k, because sequences are removed
532
+ # from this process once they hit <eos>
533
+ while True:
534
+ embeddings = decoder.word_embedding(k_prev_words).squeeze(1) # (s, embed_dim)
535
+ awe, alpha = decoder.attention(encoder_out, h) # (s, encoder_dim), (s, num_pixels)
536
+ alpha = alpha.view(-1, enc_image_size, enc_image_size) # (s, enc_image_size, enc_image_size)
537
+ gate = decoder.sigmoid(decoder.f_beta(h)) # gating scalar, (s, encoder_dim)
538
+ awe = gate * awe
539
+ decoder_input = torch.cat([embeddings, awe], dim=1)
540
+
541
+ if aux_feat is not None:
542
+ af = torch.repeat_interleave(aux_feat, decoder_input.shape[0], dim=0)
543
+ decoder_input = torch.cat([decoder_input, af], dim=1)
544
+
545
+ h, c = decoder.decode_step(decoder_input, (h, c)) # (s, decoder_dim)
546
+ scores = decoder.next_word(h) # (s, vocab_size)
547
+
548
+ if temperature != 1:
549
+ scores /= temperature
550
+
551
+ scores = F.log_softmax(scores, dim=1)
552
+
553
+ if drop_unk:
554
+ scores[:, vocab.unk] = -math.inf
555
+
556
+ if drop_bigrams and step > 2:
557
+ # drop bi-grams with frequency higher than 1.
558
+ prev_usage = seqs[:, :step-1]
559
+ x, y = torch.where(prev_usage == k_prev_words)
560
+ y += 1 # word-after-last-in-prev-usage
561
+ y = seqs[x, y]
562
+ scores[x,y] = -math.inf
563
+
564
+ if step > 2:
565
+ # drop repetitions of the form "x and x"
566
+ and_token = decoder.vocab('and')
567
+ x, y = torch.where(k_prev_words == and_token)
568
+ pre_and_word = seqs[x, step-2]
569
+ scores[x, pre_and_word] = -math.inf
570
+
571
+ # Add log-probabilities
572
+ scores = top_k_scores.expand_as(scores) + scores # (s, vocab_size)
573
+
574
+ # For the first step, all k points will have the same scores (since same k previous words, h, c)
575
+ if step == 1:
576
+ top_k_scores, top_k_words = scores[0].topk(k, 0, True, True) # (s)
577
+ else:
578
+ # Unroll and find top scores, and their unrolled indices
579
+ top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True) # (s)
580
+
581
+ # Convert unrolled indices to actual indices of scores
582
+ prev_word_inds = top_k_words // len(vocab) # (s) integer (floor) division to recover each beam index
583
+ next_word_inds = top_k_words % len(vocab) # (s)
584
+
585
+ # Add new words to sequences
586
+ seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1) # (s, step+1)
587
+ seqs_alpha = torch.cat([seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
588
+ dim=1) # (s, step+1, enc_image_size, enc_image_size)
589
+
590
+ # Which sequences are incomplete (didn't reach <eos>)?
591
+ incomplete_inds = [ind for ind, word in enumerate(next_word_inds) if word != vocab.eos]
592
+ complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))
593
+
594
+ # Set aside complete sequences
595
+ if len(complete_inds) > 0:
596
+ complete_seqs.extend(seqs[complete_inds].tolist())
597
+ complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
598
+ complete_seqs_scores.extend(top_k_scores[complete_inds].tolist())
599
+ k -= len(complete_inds) # reduce beam length accordingly
600
+
601
+ # Proceed with incomplete sequences
602
+ if k == 0:
603
+ break
604
+ seqs = seqs[incomplete_inds]
605
+ seqs_alpha = seqs_alpha[incomplete_inds]
606
+
607
+ h = h[prev_word_inds[incomplete_inds]]
608
+ c = c[prev_word_inds[incomplete_inds]]
609
+ encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
610
+ top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
611
+ k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)
612
+
613
+ # Break if things have been going on too long
614
+ if step > max_iter:
615
+ break
616
+ step += 1
617
+
618
+ s_idx = np.argsort(complete_seqs_scores)[::-1]
619
+ complete_seqs_scores = [complete_seqs_scores[i] for i in s_idx]
620
+ complete_seqs = [complete_seqs[i] for i in s_idx]
621
+ alphas = [complete_seqs_alpha[i] for i in s_idx]
622
+
623
+ captions.append(complete_seqs)
624
+ caption_log_prob.append(complete_seqs_scores)
625
+ hypotheses_alphas.append(alphas)
626
+ return captions, hypotheses_alphas, caption_log_prob
627
+
628
+
629
+ @torch.no_grad()
630
+ def properize_captions(captions, vocab, add_sos=True):
631
+ """
632
+ :param captions: torch Tensor holding M x max_len integers
633
+ :param vocab:
634
+ :param add_sos:
635
+ :return:
636
+ """
637
+ # ensure they end with eos.
638
+
639
+ new_captions = []
640
+ missed_eos = 0
641
+ for caption in captions.cpu():
642
+ ending = torch.where(caption == vocab.eos)[0]
643
+ if len(ending) >= 1: # at least one <eos> symbol is found
644
+ first_eos = ending[0]
645
+ if first_eos < len(caption):
646
+ caption[first_eos+1:] = vocab.pad
647
+ else:
648
+ missed_eos += 1
649
+ caption[-1] = vocab.eos
650
+ new_captions.append(caption)
651
+
652
+ new_captions = torch.stack(new_captions)
653
+
654
+ dummy = torch.unique(torch.where(new_captions == vocab.eos)[0])
655
+ assert len(dummy) == len(new_captions) # assert all have an eos.
656
+
657
+ if add_sos:
658
+ sos = torch.LongTensor([vocab.sos] * len(new_captions)).view(-1, 1)
659
+ new_captions = torch.cat([sos, new_captions], dim=1)
660
+ if missed_eos > 0:
661
+ warnings.warn('{} sentences without <eos> were generated.'.format(missed_eos))
662
+ return new_captions
663
+
664
+
665
+ def log_prob_of_dataset(model, data_loader, device, temperature=1):
666
+ all_log_probs = []
667
+ all_lens = []
668
+ model.eval()
669
+ for batch in data_loader:
670
+ imgs = batch['image'].to(device)
671
+ tokens = batch['tokens'].to(device)
672
+ log_probs, n_tokens = log_prob_of_caption(model, imgs, tokens, temperature=temperature)
673
+ all_log_probs.append(log_probs.cpu())
674
+ all_lens.append(n_tokens.cpu())
675
+
676
+ all_log_probs = torch.cat(all_log_probs, dim=0)
677
+ all_lens = torch.cat(all_lens, dim=0)
678
+ return all_log_probs, all_lens
679
+
680
+
681
+ def perplexity_of_dataset(model, data_loader, device):
682
+ """ For a test corpus, perplexity is 2 ^ {-l}, where l = log_2(prob_of_sentences) / M and M is the number
683
+ of tokens in the dataset.
684
+ :param model:
685
+ :param data_loader:
686
+ :param device:
687
+ :return:
688
+ """
689
+ all_log_probs, all_lens = log_prob_of_dataset(model, data_loader, device)
690
+ log_prob_per_sent = torch.sum(all_log_probs, 1).double() # sum over tokens to get the log_p of each utterance
691
+ prob_per_sent = torch.exp(log_prob_per_sent)
692
+ n_tokens = torch.sum(all_lens).double() # number of words in dataset
693
+ average_log_prob = torch.sum(torch.log2(prob_per_sent)) / n_tokens # log_2 for perplexity
694
+ perplexity = 2.0 ** (-average_log_prob)
695
+ return perplexity, prob_per_sent, all_lens
696
+
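For orientation, here is a minimal, hypothetical sketch of how the training, evaluation and sampling helpers above could be wired together; the model, data loaders and hyper-parameter values are illustrative assumptions, not part of this file:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# assumed: `model` is an nn.ModuleDict({'encoder': ..., 'decoder': ...}) and
# `train_loader`, `val_loader`, `test_loader` yield dicts with 'image'/'tokens' keys
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

for epoch in range(1, 11):
    train_loss = single_epoch_train(train_loader, model, criterion, optimizer, epoch, device,
                                    alpha_c=1.0, grad_clip=5.0, use_emotion=False)
    val_nll = negative_log_likelihood(model, val_loader, device)
    print('epoch {}: train-loss {:.3f} val-NLL {:.3f}'.format(epoch, train_loss, val_nll))

# greedy decoding of at most 30 tokens per test image
hypotheses, attn_maps = sample_captions(model, test_loader, max_utterance_len=30,
                                        sampling_rule='argmax', device=device)
hypotheses = properize_captions(hypotheses, model.decoder.vocab)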
imageprocessing/artemis/artemis/neural_models/distances.py ADDED
@@ -0,0 +1,67 @@
1
+ """
2
+ Utilities for distance measurements on the GPU.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 07/2019, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ from torch.nn.functional import normalize
11
+
12
+ def cdist(x1, x2, epsilon=1e-16):
13
+ """
14
+ :param x1: M x Feat-dim
15
+ :param x2: N x Feat-dim
16
+ :param epsilon:
17
+ :return: M x N matrix of pairwise Euclidean distances
18
+ """
19
+ x1_norm = x1.pow(2).sum(dim=-1, keepdim=True)
20
+ x2_norm = x2.pow(2).sum(dim=-1, keepdim=True)
21
+ inner_prod = torch.mm(x1, x2.t())
22
+ res = x1_norm - 2.0 * inner_prod + x2_norm.t() # You need to transpose for broadcasting to be correct.
23
+ res.clamp_min_(epsilon).sqrt_()
24
+ return res
25
+
26
+
27
+ def exclude_identity_from_neighbor_search(all_pairwise_dists, identities):
28
+ """
29
+ :param all_pairwise_dists: M x N matrix of distances
30
+ :param identities: for the k-th row of all_pairwise_dists, the entry at column identities[k] will be excluded.
31
+ :return:
32
+ """
33
+ all_pairwise_dists[range(all_pairwise_dists.size(0)), identities] = float("Inf")
34
+ return all_pairwise_dists
35
+
36
+
37
+ def k_euclidean_neighbors(k, x1, x2, exclude_identity=False, identities=None):
38
+ """ For each row vector in x1, find the k-nearest neighbors among the rows of x2.
39
+ :param k:
40
+ :param x1: M x Feat-dim
41
+ :param x2: N x Feat-dim
42
+ :param exclude_identity:
43
+ :param identities:
44
+ :return: M x k
45
+ """
46
+ all_cross_pairwise_dists = cdist(x1, x2)
47
+ if exclude_identity:
48
+ all_cross_pairwise_dists = exclude_identity_from_neighbor_search(all_cross_pairwise_dists, identities)
49
+ n_dists, n_ids = all_cross_pairwise_dists.topk(k=k, dim=1, largest=False, sorted=True)
50
+ return n_dists, n_ids
51
+
52
+
53
+ def k_cosine_neighbors(k, x1, x2, exclude_identity=False, identities=None):
54
+ """ For each row vector in x1, find the k-nearest neighbors among the rows of x2.
55
+ :param k:
56
+ :param x1: M x Feat-dim
57
+ :param x2: N x Feat-dim
58
+ :param exclude_identity:
59
+ :param identities:
60
+ :return: M x k
61
+ """
62
+ all_cross_pairwise_dists = torch.mm(normalize(x1, dim=1, p=2), normalize(x2, dim=1, p=2).t())
63
+ all_cross_pairwise_dists = 1.0 - all_cross_pairwise_dists
64
+ if exclude_identity:
65
+ all_cross_pairwise_dists = exclude_identity_from_neighbor_search(all_cross_pairwise_dists, identities)
66
+ n_dists, n_ids = all_cross_pairwise_dists.topk(k=k, dim=1, largest=False, sorted=True)
67
+ return n_dists, n_ids
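For illustration, a small usage sketch of the neighbor utilities above; the feature tensors are random stand-ins:

import torch

feats_a = torch.randn(100, 64)   # M x Feat-dim
feats_b = torch.randn(250, 64)   # N x Feat-dim

# 5 nearest neighbors of every row of feats_a among the rows of feats_b
d_euc, idx_euc = k_euclidean_neighbors(5, feats_a, feats_b)
d_cos, idx_cos = k_cosine_neighbors(5, feats_a, feats_b)

# when searching a set against itself, exclude each row's own entry
identities = torch.arange(len(feats_a))
d_self, idx_self = k_euclidean_neighbors(5, feats_a, feats_a,
                                         exclude_identity=True, identities=identities)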
imageprocessing/artemis/artemis/neural_models/image_emotion_clf.py ADDED
@@ -0,0 +1,75 @@
1
+ """
2
+ Given an image guess a distribution over the emotion labels.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch import nn
12
+ from tqdm.notebook import tqdm as tqdm_notebook
13
+
14
+ from ..utils.stats import AverageMeter
15
+
16
+
17
+ class ImageEmotionClassifier(nn.Module):
18
+ def __init__(self, img_encoder, clf_head):
19
+ super(ImageEmotionClassifier, self).__init__()
20
+ self.img_encoder = img_encoder
21
+ self.clf_head = clf_head
22
+
23
+ def __call__(self, img):
24
+ feat = self.img_encoder(img)
25
+ logits = self.clf_head(feat)
26
+ return logits
27
+
28
+
29
+ def single_epoch_train(model, data_loader, criterion, optimizer, device):
30
+ epoch_loss = AverageMeter()
31
+ model.train()
32
+ for batch in tqdm_notebook(data_loader):
33
+ img = batch['image'].to(device)
34
+ labels = batch['label'].to(device) # emotion_distribution
35
+ logits = model(img)
36
+
37
+ # Calculate loss
38
+ loss = criterion(logits, labels)
39
+
40
+ # Back prop.
41
+ optimizer.zero_grad()
42
+ loss.backward()
43
+ optimizer.step()
44
+
45
+ b_size = len(labels)
46
+ epoch_loss.update(loss.item(), b_size)
47
+ return epoch_loss.avg
48
+
49
+
50
+ @torch.no_grad()
51
+ def evaluate_on_dataset(model, data_loader, criterion, device, detailed=True, kl_div=True):
52
+ epoch_loss = AverageMeter()
53
+ model.eval()
54
+ epoch_confidence = []
55
+ for batch in tqdm_notebook(data_loader):
56
+ img = batch['image'].to(device)
57
+ labels = batch['label'].to(device) # emotion_distribution
58
+ logits = model(img)
59
+
60
+ # Calculate loss
61
+ loss = criterion(logits, labels)
62
+
63
+ if detailed:
64
+ if kl_div:
65
+ epoch_confidence.append(torch.exp(logits).cpu()) # logits are log-soft-max
66
+ else:
67
+ epoch_confidence.append(F.softmax(logits, dim=-1).cpu()) # logits are pure logits
68
+
69
+ b_size = len(labels)
70
+ epoch_loss.update(loss.item(), b_size)
71
+
72
+ if detailed:
73
+ epoch_confidence = torch.cat(epoch_confidence).numpy()
74
+
75
+ return epoch_loss.avg, epoch_confidence
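A hedged sketch of assembling and training the classifier above from other modules of this package (ResnetEncoder, MLP); the 9-way emotion histogram target, the data loaders and the hyper-parameters are assumptions:

import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
img_encoder = ResnetEncoder('resnet34', adapt_image_size=1)              # pooled (B, 512) feature
clf_head = MLP(img_encoder.embedding_dimension(), [100, 9],
               closure=nn.LogSoftmax(dim=-1))                            # log-probs, matching kl_div=True
model = ImageEmotionClassifier(img_encoder, clf_head).to(device)

criterion = nn.KLDivLoss(reduction='batchmean').to(device)               # batch['label'] holds a distribution
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

train_loss = single_epoch_train(model, train_loader, criterion, optimizer, device)
val_loss, val_confidence = evaluate_on_dataset(model, val_loader, criterion, device, kl_div=True)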
imageprocessing/artemis/artemis/neural_models/lstm_encoder.py ADDED
@@ -0,0 +1,95 @@
1
+ """
2
+ Encoding discrete tokens with LSTMs.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 2019, (updated on January 2020) for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ from torch import nn
11
+ from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
12
+
13
+
14
+ class LSTMEncoder(nn.Module):
15
+ """A feed-forward network that processes discrete tokens via an LSTM."""
16
+
17
+ def __init__(self, n_input, n_hidden, word_embedding, word_transformation=None,
18
+ bidirectional=False, init_h=None, init_c=None, eos_symbol=None, feature_type='last'):
19
+ """
20
+ :param n_input: (int) input dim of LSTM
21
+ :param n_hidden: (int) hidden dim of LSTM
22
+ :param word_embedding: (nn.Embedding) vectors representing words
23
+ :param word_transformation: (opt, nn.Module) to apply some transformation on the word
24
+ embeddings before they are consumed by the LSTM.
25
+ :param bidirectional: boolean, whether to use a bi-RNN
26
+ :param init_h: (opt, nn.Module) for initializing LSTM hidden state
27
+ :param init_c: (opt, nn.Module) for initializing LSTM memory
28
+ :param eos_symbol: (opt, int) integer marking end of sentence
29
+ :param feature_type: (opt, string) how to process the output of the LSTM,
30
+ valid options = ['last', 'max', 'mean', 'all']
31
+ """
32
+
33
+ super().__init__()
34
+ self.word_embedding = word_embedding
35
+ self.n_hidden = n_hidden
36
+ self.eos = eos_symbol
37
+ self.feature_type = feature_type
38
+
39
+ # auxiliary (optional) networks
40
+ self.word_transformation = word_transformation
41
+ self.init_h = init_h
42
+ self.init_c = init_c
43
+
44
+ self.rnn = nn.LSTM(input_size=n_input, hidden_size=n_hidden,
45
+ bidirectional=bidirectional, batch_first=True)
46
+
47
+ def out_dim(self):
48
+ rnn = self.rnn
49
+ mult = 2 if rnn.bidirectional else 1
50
+ return rnn.num_layers * rnn.hidden_size * mult
51
+
52
+ def __call__(self, tokens, grounding=None, len_of_sequence=None):
53
+ """
54
+ :param tokens:
55
+ :param grounding: (Tensor, opt)
56
+ :param len_of_sequence: (Tensor, opt) tensor of shape (B,) carrying the length of each token sequence
57
+ :return: the tokens encoded by the LSTM
58
+ Note: tokens are expected to begin with the <sos> symbol, which is skipped before encoding.
59
+ """
60
+ w_emb = self.word_embedding(tokens[:, 1:]) # skip <sos>
61
+ if self.word_transformation is not None:
62
+ w_emb = self.word_transformation(w_emb)
63
+
64
+ device = w_emb.device
65
+
66
+ if len_of_sequence is None:
67
+ len_of_sequence = torch.where(tokens == self.eos)[1] - 1 # ignore <sos>
68
+
69
+ x_packed = pack_padded_sequence(w_emb, len_of_sequence, enforce_sorted=False, batch_first=True)
70
+
71
+ self.rnn.flatten_parameters()
72
+
73
+ if grounding is not None:
74
+ h0 = self.init_h(grounding).unsqueeze(0) # rep-mat if multiple LSTM cells.
75
+ c0 = self.init_c(grounding).unsqueeze(0)
76
+ rnn_out, _ = self.rnn(x_packed, (h0, c0))
77
+ else:
78
+ rnn_out, _ = self.rnn(x_packed)
79
+
80
+ rnn_out, dummy = pad_packed_sequence(rnn_out, batch_first=True)
81
+
82
+ if self.feature_type == 'last':
83
+ batch_size = len(tokens)
84
+ lang_feat = rnn_out[torch.arange(batch_size), len_of_sequence-1]
85
+ elif self.feature_type == 'max':
86
+ lang_feat = rnn_out.max(1).values
87
+ elif self.feature_type == 'mean':
88
+ lang_feat = rnn_out.sum(1)
89
+ lang_feat /= len_of_sequence.view(-1, 1) # broadcasting
90
+ elif self.feature_type == 'all':
91
+ lang_feat = rnn_out
92
+ else:
93
+ raise ValueError('Unknown LSTM feature requested.')
94
+
95
+ return lang_feat
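A brief sketch of the encoder above on a toy batch; the vocabulary indices (<pad>=0, <sos>=1, <eos>=2) are placeholders:

import torch
from torch import nn

pad, sos, eos = 0, 1, 2
word_emb = nn.Embedding(1000, 100, padding_idx=pad)
encoder = LSTMEncoder(n_input=100, n_hidden=256, word_embedding=word_emb,
                      eos_symbol=eos, feature_type='last')

tokens = torch.tensor([[sos, 10, 11, 12, eos, pad],
                       [sos, 20, 21, eos, pad, pad]])
feat = encoder(tokens)   # (2, 256): hidden state at each sequence's last real token
print(feat.shape)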
imageprocessing/artemis/artemis/neural_models/mlp.py ADDED
@@ -0,0 +1,78 @@
1
+ """
2
+ Multi-Layer Perceptron packaged nicely for convenience.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in late 2019, for Python 3.x. Last updated in 2021.
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ from torch import nn
10
+ import numpy as np
11
+
12
+ def optional_repeat(value, times):
13
+ """ helper function, to repeat a parameter's value many times
14
+ :param value: a single basic python type (int, float, boolean, string), or a list with length equal to times
15
+ :param times: int, how many times to repeat
16
+ :return: a list with length equal to times
17
+ """
18
+ if type(value) is not list:
19
+ value = [value]
20
+
21
+ if len(value) != 1 and len(value) != times:
22
+ raise ValueError('The value should be a singleton, or a list of length equal to times.')
23
+
24
+ if len(value) == times:
25
+ return value # do nothing
26
+
27
+ return np.array(value).repeat(times).tolist()
28
+
29
+
30
+ class MLP(nn.Module):
31
+ """ Multi-layer perceptron. That is, a k-layer deep network where each layer is a fully-connected layer, with
32
+ (optionally) batch-norm, a non-linearity and dropout. The last layer (output) is always a 'pure' linear function.
33
+ """
34
+ def __init__(self, in_feat_dims, out_channels, b_norm=True, dropout_rate=0,
35
+ non_linearity=nn.ReLU(inplace=True), closure=None):
36
+ """Constructor
37
+ :param in_feat_dims: input feature dimensions
38
+ :param out_channels: list of ints, the number of neurons of each hidden/final layer.
39
+ :param b_norm: True/False, or list of booleans
40
+ :param dropout_rate: int, or list of int values
41
+ :param non_linearity: nn.Module
42
+ :param closure: optional nn.Module to use at the end of the MLP
43
+ """
44
+ super(MLP, self).__init__()
45
+ self.hidden_dimensions = out_channels[:-1]
46
+ self.embedding_dimension = out_channels[-1]
47
+
48
+ n_layers = len(out_channels)
49
+ dropout_rate = optional_repeat(dropout_rate, n_layers-1)
50
+ b_norm = optional_repeat(b_norm, n_layers-1)
51
+
52
+ previous_feat_dim = in_feat_dims
53
+ all_ops = []
54
+
55
+ for depth in range(len(out_channels)):
56
+ out_dim = out_channels[depth]
57
+ affine_op = nn.Linear(previous_feat_dim, out_dim, bias=True)
58
+ all_ops.append(affine_op)
59
+
60
+ if depth < len(out_channels) - 1:
61
+ if b_norm[depth]:
62
+ all_ops.append(nn.BatchNorm1d(out_dim))
63
+
64
+ if non_linearity is not None:
65
+ all_ops.append(non_linearity)
66
+
67
+ if dropout_rate[depth] > 0:
68
+ all_ops.append(nn.Dropout(p=dropout_rate[depth]))
69
+
70
+ previous_feat_dim = out_dim
71
+
72
+ if closure is not None:
73
+ all_ops.append(closure)
74
+
75
+ self.net = nn.Sequential(*all_ops)
76
+
77
+ def __call__(self, x):
78
+ return self.net(x)
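A small sketch of the wrapper above; the dimensions are arbitrary examples:

import torch
from torch import nn

# two hidden layers (256, 128) and a 9-way output, with a log-softmax closure
net = MLP(in_feat_dims=512, out_channels=[256, 128, 9],
          b_norm=True, dropout_rate=[0.2, 0.0], closure=nn.LogSoftmax(dim=-1))

x = torch.randn(4, 512)
print(net(x).shape)      # torch.Size([4, 9])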
imageprocessing/artemis/artemis/neural_models/resnet_encoder.py ADDED
@@ -0,0 +1,103 @@
1
+ """
2
+ Resnet Wrapper.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in late 2019, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+
10
+ import torch
11
+ from torch import nn
12
+ from torchvision import models
13
+
14
+
15
+ class ResnetEncoder(nn.Module):
16
+ """Convenience wrapper around resnet models"""
17
+ def __init__(self, backbone, adapt_image_size=None, drop=2, pretrained=True, verbose=False):
18
+ """
19
+ :param backbone: (string) resnet-S, S in [18, 34, 50, 101]
20
+ :param adapt_image_size: (opt, int) if given forward feature has
21
+ [B, adapt_image_size, adapt_image_size, feat-dim]
22
+ :param drop: how many of the last layers/blocks to drop.
23
+ :param pretrained: (Boolean)
24
+ :param verbose: (opt, Boolean) if true print actions taken.
25
+ Note: in total there are 10 layers/blocks. The last two are an adaptive_pooling and an FC, the
26
+ previous layers give rise to convolutional maps of increasing spatial size.
27
+ """
28
+
29
+ if drop == 0 and adapt_image_size is not None:
30
+ raise ValueError('Trying to apply adaptive pooling while keeping the entire model (drop=0).')
31
+
32
+ super(ResnetEncoder, self).__init__()
33
+ backbones = {
34
+ 'resnet18': models.resnet18,
35
+ 'resnet34': models.resnet34,
36
+ 'resnet50': models.resnet50,
37
+ 'resnet101': models.resnet101,
38
+ }
39
+
40
+ self.name = backbone
41
+ self.drop = drop
42
+ self.resnet = backbones[self.name](pretrained=pretrained)
43
+
44
+ # Remove linear and last adaptive pool layer
45
+ if drop > 0:
46
+ modules = list(self.resnet.children())
47
+ if verbose:
48
+ print('Removing the last {} layers of a {}'.format(drop, self.name))
49
+ print(modules[-drop:])
50
+ modules = modules[:-drop]
51
+ self.resnet = nn.Sequential(*modules)
52
+
53
+ self.adaptive_pool = None
54
+ if adapt_image_size is not None:
55
+ self.adaptive_pool = nn.AdaptiveAvgPool2d((adapt_image_size, adapt_image_size))
56
+
57
+ if pretrained:
58
+ for p in self.resnet.parameters():
59
+ p.requires_grad = False
60
+
61
+ def __call__(self, images):
62
+ """Forward prop.
63
+ :param images: images, a tensor of dimensions (batch_size, 3, image_size, image_size)
64
+ :return: encoded images
65
+ """
66
+ out = self.resnet(images) # (B, F, ceil(image_size/32), ceil(image_size/32))
67
+
68
+ if self.adaptive_pool is not None:
69
+ out = self.adaptive_pool(out) # (B, F, adapt_image_size, adapt_image_size)
70
+
71
+ if self.drop > 0: # convolutional-like output
72
+ out = out.permute(0, 2, 3, 1) # bring feature-dim last.
73
+ out = torch.squeeze(torch.squeeze(out, 1), 1) # In case adapt_image_size == 1, remove dimensions
74
+ return out
75
+
76
+ def unfreeze(self, level=5, verbose=False):
77
+ """Allow or prevent the computation of gradients for blocks after level.
78
+ The smaller the level, the less pretrained the resnet will be.
79
+ """
80
+ all_layers = list(self.resnet.children())
81
+
82
+ if verbose:
83
+ ll = len(all_layers)
84
+ print('From {} layers, you are unfreezing the last {}'.format(ll, ll-level))
85
+
86
+ for c in all_layers[level:]:
87
+ for p in c.parameters():
88
+ p.requires_grad = True
89
+ return self
90
+
91
+ def embedding_dimension(self):
92
+ """The feature (channel) dimension of the last layer"""
93
+ if self.drop == 0:
94
+ return 1000 #Imagenet Classes
95
+
96
+ if self.drop == 2:
97
+ return 512 if int(self.name.replace('resnet', '')) < 50 else 2048
98
+
99
+ if self.drop == 3:
100
+ return 256 if int(self.name.replace('resnet', '')) < 50 else 1024
101
+
102
+ raise NotImplementedError
103
+
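A minimal sketch of the wrapper above; the input batch is a random stand-in:

import torch

encoder = ResnetEncoder('resnet34', adapt_image_size=7, drop=2, pretrained=True)
images = torch.randn(2, 3, 224, 224)
features = encoder(images)                   # (2, 7, 7, 512): spatial grid, channels last
print(features.shape, encoder.embedding_dimension())

encoder.unfreeze(level=5, verbose=True)      # allow fine-tuning of the deeper residual blocks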
imageprocessing/artemis/artemis/neural_models/show_attend_tell.py ADDED
@@ -0,0 +1,45 @@
1
+ """
2
+ A custom implementation of Show-Attend-&-Tell for ArtEmis: Affective Language for Visual Art
3
+
4
+ The MIT License (MIT)
5
+ Originally created in early 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ from torch import nn
10
+ from .resnet_encoder import ResnetEncoder
11
+ from .attentive_decoder import AttentiveDecoder
12
+
13
+
14
+ def describe_model(vocab, args):
15
+ """ Describe the architecture of a SAT speaker with a resnet encoder.
16
+ :param vocab:
17
+ :param args:
18
+ :return:
19
+ """
20
+ word_embedding = nn.Embedding(len(vocab), args.word_embedding_dim, padding_idx=vocab.pad)
21
+
22
+ encoder = ResnetEncoder(args.vis_encoder, adapt_image_size=args.atn_spatial_img_size).unfreeze()
23
+ encoder_out_dim = encoder.embedding_dimension()
24
+
25
+ emo_ground_dim = 0
26
+ emo_projection_net = None
27
+ if args.use_emo_grounding:
28
+ emo_in_dim = args.emo_grounding_dims[0]
29
+ emo_ground_dim = args.emo_grounding_dims[1]
30
+ # obviously one could use more complex nets here instead of using a "linear" layer.
31
+ # in my estimate, this is not going to be useful:)
32
+ emo_projection_net = nn.Sequential(*[nn.Linear(emo_in_dim, emo_ground_dim), nn.ReLU()])
33
+
34
+ decoder = AttentiveDecoder(word_embedding,
35
+ args.rnn_hidden_dim,
36
+ encoder_out_dim,
37
+ args.attention_dim,
38
+ vocab,
39
+ dropout_rate=args.dropout_rate,
40
+ teacher_forcing_ratio=args.teacher_forcing_ratio,
41
+ auxiliary_net=emo_projection_net,
42
+ auxiliary_dim=emo_ground_dim)
43
+
44
+ model = nn.ModuleDict({'encoder': encoder, 'decoder': decoder})
45
+ return model
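A hedged sketch of instantiating the speaker above; the hyper-parameter values are illustrative assumptions and `vocab` is assumed to be the package's Vocabulary object:

from argparse import Namespace

args = Namespace(word_embedding_dim=128, vis_encoder='resnet34', atn_spatial_img_size=7,
                 rnn_hidden_dim=512, attention_dim=512, dropout_rate=0.2,
                 teacher_forcing_ratio=1.0, use_emo_grounding=True,
                 emo_grounding_dims=[9, 10])   # assumed: 9-d emotion input projected to a 10-d grounding

model = describe_model(vocab, args)
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('trainable parameters:', n_trainable)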
imageprocessing/artemis/artemis/neural_models/text_emotional_clf.py ADDED
@@ -0,0 +1,94 @@
1
+ """
2
+ Given an utterance (and optionally an image) guess a distribution over the emotion labels.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch import nn
12
+ from tqdm.notebook import tqdm as tqdm_notebook
13
+
14
+ from ..utils.stats import AverageMeter
15
+
16
+
17
+ class TextEmotionClassifier(nn.Module):
18
+ def __init__(self, text_encoder, clf_head, img_encoder=None):
19
+ super(TextEmotionClassifier, self).__init__()
20
+ self.text_encoder = text_encoder
21
+ self.clf_head = clf_head
22
+ self.img_encoder = img_encoder
23
+
24
+ def __call__(self, text, img=None):
25
+ if img is not None:
26
+ img_feat = self.img_encoder(img)
27
+ feat = self.text_encoder(text, img_feat)
28
+ else:
29
+ feat = self.text_encoder(text)
30
+
31
+ logits = self.clf_head(feat)
32
+ return logits
33
+
34
+
35
+ def single_epoch_train(model, data_loader, use_vision, criterion, optimizer, device):
36
+ epoch_loss = AverageMeter()
37
+ epoch_acc = AverageMeter()
38
+ model.train()
39
+ for batch in tqdm_notebook(data_loader):
40
+ labels = batch['emotion'].to(device)
41
+ tokens = batch['tokens'].to(device)
42
+
43
+ if use_vision:
44
+ img = batch['image'].to(device)
45
+ logits = model(tokens, img)
46
+ else:
47
+ logits = model(tokens)
48
+
49
+ # Calculate loss
50
+ loss = criterion(logits, labels)
51
+ acc = torch.mean((logits.argmax(1) == labels).double())
52
+
53
+ # Back prop.
54
+ optimizer.zero_grad()
55
+ loss.backward()
56
+ optimizer.step()
57
+
58
+ b_size = len(labels)
59
+ epoch_loss.update(loss.item(), b_size)
60
+ epoch_acc.update(acc.item(), b_size)
61
+ return epoch_loss.avg, epoch_acc.avg
62
+
63
+
64
+ @torch.no_grad()
65
+ def evaluate_on_dataset(model, data_loader, use_vision, criterion, device, detailed=True):
66
+ epoch_loss = AverageMeter()
67
+ epoch_acc = AverageMeter()
68
+ model.eval()
69
+ epoch_confidence = []
70
+ for batch in tqdm_notebook(data_loader):
71
+ labels = batch['emotion'].to(device)
72
+ tokens = batch['tokens'].to(device)
73
+ if use_vision:
74
+ img = batch['image'].to(device)
75
+ logits = model(tokens, img)
76
+ else:
77
+ logits = model(tokens)
78
+
79
+ # Calculate loss
80
+ loss = criterion(logits, labels)
81
+ guessed_correct = logits.argmax(1) == labels
82
+ acc = torch.mean(guessed_correct.double())
83
+
84
+ if detailed:
85
+ epoch_confidence.append(F.softmax(logits, dim=-1).cpu())
86
+
87
+ b_size = len(labels)
88
+ epoch_loss.update(loss.item(), b_size)
89
+ epoch_acc.update(acc.item(), b_size)
90
+
91
+ if detailed:
92
+ epoch_confidence = torch.cat(epoch_confidence).numpy()
93
+
94
+ return epoch_loss.avg, epoch_acc.avg, epoch_confidence
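A short sketch of wiring the classifier above to the LSTMEncoder and MLP of this package; `word_emb`, `vocab`, the data loaders and the 9 emotion classes are assumptions:

import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
text_encoder = LSTMEncoder(n_input=100, n_hidden=256, word_embedding=word_emb,
                           eos_symbol=vocab.eos, feature_type='last')
clf_head = MLP(text_encoder.out_dim(), [100, 9])
model = TextEmotionClassifier(text_encoder, clf_head).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

loss, acc = single_epoch_train(model, train_loader, use_vision=False,
                               criterion=criterion, optimizer=optimizer, device=device)
val_loss, val_acc, val_conf = evaluate_on_dataset(model, val_loader, use_vision=False,
                                                  criterion=criterion, device=device)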
imageprocessing/artemis/artemis/neural_models/word_embeddings.py ADDED
@@ -0,0 +1,123 @@
1
+ """
2
+ Utilities to load pretrained word embeddings like those of GloVe.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in late 2019, for Python 3.x - last updated in 2021.
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import warnings
10
+ import torch
11
+ import numpy as np
12
+ from collections import Counter
13
+
14
+
15
+ def make_pretrained_embedding(vocab, pretrained_vectors, freeze=True, sigma=1, random_seed=None):
16
+ """ Make a torch.nn.Embedding based for a given vocabulary and a collection of
17
+ pretrained word-embedding vectors.
18
+ :param vocab: speakers_listeners.build_vocab.Vocabulary
19
+ :param pretrained_vectors: dictionary of words mapped to np.array vectors
20
+ (like those returned from ```load_glove_pretrained_embedding```).
21
+ :param freeze, (opt, boolean) if True the embedding is not using gradients to optimize itself (fine-tune).
22
+ :param sigma, (opt, int) standard-deviation of Gaussian used to sample when a word is not in the pretrained_vectors
23
+ :param random_seed (opt, int) to seed the numpy Gaussian
24
+ :return: torch.nn.Embedding
25
+
26
+ Note: this implementation will freeze all words if freeze=True, irrespective of whether the words are in the
27
+ pretrained_vectors collection or not (OOV: Out-of-Vocabulary). If you want to fine-tune the OOV you need to adapt
28
+ like this: https://discuss.pytorch.org/t/updating-part-of-an-embedding-matrix-only-for-out-of-vocab-words/33297
29
+ """
30
+ for ss in vocab.special_symbols:
31
+ if ss in pretrained_vectors:
32
+ warnings.warn('the special symbol {} is found in the pretrained embedding.'.format(ss))
33
+
34
+ # Initialize weight matrix with correct dimensions and all zeros
35
+ random_key = next(iter(pretrained_vectors))
36
+ emb_dim = len(pretrained_vectors[random_key])
37
+ emb_dtype = pretrained_vectors[random_key].dtype
38
+ n_words = len(vocab)
39
+ weights = np.zeros((n_words, emb_dim), dtype=emb_dtype)
40
+
41
+ if random_seed is not None:
42
+ np.random.seed(random_seed)
43
+
44
+ for word, idx in vocab.word2idx.items():
45
+ if word in pretrained_vectors:
46
+ weights[idx] = pretrained_vectors[word]
47
+ else:
48
+ weights[idx] = sigma * np.random.randn(emb_dim)
49
+
50
+ padding_idx = None
51
+ if hasattr(vocab, 'pad'):
52
+ print('using padding symbol of provided vocabulary.')
53
+ padding_idx = vocab.pad
54
+ weights[padding_idx] = np.zeros(emb_dim)
55
+
56
+ embedding = torch.nn.Embedding.from_pretrained(torch.from_numpy(weights), freeze=freeze, padding_idx=padding_idx)
57
+ return embedding
58
+
59
+
60
+ def load_glove_pretrained_embedding(glove_file, dtype=np.float32, only_words=False, verbose=False):
61
+ """
62
+ :param glove_file: file downloaded from Glove website
63
+ :param dtype: how to save the word-embeddings
64
+ :param only_words: do not return the embedding vectors, only the words considered
65
+ :param verbose: print, or not side-information
66
+ :return: dictionary of words mapped to np.array vectors
67
+ """
68
+
69
+ if verbose:
70
+ print("Loading glove word embeddings.")
71
+
72
+ embedding = dict()
73
+ with open(glove_file) as f_in:
74
+ for line in f_in:
75
+ s_line = line.split()
76
+ token = s_line[0]
77
+ if only_words:
78
+ embedding[token] = 0
79
+ else:
80
+ w_embedding = np.array([float(val) for val in s_line[1:]], dtype=dtype)
81
+ embedding[token] = w_embedding
82
+ if only_words:
83
+ embedding = set(list(embedding.keys()))
84
+
85
+ if verbose:
86
+ print("Done.", len(embedding), "words loaded.")
87
+ return embedding
88
+
89
+
90
+ def init_token_bias(encoded_token_list, vocab=None, dtype=np.float32, trainable=True):
91
+ """ Make a bias vector based on the (log) probability of the frequency of each word
92
+ in the training data similar to https://arxiv.org/abs/1412.2306
93
+ This bias can be used to initialize the hidden-to-next-word layer for faster convergence.
94
+ :param encoded_token_list: [[tokens-of-utterance-1-as-ints] [tokens-of-utterance-2]...]
95
+ :param vocab: speakers_listeners.build_vocab.Vocabulary
96
+ :param dtype:
97
+ :param trainable: (opt, bool) permit training or not of the resulting bias vector
98
+ :return: (torch.Parameter) bias vector
99
+ """
100
+ counter = Counter()
101
+ for tokens in encoded_token_list:
102
+ counter.update(tokens)
103
+
104
+ n_items = len(counter)
105
+ if vocab is not None:
106
+ if n_items != len(vocab):
107
+ warnings.warn('init_token_bias: Vobab contains more tokens than given token lists.')
108
+ n_items = max(n_items, len(vocab))
109
+ counter[vocab.sos] = counter[vocab.pad] = min(counter.values())
110
+
111
+ bias_vector = np.ones(n_items, dtype=dtype) # initialize
112
+
113
+ for position, frequency in counter.items():
114
+ bias_vector[position] = frequency
115
+
116
+ # Log probability
117
+ bias_vector /= np.sum(bias_vector)
118
+ bias_vector = np.log(bias_vector)
119
+ bias_vector -= np.max(bias_vector)
120
+
121
+ bias_vector = torch.from_numpy(bias_vector)
122
+ bias_vector = torch.nn.Parameter(bias_vector, requires_grad=trainable)
123
+ return bias_vector