LuisV committed
Commit · dfd271a
1 Parent(s): 4859d06
adding artemis package
This view is limited to 50 files because it contains too many changes. See raw diff.
- imageprocessing/artemis/LICENSE +23 -0
- imageprocessing/artemis/README.md +160 -0
- imageprocessing/artemis/artemis/__init__.py +6 -0
- imageprocessing/artemis/artemis/analysis/__init__.py +4 -0
- imageprocessing/artemis/artemis/analysis/emotion_centric.py +72 -0
- imageprocessing/artemis/artemis/analysis/feature_extraction.py +84 -0
- imageprocessing/artemis/artemis/analysis/paintings_meta_data.py +26 -0
- imageprocessing/artemis/artemis/analysis/utils.py +80 -0
- imageprocessing/artemis/artemis/captioning/__init__.py +4 -0
- imageprocessing/artemis/artemis/captioning/sample_captions.py +78 -0
- imageprocessing/artemis/artemis/captioning/senti_cap_anps.py +111 -0
- imageprocessing/artemis/artemis/data/glove.6B.100d.vocabulary.txt +0 -0
- imageprocessing/artemis/artemis/data/image-emotion-histogram.csv +0 -0
- imageprocessing/artemis/artemis/data/speaker_sampling_configs/full_hyper_param_ablation.json.txt +182 -0
- imageprocessing/artemis/artemis/data/speaker_sampling_configs/mini_hyper_param_ablation.json.txt +12 -0
- imageprocessing/artemis/artemis/data/speaker_sampling_configs/selected_hyper_params.json.txt +7 -0
- imageprocessing/artemis/artemis/data/symspell_frequency_dictionary_en_82_765.txt +0 -0
- imageprocessing/artemis/artemis/data/wiki_art_duplicate_paintings.pkl +3 -0
- imageprocessing/artemis/artemis/data/wiki_art_genre_class.csv +0 -0
- imageprocessing/artemis/artemis/emotions.py +79 -0
- imageprocessing/artemis/artemis/evaluation/__init__.py +7 -0
- imageprocessing/artemis/artemis/evaluation/bleu.py +34 -0
- imageprocessing/artemis/artemis/evaluation/emotion_alignment.py +87 -0
- imageprocessing/artemis/artemis/evaluation/longest_common_subseq.py +63 -0
- imageprocessing/artemis/artemis/evaluation/metaphors.py +42 -0
- imageprocessing/artemis/artemis/evaluation/single_caption_per_image.py +214 -0
- imageprocessing/artemis/artemis/in_out/__init__.py +4 -0
- imageprocessing/artemis/artemis/in_out/arguments.py +199 -0
- imageprocessing/artemis/artemis/in_out/basics.py +230 -0
- imageprocessing/artemis/artemis/in_out/cleaning.py +87 -0
- imageprocessing/artemis/artemis/in_out/coco.py +30 -0
- imageprocessing/artemis/artemis/in_out/datasets.py +224 -0
- imageprocessing/artemis/artemis/in_out/neural_net_oriented.py +336 -0
- imageprocessing/artemis/artemis/language/__init__.py +4 -0
- imageprocessing/artemis/artemis/language/adjective_noun_pairs.py +44 -0
- imageprocessing/artemis/artemis/language/basics.py +132 -0
- imageprocessing/artemis/artemis/language/language_preprocessing.py +224 -0
- imageprocessing/artemis/artemis/language/part_of_speech.py +40 -0
- imageprocessing/artemis/artemis/language/spelling.py +634 -0
- imageprocessing/artemis/artemis/neural_models/__init__.py +4 -0
- imageprocessing/artemis/artemis/neural_models/attention.py +45 -0
- imageprocessing/artemis/artemis/neural_models/attentive_decoder.py +696 -0
- imageprocessing/artemis/artemis/neural_models/distances.py +67 -0
- imageprocessing/artemis/artemis/neural_models/image_emotion_clf.py +75 -0
- imageprocessing/artemis/artemis/neural_models/lstm_encoder.py +95 -0
- imageprocessing/artemis/artemis/neural_models/mlp.py +78 -0
- imageprocessing/artemis/artemis/neural_models/resnet_encoder.py +103 -0
- imageprocessing/artemis/artemis/neural_models/show_attend_tell.py +45 -0
- imageprocessing/artemis/artemis/neural_models/text_emotional_clf.py +94 -0
- imageprocessing/artemis/artemis/neural_models/word_embeddings.py +123 -0
imageprocessing/artemis/LICENSE
ADDED
@@ -0,0 +1,23 @@
ArtEmis: Affective Language for Art

The MIT License (MIT)

Copyright (c) 2021 Panos Achlioptas

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
imageprocessing/artemis/README.md
ADDED
@@ -0,0 +1,160 @@
## ArtEmis: Affective Language for Visual Art
A codebase created and maintained by <a href="https://ai.stanford.edu/~optas" target="_blank">Panos Achlioptas</a>.

![representative](https://github.com/optas/artemis/blob/master/doc/images/speaker_productions_teaser.png)


### Introduction
This work is based on the [arXiv tech report](https://arxiv.org/abs/2101.07396), which is __provisionally__ accepted at [CVPR-2021](http://cvpr2021.thecvf.com/) for an <b>Oral</b> presentation.

### Citation
If you find this work useful in your research, please consider citing:

    @article{achlioptas2021artemis,
        title={ArtEmis: Affective Language for Visual Art},
        author={Achlioptas, Panos and Ovsjanikov, Maks and Haydarov, Kilichbek and
                Elhoseiny, Mohamed and Guibas, Leonidas},
        journal = {CoRR},
        volume = {abs/2101.07396},
        year={2021}
    }

### Dataset
To get the most out of this repo, please __download__ the data associated with ArtEmis by filling in this [form](https://forms.gle/7eqiRgb764uTuexd7).

### Installation
This code has been tested with Python 3.6.9, PyTorch 1.3.1, and CUDA 10.0 on Ubuntu 16.04.

Assuming a (potentially virtual) environment with __Python 3.x__:
```Console
git clone https://github.com/optas/artemis.git
cd artemis
pip install -e .
```
This will install the repo with all its dependencies (listed in setup.py) and will enable you to do things like:
```
from artemis.models import xx
```
(provided you add this artemis repo to your PYTHONPATH).

### Playing with ArtEmis

#### Step-1 (important :pushpin:)

__Preprocess the provided annotations__ (spell-check, patch, tokenize, make train/val/test splits, etc.):
```Console
artemis/scripts/preprocess_artemis_data.py
```
This script allows you to preprocess ArtEmis according to your needs. The __default__ arguments do __minimal__
preprocessing, so the resulting output can be used to _fairly_ compare ArtEmis with other datasets and to derive the most _faithful_ statistics
about ArtEmis's nature. That is what we used in our __analysis__ and what you should use in "Step-2" below. With this in mind, do:
```Console
python artemis/scripts/preprocess_artemis_data.py -save-out-dir <ADD_YOURS> -raw-artemis-data-csv <ADD_YOURS>
```

If you wish to train __deep-nets__ (speakers, emotion classifiers, etc.) *exactly* as we did in our paper, rerun this script
with a single extra optional argument ("__--preprocess-for-deep-nets True__"). This does more aggressive filtering; use its output for
"Step-3" and "Step-4" below. Use a different save-out-dir to avoid overwriting the output of previous runs.
```Console
python artemis/scripts/preprocess_artemis_data.py -save-out-dir <ADD_YOURS> -raw-artemis-data-csv <ADD_YOURS> --preprocess-for-deep-nets True
```
To understand and customize the different hyper-parameters, please read the _help_ messages of the provided argparse arguments.

#### Step-2
__Analyze & explore the dataset__. :microscope:

Use the _minimally_ preprocessed version of ArtEmis, which includes __all__ (454,684) collected annotations.

1. This is a great place to __start__ :checkered_flag:. Run this [notebook](artemis/notebooks/analysis/analyzing_artemis.ipynb) to do basic _linguistic_, _emotion_ & _art-oriented_ __analysis__ of the ArtEmis dataset.
2. Run this [notebook](artemis/notebooks/analysis/concreteness_subjectivity_sentiment_and_POS.ipynb) to analyze ArtEmis in terms of its _concreteness_, _subjectivity_, _sentiment_ and _Parts-of-Speech_. Optionally, contrast these values with other common datasets like COCO.
3. Run this [notebook](artemis/notebooks/analysis/extract_emotion_histogram_per_image.ipynb) to extract the _emotion histograms_ (empirical distributions) of each artwork. This is __necessary__ for Step-3 (1).
4. Run this [notebook](artemis/notebooks/analysis/emotion_entropy_per_genre_or_artstyle.ipynb) to analyze the extracted emotion histograms (previous step) per art genre and style.

#### Step-3

__Train and evaluate emotion-centric image & text classifiers__. :hearts:

Use the preprocessed version of ArtEmis for __deep-nets__, which includes 429,431 annotations.
(Training these classifiers from scratch on a single GPU is a matter of __minutes__!)

1. Run this [notebook](artemis/notebooks/deep_nets/emotions/image_to_emotion_classifier.ipynb) to train an __image-to-emotion__ classifier.
2. Run this [notebook](artemis/notebooks/deep_nets/emotions/utterance_to_emotion_classifier.ipynb) to train an LSTM-based __utterance-to-emotion__ classifier, or this [notebook](artemis/notebooks/deep_nets/emotions/utterance_to_emotion_with_transformer.ipynb) to train a BERT-based one.


#### Step-4
__Train & evaluate neural-speakers.__ :bomb:

- To __train__ our customized SAT model on ArtEmis (__~2 hours__ to train on a single GPU!) do:
```Console
python artemis/scripts/train_speaker.py -log-dir <ADD_YOURS> -data-dir <ADD_YOURS> -img-dir <ADD_YOURS>

log-dir:  where to save the output of the training process, models etc.
data-dir: directory that contains the _input_ data, i.e., the output of preprocess_artemis_data.py:
          e.g., the artemis_preprocessed.csv, the vocabulary.pkl
img-dir:  the top folder containing the WikiArt image dataset in its "standard" format:
          img-dir/art_style/painting_xx.jpg
```

Note: the default optional arguments will create the same vanilla-speaker variant we used in the CVPR21 paper.

- To __train__ the __emotionally-grounded__ variant of SAT, add an extra parameter to the above call:
```Console
python artemis/scripts/train_speaker.py -log-dir <ADD_YOURS> -data-dir <ADD_YOURS> -img-dir <ADD_YOURS>
    --use-emo-grounding True
```
- To __sample__ utterances from a trained speaker:
```Console
python artemis/scripts/sample_speaker.py -arguments
```
For an explanation of the arguments, see the argparse help messages. Note that when you
want to sample from an emotionally-grounded variant, you need to provide a pretrained image2emotion
classifier. The image2emotion classifier is used to deduce _the most likely_ emotion of an image, and this emotion is fed to
the speaker. See Step-3 (1) for how to train such a net.

- To __evaluate__ the quality of the sampled captions (e.g., per BLEU, emotional alignment, metaphors, etc.) use this
[notebook](artemis/notebooks/deep_nets/speakers/evaluate_sampled_captions.ipynb). As a bonus, you can use it to inspect the _neural attention_ placed on
the different tokens/images.

### MISC
- You can make a _pseudo_ "neural speaker" by copying training sentences to the test split according to __Nearest-Neighbors__ in a pretrained
network feature space by running this 5 min. [notebook](artemis/notebooks/deep_nets/speakers/nearest_neighbor_speaker.ipynb).


### Pretrained Models (used in CVPR21-paper)
* [Image-To-Emotion classifier (81MB)](https://www.dropbox.com/s/8dfj3b36q15iieo/best_model.pt?dl=0)
    - use it within the notebook of Step-3 (1) or to _sample_ from an emotionally grounded speaker (Step-4, sampling).

* [LSTM-based Text-To-Emotion classifier (8MB)](https://www.dropbox.com/s/ruczzggqu1i6nof/best_model.pt?dl=0)
    - use it within the notebook of Step-3 (2) or to _evaluate_ the samples of a speaker (Step-4, evaluation); e.g., it is needed for emotional-alignment.

* [SAT-Speaker (434MB)](https://www.dropbox.com/s/tnbfws0m3yi06ge/vanilla_sat_speaker_cvpr21.zip?dl=0)
* [SAT-Speaker-with-emotion-grounding (431MB)](https://www.dropbox.com/s/0erh464wag8ods1/emo_grounded_sat_speaker_cvpr21.zip?dl=0)

+ The above two links also include our _sampled captions_ for the test split. You can use them to evaluate the speakers without resampling them. Please read the included README.txt.

+ __Caveats__: ArtEmis is a real-world dataset containing the opinions and sentiment of thousands of people. It is thus expected to contain text with biases, factual inaccuracies, and perhaps foul language. Please use responsibly.
The provided models are likely to be biased and/or inaccurate in ways reflected in the training data.

### News

- :champagne: ArtEmis has already attracted some noticeable media coverage, e.g., @ [New-Scientist](https://www.newscientist.com/article/2266240-ai-art-critic-can-predict-which-emotions-a-painting-will-evoke),
[HAI](https://hai.stanford.edu/news/artists-intent-ai-recognizes-emotions-visual-art),
[MarkTechPost](https://www.marktechpost.com/2021/01/30/stanford-researchers-introduces-artemis-a-dataset-containing-439k-emotion-attributions),
[KCBS-Radio](https://ai.stanford.edu/~optas/data/interviews/artemis/kcbs/SAT-AI-ART_2_2-6-21(disco_mix).mp3),
[Communications of ACM](https://cacm.acm.org/news/250312-ai-art-critic-can-predict-which-emotions-a-painting-will-evoke/fulltext),
[Synced Review](https://medium.com/@Synced/ai-art-critic-new-dataset-and-models-make-emotional-sense-of-visual-artworks-2289c6c71299),
[École Polytechnique](https://www.polytechnique.edu/fr/content/des-algorithmes-emotifs-face-des-oeuvres-dart),
[Forbes Science](https://www.forbes.com/sites/evaamsen/2021/03/30/artificial-intelligence-is-learning-to-categorize-and-talk-about-art/).

- :telephone_receiver: __important__ More code will be added in April, namely for the ANP-baseline and the comparisons of ArtEmis with other datasets; please do a git-pull at that time. The update will be _seamless_! During these first months, if you have _ANY_ question feel free to send me an email at __optas@stanford.edu__.

- :trophy: If you develop more models with ArtEmis and want to incorporate them here, please talk to me or simply do a pull-request.


#### License
This code is released under the MIT License (see the LICENSE file for details).
_In simple words, if you copy/use parts of this code please __keep the copyright note__ in place._
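For a quick look at the output of Step-1 (not part of this commit), the preprocessed CSV can be inspected with pandas. This is only a sketch: the exact column set depends on the chosen preprocessing arguments, `<save-out-dir>` is the placeholder used above, and the `emotion` column name is assumed from its use elsewhere in the package.

```python
import pandas as pd

# <save-out-dir> is the directory passed to preprocess_artemis_data.py in Step-1
df = pd.read_csv('<save-out-dir>/artemis_preprocessed.csv')
print(len(df), df.columns.tolist())
print(df.emotion.value_counts())  # counts over the nine ArtEmis emotion labels
```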
imageprocessing/artemis/artemis/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
The MIT License (MIT)
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""
from .in_out.basics import files_in_subdirs
from .in_out.basics import pickle_data, unpickle_data
imageprocessing/artemis/artemis/analysis/__init__.py
ADDED
@@ -0,0 +1,4 @@
"""
The MIT License (MIT)
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""
imageprocessing/artemis/artemis/analysis/emotion_centric.py
ADDED
@@ -0,0 +1,72 @@
"""
Utilities for emotion-centric analysis.

The MIT License (MIT)
Originally created at 10/22/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

import pandas as pd
import matplotlib.pylab as plt

from ..emotions import ARTEMIS_EMOTIONS, positive_negative_else


def df_to_emotion_histogram(df, palette=plt.cm.Pastel1, emotion_column='emotion', verbose=False):
    """ Take a dataset like ArtEmis and return a histogram over the emotion choices made by the annotators.
    :param df: dataframe carrying the dataset
    :param palette: matplotlib color palette, e.g., plt.cm.jet
    :param emotion_column: (str) indicates which column of the dataframe carries the emotion
    :return: a list carrying the resulting histogram figures.
    """
    hist_vals = []
    for emotion in ARTEMIS_EMOTIONS:
        hist_vals.append(sum(df[emotion_column] == emotion) / len(df))

    norm = plt.Normalize(min(hist_vals), max(hist_vals))
    colors = palette(norm(hist_vals))

    s = pd.DataFrame({"emotions": ARTEMIS_EMOTIONS, "vals": hist_vals})
    s.set_index("emotions", drop=True, inplace=True)
    plt.figure()
    s.index.name = None
    ax = s.plot.bar(grid=True, figsize=(12, 4), color=colors, fontsize=16, rot=45, legend=False, ec="k")
    ax.set_ylabel('Percentage of data', fontsize=15)

    for rec, col in zip(ax.patches, colors):
        rec.set_color(col)

    plt.tight_layout()
    res = [plt.gcf()]

    plt.figure()
    s = df[emotion_column].apply(positive_negative_else).value_counts() / len(df)

    if verbose:
        print('Pos-Neg-Else, percents:', s.round(3))

    ax = s.plot.bar(grid=True, figsize=(8, 4), fontsize=16, rot=45, legend=False, color='gray')
    ax.set_xticklabels(['positive', 'negative', 'else'])
    plt.tight_layout()
    res.append(plt.gcf())

    return res


def has_emotion_max_dominance(grouped_df, exclude_se=False, return_max=False):
    """ I.e., the same emotion was selected (among all nine emotions) by at least half the annotators.
    :param grouped_df: dataframe of the dataset grouped by stimuli, e.g., images.
    :param exclude_se: if True, ignore the groups where the maximizer is the something-else category
    :param return_max: for each group that has dominance, also return the emotion type that gathered the maximum annotations.
    :return:
    """
    vals = grouped_df.emotion.value_counts()
    maxim = vals.max()
    threshold = vals.sum() / 2
    res = maxim >= threshold
    if exclude_se:
        res &= vals.idxmax() != 'something else'
    if return_max:
        return res, vals.idxmax()
    else:
        return res
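A toy illustration of `has_emotion_max_dominance` defined above, on a made-up annotation dataframe grouped per painting (the votes are hypothetical, not ArtEmis data):

```python
import pandas as pd
from artemis.analysis.emotion_centric import has_emotion_max_dominance

# hypothetical annotations: three votes per painting
toy = pd.DataFrame({'painting': ['p1'] * 3 + ['p2'] * 3,
                    'emotion':  ['awe', 'awe', 'fear', 'sadness', 'amusement', 'fear']})
for painting, group in toy.groupby('painting'):
    print(painting, has_emotion_max_dominance(group, return_max=True))
# p1 -> (True, 'awe') since 2/3 of the votes agree; p2 -> (False, ...) since no emotion reaches half
```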
imageprocessing/artemis/artemis/analysis/feature_extraction.py
ADDED
@@ -0,0 +1,84 @@
"""
Routines to extract features from images.

The MIT License (MIT)
Originally created at 6/14/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

import torch
import torchvision.transforms as transforms
import numpy as np
from PIL import Image
from torchvision import models

from ..in_out.datasets import ImageClassificationDataset
from ..in_out.neural_net_oriented import image_net_mean, image_net_std
from ..neural_models.resnet_encoder import ResnetEncoder


@torch.no_grad()
def get_forward_features_of_dataset(encoder, dataloader, device, data_in_batch='image'):
    b_size = dataloader.batch_size
    for i, batch in enumerate(dataloader):
        feats = encoder(batch[data_in_batch].to(device))
        feats = feats.cpu().numpy().astype('float32')

        if i == 0:
            features = np.zeros((len(dataloader.dataset), feats.shape[1]), dtype='float32')

        if i < len(dataloader) - 1:
            features[i * b_size: (i + 1) * b_size] = feats
        else:
            # special treatment for the final batch
            features[i * b_size:] = feats
    return features


def image_transformation(img_dim, pretraining='image_net'):
    if pretraining == 'image_net':
        normalize = transforms.Normalize(mean=image_net_mean, std=image_net_std)
    else:
        raise NotImplementedError('')

    res = transforms.Compose([transforms.Resize((img_dim, img_dim), Image.LANCZOS),
                              transforms.ToTensor(), normalize])

    return res


def vgg_encoder(device):
    vgg = models.vgg16_bn(pretrained=True).to(device).eval()
    feature_storage = []

    def hook(module, hook_input, hook_output):
        feature_storage.append(hook_output.detach_().cpu().numpy())

    vgg.classifier[4].register_forward_hook(hook)  # last relu layer before classification.
    return vgg, feature_storage


@torch.no_grad()
def extract_visual_features(image_files, img_dim, method='resnet18',
                            batch_size=128, n_workers=12, device='cuda'):

    img_transform = image_transformation(img_dim)
    dataset = ImageClassificationDataset(image_files, img_transform=img_transform)

    loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size,
                                         shuffle=False, num_workers=n_workers)

    if method.startswith('resnet'):
        vis_encoder = ResnetEncoder(method, 1).to(device).eval()
        features = get_forward_features_of_dataset(vis_encoder, loader, device)

    elif method.startswith('vgg'):
        vis_encoder, features = vgg_encoder(device)
        for batch in loader:
            vis_encoder(batch['image'].to(device))
        features = np.vstack(features)

    elif method.startswith('random'):
        vis_encoder = ResnetEncoder('resnet18', 1, pretrained=False).to(device).eval()
        features = get_forward_features_of_dataset(vis_encoder, loader, device)

    return features
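As a small sketch of the preprocessing step used above, `image_transformation` can be applied to a single image before it is fed to an encoder; the `.jpg` path is hypothetical and the package must be installed:

```python
from PIL import Image
from artemis.analysis.feature_extraction import image_transformation

transform = image_transformation(img_dim=256)          # Resize -> ToTensor -> ImageNet Normalize
img = Image.open('some_painting.jpg').convert('RGB')   # hypothetical image file
tensor = transform(img)
print(tensor.shape)                                     # torch.Size([3, 256, 256])
```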
imageprocessing/artemis/artemis/analysis/paintings_meta_data.py
ADDED
@@ -0,0 +1,26 @@
"""
Manually selected famous paintings that can be optionally put in a test-set.

The MIT License (MIT)
Originally created at 6/23/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

masterpieces_for_test = [
    'leonardo-da-vinci_mona-lisa',
    'vincent-van-gogh_the-starry-night-1889(1)',
    'vincent-van-gogh_the-starry-night-1888-1',
    'vincent-van-gogh_the-starry-night-1889-1',
    'vincent-van-gogh_the-starry-night-1888-2',
    'vincent-van-gogh_the-starry-night-1888',
    'johannes-vermeer_the-girl-with-a-pearl-earring',
    'robert-silvers_girl-with-the-pearl-earring-2008',
    'robert-silvers_guernica-photomosaic-mounted-on-aluminum',
    'gustav-klimt_the-kiss-1908(1)',
    'leonardo-da-vinci_the-lady-with-the-ermine-cecilia-gallerani-1496',
    'vincent-van-gogh_cafe-terrace-on-the-place-du-forum-1888(1)',
    'vincent-van-gogh_the-cafe-terrace-on-the-place-du-forum-arles-at-night-1888',
    'vincent-van-gogh_cafe-terrace-place-du-forum-arles-1888(1)',
    'eugene-delacroix_the-liberty-leading-the-people-1830',
    'claude-monet_impression-sunrise',
    'james-mcneill-whistler_arrangement-in-grey-and-black-no-1-portrait-of-the-artist-s-mother-1871']
imageprocessing/artemis/artemis/analysis/utils.py
ADDED
@@ -0,0 +1,80 @@
"""
Auxiliary routines to be used when analyzing/comparing ArtEmis in terms of its subjectivity, abstractness etc.
See also notebooks/analysis/concreteness_subjectivity_sentiment.ipynb

The MIT License (MIT)
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm as tqdm_notebook

from collections import Counter
from ..language.basics import ngrams


def contains_word(tokenized_sentences, word_set):
    boolean_mask = tokenized_sentences.apply(lambda x: len(set(x).intersection(word_set)) >= 1)
    return boolean_mask


def contains_bigrams(tokens, bigram_set):
    token_bigrams = set([' '.join(b) for b in ngrams(tokens, 2)])
    return any(x in bigram_set for x in token_bigrams)


def concreteness_of_sentence(tokens, word_to_concreteness, count_bigrams=True):
    """Average the concreteness values of the tokens (and, optionally, the bigrams) of the sentence
    that are covered by the word_to_concreteness dictionary."""

    bigram_vals = []  # concreteness values of found bigrams
    if count_bigrams:
        # find bigrams that occur and their multiplicity
        bigrams = Counter(ngrams(tokens, 2))
        utterance = ' '.join(tokens)
        for bigram, cnt in bigrams.items():
            bigram = ' '.join(bigram)
            if bigram in word_to_concreteness:
                for _ in range(cnt):
                    bigram_vals.append(word_to_concreteness[bigram])
                utterance = utterance.replace(bigram, '')  # remove bigrams from the utterance
                                                           # so as not to double-count/score them
        tokens = utterance.split()

    unigram_vals = [word_to_concreteness[t] for t in tokens if t in word_to_concreteness]
    conc_vals = unigram_vals + bigram_vals

    if len(conc_vals) == 0:
        return None
    return sum(conc_vals) / len(conc_vals)


def pos_analysis(df, group_cols=None, round_decimal=1):
    # Assumes nltk universal pos-tagging
    # & df['pos'] has the part-of-speech tags
    # analysis along the POS used in the paper

    pos_syms = ['NOUN', 'PRON', 'ADJ', 'ADP', 'VERB']
    pos_names = ['Nouns', 'Pronouns', 'Adjectives', 'Adpositions', 'Verbs']

    if group_cols is not None:
        groups = df.groupby(group_cols)
        group_stats = []
        group_lens = []
        for n, gg in tqdm_notebook(groups):
            g_stats = defaultdict(set)
            group_lens.append(len(gg))
            for t, p in zip(gg.tokens, gg.pos):
                for x, y in zip(t, p):
                    g_stats[y[1]].add(x)
            group_stats.append(g_stats)

        for ps, pn in zip(pos_syms, pos_names):
            u_pos = []
            u_pos_norm = []
            for i, s in enumerate(group_stats):
                u_pos.append(len(s[ps]))
                u_pos_norm.append(u_pos[-1] / group_lens[i])
            print(pn, '{:.{}f}'.format(np.mean(u_pos), round_decimal), '{:.{}f}'.format(np.mean(u_pos_norm), round_decimal))
    else:
        for ps, pn in zip(pos_syms, pos_names):
            print(pn, df.pos.apply(lambda x: len([i[0] for i in x if i[1] == ps])).mean().round(round_decimal))
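A toy run of `concreteness_of_sentence` above with a made-up concreteness lexicon (a real analysis would pass a proper word-to-concreteness dictionary; the numbers here are hypothetical):

```python
from artemis.analysis.utils import concreteness_of_sentence

word_to_concreteness = {'dog': 4.9, 'freedom': 1.5, 'old': 3.0}   # hypothetical ratings
tokens = ['an', 'old', 'dog', 'symbolizing', 'freedom']
print(concreteness_of_sentence(tokens, word_to_concreteness, count_bigrams=False))
# (4.9 + 1.5 + 3.0) / 3 ≈ 3.13, averaged only over the covered tokens
```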
imageprocessing/artemis/artemis/captioning/__init__.py
ADDED
@@ -0,0 +1,4 @@
"""
The MIT License (MIT)
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""
imageprocessing/artemis/artemis/captioning/sample_captions.py
ADDED
@@ -0,0 +1,78 @@
"""
Helper functions for sampling (@test -- inference-time) a neural-speaker.

The MIT License (MIT)
Originally created at 20/1/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

import pandas as pd
import numpy as np
from torch.utils.data import DataLoader

from ..neural_models.attentive_decoder import sample_captions, sample_captions_beam_search, properize_captions
from ..in_out.basics import wikiart_file_name_to_style_and_painting
from ..emotions import IDX_TO_EMOTION
from ..utils.vocabulary import UNK


def versatile_caption_sampler(speaker, data_loader, device, max_utterance_len, sampling_rule='beam',
                              beam_size=None, topk=None, temperature=1, drop_unk=True, use_bert_unk=False,
                              drop_bigrams=False):
    """Provides all implemented sampling methods according to the sampling_rule input parameter.
    """
    vocab = speaker.decoder.vocab

    if sampling_rule == 'beam':
        dset = data_loader.dataset
        loader = DataLoader(dset, num_workers=data_loader.num_workers)  # batch-size=1

        max_iter = 8 * max_utterance_len  # should be large enough
        beam_captions, alphas, beam_scores = sample_captions_beam_search(speaker, loader, beam_size,
                                                                         device, max_iter=max_iter,
                                                                         temperature=temperature,
                                                                         drop_unk=drop_unk,
                                                                         drop_bigrams=drop_bigrams)
        # the first is the highest-scoring caption, which is the only one we keep here
        captions = [c[0] for c in beam_captions]
        alphas = [np.array(a[0]) for a in alphas]  # each alpha covers all tokens: <sos>, token1, ..., <eos>
    else:
        captions, alphas = sample_captions(speaker, data_loader, max_utterance_len=max_utterance_len,
                                           sampling_rule=sampling_rule, device=device, temperature=temperature,
                                           topk=topk, drop_unk=drop_unk, drop_bigrams=drop_bigrams)

    captions = properize_captions(captions, vocab).tolist()
    captions = tokens_to_strings(captions, vocab, bert_unk=use_bert_unk)
    return captions, alphas


def captions_as_dataframe(captions_dataset, captions_predicted, wiki_art_data=True):
    """Convert the dataset/predicted utterances (captions) to a pandas dataframe."""
    if wiki_art_data:
        temp = captions_dataset.image_files.apply(wikiart_file_name_to_style_and_painting)
        art_style, painting = zip(*temp)
        grounding_emotion = [IDX_TO_EMOTION.get(x, None) for x in captions_dataset.emotions.tolist()]
        df = pd.DataFrame([art_style, painting, grounding_emotion, captions_predicted]).transpose()
        column_names = ['art_style', 'painting', 'grounding_emotion', 'caption']
        df.columns = column_names
    else:
        image_files = captions_dataset.image_files.tolist()
        grounding_emotion = [IDX_TO_EMOTION.get(x, None) for x in captions_dataset.emotions.tolist()]
        df = pd.DataFrame([image_files, grounding_emotion, captions_predicted]).transpose()
        column_names = ['image_file', 'grounding_emotion', 'caption']
        df.columns = column_names
    return df


def tokens_to_strings(token_list, vocab, bert_unk=True):
    """ BERT uses [UNK] to represent the unknown symbol.
    :param token_list:
    :param vocab:
    :param bert_unk:
    :return:
    """
    res = [vocab.decode_print(c) for c in token_list]
    if bert_unk:
        res = [c.replace(UNK, '[UNK]') for c in res]
    return res
imageprocessing/artemis/artemis/captioning/senti_cap_anps.py
ADDED
@@ -0,0 +1,111 @@
"""
Handling ANP-data // injection of sentiment according to SentiCap: https://arxiv.org/pdf/1510.01431.pdf

The MIT License (MIT)
Originally created at 10/19/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab

Note:
    Given the lack of time to add comments: PLEASE SEE directly the notebook "sentimentalize_utterances_with_anps"
    for a use-case.
"""

import nltk
import numpy.random as random
from collections import defaultdict


def read_senticap_anps(senticap_anp_file):
    """
    :param senticap_anp_file:
    :return: two lists; the first has positive ANPs [beautiful dog, nice person], the second negative ones.
    """
    positive_anps = []
    negative_anps = []
    current_sentiment = 'positive'  # the file lists first all the positives, then all the negatives
    with open(senticap_anp_file) as fin:
        for i, line in enumerate(fin):
            if i == 0:
                continue

            if "Negative ANPs:" in line:
                current_sentiment = 'negative'
                continue

            anp = line.rstrip()

            if len(anp) == 0:
                continue

            if current_sentiment == 'negative':
                negative_anps.append(anp)
            else:
                positive_anps.append(anp)
    return positive_anps, negative_anps


def build_senticap_noun_to_ajectives(pos_anps, neg_anps):
    res = dict()
    for tag, anps in zip(['positive', 'negative'], [pos_anps, neg_anps]):
        res[tag] = defaultdict(list)
        for anp in anps:
            adjective, noun = anp.split()
            res[tag][noun].append(adjective)
    return res


def nouns_and_adjectives_of_senticap(pos_sent_anp, neg_sent_anp):
    all_nouns = set()
    all_adjectives = set()
    for catalogue in [pos_sent_anp, neg_sent_anp]:
        for item in catalogue:
            adjective, noun = item.split()
            all_nouns.add(noun)
            all_adjectives.add(adjective)
    return all_nouns, all_adjectives


def add_anp_to_sentence(sentence_tokenized, noun_to_adj, rule='random_adjective'):
    """ Pick, at random, a noun of the sentence that is a key of the noun_to_adj dictionary. Given the rule,
    pick the corresponding adjective from noun_to_adj and add it before the noun. Return the new sentence.
    If such a noun does not exist, apply no changes and return None.
    :param sentence_tokenized: ['a', 'running', 'dog']
    :param noun_to_adj: e.g., dog -> {happy, sad}, cat -> {funny, happy} etc.
    :param rule: if "most_frequent_adjective" the noun_to_adj also includes frequencies:
        e.g., dog -> {(happy, 5), (sad, 1)}
    :return:
    """
    sentence_tokenized = sentence_tokenized.copy()
    pos = nltk.pos_tag(sentence_tokenized)
    noun_pos = [i for i, x in enumerate(pos) if x[1][0] == 'N']  # all noun locations

    valid_noun_pos = []
    # Drop nouns that do not have an adjective ANP.
    for p in noun_pos:
        if sentence_tokenized[p] in noun_to_adj:
            valid_noun_pos.append(p)

    if len(valid_noun_pos) == 0:
        return None

    valid_noun_pos = sorted(valid_noun_pos)  # sort for reproducibility
    random.shuffle(valid_noun_pos)
    picked_noun_pos = valid_noun_pos[0]  # pick a noun at random
    picked_noun = sentence_tokenized[picked_noun_pos]

    if rule == 'random_adjective':
        valid_adjectives = sorted(noun_to_adj[picked_noun])  # sort for reproducibility
        random.shuffle(valid_adjectives)
        picked_adjective = valid_adjectives[0]

    elif rule == 'most_frequent_adjective':
        most_freq_adjective_with_freq = sorted(noun_to_adj[picked_noun], key=lambda x: x[1])[-1]
        picked_adjective = most_freq_adjective_with_freq[0]

    # Avoid adding an existing adjective (e.g., happy happy man)
    if picked_noun_pos > 0 and sentence_tokenized[picked_noun_pos - 1] == picked_adjective:
        pass
    else:
        sentence_tokenized.insert(picked_noun_pos, picked_adjective)

    return ' '.join(sentence_tokenized)
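A toy call of `add_anp_to_sentence` above with a hypothetical noun-to-adjective map (normally this map is built from the SentiCap ANP file via the helpers in this module; the NLTK POS-tagger data must be downloaded once):

```python
import nltk
nltk.download('averaged_perceptron_tagger', quiet=True)  # tagger used internally by add_anp_to_sentence

from artemis.captioning.senti_cap_anps import add_anp_to_sentence

noun_to_adj = {'dog': ['happy', 'lonely'], 'sky': ['gloomy']}     # hypothetical ANP map
print(add_anp_to_sentence(['a', 'dog', 'under', 'the', 'sky'], noun_to_adj))
# e.g., 'a lonely dog under the sky' (the noun and adjective are picked at random)
```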
imageprocessing/artemis/artemis/data/glove.6B.100d.vocabulary.txt
ADDED
The diff for this file is too large to render. See raw diff.
imageprocessing/artemis/artemis/data/image-emotion-histogram.csv
ADDED
The diff for this file is too large to render. See raw diff.
imageprocessing/artemis/artemis/data/speaker_sampling_configs/full_hyper_param_ablation.json.txt
ADDED
@@ -0,0 +1,182 @@
[
    {"sampling_rule": "topk", "temperature": 1.0, "topk": 10},
    {"sampling_rule": "topk", "temperature": 0.8, "topk": 10},
    {"sampling_rule": "topk", "temperature": 0.5, "topk": 10},
    {"sampling_rule": "topk", "temperature": 0.3, "topk": 10},
    {"sampling_rule": "topk", "temperature": 0.2, "topk": 10},
    {"sampling_rule": "topk", "temperature": 1.0, "topk": 15},
    {"sampling_rule": "topk", "temperature": 0.8, "topk": 15},
    {"sampling_rule": "topk", "temperature": 0.5, "topk": 15},
    {"sampling_rule": "topk", "temperature": 0.3, "topk": 15},
    {"sampling_rule": "topk", "temperature": 0.2, "topk": 15},
    {"sampling_rule": "topk", "temperature": 1.0, "topk": 20},
    {"sampling_rule": "topk", "temperature": 0.8, "topk": 20},
    {"sampling_rule": "topk", "temperature": 0.5, "topk": 20},
    {"sampling_rule": "topk", "temperature": 0.3, "topk": 20},
    {"sampling_rule": "topk", "temperature": 0.2, "topk": 20},
    {"sampling_rule": "topk", "temperature": 1.0, "topk": 5},
    {"sampling_rule": "topk", "temperature": 0.8, "topk": 5},
    {"sampling_rule": "topk", "temperature": 0.5, "topk": 5},
    {"sampling_rule": "topk", "temperature": 0.3, "topk": 5},
    {"sampling_rule": "topk", "temperature": 0.2, "topk": 5},
    {"sampling_rule": "topk", "temperature": 1.0, "topk": 3},
    {"sampling_rule": "topk", "temperature": 0.8, "topk": 3},
    {"sampling_rule": "topk", "temperature": 0.5, "topk": 3},
    {"sampling_rule": "topk", "temperature": 0.3, "topk": 3},
    {"sampling_rule": "topk", "temperature": 0.2, "topk": 3},
    {"sampling_rule": "topk", "temperature": 0.2, "topk": 1},
    {"sampling_rule": "beam", "temperature": 1.0, "beam_size": 5},
    {"sampling_rule": "beam", "temperature": 0.8, "beam_size": 5},
    {"sampling_rule": "beam", "temperature": 0.5, "beam_size": 5},
    {"sampling_rule": "beam", "temperature": 0.3, "beam_size": 5},
    {"sampling_rule": "beam", "temperature": 0.2, "beam_size": 5},
    {"sampling_rule": "beam", "temperature": 1.0, "beam_size": 10},
    {"sampling_rule": "beam", "temperature": 0.8, "beam_size": 10},
    {"sampling_rule": "beam", "temperature": 0.5, "beam_size": 10},
    {"sampling_rule": "beam", "temperature": 0.3, "beam_size": 10},
    {"sampling_rule": "beam", "temperature": 0.2, "beam_size": 10}
]
imageprocessing/artemis/artemis/data/speaker_sampling_configs/mini_hyper_param_ablation.json.txt
ADDED
@@ -0,0 +1,12 @@
[
    {"sampling_rule": "beam", "temperature": 0.5, "beam_size": 5},
    {"sampling_rule": "beam", "temperature": 0.3, "beam_size": 5}
]
imageprocessing/artemis/artemis/data/speaker_sampling_configs/selected_hyper_params.json.txt
ADDED
@@ -0,0 +1,7 @@
[
    {"sampling_rule": "beam", "temperature": 0.3, "beam_size": 5}
]
imageprocessing/artemis/artemis/data/symspell_frequency_dictionary_en_82_765.txt
ADDED
The diff for this file is too large to render. See raw diff.
imageprocessing/artemis/artemis/data/wiki_art_duplicate_paintings.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:487d4325d3a75f86c7a1f5fd05fc424924c182c391f8a645e81f1c0dd58e4a27
size 233854
imageprocessing/artemis/artemis/data/wiki_art_genre_class.csv
ADDED
The diff for this file is too large to render. See raw diff.
imageprocessing/artemis/artemis/emotions.py
ADDED
@@ -0,0 +1,79 @@
"""
Mostly some constants & very simple functions to encode/handle the emotion attributes of ArtEmis.

The MIT License (MIT)
Originally created at 02/11/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""


ARTEMIS_EMOTIONS = ['amusement', 'awe', 'contentment', 'excitement',
                    'anger', 'disgust', 'fear', 'sadness', 'something else']

EMOTION_TO_IDX = {e: i for i, e in enumerate(ARTEMIS_EMOTIONS)}


IDX_TO_EMOTION = {EMOTION_TO_IDX[e]: e for e in EMOTION_TO_IDX}


POS_NEG_ELSE = {'amusement': 0, 'awe': 0, 'contentment': 0, 'excitement': 0,
                'anger': 1, 'disgust': 1, 'fear': 1, 'sadness': 1,
                'something else': 2}


COLORS = {'amusement': '#EE82EE',
          'awe': '#FFFF00',
          'contentment': '#87CEEB',
          'excitement': '#DC143C',
          'anger': '#000080',
          'disgust': '#F0E68C',
          'fear': '#C0C0C0',
          'sadness': '#696969',
          'something else': '#228B22'}


LARGER_EMOTION_VOCAB = {('bored', 'boring', 'apathy', 'boredom', 'indifferent', 'dull', 'uninteresting', 'uninterested'),
                        ('shock', 'shocked'),
                        ('confused', 'confusion', 'confuses', 'puzzled', 'puzzling',
                         'perplexed', 'perplexing', 'confusing', 'odd', 'weird'),
                        ('surprised',),
                        ('anticipation',),
                        ('empowerment',),
                        ('hope', 'hopeful', 'optimistic'),
                        ('neutral',),
                        ('rage',),
                        ('happy', 'happiness'),
                        ('grief',),
                        ('shame',),
                        ('resent',),
                        ('creepy',),
                        ('disappointment',),
                        ('depressing', 'depressed'),
                        ('bothered', 'disturbed', 'bothersome'),
                        ('overwhelmed',),
                        ('anxiety', 'anxious'),
                        ('thrilled',),
                        ('surprised', 'surprising'),
                        ('uncomfortable',),
                        ('curious', 'curiosity', 'wonder', 'intrigued', 'interested', 'interests', 'interesting', 'intriguing'),
                        ('alerted', 'alert'),
                        ('insult', 'insulted'),
                        ('shy',),
                        ('nostalgia', 'nostalgic'),
                        ('exhilarating', 'exhilarated')}


def positive_negative_else(emotion):
    """ Map a feeling string (e.g., 'awe') to an integer indicating whether it is positive, negative, or else.
    :param emotion: (string)
    :return: int
    """
    return POS_NEG_ELSE[emotion]


def emotion_to_int(emotion):
    """ Map a feeling string (e.g., 'awe') to a unique integer.
    :param emotion: (string)
    :return: int
    """
    return EMOTION_TO_IDX[emotion]
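A quick round-trip over the constants defined in this module (assuming the package is installed as per the README):

```python
from artemis.emotions import ARTEMIS_EMOTIONS, emotion_to_int, positive_negative_else

for e in ARTEMIS_EMOTIONS:
    print(e, emotion_to_int(e), positive_negative_else(e))
# e.g., 'awe' -> index 1, group 0 (positive); 'fear' -> index 6, group 1 (negative)
```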
imageprocessing/artemis/artemis/evaluation/__init__.py
ADDED
@@ -0,0 +1,7 @@
"""
TODO: add description

The MIT License (MIT)
Originally created at 8/29/20, for Python 3.x
Copyright (c) 2020 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
"""
imageprocessing/artemis/artemis/evaluation/bleu.py
ADDED
@@ -0,0 +1,34 @@
"""
BLEU via NLTK

The MIT License (MIT)
Originally created at 8/31/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
"""
import pandas as pd
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

cc = SmoothingFunction()


def sentence_bleu_for_hypotheses(references, hypothesis, max_grams=4, smoothing_function=None):
    """ Compute the BLEU score for the hypotheses (e.g., generated captions) against the given references acting
    as ground-truth.
    :param references: (list of lists of lists) of len M. Each sublist contains tokenized references, e.g., [['a', 'boy'], ['rock', 'music']]
    :param hypothesis: (list of lists)
    :param max_grams: int, bleu-max_grams, i.e., when 4, computes BLEU-4
    :param smoothing_function:
    :return: a Series containing the scores in the same order as the input
    Note: see nltk.bleu_score.sentence_bleu
    """
    if len(references) != len(hypothesis):
        raise ValueError('Each reference (set) comes with a single hypothesis')
    if type(references[0]) != list or type(hypothesis[0]) != list:
        raise ValueError('Bad input types: use tokenized strings, and lists of tokens.')

    scores = []
    weights = (1.0 / max_grams, ) * max_grams

    for i in range(len(references)):
        scores.append(sentence_bleu(references[i], hypothesis[i], weights=weights,
                                    smoothing_function=smoothing_function))
    return pd.Series(scores)
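A minimal example of `sentence_bleu_for_hypotheses` on made-up tokenized captions, reusing the module-level `cc` smoothing object:

```python
from artemis.evaluation.bleu import sentence_bleu_for_hypotheses, cc

refs = [[['a', 'sad', 'man', 'in', 'the', 'rain']]]      # one image, one (tokenized) reference caption
hyps = [['a', 'man', 'standing', 'in', 'the', 'rain']]   # one (tokenized) generated caption
scores = sentence_bleu_for_hypotheses(refs, hyps, max_grams=2, smoothing_function=cc.method1)
print(scores[0])                                         # BLEU-2 for the single hypothesis
```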
imageprocessing/artemis/artemis/evaluation/emotion_alignment.py
ADDED
@@ -0,0 +1,87 @@
"""
Measuring the emotion-alignment between a generation and the ground-truth (emotion).

The MIT License (MIT)
Originally created at 8/31/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
"""

import torch
import numpy as np
from ..utils.basic import iterate_in_chunks


@torch.no_grad()
def image_to_emotion(img2emo_clf, data_loader, device):
    """ For each image of the underlying dataset predict an emotion.
    :param img2emo_clf: nn.Module
    :param data_loader: torch loader of the dataset to iterate
    :param device: gpu placement
    :return:
    """
    img2emo_clf.eval()
    emo_of_img_preds = []
    for batch in data_loader:
        predictions = img2emo_clf(batch['image'].to(device)).cpu()
        emo_of_img_preds.append(predictions)
    emo_of_img_preds = torch.cat(emo_of_img_preds)
    return emo_of_img_preds


@torch.no_grad()
def text_to_emotion(txt2em_clf, encoded_tokens, device, batch_size=1000):
    """
    :param txt2em_clf:
    :param encoded_tokens: Tensor carrying the encoded text
    :param device:
    :param batch_size:
    :return:
    """
    txt2em_clf.eval()
    emotion_txt_preds = []
    for chunk in iterate_in_chunks(encoded_tokens, batch_size):
        emotion_txt_preds.append(txt2em_clf(chunk.to(device)).cpu())

    emotion_txt_preds = torch.cat(emotion_txt_preds)
    maximizers = torch.argmax(emotion_txt_preds, -1)
    return emotion_txt_preds, maximizers


def unique_maximizer(a_list):
    """ Check if there is an element of the input list that appears
    strictly more frequently than any other element.
    :param a_list:
    :return:
    """
    u_elements, u_cnt = np.unique(a_list, return_counts=True)
    has_umax = sum(u_cnt == u_cnt.max()) == 1
    umax = u_elements[u_cnt.argmax()]
    return has_umax, umax


def dominant_maximizer(a_list):
    """ Check if there is an element of the input list that appears
    at least half the time.
    :param a_list:
    :return:
    """
    u_elements, u_cnt = np.unique(a_list, return_counts=True)

    has_umax = u_cnt.max() >= len(a_list) / 2

    if len(u_cnt) >= 2:  # make sure the second most frequent does not match the first.
        a, b = sorted(u_cnt)[-2:]
        if a == b:
            has_umax = False

    umax = u_elements[u_cnt.argmax()]
    return has_umax, umax


def occurrence_list_to_distribution(list_of_ints, n_support):
    """e.g., [0, 8, 8, 8] -> [1/4, 0, ..., 3/4, 0, ...]"""
    distribution = np.zeros(n_support, dtype=np.float32)
    for i in list_of_ints:
        distribution[i] += 1
    distribution /= sum(distribution)
    return distribution
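A toy illustration of the vote-aggregation helpers above, using hypothetical emotion indices chosen by five annotators:

```python
from artemis.evaluation.emotion_alignment import dominant_maximizer, occurrence_list_to_distribution

votes = [0, 0, 0, 3, 7]                                      # made-up emotion indices from five annotators
print(dominant_maximizer(votes))                             # (True, 0): index 0 holds at least half the votes
print(occurrence_list_to_distribution(votes, n_support=9))   # empirical distribution over the 9 ArtEmis emotions
```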
imageprocessing/artemis/artemis/evaluation/longest_common_subseq.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
The MIT License (MIT)
Originally created at 10/5/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

import numpy as np
from tqdm import tqdm

def lcs(s1, s2):
    """
    Longest common subsequence of two iterables. A subsequence is a sequence that appears
    in the same relative order, but is not necessarily contiguous.
    :param s1: first iterable
    :param s2: second iterable
    :return: (list) the lcs
    """
    matrix = [[[] for _ in range(len(s2))] for _ in range(len(s1))]
    for i in range(len(s1)):
        for j in range(len(s2)):
            if s1[i] == s2[j]:
                if i == 0 or j == 0:
                    matrix[i][j] = [s1[i]]
                else:
                    matrix[i][j] = matrix[i-1][j-1] + [s1[i]]
            else:
                matrix[i][j] = max(matrix[i-1][j], matrix[i][j-1], key=len)
    cs = matrix[-1][-1]
    return cs


def captions_lcs_from_training_utterances(captions_tokenized, train_utters_tokenized):
    maximizers = np.zeros(len(captions_tokenized), dtype=int)
    max_lcs = np.zeros(len(captions_tokenized))
    averages = np.zeros(len(captions_tokenized))
    for i, caption in enumerate(tqdm(captions_tokenized)):
        caption_res = [len(lcs(caption, tr_example)) for tr_example in train_utters_tokenized]
        max_loc = np.argmax(caption_res)
        maximizers[i] = max_loc
        max_lcs[i] = caption_res[max_loc]
        averages[i] = np.mean(caption_res)
    return max_lcs, averages, maximizers


###
# Panos Note:
# a) '[the] contours shadowing [and] details make this painting [look like a] photograph the way the hair is
#    layered and [the eyes] gazing off to space are fantastic'
# b) '[the] red [and] black paint strokes [look like a] bunch on [the eyes]'
# (a), (b) have lcs = 7
# but,
# a) '[the woman] is pretty nice and [has a] welcoming [facial expression]'
# b) '[the woman] looks very elegant since she [has] such [a] beautiful [facial expression]'
# (a), (b) have lcs = 6
# implying that removing stop-word articles like "a", "the" could make this metric more realistic, since the first
# pair is far more dissimilar than the second.
# also, if you use this to compare two systems, the length of the utterances could be used to normalize the bias
# that length brings in.
###
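A small, hypothetical illustration of the second pair in the note above (module path assumed from the repository layout):

from artemis.evaluation.longest_common_subseq import lcs

a = 'the woman is pretty nice and has a welcoming facial expression'.split()
b = 'the woman looks very elegant since she has such a beautiful facial expression'.split()
common = lcs(a, b)   # ['the', 'woman', 'has', 'a', 'facial', 'expression'] -> length 6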
imageprocessing/artemis/artemis/evaluation/metaphors.py
ADDED
@@ -0,0 +1,42 @@
"""
Greedy-approximate counting of similes/metaphors present in a set of sentences.

The MIT License (MIT)
Originally created at 9/1/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
"""

metaphorical_substrings = {'could be',
                           'appears to be',
                           'appear to be',
                           'reminds me',
                           'remind me',
                           'seems like',
                           'looks like',
                           'look like',
                           'is like',
                           'are like',
                           'think of',
                           'resembles',
                           'resembling'
                           }


def makes_metaphor_via_substring_matching(sentences, substrings=None):
    """
    :param sentences: list of strings
    :param substrings: iterable of substrings whose occurrence implies that a metaphor is being made
    :return: list of booleans, one per input sentence
    """
    if substrings is None:
        substrings = metaphorical_substrings

    makes_metaphor = []
    for s in sentences:
        yes = False
        for m in substrings:
            if m in s:
                yes = True
                break
        makes_metaphor.append(yes)
    return makes_metaphor
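A minimal usage sketch, assuming the package is importable as `artemis`:

from artemis.evaluation.metaphors import makes_metaphor_via_substring_matching

sentences = ['the clouds look like cotton candy', 'a woman in a red dress']
makes_metaphor_via_substring_matching(sentences)   # [True, False]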
imageprocessing/artemis/artemis/evaluation/single_caption_per_image.py
ADDED
@@ -0,0 +1,214 @@
"""
A grouping of various evaluation routines that assume that for a given set of reference
sentences there is a _single_ caption (sample) generated.

The MIT License (MIT)
Originally created at 9/1/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
"""

import torch
import warnings
import pandas as pd
import numpy as np


from .bleu import sentence_bleu_for_hypotheses, cc
from .metaphors import makes_metaphor_via_substring_matching
from .emotion_alignment import text_to_emotion
from .pycocoevalcap import Bleu, Cider, Meteor, Spice, Rouge
from .emotion_alignment import dominant_maximizer, occurrence_list_to_distribution
from .longest_common_subseq import captions_lcs_from_training_utterances
from ..utils.basic import cross_entropy

ALL_METRICS = {'bleu', 'cider', 'spice', 'meteor', 'rouge', 'emo_alignment', 'metaphor', 'lcs'}


def emotional_alignment(hypothesis, emotions, vocab, txt2em_clf, device):
    """ Text to emotion, then compare with the ground-truth emotions.
    :param hypothesis: pd.Series of strings (the generated captions)
    :param emotions: (list of list of int) human emotion-annotations (ground-truth) e.g., [[0, 1], [1]]
    :param vocab:
    :param txt2em_clf:
    :param device:
    :return: (accuracy, cross-entropy)
    """

    # from text to emotion
    hypothesis_tokenized = hypothesis.apply(lambda x: x.split())
    max_len = hypothesis_tokenized.apply(lambda x: len(x)).max()
    hypothesis = hypothesis_tokenized.apply(lambda x: np.array(vocab.encode(x, max_len=max_len)))
    hypothesis = torch.from_numpy(np.vstack(hypothesis))
    pred_logits, pred_maximizer = text_to_emotion(txt2em_clf, hypothesis, device)

    # convert emotion lists to distributions to measure cross-entropy
    n_emotions = 9
    emo_dists = torch.from_numpy(np.vstack(emotions.apply(lambda x: occurrence_list_to_distribution(x, n_emotions))))
    x_entropy = cross_entropy(pred_logits, emo_dists).item()

    # constrain predictions to those of images with a dominant maximizer of emotion
    has_max, maximizer = zip(*emotions.apply(dominant_maximizer))
    emotion_mask = np.array(has_max)
    masked_emotion = np.array(maximizer)[emotion_mask]

    guess_correct = masked_emotion == pred_maximizer[emotion_mask].cpu().numpy()
    accuracy = guess_correct.mean()

    return accuracy, x_entropy


def bleu_scores_via_nltk(hypothesis, references, smoothing_function=cc.method1):
    """
    :param hypothesis: dataframe of strings
    :param references: dataframe of lists of strings
    :param smoothing_function:
    :return: dict mapping 'BLEU-1' ... 'BLEU-4' to per-caption scores
    """

    # first tokenize
    hypothesis_tokenized = hypothesis.apply(lambda x: x.split())
    references_tokenized = references.apply(lambda x: [i.split() for i in x])

    results = dict()
    for max_grams in range(1, 5):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            scores = sentence_bleu_for_hypotheses(references_tokenized,
                                                  hypothesis_tokenized,
                                                  max_grams,
                                                  smoothing_function)
        results['BLEU-{}'.format(max_grams)] = scores
    return results


def dataframes_to_coco_eval_format(references, hypothesis):
    references = {i: [k for k in x] for i, x in enumerate(references)}
    hypothesis = {i: [x] for i, x in enumerate(hypothesis)}
    return references, hypothesis


def pycoco_bleu_scores(hypothesis, references):
    references, hypothesis = dataframes_to_coco_eval_format(references, hypothesis)
    scorer = Bleu()
    average_score, all_scores = scorer.compute_score(references, hypothesis)
    # Note: average_score takes into account the epsilons: tiny/small;
    # this won't be reflected if you take the direct average of all_scores.
    return average_score, all_scores


def pycoco_eval_scores(hypothesis, references, metric):
    references, hypothesis = dataframes_to_coco_eval_format(references, hypothesis)
    if metric == 'cider':
        scorer = Cider()
    elif metric == 'meteor':
        scorer = Meteor()
    elif metric == 'spice':
        scorer = Spice()
    elif metric == 'rouge':
        scorer = Rouge()
    else:
        raise ValueError('Unknown metric: {}'.format(metric))
    avg, all_scores = scorer.compute_score(references, hypothesis)
    return pd.Series(all_scores)


def apply_basic_evaluations(hypothesis, references, ref_emotions, txt2emo_clf, text2emo_vocab,
                            lcs_sample=None, train_utterances=None, nltk_bleu=False, smoothing_function=cc.method1,
                            device="cuda", random_seed=2021,
                            methods_to_do=ALL_METRICS):
    """
    :param hypothesis: list of strings ['a man', 'a woman']
    :param references: list of lists of strings [['a man', 'a tall man'], ['a woman']]
    :param ref_emotions: emotions corresponding to the references, list of lists of integers [[0, 1], [1]]
    :param txt2emo_clf:
    :param text2emo_vocab:
    :param device:
    :param smoothing_function:
    :return: list of pd.Series, one per computed metric (mean/std statistics)
    """
    results = []
    stat_track = ['mean', 'std']

    ##
    ## BLEU: 1-4
    ##
    if 'bleu' in methods_to_do:
        if nltk_bleu:
            res = bleu_scores_via_nltk(hypothesis, references, smoothing_function=smoothing_function)
            for metric, scores in res.items():
                stats = scores.describe()[stat_track]
                stats = pd.concat([pd.Series({'metric': metric}), stats])
                results.append(stats)
        else:
            # py-coco based
            b_scores = pycoco_bleu_scores(hypothesis, references)
            for i in range(4):
                metric = f'BLEU-{i + 1}'  # i runs over 0..3, i.e., BLEU-1 to BLEU-4
                mu = b_scores[0][i]
                # note: the std below reflects the values without the 'tiny' adaptation (unlike the mu)
                # avg_dummy = np.mean(b_scores[1][i])  # this is the average without the tiny adaptation.
                std = np.std(b_scores[1][i])
                stats = pd.concat([pd.Series({'metric': metric}), pd.Series({'mean': mu, 'std': std})])
                results.append(stats)
        print('BLEU: done')

    ##
    ## CIDER, SPICE, METEOR, ROUGE-L
    ##
    coco_requested = False
    for metric in ['cider', 'spice', 'meteor', 'rouge']:
        if metric in methods_to_do:
            stats = pycoco_eval_scores(hypothesis, references, metric).describe()[stat_track]
            stats = pd.concat([pd.Series({'metric': metric.upper()}), stats])
            results.append(stats)
            coco_requested = True
    if coco_requested:
        print('COCO-based-metrics: done')

    ##
    ## Emotional-Alignment
    ##
    if 'emo_alignment' in methods_to_do:
        emo_accuracy, emo_xentropy = emotional_alignment(hypothesis, ref_emotions, text2emo_vocab, txt2emo_clf, device)
        stats = pd.Series(emo_accuracy, dtype=float)
        stats = stats.describe()[stat_track]
        stats = pd.concat([pd.Series({'metric': 'Emo-Alignment-ACC'}), stats])
        results.append(stats)

        stats = pd.Series(emo_xentropy, dtype=float)
        stats = stats.describe()[stat_track]
        stats = pd.concat([pd.Series({'metric': 'Emo-Alignment-XENT'}), stats])
        results.append(stats)
        print('EMO-ALIGN: done')

    ##
    ## Metaphor-like expressions
    ##
    if 'metaphor' in methods_to_do:
        met_mask = makes_metaphor_via_substring_matching(hypothesis)
        stats = pd.Series(met_mask, dtype=float)
        stats = stats.describe()[stat_track]
        stats = pd.concat([pd.Series({'metric': 'Metaphors'}), stats])
        results.append(stats)
        print('Metaphor-like expressions: Done')

    ##
    ## Novelty via Longest Common Subsequence
    ##
    if 'lcs' in methods_to_do:
        np.random.seed(random_seed)  # since you will (normally) sub-sample
        train_utters_tokenized = [u.split() for u in train_utterances]
        uts = pd.Series(train_utters_tokenized).sample(lcs_sample[0]).to_list()
        hypo_token = hypothesis.apply(lambda x: x.split()).sample(lcs_sample[1]).to_list()

        max_lcs, mean_lcs, _ = captions_lcs_from_training_utterances(hypo_token, uts)
        stats = pd.Series(max_lcs).describe()[stat_track]
        stats = pd.concat([pd.Series({'metric': 'max-LCS'}), stats])
        results.append(stats)
        stats = pd.Series(mean_lcs).describe()[stat_track]
        stats = pd.concat([pd.Series({'metric': 'mean-LCS'}), stats])
        results.append(stats)
        print('Novelty via Longest Common Subsequence: Done')

    return results
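For reference, a small, hypothetical illustration of the intermediate format produced by dataframes_to_coco_eval_format (pandas Series in, COCO-eval style dicts out), assuming the package is importable as `artemis`:

import pandas as pd
from artemis.evaluation.single_caption_per_image import dataframes_to_coco_eval_format

hypothesis = pd.Series(['a man', 'a woman'])
references = pd.Series([['a man', 'a tall man'], ['a woman']])
refs, hyps = dataframes_to_coco_eval_format(references, hypothesis)
# refs == {0: ['a man', 'a tall man'], 1: ['a woman']}
# hyps == {0: ['a man'], 1: ['a woman']}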
imageprocessing/artemis/artemis/in_out/__init__.py
ADDED
@@ -0,0 +1,4 @@
"""
The MIT License (MIT)
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""
imageprocessing/artemis/artemis/in_out/arguments.py
ADDED
@@ -0,0 +1,199 @@
"""
Argument handling.

The MIT License (MIT)
Originally created at early 2020, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""
import argparse
import json
import pprint
import pathlib
import os.path as osp
from datetime import datetime
from .basics import create_dir


def str2bool(v):
    """ Boolean values for argparse.
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def parse_train_speaker_arguments(notebook_options=None, save_args=False):
    """ Default/Main arguments for training a SAT neural-speaker (via ArtEmis).
    :param notebook_options: list, if you are using this via a jupyter notebook
    :return: the parsed arguments (argparse.Namespace)
    """

    parser = argparse.ArgumentParser(description='training-a-neural-speaker')

    ## Non-optional arguments
    parser.add_argument('-log-dir', type=str, required=True, help='where to save training-progress, model, etc.')
    parser.add_argument('-data-dir', type=str, required=True, help='path to ArtEmis/COCO preprocessed data')
    parser.add_argument('-img-dir', type=str, required=True, help='path to top image (e.g., WikiArt) dir')

    # Model parameters
    parser.add_argument('--img-dim', type=int, default=256, help='images will be resized to be squared with this many pixels')
    parser.add_argument('--lanczos', type=str2bool, default=True, help='apply lanczos resampling when resizing')
    parser.add_argument('--atn-spatial-img-size', type=int, help='optional, if provided, reshapes the spatial output dimension of the '
                                                                 'visual encoder to this X this many "pixels" using average-pooling.')

    parser.add_argument('--atn-cover-img-alpha', type=float, default=1, help='attention to cover the entire image when '
                                                                             'marginalized over the tokens')
    parser.add_argument('--attention-dim', type=int, default=512)
    parser.add_argument('--rnn-hidden-dim', type=int, default=512)
    parser.add_argument('--word-embedding-dim', type=int, default=128)
    parser.add_argument('--vis-encoder', type=str, default='resnet34', choices=['resnet18',
                                                                                'resnet34',
                                                                                'resnet50',
                                                                                'resnet101'], help='visual-encoder backbone')
    parser.add_argument('--dropout-rate', type=float, default=0.1)
    parser.add_argument('--teacher-forcing-ratio', type=int, default=1)

    parser.add_argument('--use-emo-grounding', type=str2bool, default=False)
    parser.add_argument('--emo-grounding-dims', nargs=2, type=int, default=[9, 9], help='[input] number of emotions x '
                                                                                        'the size of the projection layer that '
                                                                                        'will be used to transform the one-hot emotion '
                                                                                        'to a grounding vector.')


    # Training parameters
    parser.add_argument('--resume-path', type=str, help='model-path to resume from')
    parser.add_argument('--fine-tune-data', type=str)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--num-workers', type=int, default=6)
    parser.add_argument('--gpu', type=str, default='0')
    parser.add_argument('--encoder-lr', type=float, default=1e-4)
    parser.add_argument('--decoder-lr', type=float, default=5e-4)
    parser.add_argument('--max-train-epochs', type=int, default=50)
    parser.add_argument('--train-patience', type=int, default=5, help='maximum number of consecutive epochs in which the validation '
                                                                      'Neg-LL does not improve before we stop training.')
    parser.add_argument('--lr-patience', type=int, default=2, help='maximum number of epochs to wait while the validation '
                                                                   'Neg-LL does not improve before we reduce the '
                                                                   'learning-rate.')
    parser.add_argument('--save-each-epoch', type=str2bool, default=True, help='Save the model at each epoch; else only save '
                                                                               'the one that achieved the minimal '
                                                                               'Negative-Log-Likelihood in the validation split.')

    # Misc
    parser.add_argument('--dataset', type=str, default='artemis')
    parser.add_argument('--random-seed', type=int, default=2021)
    parser.add_argument('--debug', default=False, type=str2bool)
    parser.add_argument('--use-timestamp', default=True, type=str2bool)

    # Parse arguments
    if notebook_options is not None:  # Pass options directly
        args = parser.parse_args(notebook_options)
    else:
        args = parser.parse_args()  # Read from command line.

    if args.use_timestamp:
        timestamp = datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
        args.log_dir = create_dir(osp.join(args.log_dir, timestamp))

    # pretty-print them
    args_string = pprint.pformat(vars(args))
    print(args_string)

    if save_args:
        out = osp.join(args.log_dir, 'config.json.txt')
        with open(out, 'w') as f_out:
            json.dump(vars(args), f_out, indent=4, sort_keys=True)

    return args


def parse_test_speaker_arguments(notebook_options=None):
    """ Parameters for testing (sampling) a neural-speaker.
    :param notebook_options: list, if you are using this via a jupyter notebook
    :return: the parsed arguments (argparse.Namespace)
    """
    parser = argparse.ArgumentParser(description='testing-a-neural-speaker')

    ## Basic required arguments
    parser.add_argument('-speaker-saved-args', type=str, required=True, help='config.json.txt file of the saved speaker model (output of train_speaker.py)')
    parser.add_argument('-speaker-checkpoint', type=str, required=True, help='saved model checkpoint ("best_model.pt", output of train_speaker.py)')
    parser.add_argument('-img-dir', type=str, required=True, help='path to top image dir (typically that\'s the WikiArt top-dir)')
    parser.add_argument('-out-file', type=str, required=True, help='file to save the sampled utterances, their attention etc. as a pkl')

    ## Basic optional arguments
    parser.add_argument('--split', type=str, default='test', choices=['train', 'test', 'val', 'rest'], help='set the split of the dataset you want to annotate; '
                                                                                                            'the code will load the dataset based on the dir-location marked '
                                                                                                            'in the input config.json.txt file. '
                                                                                                            'This param has no effect if a custom-data-csv is passed.')

    parser.add_argument('--custom-data-csv', type=str, help='if you want to annotate your own set of images. Please '
                                                            'see the code for what this csv should look like.')

    parser.add_argument('--subsample-data', type=int, default=-1, help='if not -1, will subsample the underlying dataset '
                                                                       'and will annotate only this many images.')


    ## Optional arguments controlling the generation/sampling process
    parser.add_argument('--max-utterance-len', type=int, help='maximum allowed length for any sampled utterance. If not given, '
                                                              'the maximum found in the underlying dataset split will be used. '
                                                              'For the official ArtEmis split for deep-nets that is 30 tokens.')

    parser.add_argument('--drop-unk', type=str2bool, default=True, help='if True, do not create samples that contain the '
                                                                        'unknown token')

    parser.add_argument('--drop-bigrams', type=str2bool, default=True, help='if True, prevent the same bigram from occurring '
                                                                            'twice in a sampled utterance')


    ## To enable passing multiple configurations for the sampler at once, i.e., so you can try many
    ## sampling temperatures, methods to sample (beam-search vs. topk), beam-sizes (or more),
    ## you can provide a simple .json that specifies the values you want to try.
    ## See >> data/speaker_sampling_configs << for examples.
    ## Note. if you pass nothing, >> data/speaker_sampling_configs/selected_hyper_params.json.txt << will be used;
    ## these are the parameters used in the paper.
    parser.add_argument('--sampling-config-file', type=str, help='Note. if max-len, drop-unk '
                                                                 'and drop-bigrams are not specified in the json, '
                                                                 'the directly provided values of these parameters '
                                                                 'will be used.')


    parser.add_argument('--random-seed', type=int, default=2021, help='if -1 it won\'t have an effect; else the sampler '
                                                                      'becomes deterministic')

    parser.add_argument('--img2emo-checkpoint', type=str, help='checkpoint file of an image-2-emotion classifier that will '
                                                               'be used to sample the grounding emotion used '
                                                               'by the speaker, if you pass an emotionally-grounded speaker. '
                                                               'Note. if you pass/use an emo-grounded speaker this argument '
                                                               'becomes required, except if you are using your own custom-data-csv '
                                                               'where you can specify the grounding emotion manually.')

    parser.add_argument('--gpu', type=str, default='0')
    parser.add_argument('--n-workers', type=int)

    parser.add_argument('--compute-nll', type=str2bool, default=False, help='Compute the negative-log-likelihood of '
                                                                            'the dataset under the saved speaker model.')


    # Parse arguments
    if notebook_options is not None:  # Pass options directly
        args = parser.parse_args(notebook_options)
    else:
        args = parser.parse_args()  # Read from command line.

    # load "default"
    if args.sampling_config_file is None:
        up_dir = osp.split(pathlib.Path(__file__).parent.absolute())[0]
        args.sampling_config_file = osp.join(up_dir, 'data/speaker_sampling_configs/selected_hyper_params.json.txt')

    # pretty-print them
    print('\nParameters Specified:')
    args_string = pprint.pformat(vars(args))
    print(args_string)
    print('\n')

    return args
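A hypothetical notebook-style invocation of the test-time parser defined above (all paths below are placeholders, and the package is assumed to be importable as `artemis`):

from artemis.in_out.arguments import parse_test_speaker_arguments

args = parse_test_speaker_arguments(notebook_options=[
    '-speaker-saved-args', '/path/to/log_dir/config.json.txt',
    '-speaker-checkpoint', '/path/to/log_dir/best_model.pt',
    '-img-dir', '/path/to/wiki_art',
    '-out-file', '/tmp/sampled_captions.pkl',
    '--split', 'test'])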
imageprocessing/artemis/artemis/in_out/basics.py
ADDED
@@ -0,0 +1,230 @@
"""
Basic (simple) I/O Utilities.

The MIT License (MIT)
Originally created in 2019, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

import re
import os
import json
import sys
import numpy as np
import pandas as pd
import os.path as osp
import pprint
import logging
from argparse import ArgumentParser
from IPython.display import display
from PIL import Image
from six.moves import cPickle, range
from ..emotions import ARTEMIS_EMOTIONS


def files_in_subdirs(top_dir, search_pattern):
    join = osp.join
    regex = re.compile(search_pattern)
    for path, _, files in os.walk(top_dir):
        for name in files:
            full_name = join(path, name)
            if regex.search(full_name):
                yield full_name


def create_dir(dir_path):
    """ Creates a directory (or nested directories) if it doesn't exist.
    """
    if not osp.exists(dir_path):
        os.makedirs(dir_path)

    return dir_path


def pickle_data(file_name, *args):
    """ Using (c)Pickle to save multiple python objects in a single file.
    """
    out_file = open(file_name, 'wb')
    cPickle.dump(len(args), out_file, protocol=2)
    for item in args:
        cPickle.dump(item, out_file, protocol=2)
    out_file.close()


def unpickle_data(file_name, python2_to_3=False):
    """ Restore data previously saved with pickle_data().
    :param file_name: file holding the pickled data.
    :param python2_to_3: (boolean), if True, pickling happened under python2x and unpickling under python3x.
    :return: a generator over the un-pickled items.
    Note, about implementing the python2_to_3 option, see
        https://stackoverflow.com/questions/28218466/unpickling-a-python-2-object-with-python-3
    """
    in_file = open(file_name, 'rb')
    if python2_to_3:
        size = cPickle.load(in_file, encoding='latin1')
    else:
        size = cPickle.load(in_file)

    for _ in range(size):
        if python2_to_3:
            yield cPickle.load(in_file, encoding='latin1')
        else:
            yield cPickle.load(in_file)
    in_file.close()


def load_raw_amt_csv_hit_responses(top_csv_folder, verbose=True, only_approved=True,
                                   keep_cols=None, drop_rorschach=True, has_emotions=True):
    """
    :param top_csv_folder:
    :param verbose:
    :param only_approved:
    :param keep_cols:
    :param drop_rorschach:
    :param has_emotions: set to False to load wiki-art annotations that are objective (OLA-dataset)
    :return:
    """

    all_collected_csv = [f for f in files_in_subdirs(top_csv_folder, '.csv$')]

    if verbose:
        print('{} files loaded'.format(len(all_collected_csv)))

    all_csv_names = [osp.basename(f) for f in all_collected_csv]
    assert len(all_csv_names) == len(set(all_csv_names))  # unique names

    all_dfs = []
    for f in all_collected_csv:  # load each .csv
        df = pd.read_csv(f)
        # print(df['AssignmentStatus'].unique())
        in_submission_mode = (df['AssignmentStatus'] == 'Submitted').sum()
        if in_submission_mode > 0:
            print('In {}, {} examples are still in submitted mode.'.format(osp.basename(f), in_submission_mode))
        if only_approved:
            df = df[df['AssignmentStatus'] == 'Approved']
        all_dfs.append(df)
    df = pd.concat(all_dfs)

    # Rename columns
    new_cols = [c.replace('choice.', '') for c in [c.replace('Answer.', '') for c in df.columns]]
    new_cols = [c.lower() for c in new_cols]
    df.columns = new_cols
    df = df.reset_index()

    # Keep ML-related columns
    ml_related_cols = ['workerid', 'input.image_url', 'utterance']
    # Add potential extras requested at the input
    if keep_cols is not None:
        ml_related_cols += keep_cols

    if has_emotions:
        _, x = np.where(df[ARTEMIS_EMOTIONS])
        emotion_chosen = pd.Series(np.array(ARTEMIS_EMOTIONS)[x], name='emotion')
        df = pd.concat([df[ml_related_cols], emotion_chosen], axis=1)
    else:
        df = df[ml_related_cols]

    # Derivative columns
    def url_to_painting_name(x):
        tokens = x.split('/')
        return tokens[-1][:-len('.jpg')]

    def url_to_art_style(x):
        tokens = x.split('/')
        return tokens[-2]

    df['painting'] = df['input.image_url'].apply(url_to_painting_name)
    df['art_style'] = df['input.image_url'].apply(url_to_art_style)
    df = df.drop(['input.image_url'], axis=1)

    if drop_rorschach:
        df = df[df['art_style'] != 'test']
        df.reset_index(inplace=True, drop=True)

    if verbose:
        print('Loading responses:', len(df))
        print('Column Names:', [c for c in df.columns])

    return df


def splitall(path):
    """
    Examples:
        splitall('a/b/c') -> ['a', 'b', 'c']
        splitall('/a/b/c/') -> ['/', 'a', 'b', 'c', '']

    NOTE: https://www.oreilly.com/library/view/python-cookbook/0596001673/ch04s16.html
    """
    allparts = []
    while 1:
        parts = osp.split(path)
        if parts[0] == path:  # Sentinel for absolute paths.
            allparts.insert(0, parts[0])
            break
        elif parts[1] == path:  # Sentinel for relative paths.
            allparts.insert(0, parts[1])
            break
        else:
            path = parts[0]
            allparts.insert(0, parts[1])
    return allparts


def wikiart_file_name_to_style_and_painting(filename):
    """
    Assumes the filename of a WikiArt painting and returns its (art_style, painting) pair.
    :param filename:
    :return:
    """
    s = splitall(filename)
    return s[-2], s[-1][:-len('.jpg')]


def show_random_captions(df, top_img_dir):
    painting, art_style = df.sample(1)[['painting', 'art_style']].iloc[0]
    print(art_style, painting)
    display(Image.open(osp.join(top_img_dir, art_style, painting + '.jpg')))
    s = df[(df.painting == painting) & (df.art_style == art_style)]
    for e, u in zip(s['emotion'], s['utterance']):
        print('{}:\t{}'.format(e.upper(), u))


def read_saved_args(config_file, override_args=None, verbose=False):
    """
    :param config_file: json file containing arguments
    :param override_args: dict e.g., {'gpu': '0'}
    :param verbose:
    :return:
    """
    parser = ArgumentParser()
    args = parser.parse_args([])
    with open(config_file, 'r') as f_in:
        args.__dict__ = json.load(f_in)

    if override_args is not None:
        for key, val in override_args.items():
            args.__setattr__(key, val)

    if verbose:
        args_string = pprint.pformat(vars(args))
        print(args_string)

    return args


def create_logger(log_dir, std_out=True):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(message)s')

    # Add logging to file handler
    file_handler = logging.FileHandler(osp.join(log_dir, 'log.txt'))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Add stdout to also print statements there
    if std_out:
        logger.addHandler(logging.StreamHandler(sys.stdout))
    return logger
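A tiny, hypothetical round-trip showing how pickle_data and unpickle_data are meant to pair up (the output path is a placeholder):

from artemis.in_out.basics import pickle_data, unpickle_data

pickle_data('/tmp/example.pkl', [1, 2, 3], {'a': 0})   # stores the item count, then each item
a_list, a_dict = unpickle_data('/tmp/example.pkl')     # the generator yields the items back in order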
imageprocessing/artemis/artemis/in_out/cleaning.py
ADDED
@@ -0,0 +1,87 @@
"""
Data Cleaning Utilities.

The MIT License (MIT)
Originally created in 2020, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

import pathlib
import os.path as osp
from tqdm import tqdm_notebook as tqdm
from ..in_out.basics import unpickle_data, splitall


def load_duplicate_paintings_of_wikiart(duplicates_pkl_file=None, verbose=True):
    """ Return a list containing WikiArt paintings that are double-listed.
    :param duplicates_pkl_file: (opt) pkl file containing the duplicate groups.
    :return: (list of list) each sublist contains tuples like (art_style, painting) that are duplicates.

    Note. If duplicates_pkl_file is None, the .pkl file stored inside the repo will be used. The duplicates indicated
    in that .pkl were found by a combination of running the `fdupes' program and a manual check on Nearest-Neighbors
    (of a ResNet pretrained on ImageNet) that had very small distances.
    """
    if duplicates_pkl_file is None:
        up_dir = osp.split(pathlib.Path(__file__).parent.absolute())[0]
        duplicates_pkl_file = osp.join(up_dir, 'data/wiki_art_duplicate_paintings.pkl')
        # Note. See the docstring above for how the duplicates in this file were found.
    duplicates_as_list = next(unpickle_data(duplicates_pkl_file))
    if verbose:
        print("Using {} groups of paintings that are visually identical (duplicates).".format(len(duplicates_as_list)))
    return duplicates_as_list


def drop_duplicate_paintings(wiki_art_image_files, duplicate_groups=None):
    """
    :param wiki_art_image_files: (list) with filenames of the form xx/xx/art_style/painting.jpg
    :param duplicate_groups: list of lists, each item is a collection of (art_style, painting) tuples that are duplicates.
    :return: a new list where from each duplicate group only one (the first) painting is kept.
    """
    if duplicate_groups is None:
        duplicate_groups = load_duplicate_paintings_of_wikiart()

    drop_these = set()
    for dup_g in duplicate_groups:
        drop_these.update(dup_g[1:])  # drop all but the first

    clean_img_files = []
    dropped = 0
    for img_file in wiki_art_image_files:
        tokens = splitall(img_file)
        painting = tokens[-1][:-len('.jpg')]
        art_style = tokens[-2]
        key = (art_style, painting)
        if key in drop_these:
            dropped += 1
        else:
            clean_img_files.append(img_file)
    print('Dropping {} of the {} paintings, since they are duplicates of a painting that is kept.'.format(dropped,
                                                                                                          len(wiki_art_image_files)))
    return clean_img_files


def merge_artemis_annotations_on_wikiart_duplicates(dataset_df, duplicate_groups=None, verbose=True):
    """
    :param dataset_df:
    :param duplicate_groups:
    :return:
    """

    if duplicate_groups is None:
        duplicate_groups = load_duplicate_paintings_of_wikiart()

    n_merged_stimuli = 0
    for dup_g in tqdm(duplicate_groups):
        keep_this = dup_g[0]
        drop_these = dup_g[1:]  # drop all but the first
        for stimulus in drop_these:
            mask = (dataset_df['art_style'] == stimulus[0]) & (dataset_df['painting'] == stimulus[1])
            n_merged_stimuli += sum(mask)
            dataset_df.loc[mask, ['art_style']] = keep_this[0]
            dataset_df.loc[mask, ['painting']] = keep_this[1]
    if verbose:
        print('{} stimuli were merged.'.format(n_merged_stimuli))
    return dataset_df
imageprocessing/artemis/artemis/in_out/coco.py
ADDED
@@ -0,0 +1,30 @@
"""
COCO related I/O operations.

The MIT License (MIT)
Originally created at 10/18/20, for Python 3.x
Copyright (c) 2020 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

import os.path as osp

def coco_image_name_to_image_file(image_name, top_img_dir, year=2014):
    if image_name.startswith('COCO_val'):
        return osp.join(top_img_dir, 'val' + str(year), image_name)
    elif image_name.startswith('COCO_train'):
        return osp.join(top_img_dir, 'train' + str(year), image_name)
    else:
        raise ValueError


def karpathize(df):
    ## Per Karpathy's tweet: restval is actually train.
    df.loc[df.split == 'restval', 'split'] = 'train'  # .loc avoids chained-assignment pitfalls


def prepare_coco_dataframe_for_training(df, top_img_dir):
    # assign file-names to each image
    df = df.assign(image_files=df.image.apply(lambda x: coco_image_name_to_image_file(x, top_img_dir)))
    # fix splits
    karpathize(df)
    return df
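For illustration, the path mapping performed by coco_image_name_to_image_file (the top_img_dir is a placeholder):

from artemis.in_out.coco import coco_image_name_to_image_file

coco_image_name_to_image_file('COCO_val2014_000000000042.jpg', '/data/coco')
# -> '/data/coco/val2014/COCO_val2014_000000000042.jpg'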
imageprocessing/artemis/artemis/in_out/datasets.py
ADDED
@@ -0,0 +1,224 @@
"""
The MIT License (MIT)
Originally created in 2020, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

import torch
import numpy as np
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from ..evaluation.emotion_alignment import image_to_emotion
from ..emotions import emotion_to_int


class AffectiveCaptionDataset(Dataset):
    """ Basically, an image, with a caption, and an indicated emotion.
    """
    def __init__(self, image_files, tokens, emotions, n_emotions=9, img_transform=None, one_hot_emo=True):
        super(AffectiveCaptionDataset, self).__init__()
        self.image_files = image_files
        self.tokens = tokens
        self.emotions = emotions
        self.n_emotions = n_emotions
        self.img_transform = img_transform
        self.one_hot_emo = one_hot_emo

    def __getitem__(self, index):
        text = np.array(self.tokens[index]).astype(np.int64)

        if self.image_files is not None:
            img = Image.open(self.image_files[index])

            if img.mode != 'RGB':
                img = img.convert('RGB')

            if self.img_transform is not None:
                img = self.img_transform(img)
        else:
            img = []

        if self.n_emotions > 0:
            if self.one_hot_emo:
                emotion = np.zeros(self.n_emotions, dtype=np.float32)
                emotion[self.emotions[index]] = 1
            else:
                emotion = self.emotions[index]
        else:
            emotion = []

        res = {'image': img, 'emotion': emotion, 'tokens': text, 'index': index}
        return res

    def __len__(self):
        return len(self.tokens)


class ImageClassificationDataset(Dataset):
    def __init__(self, image_files, labels=None, img_transform=None, rgb_only=True):
        super(ImageClassificationDataset, self).__init__()
        self.image_files = image_files
        self.labels = labels
        self.img_transform = img_transform
        self.rgb_only = rgb_only

    def __getitem__(self, index):
        img = Image.open(self.image_files[index])

        if self.rgb_only and img.mode != 'RGB':
            img = img.convert('RGB')

        if self.img_transform is not None:
            img = self.img_transform(img)

        label = []
        if self.labels is not None:
            label = self.labels[index]

        res = {'image': img, 'label': label, 'index': index}
        return res

    def __len__(self):
        return len(self.image_files)


def sub_sample_dataloader(dataloader, sample_size, seed=None, shuffle=False):
    """ Given any torch dataloader create a sub-sampled version of it.
    :param dataloader:
    :param sample_size:
    :param seed:
    :param shuffle:
    :return: dataloader of Subset
    """

    dataset = dataloader.dataset
    n_total = len(dataset)

    if sample_size > n_total:
        raise ValueError

    if seed is not None:
        torch.manual_seed(seed)

    sb_dataset = torch.utils.data.random_split(dataset, [sample_size, n_total - sample_size])[0]
    bsize = min(dataloader.batch_size, sample_size)
    sample_loader = torch.utils.data.DataLoader(dataset=sb_dataset,
                                                batch_size=bsize,
                                                shuffle=shuffle,
                                                num_workers=dataloader.num_workers)
    return sample_loader


def sub_index_affective_dataloader(affective_dataloader, indices, shuffle=False):
    """ Given a torch dataloader and a sequence of integers, extract the corresponding items of the
    carried dataset at the specified indices and make a new dataloader with them.
    :param affective_dataloader: torch.utils.data.DataLoader for AffectiveCaptionDataset
    :param indices: sequence of integers indexing the underlying dataset (dataframe).
    :param shuffle: shuffle the data of the resulting dataloader
    :return: dataloader of AffectiveCaptionDataset
    """
    dataset = affective_dataloader.dataset
    r_img_files = dataset.image_files.iloc[indices].copy()
    r_tokens = dataset.tokens.iloc[indices].copy()
    r_emotions = dataset.emotions.iloc[indices].copy()

    r_img_files.reset_index(inplace=True, drop=True)
    r_tokens.reset_index(inplace=True, drop=True)
    r_emotions.reset_index(inplace=True, drop=True)

    r_dset = AffectiveCaptionDataset(image_files=r_img_files, tokens=r_tokens,
                                     emotions=r_emotions, img_transform=dataset.img_transform)

    batch_size = min(len(indices), affective_dataloader.batch_size)

    r_loader = torch.utils.data.DataLoader(r_dset,
                                           shuffle=shuffle,
                                           batch_size=batch_size,
                                           num_workers=affective_dataloader.num_workers)
    return r_loader


def group_annotations_per_image(affective_dataset):
    """ Group the annotations per image.
    :param affective_dataset: an AffectiveCaptionDataset
    :return: for each image its tokens/emotions as pandas Dataframes
    """
    df = pd.concat([affective_dataset.image_files, affective_dataset.tokens, affective_dataset.emotions], axis=1)
    tokens_grouped = df.groupby('image_files')['tokens_encoded'].apply(list).reset_index(name='tokens_encoded')
    emotion_grouped = df.groupby('image_files')['emotion_label'].apply(list).reset_index(name='emotion')
    assert all(tokens_grouped['image_files'] == emotion_grouped['image_files'])
    return tokens_grouped['image_files'], tokens_grouped, emotion_grouped


def default_grounding_dataset_from_affective_loader(loader, img2emo_clf=None, device=None, n_workers=None):
    """
    Convenience function. Given a loader carrying an affective dataset, make a new loader only w.r.t. the
    unique images of the dataset, & optionally add to each image the emotion predicted by the img2emo_clf.
    The new loader can be used to sample utterances over the unique images.
    :param loader:
    :param img2emo_clf:
    :param device:
    :return:
    """
    affective_dataset = loader.dataset
    img_files, tokens, emotions = group_annotations_per_image(affective_dataset)

    img_trans = affective_dataset.img_transform
    batch_size = loader.batch_size

    if n_workers is None:
        n_workers = loader.num_workers

    dummy = pd.Series(np.ones(len(img_files), dtype=int) * -1)

    # possibly predict grounding emotions
    if img2emo_clf is not None:
        temp_dataset = ImageClassificationDataset(image_files=img_files,
                                                  img_transform=img_trans)
        img_dataloader = DataLoader(temp_dataset, batch_size, num_workers=n_workers)
        emo_pred_distribution = image_to_emotion(img2emo_clf, img_dataloader, device)

        grounding_emo = pd.Series(emo_pred_distribution.argmax(-1).tolist())  # use the maximizer of the emotions.
    else:
        grounding_emo = dummy

    new_dataset = AffectiveCaptionDataset(img_files, tokens=dummy, emotions=grounding_emo,
                                          img_transform=img_trans)

    new_loader = DataLoader(dataset=new_dataset, batch_size=batch_size, num_workers=n_workers)
    return new_loader


def custom_grounding_dataset_similar_to_affective_loader(grounding_data_csv, loader, n_workers=None):
    """
    Convenience function. Given a csv indicating (grounding) images on the hard-drive and a loader carrying an affective
    dataset, make a new loader with the csv images using the same configuration (e.g., img_transform) as the loader.
    :param grounding_data_csv: (csv filename)
        - has to have one column named "image_file" that corresponds to the file-names of the images.
        - (optionally) can also have a "grounding_emotion" column with values like "contentment"
    :param loader:
    :return:
    """
    df = pd.read_csv(grounding_data_csv)
    image_files = df['image_file']
    dummy = pd.Series(np.ones(len(image_files), dtype=int) * -1)
    if 'grounding_emotion' in df.columns:
        emotions = df.grounding_emotion.apply(emotion_to_int)
    else:
        emotions = dummy

    standard_dset = loader.dataset
    custom_dataset = AffectiveCaptionDataset(image_files, dummy, emotions=emotions,
                                             n_emotions=standard_dset.n_emotions,
                                             img_transform=standard_dset.img_transform,
                                             one_hot_emo=standard_dset.one_hot_emo)
    if n_workers is None:
        n_workers = loader.num_workers

    custom_data_loader = torch.utils.data.DataLoader(dataset=custom_dataset,
                                                     batch_size=min(loader.batch_size, len(custom_dataset)),
                                                     num_workers=n_workers)
    return custom_data_loader
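A minimal sketch of building an AffectiveCaptionDataset and its loader by hand; the image paths and token ids below are placeholders, and iterating the loader would require the image files to exist on disk:

import pandas as pd
from torch.utils.data import DataLoader
from torchvision import transforms
from artemis.in_out.datasets import AffectiveCaptionDataset

img_files = pd.Series(['/path/to/img_0.jpg', '/path/to/img_1.jpg'])
tokens = pd.Series([[1, 4, 2, 0], [1, 5, 2, 0]])   # already-encoded captions
emotions = pd.Series([3, 7])                       # integer emotion labels (one-hot encoded by the dataset)
img_transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])

dset = AffectiveCaptionDataset(img_files, tokens, emotions, img_transform=img_transform)
loader = DataLoader(dset, batch_size=2)            # each batch carries 'image', 'emotion', 'tokens', 'index'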
imageprocessing/artemis/artemis/in_out/neural_net_oriented.py
ADDED
@@ -0,0 +1,336 @@
1 |
+
"""
|
2 |
+
I/O routines directly related to torch-based neural-models & their (training etc.) dataset processing.
|
3 |
+
|
4 |
+
The MIT License (MIT)
|
5 |
+
Originally created at 10/2/20, for Python 3.x
|
6 |
+
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
|
7 |
+
"""
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import random
|
11 |
+
import warnings
|
12 |
+
import numpy as np
|
13 |
+
import pandas as pd
|
14 |
+
import os.path as osp
|
15 |
+
import multiprocessing as mp
|
16 |
+
import torchvision.transforms as transforms
|
17 |
+
|
18 |
+
from ast import literal_eval
|
19 |
+
from PIL import Image
|
20 |
+
|
21 |
+
from .basics import read_saved_args
|
22 |
+
from .datasets import AffectiveCaptionDataset, ImageClassificationDataset
|
23 |
+
from ..utils.vocabulary import Vocabulary
|
24 |
+
from ..neural_models.show_attend_tell import describe_model as describe_sat
|
25 |
+
|
26 |
+
|
27 |
+
image_net_mean = [0.485, 0.456, 0.406]
|
28 |
+
image_net_std = [0.229, 0.224, 0.225]
|
29 |
+
|
30 |
+
|
31 |
+
def max_io_workers():
|
32 |
+
"""return all/max possible available cpus of machine."""
|
33 |
+
return max(mp.cpu_count() - 1, 1)
|
34 |
+
|
35 |
+
|
36 |
+
def image_transformation(img_dim, lanczos=True):
|
37 |
+
"""simple transformation/pre-processing of image data."""
|
38 |
+
|
39 |
+
if lanczos:
|
40 |
+
resample_method = Image.LANCZOS
|
41 |
+
else:
|
42 |
+
resample_method = Image.BILINEAR
|
43 |
+
|
44 |
+
normalize = transforms.Normalize(mean=image_net_mean, std=image_net_std)
|
45 |
+
img_transforms = dict()
|
46 |
+
img_transforms['train'] = transforms.Compose([transforms.Resize((img_dim, img_dim), resample_method),
|
47 |
+
transforms.ToTensor(),
|
48 |
+
normalize])
|
49 |
+
|
50 |
+
# Use same transformations as in train (since no data-augmentation is applied in train)
|
51 |
+
img_transforms['test'] = img_transforms['train']
|
52 |
+
img_transforms['val'] = img_transforms['train']
|
53 |
+
img_transforms['rest'] = img_transforms['train']
|
54 |
+
return img_transforms
|
55 |
+
|
56 |
+
|
57 |
+
def df_to_pytorch_dataset(df, args):
|
58 |
+
if args.num_workers == -1:
|
59 |
+
n_workers = max_io_workers()
|
60 |
+
else:
|
61 |
+
n_workers = args.num_workers
|
62 |
+
|
63 |
+
load_imgs = True
|
64 |
+
if hasattr(args, 'use_imgs') and not args.use_imgs: # build a dataset without images (e.g., text/emotion only)
|
65 |
+
load_imgs = False
|
66 |
+
|
67 |
+
one_hot_emo = True
|
68 |
+
if hasattr(args, 'one_hot_emo') and not args.one_hot_emo: # turn off the one-hot, keep the integer (e.g., when a using xentropy)
|
69 |
+
one_hot_emo = False
|
70 |
+
|
71 |
+
img_transforms = None
|
72 |
+
if load_imgs:
|
73 |
+
img_transforms = image_transformation(args.img_dim, lanczos=args.lanczos)
|
74 |
+
|
75 |
+
if args.dataset == 'artemis':
|
76 |
+
datasets = pass_artemis_splits_to_datasets(df, load_imgs, img_transforms, args.img_dir, one_hot_emo=one_hot_emo)
|
77 |
+
elif args.dataset == 'ola': # Objective Language for Art.
|
78 |
+
datasets = pass_artemis_splits_to_datasets(df, load_imgs, img_transforms, args.img_dir, n_emotions=0)
|
79 |
+
elif args.dataset == 'coco':
|
80 |
+
datasets = pass_coco_splits_to_datasets(df, load_imgs, img_transforms)
|
81 |
+
else:
|
82 |
+
raise ValueError('training dataset not recognized.')
|
83 |
+
|
84 |
+
dataloaders = dict()
|
85 |
+
for split in datasets:
|
86 |
+
b_size = args.batch_size if split=='train' else args.batch_size * 2
|
87 |
+
dataloaders[split] = torch.utils.data.DataLoader(dataset=datasets[split],
|
88 |
+
batch_size=b_size,
|
89 |
+
shuffle=split=='train',
|
90 |
+
num_workers=n_workers)
|
91 |
+
return dataloaders, datasets
|
92 |
+
|
93 |
+
|
94 |
+
def pass_coco_splits_to_datasets(df, load_imgs, img_transforms, n_emotions=0):
|
95 |
+
datasets = dict()
|
96 |
+
for split, g in df.groupby('split'):
|
97 |
+
g.reset_index(inplace=True, drop=True) # so that direct ([]) indexing in get_item works
|
98 |
+
img_files = None
|
99 |
+
img_trans = None
|
100 |
+
|
101 |
+
if load_imgs:
|
102 |
+
img_files = g['image_files']
|
103 |
+
img_trans = img_transforms[split]
|
104 |
+
|
105 |
+
dataset = AffectiveCaptionDataset(img_files, g.tokens_encoded, g.emotion_label, img_transform=img_trans,
|
106 |
+
n_emotions=n_emotions)
|
107 |
+
datasets[split] = dataset
|
108 |
+
return datasets
|
109 |
+
|
110 |
+
|
111 |
+
def pass_artemis_splits_to_datasets(df, load_imgs, img_transforms, top_img_dir, n_emotions=9, one_hot_emo=True):
|
112 |
+
datasets = dict()
|
113 |
+
for split, g in df.groupby('split'):
|
114 |
+
g.reset_index(inplace=True, drop=True) # so that direct ([]) indexing in get_item works
|
115 |
+
img_files = None
|
116 |
+
img_trans = None
|
117 |
+
|
118 |
+
if load_imgs:
|
119 |
+
img_files = g.apply(lambda x : osp.join(top_img_dir, x.art_style, x.painting + '.jpg'), axis=1)
|
120 |
+
img_files.name = 'image_files'
|
121 |
+
img_trans = img_transforms[split]
|
122 |
+
|
123 |
+
dataset = AffectiveCaptionDataset(img_files, g.tokens_encoded, g.emotion_label, n_emotions=n_emotions,
|
124 |
+
img_transform=img_trans, one_hot_emo=one_hot_emo)
|
125 |
+
|
126 |
+
datasets[split] = dataset
|
127 |
+
return datasets
|
128 |
+
|
129 |
+
|
130 |
+
def read_preprocessed_data_df(args, verbose=False):
|
131 |
+
if args.dataset == 'artemis':
|
132 |
+
file_name = 'artemis_preprocessed.csv'
|
133 |
+
elif args.dataset == 'coco':
|
134 |
+
file_name = 'coco_preprocessed.csv'
|
135 |
+
else:
|
136 |
+
raise ValueError('Unknown Dataset.')
|
137 |
+
|
138 |
+
if hasattr(args, 'fine_tune_data') and args.fine_tune_data:
|
139 |
+
df = pd.read_csv(args.fine_tune_data) # allow explicit data passing
|
140 |
+
else:
|
141 |
+
df = pd.read_csv(osp.join(args.data_dir, file_name))
|
142 |
+
|
143 |
+
df.tokens_encoded = df.tokens_encoded.apply(literal_eval)
|
144 |
+
|
145 |
+
if verbose:
|
146 |
+
print('Loaded {} utterances'.format(len(df)))
|
147 |
+
return df
|
148 |
+
|
149 |
+
|
150 |
+
def image_emotion_distribution_df_to_pytorch_dataset(df, args, drop_thres=None):
|
151 |
+
""" Convert the pandas dataframe that carries information about images and emotion (distributions) to a
|
152 |
+
dataset that is amenable to deep-learning (e.g., for an image2emotion classifier).
|
153 |
+
:param df:
|
154 |
+
:param args:
|
155 |
+
:param drop_thres: (optional, float) if provided, the training split keeps only examples
|
156 |
+
for which the maximizing emotion aggregates more than this (drop_thres) mass.
|
157 |
+
:return: pytorch dataloaders & datasets
|
158 |
+
"""
|
159 |
+
dataloaders = dict()
|
160 |
+
datasets = dict()
|
161 |
+
img_transforms = image_transformation(args.img_dim, lanczos=args.lanczos)
|
162 |
+
|
163 |
+
if args.num_workers == -1:
|
164 |
+
n_workers = max_io_workers()
|
165 |
+
else:
|
166 |
+
n_workers = args.num_workers
|
167 |
+
|
168 |
+
for split, g in df.groupby('split'):
|
169 |
+
g.reset_index(inplace=True, drop=True)
|
170 |
+
|
171 |
+
if split == 'train' and drop_thres is not None:
|
172 |
+
noise_mask = g['emotion_distribution'].apply(lambda x: max(x) > drop_thres)
|
173 |
+
print('Keeping {} of the training data, since for the rest their emotion-maximizer is too low.'.format(noise_mask.mean()))
|
174 |
+
g = g[noise_mask]
|
175 |
+
g.reset_index(inplace=True, drop=True)
|
176 |
+
|
177 |
+
|
178 |
+
img_files = g.apply(lambda x : osp.join(args.img_dir, x.art_style, x.painting + '.jpg'), axis=1)
|
179 |
+
img_files.name = 'image_files'
|
180 |
+
|
181 |
+
dataset = ImageClassificationDataset(img_files, g.emotion_distribution,
|
182 |
+
img_transform=img_transforms[split])
|
183 |
+
|
184 |
+
datasets[split] = dataset
|
185 |
+
b_size = args.batch_size if split=='train' else args.batch_size * 2
|
186 |
+
dataloaders[split] = torch.utils.data.DataLoader(dataset=dataset,
|
187 |
+
batch_size=b_size,
|
188 |
+
shuffle=split=='train',
|
189 |
+
num_workers=n_workers)
|
190 |
+
return dataloaders, datasets
|
191 |
+
|
192 |
+
|
193 |
+
def seed_torch_code(seed, strict=False):
|
194 |
+
"""Control pseudo-randomness for reproducibility.
|
195 |
+
:param seed: (int) random-seed
|
196 |
+
:param strict: (boolean) if True, cudnn operates in a deterministic manner
|
197 |
+
"""
|
198 |
+
random.seed(seed)
|
199 |
+
np.random.seed(seed)
|
200 |
+
torch.manual_seed(seed)
|
201 |
+
torch.cuda.manual_seed_all(seed)
|
202 |
+
if strict:
|
203 |
+
torch.backends.cudnn.deterministic = True
|
204 |
+
torch.backends.cudnn.benchmark = False
|
205 |
+
|
206 |
+
|
207 |
+
def save_state_dicts(checkpoint_file, epoch=None, **kwargs):
|
208 |
+
""" Save torch items with a state_dict
|
209 |
+
"""
|
210 |
+
checkpoint = dict()
|
211 |
+
|
212 |
+
if epoch is not None:
|
213 |
+
checkpoint['epoch'] = epoch
|
214 |
+
|
215 |
+
for key, value in kwargs.items():
|
216 |
+
checkpoint[key] = value.state_dict()
|
217 |
+
|
218 |
+
torch.save(checkpoint, checkpoint_file)
|
219 |
+
|
220 |
+
|
221 |
+
def load_state_dicts(checkpoint_file, map_location=None, **kwargs):
|
222 |
+
""" Load torch items from saved state_dictionaries
|
223 |
+
"""
|
224 |
+
if map_location is None:
|
225 |
+
checkpoint = torch.load(checkpoint_file)
|
226 |
+
else:
|
227 |
+
checkpoint = torch.load(checkpoint_file, map_location=map_location)
|
228 |
+
|
229 |
+
for key, value in kwargs.items():
|
230 |
+
value.load_state_dict(checkpoint[key])
|
231 |
+
|
232 |
+
epoch = checkpoint.get('epoch')
|
233 |
+
if epoch:
|
234 |
+
return epoch
|
235 |
+
|
236 |
+
|
237 |
+
def torch_save_model(model, path):
|
238 |
+
""" Wrap torch.save to catch standard warning of not finding the nested implementations.
|
239 |
+
:param model:
|
240 |
+
:param path:
|
241 |
+
:return:
|
242 |
+
"""
|
243 |
+
with warnings.catch_warnings():
|
244 |
+
warnings.simplefilter("ignore")
|
245 |
+
return torch.save(model, path)
|
246 |
+
|
247 |
+
|
248 |
+
def torch_load_model(checkpoint_file, map_location=None):
|
249 |
+
""" Wrap torch.load to catch standard warning of not finding the nested implementations.
|
250 |
+
:param checkpoint_file:
|
251 |
+
:param map_location:
|
252 |
+
:return:
|
253 |
+
"""
|
254 |
+
with warnings.catch_warnings():
|
255 |
+
warnings.simplefilter("ignore")
|
256 |
+
model = torch.load(checkpoint_file, map_location=map_location)
|
257 |
+
return model
|
258 |
+
|
259 |
+
|
260 |
+
def load_saved_speaker(args_file, model_ckp, with_data=False, override_args=None, verbose=False):
|
261 |
+
"""
|
262 |
+
:param args_file: saved argparse arguments with model's description (and location of used data)
|
263 |
+
:param model_ckp: saved checkpoint with model's parameters.
|
264 |
+
:param with_data:
|
265 |
+
:param override_args:
|
266 |
+
:return:
|
267 |
+
Note: the model is loaded and returned on the CPU.
|
268 |
+
"""
|
269 |
+
if verbose:
|
270 |
+
print('Loading saved speaker trained with parameters:')
|
271 |
+
args = read_saved_args(args_file, override_args=override_args, verbose=verbose)
|
272 |
+
|
273 |
+
# Prepare empty model
|
274 |
+
vocab = Vocabulary.load(osp.join(args.data_dir, 'vocabulary.pkl'))
|
275 |
+
print('Using a vocabulary of size', len(vocab))
|
276 |
+
model = describe_sat(vocab, args)
|
277 |
+
|
278 |
+
# Load saved weights
|
279 |
+
epoch = load_state_dicts(model_ckp, model=model, map_location='cpu')
|
280 |
+
print('Loading speaker model at epoch {}.'.format(epoch))
|
281 |
+
|
282 |
+
# Load data
|
283 |
+
if with_data:
|
284 |
+
df = read_preprocessed_data_df(args, verbose=True)
|
285 |
+
data_loaders, _ = df_to_pytorch_dataset(df, args)
|
286 |
+
else:
|
287 |
+
data_loaders = None
|
288 |
+
|
289 |
+
return model, epoch, data_loaders
|
290 |
+
|
291 |
+
|
292 |
+
def deprocess_img(img, std=None, mean=None, clamp=None, inplace=False):
|
293 |
+
if not inplace:
|
294 |
+
img = img.clone()
|
295 |
+
|
296 |
+
if img.ndimension() == 4: # batch of images
|
297 |
+
pass
|
298 |
+
# single_img = False
|
299 |
+
elif img.ndimension() == 3: # single image
|
300 |
+
img = img.view([1] + list(img.shape))
|
301 |
+
# single_img = True
|
302 |
+
else:
|
303 |
+
raise ValueError()
|
304 |
+
|
305 |
+
dtype = img.dtype
|
306 |
+
n_channels = img.size(1)
|
307 |
+
|
308 |
+
if std is not None:
|
309 |
+
std = torch.as_tensor(std, dtype=dtype, device=img.device)
|
310 |
+
img.mul_(std.view([1, n_channels, 1, 1]))
|
311 |
+
|
312 |
+
if mean is not None:
|
313 |
+
mean = torch.as_tensor(mean, dtype=dtype, device=img.device)
|
314 |
+
img.add_(mean.view([1, n_channels, 1, 1]))
|
315 |
+
|
316 |
+
if clamp is not None:
|
317 |
+
img.clamp_(clamp[0], clamp[1])
|
318 |
+
|
319 |
+
return img
|
320 |
+
|
321 |
+
|
322 |
+
def to_img(tensor, mean=None, std=None):
|
323 |
+
""" Convert tensor object to PIL.Image(s)
|
324 |
+
:param tensor:
|
325 |
+
:param mean:
|
326 |
+
:param std:
|
327 |
+
:return:
|
328 |
+
"""
|
329 |
+
image = tensor.clone().detach()
|
330 |
+
image = deprocess_img(image, mean=mean, std=std)
|
331 |
+
# Add 0.5 after un-normalizing to [0, 255] to round to nearest integer
|
332 |
+
array = image.mul_(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to('cpu', torch.uint8).numpy()
|
333 |
+
image = []
|
334 |
+
for im in array:
|
335 |
+
image.append(Image.fromarray(im))
|
336 |
+
return image
|
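For orientation, a minimal usage sketch (not part of this commit) of the checkpoint helpers defined above; the checkpoint file name, the toy model and the optimizer are hypothetical, while the call pattern follows save_state_dicts / load_state_dicts as written in this file.

from torch import nn, optim
from artemis.in_out.neural_net_oriented import save_state_dicts, load_state_dicts

model = nn.Linear(10, 2)                     # toy stand-in for a real speaker / classifier
optimizer = optim.Adam(model.parameters())

# every keyword argument is stored under its name via .state_dict()
save_state_dicts('checkpoint.pt', epoch=3, model=model, optimizer=optimizer)

# later: the same keywords are restored in place and the stored epoch is returned
epoch = load_state_dicts('checkpoint.pt', map_location='cpu', model=model, optimizer=optimizer)
assert epoch == 3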
imageprocessing/artemis/artemis/language/__init__.py
ADDED
@@ -0,0 +1,4 @@
"""
The MIT License (MIT)
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""
imageprocessing/artemis/artemis/language/adjective_noun_pairs.py
ADDED
@@ -0,0 +1,44 @@
"""
Some operations to handle Adjective-Noun Pairs. E.g., useful for sentiment injection

The MIT License (MIT)
Originally created mid 2020, for Python 3.x
Copyright (c) 2020 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

from collections import Counter
from .part_of_speech import nltk_parallel_tagging_of_tokens

def collect_anps_of_sentence(tokenized_pos_tagged_sentence, tagset='universal'):
    """ Return all ANPs that occur in consecutive positions.
    tokenized_pos_tagged_sentence: list, containing the result of calling nltk.pos_tag on a tokenized sentence.
    E.g., [('a', 'DT'), ('big', 'JJ'), ('man', 'NN')]
    """
    n_tokens = len(tokenized_pos_tagged_sentence)
    collected = []

    if tagset == 'universal':
        for i, p in enumerate(tokenized_pos_tagged_sentence):
            if p[1] == 'ADJ' and i < n_tokens - 1:
                if tokenized_pos_tagged_sentence[i + 1][1] == 'NOUN':
                    collected.append(p[0] + ' ' + tokenized_pos_tagged_sentence[i + 1][0])
    elif tagset == 'penn':
        for i, p in enumerate(tokenized_pos_tagged_sentence):
            if p[1].startswith('J') and i < n_tokens - 1:
                if tokenized_pos_tagged_sentence[i + 1][1].startswith('N'):
                    collected.append(p[0] + ' ' + tokenized_pos_tagged_sentence[i + 1][0])
    else:
        raise ValueError()
    return collected


def collect_anp_statistics_of_collection(token_series):
    """ E.g., how frequent is the ANP "happy man" in the token_series.
    :param token_series: pd.Series, each row is a tokenized sentence
    :return:
    """
    part_of_s = nltk_parallel_tagging_of_tokens(token_series)
    anps = part_of_s.apply(collect_anps_of_sentence)
    anp_counter = Counter()
    anps.apply(anp_counter.update)
    return anp_counter, anps, part_of_s
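A small usage sketch for collect_anps_of_sentence (not part of the commit; it assumes the nltk tokenizer and tagger data are installed, and the sentence is made up):

from nltk import pos_tag, word_tokenize
from artemis.language.adjective_noun_pairs import collect_anps_of_sentence

tagged = pos_tag(word_tokenize('the happy man walked the sad dog'), tagset='universal')
print(collect_anps_of_sentence(tagged))   # expected: ['happy man', 'sad dog']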
imageprocessing/artemis/artemis/language/basics.py
ADDED
@@ -0,0 +1,132 @@
1 |
+
"""
|
2 |
+
A set of functions that are useful for processing textual data.
|
3 |
+
|
4 |
+
The MIT License (MIT)
|
5 |
+
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
|
6 |
+
"""
|
7 |
+
|
8 |
+
import pandas as pd
|
9 |
+
import multiprocessing as mp
|
10 |
+
from multiprocessing import Pool
|
11 |
+
from collections import defaultdict
|
12 |
+
from itertools import tee, islice
|
13 |
+
from symspellpy.symspellpy import SymSpell
|
14 |
+
|
15 |
+
from .language_preprocessing import unquote_words, expand_contractions
|
16 |
+
from .language_preprocessing import manual_sentence_spelling, manual_tokenized_sentence_spelling
|
17 |
+
from ..language.spelling import sentence_spelling_dictionary as artemis_sentence_spelling_dictionary
|
18 |
+
from ..language.spelling import token_spelling_dictionary as artemis_token_spelling_dictionary
|
19 |
+
from ..language.spelling import missing_from_glove_but_are_actual_words
|
20 |
+
from ..neural_models.word_embeddings import load_glove_pretrained_embedding
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
def ngrams(lst, n):
|
25 |
+
""" Return the ngrams of a list of tokens.
|
26 |
+
:param lst: the tokens
|
27 |
+
:param n: n of n-grams
|
28 |
+
:return:
|
29 |
+
"""
|
30 |
+
tlst = lst
|
31 |
+
while True:
|
32 |
+
a, b = tee(tlst)
|
33 |
+
l = tuple(islice(a, n))
|
34 |
+
if len(l) == n:
|
35 |
+
yield l
|
36 |
+
next(b)
|
37 |
+
tlst = b
|
38 |
+
else:
|
39 |
+
break
|
40 |
+
|
41 |
+
|
42 |
+
def parallel_apply(iterable, func, n_processes=None):
|
43 |
+
""" Apply func in parallel to chunks of the iterable based on multiple processes.
|
44 |
+
:param iterable:
|
45 |
+
:param func: simple function that does not change the state of global variables.
|
46 |
+
:param n_processes: (int) how many processes to split the data over
|
47 |
+
:return:
|
48 |
+
"""
|
49 |
+
n_items = len(iterable)
|
50 |
+
if n_processes is None:
|
51 |
+
n_processes = min(4 * mp.cpu_count(), n_items)
|
52 |
+
pool = Pool(n_processes)
|
53 |
+
chunks = int(n_items / n_processes)
|
54 |
+
res = []
|
55 |
+
for data in pool.imap(func, iterable, chunksize=chunks):
|
56 |
+
res.append(data)
|
57 |
+
pool.close()
|
58 |
+
pool.join()
|
59 |
+
return res
|
60 |
+
|
61 |
+
|
62 |
+
def tokenize_and_spell(df, glove_file, freq_file, tokenizer, parallel=True, inplace=True, spell_check=True):
|
63 |
+
speller = SymSpell()
|
64 |
+
loaded = speller.load_dictionary(freq_file, term_index=0, count_index=1)
|
65 |
+
print('SymSpell spell-checker loaded:', loaded)
|
66 |
+
golden_vocabulary = load_glove_pretrained_embedding(glove_file, only_words=True, verbose=True)
|
67 |
+
golden_vocabulary = golden_vocabulary.union(missing_from_glove_but_are_actual_words)
|
68 |
+
print('Updating Glove vocabulary with *valid* ArtEmis words that are missing from it.')
|
69 |
+
missed_tokens = defaultdict(list)
|
70 |
+
|
71 |
+
def automatic_token_speller(token_list, max_edit_distance=1):
|
72 |
+
new_tokens = []
|
73 |
+
for token in token_list:
|
74 |
+
if token in golden_vocabulary:
|
75 |
+
new_tokens.append(token) # no spell check
|
76 |
+
else:
|
77 |
+
spells = speller.lookup(token, max_edit_distance)
|
78 |
+
if len(spells) > 0:  # found a spell-checked version
|
79 |
+
new_tokens.append(spells[0].term)
|
80 |
+
else: # spell checking failed
|
81 |
+
context = " ".join(token_list)
|
82 |
+
missed_tokens[token].append(context)
|
83 |
+
new_tokens.append(token)
|
84 |
+
return new_tokens
|
85 |
+
|
86 |
+
if not spell_check:
|
87 |
+
automatic_token_speller = None
|
88 |
+
|
89 |
+
clean_text, tokens, spelled_tokens = pre_process_text(df.utterance,
|
90 |
+
artemis_sentence_spelling_dictionary,
|
91 |
+
artemis_token_spelling_dictionary,
|
92 |
+
tokenizer,
|
93 |
+
token_speller=automatic_token_speller,
|
94 |
+
parallel=parallel)
|
95 |
+
|
96 |
+
if inplace:
|
97 |
+
df['tokens'] = spelled_tokens
|
98 |
+
df['tokens_len'] = df.tokens.apply(lambda x : len(x))
|
99 |
+
df['utterance_spelled'] = df.tokens.apply(lambda x : ' '.join(x))
|
100 |
+
return missed_tokens
|
101 |
+
else:
|
102 |
+
return missed_tokens, spelled_tokens
|
103 |
+
|
104 |
+
|
105 |
+
def pre_process_text(text, manual_sentence_speller, manual_token_speller,
|
106 |
+
tokenizer, token_speller=None, parallel=True):
|
107 |
+
|
108 |
+
clean_text = text.apply(lambda x: manual_sentence_spelling(x, manual_sentence_speller)) # sentence-to-sentence map
|
109 |
+
clean_text = clean_text.apply(lambda x: x.lower())
|
110 |
+
clean_text = clean_text.apply(unquote_words)
|
111 |
+
|
112 |
+
if parallel:
|
113 |
+
clean_text = pd.Series(parallel_apply(clean_text, expand_contractions))
|
114 |
+
else:
|
115 |
+
clean_text = clean_text.apply(expand_contractions)
|
116 |
+
|
117 |
+
basic_punct = '.?!,:;/\-~*_=[–]{}$^@|%#<—>'
|
118 |
+
punct_to_space = str.maketrans(basic_punct, ' ' * len(basic_punct)) # map punctuation to space
|
119 |
+
clean_text = clean_text.apply(lambda x: x.translate(punct_to_space))
|
120 |
+
|
121 |
+
if parallel:
|
122 |
+
tokens = pd.Series(parallel_apply(clean_text, tokenizer))
|
123 |
+
else:
|
124 |
+
tokens = clean_text.apply(tokenizer)
|
125 |
+
|
126 |
+
spelled_tokens = tokens.apply(lambda x: manual_tokenized_sentence_spelling(x,
|
127 |
+
spelling_dictionary=manual_token_speller)
|
128 |
+
)
|
129 |
+
if token_speller is not None:
|
130 |
+
spelled_tokens = spelled_tokens.apply(token_speller)
|
131 |
+
|
132 |
+
return clean_text, tokens, spelled_tokens
|
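As an illustration of the ngrams generator defined above (a usage sketch, not part of the commit):

from artemis.language.basics import ngrams

tokens = ['a', 'very', 'happy', 'man']
print(list(ngrams(tokens, 2)))
# [('a', 'very'), ('very', 'happy'), ('happy', 'man')]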
imageprocessing/artemis/artemis/language/language_preprocessing.py
ADDED
@@ -0,0 +1,224 @@
1 |
+
"""
|
2 |
+
A set of functions that are useful for pre-processing textual data: uniformizing the words, spelling, etc.
|
3 |
+
|
4 |
+
The MIT License (MIT)
|
5 |
+
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
|
6 |
+
"""
|
7 |
+
|
8 |
+
import re
|
9 |
+
|
10 |
+
contractions_dict = {
|
11 |
+
"ain't": "am not",
|
12 |
+
"aren't": "are not",
|
13 |
+
"can't": "cannot",
|
14 |
+
"can't've": "cannot have",
|
15 |
+
"'cause": "because",
|
16 |
+
"could've": "could have",
|
17 |
+
"couldn't": "could not",
|
18 |
+
"couldn't've": "could not have",
|
19 |
+
"didn't": "did not",
|
20 |
+
"doesn't": "does not",
|
21 |
+
"don't": "do not",
|
22 |
+
"hadn't": "had not",
|
23 |
+
"hadn't've": "had not have",
|
24 |
+
"hasn't": "has not",
|
25 |
+
"haven't": "have not",
|
26 |
+
"he'd": "he had",
|
27 |
+
"he'd've": "he would have",
|
28 |
+
"he'll": "he will",
|
29 |
+
"he'll've": "he will have",
|
30 |
+
"he's": "he is",
|
31 |
+
"how'd": "how did",
|
32 |
+
"how'd'y": "how do you",
|
33 |
+
"how'll": "how will",
|
34 |
+
"how's": "how is",
|
35 |
+
"i'd": "I had",
|
36 |
+
"i'd've": "I would have",
|
37 |
+
"i'll": "I will",
|
38 |
+
"i'll've": "I will have",
|
39 |
+
"i'm": "I am",
|
40 |
+
"i've": "I have",
|
41 |
+
"isn't": "is not",
|
42 |
+
"it'd": "it had",
|
43 |
+
"it'd've": "it would have",
|
44 |
+
"it'll": "it will",
|
45 |
+
"it'll've": "iit will have",
|
46 |
+
"it's": "it is",
|
47 |
+
"let's": "let us",
|
48 |
+
"ma'am": "madam",
|
49 |
+
"mayn't": "may not",
|
50 |
+
"might've": "might have",
|
51 |
+
"mightn't": "might not",
|
52 |
+
"mightn't've": "might not have",
|
53 |
+
"must've": "must have",
|
54 |
+
"mustn't": "must not",
|
55 |
+
"mustn't've": "must not have",
|
56 |
+
"needn't": "need not",
|
57 |
+
"needn't've": "need not have",
|
58 |
+
"o'clock": "of the clock",
|
59 |
+
"oughtn't": "ought not",
|
60 |
+
"oughtn't've": "ought not have",
|
61 |
+
"shan't": "shall not",
|
62 |
+
"sha'n't": "shall not",
|
63 |
+
"shan't've": "shall not have",
|
64 |
+
"she'd": "she had",
|
65 |
+
"she'd've": "she would have",
|
66 |
+
"she'll": "she will",
|
67 |
+
"she'll've": "she will have",
|
68 |
+
"she's": "she is",
|
69 |
+
"should've": "should have",
|
70 |
+
"shouldn't": "should not",
|
71 |
+
"shouldn't've": "should not have",
|
72 |
+
"so've": "so have",
|
73 |
+
"so's": "so is",
|
74 |
+
"that'd": "that had",
|
75 |
+
"that'd've": "that would have",
|
76 |
+
"that's": "that is",
|
77 |
+
"there'd": "there had",
|
78 |
+
"there'd've": "there would have",
|
79 |
+
"there's": "there is",
|
80 |
+
"they'd": "they had",
|
81 |
+
"they'd've": "they would have",
|
82 |
+
"they'll": "they will",
|
83 |
+
"they'll've": "they will have",
|
84 |
+
"they're": "they are",
|
85 |
+
"they've": "they have",
|
86 |
+
"to've": "to have",
|
87 |
+
"wasn't": "was not",
|
88 |
+
"we'd": "we had",
|
89 |
+
"we'd've": "we would have",
|
90 |
+
"we'll": "we will",
|
91 |
+
"we'll've": "we will have",
|
92 |
+
"we're": "we are",
|
93 |
+
"we've": "we have",
|
94 |
+
"weren't": "were not",
|
95 |
+
"what'll": "what will",
|
96 |
+
"what'll've": "what will have",
|
97 |
+
"what're": "what are",
|
98 |
+
"what's": "what is",
|
99 |
+
"what've": "what have",
|
100 |
+
"when's": "when is",
|
101 |
+
"when've": "when have",
|
102 |
+
"where'd": "where did",
|
103 |
+
"where's": "where is",
|
104 |
+
"where've": "where have",
|
105 |
+
"who'll": "who will",
|
106 |
+
"who'll've": "who will have",
|
107 |
+
"who's": "who is",
|
108 |
+
"who've": "who have",
|
109 |
+
"why's": "why is",
|
110 |
+
"why've": "why have",
|
111 |
+
"will've": "will have",
|
112 |
+
"won't": "will not",
|
113 |
+
"won't've": "will not have",
|
114 |
+
"would've": "would have",
|
115 |
+
"wouldn't": "would not",
|
116 |
+
"wouldn't've": "would not have",
|
117 |
+
"y'all": "you all",
|
118 |
+
"y'all'd": "you all would",
|
119 |
+
"y'all'd've": "you all would have",
|
120 |
+
"y'all're": "you all are",
|
121 |
+
"y'all've": "you all have",
|
122 |
+
"you'd": "you had",
|
123 |
+
"you'd've": "you would have",
|
124 |
+
"you'll": "you will",
|
125 |
+
"you'll've": "you will have",
|
126 |
+
"you're": "you are",
|
127 |
+
"you've": "you have",
|
128 |
+
"do'nt": "do not",
|
129 |
+
"does\'nt": "does not"
|
130 |
+
}
|
131 |
+
|
132 |
+
CONTRACTION_RE = re.compile('({})'.format('|'.join(contractions_dict.keys())),
|
133 |
+
flags=re.IGNORECASE | re.DOTALL)
|
134 |
+
|
135 |
+
|
136 |
+
def expand_contractions(text, contractions=None, lower_i=True):
|
137 |
+
""" Expand the contractions of the text (if any).
|
138 |
+
Example: You're a good father. -> you are a good father.
|
139 |
+
:param text: (string)
|
140 |
+
:param contractions: (dict)
|
141 |
+
:param lower_i: boolean, if True (I'm -> 'i am' not 'I am')
|
142 |
+
:return: (string)
|
143 |
+
|
144 |
+
Note:
|
145 |
+
Side-effect: lower-casing. E.g., You're -> you are.
|
146 |
+
"""
|
147 |
+
if contractions is None:
|
148 |
+
contractions = contractions_dict  # Use the one defined in this .py file
|
149 |
+
|
150 |
+
def expand_match(contraction):
|
151 |
+
match = contraction.group(0)
|
152 |
+
expanded_contraction = contractions.get(match)
|
153 |
+
if expanded_contraction is None:
|
154 |
+
expanded_contraction = contractions.get(match.lower())
|
155 |
+
if lower_i:
|
156 |
+
expanded_contraction = expanded_contraction.lower()
|
157 |
+
return expanded_contraction
|
158 |
+
|
159 |
+
expanded_text = CONTRACTION_RE.sub(expand_match, text)
|
160 |
+
return expanded_text
|
161 |
+
|
162 |
+
|
163 |
+
QUOTES_RE_STR = r"""(?:['|"][\w]+['|"])""" # Words encapsulated in apostrophes.
|
164 |
+
QUOTES_RE = re.compile(r"(%s)" % QUOTES_RE_STR, flags=re.VERBOSE | re.IGNORECASE | re.UNICODE)
|
165 |
+
|
166 |
+
|
167 |
+
def unquote_words(s):
|
168 |
+
""" 'king' - > king, "queen" -> queen """
|
169 |
+
iterator = QUOTES_RE.finditer(s)
|
170 |
+
new_sentence = list(s)
|
171 |
+
for match in iterator:
|
172 |
+
start, end = match.span()
|
173 |
+
new_sentence[start] = ' '
|
174 |
+
new_sentence[end-1] = ' '
|
175 |
+
new_sentence = "".join(new_sentence)
|
176 |
+
return new_sentence
|
177 |
+
|
178 |
+
|
179 |
+
def manual_sentence_spelling(x, spelling_dictionary):
|
180 |
+
"""
|
181 |
+
Applies spelling on an entire string, if x is a key of the spelling_dictionary.
|
182 |
+
:param x: (string) sentence to potentially be corrected
|
183 |
+
:param spelling_dictionary: correction map
|
184 |
+
:return: the sentence corrected
|
185 |
+
"""
|
186 |
+
if x in spelling_dictionary:
|
187 |
+
return spelling_dictionary[x]
|
188 |
+
else:
|
189 |
+
return x
|
190 |
+
|
191 |
+
|
192 |
+
def manual_tokenized_sentence_spelling(tokens, spelling_dictionary):
|
193 |
+
"""
|
194 |
+
:param tokens: (list of tokens) to potentially be corrected
|
195 |
+
:param spelling_dictionary: correction map
|
196 |
+
:return: a list of corrected tokens
|
197 |
+
"""
|
198 |
+
new_tokens = []
|
199 |
+
for token in tokens:
|
200 |
+
if token in spelling_dictionary:
|
201 |
+
res = spelling_dictionary[token]
|
202 |
+
if type(res) == list:
|
203 |
+
new_tokens.extend(res)
|
204 |
+
else:
|
205 |
+
new_tokens.append(res)
|
206 |
+
else:
|
207 |
+
new_tokens.append(token)
|
208 |
+
return new_tokens
|
209 |
+
|
210 |
+
|
211 |
+
# noinspection PyInterpreter
|
212 |
+
if __name__ == "__main__":
|
213 |
+
import pandas as pd
|
214 |
+
text = pd.DataFrame({'data': ["I'm a 'good' MAN", "You can't be likee this."]})
|
215 |
+
print("Original Text:")
|
216 |
+
print(text.data)
|
217 |
+
|
218 |
+
manual_speller = {'You can\'t be likee this.': 'You can\'t be like this.'}
|
219 |
+
text.data = text.data.apply(lambda x: manual_sentence_spelling(x, manual_speller))
|
220 |
+
text.data = text.data.apply(lambda x: x.lower())
|
221 |
+
text.data = text.data.apply(unquote_words)
|
222 |
+
text.data = text.data.apply(expand_contractions)
|
223 |
+
print("Corrected Text:")
|
224 |
+
print(text.data)
|
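The __main__ demo above covers the sentence-level helpers; a similar sketch (not part of the commit) for manual_tokenized_sentence_spelling, which may also split one token into several. The toy spelling dictionary below is illustrative only:

from artemis.language.language_preprocessing import manual_tokenized_sentence_spelling

toy_speller = {'colour': 'color', 'thecountry': ['the', 'country']}
tokens = ['i', 'love', 'the', 'colour', 'of', 'thecountry']
print(manual_tokenized_sentence_spelling(tokens, spelling_dictionary=toy_speller))
# ['i', 'love', 'the', 'color', 'of', 'the', 'country']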
imageprocessing/artemis/artemis/language/part_of_speech.py
ADDED
@@ -0,0 +1,40 @@
"""
Part-of-speech tagging at speed, for two libraries (nltk and spacy).

The MIT License (MIT)
Originally created in 2020, for Python 3.x - last updated in early 2021.
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""
import dask.dataframe as dd
import multiprocessing as mp
from nltk.tag import pos_tag

try:
    import spacy
except ImportError:
    pass


def nltk_parallel_tagging_of_tokens(tokens, n_partitions=None, tagset='universal'):
    """ Pos-tagging.
    :param tokens: pd.Series with tokenized utterances as rows, e.g., [['a', 'man'], ['a', 'big', 'man'], ...]
    :return: a pd.Series with the result of applying pos_tag in each row, e.g.,
        [[('a', 'DT'), ('man', 'NN')], [('a', 'DT'), ('big', 'JJ'), ('man', 'NN')]]
    """
    if n_partitions is None:
        n_partitions = mp.cpu_count() * 4
    ddata = dd.from_pandas(tokens, npartitions=n_partitions)
    tagged_tokens = \
        ddata.map_partitions(lambda x: x.apply(lambda y: pos_tag(y, tagset=tagset))).compute(scheduler='processes')

    return tagged_tokens


def spacy_pos_tagging(utterances, nlp=None):
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    utters = utterances.astype('unicode').values
    docs = nlp.pipe(utters, batch_size=1000, n_threads=-1)
    pos = [[t.pos_ for t in d if not t.is_space] for d in docs]
    return pos
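A usage sketch for the dask-backed tagger above (not part of the commit; it assumes dask and the nltk universal-tagset data are installed, and the input Series is a toy example):

import pandas as pd
from artemis.language.part_of_speech import nltk_parallel_tagging_of_tokens

if __name__ == '__main__':   # the 'processes' scheduler spawns worker processes
    tokens = pd.Series([['a', 'calm', 'lake'], ['a', 'very', 'dark', 'sky']])
    tagged = nltk_parallel_tagging_of_tokens(tokens, n_partitions=2)
    print(tagged.iloc[0])    # e.g., [('a', 'DET'), ('calm', 'ADJ'), ('lake', 'NOUN')]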
imageprocessing/artemis/artemis/language/spelling.py
ADDED
@@ -0,0 +1,634 @@
1 |
+
"""
|
2 |
+
Auxiliary spelling utilities.
|
3 |
+
|
4 |
+
It's called [may-rah-kee]: https://travelwithmeraki.com/meaning-of-meraki/
|
5 |
+
|
6 |
+
The MIT License (MIT)
|
7 |
+
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab.
|
8 |
+
"""
|
9 |
+
|
10 |
+
|
11 |
+
##
|
12 |
+
## Dictionary, mapping entire "raw" collected sentences to new sentences. These were emailed separately from the AMT submissions
|
13 |
+
## from pedantic and good! annotators, who wanted to correct their original submission.
|
14 |
+
##
|
15 |
+
sentence_spelling_dictionary = {
|
16 |
+
'thewaytheshapeschangethespace': 'the way the shapes change the space',
|
17 |
+
'brightcolorsandanimals': 'bright colors and animals',
|
18 |
+
'calmweatherandpeopleworkingtogether': 'calm weather and people working together',
|
19 |
+
'thesoftcolors': 'the soft colors',
|
20 |
+
'itlookslikeapeacefullocation': 'it looks like a peaceful location',
|
21 |
+
'Iliketoplayinthesnow': 'I like to play in the snow',
|
22 |
+
'thecalmwaters': 'the calm waters',
|
23 |
+
'itseemsincompletesomehow': 'it seems incomplete somehow',
|
24 |
+
'thefiguresseemsomehowawkward': 'the figures seem somehow awkward',
|
25 |
+
'thegreenandlackofpeople': 'the green and lack of people',
|
26 |
+
'boatingisanadventure': 'boating is an adventure',
|
27 |
+
'thereseemstobetoomuchsky': 'there seems to be too much sky',
|
28 |
+
'onthewholeitlookslikeawallpaperswatch': 'on the whole it looks like a wallpapers watch',
|
29 |
+
'thecolorofthewater': 'the color of the water',
|
30 |
+
'alloftheplantsandthewatershowingthroughthem': 'all of the plants and the water showing through them',
|
31 |
+
'thecolorcombination': 'the color combination',
|
32 |
+
'theslashesacrosstheworkandthedarkcolors': 'the slashes across the work and the dark colors',
|
33 |
+
'thesimplicityandopenness': 'the simplicity and openness',
|
34 |
+
'theproportionbetweenheadandbody': 'the proportion between head and body',
|
35 |
+
'thebrightcolorsandthefeelingofmotion': 'the bright colors and the feeling of motion',
|
36 |
+
'thedetailinthehands': 'the detail in the hands',
|
37 |
+
'thelengthofthewoman\'sneck': 'the length of the woman\'s neck',
|
38 |
+
'thedifferentflocksofbirdsinthesky': 'the different flocks of birds in the sky',
|
39 |
+
'theskillshownindrawingthefigures': 'the skill shown in drawing the figures',
|
40 |
+
'thenaturallookingskintone': 'the natural looking skin tone',
|
41 |
+
'theyeadsthatdon\'tseemtobeattatchedtoanything': 'the yeads that don\'t seem to be attatched to anything',
|
42 |
+
'themasksheisholding': 'the mask she is holding',
|
43 |
+
'theapparantageofthepiece': 'the apparent age of the piece',
|
44 |
+
'thepinkbows': 'the pink bows',
|
45 |
+
'theintensityonthefaceofthemaninfrontofthewoman.': 'the intensity on the face of the man in front of the woman',
|
46 |
+
'theshapesandcolors-itlookshard,painful': 'the shapes and colors - it looks hard, painful',
|
47 |
+
'thewaytheguitarisbrokenupandmagnifiedbutstillidentifiable.': 'the way the guitar is broken up and magnified but still identifiable',
|
48 |
+
'veryimpressedwiththewaytheartistcreatedlight': 'very impressed with the way the artist created light',
|
49 |
+
'itlookslikepeoplearewaitingforsomeeventtohappenlikeaboatraceorsomething': 'it looks like people are waiting for some event to happen like a boat race or something',
|
50 |
+
'wonderingifthisisapaintingoratextile': 'wondering if this is a painting or a textile',
|
51 |
+
'itseemstoodarkfortheactivity': 'it seems too dark for the activity',
|
52 |
+
'Ithinkitskindofweirdhowherhipsmakealmostacircle': 'I think its kind of weird how her hips make almost a circle',
|
53 |
+
'thesoftnessofthefiguremakesitfeellikeiamintrudingonanintimatemoment': 'the softness of the figure makes it feel like i am intruding on an intimate moment',
|
54 |
+
'Idon\'tseepieceslikethisas\'art\'itcouldbethewallinsomeone\'shouse': 'I don\'t see pieces like this as \'art\' it could be the wall in someone\'s house',
|
55 |
+
'thereflectionofthetreesinthepool': 'the reflection of the trees in the pool',
|
56 |
+
'thepainonhisface': 'the pain on his face',
|
57 |
+
'itremindsmeofastringofbeadsasmallchildwouldmake': 'it reminds me of a string of beads a small child would make',
|
58 |
+
'thefigurebeneaththetreeappearsveryrelaxed': 'the figure beneath the tree appears very relaxed',
|
59 |
+
'thefacialexpressionisveryloving': 'the facial expression is very loving',
|
60 |
+
'thecolorsandactivitymakeitlooklikeafunplacetobe': 'the colors and activity make it look like a fun place to be',
|
61 |
+
'itlookslikeaniceareatogoforawalk': 'it looks like an ice area to go for a walk',
|
62 |
+
'theyappeartobeayoungcoupleinlove': 'they appear to be a young couple in love',
|
63 |
+
'allofthelittledetailsareamazing': 'all of the little details are amazing',
|
64 |
+
'knowledgeofthehistoryassociatedwiththeperson': 'knowledge of the history associated with the person',
|
65 |
+
'itlookslikeaveryoldpaintedtextile': 'it looks like a very old painted textile',
|
66 |
+
'allofthebrightcolorsjustmakemehappy': 'all of the bright colors just make me happy',
|
67 |
+
'dificultysortingoutwhetherthefigureismaleorfemale': 'dificulty sorting out whether the figure is male or female',
|
68 |
+
'Itseemskindoflikeaposteryou\'dputinaclassroom.': 'It seems kind of like a poster you\'d put in a classroom',
|
69 |
+
'Theexpressionontheman\'sfaceappearsangry.': 'The expression on the man\'s face appears angry.',
|
70 |
+
'it\'d dark and creepy and weirdly sexual in a bad way.': 'it\'s dark and creepy and weirdly sexual in a bad way.',
|
71 |
+
'looks disgusting looks like a cross desser disgusting': 'looks disgusting looks like a cross dresser disgusting',
|
72 |
+
'Big skirts and bloomers on dancing ladies definitelymake the mood excitement.': 'big skirts and bloomers on dancing ladies definitely make an exciting mood',
|
73 |
+
'The rays eminntating from sum over the town os exhilarating': 'the rays emanating from sun over the town are exhilarating',
|
74 |
+
'the way the artist uses black and white really gives this a different feel tp the painting': 'the way the artist uses black and white really gives this a different feel to the painting',
|
75 |
+
'This reminds me of jumbled graffiti I saw and had tp clean up every once in a while when I was younger': 'This reminds me of jumbled graffiti I saw and had to clean up every once in a while when I was younger',
|
76 |
+
'looks like the cabins i stayed in on my trip tp a dude ranch': 'looks like the cabins i stayed in on my trip to a dude ranch',
|
77 |
+
'A young woman applies make-up tp her face as she sits in a pretty robe. A pleasant but unfished work of art.': 'A young woman applies make-up to her face as she sits in a pretty robe. A pleasant but unfinished work of art.',
|
78 |
+
'old and not kept up well, bug nice scene of a man at work': 'old and not kept up well, but nice scene of a man at work',
|
79 |
+
'Again this painting is dill its lifeless and no color.': 'Again this painting is dull, it is lifeless and has no color.',
|
80 |
+
'she smile smartly': 'she smiles smartly',
|
81 |
+
'The mountain makes me think of a strong safehold, and a feeling of shelter.': 'The mountain makes me think of a stronghold giving me a feeling of shelter',
|
82 |
+
'The detail in the surundsing like the clock tower and statue make it more inspiering': 'The detail in the surroundings like the clock tower and statue make it more inspiring',
|
83 |
+
'The man is chained down and left to be attacked by a bird of prey while another man non chalantly watches what is taking place.': 'The man is chained down and left to be attacked by a bird of prey while another man nonchalantly watches what is taking place.',
|
84 |
+
'This is a beautiful scene with the mosques in the background and the vegetation in the front.': 'This is a beautiful scene with the onion dome churches in the background and the vegetation in the front.',
|
85 |
+
'The colors make me feel like I\'m looking at someone important. I feel a since of awe over them because of their attire.': 'The colors make me feel like I\'m looking at someone important. I feel a sense of awe over them because of their attire.',
|
86 |
+
'the detective ihas found the diar everyone knew she kept and now hopefully he wi findout what led up to her breakdown ': 'the detective has found the diary everyone knew she kept and now hopefully he will find out what led up to her breakdown',
|
87 |
+
'The VanGoghishness of this makes me smile and wonder how they do it.': 'The Van Gogh like quality of this makes me smile and wonder how they do it.',
|
88 |
+
'The colors and shapes compliment each otherakes look like a adult childs painting.': 'The colors and shapes compliment each other and looks like an adult child\'s painting.',
|
89 |
+
'I prefer more realistic still ifes.': 'I prefer more realistic still lifes.'
|
90 |
+
}
|
91 |
+
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
##
|
96 |
+
## Dictionary, mapping words to replacement words to densify the dataset (e.g., colour -> color), or more likely to spell
|
97 |
+
## check them. ## Curated manually by Panos circa 2020.
|
98 |
+
##
|
99 |
+
token_spelling_dictionary = {'colour': 'color',
|
100 |
+
'colours': 'colors',
|
101 |
+
'thecountry': ['the', 'country'],
|
102 |
+
'minimamistic': 'minimalistic',
|
103 |
+
'littlefinger': ['little', 'finger'],
|
104 |
+
'im': ['i', 'am'],
|
105 |
+
'greatfull': 'grateful',
|
106 |
+
'skinnydippers': ['skinny', 'dippers'],
|
107 |
+
'goingnon': ['going', 'on'],
|
108 |
+
'rainclouds': ['rain', 'clouds'],
|
109 |
+
'lillypads': ['lily', 'pads'],
|
110 |
+
'paintinglike': ['painting', 'like'],
|
111 |
+
'somekind': ['some', 'kind'],
|
112 |
+
'overexaggerated': ['over', 'exaggerated'],
|
113 |
+
'smokeshop': ['smoke', 'shop'],
|
114 |
+
'fearinspiring': ['fear', 'inspiring'],
|
115 |
+
'thebackround': ['the', 'background'],
|
116 |
+
'raincloud': ['rain', 'cloud'],
|
117 |
+
'wideopen': ['wide', 'open'],
|
118 |
+
'crusifiction': 'crucifixion',
|
119 |
+
'tablesetting': ['table', 'setting'],
|
120 |
+
'vividcolors': ['vivid', 'colors'],
|
121 |
+
'willhave': ['will', 'have'],
|
122 |
+
'thisbpainting': ['this', 'painting'],
|
123 |
+
'alongthis': ['along', 'this'],
|
124 |
+
'crucifications': 'crucifixion',
|
125 |
+
'overexaggeration': ['over', 'exaggeration'],
|
126 |
+
'snacktime': ['snack', 'time'],
|
127 |
+
'beaurocratic': 'bureaucratic',
|
128 |
+
'nonsensicalness': ['nonsensical', 'ness'],
|
129 |
+
'chubbyness': 'chubbiness',
|
130 |
+
'distatestful': 'distasteful',
|
131 |
+
'disapportioned': 'disproportionate',
|
132 |
+
'becauseofthe': ['because', 'of', 'the'],
|
133 |
+
'hahahahaha': 'haha',
|
134 |
+
'hahahahaa': 'haha',
|
135 |
+
'annoniminity': 'anonymity',
|
136 |
+
'realisticand': ['realistic', 'and'],
|
137 |
+
'feellike': ['feel', 'like'],
|
138 |
+
'clostiphobic': 'claustrophobic',
|
139 |
+
'thegolden': ['the', 'golden'],
|
140 |
+
'minimalstic': 'minimalistic',
|
141 |
+
'artdeco': ['art', 'deco'],
|
142 |
+
'paddleboards': ['paddle', 'boards'],
|
143 |
+
'fitbtogetherv': ['fit', 'together'],
|
144 |
+
'doingthat': ['doing', 'that'],
|
145 |
+
'stormclouds': ['storm', 'clouds'],
|
146 |
+
'feelanxious': ['feel', 'anxious'],
|
147 |
+
'withpeople':['with', 'people'],
|
148 |
+
'nuditythen': ['nudity', 'then'],
|
149 |
+
'whatbappears': ['what', 'appears'],
|
150 |
+
'womenafter': ['women', 'after'],
|
151 |
+
'funerallike': ['funeral', 'like'],
|
152 |
+
'thebridge': ['the', 'bridge'],
|
153 |
+
'focalpoint': ['focal', 'point'],
|
154 |
+
'crussifiction': 'crucifixion',
|
155 |
+
'extrocinary': 'extraordinary',
|
156 |
+
'adrodgenous': 'androgynous',
|
157 |
+
'whimsacle': 'whimsical',
|
158 |
+
'nonabrasive': ['non', 'abrasive'],
|
159 |
+
'alienlike': ['alien', 'like'],
|
160 |
+
'intricitally': 'intricately',
|
161 |
+
'straightlines': ['straight', 'lines'],
|
162 |
+
'shouldnt': ['should', 'not'],
|
163 |
+
'favortire': 'favorite',
|
164 |
+
'downsyndrome': ['down', 'syndrome'],
|
165 |
+
'silluete': 'silhouette',
|
166 |
+
'provideenough': ['provide', 'enough'],
|
167 |
+
'waterpainting':['water', 'painting'],
|
168 |
+
'the19th': ['the', 'nineteenth'],
|
169 |
+
'oldfashoned': ['old', 'fashioned'],
|
170 |
+
'colorblocking': ['color', 'blocking'],
|
171 |
+
'gesticulates': 'testiculates',
|
172 |
+
'notknow': ['not', 'know'],
|
173 |
+
'crucifixiction': 'crucifixion',
|
174 |
+
'cruxifiction': 'crucifixion',
|
175 |
+
'contementent': 'contentment',
|
176 |
+
'underconstruction': ['under', 'construction'],
|
177 |
+
'cartoonfrom': ['cartoon', 'from'],
|
178 |
+
'downwardlooks': ['downward', 'looks'],
|
179 |
+
'unrelateable': 'unrelatable',
|
180 |
+
'ofvthose': ['of', 'those'],
|
181 |
+
'rainbowlike': ['rainbow', 'like'],
|
182 |
+
'thegesture': ['the', 'gesture'],
|
183 |
+
'pencilwork': ['pencil', 'work'],
|
184 |
+
'perfectlycovered': ['perfectly', 'covered'],
|
185 |
+
'eitherway': ['either', 'way'],
|
186 |
+
'andpeaceful': ['and', 'peaceful'],
|
187 |
+
'cloudforms': ['cloud', 'forms'],
|
188 |
+
'peoplejust': ['people', 'just'],
|
189 |
+
'pyscadellic': 'psychedelic',
|
190 |
+
'maybepreparing': ['maybe', 'preparing'],
|
191 |
+
'thisbmakes': ['this', 'makes'],
|
192 |
+
'thispainting': ['this', 'painting'],
|
193 |
+
'combinationmakes': ['combination', 'makes'],
|
194 |
+
'rightside': ['right', 'side'],
|
195 |
+
'saysnothing': ['says', 'nothing'],
|
196 |
+
'individualness': 'individualism',
|
197 |
+
'verynostalgic': ['very', 'nostalgic'],
|
198 |
+
'hyperrealistic': ['hyper', 'realistic'],
|
199 |
+
'wimsicle': 'whimsical',
|
200 |
+
'aweinspiring': ['awe', 'inspiring'],
|
201 |
+
'resturarunt': 'restaurant',
|
202 |
+
'cruxification': 'crucifixion',
|
203 |
+
'mistiruous': 'mysterious',
|
204 |
+
'streetlamp': ['street', 'lamp'],
|
205 |
+
'sadnessand': ['sadness', 'and'],
|
206 |
+
'republicancult': ['republican', 'cult'],
|
207 |
+
'mogilianni': 'modigliani',
|
208 |
+
'raphealite': 'raphaelite',
|
209 |
+
'immeidtaley': 'immediately',
|
210 |
+
'duckface': ['duck', 'face'],
|
211 |
+
'kinglike': ['king', 'like'],
|
212 |
+
'monaleesa': ['mona', 'lisa'],
|
213 |
+
'antispication': 'anticipation',
|
214 |
+
'womendid': ['women', 'did'],
|
215 |
+
'jailcell': ['jail', 'cell'],
|
216 |
+
'thispeicemakes': ['this', 'piece', 'makes'],
|
217 |
+
'pceaful': 'peaceful',
|
218 |
+
'showpeople': ['show', 'people'],
|
219 |
+
'colorsand': ['colors', 'and'],
|
220 |
+
'lovevthe': ['love', 'the'],
|
221 |
+
'mewithoutyou': ['me', 'without', 'you'],
|
222 |
+
'microexpression': ['micro', 'expression'],
|
223 |
+
'doesnnt': ['does', 'not'],
|
224 |
+
'airfilter': ['air', 'filter'],
|
225 |
+
'appostols': 'apostles',
|
226 |
+
'acrossthe': ['across', 'the'],
|
227 |
+
'andaroused': ['and', 'aroused'],
|
228 |
+
'bluecolor': ['blue', 'color'],
|
229 |
+
'broadstrokes': ['broad', 'strokes'],
|
230 |
+
'bullethole': ['bullet', 'hole'],
|
231 |
+
'shadowlike': ['shadow', 'like'],
|
232 |
+
'shepardplaying': ['shepard', 'playing'],
|
233 |
+
'siporportioned': 'disproportionate',
|
234 |
+
'skyremind': ['sky', 'remind'],
|
235 |
+
'theblending': ['the', 'blending'],
|
236 |
+
'thoughtfuland': ['thoughtful', 'and'],
|
237 |
+
'yellowbrowns': ['yellow', 'browns'],
|
238 |
+
'creeeppyyy': 'creepy',
|
239 |
+
'crosslegged': ['cross', 'legged'],
|
240 |
+
'cupshave': ['cups', 'have'],
|
241 |
+
'dissapoinment': 'disappointment',
|
242 |
+
'drinkbest': ['drink', 'best'],
|
243 |
+
'dragonlike': ['dragon', 'like'],
|
244 |
+
'dressform': ['dress', 'form'],
|
245 |
+
'farmlife': ['farm', 'life'],
|
246 |
+
'inbtween': ['in', 'between'],
|
247 |
+
'averageparentproblems': ['average', 'parent', 'problems'],
|
248 |
+
'aroundthe': ['around', 'the'],
|
249 |
+
'anythingabout': ['anything', 'about'],
|
250 |
+
'bootylicous': 'bootylicious',
|
251 |
+
'andwhat': ['and', 'what'],
|
252 |
+
'applestore': ['apple', 'store'],
|
253 |
+
'archioligist': 'archaeologist',
|
254 |
+
'archtypical': 'archetypal',
|
255 |
+
'armorwear': ['armor', 'wear'],
|
256 |
+
'assumingely': 'assumingly',
|
257 |
+
'beachtown': ['beach', 'town'],
|
258 |
+
'beenshot': ['been', 'shot'],
|
259 |
+
'bluemountains': ['blue', 'mountains'],
|
260 |
+
'boldcolors': ['bold', 'colors'],
|
261 |
+
'buddawhateverhisname': ['buddha', 'whatever', 'his', 'name'],
|
262 |
+
'buttcrack': ['butt', 'crack'],
|
263 |
+
'candytown': 'candytown',
|
264 |
+
'colorsare': ['colors', 'are'],
|
265 |
+
'colorscale': ['color', 'scale'],
|
266 |
+
'cominginto': ['coming', 'into'],
|
267 |
+
'commonfolk': ['common', 'folk'],
|
268 |
+
'cottonballs': ['cotton', 'balls'],
|
269 |
+
'excuuuuse': 'excuse',
|
270 |
+
'eyesockets': ['eye', 'sockets'],
|
271 |
+
'facelooking': ['face', 'looking'],
|
272 |
+
'fromthis': ['from', 'this'],
|
273 |
+
'pokerface': ['poker', 'face'],
|
274 |
+
'thefountain': ['the', 'fountain'],
|
275 |
+
'thinkpeople': ['think', 'people'],
|
276 |
+
'uncomfomfortable': 'uncomfortable',
|
277 |
+
'upsidedown': ['upside', 'down'],
|
278 |
+
'vangough': ['van', 'gogh'],
|
279 |
+
'vangogh': ['van', 'gogh'],
|
280 |
+
'yaaaay': 'yay',
|
281 |
+
'uhhhh': 'uhh',
|
282 |
+
'thedark': ['the', 'dark'],
|
283 |
+
'tallships': ['tall', 'ships'],
|
284 |
+
'stilllife': ['still', 'life'],
|
285 |
+
'stillframe': ['still', 'frame'],
|
286 |
+
'mmmmmm': 'mmm',
|
287 |
+
'marvelone': ['marvel', 'one'],
|
288 |
+
'lookhomeless': ['look', 'homeless'],
|
289 |
+
'likealot': ['like', 'a', 'lot'],
|
290 |
+
'interestesting': 'interesting',
|
291 |
+
'intriuiging': 'intriguing',
|
292 |
+
'icecreams': ['ice', 'creams'],
|
293 |
+
'awwwww': 'aww',
|
294 |
+
'slavemaster': ['slave', 'master'],
|
295 |
+
'pictureshould': ['picture', 'should'],
|
296 |
+
'onhisface': ['on', 'his', 'face'],
|
297 |
+
'likethis': ['like', 'this'],
|
298 |
+
'inkwork': ['ink', 'work'],
|
299 |
+
'grapejuice': ['grape', 'juice'],
|
300 |
+
'flowerlike': ['flower', 'like'],
|
301 |
+
'understandthe': ['understand', 'the'],
|
302 |
+
'welldressed': ['well', 'dressed'],
|
303 |
+
'wouldlove': ['would', 'love'],
|
304 |
+
'blendedinto': ['blended', 'into'],
|
305 |
+
'buttcheeks': ['butt', 'cheeks'],
|
306 |
+
'clownlike':['clown', 'like'],
|
307 |
+
'davinchi': ['da', 'vinci'],
|
308 |
+
'veryperfect': ['very', 'perfect'],
|
309 |
+
'supervillian': 'supervillain',
|
310 |
+
'simpleand': ['simple', 'and'],
|
311 |
+
'seemsout': ['seems', 'out'],
|
312 |
+
'rainbowmeeting': ['rainbow', 'meeting'],
|
313 |
+
'strobelights': ['strobe', 'lights'],
|
314 |
+
'subltness': 'subtleness',
|
315 |
+
'throughthe': ['through', 'the'],
|
316 |
+
'paintingfreaks': ['painting', 'freaks'],
|
317 |
+
'muchgoing': ['much', 'going'],
|
318 |
+
'meditterean': 'mediterranean',
|
319 |
+
'instaneous': 'instantaneous',
|
320 |
+
'helpthe': ['help', 'the'],
|
321 |
+
'bizzarly': 'bizarrely',
|
322 |
+
'crimescene': ['crime', 'scene'],
|
323 |
+
'deathlife': ['death', 'life'],
|
324 |
+
'dancefight': ['dance', 'fight'],
|
325 |
+
'blahblahblah': ['blah', 'blah', 'blah'],
|
326 |
+
'disporportioned': 'disproportionate',
|
327 |
+
'dreamstate': ['dream', 'state'],
|
328 |
+
'eithermight': ['either', 'might'],
|
329 |
+
'enviornemt': 'environment',
|
330 |
+
'greenbackground': ['green', 'background'],
|
331 |
+
'greybackground': ['grey', 'background'],
|
332 |
+
'handrwawing': ['hand' 'drawing'],
|
333 |
+
'happycause': ['happy', 'cause'],
|
334 |
+
'thelayout': ['the', 'layout'],
|
335 |
+
'greatgrandparent': ['great', 'grand', 'parent'],
|
336 |
+
'greatgrandparents': ['great', 'grand', 'parents'],
|
337 |
+
'likesomething': ['like', 'something'],
|
338 |
+
'likethey': ['like', 'they'],
|
339 |
+
'makingthe': ['making', 'the'],
|
340 |
+
'mideviltimes': ['medieval', 'times'],
|
341 |
+
'moviestar': ['movie', 'star'],
|
342 |
+
'shroudlike': ['shroud', 'like'],
|
343 |
+
'blackscale': ['black', 'scale'],
|
344 |
+
'bothsides': ['both', 'sides'],
|
345 |
+
'fallevening': ['fall', 'evening'],
|
346 |
+
'breaklight': ['break', 'light'],
|
347 |
+
'springgarden': ['spring', 'garden'],
|
348 |
+
'pointalist': 'pointillism',
|
349 |
+
'hemmeroid': 'hemorrhoid',
|
350 |
+
'bonaroo': 'bonnaroo',
|
351 |
+
'boardshorts': ['board', 'shorts'],
|
352 |
+
'luminousand': ['luminous', 'and'],
|
353 |
+
'iceskating': ['ice', 'skating'],
|
354 |
+
'ewwww' :'ew',
|
355 |
+
'bloodsplatter': ['blood', 'splatter'],
|
356 |
+
'beastlike': ['beast', 'like'],
|
357 |
+
'entendra': 'entendre',
|
358 |
+
'dollbaby': ['doll', 'baby'],
|
359 |
+
'eachothers': ['each', 'others'],
|
360 |
+
'backlooking': ['back', 'looking'],
|
361 |
+
'enjoynthe': ['enjoy', 'the'],
|
362 |
+
'stormcloud': ['storm', 'cloud'],
|
363 |
+
'playwriter': ['play', 'writer'],
|
364 |
+
'hyroglifics': 'hieroglyphics',
|
365 |
+
'lilypads': ['lily', 'pads'],
|
366 |
+
'ivreqlly': ['i', 'really'],
|
367 |
+
'kindnof': ['kind', 'of'],
|
368 |
+
'selfconcious': ['self', 'conscious'],
|
369 |
+
'reprensation': 'representation',
|
370 |
+
'eerieness' : 'eeriness',
|
371 |
+
'paining': 'painting',
|
372 |
+
'thats': ['that', 'is'],
|
373 |
+
'xmas': 'christmas',
|
374 |
+
'swordbearer' : ['sword', 'bearer'],
|
375 |
+
'outcseeing': ['out', 'seeing'],
|
376 |
+
'gatheredaround': ['gathered', 'around'],
|
377 |
+
'lockeroom': ['locker', 'room'],
|
378 |
+
'adrogonius': 'androgynous',
|
379 |
+
'mezmesring': 'mesmerising',
|
380 |
+
'powderoom': ['powder', 'room'],
|
381 |
+
'tenalady': ['tena', 'lady', 'pads'],
|
382 |
+
'storytale': ['story', 'tale'],
|
383 |
+
'dipropratnly': 'disproportionately',
|
384 |
+
'clotheless': 'clothless',
|
385 |
+
'maculopothy': 'maculopathy',
|
386 |
+
'meanmugging': ['mean', 'mugging'],
|
387 |
+
'shadowwork': ['shadow', 'work'],
|
388 |
+
'paintstrokes': ['paint', 'strokes'],
|
389 |
+
'makenit': ['make', 'it'],
|
390 |
+
'ofcolors': ['of', 'colors'],
|
391 |
+
'thevdevilish': ['the', 'devilish'],
|
392 |
+
'lilipads': ['lily', 'pads'],
|
393 |
+
'lilypad': ['lily', 'pad'],
|
394 |
+
'prusinors': 'prisoners',
|
395 |
+
'thebattle': ['the', 'battle'],
|
396 |
+
'bathingsuit': ['bathing', 'suit'],
|
397 |
+
'thencolors': ['the', 'colors'],
|
398 |
+
'morexcitingand': ['more', 'exciting', 'and'],
|
399 |
+
'thebeggining': ['the', 'beginning'],
|
400 |
+
'imageryand': ['imagery', 'and'],
|
401 |
+
'contentness': 'contentedness',
|
402 |
+
'oversimplicity': ['over', 'simplicity'],
|
403 |
+
'overexausted': ['over', 'exhausted'],
|
404 |
+
'uninterst': 'uninterest',
|
405 |
+
'theanfels': ['the', 'angels'],
|
406 |
+
'bittypenis': ['bitty', 'penis'],
|
407 |
+
'intellegiant': 'intelligent',
|
408 |
+
'fauxfur': ['faux', 'fur'],
|
409 |
+
'togetherther': 'together',
|
410 |
+
'murakmi': 'murakami',
|
411 |
+
'diffinterate': 'different',
|
412 |
+
'deatheater': ['death', 'eater'],
|
413 |
+
'grafittied': 'graffitied',
|
414 |
+
'colortheme': ['color', 'theme'],
|
415 |
+
'herevening': ['her', 'evening'],
|
416 |
+
'comradarie': 'camaraderie',
|
417 |
+
'gradeintly': 'gradiently',
|
418 |
+
'womenreally': ['woman', 'really'],
|
419 |
+
'renduveousing': 'rendezvousing',
|
420 |
+
'unsettleness': 'unsettledness',
|
421 |
+
'desolutioned': 'disillusioned',
|
422 |
+
'bucketlist': ['bucket', 'list'],
|
423 |
+
'contrastful': 'contrasting',
|
424 |
+
'snailshell': ['snail', 'shell'],
|
425 |
+
'figureswithin': ['figures', 'within'],
|
426 |
+
'semitrical': 'symmetrical',
|
427 |
+
'happinessand': ['happiness', 'and'],
|
428 |
+
'firepit':['fire', 'pit'],
|
429 |
+
'firepits':['fire', 'pits'],
|
430 |
+
'spectrumand': ['spectrum', 'and'],
|
431 |
+
'skyblue': ['sky', 'blue'],
|
432 |
+
'duststorm': ['dust', 'storm'],
|
433 |
+
'ultrawide': ['ultra', 'wide'],
|
434 |
+
'containmatated': 'contaminated',
|
435 |
+
'dressesbis': ['dresses', 'is'],
|
436 |
+
'underdetailed': ['under', 'detailed'],
|
437 |
+
'pitchblack': ['pitch', 'black'],
|
438 |
+
'andvserious': ['and', 'serious'],
|
439 |
+
'peaceand': ['peace', 'and'],
|
440 |
+
'drawingnif': 'drawing',
|
441 |
+
'patternsmake': ['patterns', 'make'],
|
442 |
+
'andvwilling': ['and', 'willing'],
|
443 |
+
'thecdeeamy': ['the', 'dreamy'],
|
444 |
+
'puntilism': 'pointillism',
|
445 |
+
'thecangel': ['the', 'angel'],
|
446 |
+
'awestriking': ['awe', 'striking'],
|
447 |
+
'awestrucking': ['awe', 'striking'],
|
448 |
+
'awestrikng': ['awe', 'striking'],
|
449 |
+
'ofvthe': ['of', 'the'],
|
450 |
+
'desaturatation': 'desaturation',
|
451 |
+
'colrscare': ['colors', 'are'],
|
452 |
+
'looksmessy': ['looks', 'messy'],
|
453 |
+
'thecfeelingvthis': ['the', 'feeling', 'for', 'this'],
|
454 |
+
'manyngood': ['many', 'and', 'good'],
|
455 |
+
'mandsface': ['man\'s', 'face'],
|
456 |
+
'essencence': 'essence',
|
457 |
+
'confuseable': 'confusing',
|
458 |
+
'frizzyness': 'frizziness',
|
459 |
+
'waterbuffalo': ['water', 'buffalo'],
|
460 |
+
'cinaplex' :'cineplex',
|
461 |
+
'clocktowers': ['clock', 'towers'],
|
462 |
+
'aysterutym': 'austerity',
|
463 |
+
'conthemporan': 'contemporary',
|
464 |
+
'coldsore': ['cold', 'sore'],
|
465 |
+
'redflas': ['red', 'flash'],
|
466 |
+
'pompnceremony': ['pomp', 'and', 'ceremony'],
|
467 |
+
'etchisketch': ['etch', 'a', 'sketch'],
|
468 |
+
'durdledoor': ['durdle' 'door'],
|
469 |
+
'eyessquinted': ['eyes', 'squinted'],
|
470 |
+
'colorfullness': 'colorfulness',
|
471 |
+
'christchild': ['christ', 'child'],
|
472 |
+
'wispyness': 'wispiness',
|
473 |
+
'whispiness': 'wispiness',
|
474 |
+
'imaturebut': ['immature', 'but'],
|
475 |
+
'raphealites': 'raphaelites',
|
476 |
+
'late1700': ['late', '1700'],
|
477 |
+
'remnicient': 'reminiscent',
|
478 |
+
'twonsubjecta': ['two', 'subjects'],
|
479 |
+
'awestricken': 'awestruck',
|
480 |
+
'withnumerous': ['with', 'numerous'],
|
481 |
+
'colorsmake': ['colors', 'make'],
|
482 |
+
'vmcolors': ['colors'],
|
483 |
+
'roseyness': 'rosiness',
|
484 |
+
'holdingthe': ['holding', 'the'],
|
485 |
+
'gruesomness': 'gruesomeness',
|
486 |
+
'linedrawing': ['line', 'drawing'],
|
487 |
+
'orangatange': 'orangutan',
|
488 |
+
'naaahhhh': 'nah',
|
489 |
+
'micropattern': ['micro', 'pattern'],
|
490 |
+
'nephilims': 'nephilim',
|
491 |
+
'middleaged': ['middle', 'aged'],
|
492 |
+
'thevnanyvdifferent': ['the', 'many', 'different'],
|
493 |
+
'flirtatously': 'flirtatiously',
|
494 |
+
'nitemare': 'nightmare',
|
495 |
+
'okaaaay': 'ok',
|
496 |
+
'crucufication': 'crucifixion',
|
497 |
+
'manywindow': ['many', 'windows'],
|
498 |
+
'panaroema': 'panorama',
|
499 |
+
'wowwwwwww': 'wow',
|
500 |
+
'theaqua': ['the', 'aqua'],
|
501 |
+
'andexcited': ['and', 'excited'],
|
502 |
+
'frommthe': ['from', 'the'],
|
503 |
+
'thecanal': ['the', 'canal'],
|
504 |
+
'focalpointcof': ['focal', 'point', 'of'],
|
505 |
+
'silouhete': 'silhouette',
|
506 |
+
'physcadelic': 'psychedelic',
|
507 |
+
'tonesmakes': ['tones', 'make'],
|
508 |
+
'reallyenjoying': ['really', 'enjoying'],
|
509 |
+
'disportionate': 'disproportionate',
|
510 |
+
'spidermonkey': ['spider', 'monkey'],
|
511 |
+
'lookswise': ['looks', 'wise'],
|
512 |
+
'wasewas': 'was',
|
513 |
+
'inbthe': ['in', 'the'],
|
514 |
+
'coronvirus': 'coronavirus',
|
515 |
+
'overdramtic': 'overdramatic',
|
516 |
+
'favarouite': 'favorite',
|
517 |
+
'reallyike': ['really', 'like'],
|
518 |
+
'thesoldier': ['the', 'soldier'],
|
519 |
+
'flowerboxes': ['flower', 'boxes'],
|
520 |
+
'envoirment': 'environment',
|
521 |
+
'theirfaces': ['their', 'faces'],
|
522 |
+
'neccasssary': 'necessary',
|
523 |
+
'ghostlyness': 'ghostliness',
|
524 |
+
'trytophobia': 'trypophobia',
|
525 |
+
'tripophobia': 'trypophobia',
|
526 |
+
'woodprinting': ['wood', 'printing'],
|
527 |
+
'roomoom': 'room',
|
528 |
+
'outmyself': ['out', 'myself'],
|
529 |
+
'evildoing': ['evil', 'doing'],
|
530 |
+
'deliousious': 'delicious',
|
531 |
+
'thebfigure': ['the', 'figure'],
|
532 |
+
'sleeptime': ['sleep', 'time'],
|
533 |
+
'isnspooky': ['is', 'spooky'],
|
534 |
+
'comtempory': 'contemporary',
|
535 |
+
'smilingred': ['smiling', 'red'],
|
536 |
+
'ooranateness': 'ornateness',
|
537 |
+
'joyfilled': ['joy', 'filled'],
|
538 |
+
'ghosttown': ['ghost', 'town'],
|
539 |
+
'obvious–that': ['obvious', 'that'],
|
540 |
+
'photobooth': ['photo', 'booth'],
|
541 |
+
'thinknof': ['think', 'of'],
|
542 |
+
'extrodianary': 'extraordinary',
|
543 |
+
'thewindow': ['the', 'window'],
|
544 |
+
"'indistinctive": 'indistinctive',
|
545 |
+
'vilianouis': 'villainous',
|
546 |
+
'farmtown': ['farm', 'town'],
|
547 |
+
'handdrawing': ['hand', 'drawing'],
|
548 |
+
'sophisticatedcriminal': ['sophisticated', 'criminal'],
|
549 |
+
'beautifuldepiction': ['beautiful', 'depiction'],
|
550 |
+
'plantetscolliding': ['planets', 'colliding'],
|
551 |
+
'greytones': ['grey', 'tones'],
|
552 |
+
'likepaint': ['like', 'paint'],
|
553 |
+
'leatherworker': ['leather', 'worker'],
|
554 |
+
'cobrownand': ['brown', 'and'],
|
555 |
+
'bluegreens': ['blue', 'greens'],
|
556 |
+
'polkadots': ['polka', 'dots'],
|
557 |
+
'attirewear': ['attire', 'wear'],
|
558 |
+
'disssary': 'disarray',
|
559 |
+
'simplictiness': 'simplicity',
|
560 |
+
'likelord': ['like', 'a', 'lord'],
|
561 |
+
'orbtalking': ['or', 'talking'],
|
562 |
+
'colorscheme': ['color', 'scheme'],
|
563 |
+
'grouchypants': ['grouchy', 'pants'],
|
564 |
+
'renosanse': 'renaissance',
|
565 |
+
'renessciance': 'renaissance',
|
566 |
+
'impaitily': 'impatiently',
|
567 |
+
'hyrogliphic': 'hieroglyphic',
|
568 |
+
'enduresess': 'endures',
|
569 |
+
'orangecand': ['orange', 'and'],
|
570 |
+
'emiotnals': 'emotion',
|
571 |
+
'countryclub': ['country', 'club'],
|
572 |
+
'branchhill': ['branch', 'hill'],
|
573 |
+
'homospiens': ['homo', 'sapiens'],
|
574 |
+
'beautifuland': ['beautiful', 'and'],
|
575 |
+
'birchtree': ['birch', 'tree'],
|
576 |
+
'seemslike': ['seems', 'like'],
|
577 |
+
'beuaktufl': 'beautiful',
|
578 |
+
'appearlike': ['appear', 'like'],
|
579 |
+
'browngrounds': ['brown', 'grounds'],
|
580 |
+
'morningtime': ['morning', 'time'],
|
581 |
+
'jerrsaic': 'jurassic',
|
582 |
+
'feelabout': ['feel','about'],
|
583 |
+
'linestrokes': ['line', 'strokes'],
|
584 |
+
'lifesized': ['life', 'sized'],
|
585 |
+
'thevlower': ['the', 'lower'],
|
586 |
+
'paitngig': 'painting',
|
587 |
+
'handdrawn': ['hand', 'drawn'],
|
588 |
+
'facefrom': ['face', 'from'],
|
589 |
+
'treesmake': ['trees', 'make'],
|
590 |
+
'chesspiece': ['chess', 'piece'],
|
591 |
+
'balletdancer': ['ballet', 'dancer'],
|
592 |
+
'motionblurr': ['motion', 'blur'],
|
593 |
+
'varietyframes': ['variety', 'frames'],
|
594 |
+
'nondetailed': ['non', 'detailed'],
|
595 |
+
'shadowsplus': ['shadows', 'plus'],
|
596 |
+
'bellpeppers': ['bell', 'peppers'],
|
597 |
+
'thebackground': ['the', 'background'],
|
598 |
+
'playwith': ['play', 'with'],
|
599 |
+
'facialmexpression': ['facial', 'expression'],
|
600 |
+
'compositionntells': ['composition', 'tells'],
|
601 |
+
'playfulexpression': ['playful', 'expression'],
|
602 |
+
'somethingforeboding': ['something', 'foreboding'],
|
603 |
+
'everythingnbeing': ['everything', 'being'],
|
604 |
+
'beingbsomseperated': ['being', 'separated'],
|
605 |
+
'nececassry': 'necessary',
|
606 |
+
'oppurnity': 'opportunity',
|
607 |
+
'undistinguishable': 'indistinguishable'
|
608 |
+
}
|
609 |
+
|
610 |
+
|
611 |
+
##
|
612 |
+
## Set containing words found in ArtEmis but not in GloVe (for the curious reader...).
|
613 |
+
## Curated manually by Panos circa 2020.
|
614 |
+
##
|
615 |
+
missing_from_glove_but_are_actual_words = {
|
616 |
+
'agfacolor', 'photobomb', 'modernness', 'altamouras',
|
617 |
+
'invitingness', 'kinkadian', 'unfinishedness',
|
618 |
+
'gainsboro', 'normalness', 'harmoniousness', 'tenebrism',
|
619 |
+
'neckpiece', 'immenseness', 'distastefulness', 'delicateness',
|
620 |
+
'disjointedness', 'midground', 'pulchritudinously', 'maculopathy',
|
621 |
+
'ornateness', 'alienesque', 'bemedaled', 'mundaneness', 'ghoulishness',
|
622 |
+
'hecticness', 'comfortability', 'deathscape', 'snowpiercer', 'acuarela',
|
623 |
+
'pedophillic', 'monochromatically', 'futuristically', 'remnicient',
|
624 |
+
'sereneness', 'tenebrism', 'midground', 'delicateness', 'ornateness',
|
625 |
+
'neckpiece', 'pompousness', 'comfortability', 'contentful', 'disjointedness',
|
626 |
+
'delicateness', 'suitcoat', 'slenderman', 'wispiness', 'realisticness',
|
627 |
+
'splotchiness', 'chubbiness', 'respectfulness', 'chemtrail', 'ominousness',
|
628 |
+
'douchebag', 'naturescape', 'indistinctive', 'hellscape', 'blobbiness',
|
629 |
+
'mountainscape', 'exoticness', 'colorscape', 'overdramatic', 'snowscape',
|
630 |
+
'oceanscape', 'stunningness', 'hyperreligiosity', 'trypophobia', 'treescape',
|
631 |
+
'prayerfulness', 'slothlike', 'tablescape', 'indistinctive', 'imaginativeness',
|
632 |
+
'sincereness', 'rejoicement', 'loyalness', 'hypersexualization', 'solemnness',
|
633 |
+
'boringness', 'hypersexualizing', 'centermost'
|
634 |
+
}
|
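For the curious reader, the map above mixes two kinds of values: a single replacement string, or a list of tokens to splice in place of the misspelled token. Below is a minimal, hypothetical sketch of how such a map could be applied to an already-tokenized utterance; apply_spelling_map is an illustrative helper and not part of this commit.

# Hypothetical helper (illustration only): apply a misspelling map whose values
# are either a replacement string or a list of tokens to splice in.
def apply_spelling_map(tokens, spelling_map):
    fixed = []
    for token in tokens:
        replacement = spelling_map.get(token, token)
        if isinstance(replacement, list):
            fixed.extend(replacement)   # e.g. 'skyblue' -> ['sky', 'blue']
        else:
            fixed.append(replacement)   # e.g. 'puntilism' -> 'pointillism'
    return fixed

# apply_spelling_map(['the', 'skyblue', 'puntilism'], spelling_map)
# -> ['the', 'sky', 'blue', 'pointillism']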
imageprocessing/artemis/artemis/neural_models/__init__.py
ADDED
@@ -0,0 +1,4 @@
1 |
+
"""
|
2 |
+
The MIT License (MIT)
|
3 |
+
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
|
4 |
+
"""
|
imageprocessing/artemis/artemis/neural_models/attention.py
ADDED
@@ -0,0 +1,45 @@
1 |
+
"""
|
2 |
+
Language-Vision Attention Utilities.
|
3 |
+
|
4 |
+
The MIT License (MIT)
|
5 |
+
Originally created in 2020, for Python 3.x
|
6 |
+
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
|
7 |
+
"""
|
8 |
+
|
9 |
+
|
10 |
+
from torch import nn
|
11 |
+
|
12 |
+
|
13 |
+
class AdditiveVisioLinguistic(nn.Module):
|
14 |
+
"""
|
15 |
+
Given a vector summarizing the linguistic information processed by a pipeline
|
16 |
+
(e.g. k-th output of RNN) attend to a 2D grid (e.g., image pixels).
|
17 |
+
This mechanism *adds* the two sources of information to compute the attention (hence the name additive).
|
18 |
+
"""
|
19 |
+
|
20 |
+
def __init__(self, encoder_dim, decoder_dim, attention_dim):
|
21 |
+
"""
|
22 |
+
:param encoder_dim: (int) feature size (last dimension) of encoded images (e.g., [B x H x W] x encoder_dim)
|
23 |
+
:param decoder_dim: (int) feature size of decoder's output (summarizing linguistic information)
|
24 |
+
:param attention_dim: (int) feature size of the attention space
|
25 |
+
"""
|
26 |
+
super(AdditiveVisioLinguistic, self).__init__()
|
27 |
+
self.encoder_att = nn.Linear(encoder_dim, attention_dim) # linear layer to transform encoded image
|
28 |
+
self.decoder_att = nn.Linear(decoder_dim, attention_dim) # linear layer to transform decoder's output
|
29 |
+
self.full_att = nn.Linear(attention_dim, 1) # linear layer to calculate values to be softmax-ed
|
30 |
+
self.relu = nn.ReLU(inplace=True)
|
31 |
+
self.softmax = nn.Softmax(dim=1) # softmax layer to calculate weights
|
32 |
+
|
33 |
+
def __call__(self, encoder_out, decoder_hidden):
|
34 |
+
"""
|
35 |
+
Forward propagation.
|
36 |
+
:param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
|
37 |
+
:param decoder_hidden: previous decoder output, a tensor of dimension (batch_size, decoder_dim)
|
38 |
+
:return: attention weighted encoding, weights
|
39 |
+
"""
|
40 |
+
att1 = self.encoder_att(encoder_out) # (batch_size, num_pixels, attention_dim)
|
41 |
+
att2 = self.decoder_att(decoder_hidden) # (batch_size, attention_dim)
|
42 |
+
att = self.full_att(self.relu(att1 + att2.unsqueeze(1))).squeeze(2) # (batch_size, num_pixels)
|
43 |
+
alpha = self.softmax(att) # (batch_size, num_pixels)
|
44 |
+
attention_weighted_encoding = (encoder_out * alpha.unsqueeze(2)).sum(dim=1) # (batch_size, encoder_dim)
|
45 |
+
return attention_weighted_encoding, alpha
|
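A minimal usage sketch of the attention module above, with assumed sizes (a flattened 7x7 grid of 2048-d encoder features and a 512-d decoder state), and assuming the artemis package added by this commit is importable:

# Minimal sketch; batch size and feature sizes below are assumed.
import torch
from artemis.neural_models.attention import AdditiveVisioLinguistic

attention = AdditiveVisioLinguistic(encoder_dim=2048, decoder_dim=512, attention_dim=512)
encoder_out = torch.rand(4, 49, 2048)    # (batch, num_pixels, encoder_dim), e.g. a flattened 7x7 grid
decoder_hidden = torch.rand(4, 512)      # (batch, decoder_dim)
context, alpha = attention(encoder_out, decoder_hidden)
print(context.shape, alpha.shape)        # torch.Size([4, 2048]) torch.Size([4, 49])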
imageprocessing/artemis/artemis/neural_models/attentive_decoder.py
ADDED
@@ -0,0 +1,696 @@
1 |
+
"""
|
2 |
+
Decoding module for a neural speaker (with attention capabilities).
|
3 |
+
|
4 |
+
The MIT License (MIT)
|
5 |
+
Originally created at 06/15/19, for Python 3.x
|
6 |
+
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
|
7 |
+
"""
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import random
|
11 |
+
import time
|
12 |
+
import warnings
|
13 |
+
import tqdm
|
14 |
+
import math
|
15 |
+
import numpy as np
|
16 |
+
import torch.nn.functional as F
|
17 |
+
from torch import nn
|
18 |
+
from torch.nn.utils.rnn import pack_padded_sequence
|
19 |
+
from torch.nn.utils import clip_grad_norm_
|
20 |
+
|
21 |
+
from .attention import AdditiveVisioLinguistic
|
22 |
+
from ..utils.stats import AverageMeter
|
23 |
+
|
24 |
+
|
25 |
+
class AttentiveDecoder(nn.Module):
|
26 |
+
"""
|
27 |
+
Note: code adapted from: https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning
|
28 |
+
implementing a solid version of Show, Attend, and Tell. Many thanks Sagar and the team.
|
29 |
+
|
30 |
+
Special (optional) features:
|
31 |
+
- use stochastic teacher forcer
|
32 |
+
- add auxiliary input data at each decoding step (besides each 'previous' token).
|
33 |
+
- tie the weights of the encoder/decoder weight matrices
|
34 |
+
"""
|
35 |
+
def __init__(self, word_embedding, rnn_hidden_dim, encoder_dim, attention_dim,
|
36 |
+
vocab, dropout_rate=0, tie_weights=False, teacher_forcing_ratio=1,
|
37 |
+
auxiliary_net=None, auxiliary_dim=0):
|
38 |
+
"""
|
39 |
+
:param word_embedding: nn.Embedding
|
40 |
+
:param rnn_hidden_dim: hidden (and thus output) dimension of the decoding rnn
|
41 |
+
:param encoder_dim: feature dimension of encoded stimulus
|
42 |
+
:param attention_dim: feature dimension over which attention is computed
|
43 |
+
:param vocab: artemis.utils.vocabulary instance
|
44 |
+
:param dropout_rate: dropout rate
|
45 |
+
:param tie_weights: (opt, boolean) if True, the hidden-to-word weights are equal (tied) to the word-embeddings,
|
46 |
+
see https://arxiv.org/abs/1611.01462 for explanation of why this might be a good idea.
|
47 |
+
:param teacher_forcing_ratio: (float in [0, 1]) probability of feeding the ground-truth previous token (instead of the model's own prediction) at each decoding step
|
48 |
+
:param auxiliary_net: (optional) nn.Module that will be feeding the decoder at each time step
|
49 |
+
with some "auxiliary" information (say an emotion label). Obviously, this information is separate than the
|
50 |
+
output of the typically used image-encoder.
|
51 |
+
:param auxiliary_dim: (int, optional) the output feature-dimension of the auxiliary net.
|
52 |
+
"""
|
53 |
+
super(AttentiveDecoder, self).__init__()
|
54 |
+
self.vocab = vocab
|
55 |
+
self.vocab_size = len(vocab)
|
56 |
+
self.word_embedding = word_embedding
|
57 |
+
self.auxiliary_net = auxiliary_net
|
58 |
+
self.uses_aux_data = False
|
59 |
+
|
60 |
+
if auxiliary_dim > 0:
|
61 |
+
self.uses_aux_data = True
|
62 |
+
|
63 |
+
self.decode_step = nn.LSTMCell(word_embedding.embedding_dim + encoder_dim + auxiliary_dim, rnn_hidden_dim)
|
64 |
+
self.attention = AdditiveVisioLinguistic(encoder_dim, rnn_hidden_dim, attention_dim)
|
65 |
+
|
66 |
+
if dropout_rate > 0:
|
67 |
+
self.dropout = nn.Dropout(p=dropout_rate, inplace=True)
|
68 |
+
else:
|
69 |
+
self.dropout = nn.Identity()
|
70 |
+
|
71 |
+
self.init_h = nn.Linear(encoder_dim, rnn_hidden_dim) # linear layer to find initial hidden state of LSTMCell
|
72 |
+
self.init_c = nn.Linear(encoder_dim, rnn_hidden_dim) # linear layer to find initial cell state of LSTMCell
|
73 |
+
self.f_beta = nn.Linear(rnn_hidden_dim, encoder_dim) # linear layer to create a sigmoid-activated gate
|
74 |
+
self.sigmoid = nn.Sigmoid()
|
75 |
+
self.next_word = nn.Linear(rnn_hidden_dim, self.vocab_size) # linear layer to find scores over vocabulary
|
76 |
+
self.init_weights()
|
77 |
+
self.teacher_forcing_ratio = teacher_forcing_ratio
|
78 |
+
|
79 |
+
if tie_weights:
|
80 |
+
if self.word_embedding.embedding_dim != rnn_hidden_dim:
|
81 |
+
raise ValueError('When tying weights, the word-embedding dimension must equal rnn_hidden_dim.')
|
82 |
+
print('tying weights of encoder/decoder')
|
83 |
+
self.next_word.weight = self.word_embedding.weight
|
84 |
+
|
85 |
+
def init_hidden_state(self, encoder_out):
|
86 |
+
"""
|
87 |
+
Creates the initial hidden and cell states for the decoder's LSTM based on the encoded images.
|
88 |
+
:param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
|
89 |
+
:return: hidden state, cell state
|
90 |
+
"""
|
91 |
+
mean_encoder_out = encoder_out.mean(dim=1)
|
92 |
+
h = self.init_h(mean_encoder_out) # (batch_size, decoder_dim)
|
93 |
+
c = self.init_c(mean_encoder_out)
|
94 |
+
return h, c
|
95 |
+
|
96 |
+
def init_weights(self, init_range=0.1):
|
97 |
+
""" Better initialization """
|
98 |
+
self.word_embedding.weight.data.uniform_(-init_range, init_range) # remove if pre-trained model comes up
|
99 |
+
self.next_word.bias.data.zero_()
|
100 |
+
self.next_word.weight.data.uniform_(-init_range, init_range)
|
101 |
+
|
102 |
+
def __call__(self, encoder_out, captions, auxiliary_data=None):
|
103 |
+
""" Forward propagation.
|
104 |
+
:param encoder_out: encoded images, a tensor of dimension (batch_size, enc_image_size, enc_image_size, encoder_dim)
|
105 |
+
:param captions: encoded captions, a tensor of dimension (batch_size, max_caption_length)
|
106 |
+
:param auxiliary_data: extra information associated with the images (batch_size, some_dim)
|
107 |
+
:return: scores for vocabulary, sorted encoded captions, decode lengths, weights, sort indices
|
108 |
+
"""
|
109 |
+
return self.sort_captions_and_forward(encoder_out, captions, auxiliary_data=auxiliary_data)
|
110 |
+
|
111 |
+
def sort_captions_and_forward(self, encoder_out, captions, auxiliary_data=None):
|
112 |
+
""" Feed forward that ...
|
113 |
+
:param encoder_out:
|
114 |
+
:param captions:
|
115 |
+
:return:
|
116 |
+
"""
|
117 |
+
batch_size = encoder_out.size(0)
|
118 |
+
encoder_dim = encoder_out.size(-1)
|
119 |
+
|
120 |
+
# Flatten image
|
121 |
+
encoder_out = encoder_out.view(batch_size, -1, encoder_dim) # (batch_size, num_pixels, encoder_dim)
|
122 |
+
num_pixels = encoder_out.size(1)
|
123 |
+
|
124 |
+
decode_lengths = torch.where(captions == self.vocab.eos)[1] # "<sos> I am <eos>" => decode_length = 3
|
125 |
+
# we do not feed <eos> as input to generate
|
126 |
+
# something after it
|
127 |
+
|
128 |
+
# Sort input data by decreasing lengths to reduce compute below
|
129 |
+
decode_lengths, sort_ind = decode_lengths.sort(dim=0, descending=True)
|
130 |
+
encoder_out = encoder_out[sort_ind]
|
131 |
+
captions = captions[sort_ind]
|
132 |
+
|
133 |
+
if auxiliary_data is not None:
|
134 |
+
auxiliary_data = auxiliary_data[sort_ind]
|
135 |
+
auxiliary_data = self.auxiliary_net(auxiliary_data)
|
136 |
+
|
137 |
+
# prepare for unravelling
|
138 |
+
embeddings = self.word_embedding(captions) # (batch_size, max_caption_length, embed_dim)
|
139 |
+
h, c = self.init_hidden_state(encoder_out) # (batch_size, decoder_dim)
|
140 |
+
decode_lengths = decode_lengths.tolist()
|
141 |
+
device = embeddings.device
|
142 |
+
|
143 |
+
# Create tensors to hold word prediction logits and attention maps (alphas)
|
144 |
+
predictions = torch.zeros(batch_size, max(decode_lengths), self.vocab_size).to(device)
|
145 |
+
alphas = torch.zeros(batch_size, max(decode_lengths), num_pixels).to(device)
|
146 |
+
|
147 |
+
# At each time-step, decode by
|
148 |
+
# attention-weighing the encoder's output based on the decoder's previous hidden state output
|
149 |
+
# then generate a new word in the decoder with the previous word and the attention weighted encoding
|
150 |
+
for t in range(max(decode_lengths)):
|
151 |
+
batch_size_t = sum([l > t for l in decode_lengths])
|
152 |
+
h = h[:batch_size_t] # effective h
|
153 |
+
attention_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_t], h)
|
154 |
+
gate = self.sigmoid(self.f_beta(h)) # gating scalar, (batch_size_t, encoder_dim)
|
155 |
+
attention_weighted_encoding = gate * attention_weighted_encoding
|
156 |
+
|
157 |
+
use_teacher_forcing = True if random.random() < self.teacher_forcing_ratio else False
|
158 |
+
|
159 |
+
if use_teacher_forcing or t == 0:
|
160 |
+
decoder_lang_input = embeddings[:batch_size_t, t]
|
161 |
+
else:
|
162 |
+
_, top_pred = preds[:batch_size_t].topk(1)
|
163 |
+
top_pred = top_pred.squeeze(-1).detach() # detach from history as input
|
164 |
+
decoder_lang_input = self.word_embedding(top_pred)
|
165 |
+
|
166 |
+
if auxiliary_data is not None:
|
167 |
+
auxiliary_data_t = auxiliary_data[:batch_size_t]
|
168 |
+
decoder_in = torch.cat([decoder_lang_input, attention_weighted_encoding, auxiliary_data_t], dim=1)
|
169 |
+
else:
|
170 |
+
decoder_in = torch.cat([decoder_lang_input, attention_weighted_encoding], dim=1)
|
171 |
+
|
172 |
+
h, c = self.decode_step(decoder_in, (h, c[:batch_size_t])) # (batch_size_t, decoder_dim)
|
173 |
+
|
174 |
+
preds = self.next_word(self.dropout(h)) # (batch_size_t, vocab_size)
|
175 |
+
predictions[:batch_size_t, t] = preds
|
176 |
+
alphas[:batch_size_t, t] = alpha
|
177 |
+
return predictions, captions, decode_lengths, alphas, sort_ind
|
178 |
+
|
179 |
+
def attend_and_predict_next_word(self, encoder_out, h, c, tokens, aux_data=None):
|
180 |
+
"""Given current hidden/memory state of the decoder and the input tokens, guess the next tokens
|
181 |
+
and update the hidden/memory states.
|
182 |
+
:param encoder_out: the grounding
|
183 |
+
:param h: current hidden state
|
184 |
+
:param c: current memory state
|
185 |
+
:param tokens: current token input to the decoder
|
186 |
+
:return: logits over vocabulary distribution, updated h/c
|
187 |
+
"""
|
188 |
+
attention_weighted_encoding, alpha = self.attention(encoder_out, h)
|
189 |
+
gate = self.sigmoid(self.f_beta(h)) # gating scalar, (batch_size_t, encoder_dim)
|
190 |
+
attention_weighted_encoding = gate * attention_weighted_encoding
|
191 |
+
embeddings = self.word_embedding(tokens) # (batch_size, embed_dim)
|
192 |
+
|
193 |
+
decoder_input = torch.cat([embeddings, attention_weighted_encoding], dim=1)
|
194 |
+
|
195 |
+
if aux_data is not None:
|
196 |
+
aux_feat = self.auxiliary_net(aux_data)
|
197 |
+
decoder_input = torch.cat([decoder_input, aux_feat], dim=1)
|
198 |
+
|
199 |
+
h, c = self.decode_step(decoder_input, (h, c)) # (batch_size_t, decoder_dim)
|
200 |
+
logits = self.next_word(h) # (batch_size_t, vocab_size)
|
201 |
+
return h, c, logits, alpha
|
202 |
+
|
203 |
+
|
204 |
+
def single_epoch_train(train_loader, model, criterion, optimizer, epoch, device, tb_writer=None, **kwargs):
|
205 |
+
""" Perform training for one epoch.
|
206 |
+
:param train_loader: DataLoader for training data
|
207 |
+
:param model: nn.ModuleDict with 'encoder', 'decoder' keys
|
208 |
+
:param criterion: loss layer
|
209 |
+
:param optimizer: optimizer
|
210 |
+
:param epoch: epoch number
|
211 |
+
:param device:
|
212 |
+
"""
|
213 |
+
alpha_c = kwargs.get('alpha_c', 1.0) # Weight of doubly stochastic (attention) regularization.
|
214 |
+
grad_clip = kwargs.get('grad_clip', 5.0) # Gradient clipping (norm magnitude)
|
215 |
+
print_freq = kwargs.get('print_freq', 100)
|
216 |
+
use_emotion = kwargs.get('use_emotion', False)
|
217 |
+
|
218 |
+
batch_time = AverageMeter() # forward prop. + back prop. time
|
219 |
+
data_time = AverageMeter() # data loading time
|
220 |
+
entropy_loss_meter = AverageMeter() # entropy loss (per word decoded)
|
221 |
+
total_loss_meter = AverageMeter()
|
222 |
+
start = time.time()
|
223 |
+
steps_taken = (epoch-1) * len(train_loader.dataset)
|
224 |
+
model.train()
|
225 |
+
|
226 |
+
for i, batch in enumerate(train_loader):
|
227 |
+
imgs = batch['image'].to(device)
|
228 |
+
caps = batch['tokens'].to(device)
|
229 |
+
b_size = len(imgs)
|
230 |
+
data_time.update(time.time() - start)
|
231 |
+
|
232 |
+
if use_emotion:
|
233 |
+
emotion = batch['emotion'].to(device)
|
234 |
+
res = model.decoder(model.encoder(imgs), caps, emotion)
|
235 |
+
else:
|
236 |
+
res = model.decoder(model.encoder(imgs), caps)
|
237 |
+
logits, caps_sorted, decode_lengths, alphas, sort_ind = res
|
238 |
+
|
239 |
+
# Since we decoded starting with <sos>, the targets are all words after <sos>, up to <eos>
|
240 |
+
targets = caps_sorted[:, 1:]
|
241 |
+
|
242 |
+
# Remove time-steps that we didn't decode at, or are pads
|
243 |
+
# pack_padded_sequence is an easy trick to do this
|
244 |
+
logits = pack_padded_sequence(logits, decode_lengths, batch_first=True)
|
245 |
+
targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)
|
246 |
+
|
247 |
+
# Calculate loss
|
248 |
+
ent_loss = criterion(logits.data, targets.data)
|
249 |
+
total_loss = ent_loss
|
250 |
+
|
251 |
+
# Add doubly stochastic attention regularization
|
252 |
+
# Note. some implementation simply do this like: d_atn_loss = alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
|
253 |
+
# here we take care of the fact that some samples in the same batch have more/less tokens than others.
|
254 |
+
if alpha_c > 0:
|
255 |
+
total_energy = torch.from_numpy(np.array(decode_lengths)) / alphas.shape[-1] # n_tokens / num_pixels
|
256 |
+
total_energy.unsqueeze_(-1) # B x 1
|
257 |
+
total_energy = total_energy.to(device)
|
258 |
+
d_atn_loss = alpha_c * ((total_energy - alphas.sum(dim=1)) ** 2).mean()
|
259 |
+
total_loss += d_atn_loss
|
260 |
+
|
261 |
+
# Back prop.
|
262 |
+
optimizer.zero_grad()
|
263 |
+
total_loss.backward()
|
264 |
+
if grad_clip is not None:
|
265 |
+
clip_grad_norm_(model.parameters(), grad_clip)
|
266 |
+
|
267 |
+
# Update weights
|
268 |
+
optimizer.step()
|
269 |
+
|
270 |
+
# Keep track of metrics
|
271 |
+
entropy_loss_meter.update(ent_loss.item(), sum(decode_lengths))
|
272 |
+
total_loss_meter.update(total_loss.item(), sum(decode_lengths))
|
273 |
+
batch_time.update(time.time() - start)
|
274 |
+
start = time.time()
|
275 |
+
steps_taken += b_size
|
276 |
+
|
277 |
+
# Print status
|
278 |
+
if print_freq is not None and i % print_freq == 0:
|
279 |
+
print('Epoch: [{0}][{1}/{2}]\t'
|
280 |
+
'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
|
281 |
+
'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
|
282 |
+
'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(epoch, i, len(train_loader),
|
283 |
+
batch_time=batch_time,
|
284 |
+
data_time=data_time,
|
285 |
+
loss=total_loss_meter))
|
286 |
+
if tb_writer is not None:
|
287 |
+
tb_writer.add_scalar('training-entropy-loss-with-batch-granularity', entropy_loss_meter.avg, steps_taken)
|
288 |
+
|
289 |
+
return total_loss_meter.avg
|
290 |
+
|
291 |
+
|
292 |
+
@torch.no_grad()
|
293 |
+
def negative_log_likelihood(model, data_loader, device):
|
294 |
+
"""
|
295 |
+
:param model:
|
296 |
+
:param data_loader:
|
297 |
+
:param device:
|
298 |
+
:param phase:
|
299 |
+
:return:
|
300 |
+
"""
|
301 |
+
model.eval()
|
302 |
+
nll = AverageMeter()
|
303 |
+
|
304 |
+
aux_data = None
|
305 |
+
for batch in data_loader:
|
306 |
+
imgs = batch['image'].to(device)
|
307 |
+
caps = batch['tokens'].to(device)
|
308 |
+
|
309 |
+
# TODO Refactor
|
310 |
+
if model.decoder.uses_aux_data:
|
311 |
+
aux_data = batch['emotion'].to(device)
|
312 |
+
|
313 |
+
logits, caps_sorted, decode_lengths, alphas, sort_ind = model.decoder(model.encoder(imgs), caps, aux_data)
|
314 |
+
|
315 |
+
# Since we decoded starting with <sos>, the targets are all words after <sos>, up to <eos>
|
316 |
+
targets = caps_sorted[:, 1:]
|
317 |
+
|
318 |
+
# Remove time-steps that we didn't decode at, or are pads
|
319 |
+
# pack_padded_sequence is an easy trick to do this
|
320 |
+
logits = pack_padded_sequence(logits, decode_lengths, batch_first=True)
|
321 |
+
targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)
|
322 |
+
|
323 |
+
# Calculate loss
|
324 |
+
loss = F.cross_entropy(logits.data, targets.data)
|
325 |
+
nll.update(loss.item(), sum(decode_lengths))
|
326 |
+
return nll.avg
|
327 |
+
|
328 |
+
|
329 |
+
@torch.no_grad()
|
330 |
+
def log_prob_of_caption(model, img, tokens, temperature=1):
|
331 |
+
"""Given a captioning model, return the log-probability of a caption given an image.
|
332 |
+
This version expects a batch of images, each associated with a single caption.
|
333 |
+
:param model: encoder/decoder speaker
|
334 |
+
:param img: Tensor B x channels x spatial-dims
|
335 |
+
:param tokens: Tensor B x max-n-tokens
|
336 |
+
:return log_probs: Tensor of size B x max-n-tokens holding the log-probs of each token of each caption
|
337 |
+
"""
|
338 |
+
|
339 |
+
encoder = model.encoder
|
340 |
+
decoder = model.decoder
|
341 |
+
|
342 |
+
assert all(tokens[:, 0] == decoder.vocab.sos)
|
343 |
+
|
344 |
+
max_steps = tokens.shape[1]
|
345 |
+
encoder_out = encoder(img)
|
346 |
+
batch_size = encoder_out.size(0)
|
347 |
+
encoder_dim = encoder_out.size(-1)
|
348 |
+
encoder_out = encoder_out.view(batch_size, -1, encoder_dim)
|
349 |
+
|
350 |
+
# Create tensors to hold log-probs
|
351 |
+
log_probs = torch.zeros(batch_size, max_steps).to(tokens.device)
|
352 |
+
h, c = decoder.init_hidden_state(encoder_out)
|
353 |
+
|
354 |
+
for t in range(max_steps - 1):
|
355 |
+
h, c, pred_t, _ = decoder.attend_and_predict_next_word(encoder_out, h, c, tokens[:, t])
|
356 |
+
|
357 |
+
if temperature != 1:
|
358 |
+
pred_t /= temperature
|
359 |
+
|
360 |
+
pred_t = F.log_softmax(pred_t, dim=1)
|
361 |
+
log_probs[:, t] = pred_t[torch.arange(batch_size), tokens[:, t+1]] # prob. of guessing next token
|
362 |
+
|
363 |
+
lens = torch.where(tokens == decoder.vocab.eos)[1] # true tokens + 1 for <eos>
|
364 |
+
mask = torch.zeros_like(log_probs)
|
365 |
+
mask[torch.arange(mask.shape[0]), lens] = 1
|
366 |
+
mask = mask.cumsum(dim=1).to(torch.bool)
|
367 |
+
log_probs.masked_fill_(mask, 0) # set to zero all positions after the true size of the caption
|
368 |
+
return log_probs, lens
|
369 |
+
|
370 |
+
|
371 |
+
@torch.no_grad()
|
372 |
+
def sample_captions(model, loader, max_utterance_len, sampling_rule, device, temperature=1,
|
373 |
+
topk=None, drop_unk=True, drop_bigrams=False):
|
374 |
+
"""
|
375 |
+
:param model:
|
376 |
+
:param loader:
|
377 |
+
:param max_utterance_len: maximum allowed length of captions
|
378 |
+
:param sampling_rule: (str) 'argmax' or 'multinomial', or 'topk'
|
379 |
+
:return:
|
380 |
+
attention_weights: (torch cpu Tensor) N-images x encoded_image_size (e.g., 7 x 7) x max_utterance_len
|
381 |
+
attention_weights[:,0] corresponds to the attention map over the <SOS> symbol
|
382 |
+
"""
|
383 |
+
if sampling_rule not in ['argmax', 'multinomial', 'topk']:
|
384 |
+
raise ValueError('Unknown sampling rule.')
|
385 |
+
|
386 |
+
model.eval()
|
387 |
+
all_predictions = []
|
388 |
+
attention_weights = []
|
389 |
+
unk = model.decoder.vocab.unk
|
390 |
+
|
391 |
+
use_aux_data = model.decoder.uses_aux_data
|
392 |
+
aux_data = None
|
393 |
+
|
394 |
+
for batch in loader:
|
395 |
+
imgs = batch['image'].to(device)
|
396 |
+
|
397 |
+
if use_aux_data:
|
398 |
+
aux_data = batch['emotion'].to(device)
|
399 |
+
|
400 |
+
encoder_out = model.encoder(imgs)
|
401 |
+
enc_image_size = encoder_out.size(1)
|
402 |
+
batch_size = encoder_out.size(0)
|
403 |
+
encoder_dim = encoder_out.size(-1)
|
404 |
+
|
405 |
+
# Flatten image
|
406 |
+
encoder_out = encoder_out.view(batch_size, -1, encoder_dim) # (batch_size, num_pixels, encoder_dim)
|
407 |
+
|
408 |
+
# Create tensors to hold word predictions
|
409 |
+
max_steps = max_utterance_len + 1 # one extra step for EOS marker
|
410 |
+
predictions = torch.zeros(batch_size, max_steps).to(device)
|
411 |
+
|
412 |
+
# Initialize decoder state
|
413 |
+
decoder = model.decoder
|
414 |
+
h, c = decoder.init_hidden_state(encoder_out) # (batch_size, decoder_dim)
|
415 |
+
|
416 |
+
# Tensor to store previous words at each step; now they're just <sos>
|
417 |
+
prev_words = torch.LongTensor([decoder.vocab.sos] * batch_size).to(device)
|
418 |
+
|
419 |
+
for t in range(max_steps):
|
420 |
+
h, c, pred_t, alpha = decoder.attend_and_predict_next_word(encoder_out, h, c, prev_words, aux_data=aux_data)
|
421 |
+
if t > 0: # at t=1 it sees <sos> as the previous word
|
422 |
+
alpha = alpha.view(-1, enc_image_size, enc_image_size) # (bsize, enc_image_size, enc_image_size)
|
423 |
+
attention_weights.append(alpha.cpu())
|
424 |
+
|
425 |
+
pred_t /= temperature
|
426 |
+
|
427 |
+
if drop_unk:
|
428 |
+
pred_t[:, unk] = -math.inf
|
429 |
+
|
430 |
+
if t > 0:
|
431 |
+
pred_t[:, prev_words] = -math.inf # avoid repeating the same word twice
|
432 |
+
|
433 |
+
if t > 1:
|
434 |
+
pred_t[:, predictions[:,t-2].long()] = -math.inf # avoid repeating the prev-prev word
|
435 |
+
|
436 |
+
if drop_bigrams and t > 1:
|
437 |
+
prev_usage = predictions[:, :t-1] # of the previous word (e.g, xx yy xx) (first xx)
|
438 |
+
x, y = torch.where(prev_usage == torch.unsqueeze(prev_words, -1))
|
439 |
+
y += 1 # word-after-last-in-prev-usage (e.g., yy in above)
|
440 |
+
y = prev_usage[x, y].long()
|
441 |
+
pred_t[x, y] = -math.inf
|
442 |
+
|
443 |
+
if sampling_rule == 'argmax':
|
444 |
+
prev_words = torch.argmax(pred_t, 1)
|
445 |
+
elif sampling_rule == 'multinomial':
|
446 |
+
probability = torch.softmax(pred_t, 1)
|
447 |
+
prev_words = torch.multinomial(probability, 1).squeeze_(-1)
|
448 |
+
elif sampling_rule == 'topk':
|
449 |
+
row_idx = torch.arange(batch_size)
|
450 |
+
row_idx = row_idx.view([1, -1]).repeat(topk, 1).t()
|
451 |
+
# do soft-max after you zero-out non topk (you could also do this before, ask me/Panos if need be:) )
|
452 |
+
val, ind = pred_t.topk(topk, dim=1)
|
453 |
+
val = torch.softmax(val, 1)
|
454 |
+
probability = torch.zeros_like(pred_t) # only the top-k logits will have non-zero prob.
|
455 |
+
probability[row_idx, ind] = val
|
456 |
+
prev_words = torch.multinomial(probability, 1).squeeze_(-1)
|
457 |
+
|
458 |
+
predictions[:, t] = prev_words
|
459 |
+
all_predictions.append(predictions.cpu().long())
|
460 |
+
all_predictions = torch.cat(all_predictions)
|
461 |
+
attention_weights = torch.stack(attention_weights, 1)
|
462 |
+
return all_predictions, attention_weights
|
463 |
+
|
464 |
+
|
465 |
+
@torch.no_grad()
|
466 |
+
def sample_captions_beam_search(model, data_loader, beam_size, device, temperature=1, max_iter=500,
|
467 |
+
drop_unk=True, drop_bigrams=False):
|
468 |
+
"""
|
469 |
+
:param model (encoder, decoder)
|
470 |
+
:param data_loader:
|
471 |
+
:param beam_size:
|
472 |
+
:param drop_unk:
|
473 |
+
:return:
|
474 |
+
|
475 |
+
hypotheses_alphas: list carrying the attention maps over the encoded-pixel space for each produced token.
|
476 |
+
Note: batch size must be one.
|
477 |
+
"""
|
478 |
+
|
479 |
+
if data_loader.batch_size != 1:
|
480 |
+
raise ValueError('not implemented for bigger batch-sizes')
|
481 |
+
|
482 |
+
model.eval()
|
483 |
+
decoder = model.decoder
|
484 |
+
vocab = model.decoder.vocab
|
485 |
+
|
486 |
+
captions = list()
|
487 |
+
hypotheses_alphas = list()
|
488 |
+
caption_log_prob = list()
|
489 |
+
|
490 |
+
aux_feat = None
|
491 |
+
for batch in tqdm.tqdm(data_loader): # For each image (batch-size = 1)
|
492 |
+
image = batch['image'].to(device) # (1, 3, H, W)
|
493 |
+
|
494 |
+
if model.decoder.uses_aux_data:
|
495 |
+
aux_data = batch['emotion'].to(device)
|
496 |
+
aux_feat = model.decoder.auxiliary_net(aux_data)
|
497 |
+
|
498 |
+
k = beam_size
|
499 |
+
encoder_out = model.encoder(image) # (1, enc_image_size, enc_image_size, encoder_dim)
|
500 |
+
enc_image_size = encoder_out.size(1)
|
501 |
+
encoder_dim = encoder_out.size(3)
|
502 |
+
|
503 |
+
# Flatten encoding
|
504 |
+
encoder_out = encoder_out.view(1, -1, encoder_dim) # (1, num_pixels, encoder_dim)
|
505 |
+
num_pixels = encoder_out.size(1)
|
506 |
+
|
507 |
+
# We'll treat the problem as having a batch size of k
|
508 |
+
encoder_out = encoder_out.expand(k, num_pixels, encoder_dim) # (k, num_pixels, encoder_dim)
|
509 |
+
|
510 |
+
# Tensor to store top k previous words at each step; now they're just <sos>
|
511 |
+
k_prev_words = torch.LongTensor([[vocab.sos]] * k).to(device) # (k, 1)
|
512 |
+
|
513 |
+
# Tensor to store top k sequences; now they're just <sos>
|
514 |
+
seqs = k_prev_words # (k, 1)
|
515 |
+
|
516 |
+
# Tensor to store top k sequences' scores; now they're just 0
|
517 |
+
top_k_scores = torch.zeros(k, 1).to(device) # (k, 1)
|
518 |
+
|
519 |
+
# Tensor to store top k sequences' alphas; now they're just 1s
|
520 |
+
seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(device) # (k, 1, enc_image_size, enc_image_size)
|
521 |
+
|
522 |
+
# Lists to store completed sequences and scores
|
523 |
+
complete_seqs = list()
|
524 |
+
complete_seqs_alpha = list()
|
525 |
+
complete_seqs_scores = list()
|
526 |
+
|
527 |
+
# Start decoding
|
528 |
+
step = 1
|
529 |
+
h, c = decoder.init_hidden_state(encoder_out)
|
530 |
+
|
531 |
+
# s (below) is a number less than or equal to k, because sequences are removed
|
532 |
+
# from this process once they hit <eos>
|
533 |
+
while True:
|
534 |
+
embeddings = decoder.word_embedding(k_prev_words).squeeze(1) # (s, embed_dim)
|
535 |
+
awe, alpha = decoder.attention(encoder_out, h) # (s, encoder_dim), (s, num_pixels)
|
536 |
+
alpha = alpha.view(-1, enc_image_size, enc_image_size) # (s, enc_image_size, enc_image_size)
|
537 |
+
gate = decoder.sigmoid(decoder.f_beta(h)) # gating scalar, (s, encoder_dim)
|
538 |
+
awe = gate * awe
|
539 |
+
decoder_input = torch.cat([embeddings, awe], dim=1)
|
540 |
+
|
541 |
+
if aux_feat is not None:
|
542 |
+
af = torch.repeat_interleave(aux_feat, decoder_input.shape[0], dim=0)
|
543 |
+
decoder_input = torch.cat([decoder_input, af], dim=1)
|
544 |
+
|
545 |
+
h, c = decoder.decode_step(decoder_input, (h, c)) # (s, decoder_dim)
|
546 |
+
scores = decoder.next_word(h) # (s, vocab_size)
|
547 |
+
|
548 |
+
if temperature != 1:
|
549 |
+
scores /= temperature
|
550 |
+
|
551 |
+
scores = F.log_softmax(scores, dim=1)
|
552 |
+
|
553 |
+
if drop_unk:
|
554 |
+
scores[:, vocab.unk] = -math.inf
|
555 |
+
|
556 |
+
if drop_bigrams and step > 2:
|
557 |
+
# drop bi-grams with frequency higher than 1.
|
558 |
+
prev_usage = seqs[:, :step-1]
|
559 |
+
x, y = torch.where(prev_usage == k_prev_words)
|
560 |
+
y += 1 # word-after-last-in-prev-usage
|
561 |
+
y = seqs[x, y]
|
562 |
+
scores[x,y] = -math.inf
|
563 |
+
|
564 |
+
if step > 2:
|
565 |
+
## drop "x and x" repetitions (same word right before and after 'and')
|
566 |
+
and_token = decoder.vocab('and')
|
567 |
+
x, y = torch.where(k_prev_words == and_token)
|
568 |
+
pre_and_word = seqs[x, step-2]
|
569 |
+
scores[x, pre_and_word] = -math.inf
|
570 |
+
|
571 |
+
# Add log-probabilities
|
572 |
+
scores = top_k_scores.expand_as(scores) + scores # (s, vocab_size)
|
573 |
+
|
574 |
+
# For the first step, all k points will have the same scores (since same k previous words, h, c)
|
575 |
+
if step == 1:
|
576 |
+
top_k_scores, top_k_words = scores[0].topk(k, 0, True, True) # (s)
|
577 |
+
else:
|
578 |
+
# Unroll and find top scores, and their unrolled indices
|
579 |
+
top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True) # (s)
|
580 |
+
|
581 |
+
# Convert unrolled indices to actual indices of scores
|
582 |
+
prev_word_inds = top_k_words // len(vocab)  # (s) integer division: beam index each word came from
|
583 |
+
next_word_inds = top_k_words % len(vocab) # (s)
|
584 |
+
|
585 |
+
# Add new words to sequences
|
586 |
+
seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1) # (s, step+1)
|
587 |
+
seqs_alpha = torch.cat([seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
|
588 |
+
dim=1) # (s, step+1, enc_image_size, enc_image_size)
|
589 |
+
|
590 |
+
# Which sequences are incomplete (didn't reach <eos>)?
|
591 |
+
incomplete_inds = [ind for ind, word in enumerate(next_word_inds) if word != vocab.eos]
|
592 |
+
complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))
|
593 |
+
|
594 |
+
# Set aside complete sequences
|
595 |
+
if len(complete_inds) > 0:
|
596 |
+
complete_seqs.extend(seqs[complete_inds].tolist())
|
597 |
+
complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
|
598 |
+
complete_seqs_scores.extend(top_k_scores[complete_inds].tolist())
|
599 |
+
k -= len(complete_inds) # reduce beam length accordingly
|
600 |
+
|
601 |
+
# Proceed with incomplete sequences
|
602 |
+
if k == 0:
|
603 |
+
break
|
604 |
+
seqs = seqs[incomplete_inds]
|
605 |
+
seqs_alpha = seqs_alpha[incomplete_inds]
|
606 |
+
|
607 |
+
h = h[prev_word_inds[incomplete_inds]]
|
608 |
+
c = c[prev_word_inds[incomplete_inds]]
|
609 |
+
encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
|
610 |
+
top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
|
611 |
+
k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)
|
612 |
+
|
613 |
+
# Break if things have been going on too long
|
614 |
+
if step > max_iter:
|
615 |
+
break
|
616 |
+
step += 1
|
617 |
+
|
618 |
+
s_idx = np.argsort(complete_seqs_scores)[::-1]
|
619 |
+
complete_seqs_scores = [complete_seqs_scores[i] for i in s_idx]
|
620 |
+
complete_seqs = [complete_seqs[i] for i in s_idx]
|
621 |
+
alphas = [complete_seqs_alpha[i] for i in s_idx]
|
622 |
+
|
623 |
+
captions.append(complete_seqs)
|
624 |
+
caption_log_prob.append(complete_seqs_scores)
|
625 |
+
hypotheses_alphas.append(alphas)
|
626 |
+
return captions, hypotheses_alphas, caption_log_prob
|
627 |
+
|
628 |
+
|
629 |
+
@torch.no_grad()
|
630 |
+
def properize_captions(captions, vocab, add_sos=True):
|
631 |
+
"""
|
632 |
+
:param captions: torch Tensor holding M x max_len integers
|
633 |
+
:param vocab:
|
634 |
+
:param add_sos:
|
635 |
+
:return:
|
636 |
+
"""
|
637 |
+
# ensure they end with eos.
|
638 |
+
|
639 |
+
new_captions = []
|
640 |
+
missed_eos = 0
|
641 |
+
for caption in captions.cpu():
|
642 |
+
ending = torch.where(caption == vocab.eos)[0]
|
643 |
+
if len(ending) >= 1: # at least one <eos> symbol is found
|
644 |
+
first_eos = ending[0]
|
645 |
+
if first_eos < len(caption):
|
646 |
+
caption[first_eos+1:] = vocab.pad
|
647 |
+
else:
|
648 |
+
missed_eos += 1
|
649 |
+
caption[-1] = vocab.eos
|
650 |
+
new_captions.append(caption)
|
651 |
+
|
652 |
+
new_captions = torch.stack(new_captions)
|
653 |
+
|
654 |
+
dummy = torch.unique(torch.where(new_captions == vocab.eos)[0])
|
655 |
+
assert len(dummy) == len(new_captions) # assert all have an eos.
|
656 |
+
|
657 |
+
if add_sos:
|
658 |
+
sos = torch.LongTensor([vocab.sos] * len(new_captions)).view(-1, 1)
|
659 |
+
new_captions = torch.cat([sos, new_captions], dim=1)
|
660 |
+
if missed_eos > 0:
|
661 |
+
warnings.warn('{} sentences without <eos> were generated.'.format(missed_eos))
|
662 |
+
return new_captions
|
663 |
+
|
664 |
+
|
665 |
+
def log_prob_of_dataset(model, data_loader, device, temperature=1):
|
666 |
+
all_log_probs = []
|
667 |
+
all_lens = []
|
668 |
+
model.eval()
|
669 |
+
for batch in data_loader:
|
670 |
+
imgs = batch['image'].to(device)
|
671 |
+
tokens = batch['tokens'].to(device)
|
672 |
+
log_probs, n_tokens = log_prob_of_caption(model, imgs, tokens, temperature=temperature)
|
673 |
+
all_log_probs.append(log_probs.cpu())
|
674 |
+
all_lens.append(n_tokens.cpu())
|
675 |
+
|
676 |
+
all_log_probs = torch.cat(all_log_probs, dim=0)
|
677 |
+
all_lens = torch.cat(all_lens, dim=0)
|
678 |
+
return all_log_probs, all_lens
|
679 |
+
|
680 |
+
|
681 |
+
def perplexity_of_dataset(model, data_loader, device):
|
682 |
+
""" for a test corpus perplexity is 2 ^ {-l} where l is log_2(prob_of_sentences) * M, where M is the number
|
683 |
+
of tokens in the dataset.
|
684 |
+
:param model:
|
685 |
+
:param data_loader:
|
686 |
+
:param device:
|
687 |
+
:return:
|
688 |
+
"""
|
689 |
+
all_log_probs, all_lens = log_prob_of_dataset(model, data_loader, device)
|
690 |
+
log_prob_per_sent = torch.sum(all_log_probs, 1).double() # sum over tokens to get the log_p of each utterance
|
691 |
+
prob_per_sent = torch.exp(log_prob_per_sent)
|
692 |
+
n_tokens = torch.sum(all_lens).double() # number of words in dataset
|
693 |
+
average_log_prob = torch.sum(torch.log2(prob_per_sent)) / n_tokens # log_2 for perplexity
|
694 |
+
perplexity = 2.0 ** (-average_log_prob)
|
695 |
+
return perplexity, prob_per_sent, all_lens
|
696 |
+
|
imageprocessing/artemis/artemis/neural_models/distances.py
ADDED
@@ -0,0 +1,67 @@
1 |
+
"""
|
2 |
+
Utilities for distance measurements in GPU.
|
3 |
+
|
4 |
+
The MIT License (MIT)
|
5 |
+
Originally created at 07/2019, for Python 3.x
|
6 |
+
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
|
7 |
+
"""
|
8 |
+
|
9 |
+
import torch
|
10 |
+
from torch.nn.functional import normalize
|
11 |
+
|
12 |
+
def cdist(x1, x2, epsilon=1e-16):
|
13 |
+
"""
|
14 |
+
:param x1: N x Feat-dim
|
15 |
+
:param x2: N x Feat-dim
|
16 |
+
:param epsilon:
|
17 |
+
:return: N x N matrix
|
18 |
+
"""
|
19 |
+
x1_norm = x1.pow(2).sum(dim=-1, keepdim=True)
|
20 |
+
x2_norm = x2.pow(2).sum(dim=-1, keepdim=True)
|
21 |
+
inner_prod = torch.mm(x1, x2.t())
|
22 |
+
res = x1_norm - 2.0 * inner_prod + x2_norm.t() # You need to transpose for broadcasting to be correct.
|
23 |
+
res.clamp_min_(epsilon).sqrt_()
|
24 |
+
return res
|
25 |
+
|
26 |
+
|
27 |
+
def exclude_identity_from_neighbor_search(all_pairwise_dists, identities):
|
28 |
+
"""
|
29 |
+
:param all_pairwise_dists: M x N matrix of distances
|
30 |
+
:param identities: the k-th row of all_pairwise_dists, should exclude the identities[k] entry.
|
31 |
+
:return:
|
32 |
+
"""
|
33 |
+
all_pairwise_dists[range(all_pairwise_dists.size(0)), identities] = float("Inf")
|
34 |
+
return all_pairwise_dists
|
35 |
+
|
36 |
+
|
37 |
+
def k_euclidean_neighbors(k, x1, x2, exclude_identity=False, identities=None):
|
38 |
+
""" For each row vector in x1 the k-nearest neighbors in x2.
|
39 |
+
:param k:
|
40 |
+
:param x1: M x Feat-dim
|
41 |
+
:param x2: N x Feat-dim
|
42 |
+
:param exclude_identity:
|
43 |
+
:param identities:
|
44 |
+
:return: M x k
|
45 |
+
"""
|
46 |
+
all_cross_pairwise_dists = cdist(x1, x2)
|
47 |
+
if exclude_identity:
|
48 |
+
all_cross_pairwise_dists = exclude_identity_from_neighbor_search(all_cross_pairwise_dists, identities)
|
49 |
+
n_dists, n_ids = all_cross_pairwise_dists.topk(k=k, dim=1, largest=False, sorted=True)
|
50 |
+
return n_dists, n_ids
|
51 |
+
|
52 |
+
|
53 |
+
def k_cosine_neighbors(k, x1, x2, exclude_identity=False, identities=None):
|
54 |
+
""" For each row vector in x1 the k-nearest neighbors in x2.
|
55 |
+
:param k:
|
56 |
+
:param x1: M x Feat-dim
|
57 |
+
:param x2: N x Feat-dim
|
58 |
+
:param exclude_identity:
|
59 |
+
:param identities:
|
60 |
+
:return: M x k
|
61 |
+
"""
|
62 |
+
all_cross_pairwise_dists = torch.mm(normalize(x1, dim=1, p=2), normalize(x2, dim=1, p=2).t())
|
63 |
+
all_cross_pairwise_dists = 1.0 - all_cross_pairwise_dists
|
64 |
+
if exclude_identity:
|
65 |
+
all_cross_pairwise_dists = exclude_identity_from_neighbor_search(all_cross_pairwise_dists, identities)
|
66 |
+
n_dists, n_ids = all_cross_pairwise_dists.topk(k=k, dim=1, largest=False, sorted=True)
|
67 |
+
return n_dists, n_ids
|
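A minimal usage sketch of the distance helpers above; the matrix sizes are assumed, and the artemis package from this commit is assumed to be importable:

# Minimal sketch; matrix sizes are assumed.
import torch
from artemis.neural_models.distances import cdist, k_euclidean_neighbors

x1 = torch.rand(5, 16)                             # 5 query vectors, 16-d
x2 = torch.rand(8, 16)                             # 8 reference vectors, 16-d
dists = cdist(x1, x2)                              # (5, 8) pairwise Euclidean distances
n_dists, n_ids = k_euclidean_neighbors(3, x1, x2)  # 3 nearest rows of x2 per row of x1
print(dists.shape, n_ids.shape)                    # torch.Size([5, 8]) torch.Size([5, 3])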
imageprocessing/artemis/artemis/neural_models/image_emotion_clf.py
ADDED
@@ -0,0 +1,75 @@
1 |
+
"""
|
2 |
+
Given an image guess a distribution over the emotion labels.
|
3 |
+
|
4 |
+
The MIT License (MIT)
|
5 |
+
Originally created in 2020, for Python 3.x
|
6 |
+
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
|
7 |
+
"""
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.nn.functional as F
|
11 |
+
from torch import nn
|
12 |
+
from tqdm.notebook import tqdm as tqdm_notebook
|
13 |
+
|
14 |
+
from ..utils.stats import AverageMeter
|
15 |
+
|
16 |
+
|
17 |
+
class ImageEmotionClassifier(nn.Module):
|
18 |
+
def __init__(self, img_encoder, clf_head):
|
19 |
+
super(ImageEmotionClassifier, self).__init__()
|
20 |
+
self.img_encoder = img_encoder
|
21 |
+
self.clf_head = clf_head
|
22 |
+
|
23 |
+
def __call__(self, img):
|
24 |
+
feat = self.img_encoder(img)
|
25 |
+
logits = self.clf_head(feat)
|
26 |
+
return logits
|
27 |
+
|
28 |
+
|
29 |
+
def single_epoch_train(model, data_loader, criterion, optimizer, device):
|
30 |
+
epoch_loss = AverageMeter()
|
31 |
+
model.train()
|
32 |
+
for batch in tqdm_notebook(data_loader):
|
33 |
+
img = batch['image'].to(device)
|
34 |
+
labels = batch['label'].to(device) # emotion_distribution
|
35 |
+
logits = model(img)
|
36 |
+
|
37 |
+
# Calculate loss
|
38 |
+
loss = criterion(logits, labels)
|
39 |
+
|
40 |
+
# Back prop.
|
41 |
+
optimizer.zero_grad()
|
42 |
+
loss.backward()
|
43 |
+
optimizer.step()
|
44 |
+
|
45 |
+
b_size = len(labels)
|
46 |
+
epoch_loss.update(loss.item(), b_size)
|
47 |
+
return epoch_loss.avg
|
48 |
+
|
49 |
+
|
50 |
+
@torch.no_grad()
|
51 |
+
def evaluate_on_dataset(model, data_loader, criterion, device, detailed=True, kl_div=True):
|
52 |
+
epoch_loss = AverageMeter()
|
53 |
+
model.eval()
|
54 |
+
epoch_confidence = []
|
55 |
+
for batch in tqdm_notebook(data_loader):
|
56 |
+
img = batch['image'].to(device)
|
57 |
+
labels = batch['label'].to(device) # emotion_distribution
|
58 |
+
logits = model(img)
|
59 |
+
|
60 |
+
# Calculate loss
|
61 |
+
loss = criterion(logits, labels)
|
62 |
+
|
63 |
+
if detailed:
|
64 |
+
if kl_div:
|
65 |
+
epoch_confidence.append(torch.exp(logits).cpu()) # logits are log-soft-max
|
66 |
+
else:
|
67 |
+
epoch_confidence.append(F.softmax(logits, dim=-1).cpu()) # logits are pure logits
|
68 |
+
|
69 |
+
b_size = len(labels)
|
70 |
+
epoch_loss.update(loss.item(), b_size)
|
71 |
+
|
72 |
+
if detailed:
|
73 |
+
epoch_confidence = torch.cat(epoch_confidence).numpy()
|
74 |
+
|
75 |
+
return epoch_loss.avg, epoch_confidence
|
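A hypothetical wiring sketch for the classifier above: any image encoder that yields a flat feature vector can be paired with an MLP head (defined later in this commit); when training with KL-divergence against soft emotion distributions, the head should end in a log-softmax, matching the kl_div branch of evaluate_on_dataset. The sizes, the stand-in encoder, the number of classes, and the dummy targets below are all assumptions, not part of the package:

# All sizes and the stand-in encoder below are assumptions for illustration.
import torch
from torch import nn
from artemis.neural_models.mlp import MLP
from artemis.neural_models.image_emotion_clf import ImageEmotionClassifier

dummy_encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 64 * 64, 256))  # stand-in image encoder
clf_head = MLP(in_feat_dims=256, out_channels=[128, 9],
               closure=nn.LogSoftmax(dim=-1))        # 9 = assumed number of emotion classes
model = ImageEmotionClassifier(dummy_encoder, clf_head)

imgs = torch.rand(2, 3, 64, 64)
log_probs = model(imgs)                              # (2, 9) log-probabilities
criterion = nn.KLDivLoss(reduction='batchmean')
soft_targets = torch.full((2, 9), 1.0 / 9)           # dummy soft emotion distributions
print(criterion(log_probs, soft_targets))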
imageprocessing/artemis/artemis/neural_models/lstm_encoder.py
ADDED
@@ -0,0 +1,95 @@
1 |
+
"""
|
2 |
+
Encoding discrete tokens with LSTMs.
|
3 |
+
|
4 |
+
The MIT License (MIT)
|
5 |
+
Originally created in 2019 (updated in January 2020), for Python 3.x
|
6 |
+
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
|
7 |
+
"""
|
8 |
+
|
9 |
+
import torch
|
10 |
+
from torch import nn
|
11 |
+
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
|
12 |
+
|
13 |
+
|
14 |
+
class LSTMEncoder(nn.Module):
|
15 |
+
"""A feed-forward network that processes discrete tokens via an LSTM."""
|
16 |
+
|
17 |
+
def __init__(self, n_input, n_hidden, word_embedding, word_transformation=None,
|
18 |
+
bidirectional=False, init_h=None, init_c=None, eos_symbol=None, feature_type='last'):
|
19 |
+
"""
|
20 |
+
:param n_input: (int) input dim of LSTM
|
21 |
+
:param n_hidden: (int) hidden dim of LSTM
|
22 |
+
:param word_embedding: (nn.Embedding) vectors representing words
|
23 |
+
:param word_transformation: (opt, nn.Module) to apply some transformation on the word
|
24 |
+
embeddings before they are consumed by the LSTM.
|
25 |
+
:param bidirectional: boolean, whether to use a bi-RNN
|
26 |
+
:param init_h: (opt, nn.Module) for initializing LSTM hidden state
|
27 |
+
:param init_c: (opt, nn.Module) for initializing LSTM memory
|
28 |
+
:param eos_symbol: (opt, int) integer marking end of sentence
|
29 |
+
:param feature_type: (opt, string) how to process the output of the LSTM,
|
30 |
+
valid options = ['last', 'max', 'mean', 'all']
|
31 |
+
"""
|
32 |
+
|
33 |
+
super().__init__()
|
34 |
+
self.word_embedding = word_embedding
|
35 |
+
self.n_hidden = n_hidden
|
36 |
+
self.eos = eos_symbol
|
37 |
+
self.feature_type = feature_type
|
38 |
+
|
39 |
+
# auxiliary (optional) networks
|
40 |
+
self.word_transformation = word_transformation
|
41 |
+
self.init_h = init_h
|
42 |
+
self.init_c = init_c
|
43 |
+
|
44 |
+
self.rnn = nn.LSTM(input_size=n_input, hidden_size=n_hidden,
|
45 |
+
bidirectional=bidirectional, batch_first=True)
|
46 |
+
|
47 |
+
def out_dim(self):
|
48 |
+
rnn = self.rnn
|
49 |
+
mult = 2 if rnn.bidirectional else 1
|
50 |
+
return rnn.num_layers * rnn.hidden_size * mult
|
51 |
+
|
52 |
+
def __call__(self, tokens, grounding=None, len_of_sequence=None):
|
53 |
+
"""
|
54 |
+
:param tokens:
|
55 |
+
:param grounding: (Tensor, opt)
|
56 |
+
:param len_of_sequence: (Tensor, opt) tensor of shape (B,) carrying the length of each token sequence
|
57 |
+
:return: the encoded by the LSTM tokens
|
58 |
+
Note: tokens are expected to start with the <sos> token, which is skipped by the encoder
|
59 |
+
"""
|
60 |
+
w_emb = self.word_embedding(tokens[:, 1:]) # skip <sos>
|
61 |
+
if self.word_transformation is not None:
|
62 |
+
w_emb = self.word_transformation(w_emb)
|
63 |
+
|
64 |
+
device = w_emb.device
|
65 |
+
|
66 |
+
if len_of_sequence is None:
|
67 |
+
len_of_sequence = torch.where(tokens == self.eos)[1] - 1 # ignore <sos>
|
68 |
+
|
69 |
+
x_packed = pack_padded_sequence(w_emb, len_of_sequence, enforce_sorted=False, batch_first=True)
|
70 |
+
|
71 |
+
self.rnn.flatten_parameters()
|
72 |
+
|
73 |
+
if grounding is not None:
|
74 |
+
h0 = self.init_h(grounding).unsqueeze(0) # rep-mat if multiple LSTM cells.
|
75 |
+
c0 = self.init_c(grounding).unsqueeze(0)
|
76 |
+
rnn_out, _ = self.rnn(x_packed, (h0, c0))
|
77 |
+
else:
|
78 |
+
rnn_out, _ = self.rnn(x_packed)
|
79 |
+
|
80 |
+
rnn_out, dummy = pad_packed_sequence(rnn_out, batch_first=True)
|
81 |
+
|
82 |
+
if self.feature_type == 'last':
|
83 |
+
batch_size = len(tokens)
|
84 |
+
lang_feat = rnn_out[torch.arange(batch_size), len_of_sequence-1]
|
85 |
+
elif self.feature_type == 'max':
|
86 |
+
lang_feat = rnn_out.max(1).values
|
87 |
+
elif self.feature_type == 'mean':
|
88 |
+
lang_feat = rnn_out.sum(1)
|
89 |
+
lang_feat /= len_of_sequence.view(-1, 1) # broadcasting
|
90 |
+
elif self.feature_type == 'all':
|
91 |
+
lang_feat = rnn_out
|
92 |
+
else:
|
93 |
+
raise ValueError('Unknown LSTM feature requested.')
|
94 |
+
|
95 |
+
return lang_feat
|
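A minimal usage sketch of LSTMEncoder with a toy vocabulary (the token ids, sizes, and the two example sequences are assumptions): each sequence starts with <sos> and contains exactly one <eos>, as the encoder expects.

# Toy vocabulary: 0 = <pad>, 1 = <sos>, 2 = <eos>; all sizes are assumed.
import torch
from torch import nn
from artemis.neural_models.lstm_encoder import LSTMEncoder

word_embedding = nn.Embedding(num_embeddings=20, embedding_dim=32, padding_idx=0)
encoder = LSTMEncoder(n_input=32, n_hidden=64, word_embedding=word_embedding,
                      eos_symbol=2, feature_type='last')

tokens = torch.tensor([[1, 5, 6, 7, 2, 0],   # <sos> w w w <eos> <pad>
                       [1, 8, 9, 2, 0, 0]])  # <sos> w w <eos> <pad> <pad>
features = encoder(tokens)                   # (2, 64): last LSTM state per sequence
print(features.shape)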
imageprocessing/artemis/artemis/neural_models/mlp.py
ADDED
@@ -0,0 +1,78 @@
|
1 |
+
"""
|
2 |
+
Multi-Linear Perceptron packaged nicely for convenience.
|
3 |
+
|
4 |
+
The MIT License (MIT)
|
5 |
+
Originally created in late 2019, for Python 3.x. Last updated in 2021.
|
6 |
+
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
|
7 |
+
"""
|
8 |
+
|
9 |
+
from torch import nn
|
10 |
+
import numpy as np
|
11 |
+
|
12 |
+
def optional_repeat(value, times):
|
13 |
+
""" helper function, to repeat a parameter's value many times
|
14 |
+
:param value: a single basic python type (int, float, boolean, string), or a list whose length equals times
|
15 |
+
:param times: int, how many times to repeat
|
16 |
+
:return: a list with length equal to times
|
17 |
+
"""
|
18 |
+
if type(value) is not list:
|
19 |
+
value = [value]
|
20 |
+
|
21 |
+
if len(value) != 1 and len(value) != times:
|
22 |
+
raise ValueError('The value should be a singleton, or be a list with times length.')
|
23 |
+
|
24 |
+
if len(value) == times:
|
25 |
+
return value # do nothing
|
26 |
+
|
27 |
+
return np.array(value).repeat(times).tolist()
|
28 |
+
|
29 |
+
|
30 |
+
class MLP(nn.Module):
|
31 |
+
""" Multi-near perceptron. That is a k-layer deep network where each layer is a fully-connected layer, with
|
32 |
+
(optionally) batch-norm, a non-linearity and dropout. The last layer (output) is always a 'pure' linear function.
|
33 |
+
"""
|
34 |
+
def __init__(self, in_feat_dims, out_channels, b_norm=True, dropout_rate=0,
|
35 |
+
non_linearity=nn.ReLU(inplace=True), closure=None):
|
36 |
+
"""Constructor
|
37 |
+
:param in_feat_dims: input feature dimensions
|
38 |
+
:param out_channels: list of ints describing the number of hidden/final neurons of each layer.
|
39 |
+
:param b_norm: True/False, or list of booleans
|
40 |
+
:param dropout_rate: int, or list of int values
|
41 |
+
:param non_linearity: nn.Module
|
42 |
+
:param closure: optional nn.Module to use at the end of the MLP
|
43 |
+
"""
|
44 |
+
super(MLP, self).__init__()
|
45 |
+
self.hidden_dimensions = out_channels[:-1]
|
46 |
+
self.embedding_dimension = out_channels[-1]
|
47 |
+
|
48 |
+
n_layers = len(out_channels)
|
49 |
+
dropout_rate = optional_repeat(dropout_rate, n_layers-1)
|
50 |
+
b_norm = optional_repeat(b_norm, n_layers-1)
|
51 |
+
|
52 |
+
previous_feat_dim = in_feat_dims
|
53 |
+
all_ops = []
|
54 |
+
|
55 |
+
for depth in range(len(out_channels)):
|
56 |
+
out_dim = out_channels[depth]
|
57 |
+
affine_op = nn.Linear(previous_feat_dim, out_dim, bias=True)
|
58 |
+
all_ops.append(affine_op)
|
59 |
+
|
60 |
+
if depth < len(out_channels) - 1:
|
61 |
+
if b_norm[depth]:
|
62 |
+
all_ops.append(nn.BatchNorm1d(out_dim))
|
63 |
+
|
64 |
+
if non_linearity is not None:
|
65 |
+
all_ops.append(non_linearity)
|
66 |
+
|
67 |
+
if dropout_rate[depth] > 0:
|
68 |
+
all_ops.append(nn.Dropout(p=dropout_rate[depth]))
|
69 |
+
|
70 |
+
previous_feat_dim = out_dim
|
71 |
+
|
72 |
+
if closure is not None:
|
73 |
+
all_ops.append(closure)
|
74 |
+
|
75 |
+
self.net = nn.Sequential(*all_ops)
|
76 |
+
|
77 |
+
def __call__(self, x):
|
78 |
+
return self.net(x)
|
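A minimal usage sketch (not part of the commit); the layer sizes and the 9-way output are illustrative placeholders, e.g. for a small emotion-classification head on 256-d features.

import torch

head = MLP(in_feat_dims=256, out_channels=[128, 64, 9], b_norm=True, dropout_rate=0.2)
x = torch.randn(4, 256)   # a batch of 4 feature vectors
logits = head(x)          # shape (4, 9); the final layer is a plain linear map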
imageprocessing/artemis/artemis/neural_models/resnet_encoder.py
ADDED
@@ -0,0 +1,103 @@
"""
Resnet Wrapper.

The MIT License (MIT)
Originally created in late 2019, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""


import torch
from torch import nn
from torchvision import models


class ResnetEncoder(nn.Module):
    """Convenience wrapper around resnet models"""
    def __init__(self, backbone, adapt_image_size=None, drop=2, pretrained=True, verbose=False):
        """
        :param backbone: (string) resnet-S, S in [18, 34, 50, 101]
        :param adapt_image_size: (opt, int) if given, the forward feature has shape
            [B, adapt_image_size, adapt_image_size, feat-dim]
        :param drop: how many of the last layers/blocks to drop.
        :param pretrained: (Boolean)
        :param verbose: (opt, Boolean) if True, print actions taken.
        Note: in total there are 10 layers/blocks. The last two are an adaptive_pooling and an FC; the
        previous layers give rise to convolutional maps of increasing spatial size.
        """

        if drop == 0 and adapt_image_size is not None:
            raise ValueError('Trying to apply adaptive pooling while keeping the entire model (drop=0).')

        super(ResnetEncoder, self).__init__()
        backbones = {
            'resnet18': models.resnet18,
            'resnet34': models.resnet34,
            'resnet50': models.resnet50,
            'resnet101': models.resnet101,
        }

        self.name = backbone
        self.drop = drop
        self.resnet = backbones[self.name](pretrained=pretrained)

        # Remove linear and last adaptive pool layer
        if drop > 0:
            modules = list(self.resnet.children())
            if verbose:
                print('Removing the last {} layers of a {}'.format(drop, self.name))
                print(modules[-drop:])
            modules = modules[:-drop]
            self.resnet = nn.Sequential(*modules)

        self.adaptive_pool = None
        if adapt_image_size is not None:
            self.adaptive_pool = nn.AdaptiveAvgPool2d((adapt_image_size, adapt_image_size))

        if pretrained:
            for p in self.resnet.parameters():
                p.requires_grad = False

    def __call__(self, images):
        """Forward prop.
        :param images: images, a tensor of dimensions (batch_size, 3, image_size, image_size)
        :return: encoded images
        """
        out = self.resnet(images)  # (B, F, ceil(image_size/32), ceil(image_size/32))

        if self.adaptive_pool is not None:
            out = self.adaptive_pool(out)  # (B, F, adapt_image_size, adapt_image_size)

        if self.drop > 0:  # convolutional-like output
            out = out.permute(0, 2, 3, 1)  # bring feature-dim last.
            out = torch.squeeze(torch.squeeze(out, 1), 1)  # in case adapt_image_size == 1, remove singleton dimensions
        return out

    def unfreeze(self, level=5, verbose=False):
        """Allow the computation of gradients for blocks after level.
        The smaller the level, the less of the pretrained resnet stays frozen.
        """
        all_layers = list(self.resnet.children())

        if verbose:
            ll = len(all_layers)
            print('From {} layers, you are unfreezing the last {}'.format(ll, ll - level))

        for c in all_layers[level:]:
            for p in c.parameters():
                p.requires_grad = True
        return self

    def embedding_dimension(self):
        """The feature (channel) dimension of the last layer"""
        if self.drop == 0:
            return 1000  # ImageNet classes

        if self.drop == 2:
            return 512 if int(self.name.replace('resnet', '')) < 50 else 2048

        if self.drop == 3:
            return 256 if int(self.name.replace('resnet', '')) < 50 else 1024

        raise NotImplementedError
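A minimal usage sketch (not part of the commit), showing the spatial feature grid an attention-based decoder would consume; the backbone choice, image size and pretrained=False are illustrative assumptions.

import torch

encoder = ResnetEncoder('resnet34', adapt_image_size=7, drop=2, pretrained=False)
images = torch.randn(2, 3, 256, 256)
feats = encoder(images)                                   # (2, 7, 7, 512): feature dim comes last
assert feats.shape[-1] == encoder.embedding_dimension()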
imageprocessing/artemis/artemis/neural_models/show_attend_tell.py
ADDED
@@ -0,0 +1,45 @@
"""
A custom implementation of Show-Attend-&-Tell for ArtEmis: Affective Language for Visual Art

The MIT License (MIT)
Originally created in early 2020, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

from torch import nn
from .resnet_encoder import ResnetEncoder
from .attentive_decoder import AttentiveDecoder


def describe_model(vocab, args):
    """ Describe the architecture of a SAT speaker with a resnet encoder.
    :param vocab: Vocabulary used by the decoder (its size and padding index are used here)
    :param args: argparse.Namespace holding the model hyper-parameters
    :return: nn.ModuleDict with the 'encoder' and 'decoder' sub-modules
    """
    word_embedding = nn.Embedding(len(vocab), args.word_embedding_dim, padding_idx=vocab.pad)

    encoder = ResnetEncoder(args.vis_encoder, adapt_image_size=args.atn_spatial_img_size).unfreeze()
    encoder_out_dim = encoder.embedding_dimension()

    emo_ground_dim = 0
    emo_projection_net = None
    if args.use_emo_grounding:
        emo_in_dim = args.emo_grounding_dims[0]
        emo_ground_dim = args.emo_grounding_dims[1]
        # Obviously one could use more complex nets here instead of a "linear" layer.
        # In my estimate, this is not going to be useful :)
        emo_projection_net = nn.Sequential(*[nn.Linear(emo_in_dim, emo_ground_dim), nn.ReLU()])

    decoder = AttentiveDecoder(word_embedding,
                               args.rnn_hidden_dim,
                               encoder_out_dim,
                               args.attention_dim,
                               vocab,
                               dropout_rate=args.dropout_rate,
                               teacher_forcing_ratio=args.teacher_forcing_ratio,
                               auxiliary_net=emo_projection_net,
                               auxiliary_dim=emo_ground_dim)

    model = nn.ModuleDict({'encoder': encoder, 'decoder': decoder})
    return model
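For reference, a hedged sketch (not part of the commit) of the hyper-parameter bundle that describe_model reads from args; the values below are placeholders, not the project's defaults, and a real call also needs the package's Vocabulary object.

from argparse import Namespace

args = Namespace(
    vis_encoder='resnet34',         # backbone name handed to ResnetEncoder
    atn_spatial_img_size=7,         # spatial grid the attention attends over
    word_embedding_dim=128,
    rnn_hidden_dim=512,
    attention_dim=512,
    dropout_rate=0.2,
    teacher_forcing_ratio=1.0,
    use_emo_grounding=False,
    emo_grounding_dims=[9, 9],      # only read when use_emo_grounding is True
)
# model = describe_model(vocab, args)  # vocab: the package's Vocabulary (len() and .pad are used)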
imageprocessing/artemis/artemis/neural_models/text_emotional_clf.py
ADDED
@@ -0,0 +1,94 @@
"""
Given an utterance (and optionally an image) guess a distribution over the emotion labels.

The MIT License (MIT)
Originally created in 2020, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
"""

import torch
import torch.nn.functional as F
from torch import nn
from tqdm.notebook import tqdm as tqdm_notebook

from ..utils.stats import AverageMeter


class TextEmotionClassifier(nn.Module):
    def __init__(self, text_encoder, clf_head, img_encoder=None):
        super(TextEmotionClassifier, self).__init__()
        self.text_encoder = text_encoder
        self.clf_head = clf_head
        self.img_encoder = img_encoder

    def __call__(self, text, img=None):
        if img is not None:
            img_feat = self.img_encoder(img)
            feat = self.text_encoder(text, img_feat)
        else:
            feat = self.text_encoder(text)

        logits = self.clf_head(feat)
        return logits


def single_epoch_train(model, data_loader, use_vision, criterion, optimizer, device):
    epoch_loss = AverageMeter()
    epoch_acc = AverageMeter()
    model.train()
    for batch in tqdm_notebook(data_loader):
        labels = batch['emotion'].to(device)
        tokens = batch['tokens'].to(device)

        if use_vision:
            img = batch['image'].to(device)
            logits = model(tokens, img)
        else:
            logits = model(tokens)

        # Calculate loss
        loss = criterion(logits, labels)
        acc = torch.mean((logits.argmax(1) == labels).double())

        # Back prop.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        b_size = len(labels)
        epoch_loss.update(loss.item(), b_size)
        epoch_acc.update(acc.item(), b_size)
    return epoch_loss.avg, epoch_acc.avg


@torch.no_grad()
def evaluate_on_dataset(model, data_loader, use_vision, criterion, device, detailed=True):
    epoch_loss = AverageMeter()
    epoch_acc = AverageMeter()
    model.eval()
    epoch_confidence = []
    for batch in tqdm_notebook(data_loader):
        labels = batch['emotion'].to(device)
        tokens = batch['tokens'].to(device)
        if use_vision:
            img = batch['image'].to(device)
            logits = model(tokens, img)
        else:
            logits = model(tokens)

        # Calculate loss
        loss = criterion(logits, labels)
        guessed_correct = logits.argmax(1) == labels
        acc = torch.mean(guessed_correct.double())

        if detailed:
            epoch_confidence.append(F.softmax(logits, dim=-1).cpu())

        b_size = len(labels)
        epoch_loss.update(loss.item(), b_size)
        epoch_acc.update(acc.item(), b_size)

    if detailed:
        epoch_confidence = torch.cat(epoch_confidence).numpy()

    return epoch_loss.avg, epoch_acc.avg, epoch_confidence
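A minimal shape-check sketch (not part of the commit): the toy encoder and head below stand in for the repo's LSTM text encoder and MLP head, just to show what TextEmotionClassifier expects and returns.

import torch
from torch import nn

toy_text_encoder = nn.Sequential(nn.Embedding(100, 32), nn.Flatten(), nn.Linear(32 * 10, 64))
toy_clf_head = nn.Linear(64, 9)                 # e.g. 9 emotion classes
clf = TextEmotionClassifier(toy_text_encoder, toy_clf_head)

tokens = torch.randint(0, 100, (4, 10))         # 4 utterances, 10 token-ids each
logits = clf(tokens)                            # (4, 9) emotion logits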
imageprocessing/artemis/artemis/neural_models/word_embeddings.py
ADDED
@@ -0,0 +1,123 @@
"""
Utilities to load pretrained word embeddings like those of GloVe.

The MIT License (MIT)
Originally created in late 2019, for Python 3.x - last updated in 2021.
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
"""

import warnings
import torch
import numpy as np
from collections import Counter


def make_pretrained_embedding(vocab, pretrained_vectors, freeze=True, sigma=1, random_seed=None):
    """ Make a torch.nn.Embedding for a given vocabulary and a collection of
    pretrained word-embedding vectors.
    :param vocab: speakers_listeners.build_vocab.Vocabulary
    :param pretrained_vectors: dictionary of words mapped to np.array vectors
        (like those returned from ```load_glove_pretrained_embedding```).
    :param freeze: (opt, boolean) if True the embedding does not use gradients to optimize itself (fine-tune).
    :param sigma: (opt, int) standard deviation of the Gaussian used to sample when a word is not in pretrained_vectors
    :param random_seed: (opt, int) to seed the numpy Gaussian
    :return: torch.nn.Embedding

    Note: this implementation will freeze all words if freeze=True, irrespectively of whether the words are in the
    pretrained_vectors collection or not (OOV: Out-of-Vocabulary). If you want to fine-tune the OOV you need to adapt
    like this: https://discuss.pytorch.org/t/updating-part-of-an-embedding-matrix-only-for-out-of-vocab-words/33297
    """
    for ss in vocab.special_symbols:
        if ss in pretrained_vectors:
            warnings.warn('the special symbol {} is found in the pretrained embedding.'.format(ss))

    # Initialize weight matrix with correct dimensions and all zeros
    random_key = next(iter(pretrained_vectors))
    emb_dim = len(pretrained_vectors[random_key])
    emb_dtype = pretrained_vectors[random_key].dtype
    n_words = len(vocab)
    weights = np.zeros((n_words, emb_dim), dtype=emb_dtype)

    if random_seed is not None:
        np.random.seed(random_seed)

    for word, idx in vocab.word2idx.items():
        if word in pretrained_vectors:
            weights[idx] = pretrained_vectors[word]
        else:
            weights[idx] = sigma * np.random.randn(emb_dim)

    padding_idx = None
    if hasattr(vocab, 'pad'):
        print('using padding symbol of provided vocabulary.')
        padding_idx = vocab.pad
        weights[padding_idx] = np.zeros(emb_dim)

    embedding = torch.nn.Embedding.from_pretrained(torch.from_numpy(weights), freeze=freeze, padding_idx=padding_idx)
    return embedding


def load_glove_pretrained_embedding(glove_file, dtype=np.float32, only_words=False, verbose=False):
    """
    :param glove_file: file downloaded from the GloVe website
    :param dtype: dtype in which to store the word-embeddings
    :param only_words: do not return the embedding vectors, only the words considered
    :param verbose: print, or not, side-information
    :return: dictionary of words mapped to np.array vectors (or a set of words, if only_words is True)
    """

    if verbose:
        print("Loading glove word embeddings.")

    embedding = dict()
    with open(glove_file) as f_in:
        for line in f_in:
            s_line = line.split()
            token = s_line[0]
            if only_words:
                embedding[token] = 0
            else:
                w_embedding = np.array([float(val) for val in s_line[1:]], dtype=dtype)
                embedding[token] = w_embedding
    if only_words:
        embedding = set(list(embedding.keys()))

    if verbose:
        print("Done.", len(embedding), "words loaded.")
    return embedding


def init_token_bias(encoded_token_list, vocab=None, dtype=np.float32, trainable=True):
    """ Make a bias vector based on the (log) probability of the frequency of each word
    in the training data, similar to https://arxiv.org/abs/1412.2306
    This bias can be used to initialize the hidden-to-next-word layer for faster convergence.
    :param encoded_token_list: [[tokens-of-utterance-1-as-ints] [tokens-of-utterance-2]...]
    :param vocab: speakers_listeners.build_vocab.Vocabulary
    :param dtype:
    :param trainable: (opt, bool) permit training or not of the resulting bias vector
    :return: (torch.nn.Parameter) bias vector
    """
    counter = Counter()
    for tokens in encoded_token_list:
        counter.update(tokens)

    n_items = len(counter)
    if vocab is not None:
        if n_items != len(vocab):
            warnings.warn('init_token_bias: Vocab contains more tokens than the given token lists.')
        n_items = max(n_items, len(vocab))
        counter[vocab.sos] = counter[vocab.pad] = min(counter.values())

    bias_vector = np.ones(n_items, dtype=dtype)  # initialize

    for position, frequency in counter.items():
        bias_vector[position] = frequency

    # Log probability
    bias_vector /= np.sum(bias_vector)
    bias_vector = np.log(bias_vector)
    bias_vector -= np.max(bias_vector)

    bias_vector = torch.from_numpy(bias_vector)
    bias_vector = torch.nn.Parameter(bias_vector, requires_grad=trainable)
    return bias_vector
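A minimal end-to-end sketch (not part of the commit): the GloVe file path and the tiny vocabulary stand-in below are illustrative assumptions; the real pipeline uses the package's own Vocabulary class.

class ToyVocab:
    special_symbols = ['<sos>', '<eos>', '<pad>', '<unk>']
    word2idx = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, 'painting': 4, 'sad': 5}
    pad = 0
    def __len__(self):
        return len(self.word2idx)

glove = load_glove_pretrained_embedding('glove.6B.100d.txt', verbose=True)  # path is an assumption
emb = make_pretrained_embedding(ToyVocab(), glove, freeze=True, random_seed=0)
print(emb.weight.shape)  # torch.Size([6, 100]): known words get GloVe rows, the rest Gaussian samples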