LuisV committed
Commit dfd271a · 1 Parent(s): 4859d06

adding artemis package

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. imageprocessing/artemis/LICENSE +23 -0
  2. imageprocessing/artemis/README.md +160 -0
  3. imageprocessing/artemis/artemis/__init__.py +6 -0
  4. imageprocessing/artemis/artemis/analysis/__init__.py +4 -0
  5. imageprocessing/artemis/artemis/analysis/emotion_centric.py +72 -0
  6. imageprocessing/artemis/artemis/analysis/feature_extraction.py +84 -0
  7. imageprocessing/artemis/artemis/analysis/paintings_meta_data.py +26 -0
  8. imageprocessing/artemis/artemis/analysis/utils.py +80 -0
  9. imageprocessing/artemis/artemis/captioning/__init__.py +4 -0
  10. imageprocessing/artemis/artemis/captioning/sample_captions.py +78 -0
  11. imageprocessing/artemis/artemis/captioning/senti_cap_anps.py +111 -0
  12. imageprocessing/artemis/artemis/data/glove.6B.100d.vocabulary.txt +0 -0
  13. imageprocessing/artemis/artemis/data/image-emotion-histogram.csv +0 -0
  14. imageprocessing/artemis/artemis/data/speaker_sampling_configs/full_hyper_param_ablation.json.txt +182 -0
  15. imageprocessing/artemis/artemis/data/speaker_sampling_configs/mini_hyper_param_ablation.json.txt +12 -0
  16. imageprocessing/artemis/artemis/data/speaker_sampling_configs/selected_hyper_params.json.txt +7 -0
  17. imageprocessing/artemis/artemis/data/symspell_frequency_dictionary_en_82_765.txt +0 -0
  18. imageprocessing/artemis/artemis/data/wiki_art_duplicate_paintings.pkl +3 -0
  19. imageprocessing/artemis/artemis/data/wiki_art_genre_class.csv +0 -0
  20. imageprocessing/artemis/artemis/emotions.py +79 -0
  21. imageprocessing/artemis/artemis/evaluation/__init__.py +7 -0
  22. imageprocessing/artemis/artemis/evaluation/bleu.py +34 -0
  23. imageprocessing/artemis/artemis/evaluation/emotion_alignment.py +87 -0
  24. imageprocessing/artemis/artemis/evaluation/longest_common_subseq.py +63 -0
  25. imageprocessing/artemis/artemis/evaluation/metaphors.py +42 -0
  26. imageprocessing/artemis/artemis/evaluation/single_caption_per_image.py +214 -0
  27. imageprocessing/artemis/artemis/in_out/__init__.py +4 -0
  28. imageprocessing/artemis/artemis/in_out/arguments.py +199 -0
  29. imageprocessing/artemis/artemis/in_out/basics.py +230 -0
  30. imageprocessing/artemis/artemis/in_out/cleaning.py +87 -0
  31. imageprocessing/artemis/artemis/in_out/coco.py +30 -0
  32. imageprocessing/artemis/artemis/in_out/datasets.py +224 -0
  33. imageprocessing/artemis/artemis/in_out/neural_net_oriented.py +336 -0
  34. imageprocessing/artemis/artemis/language/__init__.py +4 -0
  35. imageprocessing/artemis/artemis/language/adjective_noun_pairs.py +44 -0
  36. imageprocessing/artemis/artemis/language/basics.py +132 -0
  37. imageprocessing/artemis/artemis/language/language_preprocessing.py +224 -0
  38. imageprocessing/artemis/artemis/language/part_of_speech.py +40 -0
  39. imageprocessing/artemis/artemis/language/spelling.py +634 -0
  40. imageprocessing/artemis/artemis/neural_models/__init__.py +4 -0
  41. imageprocessing/artemis/artemis/neural_models/attention.py +45 -0
  42. imageprocessing/artemis/artemis/neural_models/attentive_decoder.py +696 -0
  43. imageprocessing/artemis/artemis/neural_models/distances.py +67 -0
  44. imageprocessing/artemis/artemis/neural_models/image_emotion_clf.py +75 -0
  45. imageprocessing/artemis/artemis/neural_models/lstm_encoder.py +95 -0
  46. imageprocessing/artemis/artemis/neural_models/mlp.py +78 -0
  47. imageprocessing/artemis/artemis/neural_models/resnet_encoder.py +103 -0
  48. imageprocessing/artemis/artemis/neural_models/show_attend_tell.py +45 -0
  49. imageprocessing/artemis/artemis/neural_models/text_emotional_clf.py +94 -0
  50. imageprocessing/artemis/artemis/neural_models/word_embeddings.py +123 -0
imageprocessing/artemis/LICENSE ADDED
@@ -0,0 +1,23 @@
+ ArtEmis: Affective Language for Art
+
+ The MIT License (MIT)
+
+ Copyright (c) 2021 Panos Achlioptas
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
imageprocessing/artemis/README.md ADDED
@@ -0,0 +1,160 @@
+ ## ArtEmis: Affective Language for Visual Art
+ A codebase created and maintained by <a href="https://ai.stanford.edu/~optas" target="_blank">Panos Achlioptas</a>.
+
+ ![representative](https://github.com/optas/artemis/blob/master/doc/images/speaker_productions_teaser.png)
+
+
+ ### Introduction
+ This work is based on the [arXiv tech report](https://arxiv.org/abs/2101.07396), which has been __provisionally__ accepted to [CVPR-2021](http://cvpr2021.thecvf.com/) for an <b>Oral</b> presentation.
+
+ ### Citation
+ If you find this work useful in your research, please consider citing:
+
+     @article{achlioptas2021artemis,
+         title={ArtEmis: Affective Language for Visual Art},
+         author={Achlioptas, Panos and Ovsjanikov, Maks and Haydarov, Kilichbek and
+                 Elhoseiny, Mohamed and Guibas, Leonidas},
+         journal = {CoRR},
+         volume = {abs/2101.07396},
+         year={2021}
+     }
+
+ ### Dataset
+ To get the most out of this repo, please __download__ the data associated with ArtEmis by filling in this [form](https://forms.gle/7eqiRgb764uTuexd7).
+
+ ### Installation
+ This code has been tested with Python 3.6.9, PyTorch 1.3.1, and CUDA 10.0 on Ubuntu 16.04.
+
+ Assuming some (potentially virtual) environment with __Python 3.x__:
+ ```Console
+ git clone https://github.com/optas/artemis.git
+ cd artemis
+ pip install -e .
+ ```
+ This will install the repo with all its dependencies (listed in setup.py) and will enable you to do things like:
+ ```
+ from artemis.models import xx
+ ```
+ (provided you add this artemis repo to your PYTHONPATH)
+
+ ### Playing with ArtEmis
+
+ #### Step-1 (important &nbsp; :pushpin:)
+
+ __Preprocess the provided annotations__ (spell-check, patch, tokenize, make train/val/test splits, etc.).
+ ```Console
+ artemis/scripts/preprocess_artemis_data.py
+ ```
+ This script allows you to preprocess ArtEmis according to your needs. The __default__ arguments will do __minimal__
+ preprocessing, so the resulting output can be used to _fairly_ compare ArtEmis with other datasets and to derive the most _faithful_ statistics
+ about ArtEmis's nature. That is what we used in our __analysis__ and what you should use in "Step-2" below. With this in mind do:
+ ```Console
+ python artemis/scripts/preprocess_artemis_data.py -save-out-dir <ADD_YOURS> -raw-artemis-data-csv <ADD_YOURS>
+ ```
+
+ If you wish to train __deep-nets__ (speakers, emotion-classifiers etc.) *exactly* as we did in our paper, then you need to rerun this script
+ by providing only a single extra optional argument ("__--preprocess-for-deep-nets True__"). This will do more aggressive filtering and you should use its output for
+ "Step-3" and "Step-4" below. Use a different save-out-dir to avoid overwriting the output of previous runs.
+ ```Console
+ python artemis/scripts/preprocess_artemis_data.py -save-out-dir <ADD_YOURS> -raw-artemis-data-csv <ADD_YOURS> --preprocess-for-deep-nets True
+ ```
+ To understand and customize the different hyper-parameters, please read the details in the provided _help_ messages of the used argparse.
+
+ #### Step-2
+ __Analyze & explore the dataset__. :microscope:
+
+ Using the _minimally_ preprocessed version of ArtEmis, which includes __all__ (454,684) collected annotations:
+
+ 1. This is a great place to __start__ :checkered_flag:. Run this [notebook](artemis/notebooks/analysis/analyzing_artemis.ipynb) to do basic _linguistic_, _emotion_ & _art-oriented_ __analysis__ of the ArtEmis dataset.
+ 2. Run this [notebook](artemis/notebooks/analysis/concreteness_subjectivity_sentiment_and_POS.ipynb) to analyze ArtEmis in terms of its: _concreteness_, _subjectivity_, _sentiment_ and _Parts-of-Speech_. Optionally, contrast these values
+ with other common datasets like COCO.
+ 3. Run this [notebook](artemis/notebooks/analysis/extract_emotion_histogram_per_image.ipynb) to extract the _emotion histograms_ (empirical distributions) of each artwork. This is __necessary__ for Step-3 (1).
+ 4. Run this [notebook](artemis/notebooks/analysis/emotion_entropy_per_genre_or_artstyle.ipynb) to analyze the extracted emotion histograms (previous step) per art genre and style.
+
+ #### Step-3
+
+ __Train and evaluate emotion-centric image & text classifiers__. :hearts:
+
+ Using the preprocessed version of ArtEmis for __deep-nets__, which includes 429,431 annotations.
+ (Training on a single GPU from scratch is a matter of __minutes__ for these classifiers!)
+
+ 1. Run this [notebook](artemis/notebooks/deep_nets/emotions/image_to_emotion_classifier.ipynb) to train an __image-to-emotion__ classifier.
+ 2. Run this [notebook](artemis/notebooks/deep_nets/emotions/utterance_to_emotion_classifier.ipynb) to train an LSTM-based __utterance-to-emotion__ classifier. Or, this [notebook](artemis/notebooks/deep_nets/emotions/utterance_to_emotion_with_transformer.ipynb) to train a BERT-based one.
+
+
+ #### Step-4
+ __Train & evaluate neural-speakers.__ :bomb:
+
+ - To __train__ our customized SAT model on ArtEmis (__~2 hours__ to train on a single GPU!) do:
+ ```Console
+ python artemis/scripts/train_speaker.py -log-dir <ADD_YOURS> -data-dir <ADD_YOURS> -img-dir <ADD_YOURS>
+
+ log-dir:  where to save the output of the training process, models etc.
+ data-dir: directory that contains the _input_ data, i.e.,
+           the directory that contains the output of preprocess_artemis_data.py: e.g.,
+           the artemis_preprocessed.csv, the vocabulary.pkl
+ img-dir:  the top folder containing the WikiArt image dataset in its "standard" format:
+           img-dir/art_style/painting_xx.jpg
+ ```
+
+ Note: the default optional arguments will create the same vanilla-speaker variant we used in the CVPR21 paper.
+
+ - To __train__ the __emotionally-grounded__ variant of SAT, add an extra parameter to the above call:
+ ```Console
+ python artemis/scripts/train_speaker.py -log-dir <ADD_YOURS> -data-dir <ADD_YOURS> -img-dir <ADD_YOURS>
+ --use-emo-grounding True
+ ```
+ - To __sample__ utterances from a trained speaker:
+ ```Console
+ python artemis/scripts/sample_speaker.py -arguments
+ ```
+ For an explanation of the arguments, see the argparse help messages. It is worth noting that when you
+ want to sample from an emotionally-grounded variant you need to provide a pretrained image2emotion
+ classifier. The image2emotion classifier will be used to deduce _the most likely_ emotion of an image, and input this emotion to
+ the speaker. See Step-3 (1) for how to train such a net.
+
+ - To __evaluate__ the quality of the sampled captions (e.g., per BLEU, emotional alignment, metaphors etc.) use this
+ [notebook](artemis/notebooks/deep_nets/speakers/evaluate_sampled_captions.ipynb). As a bonus you can use it to inspect the _neural attention_ placed on
+ the different tokens/images.
+
+ ### MISC
+ - You can make a _pseudo_ "neural speaker" by copying training sentences to the test split according to __Nearest-Neighbors__ in a pretrained
+ network feature space by running this 5 min. [notebook](artemis/notebooks/deep_nets/speakers/nearest_neighbor_speaker.ipynb).
+
+
+ ### Pretrained Models (used in CVPR21-paper)
+ * [Image-To-Emotion classifier (81MB)](https://www.dropbox.com/s/8dfj3b36q15iieo/best_model.pt?dl=0)
+   - use it within the notebook of Step-3.1 or to _sample_ an emotionally grounded speaker (Step-4, sampling).
+
+ * [LSTM-based Text-To-Emotion classifier (8MB)](https://www.dropbox.com/s/ruczzggqu1i6nof/best_model.pt?dl=0)
+   - use it within the notebook of Step-3.2 or to _evaluate_ the samples of a speaker (Step-4, evaluation) | e.g., needed for emotional-alignment.
+
+ * [SAT-Speaker (434MB)](https://www.dropbox.com/s/tnbfws0m3yi06ge/vanilla_sat_speaker_cvpr21.zip?dl=0)
+ * [SAT-Speaker-with-emotion-grounding (431MB)](https://www.dropbox.com/s/0erh464wag8ods1/emo_grounded_sat_speaker_cvpr21.zip?dl=0)
+
+ + The above two links also include our _sampled captions_ for the test-split. You can use them to evaluate the speakers without resampling them. Please read the included README.txt.
+
+ + __Caveats__: ArtEmis is a real-world dataset containing the opinions and sentiments of thousands of people. It is thus expected to contain text with biases, factual inaccuracies, and perhaps foul language. Please use responsibly.
+ The provided models are likely to be biased and/or inaccurate in ways reflected in the training data.
+
+ ### News
+
+ - :champagne: &nbsp; ArtEmis has already attracted some notable media coverage. E.g., @ [New-Scientist](https://www.newscientist.com/article/2266240-ai-art-critic-can-predict-which-emotions-a-painting-will-evoke),
+ [HAI](https://hai.stanford.edu/news/artists-intent-ai-recognizes-emotions-visual-art),
+ [MarkTechPost](https://www.marktechpost.com/2021/01/30/stanford-researchers-introduces-artemis-a-dataset-containing-439k-emotion-attributions),
+ [KCBS-Radio](https://ai.stanford.edu/~optas/data/interviews/artemis/kcbs/SAT-AI-ART_2_2-6-21(disco_mix).mp3),
+ [Communications of ACM](https://cacm.acm.org/news/250312-ai-art-critic-can-predict-which-emotions-a-painting-will-evoke/fulltext),
+ [Synced Review](https://medium.com/@Synced/ai-art-critic-new-dataset-and-models-make-emotional-sense-of-visual-artworks-2289c6c71299),
+ [École Polytechnique](https://www.polytechnique.edu/fr/content/des-algorithmes-emotifs-face-des-oeuvres-dart),
+ [Forbes Science](https://www.forbes.com/sites/evaamsen/2021/03/30/artificial-intelligence-is-learning-to-categorize-and-talk-about-art/).
+
+ - :telephone_receiver: &nbsp; __Important:__ more code will be added in April, namely for the ANP-baseline and the comparisons of ArtEmis with other datasets; please do a git-pull at that time. The update will be _seamless_! During these first months, if you have _ANY_ question feel free to send me an email at __optas@stanford.edu__.
+
+ - :trophy: &nbsp; If you are developing more models with ArtEmis and you want to incorporate them here, please talk to me or simply do a pull-request.
+
+
+ #### License
+ This code is released under the MIT License (see the LICENSE file for details).
+ _In simple words, if you copy/use parts of this code please __keep the copyright note__ in place._
+
+
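As a quick sanity check that the editable install above is visible to Python, a minimal sketch (it only touches constants defined in `artemis/emotions.py`, shown further down in this commit):
```Python
# Minimal sketch: verify the install and inspect the nine ArtEmis emotion labels.
from artemis.emotions import ARTEMIS_EMOTIONS, emotion_to_int

print(len(ARTEMIS_EMOTIONS))   # 9
print(emotion_to_int('awe'))   # 1, the integer encoding used throughout the repo
```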
imageprocessing/artemis/artemis/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """
+ The MIT License (MIT)
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
+ from .in_out.basics import files_in_subdirs
+ from .in_out.basics import pickle_data, unpickle_data
imageprocessing/artemis/artemis/analysis/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ The MIT License (MIT)
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
imageprocessing/artemis/artemis/analysis/emotion_centric.py ADDED
@@ -0,0 +1,72 @@
+ """
+ Utilities for emotion-centric analysis.
+
+ The MIT License (MIT)
+ Originally created at 10/22/20, for Python 3.x
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
+
+ import pandas as pd
+ import matplotlib.pylab as plt
+
+ from ..emotions import ARTEMIS_EMOTIONS, positive_negative_else
+
+
+ def df_to_emotion_histogram(df, palette=plt.cm.Pastel1, emotion_column='emotion', verbose=False):
+     """ Take a dataset like ArtEmis and return a histogram over the emotion choices made by the annotators.
+     :param df: dataframe carrying the dataset
+     :param palette: matplotlib color palette, e.g., plt.cm.jet
+     :param emotion_column: (str) indicate which column of the dataframe carries the emotion
+     :return: a list carrying the resulting histogram figures.
+     """
+     hist_vals = []
+     for emotion in ARTEMIS_EMOTIONS:
+         hist_vals.append(sum(df[emotion_column] == emotion) / len(df))
+
+     norm = plt.Normalize(min(hist_vals), max(hist_vals))
+     colors = palette(norm(hist_vals))
+
+     s = pd.DataFrame({"emotions": ARTEMIS_EMOTIONS, "vals": hist_vals})
+     s.set_index("emotions", drop=True, inplace=True)
+     plt.figure()
+     s.index.name = None
+     ax = s.plot.bar(grid=True, figsize=(12, 4), color=colors, fontsize=16, rot=45, legend=False, ec="k")
+     ax.set_ylabel('Percentage of data', fontsize=15)
+
+     for rec, col in zip(ax.patches, colors):
+         rec.set_color(col)
+
+     plt.tight_layout()
+     res = [plt.gcf()]
+
+     plt.figure()
+     s = df[emotion_column].apply(positive_negative_else).value_counts() / len(df)
+
+     if verbose:
+         print('Pos-Neg-Else, percents:', s.round(3))
+
+     ax = s.plot.bar(grid=True, figsize=(8, 4), fontsize=16, rot=45, legend=False, color='gray')
+     ax.set_xticklabels(['positive', 'negative', 'else'])
+     plt.tight_layout()
+     res.append(plt.gcf())
+
+     return res
+
+
+ def has_emotion_max_dominance(grouped_df, exclude_se=False, return_max=False):
+     """ I.e., the same emotion was selected (among all nine emotions) by at least half of the annotators.
+     :param grouped_df: dataframe of the dataset grouped by stimuli, e.g., images.
+     :param exclude_se: if True, ignore the groups where the maximizer is the something-else category
+     :param return_max: for each group that has dominance, also return the emotion type that gathered the maximum number of annotations.
+     :return:
+     """
+     vals = grouped_df.emotion.value_counts()
+     maxim = vals.max()
+     threshold = vals.sum() / 2
+     res = maxim >= threshold
+     if exclude_se:
+         res &= vals.idxmax() != 'something else'
+     if return_max:
+         return res, vals.idxmax()
+     else:
+         return res
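A minimal usage sketch for `has_emotion_max_dominance` on a hypothetical toy dataframe (the `painting`/`emotion` column names follow the repo's conventions):
```Python
import pandas as pd
from artemis.analysis.emotion_centric import has_emotion_max_dominance

toy = pd.DataFrame({'painting': ['a', 'a', 'a', 'b', 'b', 'b'],
                    'emotion':  ['awe', 'awe', 'fear', 'awe', 'fear', 'sadness']})
# One boolean per painting: does a single emotion gather at least half of its annotations?
print(toy.groupby('painting').apply(has_emotion_max_dominance))  # a -> True, b -> False
```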
imageprocessing/artemis/artemis/analysis/feature_extraction.py ADDED
@@ -0,0 +1,84 @@
1
+ """
2
+ Routines to extract features from images.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 6/14/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import torchvision.transforms as transforms
11
+ import numpy as np
12
+ from PIL import Image
13
+ from torchvision import models
14
+
15
+ from ..in_out.datasets import ImageClassificationDataset
16
+ from ..in_out.neural_net_oriented import image_net_mean, image_net_std
17
+ from ..neural_models.resnet_encoder import ResnetEncoder
18
+
19
+
20
+ @torch.no_grad()
21
+ def get_forward_features_of_dataset(encoder, dataloader, device, data_in_batch='image'):
22
+ b_size = dataloader.batch_size
23
+ for i, batch in enumerate(dataloader):
24
+ feats = encoder(batch[data_in_batch].to(device))
25
+ feats = feats.cpu().numpy().astype('float32')
26
+
27
+ if i == 0:
28
+ features = np.zeros((len(dataloader.dataset), feats.shape[1]), dtype='float32')
29
+
30
+ if i < len(dataloader) - 1:
31
+ features[i * b_size: (i + 1) * b_size] = feats
32
+ else:
33
+ # special treatment for final batch
34
+ features[i * b_size:] = feats
35
+ return features
36
+
37
+
38
+ def image_transformation(img_dim, pretraining='image_net'):
39
+ if pretraining == 'image_net':
40
+ normalize = transforms.Normalize(mean=image_net_mean, std=image_net_std)
41
+ else:
42
+ raise NotImplementedError('')
43
+
44
+ res = transforms.Compose([transforms.Resize((img_dim, img_dim), Image.LANCZOS),
45
+ transforms.ToTensor(), normalize])
46
+
47
+ return res
48
+
49
+
50
+ def vgg_encoder(device):
51
+ vgg = models.vgg16_bn(pretrained=True).to(device).eval()
52
+ feature_storage = []
53
+ def hook(module, hook_input, hook_output):
54
+ feature_storage.append(hook_output.detach_().cpu().numpy())
55
+ vgg.classifier[4].register_forward_hook(hook) # last relu layer before classification.
56
+ return vgg, feature_storage
57
+
58
+
59
+ @torch.no_grad()
60
+ def extract_visual_features(image_files, img_dim, method='resnet18',
61
+ batch_size=128, n_workers=12, device='cuda'):
62
+
63
+
64
+ img_transform = image_transformation(img_dim)
65
+ dataset = ImageClassificationDataset(image_files, img_transform=img_transform)
66
+
67
+ loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size,
68
+ shuffle=False, num_workers=n_workers)
69
+
70
+ if method.startswith('resnet'):
71
+ vis_encoder = ResnetEncoder(method, 1).to(device).eval()
72
+ features = get_forward_features_of_dataset(vis_encoder, loader, device)
73
+
74
+ elif method.startswith('vgg'):
75
+ vis_encoder, features = vgg_encoder(device)
76
+ for batch in loader:
77
+ vis_encoder(batch['image'].to(device))
78
+ features = np.vstack(features)
79
+
80
+ elif method.startswith('random'):
81
+ vis_encoder = ResnetEncoder('resnet18', 1, pretrained=False).to(device).eval()
82
+ features = get_forward_features_of_dataset(vis_encoder, loader, device)
83
+
84
+ return features
imageprocessing/artemis/artemis/analysis/paintings_meta_data.py ADDED
@@ -0,0 +1,26 @@
+ """
+ Manually selected famous paintings that can be optionally put in a test-set.
+
+ The MIT License (MIT)
+ Originally created at 6/23/20, for Python 3.x
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
+
+ masterpieces_for_test = [
+     'leonardo-da-vinci_mona-lisa',
+     'vincent-van-gogh_the-starry-night-1889(1)',
+     'vincent-van-gogh_the-starry-night-1888-1',
+     'vincent-van-gogh_the-starry-night-1889-1',
+     'vincent-van-gogh_the-starry-night-1888-2',
+     'vincent-van-gogh_the-starry-night-1888',
+     'johannes-vermeer_the-girl-with-a-pearl-earring',
+     'robert-silvers_girl-with-the-pearl-earring-2008',
+     'robert-silvers_guernica-photomosaic-mounted-on-aluminum',
+     'gustav-klimt_the-kiss-1908(1)',
+     'leonardo-da-vinci_the-lady-with-the-ermine-cecilia-gallerani-1496',
+     'vincent-van-gogh_cafe-terrace-on-the-place-du-forum-1888(1)',
+     'vincent-van-gogh_the-cafe-terrace-on-the-place-du-forum-arles-at-night-1888',
+     'vincent-van-gogh_cafe-terrace-place-du-forum-arles-1888(1)',
+     'eugene-delacroix_the-liberty-leading-the-people-1830',
+     'claude-monet_impression-sunrise',
+     'james-mcneill-whistler_arrangement-in-grey-and-black-no-1-portrait-of-the-artist-s-mother-1871']
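A sketch of how this list could be used, assuming an ArtEmis dataframe with a `painting` column (the column name used elsewhere in the repo); the toy dataframe below is made up for illustration:
```Python
import pandas as pd
from artemis.analysis.paintings_meta_data import masterpieces_for_test

# Toy stand-in for the ArtEmis dataframe; the real one also carries a 'painting' column.
df = pd.DataFrame({'painting': ['leonardo-da-vinci_mona-lisa', 'some-artist_some-work'],
                   'utterance': ['it feels serene', 'the colors are harsh']})
held_out = df[df.painting.isin(masterpieces_for_test)]  # optionally kept for the test split
print(len(held_out))  # 1
```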
imageprocessing/artemis/artemis/analysis/utils.py ADDED
@@ -0,0 +1,80 @@
1
+ """
2
+ Auxiliary routines to be used when analyzing/comparing ArtEmis in terms of its subjectivity, abstractness etc.
3
+ See also notebooks/analysis/concreteness_subjectivity_sentiment.ipynb
4
+
5
+ The MIT License (MIT)
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+ import numpy as np
9
+ from collections import defaultdict
10
+ from tqdm.notebook import tqdm as tqdm_notebook
11
+
12
+ from collections import Counter
13
+ from ..language.basics import ngrams
14
+
15
+ def contains_word(tokenized_sentences, word_set):
16
+ boolean_mask = tokenized_sentences.apply(lambda x: len(set(x).intersection(word_set)) >= 1)
17
+ return boolean_mask
18
+
19
+ def contains_bigrams(tokens, bigram_set):
20
+ token_bigrams = set([' '.join(b) for b in ngrams(tokens, 2)])
21
+ return any(x in bigram_set for x in token_bigrams)
22
+
23
+
24
+ def concreteness_of_sentence(tokens, word_to_concreteness, count_bigrams=True):
25
+ "Sorry, will add add explanation in April..."
26
+
27
+ bigram_vals = [] # concreteness values of found bigrams
28
+ if count_bigrams:
29
+ # find bigrams that occur and their multiplicity
30
+ bigrams = Counter(ngrams(tokens, 2))
31
+ utterance = ' '.join(tokens)
32
+ for bigram, cnt in bigrams.items():
33
+ bigram = ' '.join(bigram)
34
+ if bigram in word_to_concreteness:
35
+ for _ in range(cnt):
36
+ bigram_vals.append(word_to_concreteness[bigram])
37
+ utterance = utterance.replace(bigram, '') # remove bigrams from the utterance
38
+ # to not double-count/score them
39
+ tokens = utterance.split()
40
+
41
+ unigram_vals = [word_to_concreteness[t] for t in tokens if t in word_to_concreteness]
42
+ conc_vals = unigram_vals + bigram_vals
43
+
44
+ if len(conc_vals) == 0:
45
+ return None
46
+ return sum(conc_vals) / len(conc_vals)
47
+
48
+
49
+ def pos_analysis(df, group_cols=None, round_decimal=1):
50
+ # Assumes nltk universal pos-tagging
51
+ # & df['pos'] has the part-of-speech tags
52
+ # analysis along the POS used in the paper
53
+
54
+ pos_syms = ['NOUN', 'PRON', 'ADJ', 'ADP', 'VERB']
55
+ pos_names = ['Nouns', 'Pronouns', 'Adjectives', 'Adpositions', 'Verbs']
56
+
57
+ if group_cols is not None:
58
+ groups = df.groupby(group_cols)
59
+ group_stats = []
60
+ group_lens = []
61
+ for n, gg in tqdm_notebook(groups):
62
+ g_stats = defaultdict(set)
63
+ group_lens.append(len(gg))
64
+ for t, p in zip(gg.tokens, gg.pos):
65
+ for x, y in zip(t, p):
66
+ g_stats[y[1]].add(x)
67
+ group_stats.append(g_stats)
68
+
69
+ for ps, pn in zip(pos_syms, pos_names):
70
+ u_pos = []
71
+ u_pos_norm = []
72
+ for i, s in enumerate(group_stats):
73
+ u_pos.append(len(s[ps]))
74
+ u_pos_norm.append(u_pos[-1] / group_lens[i])
75
+ print(pn, '{:.{}f}'.format(np.mean(u_pos), round_decimal), '{:.{}f}'.format(np.mean(u_pos_norm), round_decimal))
76
+ else:
77
+ for ps, pn in zip(pos_syms, pos_names):
78
+ print(pn, df.pos.apply(lambda x: len([i[0] for i in x if i[1] == ps])).mean().round(round_decimal))
79
+
80
+
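A usage sketch for `concreteness_of_sentence` above, with a tiny made-up concreteness lexicon (a real one maps thousands of unigrams/bigrams to ratings):
```Python
from artemis.analysis.utils import concreteness_of_sentence

word_to_concreteness = {'dog': 4.9, 'freedom': 1.5, 'red balloon': 4.5}  # toy lexicon
tokens = 'a red balloon and a dog symbolize freedom'.split()
print(concreteness_of_sentence(tokens, word_to_concreteness, count_bigrams=True))
# (4.5 + 4.9 + 1.5) / 3 — the bigram 'red balloon' is scored once and its unigrams are then skipped
```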
imageprocessing/artemis/artemis/captioning/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ The MIT License (MIT)
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
+ """
imageprocessing/artemis/artemis/captioning/sample_captions.py ADDED
@@ -0,0 +1,78 @@
1
+ """
2
+ Helper functions for sampling (@test -- inference-time) a neural-speaker.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 20/1/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ from torch.utils.data import DataLoader
12
+
13
+ from ..neural_models.attentive_decoder import sample_captions, sample_captions_beam_search, properize_captions
14
+ from ..in_out.basics import wikiart_file_name_to_style_and_painting
15
+ from ..emotions import IDX_TO_EMOTION
16
+ from ..utils.vocabulary import UNK
17
+
18
+
19
+ def versatile_caption_sampler(speaker, data_loader, device, max_utterance_len, sampling_rule='beam',
20
+ beam_size=None, topk=None, temperature=1, drop_unk=True, use_bert_unk=False,
21
+ drop_bigrams=False):
22
+ """Provides all implemented sampling methods according to the sampling_rule input parameter.
23
+ """
24
+ vocab = speaker.decoder.vocab
25
+
26
+ if sampling_rule == 'beam':
27
+ dset = data_loader.dataset
28
+ loader = DataLoader(dset, num_workers=data_loader.num_workers) # batch-size=1
29
+
30
+ max_iter = 8 * max_utterance_len # should be large enough
31
+ beam_captions, alphas, beam_scores = sample_captions_beam_search(speaker, loader, beam_size,
32
+ device, max_iter=max_iter,
33
+ temperature=temperature,
34
+ drop_unk=drop_unk,
35
+ drop_bigrams=drop_bigrams)
36
+ # first is highest scoring caption which is the only we keep here
37
+ captions = [c[0] for c in beam_captions]
38
+ alphas = [np.array(a[0]) for a in alphas] # each alpha covers all tokens: <sos>, token1, ..., <eos>
39
+ else:
40
+ captions, alphas = sample_captions(speaker, data_loader, max_utterance_len=max_utterance_len,
41
+ sampling_rule=sampling_rule, device=device, temperature=temperature,
42
+ topk=topk, drop_unk=drop_unk, drop_bigrams=drop_bigrams)
43
+
44
+ captions = properize_captions(captions, vocab).tolist()
45
+ captions = tokens_to_strings(captions, vocab, bert_unk=use_bert_unk)
46
+ return captions, alphas
47
+
48
+
49
+ def captions_as_dataframe(captions_dataset, captions_predicted, wiki_art_data=True):
50
+ """convert the dataset/predicted-utterances (captions) to a pandas dataframe."""
51
+ if wiki_art_data:
52
+ temp = captions_dataset.image_files.apply(wikiart_file_name_to_style_and_painting)
53
+ art_style, painting = zip(*temp)
54
+ grounding_emotion = [IDX_TO_EMOTION.get(x, None) for x in captions_dataset.emotions.tolist()]
55
+ df = pd.DataFrame([art_style, painting, grounding_emotion, captions_predicted]).transpose()
56
+ column_names = ['art_style', 'painting', 'grounding_emotion', 'caption']
57
+ df.columns = column_names
58
+ else:
59
+ image_files = captions_dataset.image_files.tolist()
60
+ grounding_emotion = [IDX_TO_EMOTION.get(x, None) for x in captions_dataset.emotions.tolist()]
61
+ df = pd.DataFrame([image_files, grounding_emotion, captions_predicted]).transpose()
62
+ column_names = ['image_file', 'grounding_emotion', 'caption']
63
+ df.columns = column_names
64
+ return df
65
+
66
+
67
+ def tokens_to_strings(token_list, vocab, bert_unk=True):
68
+ """ Bert uses [UNK] to represent the unknown symbol.
69
+ :param token_list:
70
+ :param vocab:
71
+ :param bert_unk:
72
+ :return:
73
+ """
74
+ res = [vocab.decode_print(c) for c in token_list]
75
+ if bert_unk:
76
+ res = [c.replace(UNK, '[UNK]') for c in res]
77
+ return res
78
+
imageprocessing/artemis/artemis/captioning/senti_cap_anps.py ADDED
@@ -0,0 +1,111 @@
1
+ """
2
+ Handling ANP-data // injection of sentiment according to SentiCap: https://arxiv.org/pdf/1510.01431.pdf
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 10/19/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+
8
+ Note:
9
+ Given the lack of time to add comments: PLEASE SEE directly notebook "sentimentalize_utterances_with_anps"
10
+ for use-case.
11
+ """
12
+
13
+ import nltk
14
+ import numpy.random as random
15
+ from collections import defaultdict
16
+
17
+ def read_senticap_anps(senticap_anp_file):
18
+ """
19
+ :param senticap_anp_file:
20
+ :return: two lists; the first has the positive ANPs [beautiful dog, nice person], the second the negative ones.
21
+ """
22
+ positive_anps = []
23
+ negative_anps = []
24
+ current_sentiment = 'positive' # the file lists first the positives, then all the negatives
25
+ with open(senticap_anp_file) as fin:
26
+ for i, line in enumerate(fin):
27
+ if i == 0:
28
+ continue
29
+
30
+ if "Negative ANPs:" in line:
31
+ current_sentiment = 'negative'
32
+ continue
33
+
34
+ anp = line.rstrip()
35
+
36
+ if len(anp) == 0:
37
+ continue
38
+
39
+ if current_sentiment == 'negative':
40
+ negative_anps.append(anp)
41
+ else:
42
+ positive_anps.append(anp)
43
+ return positive_anps, negative_anps
44
+
45
+
46
+ def build_senticap_noun_to_ajectives(pos_anps, neg_anps):
47
+ res = dict()
48
+ for tag, anps in zip(['positive', 'negative'], [pos_anps, neg_anps]):
49
+ res[tag] = defaultdict(list)
50
+ for anp in anps:
51
+ adjective, noun = anp.split()
52
+ res[tag][noun].append(adjective)
53
+ return res
54
+
55
+
56
+ def nouns_and_adjectives_of_senticap(pos_sent_anp, neg_sent_anp):
57
+ all_nouns = set()
58
+ all_adjectives = set()
59
+ for catalogue in [pos_sent_anp, neg_sent_anp]:
60
+ for item in catalogue:
61
+ adjective, noun = item.split()
62
+ all_nouns.add(noun)
63
+ all_adjectives.add(adjective)
64
+ return all_nouns, all_adjectives
65
+
66
+
67
+ def add_anp_to_sentence(sentence_tokenized, noun_to_adj, rule='random_adjective'):
68
+ """ Pick a noun of the sentence at that is a key of the noun_to_adj dictionary at random. Given the rule
69
+ pick the corresponding adjective from the noun_to_adj and add it before the noun. Return the new sentence.
70
+ If such a noun does not exist, apply no changes and return None.
71
+ :param sentence_tokenized: ['a', 'running', 'dog']
72
+ :param noun_to_adj: e.g., dog -> {happy, sad}, cat -> {funny, happy} etc.
73
+ :param rule: if "most_frequent_adjective" the noun_to_adj also includes frequencies:
74
+ e.g., dog -> {(happy 5), (sad, 1)}
75
+ :return:
76
+ """
77
+ sentence_tokenized = sentence_tokenized.copy()
78
+ pos = nltk.pos_tag(sentence_tokenized)
79
+ noun_pos = [i for i, x in enumerate(pos) if x[1][0] == 'N'] # all noun locations
80
+
81
+ valid_noun_pos = []
82
+ # Drop nouns that do not have adjective ANP.
83
+ for p in noun_pos:
84
+ if sentence_tokenized[p] in noun_to_adj:
85
+ valid_noun_pos.append(p)
86
+
87
+ if len(valid_noun_pos) == 0:
88
+ return None
89
+
90
+
91
+ valid_noun_pos = sorted(valid_noun_pos) # sort for reproducibility
92
+ random.shuffle(valid_noun_pos)
93
+ picked_noun_pos = valid_noun_pos[0] # pick a noun at random
94
+ picked_noun = sentence_tokenized[picked_noun_pos]
95
+
96
+ if rule == 'random_adjective':
97
+ valid_adjectives = sorted(noun_to_adj[picked_noun]) # sort for reproducibility
98
+ random.shuffle(valid_adjectives)
99
+ picked_adjective = valid_adjectives[0]
100
+
101
+ elif rule == 'most_frequent_adjective':
102
+ most_freq_adjective_with_freq = sorted(noun_to_adj[picked_noun], key=lambda x: x[1])[-1]
103
+ picked_adjective = most_freq_adjective_with_freq[0]
104
+
105
+ ## Avoid adding an existing adjective (e.g., happy happy man)
106
+ if picked_noun_pos > 0 and sentence_tokenized[picked_noun_pos-1] == picked_adjective:
107
+ pass
108
+ else:
109
+ sentence_tokenized.insert(picked_noun_pos, picked_adjective)
110
+
111
+ return ' '.join(sentence_tokenized)
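A usage sketch for `add_anp_to_sentence` with a toy noun-to-adjective map (normally built from the SentiCap ANP file via `read_senticap_anps` and `build_senticap_noun_to_ajectives`); note that `nltk.pos_tag` needs the `averaged_perceptron_tagger` data to be available:
```Python
from artemis.captioning.senti_cap_anps import add_anp_to_sentence

noun_to_adj = {'dog': ['happy', 'playful'], 'sky': ['gloomy']}  # toy map
sentence = ['a', 'dog', 'under', 'a', 'clear', 'sky']
print(add_anp_to_sentence(sentence, noun_to_adj, rule='random_adjective'))
# e.g., 'a playful dog under a clear sky' — one randomly picked noun gets one adjective
```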
imageprocessing/artemis/artemis/data/glove.6B.100d.vocabulary.txt ADDED
The diff for this file is too large to render. See raw diff
 
imageprocessing/artemis/artemis/data/image-emotion-histogram.csv ADDED
The diff for this file is too large to render. See raw diff
 
imageprocessing/artemis/artemis/data/speaker_sampling_configs/full_hyper_param_ablation.json.txt ADDED
@@ -0,0 +1,182 @@
1
+ [
2
+ {
3
+ "sampling_rule": "topk",
4
+ "temperature": 1.0,
5
+ "topk": 10
6
+ },
7
+ {
8
+ "sampling_rule": "topk",
9
+ "temperature": 0.8,
10
+ "topk": 10
11
+ },
12
+ {
13
+ "sampling_rule": "topk",
14
+ "temperature": 0.5,
15
+ "topk": 10
16
+ },
17
+ {
18
+ "sampling_rule": "topk",
19
+ "temperature": 0.3,
20
+ "topk": 10
21
+ },
22
+ {
23
+ "sampling_rule": "topk",
24
+ "temperature": 0.2,
25
+ "topk": 10
26
+ },
27
+ {
28
+ "sampling_rule": "topk",
29
+ "temperature": 1.0,
30
+ "topk": 15
31
+ },
32
+ {
33
+ "sampling_rule": "topk",
34
+ "temperature": 0.8,
35
+ "topk": 15
36
+ },
37
+ {
38
+ "sampling_rule": "topk",
39
+ "temperature": 0.5,
40
+ "topk": 15
41
+ },
42
+ {
43
+ "sampling_rule": "topk",
44
+ "temperature": 0.3,
45
+ "topk": 15
46
+ },
47
+ {
48
+ "sampling_rule": "topk",
49
+ "temperature": 0.2,
50
+ "topk": 15
51
+ },
52
+ {
53
+ "sampling_rule": "topk",
54
+ "temperature": 1.0,
55
+ "topk": 20
56
+ },
57
+ {
58
+ "sampling_rule": "topk",
59
+ "temperature": 0.8,
60
+ "topk": 20
61
+ },
62
+ {
63
+ "sampling_rule": "topk",
64
+ "temperature": 0.5,
65
+ "topk": 20
66
+ },
67
+ {
68
+ "sampling_rule": "topk",
69
+ "temperature": 0.3,
70
+ "topk": 20
71
+ },
72
+ {
73
+ "sampling_rule": "topk",
74
+ "temperature": 0.2,
75
+ "topk": 20
76
+ },
77
+ {
78
+ "sampling_rule": "topk",
79
+ "temperature": 1.0,
80
+ "topk": 5
81
+ },
82
+ {
83
+ "sampling_rule": "topk",
84
+ "temperature": 0.8,
85
+ "topk": 5
86
+ },
87
+ {
88
+ "sampling_rule": "topk",
89
+ "temperature": 0.5,
90
+ "topk": 5
91
+ },
92
+ {
93
+ "sampling_rule": "topk",
94
+ "temperature": 0.3,
95
+ "topk": 5
96
+ },
97
+ {
98
+ "sampling_rule": "topk",
99
+ "temperature": 0.2,
100
+ "topk": 5
101
+ },
102
+ {
103
+ "sampling_rule": "topk",
104
+ "temperature": 1.0,
105
+ "topk": 3
106
+ },
107
+ {
108
+ "sampling_rule": "topk",
109
+ "temperature": 0.8,
110
+ "topk": 3
111
+ },
112
+ {
113
+ "sampling_rule": "topk",
114
+ "temperature": 0.5,
115
+ "topk": 3
116
+ },
117
+ {
118
+ "sampling_rule": "topk",
119
+ "temperature": 0.3,
120
+ "topk": 3
121
+ },
122
+ {
123
+ "sampling_rule": "topk",
124
+ "temperature": 0.2,
125
+ "topk": 3
126
+ },
127
+ {
128
+ "sampling_rule": "topk",
129
+ "temperature": 0.2,
130
+ "topk": 1
131
+ },
132
+ {
133
+ "sampling_rule": "beam",
134
+ "temperature": 1.0,
135
+ "beam_size": 5
136
+ },
137
+ {
138
+ "sampling_rule": "beam",
139
+ "temperature": 0.8,
140
+ "beam_size": 5
141
+ },
142
+ {
143
+ "sampling_rule": "beam",
144
+ "temperature": 0.5,
145
+ "beam_size": 5
146
+ },
147
+ {
148
+ "sampling_rule": "beam",
149
+ "temperature": 0.3,
150
+ "beam_size": 5
151
+ },
152
+ {
153
+ "sampling_rule": "beam",
154
+ "temperature": 0.2,
155
+ "beam_size": 5
156
+ },
157
+ {
158
+ "sampling_rule": "beam",
159
+ "temperature": 1.0,
160
+ "beam_size": 10
161
+ },
162
+ {
163
+ "sampling_rule": "beam",
164
+ "temperature": 0.8,
165
+ "beam_size": 10
166
+ },
167
+ {
168
+ "sampling_rule": "beam",
169
+ "temperature": 0.5,
170
+ "beam_size": 10
171
+ },
172
+ {
173
+ "sampling_rule": "beam",
174
+ "temperature": 0.3,
175
+ "beam_size": 10
176
+ },
177
+ {
178
+ "sampling_rule": "beam",
179
+ "temperature": 0.2,
180
+ "beam_size": 10
181
+ }
182
+ ]
imageprocessing/artemis/artemis/data/speaker_sampling_configs/mini_hyper_param_ablation.json.txt ADDED
@@ -0,0 +1,12 @@
+ [
+     {
+         "sampling_rule": "beam",
+         "temperature": 0.5,
+         "beam_size": 5
+     },
+     {
+         "sampling_rule": "beam",
+         "temperature": 0.3,
+         "beam_size": 5
+     }
+ ]
imageprocessing/artemis/artemis/data/speaker_sampling_configs/selected_hyper_params.json.txt ADDED
@@ -0,0 +1,7 @@
+ [
+     {
+         "sampling_rule": "beam",
+         "temperature": 0.3,
+         "beam_size": 5
+     }
+ ]
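These `*.json.txt` files are plain JSON; a sketch of loading one (the keys mirror the sampling arguments used by the speaker-sampling code, e.g., `sampling_rule`, `temperature`, `beam_size`/`topk`):
```Python
import json

with open('artemis/data/speaker_sampling_configs/selected_hyper_params.json.txt') as fin:
    configs = json.load(fin)  # a list of sampling configurations

print(configs[0])  # {'sampling_rule': 'beam', 'temperature': 0.3, 'beam_size': 5}
```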
imageprocessing/artemis/artemis/data/symspell_frequency_dictionary_en_82_765.txt ADDED
The diff for this file is too large to render. See raw diff
 
imageprocessing/artemis/artemis/data/wiki_art_duplicate_paintings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:487d4325d3a75f86c7a1f5fd05fc424924c182c391f8a645e81f1c0dd58e4a27
+ size 233854
imageprocessing/artemis/artemis/data/wiki_art_genre_class.csv ADDED
The diff for this file is too large to render. See raw diff
 
imageprocessing/artemis/artemis/emotions.py ADDED
@@ -0,0 +1,79 @@
1
+ """
2
+ Mostly some constants & very simple function to encode/handle the emotion attributes of ArtEmis.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 02/11/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+
10
+ ARTEMIS_EMOTIONS = ['amusement', 'awe', 'contentment', 'excitement',
11
+ 'anger', 'disgust', 'fear', 'sadness', 'something else']
12
+
13
+ EMOTION_TO_IDX = {e: i for i, e in enumerate(ARTEMIS_EMOTIONS)}
14
+
15
+
16
+ IDX_TO_EMOTION = {EMOTION_TO_IDX[e]: e for e in EMOTION_TO_IDX}
17
+
18
+
19
+ POS_NEG_ELSE = {'amusement': 0, 'awe': 0, 'contentment': 0, 'excitement': 0,
20
+ 'anger': 1, 'disgust': 1, 'fear': 1, 'sadness': 1,
21
+ 'something else': 2}
22
+
23
+
24
+ COLORS = {'amusement': '#EE82EE',
25
+ 'awe': '#FFFF00',
26
+ 'contentment': '#87CEEB',
27
+ 'excitement': '#DC143C',
28
+ 'anger': '#000080',
29
+ 'disgust': '#F0E68C',
30
+ 'fear': '#C0C0C0',
31
+ 'sadness': '#696969',
32
+ 'something else': '#228B22'}
33
+
34
+
35
+ LARGER_EMOTION_VOCAB = {('bored', 'boring', 'apathy', 'boredom', 'indifferent', 'dull', 'uninteresting', 'uninterested'),
36
+ ('shock', 'shocked'),
37
+ ('confused', 'confusion', 'confuses', 'puzzled', 'puzzling',
38
+ 'perplexed', 'perplexing', 'confusing', 'odd', 'weird'),
39
+ ('surprised',),
40
+ ('anticipation',),
41
+ ('empowerment',),
42
+ ('hope', 'hopeful', 'optimistic'),
43
+ ('neutral',),
44
+ ('rage',),
45
+ ('happy', 'happiness'),
46
+ ('grief',),
47
+ ('shame',),
48
+ ('resent',),
49
+ ('creepy',),
50
+ ('disappointment',),
51
+ ('depressing', 'depressed'),
52
+ ('bothered', 'disturbed', 'bothersome'),
53
+ ('overwhelmed',),
54
+ ('anxiety', 'anxious'),
55
+ ('thrilled',),
56
+ ('surprised', 'surprising'),
57
+ ('uncomfortable',),
58
+ ('curious', 'curiosity', 'wonder', 'intrigued', 'interested', 'interests', 'interesting', 'intriguing'),
59
+ ('alerted', 'alert'),
60
+ ('insult', 'insulted'),
61
+ ('shy',),
62
+ ('nostalgia', 'nostalgic'),
63
+ ('exhilarating', 'exhilarated')}
64
+
65
+
66
+ def positive_negative_else(emotion):
67
+ """ Map a feeling string (e.g. 'awe') to an integer indicating if it is a positive, negative, or else.
68
+ :param emotion: (string)
69
+ :return: int
70
+ """
71
+ return POS_NEG_ELSE[emotion]
72
+
73
+
74
+ def emotion_to_int(emotion):
75
+ """ Map a feeling string (e.g. 'awe') to a unique integer.
76
+ :param emotion: (string)
77
+ :return: int
78
+ """
79
+ return EMOTION_TO_IDX[emotion]
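A minimal sketch of the emotion encodings defined in this module:
```Python
from artemis.emotions import EMOTION_TO_IDX, IDX_TO_EMOTION, positive_negative_else

print(EMOTION_TO_IDX['awe'])           # 1
print(IDX_TO_EMOTION[8])               # 'something else'
print(positive_negative_else('fear'))  # 1, i.e., a negative emotion
```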
imageprocessing/artemis/artemis/evaluation/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """
+ TODO: add description
+
+ The MIT License (MIT)
+ Originally created at 8/29/20, for Python 3.x
+ Copyright (c) 2020 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
+ """
imageprocessing/artemis/artemis/evaluation/bleu.py ADDED
@@ -0,0 +1,34 @@
1
+ """
2
+ BLEU via NLTK
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 8/31/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+ import pandas as pd
9
+ from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
10
+
11
+ cc = SmoothingFunction()
12
+
13
+ def sentence_bleu_for_hypotheses(references, hypothesis, max_grams=4, smoothing_function=None):
14
+ """ Compute the BLEU score for the hypothesis (e.g., generated captions) against given references acting
15
+ as ground-truth.
16
+ :param references: (list of lists of lists) of len M. Each sublist contains strings. [['a', 'boy'], ['rock', 'music']]
17
+ :param hypothesis: (list of lists)
18
+ :param max_grams: int, bleu-max_grams i.e., when 4, computes bleu-4
19
+ :param smoothing_function:
20
+ :return: a Series containing the scores in the same order as the input
21
+ Note: see nltk.bleu_score.sentence_bleu
22
+ """
23
+ if len(references) != len(hypothesis):
24
+ raise ValueError('Each reference (set) comes with a single hypothesis')
25
+ if type(references[0]) != list or type(hypothesis[0]) != list:
26
+ raise ValueError('Bad input types: use tokenized strings, and lists of tokens.')
27
+
28
+ scores = []
29
+ weights = (1.0 / max_grams, ) * max_grams
30
+
31
+ for i in range(len(references)):
32
+ scores.append(sentence_bleu(references[i], hypothesis[i], weights=weights,
33
+ smoothing_function=smoothing_function))
34
+ return pd.Series(scores)
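A small usage sketch for `sentence_bleu_for_hypotheses` (everything pre-tokenized, one hypothesis per reference set; the sentences are made up):
```Python
from artemis.evaluation.bleu import cc, sentence_bleu_for_hypotheses

references = [[['a', 'sad', 'man'], ['a', 'man', 'crying']],  # references for hypothesis 0
              [['a', 'calm', 'sea']]]                         # references for hypothesis 1
hypothesis = [['a', 'sad', 'man'], ['a', 'stormy', 'sea']]
scores = sentence_bleu_for_hypotheses(references, hypothesis, max_grams=2,
                                      smoothing_function=cc.method1)
print(scores.round(3).tolist())  # a pandas Series, one BLEU-2 score per hypothesis
```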
imageprocessing/artemis/artemis/evaluation/emotion_alignment.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ Measuring the emotion-alignment between a generation and the ground-truth (emotion).
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 8/31/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import numpy as np
11
+ from ..utils.basic import iterate_in_chunks
12
+
13
+
14
+ @torch.no_grad()
15
+ def image_to_emotion(img2emo_clf, data_loader, device):
16
+ """ For each image of the underlying dataset predict an emotion
17
+ :param img2emo_clf: nn.Module
18
+ :param data_loader: torch loader of dataset to iterate
19
+ :param device: gpu placement
20
+ :return:
21
+ """
22
+ img2emo_clf.eval()
23
+ emo_of_img_preds = []
24
+ for batch in data_loader:
25
+ predictions = img2emo_clf(batch['image'].to(device)).cpu()
26
+ emo_of_img_preds.append(predictions)
27
+ emo_of_img_preds = torch.cat(emo_of_img_preds)
28
+ return emo_of_img_preds
29
+
30
+
31
+ @torch.no_grad()
32
+ def text_to_emotion(txt2em_clf, encoded_tokens, device, batch_size=1000):
33
+ """
34
+ :param txt2em_clf:
35
+ :param encoded_tokens: Tensor carrying the text encoded
36
+ :param device:
37
+ :param batch_size:
38
+ :return:
39
+ """
40
+ txt2em_clf.eval()
41
+ emotion_txt_preds = []
42
+ for chunk in iterate_in_chunks(encoded_tokens, batch_size):
43
+ emotion_txt_preds.append(txt2em_clf(chunk.to(device)).cpu())
44
+
45
+ emotion_txt_preds = torch.cat(emotion_txt_preds)
46
+ maximizers = torch.argmax(emotion_txt_preds, -1)
47
+ return emotion_txt_preds, maximizers
48
+
49
+
50
+ def unique_maximizer(a_list):
51
+ """ if there is an element of the input list that appears
52
+ strictly more frequent than any other element
53
+ :param a_list:
54
+ :return:
55
+ """
56
+ u_elements, u_cnt = np.unique(a_list, return_counts=True)
57
+ has_umax = sum(u_cnt == u_cnt.max()) == 1
58
+ umax = u_elements[u_cnt.argmax()]
59
+ return has_umax, umax
60
+
61
+
62
+ def dominant_maximizer(a_list):
63
+ """ if there is an element of the input list that appears
64
+ at least half the time
65
+ :param a_list:
66
+ :return:
67
+ """
68
+ u_elements, u_cnt = np.unique(a_list, return_counts=True)
69
+
70
+ has_umax = u_cnt.max() >= len(a_list) / 2
71
+
72
+ if len(u_cnt) >= 2: # make sure the second most frequent does not match the first.
73
+ a, b = sorted(u_cnt)[-2:]
74
+ if a == b:
75
+ has_umax = False
76
+
77
+ umax = u_elements[u_cnt.argmax()]
78
+ return has_umax, umax
79
+
80
+
81
+ def occurrence_list_to_distribution(list_of_ints, n_support):
82
+ """e.g., [0, 8, 8, 8] -> [1/4, 0, ..., 3/4, 0, ...]"""
83
+ distribution = np.zeros(n_support, dtype=np.float32)
84
+ for i in list_of_ints:
85
+ distribution[i] += 1
86
+ distribution /= sum(distribution)
87
+ return distribution
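The two small helpers at the end of this module are easy to sanity-check in isolation; a sketch:
```Python
from artemis.evaluation.emotion_alignment import dominant_maximizer, occurrence_list_to_distribution

print(dominant_maximizer([1, 1, 1, 4, 7]))               # (True, 1): emotion 1 covers at least half the votes
print(dominant_maximizer([1, 1, 4, 4]))                  # (False, ...): tie between the two most frequent
print(occurrence_list_to_distribution([0, 8, 8, 8], 9))  # [0.25, 0., ..., 0.75]
```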
imageprocessing/artemis/artemis/evaluation/longest_common_subseq.py ADDED
@@ -0,0 +1,63 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Originally created at 10/5/20, for Python 3.x
4
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
5
+ """
6
+
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+
10
+ def lcs(s1, s2):
11
+ """
12
+ Longest common subsequence of two iterables. A subsequence is a
13
+ sequence that appears in the same relative order, but not necessarily contiguous.
14
+ :param s1: first iterable
15
+ :param s2: second iterable
16
+ :return: (list) the lcs
17
+ """
18
+ matrix = [[[] for _ in range(len(s2))] for _ in range(len(s1))]
19
+ for i in range(len(s1)):
20
+ for j in range(len(s2)):
21
+ if s1[i] == s2[j]:
22
+ if i == 0 or j == 0:
23
+ matrix[i][j] = [s1[i]]
24
+ else:
25
+ matrix[i][j] = matrix[i-1][j-1] + [s1[i]]
26
+ else:
27
+ matrix[i][j] = max(matrix[i-1][j], matrix[i][j-1], key=len)
28
+ cs = matrix[-1][-1]
29
+ return cs
30
+
31
+
32
+ def captions_lcs_from_training_utterances(captions_tokenized, train_utters_tokenized):
33
+ maximizers = np.zeros(len(captions_tokenized), dtype=int)
34
+ max_lcs = np.zeros(len(captions_tokenized))
35
+ averages = np.zeros(len(captions_tokenized))
36
+ for i, caption in enumerate(tqdm(captions_tokenized)):
37
+ caption_res = [len(lcs(caption, tr_example)) for tr_example in train_utters_tokenized]
38
+ max_loc = np.argmax(caption_res)
39
+ maximizers[i] = max_loc
40
+ max_lcs[i] = caption_res[max_loc]
41
+ averages[i] = np.mean(caption_res)
42
+ return max_lcs, averages, maximizers
43
+
44
+
45
+ ###
46
+ # Panos Note:
47
+ # a) '[the] contours shadowing [and] details make this painting [look like a] photograph the way the hair is
48
+ # layered and [the eyes] gazing off to space are fantastic'
49
+ # b) '[the] red [and] black paint strokes [look like a] bunch on [the eyes]'
50
+ # (a), (b) have lcs = 7
51
+ # but,
52
+ # a) '[the woman] is pretty nice and [has a] welcoming [facial expression]'
53
+ # b) '[the woman] looks very elegant since she [has] such [a] beautiful [facial expression]'
54
+ # (a), (b) have lcs = 6
55
+ # implying that removing stop-word articles "a", "the" could make this more realistic, since the first pair is way more
56
+ # dissimilar than the second.
57
+ # also if you use this to compare to systems; the length of the utterance could be used to normalize the bias the length
58
+ # brings in.
59
+ ###
60
+
61
+
62
+
63
+
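A quick sketch of `lcs` on two tokenized utterances (cf. the note in the comments above about stop-words inflating the score):
```Python
from artemis.evaluation.longest_common_subseq import lcs

a = 'the red black paint strokes look like a bunch'.split()
b = 'the contours look like a photograph'.split()
print(lcs(a, b))  # ['the', 'look', 'like', 'a']
```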
imageprocessing/artemis/artemis/evaluation/metaphors.py ADDED
@@ -0,0 +1,42 @@
+ """
+ Greedy-approximate counting of similes/metaphors present in a set of sentences.
+
+ The MIT License (MIT)
+ Originally created at 9/1/20, for Python 3.x
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
+ """
+
+ metaphorical_substrings = {'could be',
+                            'appears to be',
+                            'appear to be',
+                            'reminds me',
+                            'remind me',
+                            'seems like',
+                            'looks like',
+                            'look like',
+                            'is like',
+                            'are like',
+                            'think of',
+                            'resembles',
+                            'resembling'
+                            }
+
+
+ def makes_metaphor_via_substring_matching(sentences, substrings=None):
+     """
+     :param sentences: list of strings
+     :param substrings: iterable with substrings whose occurrence implies a metaphor is made
+     :return: list with booleans
+     """
+     if substrings is None:
+         substrings = metaphorical_substrings
+
+     makes_metaphor = []
+     for s in sentences:
+         yes = False
+         for m in substrings:
+             if m in s:
+                 yes = True
+                 break
+         makes_metaphor.append(yes)
+     return makes_metaphor
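A usage sketch (the sentences are made up):
```Python
from artemis.evaluation.metaphors import makes_metaphor_via_substring_matching

sentences = ['the swirling sky looks like a stormy sea',
             'a portrait of a woman in a dark dress']
print(makes_metaphor_via_substring_matching(sentences))  # [True, False]
```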
imageprocessing/artemis/artemis/evaluation/single_caption_per_image.py ADDED
@@ -0,0 +1,214 @@
1
+ """
2
+ Some grouping of various evaluation routines that assume that for a given set of reference
3
+ sentences there is a _single_ caption (sample) generated.
4
+
5
+ The MIT License (MIT)
6
+ Originally created at 9/1/20, for Python 3.x
7
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
8
+ """
9
+
10
+ import torch
11
+ import warnings
12
+ import pandas as pd
13
+ import numpy as np
14
+
15
+
16
+ from .bleu import sentence_bleu_for_hypotheses, cc
17
+ from .metaphors import makes_metaphor_via_substring_matching
18
+ from .emotion_alignment import text_to_emotion
19
+ from .pycocoevalcap import Bleu, Cider, Meteor, Spice, Rouge
20
+ from .emotion_alignment import dominant_maximizer, occurrence_list_to_distribution
21
+ from .longest_common_subseq import captions_lcs_from_training_utterances
22
+ from ..utils.basic import cross_entropy
23
+
24
+ ALL_METRICS = {'bleu', 'cider', 'spice', 'meteor', 'rouge', 'emo_alignment', 'metaphor', 'lcs'}
25
+
26
+
27
+ def emotional_alignment(hypothesis, emotions, vocab, txt2em_clf, device):
28
+ """ text 2 emotion, then compare with ground-truth.
29
+ :param hypothesis:
30
+ :param emotions: (list of list of int) human emotion-annotations (ground-truth) e.g., [[0, 1] [1]]
31
+ :param vocab:
32
+ :param txt2em_clf:
33
+ :param device:
34
+ :return:
35
+ """
36
+
37
+ # from text to emotion
38
+ hypothesis_tokenized = hypothesis.apply(lambda x: x.split())
39
+ max_len = hypothesis_tokenized.apply(lambda x: len(x)).max()
40
+ hypothesis = hypothesis_tokenized.apply(lambda x: np.array(vocab.encode(x, max_len=max_len)))
41
+ hypothesis = torch.from_numpy(np.vstack(hypothesis))
42
+ pred_logits, pred_maximizer = text_to_emotion(txt2em_clf, hypothesis, device)
43
+
44
+ # convert emotion lists to distributions to measure cross-entropy
45
+ n_emotions = 9
46
+ emo_dists = torch.from_numpy(np.vstack(emotions.apply(lambda x: occurrence_list_to_distribution(x, n_emotions))))
47
+ x_entropy = cross_entropy(pred_logits, emo_dists).item()
48
+
49
+ # constrain predictions to those of images with dominant maximizer of emotion
50
+ has_max, maximizer = zip(*emotions.apply(dominant_maximizer))
51
+ emotion_mask = np.array(has_max)
52
+ masked_emotion = np.array(maximizer)[emotion_mask]
53
+
54
+ guess_correct = masked_emotion == pred_maximizer[emotion_mask].cpu().numpy()
55
+ accuracy = guess_correct.mean()
56
+
57
+ return accuracy, x_entropy
58
+
59
+
60
+ def bleu_scores_via_nltk(hypothesis, references, smoothing_function=cc.method1):
61
+ """
62
+ :param hypothesis: dataframe of strings
63
+ :param references: dataframe of list of strings
64
+ :param smoothing_function:
65
+ :return:
66
+ """
67
+
68
+ # first tokenize
69
+ hypothesis_tokenized = hypothesis.apply(lambda x: x.split())
70
+ references_tokenized = references.apply(lambda x: [i.split() for i in x])
71
+
72
+ results = dict()
73
+ for max_grams in range(1, 5):
74
+ with warnings.catch_warnings():
75
+ warnings.simplefilter("ignore")
76
+ scores = sentence_bleu_for_hypotheses(references_tokenized,
77
+ hypothesis_tokenized,
78
+ max_grams,
79
+ smoothing_function)
80
+ results['BLEU-{}'.format(max_grams)] = scores
81
+ return results
82
+
83
+
84
+ def dataframes_to_coco_eval_format(references, hypothesis):
85
+ references = {i: [k for k in x] for i, x in enumerate(references)}
86
+ hypothesis = {i: [x] for i, x in enumerate(hypothesis)}
87
+ return references, hypothesis
88
+
89
+
90
+ def pycoco_bleu_scores(hypothesis, references):
91
+ references, hypothesis = dataframes_to_coco_eval_format(references, hypothesis)
92
+ scorer = Bleu()
93
+ average_score, all_scores = scorer.compute_score(references, hypothesis)
94
+ # Note: average_score takes into account the tiny/small epsilons of the BLEU implementation;
+ # this is not reflected if you take the direct average of all_scores.
96
+ return average_score, all_scores
97
+
98
+
99
+ def pycoco_eval_scores(hypothesis, references, metric):
100
+ references, hypothesis = dataframes_to_coco_eval_format(references, hypothesis)
101
+ if metric == 'cider':
102
+ scorer = Cider()
103
+ elif metric == 'meteor':
104
+ scorer = Meteor()
105
+ elif metric == 'spice':
106
+ scorer = Spice()
107
+ elif metric == 'rouge':
108
+ scorer = Rouge()
109
+ else:
110
+ raise ValueError
111
+ avg, all_scores = scorer.compute_score(references, hypothesis)
112
+ return pd.Series(all_scores)
113
+
114
+
115
+ def apply_basic_evaluations(hypothesis, references, ref_emotions, txt2emo_clf, text2emo_vocab,
116
+ lcs_sample=None, train_utterances=None, nltk_bleu=False, smoothing_function=cc.method1,
117
+ device="cuda", random_seed=2021,
118
+ methods_to_do=ALL_METRICS):
119
+ """
120
+ :param hypothesis: (pd.Series of str) e.g., ['a man', 'a woman']
+ :param references: (pd.Series of lists of str) e.g., [['a man', 'a tall man'], ['a woman']]
+ :param ref_emotions: emotions corresponding to the references, list of lists of integers, e.g., [[0, 1], [1]]
+
+ :param text2emo_vocab: Vocabulary of the text-to-emotion classifier
+ :param txt2emo_clf: trained text-to-emotion classifier (used for the emotional-alignment metric)
+ :param device: torch device on which the classifier is run
+ :param smoothing_function: nltk smoothing function (only used if nltk_bleu is True)
+ :return: list of pd.Series, one per computed metric, carrying its mean/std
+ """
130
+ results = []
131
+ stat_track = ['mean', 'std']
132
+
133
+ ##
134
+ ## BLEU:1-4
135
+ ##
136
+ if 'bleu' in methods_to_do:
137
+ if nltk_bleu:
138
+ res = bleu_scores_via_nltk(hypothesis, references, smoothing_function=smoothing_function)
139
+ for metric, scores in res.items():
140
+ stats = scores.describe()[stat_track]
141
+ stats = pd.concat([pd.Series({'metric': metric}), stats])
142
+ results.append(stats)
143
+ else:
144
+ #py-coco based
145
+ b_scores = pycoco_bleu_scores(hypothesis, references)
146
+ for i in range(4):
147
+ metric = f'BLEU-{i + 1}'  # label BLEU-1..4, consistent with the nltk path
148
+ mu = b_scores[0][i]
149
+ # note the std below reflects the values without the 'tiny' adaptation (unlike the mu)
150
+ # avg_dummy = np.mean(b_scores[1][i]) # this is the average without the tiny adaptation.
151
+ std = np.std(b_scores[1][i])
152
+ stats = pd.concat([pd.Series({'metric': metric}), pd.Series({'mean': mu, 'std':std})])
153
+ results.append(stats)
154
+ print('BLEU: done')
155
+
156
+ ##
157
+ ## CIDER, SPICE, METEOR, ROUGE-L
158
+ ##
159
+ coco_requested = False
160
+ for metric in ['cider', 'spice', 'meteor', 'rouge']:
161
+ if metric in methods_to_do:
162
+ stats = pycoco_eval_scores(hypothesis, references, metric).describe()[stat_track]
163
+ stats = pd.concat([pd.Series({'metric': metric.upper()}), stats])
164
+ results.append(stats)
165
+ coco_requested = True
166
+ if coco_requested:
167
+ print('COCO-based-metrics: done')
168
+
169
+ ##
170
+ ## Emotional-Alignment
171
+ ##
172
+ if 'emo_alignment' in methods_to_do:
173
+ emo_accuracy, emo_xentropy = emotional_alignment(hypothesis, ref_emotions, text2emo_vocab, txt2emo_clf, device)
174
+ stats = pd.Series(emo_accuracy, dtype=float)
175
+ stats = stats.describe()[stat_track]
176
+ stats = pd.concat([pd.Series({'metric': 'Emo-Alignment-ACC'}), stats])
177
+ results.append(stats)
178
+
179
+ stats = pd.Series(emo_xentropy, dtype=float)
180
+ stats = stats.describe()[stat_track]
181
+ stats = pd.concat([pd.Series({'metric': 'Emo-Alignment-XENT'}), stats])
182
+ results.append(stats)
183
+ print('EMO-ALIGN: done')
184
+
185
+ ##
186
+ ## Metaphor-like expressions
187
+ ##
188
+ if 'metaphor' in methods_to_do:
189
+ met_mask = makes_metaphor_via_substring_matching(hypothesis)
190
+ stats = pd.Series(met_mask, dtype=float)
191
+ stats = stats.describe()[stat_track]
192
+ stats = pd.concat([pd.Series({'metric': 'Metaphors'}), stats])
193
+ results.append(stats)
194
+ print('Metaphor-like expressions: Done')
195
+
196
+ ##
197
+ ## Novelty via Longest Common Subsequence
198
+ ##
199
+ if 'lcs' in methods_to_do:
200
+ np.random.seed(random_seed) # since you will (normally) sub-sample
201
+ train_utters_tokenized = [u.split() for u in train_utterances]
202
+ uts = pd.Series(train_utters_tokenized).sample(lcs_sample[0]).to_list()
203
+ hypo_token = hypothesis.apply(lambda x: x.split()).sample(lcs_sample[1]).to_list()
204
+
205
+ max_lcs, mean_lcs, _ = captions_lcs_from_training_utterances(hypo_token, uts)
206
+ stats = pd.Series(max_lcs).describe()[stat_track]
207
+ stats = pd.concat([pd.Series({'metric': 'max-LCS'}), stats])
208
+ results.append(stats)
209
+ stats = pd.Series(mean_lcs).describe()[stat_track]
210
+ stats = pd.concat([pd.Series({'metric': 'mean-LCS'}), stats])
211
+ results.append(stats)
212
+ print('Novelty via Longest Common Subsequence: Done')
213
+
214
+ return results
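A hedged usage sketch (illustrative only, not part of the file above); it assumes the functions defined above are in scope and skips the metrics that need extra models or data:

import pandas as pd

hypothesis = pd.Series(['a man stands alone in the dark'])             # one generated caption per image
references = pd.Series([['a man in a dark room', 'a lonely figure']])  # the human captions of that image
ref_emotions = pd.Series([[4, 4, 6]])                                  # their (hypothetical) emotion ids in 0..8

# Only BLEU and the metaphor check are requested here, so the text-to-emotion
# classifier/vocabulary (needed for 'emo_alignment') can be left as None.
results = apply_basic_evaluations(hypothesis, references, ref_emotions,
                                  txt2emo_clf=None, text2emo_vocab=None,
                                  methods_to_do={'bleu', 'metaphor'})
print(pd.concat(results, axis=1).T)   # one row per metric with its mean/std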
imageprocessing/artemis/artemis/in_out/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
4
+ """
imageprocessing/artemis/artemis/in_out/arguments.py ADDED
@@ -0,0 +1,199 @@
1
+ """
2
+ Argument handling.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at early 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+ import argparse
9
+ import json
10
+ import pprint
11
+ import pathlib
12
+ import os.path as osp
13
+ from datetime import datetime
14
+ from .basics import create_dir
15
+
16
+
17
+ def str2bool(v):
18
+ """ boolean values for argparse
19
+ """
20
+ if isinstance(v, bool):
21
+ return v
22
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
23
+ return True
24
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
25
+ return False
26
+ else:
27
+ raise argparse.ArgumentTypeError('Boolean value expected.')
28
+
29
+
30
+ def parse_train_speaker_arguments(notebook_options=None, save_args=False):
31
+ """ Default/Main arguments for training a SAT neural-speaker (via ArtEmis).
32
+ :param notebook_options: list, if you are using this via a jupyter notebook
33
+ :return: argparse.ArgumentParser
34
+ """
35
+
36
+ parser = argparse.ArgumentParser(description='training-a-neural-speaker')
37
+
38
+ ## Non-optional arguments
39
+ parser.add_argument('-log-dir', type=str, required=True, help='where to save training-progress, model, etc.')
40
+ parser.add_argument('-data-dir', type=str, required=True, help='path to ArtEmis/COCO preprocessed data')
41
+ parser.add_argument('-img-dir', type=str, required=True, help='path to top image (e.g., WikiArt) dir')
42
+
43
+ # Model parameters
44
+ parser.add_argument('--img-dim', type=int, default=256, help='images will be resized to a square with this many pixels per side')
45
+ parser.add_argument('--lanczos', type=str2bool, default=True, help='apply lanczos resampling when resizing')
46
+ parser.add_argument('--atn-spatial-img-size', type=int, help='optional; if provided, the spatial output of the '
+ 'visual encoder is average-pooled to this many x this many "pixels".')
48
+
49
+ parser.add_argument('--atn-cover-img-alpha', type=float, default=1, help='weight encouraging the attention to cover the '
+ 'entire image when marginalized over the tokens')
51
+ parser.add_argument('--attention-dim', type=int, default=512)
52
+ parser.add_argument('--rnn-hidden-dim', type=int, default=512)
53
+ parser.add_argument('--word-embedding-dim', type=int, default=128)
54
+ parser.add_argument('--vis-encoder', type=str, default='resnet34', choices=['resnet18',
55
+ 'resnet34',
56
+ 'resnet50',
57
+ 'resnet101'], help='visual-encoder backbone')
58
+ parser.add_argument('--dropout-rate', type=float, default=0.1)
59
+ parser.add_argument('--teacher-forcing-ratio', type=int, default=1)
60
+
61
+ parser.add_argument('--use-emo-grounding', type=str2bool, default=False)
62
+ parser.add_argument('--emo-grounding-dims', nargs=2, type=int, default=[9, 9], help='[input] number of emotions x the '
+ 'size of the projection layer that '
+ 'will be used to transform the one-hot emotion '
+ 'to a grounding vector.')
66
+
67
+
68
+ # Training parameters
69
+ parser.add_argument('--resume-path', type=str, help='model-path to resume from')
70
+ parser.add_argument('--fine-tune-data', type=str)
71
+ parser.add_argument('--batch-size', type=int, default=128)
72
+ parser.add_argument('--num-workers', type=int, default=6)
73
+ parser.add_argument('--gpu', type=str, default='0')
74
+ parser.add_argument('--encoder-lr', type=float, default=1e-4)
75
+ parser.add_argument('--decoder-lr', type=float, default=5e-4)
76
+ parser.add_argument('--max-train-epochs', type=int, default=50)
77
+ parser.add_argument('--train-patience', type=int, default=5, help='maximum consecutive epochs where the validation '
78
+ 'Neg-LL does not improve before we stop training.')
79
+ parser.add_argument('--lr-patience', type=int, default=2, help='maximum number of epochs to wait while the validation '
+ 'Neg-LL does not improve before we reduce the '
+ 'learning-rate.')
82
+ parser.add_argument('--save-each-epoch', type=str2bool, default=True, help='save the model at each epoch; else only save '
+ 'the one that achieved the minimal '
+ 'Negative-Log-Likelihood on the validation split.')
85
+
86
+ # Misc
87
+ parser.add_argument('--dataset', type=str, default='artemis')
88
+ parser.add_argument('--random-seed', type=int, default=2021)
89
+ parser.add_argument('--debug', default=False, type=str2bool)
90
+ parser.add_argument('--use-timestamp', default=True, type=str2bool)
91
+
92
+ # Parse arguments
93
+ if notebook_options is not None: # Pass options directly
94
+ args = parser.parse_args(notebook_options)
95
+ else:
96
+ args = parser.parse_args() # Read from command line.
97
+
98
+ if args.use_timestamp:
99
+ timestamp = datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
100
+ args.log_dir = create_dir(osp.join(args.log_dir, timestamp))
101
+
102
+ # pprint them
103
+ args_string = pprint.pformat(vars(args))
104
+ print(args_string)
105
+
106
+ if save_args:
107
+ out = osp.join(args.log_dir, 'config.json.txt')
108
+ with open(out, 'w') as f_out:
109
+ json.dump(vars(args), f_out, indent=4, sort_keys=True)
110
+
111
+ return args
112
+
113
+
114
+ def parse_test_speaker_arguments(notebook_options=None):
115
+ """ Parameters for testing (sampling) a neural-speaker.
116
+ :param notebook_options: list, if you are using this via a jupyter notebook
117
+ :return: argparse.ArgumentParser
118
+ """
119
+ parser = argparse.ArgumentParser(description='testing-a-neural-speaker')
120
+
121
+ ## Basic required arguments
122
+ parser.add_argument('-speaker-saved-args', type=str, required=True, help='config.json.txt file for saved speaker model (output of train_speaker.py)')
123
+ parser.add_argument('-speaker-checkpoint', type=str, required=True, help='saved model checkpoint ("best_model.pt" (output of train_speaker.py)')
124
+ parser.add_argument('-img-dir', type=str, required=True, help='path to top image dir (typically that\'s the WikiArt top-dir)')
125
+ parser.add_argument('-out-file', type=str, required=True, help='file to save the sampled utterances, their attention etc. as a pkl')
126
+
127
+ ## Basic optional arguments
128
+ parser.add_argument('--split', type=str, default='test', choices=['train', 'test', 'val', 'rest'], help='the split of the dataset you want to annotate; '
+ 'the code will load the dataset based on the dir-location marked '
+ 'in the input config.json.txt file. '
+ 'This param has no effect if a custom-data-csv is passed.')
132
+
133
+ parser.add_argument('--custom-data-csv', type=str, help='if you want to annotate your own set of images. Please '
134
+ 'see the code for what this csv should look like. ')
135
+
136
+ parser.add_argument('--subsample-data', type=int, default=-1, help='if not -1, will subsample the underlying dataset '
+ 'and annotate only this many images.')
138
+
139
+
140
+ ## Optional arguments controlling the generation/sampling process
141
+ parser.add_argument('--max-utterance-len', type=int, help='maximum allowed length for any sampled utterance. If not given, '
+ 'the maximum found in the underlying dataset split will be used. '
+ 'For the official ArtEmis split for deep-nets that is 30 tokens.')
144
+
145
+ parser.add_argument('--drop-unk', type=str2bool, default=True, help='if True, do not create samples that contain the '
146
+ 'unknown token')
147
+
148
+ parser.add_argument('--drop-bigrams', type=str2bool, default=True, help='if True, prevent the same bigram from occurring '
+ 'twice in a sampled utterance')
150
+
151
+
152
+ ## To pass multiple configurations for the sampler at once, i.e., so you can try many
+ ## sampling temperatures, sampling methods (beam-search vs. top-k), beam-sizes (or more),
+ ## you can provide a simple .json that specifies the values you want to try.
+ ## See >> data/speaker_sampling_configs << for examples.
+ ## Note: if you pass nothing, >> data/speaker_sampling_configs/selected_hyper_params.json.txt << will be used;
+ ## these are the parameters used in the paper.
158
+ parser.add_argument('--sampling-config-file', type=str, help='Note: if max-len, drop-unk '
+ 'and drop-bigrams are not specified in the json, '
+ 'the directly provided values of these parameters '
+ 'will be used.')
162
+
163
+
164
+ parser.add_argument('--random-seed', type=int, default=2021, help='if -1 it won\'t have an effect; else the sampler '
165
+ 'becomes deterministic')
166
+
167
+ parser.add_argument('--img2emo-checkpoint', type=str, help='checkpoint file of an image-to-emotion classifier that will '
+ 'be used to sample the grounding emotion consumed '
+ 'by the speaker, if you pass an emotionally-grounded speaker. '
+ 'Note: if you use an emo-grounded speaker this argument '
+ 'becomes required, unless you are using your own custom-data-csv '
+ 'where you can specify the grounding emotion manually.')
173
+
174
+ parser.add_argument('--gpu', type=str, default='0')
175
+ parser.add_argument('--n-workers', type=int)
176
+
177
+ parser.add_argument('--compute-nll', type=str2bool, default=False, help='Compute the negative-log-likelihood of '
+ 'the dataset under the saved speaker model.')
179
+
180
+
181
+
182
+ # Parse arguments
183
+ if notebook_options is not None: # Pass options directly
184
+ args = parser.parse_args(notebook_options)
185
+ else:
186
+ args = parser.parse_args() # Read from command line.
187
+
188
+ # load "default"
189
+ if args.sampling_config_file is None:
190
+ up_dir = osp.split(pathlib.Path(__file__).parent.absolute())[0]
191
+ args.sampling_config_file = osp.join(up_dir, 'data/speaker_sampling_configs/selected_hyper_params.json.txt')
192
+
193
+ # pprint them
194
+ print('\nParameters Specified:')
195
+ args_string = pprint.pformat(vars(args))
196
+ print(args_string)
197
+ print('\n')
198
+
199
+ return args
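An illustrative way of calling the test-time parser from a notebook (a sketch, not part of the file above); all paths are placeholders:

opts = ['-speaker-saved-args', '/path/to/config.json.txt',
        '-speaker-checkpoint', '/path/to/best_model.pt',
        '-img-dir', '/path/to/wikiart',
        '-out-file', '/tmp/samples.pkl',
        '--split', 'val',
        '--drop-bigrams', 'True']
args = parse_test_speaker_arguments(notebook_options=opts)
# args.sampling_config_file now points to the bundled selected_hyper_params.json.txt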
imageprocessing/artemis/artemis/in_out/basics.py ADDED
@@ -0,0 +1,230 @@
1
+ """
2
+ Basic (simple) I/O Utilities.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2019, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import re
10
+ import os
11
+ import json
12
+ import sys
13
+ import numpy as np
14
+ import pandas as pd
15
+ import os.path as osp
16
+ import pprint
17
+ import logging
18
+ from argparse import ArgumentParser
19
+ from IPython.display import display
20
+ from PIL import Image
21
+ from six.moves import cPickle, range
22
+ from ..emotions import ARTEMIS_EMOTIONS
23
+
24
+
25
+ def files_in_subdirs(top_dir, search_pattern):
26
+ join = osp.join
27
+ regex = re.compile(search_pattern)
28
+ for path, _, files in os.walk(top_dir):
29
+ for name in files:
30
+ full_name = join(path, name)
31
+ if regex.search(full_name):
32
+ yield full_name
33
+
34
+
35
+ def create_dir(dir_path):
36
+ """ Creates a directory (or nested directories) if they don't exist.
37
+ """
38
+ if not osp.exists(dir_path):
39
+ os.makedirs(dir_path)
40
+
41
+ return dir_path
42
+
43
+
44
+ def pickle_data(file_name, *args):
45
+ """Using (c)Pickle to save multiple python objects in a single file.
46
+ """
47
+ out_file = open(file_name, 'wb')
48
+ cPickle.dump(len(args), out_file, protocol=2)
49
+ for item in args:
50
+ cPickle.dump(item, out_file, protocol=2)
51
+ out_file.close()
52
+
53
+
54
+ def unpickle_data(file_name, python2_to_3=False):
55
+ """ Restore data previously saved with pickle_data().
56
+ :param file_name: file holding the pickled data.
57
+ :param python2_to_3: (boolean), if True, pickle happened under python2x, unpickling under python3x.
58
+ :return: an generator over the un-pickled items.
59
+ Note, about implementing the python2_to_3 see
60
+ https://stackoverflow.com/questions/28218466/unpickling-a-python-2-object-with-python-3
61
+ """
62
+ in_file = open(file_name, 'rb')
63
+ if python2_to_3:
64
+ size = cPickle.load(in_file, encoding='latin1')
65
+ else:
66
+ size = cPickle.load(in_file)
67
+
68
+ for _ in range(size):
69
+ if python2_to_3:
70
+ yield cPickle.load(in_file, encoding='latin1')
71
+ else:
72
+ yield cPickle.load(in_file)
73
+ in_file.close()
74
+
75
+
76
+ def load_raw_amt_csv_hit_responses(top_csv_folder, verbose=True, only_approved=True,
77
+ keep_cols=None, drop_rorschach=True, has_emotions=True):
78
+ """
79
+ :param top_csv_folder:
80
+ :param verbose:
81
+ :param only_approved:
82
+ :param keep_cols:
83
+ :param drop_rorschach:
84
+ :param has_emotions: set to False to load wiki-art annotations that are objective (OLA-dataset)
85
+ :return:
86
+ """
87
+
88
+ all_collected_csv = [f for f in files_in_subdirs(top_csv_folder, '.csv$')]
89
+
90
+ if verbose:
91
+ print('{} files loaded'.format(len(all_collected_csv)))
92
+
93
+ all_csv_names = [osp.basename(f) for f in all_collected_csv]
94
+ assert len(all_csv_names) == len(set(all_csv_names)) # unique names
95
+
96
+ all_dfs = []
97
+ for f in all_collected_csv: # load each .csv
98
+ df = pd.read_csv(f)
99
+ # print(df['AssignmentStatus'].unique())
100
+ in_submission_mode = (df['AssignmentStatus'] == 'Submitted').sum()
101
+ if in_submission_mode > 0:
102
+ print('In {}, {} examples are still in submitted mode.'.format(osp.basename(f), in_submission_mode))
103
+ if only_approved:
104
+ df = df[df['AssignmentStatus'] == 'Approved']
105
+ all_dfs.append(df)
106
+ df = pd.concat(all_dfs)
107
+
108
+ # Rename columns
109
+ new_cols = [c.replace('choice.', '') for c in [c.replace('Answer.', '') for c in df.columns]]
110
+ new_cols = [c.lower() for c in new_cols]
111
+ df.columns = new_cols
112
+ df = df.reset_index()
113
+
114
+ # Keep ML-related columns
115
+ ml_related_cols = ['workerid', 'input.image_url', 'utterance']
116
+ # Add potential extras requested at the input
117
+ if keep_cols is not None:
118
+ ml_related_cols += keep_cols
119
+
120
+ if has_emotions:
121
+ _, x = np.where(df[ARTEMIS_EMOTIONS])
122
+ emotion_chosen = pd.Series(np.array(ARTEMIS_EMOTIONS)[x], name='emotion')
123
+ df = pd.concat([df[ml_related_cols], emotion_chosen], axis=1)
124
+ else:
125
+ df = df[ml_related_cols]
126
+
127
+ # Derivative columns
128
+ def url_to_painting_name(x):
129
+ tokens = x.split('/')
130
+ return tokens[-1][:-len('.jpg')]
131
+
132
+ def url_to_art_style(x):
133
+ tokens = x.split('/')
134
+ return tokens[-2]
135
+
136
+ df['painting'] = df['input.image_url'].apply(url_to_painting_name)
137
+ df['art_style'] = df['input.image_url'].apply(url_to_art_style)
138
+ df = df.drop(['input.image_url'], axis=1)
139
+
140
+ if drop_rorschach:
141
+ df = df[df['art_style'] != 'test']
142
+ df.reset_index(inplace=True, drop=True)
143
+
144
+ if verbose:
145
+ print('Loading responses:', len(df))
146
+ print('Column Names:', [c for c in df.columns])
147
+
148
+ return df
149
+
150
+
151
+ def splitall(path):
152
+ """
153
+ Examples:
154
+ splitall('a/b/c') -> ['a', 'b', 'c']
155
+ splitall('/a/b/c/') -> ['/', 'a', 'b', 'c', '']
156
+
157
+ NOTE: https://www.oreilly.com/library/view/python-cookbook/0596001673/ch04s16.html
158
+ """
159
+ allparts = []
160
+ while 1:
161
+ parts = osp.split(path)
162
+ if parts[0] == path: # Sentinel for absolute paths.
163
+ allparts.insert(0, parts[0])
164
+ break
165
+ elif parts[1] == path: # Sentinel for relative paths.
166
+ allparts.insert(0, parts[1])
167
+ break
168
+ else:
169
+ path = parts[0]
170
+ allparts.insert(0, parts[1])
171
+ return allparts
172
+
173
+
174
+ def wikiart_file_name_to_style_and_painting(filename):
175
+ """
176
+ Assumes a filename of a painting of wiki-art.
177
+ :param filename:
178
+ :return:
179
+ """
180
+ s = splitall(filename)
181
+ return s[-2], s[-1][:-len('.jpg')]
182
+
183
+
184
+ def show_random_captions(df, top_img_dir):
185
+ painting, art_style = df.sample(1)[['painting', 'art_style']].iloc[0]
186
+ print(art_style, painting)
187
+ display(Image.open(osp.join(top_img_dir, art_style, painting + '.jpg')))
188
+ s = df[(df.painting == painting) & (df.art_style == art_style)]
189
+ for e, u in zip(s['emotion'], s['utterance']):
190
+ print('{}:\t{}'.format(e.upper(), u))
191
+
192
+
193
+ def read_saved_args(config_file, override_args=None, verbose=False):
194
+ """
195
+ :param config_file: json file containing arguments
196
+ :param override_args: dict e.g., {'gpu': '0'}
197
+ :param verbose:
198
+ :return:
199
+ """
200
+ parser = ArgumentParser()
201
+ args = parser.parse_args([])
202
+ with open(config_file, 'r') as f_in:
203
+ args.__dict__ = json.load(f_in)
204
+
205
+ if override_args is not None:
206
+ for key, val in override_args.items():
207
+ args.__setattr__(key, val)
208
+
209
+ if verbose:
210
+ args_string = pprint.pformat(vars(args))
211
+ print(args_string)
212
+
213
+ return args
214
+
215
+
216
+ def create_logger(log_dir, std_out=True):
217
+ logger = logging.getLogger()
218
+ logger.setLevel(logging.INFO)
219
+ formatter = logging.Formatter('%(asctime)s - %(message)s')
220
+
221
+ # Add logging to file handler
222
+ file_handler = logging.FileHandler(osp.join(log_dir, 'log.txt'))
223
+ file_handler.setLevel(logging.INFO)
224
+ file_handler.setFormatter(formatter)
225
+ logger.addHandler(file_handler)
226
+
227
+ # Add stdout to also print statements there
228
+ if std_out:
229
+ logger.addHandler(logging.StreamHandler(sys.stdout))
230
+ return logger
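An illustrative round-trip for pickle_data/unpickle_data (a sketch, not part of the file above); it assumes the package imports as artemis and that /tmp is writable:

from artemis.in_out.basics import pickle_data, unpickle_data

stats = {'n_annotations': 5}
tokens = ['a', 'calm', 'sea']
pickle_data('/tmp/demo.pkl', stats, tokens)                    # stores the object count, then each object
loaded_stats, loaded_tokens = unpickle_data('/tmp/demo.pkl')   # the generator yields them back in order
assert loaded_stats == stats and loaded_tokens == tokens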
imageprocessing/artemis/artemis/in_out/cleaning.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ Data Cleaning Utilities.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import pathlib
10
+ import os.path as osp
11
+ from tqdm import tqdm_notebook as tqdm
12
+ from ..in_out.basics import unpickle_data, splitall
13
+
14
+
15
+ def load_duplicate_paintings_of_wikiart(duplicates_pkl_file=None, verbose=True):
16
+ """ Return a list containing wikiArt paintings that are double-listed.
17
+ :param duplicates_pkl_file: (opt) pkl file containing the duplicate groups.
18
+ :return: (list of list) each sublist contains tuples like (art_style, painting) that are duplicates.
19
+
20
+ Note. If duplicates_pkl_file==None, the stored inside the repo .pkl file will be used. The duplicates indicated in
21
+ the .pkl were found by a combination of running the `fdupes' program and a manual check on Nearest-Neighbors of a
22
+ pretrained ResNet on ImageNet that had very small distances.
23
+ """
24
+ if duplicates_pkl_file is None:
25
+ up_dir = osp.split(pathlib.Path(__file__).parent.absolute())[0]
26
+ duplicates_pkl_file = osp.join(up_dir, 'data/wiki_art_duplicate_paintings.pkl')
27
+ # Note. This file contains the duplicate groups found as described in the docstring above.
28
+ duplicates_as_list = next(unpickle_data(duplicates_pkl_file))
29
+ if verbose:
30
+ print("Using {} groups of paintings that are visually identical (duplicates).".format(len(duplicates_as_list)))
31
+ return duplicates_as_list
32
+
33
+
34
+ def drop_duplicate_paintings(wiki_art_image_files, duplicate_groups=None):
35
+ """
36
+ :param wiki_art_image_files: (list) with filenames of the form xx/xx/art_style/painting.jpg
37
+ :param duplicate_groups: list of list, each item is a collection of (art_style, painting) tuples that are duplicates.
38
+ :return: a new list where from each duplicate group only one (the first) painting is kept.
39
+ """
40
+ if duplicate_groups is None:
41
+ duplicate_groups = load_duplicate_paintings_of_wikiart()
42
+
43
+ drop_these = set()
44
+ for dup_g in duplicate_groups:
45
+ drop_these.update(dup_g[1:]) # drop all but first
46
+
47
+ clean_img_files = []
48
+ dropped = 0
49
+ for img_file in wiki_art_image_files:
50
+ tokens = splitall(img_file)
51
+ painting = tokens[-1][:-len('.jpg')]
52
+ art_style = tokens[-2]
53
+ key = (art_style, painting)
54
+ if key in drop_these:
55
+ dropped += 1
56
+ else:
57
+ clean_img_files.append(img_file)
58
+ print('Dropping {} from {} paintings that are duplicates of one painting that is kept.'.format(dropped,
59
+ len(wiki_art_image_files)))
60
+ return clean_img_files
61
+
62
+
63
+ def merge_artemis_annotations_on_wikiart_duplicates(dataset_df, duplicate_groups=None, verbose=True):
64
+ """
65
+ :param dataset_df:
66
+ :param duplicate_groups:
67
+ :return:
68
+ """
69
+
70
+ if duplicate_groups is None:
71
+ duplicate_groups = load_duplicate_paintings_of_wikiart()
72
+
73
+ n_merged_stimuli = 0
74
+ for dup_g in tqdm(duplicate_groups):
75
+ keep_this = dup_g[0]
76
+ drop_these = dup_g[1:] # drop all but first
77
+ for stimulus in drop_these:
78
+ mask = (dataset_df['art_style'] == stimulus[0]) & (dataset_df['painting'] == stimulus[1])
79
+ n_merged_stimuli += sum(mask)
80
+ dataset_df.loc[mask, ['art_style']] = keep_this[0]
81
+ dataset_df.loc[mask, ['painting']] = keep_this[1]
82
+ if verbose:
83
+ print('{} stimuli were merged.'.format(n_merged_stimuli))
84
+ return dataset_df
85
+
86
+
87
+
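A hedged sketch of combining the de-duplication helpers (not part of the file above); the WikiArt directory is a placeholder:

from artemis.in_out.basics import files_in_subdirs
from artemis.in_out.cleaning import load_duplicate_paintings_of_wikiart, drop_duplicate_paintings

wiki_art_files = list(files_in_subdirs('/path/to/wikiart', '.jpg$'))
groups = load_duplicate_paintings_of_wikiart()          # uses the .pkl bundled with the repo
unique_files = drop_duplicate_paintings(wiki_art_files, duplicate_groups=groups)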
imageprocessing/artemis/artemis/in_out/coco.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ COCO related I/O operations
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 10/18/20, for Python 3.x
6
+ Copyright (c) 2020 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import os.path as osp
10
+
11
+ def coco_image_name_to_image_file(image_name, top_img_dir, year=2014):
12
+ if image_name.startswith('COCO_val'):
13
+ return osp.join(top_img_dir, 'val' + str(year), image_name)
14
+ elif image_name.startswith('COCO_train'):
15
+ return osp.join(top_img_dir, 'train' + str(year), image_name)
16
+ else:
17
+ raise ValueError
18
+
19
+
20
+ def karpathize(df):
21
+ ## Per Karpathy's tweet: restval is actually train.
22
+ df.loc[df.split == 'restval', 'split'] = 'train'  # avoid chained assignment; modifies df in place
23
+
24
+
25
+ def prepare_coco_dataframe_for_training(df, top_img_dir):
26
+ # assign file-names to each image
27
+ df = df.assign(image_files = df.image.apply(lambda x: coco_image_name_to_image_file(x, top_img_dir)))
28
+ # fix splits
29
+ karpathize(df)
30
+ return df
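Illustrative only (a sketch, not part of the file above): chaining the helpers on a preprocessed COCO dataframe; the csv path and image directory are placeholders:

import pandas as pd

df = pd.read_csv('/path/to/coco_preprocessed.csv')        # must carry 'image' and 'split' columns
df = prepare_coco_dataframe_for_training(df, '/path/to/coco/images')
print(df.split.value_counts())                            # former 'restval' rows now count as 'train'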
imageprocessing/artemis/artemis/in_out/datasets.py ADDED
@@ -0,0 +1,224 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Originally in 2020, for Python 3.x
4
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
5
+ """
6
+
7
+ import torch
8
+ import numpy as np
9
+ import pandas as pd
10
+ from PIL import Image
11
+ from torch.utils.data import Dataset, DataLoader
12
+ from ..evaluation.emotion_alignment import image_to_emotion
13
+ from ..emotions import emotion_to_int
14
+
15
+
16
+ class AffectiveCaptionDataset(Dataset):
17
+ """ Basically, an image, with a caption, and an indicated emotion.
18
+ """
19
+ def __init__(self, image_files, tokens, emotions, n_emotions=9, img_transform=None, one_hot_emo=True):
20
+ super(AffectiveCaptionDataset, self).__init__()
21
+ self.image_files = image_files
22
+ self.tokens = tokens
23
+ self.emotions = emotions
24
+ self.n_emotions = n_emotions
25
+ self.img_transform = img_transform
26
+ self.one_hot_emo = one_hot_emo
27
+
28
+ def __getitem__(self, index):
29
+ text = np.array(self.tokens[index]).astype(np.int64)
30
+
31
+ if self.image_files is not None:
32
+ img = Image.open(self.image_files[index])
33
+
34
+ if img.mode != 'RGB':
35
+ img = img.convert('RGB')
36
+
37
+ if self.img_transform is not None:
38
+ img = self.img_transform(img)
39
+ else:
40
+ img = []
41
+
42
+ if self.n_emotions > 0:
43
+ if self.one_hot_emo:
44
+ emotion = np.zeros(self.n_emotions, dtype=np.float32)
45
+ emotion[self.emotions[index]] = 1
46
+ else:
47
+ emotion = self.emotions[index]
48
+ else:
49
+ emotion = []
50
+
51
+ res = {'image': img, 'emotion': emotion, 'tokens': text, 'index': index}
52
+ return res
53
+
54
+ def __len__(self):
55
+ return len(self.tokens)
56
+
57
+
58
+ class ImageClassificationDataset(Dataset):
59
+ def __init__(self, image_files, labels=None, img_transform=None, rgb_only=True):
60
+ super(ImageClassificationDataset, self).__init__()
61
+ self.image_files = image_files
62
+ self.labels = labels
63
+ self.img_transform = img_transform
64
+ self.rgb_only = rgb_only
65
+
66
+ def __getitem__(self, index):
67
+ img = Image.open(self.image_files[index])
68
+
69
+ if self.rgb_only and img.mode is not 'RGB':
70
+ img = img.convert('RGB')
71
+
72
+ if self.img_transform is not None:
73
+ img = self.img_transform(img)
74
+
75
+ label = []
76
+ if self.labels is not None:
77
+ label = self.labels[index]
78
+
79
+ res = {'image': img, 'label': label, 'index': index}
80
+ return res
81
+
82
+ def __len__(self):
83
+ return len(self.image_files)
84
+
85
+
86
+ def sub_sample_dataloader(dataloader, sample_size, seed=None, shuffle=False):
87
+ """ Given any torch dataloader create a sub-sampled version of it.
88
+ :param dataloader:
89
+ :param sample_size:
90
+ :param seed:
91
+ :param shuffle:
92
+ :return: dataloader of Subset
93
+ """
94
+
95
+ dataset = dataloader.dataset
96
+ n_total = len(dataset)
97
+
98
+ if sample_size > n_total:
99
+ raise ValueError
100
+
101
+ if seed is not None:
102
+ torch.manual_seed(seed)
103
+
104
+ sb_dataset = torch.utils.data.random_split(dataset, [sample_size, n_total-sample_size])[0]
105
+ bsize = min(dataloader.batch_size, sample_size)
106
+ sample_loader = torch.utils.data.DataLoader(dataset=sb_dataset,
107
+ batch_size=bsize,
108
+ shuffle=shuffle,
109
+ num_workers=dataloader.num_workers)
110
+ return sample_loader
111
+
112
+
113
+
114
+ def sub_index_affective_dataloader(affective_dataloader, indices, shuffle=False):
115
+ """ Given a torch dataloader and a sequence of integers; extract the corresponding items of the
116
+ carried dataset on the specific indices and make a new dataloader with them.
117
+ :param affective_dataloader: torch.utils.data.DataLoader for AffectiveCaptionDataset
118
+ :param indices: sequence of integers indexing the underlying dataset (dataframe).
119
+ :param shuffle: shuffle the data of the resulting dataloader
120
+ :return: dataloader of AffectiveCaptionDataset
121
+ """
122
+ dataset = affective_dataloader.dataset
123
+ r_img_files = dataset.image_files.iloc[indices].copy()
124
+ r_tokens = dataset.tokens.iloc[indices].copy()
125
+ r_emotions = dataset.emotions.iloc[indices].copy()
126
+
127
+ r_img_files.reset_index(inplace=True, drop=True)
128
+ r_tokens.reset_index(inplace=True, drop=True)
129
+ r_emotions.reset_index(inplace=True, drop=True)
130
+
131
+ r_dset = AffectiveCaptionDataset(image_files=r_img_files, tokens=r_tokens,
132
+ emotions=r_emotions, img_transform=dataset.img_transform)
133
+
134
+ batch_size = min(len(indices), affective_dataloader.batch_size)
135
+
136
+ r_loader = torch.utils.data.DataLoader(r_dset,
137
+ shuffle=shuffle,
138
+ batch_size=batch_size,
139
+ num_workers=affective_dataloader.num_workers)
140
+ return r_loader
141
+
142
+
143
+ def group_annotations_per_image(affective_dataset):
144
+ """ Group the annotations per image.
145
+ :param affective_dataset: an AffectiveCaptionDataset
146
+ :return: for each image its tokens/emotions as pandas Dataframes
147
+ """
148
+ df = pd.concat([affective_dataset.image_files, affective_dataset.tokens, affective_dataset.emotions], axis=1)
149
+ tokens_grouped = df.groupby('image_files')['tokens_encoded'].apply(list).reset_index(name='tokens_encoded')
150
+ emotion_grouped = df.groupby('image_files')['emotion_label'].apply(list).reset_index(name='emotion')
151
+ assert all(tokens_grouped['image_files'] == emotion_grouped['image_files'])
152
+ return tokens_grouped['image_files'], tokens_grouped, emotion_grouped
153
+
154
+
155
+ def default_grounding_dataset_from_affective_loader(loader, img2emo_clf=None, device=None, n_workers=None):
156
+ """
157
+ Convenience function. Given a loader carrying an affective dataset, make a new loader only w.r.t.
158
+ unique images of the dataset, & optionally add to each image the emotion predicted by the img2emo_clf.
159
+ The new loader can be used to sample utterances over the unique images.
160
+ :param loader:
161
+ :param img2emo_clf:
162
+ :param device:
163
+ :return:
164
+ """
165
+ affective_dataset = loader.dataset
166
+ img_files, tokens, emotions = group_annotations_per_image(affective_dataset)
167
+
168
+ img_trans = affective_dataset.img_transform
169
+ batch_size = loader.batch_size
170
+
171
+ if n_workers is None:
172
+ n_workers = loader.num_workers
173
+
174
+ dummy = pd.Series(np.ones(len(img_files), dtype=int) * -1)
175
+
176
+ # possibly predict grounding emotions
177
+ if img2emo_clf is not None:
178
+ temp_dataset = ImageClassificationDataset(image_files=img_files,
179
+ img_transform=img_trans)
180
+ img_dataloader = DataLoader(temp_dataset, batch_size, num_workers=n_workers)
181
+ emo_pred_distribution = image_to_emotion(img2emo_clf, img_dataloader, device)
182
+
183
+ grounding_emo = pd.Series(emo_pred_distribution.argmax(-1).tolist()) # use maximizer of emotions.
184
+ else:
185
+ grounding_emo = dummy
186
+
187
+ new_dataset = AffectiveCaptionDataset(img_files, tokens=dummy, emotions=grounding_emo,
188
+ img_transform=img_trans)
189
+
190
+ new_loader = DataLoader(dataset=new_dataset, batch_size=batch_size, num_workers=n_workers)
191
+ return new_loader
192
+
193
+
194
+ def custom_grounding_dataset_similar_to_affective_loader(grounding_data_csv, loader, n_workers=None):
195
+ """
196
+ Convenience function. Given a csv indicating (grounding) images on the hard-drive and a loader carrying an affective
197
+ dataset, make a new loader with the csv images using the same configuration (e.g., img_transform) as the loader.
198
+ :param grounding_data_csv: (csv filename)
199
+ - has to have one column named "image_file" that corresponds to the file-names of the images.
200
+ - (optionally) can have also a "grounding_emotion" column with values like "contentment"
201
+ :param loader:
202
+ :return:
203
+ """
204
+ df = pd.read_csv(grounding_data_csv)
205
+ image_files = df['image_file']
206
+ dummy = pd.Series(np.ones(len(image_files), dtype=int) * -1)
207
+ if 'grounding_emotion' in df.columns:
208
+ emotions = df.grounding_emotion.apply(emotion_to_int)
209
+ else:
210
+ emotions = dummy
211
+
212
+ standard_dset = loader.dataset
213
+ custom_dataset = AffectiveCaptionDataset(image_files, dummy, emotions=emotions,
214
+ n_emotions=standard_dset.n_emotions,
215
+ img_transform=standard_dset.img_transform,
216
+ one_hot_emo=standard_dset.one_hot_emo)
217
+ if n_workers is None:
218
+ n_workers = loader.num_workers
219
+
220
+ custom_data_loader = torch.utils.data.DataLoader(dataset=custom_dataset,
221
+ batch_size=min(loader.batch_size, len(custom_dataset)),
222
+ num_workers=n_workers)
223
+ return custom_data_loader
224
+
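A hedged sketch (not part of the file above) of the minimal csv that custom_grounding_dataset_similar_to_affective_loader expects; 'loader' is assumed to be an existing DataLoader over an AffectiveCaptionDataset:

import pandas as pd

# one image per row, optionally with a grounding emotion (e.g., 'awe', 'sadness')
pd.DataFrame({'image_file': ['/path/to/painting_1.jpg', '/path/to/painting_2.jpg'],
              'grounding_emotion': ['awe', 'sadness']}).to_csv('/tmp/custom_grounding.csv', index=False)

custom_loader = custom_grounding_dataset_similar_to_affective_loader('/tmp/custom_grounding.csv', loader)
# each batch carries the keys 'image', 'emotion', 'tokens' and 'index'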
imageprocessing/artemis/artemis/in_out/neural_net_oriented.py ADDED
@@ -0,0 +1,336 @@
1
+ """
2
+ I/O routines directly related to torch-based neural-models & their (training etc.) dataset processing.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 10/2/20, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import random
11
+ import warnings
12
+ import numpy as np
13
+ import pandas as pd
14
+ import os.path as osp
15
+ import multiprocessing as mp
16
+ import torchvision.transforms as transforms
17
+
18
+ from ast import literal_eval
19
+ from PIL import Image
20
+
21
+ from .basics import read_saved_args
22
+ from .datasets import AffectiveCaptionDataset, ImageClassificationDataset
23
+ from ..utils.vocabulary import Vocabulary
24
+ from ..neural_models.show_attend_tell import describe_model as describe_sat
25
+
26
+
27
+ image_net_mean = [0.485, 0.456, 0.406]
28
+ image_net_std = [0.229, 0.224, 0.225]
29
+
30
+
31
+ def max_io_workers():
32
+ """Return the maximum number of I/O workers to use (all CPUs of the machine minus one, at least one)."""
33
+ return max(mp.cpu_count() - 1, 1)
34
+
35
+
36
+ def image_transformation(img_dim, lanczos=True):
37
+ """simple transformation/pre-processing of image data."""
38
+
39
+ if lanczos:
40
+ resample_method = Image.LANCZOS
41
+ else:
42
+ resample_method = Image.BILINEAR
43
+
44
+ normalize = transforms.Normalize(mean=image_net_mean, std=image_net_std)
45
+ img_transforms = dict()
46
+ img_transforms['train'] = transforms.Compose([transforms.Resize((img_dim, img_dim), resample_method),
47
+ transforms.ToTensor(),
48
+ normalize])
49
+
50
+ # Use same transformations as in train (since no data-augmentation is applied in train)
51
+ img_transforms['test'] = img_transforms['train']
52
+ img_transforms['val'] = img_transforms['train']
53
+ img_transforms['rest'] = img_transforms['train']
54
+ return img_transforms
55
+
56
+
57
+ def df_to_pytorch_dataset(df, args):
58
+ if args.num_workers == -1:
59
+ n_workers = max_io_workers()
60
+ else:
61
+ n_workers = args.num_workers
62
+
63
+ load_imgs = True
64
+ if hasattr(args, 'use_imgs') and not args.use_imgs: # build a dataset without images (e.g., text/emotion only)
65
+ load_imgs = False
66
+
67
+ one_hot_emo = True
68
+ if hasattr(args, 'one_hot_emo') and not args.one_hot_emo: # turn off the one-hot, keep the integer (e.g., when using xentropy)
69
+ one_hot_emo = False
70
+
71
+ img_transforms = None
72
+ if load_imgs:
73
+ img_transforms = image_transformation(args.img_dim, lanczos=args.lanczos)
74
+
75
+ if args.dataset == 'artemis':
76
+ datasets = pass_artemis_splits_to_datasets(df, load_imgs, img_transforms, args.img_dir, one_hot_emo=one_hot_emo)
77
+ elif args.dataset == 'ola': # Objective Language for Art.
78
+ datasets = pass_artemis_splits_to_datasets(df, load_imgs, img_transforms, args.img_dir, n_emotions=0)
79
+ elif args.dataset == 'coco':
80
+ datasets = pass_coco_splits_to_datasets(df, load_imgs, img_transforms)
81
+ else:
82
+ raise ValueError('training dataset not recognized.')
83
+
84
+ dataloaders = dict()
85
+ for split in datasets:
86
+ b_size = args.batch_size if split=='train' else args.batch_size * 2
87
+ dataloaders[split] = torch.utils.data.DataLoader(dataset=datasets[split],
88
+ batch_size=b_size,
89
+ shuffle=split=='train',
90
+ num_workers=n_workers)
91
+ return dataloaders, datasets
92
+
93
+
94
+ def pass_coco_splits_to_datasets(df, load_imgs, img_transforms, n_emotions=0):
95
+ datasets = dict()
96
+ for split, g in df.groupby('split'):
97
+ g.reset_index(inplace=True, drop=True) # so that direct ([]) indexing in get_item works
98
+ img_files = None
99
+ img_trans = None
100
+
101
+ if load_imgs:
102
+ img_files = g['image_files']
103
+ img_trans = img_transforms[split]
104
+
105
+ dataset = AffectiveCaptionDataset(img_files, g.tokens_encoded, g.emotion_label, img_transform=img_trans,
106
+ n_emotions=n_emotions)
107
+ datasets[split] = dataset
108
+ return datasets
109
+
110
+
111
+ def pass_artemis_splits_to_datasets(df, load_imgs, img_transforms, top_img_dir, n_emotions=9, one_hot_emo=True):
112
+ datasets = dict()
113
+ for split, g in df.groupby('split'):
114
+ g.reset_index(inplace=True, drop=True) # so that direct ([]) indexing in get_item works
115
+ img_files = None
116
+ img_trans = None
117
+
118
+ if load_imgs:
119
+ img_files = g.apply(lambda x : osp.join(top_img_dir, x.art_style, x.painting + '.jpg'), axis=1)
120
+ img_files.name = 'image_files'
121
+ img_trans = img_transforms[split]
122
+
123
+ dataset = AffectiveCaptionDataset(img_files, g.tokens_encoded, g.emotion_label, n_emotions=n_emotions,
124
+ img_transform=img_trans, one_hot_emo=one_hot_emo)
125
+
126
+ datasets[split] = dataset
127
+ return datasets
128
+
129
+
130
+ def read_preprocessed_data_df(args, verbose=False):
131
+ if args.dataset == 'artemis':
132
+ file_name = 'artemis_preprocessed.csv'
133
+ elif args.dataset == 'coco':
134
+ file_name = 'coco_preprocessed.csv'
135
+ else:
136
+ raise ValueError('Unknown Dataset.')
137
+
138
+ if hasattr(args, 'fine_tune_data') and args.fine_tune_data:
139
+ df = pd.read_csv(args.fine_tune_data) # allow explicit data passing
140
+ else:
141
+ df = pd.read_csv(osp.join(args.data_dir, file_name))
142
+
143
+ df.tokens_encoded = df.tokens_encoded.apply(literal_eval)
144
+
145
+ if verbose:
146
+ print('Loaded {} utterances'.format(len(df)))
147
+ return df
148
+
149
+
150
+ def image_emotion_distribution_df_to_pytorch_dataset(df, args, drop_thres=None):
151
+ """ Convert the pandas dataframe that carries information about images and emotion (distributions) to a
152
+ dataset that is amenable to deep-learning (e.g., for an image2emotion classifier).
153
+ :param df:
154
+ :param args:
155
+ :param drop_thres: (optional, float) if provided each distribution of the training will only consist of examples
156
+ for which the maximizing emotion aggregates more than this (drop_thres) mass.
157
+ :return: pytorch dataloaders & datasets
158
+ """
159
+ dataloaders = dict()
160
+ datasets = dict()
161
+ img_transforms = image_transformation(args.img_dim, lanczos=args.lanczos)
162
+
163
+ if args.num_workers == -1:
164
+ n_workers = max_io_workers()
165
+ else:
166
+ n_workers = args.num_workers
167
+
168
+ for split, g in df.groupby('split'):
169
+ g.reset_index(inplace=True, drop=True)
170
+
171
+ if split == 'train' and drop_thres is not None:
172
+ noise_mask = g['emotion_distribution'].apply(lambda x: max(x) > drop_thres)
173
+ print('Keeping {} of the training data, since for the rest the emotion-maximizer is too low.'.format(noise_mask.mean()))
174
+ g = g[noise_mask]
175
+ g.reset_index(inplace=True, drop=True)
176
+
177
+
178
+ img_files = g.apply(lambda x : osp.join(args.img_dir, x.art_style, x.painting + '.jpg'), axis=1)
179
+ img_files.name = 'image_files'
180
+
181
+ dataset = ImageClassificationDataset(img_files, g.emotion_distribution,
182
+ img_transform=img_transforms[split])
183
+
184
+ datasets[split] = dataset
185
+ b_size = args.batch_size if split=='train' else args.batch_size * 2
186
+ dataloaders[split] = torch.utils.data.DataLoader(dataset=dataset,
187
+ batch_size=b_size,
188
+ shuffle=split=='train',
189
+ num_workers=n_workers)
190
+ return dataloaders, datasets
191
+
192
+
193
+ def seed_torch_code(seed, strict=False):
194
+ """Control pseudo-randomness for reproducibility.
195
+ :param seed: (int) random-seed
196
+ :param strict: (boolean) if True, cudnn operates in a deterministic manner
197
+ """
198
+ random.seed(seed)
199
+ np.random.seed(seed)
200
+ torch.manual_seed(seed)
201
+ torch.cuda.manual_seed_all(seed)
202
+ if strict:
203
+ torch.backends.cudnn.deterministic = True
204
+ torch.backends.cudnn.benchmark = False
205
+
206
+
207
+ def save_state_dicts(checkpoint_file, epoch=None, **kwargs):
208
+ """ Save torch items with a state_dict
209
+ """
210
+ checkpoint = dict()
211
+
212
+ if epoch is not None:
213
+ checkpoint['epoch'] = epoch
214
+
215
+ for key, value in kwargs.items():
216
+ checkpoint[key] = value.state_dict()
217
+
218
+ torch.save(checkpoint, checkpoint_file)
219
+
220
+
221
+ def load_state_dicts(checkpoint_file, map_location=None, **kwargs):
222
+ """ Load torch items from saved state_dictionaries
223
+ """
224
+ if map_location is None:
225
+ checkpoint = torch.load(checkpoint_file)
226
+ else:
227
+ checkpoint = torch.load(checkpoint_file, map_location=map_location)
228
+
229
+ for key, value in kwargs.items():
230
+ value.load_state_dict(checkpoint[key])
231
+
232
+ epoch = checkpoint.get('epoch')
233
+ if epoch:
234
+ return epoch
235
+
236
+
237
+ def torch_save_model(model, path):
238
+ """ Wrap torch.save to catch standard warning of not finding the nested implementations.
239
+ :param model:
240
+ :param path:
241
+ :return:
242
+ """
243
+ with warnings.catch_warnings():
244
+ warnings.simplefilter("ignore")
245
+ return torch.save(model, path)
246
+
247
+
248
+ def torch_load_model(checkpoint_file, map_location=None):
249
+ """ Wrap torch.load to catch standard warning of not finding the nested implementations.
250
+ :param checkpoint_file:
251
+ :param map_location:
252
+ :return:
253
+ """
254
+ with warnings.catch_warnings():
255
+ warnings.simplefilter("ignore")
256
+ model = torch.load(checkpoint_file, map_location=map_location)
257
+ return model
258
+
259
+
260
+ def load_saved_speaker(args_file, model_ckp, with_data=False, override_args=None, verbose=False):
261
+ """
262
+ :param args_file: saved argparse arguments with model's description (and location of used data)
263
+ :param model_ckp: saved checkpoint with model's parameters.
264
+ :param with_data:
265
+ :param override_args:
266
+ :return:
267
+ Note, the model is loaded and returned in cpu.
268
+ """
269
+ if verbose:
270
+ print('Loading saved speaker trained with parameters:')
271
+ args = read_saved_args(args_file, override_args=override_args, verbose=verbose)
272
+
273
+ # Prepare empty model
274
+ vocab = Vocabulary.load(osp.join(args.data_dir, 'vocabulary.pkl'))
275
+ print('Using a vocabulary of size', len(vocab))
276
+ model = describe_sat(vocab, args)
277
+
278
+ # Load saved weights
279
+ epoch = load_state_dicts(model_ckp, model=model, map_location='cpu')
280
+ print('Loading speaker model at epoch {}.'.format(epoch))
281
+
282
+ # Load data
283
+ if with_data:
284
+ df = read_preprocessed_data_df(args, verbose=True)
285
+ data_loaders, _ = df_to_pytorch_dataset(df, args)
286
+ else:
287
+ data_loaders = None
288
+
289
+ return model, epoch, data_loaders
290
+
291
+
292
+ def deprocess_img(img, std=None, mean=None, clamp=None, inplace=False):
293
+ if not inplace:
294
+ img = img.clone()
295
+
296
+ if img.ndimension() == 4: # batch of images
297
+ pass
298
+ # single_img = False
299
+ elif img.ndimension() == 3: # single image
300
+ img = img.view([1] + list(img.shape))
301
+ # single_img = True
302
+ else:
303
+ raise ValueError()
304
+
305
+ dtype = img.dtype
306
+ n_channels = img.size(1)
307
+
308
+ if std is not None:
309
+ std = torch.as_tensor(std, dtype=dtype, device=img.device)
310
+ img.mul_(std.view([1, n_channels, 1, 1]))
311
+
312
+ if mean is not None:
313
+ mean = torch.as_tensor(mean, dtype=dtype, device=img.device)
314
+ img.add_(mean.view([1, n_channels, 1, 1]))
315
+
316
+ if clamp is not None:
317
+ img.clamp_(clamp[0], clamp[1])
318
+
319
+ return img
320
+
321
+
322
+ def to_img(tensor, mean=None, std=None):
323
+ """ Convert tensor object to PIL.Image(s)
324
+ :param tensor:
325
+ :param mean:
326
+ :param std:
327
+ :return:
328
+ """
329
+ image = tensor.clone().detach()
330
+ image = deprocess_img(image, mean=mean, std=std)
331
+ # Add 0.5 after un-normalizing to [0, 255] to round to nearest integer
332
+ array = image.mul_(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to('cpu', torch.uint8).numpy()
333
+ image = []
334
+ for im in array:
335
+ image.append(Image.fromarray(im))
336
+ return image
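An illustrative checkpoint round-trip with save_state_dicts/load_state_dicts (a sketch, not part of the file above), using a toy torch model:

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters())
save_state_dicts('/tmp/toy_ckpt.pt', epoch=3, model=model, optimizer=optimizer)

# later: restore into freshly built objects of the same shapes
model_2 = nn.Linear(4, 2)
optimizer_2 = torch.optim.Adam(model_2.parameters())
resumed_epoch = load_state_dicts('/tmp/toy_ckpt.pt', map_location='cpu', model=model_2, optimizer=optimizer_2)
assert resumed_epoch == 3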
imageprocessing/artemis/artemis/language/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
4
+ """
imageprocessing/artemis/artemis/language/adjective_noun_pairs.py ADDED
@@ -0,0 +1,44 @@
1
+ """
2
+ Some operations to handle Adjective-Noun Pairs. E.g., useful for sentiment injection
3
+
4
+ The MIT License (MIT)
5
+ Originally created mid 2020, for Python 3.x
6
+ Copyright (c) 2020 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ from collections import Counter
10
+ from .part_of_speech import nltk_parallel_tagging_of_tokens
11
+
12
+ def collect_anps_of_sentence(tokenized_pos_tagged_sentence, tagset='universal'):
13
+ """ return all ANPs that occur in consecutive positions.
14
+ tokenized_pos_tagged_sentence: list, containing the result of calling nltk.pos_tag on a tokenized sentence.
15
+ E.g., [('a', 'DT'), ('big', 'JJ'), ('man', 'NN')]
16
+ """
17
+ n_tokens = len(tokenized_pos_tagged_sentence)
18
+ collected = []
19
+
20
+ if tagset == 'universal':
21
+ for i, p in enumerate(tokenized_pos_tagged_sentence):
22
+ if p[1] == 'ADJ' and i < n_tokens -1:
23
+ if tokenized_pos_tagged_sentence[i+1][1] == 'NOUN':
24
+ collected.append(p[0] + ' ' + tokenized_pos_tagged_sentence[i+1][0])
25
+ elif tagset == 'penn':
26
+ for i, p in enumerate(tokenized_pos_tagged_sentence):
27
+ if p[1].startswith('J') and i < n_tokens -1:
28
+ if tokenized_pos_tagged_sentence[i+1][1].startswith('N'):
29
+ collected.append(p[0] + ' ' + tokenized_pos_tagged_sentence[i+1][0])
30
+ else:
31
+ raise ValueError()
32
+ return collected
33
+
34
+
35
+ def collect_anp_statistics_of_collection(token_series):
36
+ """ Compute ANP statistics, e.g., how frequent the ANP "happy man" is in the token_series.
37
+ :param token_series: pd.Series, each row is a tokenized sentence
38
+ :return:
39
+ """
40
+ part_of_s = nltk_parallel_tagging_of_tokens(token_series)
41
+ anps = part_of_s.apply(collect_anps_of_sentence)
42
+ anp_counter = Counter()
43
+ anps.apply(anp_counter.update)
44
+ return anp_counter, anps, part_of_s
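A small illustration (a sketch, not part of the file above) of the ANP extraction on an already POS-tagged sentence:

tagged = [('a', 'DET'), ('very', 'ADV'), ('happy', 'ADJ'), ('man', 'NOUN'), ('smiles', 'VERB')]
print(collect_anps_of_sentence(tagged, tagset='universal'))   # -> ['happy man']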
imageprocessing/artemis/artemis/language/basics.py ADDED
@@ -0,0 +1,132 @@
1
+ """
2
+ A set of functions that are useful for processing textual data.
3
+
4
+ The MIT License (MIT)
5
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
6
+ """
7
+
8
+ import pandas as pd
9
+ import multiprocessing as mp
10
+ from multiprocessing import Pool
11
+ from collections import defaultdict
12
+ from itertools import tee, islice
13
+ from symspellpy.symspellpy import SymSpell
14
+
15
+ from .language_preprocessing import unquote_words, expand_contractions
16
+ from .language_preprocessing import manual_sentence_spelling, manual_tokenized_sentence_spelling
17
+ from ..language.spelling import sentence_spelling_dictionary as artemis_sentence_spelling_dictionary
18
+ from ..language.spelling import token_spelling_dictionary as artemis_token_spelling_dictionary
19
+ from ..language.spelling import missing_from_glove_but_are_actual_words
20
+ from ..neural_models.word_embeddings import load_glove_pretrained_embedding
21
+
22
+
23
+
24
+ def ngrams(lst, n):
25
+ """ Return the ngrams of a list of tokens.
26
+ :param lst: the tokens
27
+ :param n: n of n-grams
28
+ :return:
29
+ """
30
+ tlst = lst
31
+ while True:
32
+ a, b = tee(tlst)
33
+ l = tuple(islice(a, n))
34
+ if len(l) == n:
35
+ yield l
36
+ next(b)
37
+ tlst = b
38
+ else:
39
+ break
40
+
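Illustrative only (not part of the file above), the bigrams produced by the generator defined above:

tokens = ['a', 'sad', 'lonely', 'figure']
print(list(ngrams(tokens, 2)))   # -> [('a', 'sad'), ('sad', 'lonely'), ('lonely', 'figure')]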
41
+
42
+ def parallel_apply(iterable, func, n_processes=None):
43
+ """ Apply func in parallel to chunks of the iterable based on multiple processes.
44
+ :param iterable:
45
+ :param func: simple function that does not change the state of global variables.
46
+ :param n_processes: (int) how many processes to split the data over
47
+ :return:
48
+ """
49
+ n_items = len(iterable)
50
+ if n_processes is None:
51
+ n_processes = min(4 * mp.cpu_count(), n_items)
52
+ pool = Pool(n_processes)
53
+ chunks = int(n_items / n_processes)
54
+ res = []
55
+ for data in pool.imap(func, iterable, chunksize=chunks):
56
+ res.append(data)
57
+ pool.close()
58
+ pool.join()
59
+ return res
60
+
61
+
62
+ def tokenize_and_spell(df, glove_file, freq_file, tokenizer, parallel=True, inplace=True, spell_check=True):
63
+ speller = SymSpell()
64
+ loaded = speller.load_dictionary(freq_file, term_index=0, count_index=1)
65
+ print('SymSpell spell-checker loaded:', loaded)
66
+ golden_vocabulary = load_glove_pretrained_embedding(glove_file, only_words=True, verbose=True)
67
+ golden_vocabulary = golden_vocabulary.union(missing_from_glove_but_are_actual_words)
68
+ print('Updating Glove vocabulary with *valid* ArtEmis words that are missing from it.')
69
+ missed_tokens = defaultdict(list)
70
+
71
+ def automatic_token_speller(token_list, max_edit_distance=1):
72
+ new_tokens = []
73
+ for token in token_list:
74
+ if token in golden_vocabulary:
75
+ new_tokens.append(token) # no spell check
76
+ else:
77
+ spells = speller.lookup(token, max_edit_distance)
78
+ if len(spells) > 0: # found a spelled checked version
79
+ new_tokens.append(spells[0].term)
80
+ else: # spell checking failed
81
+ context = " ".join(token_list)
82
+ missed_tokens[token].append(context)
83
+ new_tokens.append(token)
84
+ return new_tokens
85
+
86
+ if not spell_check:
87
+ automatic_token_speller = None
88
+
89
+ clean_text, tokens, spelled_tokens = pre_process_text(df.utterance,
90
+ artemis_sentence_spelling_dictionary,
91
+ artemis_token_spelling_dictionary,
92
+ tokenizer,
93
+ token_speller=automatic_token_speller,
94
+ parallel=parallel)
95
+
96
+ if inplace:
97
+ df['tokens'] = spelled_tokens
98
+ df['tokens_len'] = df.tokens.apply(lambda x : len(x))
99
+ df['utterance_spelled'] = df.tokens.apply(lambda x : ' '.join(x))
100
+ return missed_tokens
101
+ else:
102
+ return missed_tokens, spelled_tokens
103
+
104
+
105
+ def pre_process_text(text, manual_sentence_speller, manual_token_speller,
106
+ tokenizer, token_speller=None, parallel=True):
107
+
108
+ clean_text = text.apply(lambda x: manual_sentence_spelling(x, manual_sentence_speller)) # sentence-to-sentence map
109
+ clean_text = clean_text.apply(lambda x: x.lower())
110
+ clean_text = clean_text.apply(unquote_words)
111
+
112
+ if parallel:
113
+ clean_text = pd.Series(parallel_apply(clean_text, expand_contractions))
114
+ else:
115
+ clean_text = clean_text.apply(expand_contractions)
116
+
117
+ basic_punct = '.?!,:;/\-~*_=[–]{}$^@|%#<—>'
118
+ punct_to_space = str.maketrans(basic_punct, ' ' * len(basic_punct)) # map punctuation to space
119
+ clean_text = clean_text.apply(lambda x: x.translate(punct_to_space))
120
+
121
+ if parallel:
122
+ tokens = pd.Series(parallel_apply(clean_text, tokenizer))
123
+ else:
124
+ tokens = clean_text.apply(tokenizer)
125
+
126
+ spelled_tokens = tokens.apply(lambda x: manual_tokenized_sentence_spelling(x,
127
+ spelling_dictionary=manual_token_speller)
128
+ )
129
+ if token_speller is not None:
130
+ spelled_tokens = spelled_tokens.apply(token_speller)
131
+
132
+ return clean_text, tokens, spelled_tokens
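To make the intended entry point concrete, here is a minimal usage sketch of tokenize_and_spell. It assumes a pandas DataFrame with an 'utterance' column, NLTK's word_tokenize as the tokenizer (any callable mapping a string to a token list works), and local copies of a GloVe embedding file and the bundled SymSpell frequency dictionary; the paths are illustrative. parallel=False keeps the example free of multiprocessing concerns.

    import pandas as pd
    from nltk.tokenize import word_tokenize

    glove_file = 'glove.6B.100d.txt'  # illustrative path
    freq_file = 'artemis/data/symspell_frequency_dictionary_en_82_765.txt'  # illustrative path

    df = pd.DataFrame({'utterance': ['The colours make me feel calm.',
                                     'thisbpainting looks unfinished']})
    missed = tokenize_and_spell(df, glove_file, freq_file, word_tokenize, parallel=False)

    # df now carries 'tokens', 'tokens_len' and 'utterance_spelled' columns.
    print(df[['tokens_len', 'utterance_spelled']])
    print('tokens with no spelling suggestion:', dict(missed))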
imageprocessing/artemis/artemis/language/language_preprocessing.py ADDED
@@ -0,0 +1,224 @@
1
+ """
2
+ A set of functions useful for pre-processing textual data: normalizing words, spelling correction, etc.
3
+
4
+ The MIT License (MIT)
5
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
6
+ """
7
+
8
+ import re
9
+
10
+ contractions_dict = {
11
+ "ain't": "am not",
12
+ "aren't": "are not",
13
+ "can't": "cannot",
14
+ "can't've": "cannot have",
15
+ "'cause": "because",
16
+ "could've": "could have",
17
+ "couldn't": "could not",
18
+ "couldn't've": "could not have",
19
+ "didn't": "did not",
20
+ "doesn't": "does not",
21
+ "don't": "do not",
22
+ "hadn't": "had not",
23
+ "hadn't've": "had not have",
24
+ "hasn't": "has not",
25
+ "haven't": "have not",
26
+ "he'd": "he had",
27
+ "he'd've": "he would have",
28
+ "he'll": "he will",
29
+ "he'll've": "he will have",
30
+ "he's": "he is",
31
+ "how'd": "how did",
32
+ "how'd'y": "how do you",
33
+ "how'll": "how will",
34
+ "how's": "how is",
35
+ "i'd": "I had",
36
+ "i'd've": "I would have",
37
+ "i'll": "I will",
38
+ "i'll've": "I will have",
39
+ "i'm": "I am",
40
+ "i've": "I have",
41
+ "isn't": "is not",
42
+ "it'd": "it had",
43
+ "it'd've": "it would have",
44
+ "it'll": "it will",
45
+ "it'll've": "iit will have",
46
+ "it's": "it is",
47
+ "let's": "let us",
48
+ "ma'am": "madam",
49
+ "mayn't": "may not",
50
+ "might've": "might have",
51
+ "mightn't": "might not",
52
+ "mightn't've": "might not have",
53
+ "must've": "must have",
54
+ "mustn't": "must not",
55
+ "mustn't've": "must not have",
56
+ "needn't": "need not",
57
+ "needn't've": "need not have",
58
+ "o'clock": "of the clock",
59
+ "oughtn't": "ought not",
60
+ "oughtn't've": "ought not have",
61
+ "shan't": "shall not",
62
+ "sha'n't": "shall not",
63
+ "shan't've": "shall not have",
64
+ "she'd": "she had",
65
+ "she'd've": "she would have",
66
+ "she'll": "she will",
67
+ "she'll've": "she will have",
68
+ "she's": "she is",
69
+ "should've": "should have",
70
+ "shouldn't": "should not",
71
+ "shouldn't've": "should not have",
72
+ "so've": "so have",
73
+ "so's": "so is",
74
+ "that'd": "that had",
75
+ "that'd've": "that would have",
76
+ "that's": "that is",
77
+ "there'd": "there had",
78
+ "there'd've": "there would have",
79
+ "there's": "there is",
80
+ "they'd": "they had",
81
+ "they'd've": "they would have",
82
+ "they'll": "they will",
83
+ "they'll've": "they will have",
84
+ "they're": "they are",
85
+ "they've": "they have",
86
+ "to've": "to have",
87
+ "wasn't": "was not",
88
+ "we'd": "we had",
89
+ "we'd've": "we would have",
90
+ "we'll": "we will",
91
+ "we'll've": "we will have",
92
+ "we're": "we are",
93
+ "we've": "we have",
94
+ "weren't": "were not",
95
+ "what'll": "what will",
96
+ "what'll've": "what will have",
97
+ "what're": "what are",
98
+ "what's": "what is",
99
+ "what've": "what have",
100
+ "when's": "when is",
101
+ "when've": "when have",
102
+ "where'd": "where did",
103
+ "where's": "where is",
104
+ "where've": "where have",
105
+ "who'll": "who will",
106
+ "who'll've": "who will have",
107
+ "who's": "who is",
108
+ "who've": "who have",
109
+ "why's": "why is",
110
+ "why've": "why have",
111
+ "will've": "will have",
112
+ "won't": "will not",
113
+ "won't've": "will not have",
114
+ "would've": "would have",
115
+ "wouldn't": "would not",
116
+ "wouldn't've": "would not have",
117
+ "y'all": "you all",
118
+ "y'all'd": "you all would",
119
+ "y'all'd've": "you all would have",
120
+ "y'all're": "you all are",
121
+ "y'all've": "you all have",
122
+ "you'd": "you had",
123
+ "you'd've": "you would have",
124
+ "you'll": "you will",
125
+ "you'll've": "you will have",
126
+ "you're": "you are",
127
+ "you've": "you have",
128
+ "do'nt": "do not",
129
+ "does\'nt": "does not"
130
+ }
131
+
132
+ CONTRACTION_RE = re.compile('({})'.format('|'.join(contractions_dict.keys())),
133
+ flags=re.IGNORECASE | re.DOTALL)
134
+
135
+
136
+ def expand_contractions(text, contractions=None, lower_i=True):
137
+ """ Expand the contractions of the text (if any).
138
+ Example: You're a good father. -> you are a good father.
139
+ :param text: (string)
140
+ :param contractions: (dict)
141
+ :param lower_i: boolean, if True (I'm -> 'i am' not 'I am')
142
+ :return: (string)
143
+
144
+ Note:
145
+ Side-effect: lower-casing. E.g., You're -> you are.
146
+ """
147
+ if contractions is None:
148
+ contractions = contractions_dict  # use the dictionary defined in this module
149
+
150
+ def expand_match(contraction):
151
+ match = contraction.group(0)
152
+ expanded_contraction = contractions.get(match)
153
+ if expanded_contraction is None:
154
+ expanded_contraction = contractions.get(match.lower())
155
+ if lower_i:
156
+ expanded_contraction = expanded_contraction.lower()
157
+ return expanded_contraction
158
+
159
+ expanded_text = CONTRACTION_RE.sub(expand_match, text)
160
+ return expanded_text
161
+
162
+
163
+ QUOTES_RE_STR = r"""(?:['|"][\w]+['|"])""" # Words wrapped in single or double quotes.
164
+ QUOTES_RE = re.compile(r"(%s)" % QUOTES_RE_STR, flags=re.VERBOSE | re.IGNORECASE | re.UNICODE)
165
+
166
+
167
+ def unquote_words(s):
168
+ """ 'king' - > king, "queen" -> queen """
169
+ iterator = QUOTES_RE.finditer(s)
170
+ new_sentence = list(s)
171
+ for match in iterator:
172
+ start, end = match.span()
173
+ new_sentence[start] = ' '
174
+ new_sentence[end-1] = ' '
175
+ new_sentence = "".join(new_sentence)
176
+ return new_sentence
177
+
178
+
179
+ def manual_sentence_spelling(x, spelling_dictionary):
180
+ """
181
+ Applies a spelling correction to an entire sentence, if x is a key of the spelling_dictionary.
182
+ :param x: (string) sentence to potentially be corrected
183
+ :param spelling_dictionary: correction map
184
+ :return: the sentence corrected
185
+ """
186
+ if x in spelling_dictionary:
187
+ return spelling_dictionary[x]
188
+ else:
189
+ return x
190
+
191
+
192
+ def manual_tokenized_sentence_spelling(tokens, spelling_dictionary):
193
+ """
194
+ :param tokens: (list of tokens) to potentially be corrected
195
+ :param spelling_dictionary: correction map
196
+ :return: a list of corrected tokens
197
+ """
198
+ new_tokens = []
199
+ for token in tokens:
200
+ if token in spelling_dictionary:
201
+ res = spelling_dictionary[token]
202
+ if type(res) == list:
203
+ new_tokens.extend(res)
204
+ else:
205
+ new_tokens.append(res)
206
+ else:
207
+ new_tokens.append(token)
208
+ return new_tokens
209
+
210
+
211
+ # noinspection PyInterpreter
212
+ if __name__ == "__main__":
213
+ import pandas as pd
214
+ text = pd.DataFrame({'data': ["I'm a 'good' MAN", "You can't be likee this."]})
215
+ print("Original Text:")
216
+ print(text.data)
217
+
218
+ manual_speller = {'You can\'t be likee this.': 'You can\'t be like this.'}
219
+ text.data = text.data.apply(lambda x: manual_sentence_spelling(x, manual_speller))
220
+ text.data = text.data.apply(lambda x: x.lower())
221
+ text.data = text.data.apply(unquote_words)
222
+ text.data = text.data.apply(expand_contractions)
223
+ print("Corrected Text:")
224
+ print(text.data)
imageprocessing/artemis/artemis/language/part_of_speech.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ Fast part-of-speech tagging via two libraries (NLTK and spaCy).
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x - last updated in early 2021.
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+ import dask.dataframe as dd
9
+ import multiprocessing as mp
10
+ from nltk.tag import pos_tag
11
+
12
+ try:
13
+ import spacy
14
+ except ImportError:  # spaCy is optional; only needed by spacy_pos_tagging
15
+ pass
16
+
17
+
18
+ def nltk_parallel_tagging_of_tokens(tokens, n_partitions=None, tagset='universal'):
19
+ """ pos-tagging
20
+ :param tokens: pd.Series with tokenized utterances as rows. e.g., [['a', 'man'], ['a', 'big', 'man'], ...]
21
+ :return: a pd.Series with the result of applying pos_tag in each row. e.g.,
22
+ [[('a', 'DET'), ('man', 'NOUN')], [('a', 'DET'), ('big', 'ADJ'), ('man', 'NOUN')]] (with the default 'universal' tagset)
23
+ """
24
+ if n_partitions is None:
25
+ n_partitions = mp.cpu_count() * 4
26
+ ddata = dd.from_pandas(tokens, npartitions=n_partitions)
27
+ tagged_tokens =\
28
+ ddata.map_partitions(lambda x: x.apply((lambda y: pos_tag(y, tagset=tagset)))).compute(scheduler='processes')
29
+
30
+ return tagged_tokens
31
+
32
+
33
+ def spacy_pos_tagging(utterances, nlp=None):
34
+ if nlp is None:
35
+ nlp = spacy.load('en_core_web_sm')
36
+
37
+ utters = utterances.astype('unicode').values
38
+ docs = nlp.pipe(utters, batch_size=1000, n_threads=-1)
39
+ pos = [[t.pos_ for t in d if not t.is_space] for d in docs]
40
+ return pos
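A quick usage sketch for the NLTK-based tagger above (it assumes the NLTK 'averaged_perceptron_tagger' and 'universal_tagset' resources have been downloaded):

    import pandas as pd

    tokens = pd.Series([['a', 'man'], ['a', 'big', 'man']])
    tagged = nltk_parallel_tagging_of_tokens(tokens, n_partitions=2)
    print(tagged.tolist())
    # e.g., [[('a', 'DET'), ('man', 'NOUN')], [('a', 'DET'), ('big', 'ADJ'), ('man', 'NOUN')]]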
imageprocessing/artemis/artemis/language/spelling.py ADDED
@@ -0,0 +1,634 @@
1
+ """
2
+ Auxiliary spelling utilities.
3
+
4
+ It's called [may-rah-kee]: https://travelwithmeraki.com/meaning-of-meraki/
5
+
6
+ The MIT License (MIT)
7
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab.
8
+ """
9
+
10
+
11
+ ##
12
+ ## Dictionary, mapping entire "raw" collected sentences to new sentences. These were emailed separately from the AMT submissions
13
+ ## by meticulous (and good!) annotators who wanted to correct their original submissions.
14
+ ##
15
+ sentence_spelling_dictionary = {
16
+ 'thewaytheshapeschangethespace': 'the way the shapes change the space',
17
+ 'brightcolorsandanimals': 'bright colors and animals',
18
+ 'calmweatherandpeopleworkingtogether': 'calm weather and people working together',
19
+ 'thesoftcolors': 'the soft colors',
20
+ 'itlookslikeapeacefullocation': 'it looks like a peaceful location',
21
+ 'Iliketoplayinthesnow': 'I like to play in the snow',
22
+ 'thecalmwaters': 'the calm waters',
23
+ 'itseemsincompletesomehow': 'it seems incomplete somehow',
24
+ 'thefiguresseemsomehowawkward': 'the figures seem somehow awkward',
25
+ 'thegreenandlackofpeople': 'the green and lack of people',
26
+ 'boatingisanadventure': 'boating is an adventure',
27
+ 'thereseemstobetoomuchsky': 'there seems to be too much sky',
28
+ 'onthewholeitlookslikeawallpaperswatch': 'on the whole it looks like a wallpapers watch',
29
+ 'thecolorofthewater': 'the color of the water',
30
+ 'alloftheplantsandthewatershowingthroughthem': 'all of the plants and the water showing through them',
31
+ 'thecolorcombination': 'the color combination',
32
+ 'theslashesacrosstheworkandthedarkcolors': 'the slashes across the work and the dark colors',
33
+ 'thesimplicityandopenness': 'the simplicity and openness',
34
+ 'theproportionbetweenheadandbody': 'the proportion between head and body',
35
+ 'thebrightcolorsandthefeelingofmotion': 'the bright colors and the feeling of motion',
36
+ 'thedetailinthehands': 'the detail in the hands',
37
+ 'thelengthofthewoman\'sneck': 'the length of the woman\'s neck',
38
+ 'thedifferentflocksofbirdsinthesky': 'the different flocks of birds in the sky',
39
+ 'theskillshownindrawingthefigures': 'the skill shown in drawing the figures',
40
+ 'thenaturallookingskintone': 'the natural looking skin tone',
41
+ 'theyeadsthatdon\'tseemtobeattatchedtoanything': 'the yeads that don\'t seem to be attatched to anything',
42
+ 'themasksheisholding': 'the mask she is holding',
43
+ 'theapparantageofthepiece': 'the apparent age of the piece',
44
+ 'thepinkbows': 'the pink bows',
45
+ 'theintensityonthefaceofthemaninfrontofthewoman.': 'the intensity on the face of the man in front of the woman',
46
+ 'theshapesandcolors-itlookshard,painful': 'the shapes and colors - it looks hard, painful',
47
+ 'thewaytheguitarisbrokenupandmagnifiedbutstillidentifiable.': 'the way the guitar is broken up and magnified but still identifiable',
48
+ 'veryimpressedwiththewaytheartistcreatedlight': 'very impressed with the way the artist created light',
49
+ 'itlookslikepeoplearewaitingforsomeeventtohappenlikeaboatraceorsomething': 'it looks like people are waiting for some event to happen like a boat race or something',
50
+ 'wonderingifthisisapaintingoratextile': 'wondering if this is a painting or a textile',
51
+ 'itseemstoodarkfortheactivity': 'it seems too dark for the activity',
52
+ 'Ithinkitskindofweirdhowherhipsmakealmostacircle': 'I think its kind of weird how her hips make almost a circle',
53
+ 'thesoftnessofthefiguremakesitfeellikeiamintrudingonanintimatemoment': 'the softness of the figure makes it feel like i am intruding on an intimate moment',
54
+ 'Idon\'tseepieceslikethisas\'art\'itcouldbethewallinsomeone\'shouse': 'I don\'t see pieces like this as \'art\' it could be the wall in someone\'s house',
55
+ 'thereflectionofthetreesinthepool': 'the reflection of the trees in the pool',
56
+ 'thepainonhisface': 'the pain on his face',
57
+ 'itremindsmeofastringofbeadsasmallchildwouldmake': 'it reminds me of a string of beads a small child would make',
58
+ 'thefigurebeneaththetreeappearsveryrelaxed': 'the figure beneath the tree appears very relaxed',
59
+ 'thefacialexpressionisveryloving': 'the facial expression is very loving',
60
+ 'thecolorsandactivitymakeitlooklikeafunplacetobe': 'the colors and activity make it look like a fun place to be',
61
+ 'itlookslikeaniceareatogoforawalk': 'it looks like an ice area to go for a walk',
62
+ 'theyappeartobeayoungcoupleinlove': 'they appear to be a young couple in love',
63
+ 'allofthelittledetailsareamazing': 'all of the little details are amazing',
64
+ 'knowledgeofthehistoryassociatedwiththeperson': 'knowledge of the history associated with the person',
65
+ 'itlookslikeaveryoldpaintedtextile': 'it looks like a very old painted textile',
66
+ 'allofthebrightcolorsjustmakemehappy': 'all of the bright colors just make me happy',
67
+ 'dificultysortingoutwhetherthefigureismaleorfemale': 'dificulty sorting out whether the figure is male or female',
68
+ 'Itseemskindoflikeaposteryou\'dputinaclassroom.': 'It seems kind of like a poster you\'d put in a classroom',
69
+ 'Theexpressionontheman\'sfaceappearsangry.': 'The expression on the man\'s face appears angry.',
70
+ 'it\'d dark and creepy and weirdly sexual in a bad way.': 'it\'s dark and creepy and weirdly sexual in a bad way.',
71
+ 'looks disgusting looks like a cross desser disgusting': 'looks disgusting looks like a cross dresser disgusting',
72
+ 'Big skirts and bloomers on dancing ladies definitelymake the mood excitement.': 'big skirts and bloomers on dancing ladies definitely make an exciting mood',
73
+ 'The rays eminntating from sum over the town os exhilarating': 'the rays emanating from sun over the town are exhilarating',
74
+ 'the way the artist uses black and white really gives this a different feel tp the painting': 'the way the artist uses black and white really gives this a different feel to the painting',
75
+ 'This reminds me of jumbled graffiti I saw and had tp clean up every once in a while when I was younger': 'This reminds me of jumbled graffiti I saw and had to clean up every once in a while when I was younger',
76
+ 'looks like the cabins i stayed in on my trip tp a dude ranch': 'looks like the cabins i stayed in on my trip to a dude ranch',
77
+ 'A young woman applies make-up tp her face as she sits in a pretty robe. A pleasant but unfished work of art.': 'A young woman applies make-up to her face as she sits in a pretty robe. A pleasant but unfinished work of art.',
78
+ 'old and not kept up well, bug nice scene of a man at work': 'old and not kept up well, but nice scene of a man at work',
79
+ 'Again this painting is dill its lifeless and no color.': 'Again this painting is dull, it is lifeless and has no color.',
80
+ 'she smile smartly': 'she smiles smartly',
81
+ 'The mountain makes me think of a strong safehold, and a feeling of shelter.': 'The mountain makes me think of a stronghold giving me a feeling of shelter',
82
+ 'The detail in the surundsing like the clock tower and statue make it more inspiering': 'The detail in the surroundings like the clock tower and statue make it more inspiring',
83
+ 'The man is chained down and left to be attacked by a bird of prey while another man non chalantly watches what is taking place.': 'The man is chained down and left to be attacked by a bird of prey while another man nonchalantly watches what is taking place.',
84
+ 'This is a beautiful scene with the mosques in the background and the vegetation in the front.': 'This is a beautiful scene with the onion dome churches in the background and the vegetation in the front.',
85
+ 'The colors make me feel like I\'m looking at someone important. I feel a since of awe over them because of their attire.': 'The colors make me feel like I\'m looking at someone important. I feel a sense of awe over them because of their attire.',
86
+ 'the detective ihas found the diar everyone knew she kept and now hopefully he wi findout what led up to her breakdown ': 'the detective has found the diary everyone knew she kept and now hopefully he will find out what led up to her breakdown',
87
+ 'The VanGoghishness of this makes me smile and wonder how they do it.': 'The Van Gogh like quality of this makes me smile and wonder how they do it.',
88
+ 'The colors and shapes compliment each otherakes look like a adult childs painting.': 'The colors and shapes compliment each other and looks like an adult child\'s painting.',
89
+ 'I prefer more realistic still ifes.': 'I prefer more realistic still lifes.'
90
+ }
91
+
92
+
93
+
94
+
95
+ ##
96
+ ## Dictionary, mapping words to replacement words to densify the dataset (e.g., colour -> color), or more likely to spell
97
+ ## check them. Curated manually by Panos circa 2020.
98
+ ##
99
+ token_spelling_dictionary = {'colour': 'color',
100
+ 'colours': 'colors',
101
+ 'thecountry': ['the', 'country'],
102
+ 'minimamistic': 'minimalistic',
103
+ 'littlefinger': ['little', 'finger'],
104
+ 'im': ['i', 'am'],
105
+ 'greatfull': 'grateful',
106
+ 'skinnydippers': ['skinny', 'dippers'],
107
+ 'goingnon': ['going', 'on'],
108
+ 'rainclouds': ['rain', 'clouds'],
109
+ 'lillypads': ['lily', 'pads'],
110
+ 'paintinglike': ['painting', 'like'],
111
+ 'somekind': ['some', 'kind'],
112
+ 'overexaggerated': ['over', 'exaggerated'],
113
+ 'smokeshop': ['smoke', 'shop'],
114
+ 'fearinspiring': ['fear', 'inspiring'],
115
+ 'thebackround': ['the', 'background'],
116
+ 'raincloud': ['rain', 'cloud'],
117
+ 'wideopen': ['wide', 'open'],
118
+ 'crusifiction': 'crucifixion',
119
+ 'tablesetting': ['table', 'setting'],
120
+ 'vividcolors': ['vivid', 'colors'],
121
+ 'willhave': ['will', 'have'],
122
+ 'thisbpainting': ['this', 'painting'],
123
+ 'alongthis': ['along', 'this'],
124
+ 'crucifications': 'crucifixion',
125
+ 'overexaggeration': ['over', 'exaggeration'],
126
+ 'snacktime': ['snack', 'time'],
127
+ 'beaurocratic': 'bureaucratic',
128
+ 'nonsensicalness': ['nonsensical', 'ness'],
129
+ 'chubbyness': 'chubbiness',
130
+ 'distatestful': 'distasteful',
131
+ 'disapportioned': 'disproportionate',
132
+ 'becauseofthe': ['because', 'of', 'the'],
133
+ 'hahahahaha': 'haha',
134
+ 'hahahahaa': 'haha',
135
+ 'annoniminity': 'anonymity',
136
+ 'realisticand': ['realistic', 'and'],
137
+ 'feellike': ['feel', 'like'],
138
+ 'clostiphobic': 'claustrophobic',
139
+ 'thegolden': ['the', 'golden'],
140
+ 'minimalstic': 'minimalistic',
141
+ 'artdeco': ['art', 'deco'],
142
+ 'paddleboards': ['paddle', 'boards'],
143
+ 'fitbtogetherv': ['fit', 'together'],
144
+ 'doingthat': ['doing', 'that'],
145
+ 'stormclouds': ['storm', 'clouds'],
146
+ 'feelanxious': ['feel', 'anxious'],
147
+ 'withpeople':['with', 'people'],
148
+ 'nuditythen': ['nudity', 'then'],
149
+ 'whatbappears': ['what', 'appears'],
150
+ 'womenafter': ['women', 'after'],
151
+ 'funerallike': ['funeral', 'like'],
152
+ 'thebridge': ['the', 'bridge'],
153
+ 'focalpoint': ['focal', 'point'],
154
+ 'crussifiction': 'crucifixion',
155
+ 'extrocinary': 'extraordinary',
156
+ 'adrodgenous': 'androgynous',
157
+ 'whimsacle': 'whimsical',
158
+ 'nonabrasive': ['non', 'abrasive'],
159
+ 'alienlike': ['alien', 'like'],
160
+ 'intricitally': 'intricately',
161
+ 'straightlines': ['straight', 'lines'],
162
+ 'shouldnt': ['should', 'not'],
163
+ 'favortire': 'favorite',
164
+ 'downsyndrome': ['down', 'syndrome'],
165
+ 'silluete': 'silhouette',
166
+ 'provideenough': ['provide', 'enough'],
167
+ 'waterpainting':['water', 'painting'],
168
+ 'the19th': ['the', 'nineteenth'],
169
+ 'oldfashoned': ['old', 'fashioned'],
170
+ 'colorblocking': ['color', 'blocking'],
171
+ 'testiculates': 'gesticulates',
172
+ 'notknow': ['not', 'know'],
173
+ 'crucifixiction': 'crucifixion',
174
+ 'cruxifiction': 'crucifixion',
175
+ 'contementent': 'contentment',
176
+ 'underconstruction': ['under', 'construction'],
177
+ 'cartoonfrom': ['cartoon', 'from'],
178
+ 'downwardlooks': ['downward', 'looks'],
179
+ 'unrelateable': 'unrelatable',
180
+ 'ofvthose': ['of', 'those'],
181
+ 'rainbowlike': ['rainbow', 'like'],
182
+ 'thegesture': ['the', 'gesture'],
183
+ 'pencilwork': ['pencil', 'work'],
184
+ 'perfectlycovered': ['perfectly', 'covered'],
185
+ 'eitherway': ['either', 'way'],
186
+ 'andpeaceful': ['and', 'peaceful'],
187
+ 'cloudforms': ['cloud', 'forms'],
188
+ 'peoplejust': ['people', 'just'],
189
+ 'pyscadellic': 'psychedelic',
190
+ 'maybepreparing': ['maybe', 'preparing'],
191
+ 'thisbmakes': ['this', 'makes'],
192
+ 'thispainting': ['this', 'painting'],
193
+ 'combinationmakes': ['combination', 'makes'],
194
+ 'rightside': ['right', 'side'],
195
+ 'saysnothing': ['says', 'nothing'],
196
+ 'individualness': 'individualism',
197
+ 'verynostalgic': ['very', 'nostalgic'],
198
+ 'hyperrealistic': ['hyper', 'realistic'],
199
+ 'wimsicle': 'whimsical',
200
+ 'aweinspiring': ['awe', 'inspiring'],
201
+ 'resturarunt': 'restaurant',
202
+ 'cruxification': 'crucifixion',
203
+ 'mistiruous': 'mysterious',
204
+ 'streetlamp': ['street', 'lamp'],
205
+ 'sadnessand': ['sadness', 'and'],
206
+ 'republicancult': ['republican', 'cult'],
207
+ 'mogilianni': 'modigliani',
208
+ 'raphealite': 'raphaelite',
209
+ 'immeidtaley': 'immediately',
210
+ 'duckface': ['duck', 'face'],
211
+ 'kinglike': ['king', 'like'],
212
+ 'monaleesa': ['mona', 'lisa'],
213
+ 'antispication': 'anticipation',
214
+ 'womendid': ['women', 'did'],
215
+ 'jailcell': ['jail', 'cell'],
216
+ 'thispeicemakes': ['this', 'piece', 'makes'],
217
+ 'pceaful': 'peaceful',
218
+ 'showpeople': ['show', 'people'],
219
+ 'colorsand': ['colors', 'and'],
220
+ 'lovevthe': ['love', 'the'],
221
+ 'mewithoutyou': ['me', 'without', 'you'],
222
+ 'microexpression': ['micro', 'expression'],
223
+ 'doesnnt': ['does', 'not'],
224
+ 'airfilter': ['air', 'filter'],
225
+ 'appostols': 'apostles',
226
+ 'acrossthe': ['across', 'the'],
227
+ 'andaroused': ['and', 'aroused'],
228
+ 'bluecolor': ['blue', 'color'],
229
+ 'broadstrokes': ['broad', 'strokes'],
230
+ 'bullethole': ['bullet', 'hole'],
231
+ 'shadowlike': ['shadow', 'like'],
232
+ 'shepardplaying': ['shepard', 'playing'],
233
+ 'siporportioned': 'disproportionate',
234
+ 'skyremind': ['sky', 'remind'],
235
+ 'theblending': ['the', 'blending'],
236
+ 'thoughtfuland': ['thoughtful', 'and'],
237
+ 'yellowbrowns': ['yellow', 'browns'],
238
+ 'creeeppyyy': 'creepy',
239
+ 'crosslegged': ['cross', 'legged'],
240
+ 'cupshave': ['cups', 'have'],
241
+ 'dissapoinment': 'disappointment',
242
+ 'drinkbest': ['drink', 'best'],
243
+ 'dragonlike': ['dragon', 'like'],
244
+ 'dressform': ['dress', 'form'],
245
+ 'farmlife': ['farm', 'life'],
246
+ 'inbtween': ['in', 'between'],
247
+ 'averageparentproblems': ['average', 'parent', 'problems'],
248
+ 'aroundthe': ['around', 'the'],
249
+ 'anythingabout': ['anything', 'about'],
250
+ 'bootylicous': 'bootylicious',
251
+ 'andwhat': ['and', 'what'],
252
+ 'applestore': ['apple', 'store'],
253
+ 'archioligist': 'archaeologist',
254
+ 'archtypical': 'archetypal',
255
+ 'armorwear': ['armor', 'wear'],
256
+ 'assumingely': 'assumingly',
257
+ 'beachtown': ['beach', 'town'],
258
+ 'beenshot': ['been', 'shot'],
259
+ 'bluemountains': ['blue', 'mountains'],
260
+ 'boldcolors': ['bold', 'colors'],
261
+ 'buddawhateverhisname': ['buddha', 'whatever', 'his', 'name'],
262
+ 'buttcrack': ['butt', 'crack'],
263
+ 'candytown': 'candytown',
264
+ 'colorsare': ['colors', 'are'],
265
+ 'colorscale': ['color', 'scale'],
266
+ 'cominginto': ['coming', 'into'],
267
+ 'commonfolk': ['common', 'folk'],
268
+ 'cottonballs': ['cotton', 'balls'],
269
+ 'excuuuuse': 'excuse',
270
+ 'eyesockets': ['eye', 'sockets'],
271
+ 'facelooking': ['face', 'looking'],
272
+ 'fromthis': ['from', 'this'],
273
+ 'pokerface': ['poker', 'face'],
274
+ 'thefountain': ['the', 'fountain'],
275
+ 'thinkpeople': ['think', 'people'],
276
+ 'uncomfomfortable': 'uncomfortable',
277
+ 'upsidedown': ['upside', 'down'],
278
+ 'vangough': ['van', 'gogh'],
279
+ 'vangogh': ['van', 'gogh'],
280
+ 'yaaaay': 'yay',
281
+ 'uhhhh': 'uhh',
282
+ 'thedark': ['the', 'dark'],
283
+ 'tallships': ['tall', 'ships'],
284
+ 'stilllife': ['still', 'life'],
285
+ 'stillframe': ['still', 'frame'],
286
+ 'mmmmmm': 'mmm',
287
+ 'marvelone': ['marvel', 'one'],
288
+ 'lookhomeless': ['look', 'homeless'],
289
+ 'likealot': ['like', 'a', 'lot'],
290
+ 'interestesting': 'interesting',
291
+ 'intriuiging': 'intriguing',
292
+ 'icecreams': ['ice', 'creams'],
293
+ 'awwwww': 'aww',
294
+ 'slavemaster': ['slave', 'master'],
295
+ 'pictureshould': ['picture', 'should'],
296
+ 'onhisface': ['on', 'his', 'face'],
297
+ 'likethis': ['like', 'this'],
298
+ 'inkwork': ['ink', 'work'],
299
+ 'grapejuice': ['grape', 'juice'],
300
+ 'flowerlike': ['flower', 'like'],
301
+ 'understandthe': ['understand', 'the'],
302
+ 'welldressed': ['well', 'dressed'],
303
+ 'wouldlove': ['would', 'love'],
304
+ 'blendedinto': ['blended', 'into'],
305
+ 'buttcheeks': ['butt', 'cheeks'],
306
+ 'clownlike':['clown', 'like'],
307
+ 'davinchi': ['da', 'vinci'],
308
+ 'veryperfect': ['very', 'perfect'],
309
+ 'supervillian': 'supervillain',
310
+ 'simpleand': ['simple', 'and'],
311
+ 'seemsout': ['seems', 'out'],
312
+ 'rainbowmeeting': ['rainbow', 'meeting'],
313
+ 'strobelights': ['strobe', 'lights'],
314
+ 'subltness': 'subtleness',
315
+ 'throughthe': ['through', 'the'],
316
+ 'paintingfreaks': ['painting', 'freaks'],
317
+ 'muchgoing': ['much', 'going'],
318
+ 'meditterean': 'mediterranean',
319
+ 'instaneous': 'instantaneous',
320
+ 'helpthe': ['help', 'the'],
321
+ 'bizzarly': 'bizarrely',
322
+ 'crimescene': ['crime', 'scene'],
323
+ 'deathlife': ['death', 'life'],
324
+ 'dancefight': ['dance', 'fight'],
325
+ 'blahblahblah': ['blah', 'blah', 'blah'],
326
+ 'disporportioned': 'disproportionate',
327
+ 'dreamstate': ['dream', 'state'],
328
+ 'eithermight': ['either', 'might'],
329
+ 'enviornemt': 'environment',
330
+ 'greenbackground': ['green', 'background'],
331
+ 'greybackground': ['grey', 'background'],
332
+ 'handrwawing': ['hand', 'drawing'],
333
+ 'happycause': ['happy', 'cause'],
334
+ 'thelayout': ['the', 'layout'],
335
+ 'greatgrandparent': ['great', 'grand', 'parent'],
336
+ 'greatgrandparents': ['great', 'grand', 'parents'],
337
+ 'likesomething': ['like', 'something'],
338
+ 'likethey': ['like', 'they'],
339
+ 'makingthe': ['making', 'the'],
340
+ 'mideviltimes': ['medieval', 'times'],
341
+ 'moviestar': ['movie', 'star'],
342
+ 'shroudlike': ['shroud', 'like'],
343
+ 'blackscale': ['black', 'scale'],
344
+ 'bothsides': ['both', 'sides'],
345
+ 'fallevening': ['fall', 'evening'],
346
+ 'breaklight': ['break', 'light'],
347
+ 'springgarden': ['spring', 'garden'],
348
+ 'pointalist': 'pointillism',
349
+ 'hemmeroid': 'hemorrhoid',
350
+ 'bonaroo': 'bonnaroo',
351
+ 'boardshorts': ['board', 'shorts'],
352
+ 'luminousand': ['luminous', 'and'],
353
+ 'iceskating': ['ice', 'skating'],
354
+ 'ewwww' :'ew',
355
+ 'bloodsplatter': ['blood', 'splatter'],
356
+ 'beastlike': ['beast', 'like'],
357
+ 'entendra': 'entendre',
358
+ 'dollbaby': ['doll', 'baby'],
359
+ 'eachothers': ['each', 'others'],
360
+ 'backlooking': ['back', 'looking'],
361
+ 'enjoynthe': ['enjoy', 'the'],
362
+ 'stormcloud': ['storm', 'cloud'],
363
+ 'playwriter': ['play', 'writer'],
364
+ 'hyroglifics': 'hieroglyphics',
365
+ 'lilypads': ['lily', 'pads'],
366
+ 'ivreqlly': ['i', 'really'],
367
+ 'kindnof': ['kind', 'of'],
368
+ 'selfconcious': ['self', 'conscious'],
369
+ 'reprensation': 'representation',
370
+ 'eerieness' : 'eeriness',
371
+ 'paining': 'painting',
372
+ 'thats': ['that', 'is'],
373
+ 'xmas': 'christmas',
374
+ 'swordbearer' : ['sword', 'bearer'],
375
+ 'outcseeing': ['out', 'seeing'],
376
+ 'gatheredaround': ['gathered', 'around'],
377
+ 'lockeroom': ['locker', 'room'],
378
+ 'adrogonius': 'androgynous',
379
+ 'mezmesring': 'mesmerising',
380
+ 'powderoom': ['powder', 'room'],
381
+ 'tenalady': ['tena', 'lady', 'pads'],
382
+ 'storytale': ['story', 'tale'],
383
+ 'dipropratnly': 'disproportionately',
384
+ 'clotheless': 'clothless',
385
+ 'maculopothy': 'maculopathy',
386
+ 'meanmugging': ['mean', 'mugging'],
387
+ 'shadowwork': ['shadow', 'work'],
388
+ 'paintstrokes': ['paint', 'strokes'],
389
+ 'makenit': ['make', 'it'],
390
+ 'ofcolors': ['of', 'colors'],
391
+ 'thevdevilish': ['the', 'devilish'],
392
+ 'lilipads': ['lily', 'pads'],
393
+ 'lilypad': ['lily', 'pad'],
394
+ 'prusinors': 'prisoners',
395
+ 'thebattle': ['the', 'battle'],
396
+ 'bathingsuit': ['bathing', 'suit'],
397
+ 'thencolors': ['the', 'colors'],
398
+ 'morexcitingand': ['more', 'exciting', 'and'],
399
+ 'thebeggining': ['the', 'beginning'],
400
+ 'imageryand': ['imagery', 'and'],
401
+ 'contentness': 'contentedness',
402
+ 'oversimplicity': ['over', 'simplicity'],
403
+ 'overexausted': ['over', 'exhausted'],
404
+ 'uninterst': 'uninterest',
405
+ 'theanfels': ['the', 'angels'],
406
+ 'bittypenis': ['bitty', 'penis'],
407
+ 'intellegiant': 'intelligent',
408
+ 'fauxfur': ['faux', 'fur'],
409
+ 'togetherther': 'together',
410
+ 'murakmi': 'murakami',
411
+ 'diffinterate': 'different',
412
+ 'deatheater': ['death', 'eater'],
413
+ 'grafittied': 'graffitied',
414
+ 'colortheme': ['color', 'theme'],
415
+ 'herevening': ['her', 'evening'],
416
+ 'comradarie': 'camaraderie',
417
+ 'gradeintly': 'gradiently',
418
+ 'womenreally': ['woman', 'really'],
419
+ 'renduveousing': 'rendezvousing',
420
+ 'unsettleness': 'unsettledness',
421
+ 'desolutioned': 'disillusioned',
422
+ 'bucketlist': ['bucket', 'list'],
423
+ 'contrastful': 'contrasting',
424
+ 'snailshell': ['snail', 'shell'],
425
+ 'figureswithin': ['figures', 'within'],
426
+ 'semitrical': 'symmetrical',
427
+ 'happinessand': ['happiness', 'and'],
428
+ 'firepit':['fire', 'pit'],
429
+ 'firepits':['fire', 'pits'],
430
+ 'spectrumand': ['spectrum', 'and'],
431
+ 'skyblue': ['sky', 'blue'],
432
+ 'duststorm': ['dust', 'storm'],
433
+ 'ultrawide': ['ultra', 'wide'],
434
+ 'containmatated': 'contaminated',
435
+ 'dressesbis': ['dresses', 'is'],
436
+ 'underdetailed': ['under', 'detailed'],
437
+ 'pitchblack': ['pitch', 'black'],
438
+ 'andvserious': ['and', 'serious'],
439
+ 'peaceand': ['peace', 'and'],
440
+ 'drawingnif': 'drawing',
441
+ 'patternsmake': ['patterns', 'make'],
442
+ 'andvwilling': ['and', 'willing'],
443
+ 'thecdeeamy': ['the', 'dreamy'],
444
+ 'puntilism': 'pointillism',
445
+ 'thecangel': ['the', 'angel'],
446
+ 'awestriking': ['awe', 'striking'],
447
+ 'awestrucking': ['awe', 'striking'],
448
+ 'awestrikng': ['awe', 'striking'],
449
+ 'ofvthe': ['of', 'the'],
450
+ 'desaturatation': 'desaturation',
451
+ 'colrscare': ['colors', 'are'],
452
+ 'looksmessy': ['looks', 'messy'],
453
+ 'thecfeelingvthis': ['the', 'feeling', 'for', 'this'],
454
+ 'manyngood': ['many', 'and', 'good'],
455
+ 'mandsface': ['man\'s', 'face'],
456
+ 'essencence': 'essence',
457
+ 'confuseable': 'confusing',
458
+ 'frizzyness': 'frizziness',
459
+ 'waterbuffalo': ['water', 'buffalo'],
460
+ 'cinaplex' :'cineplex',
461
+ 'clocktowers': ['clock', 'towers'],
462
+ 'aysterutym': 'austerity',
463
+ 'conthemporan': 'contemporary',
464
+ 'coldsore': ['cold', 'sore'],
465
+ 'redflas': ['red', 'flash'],
466
+ 'pompnceremony': ['pomp', 'and', 'ceremony'],
467
+ 'etchisketch': ['etch', 'a', 'sketch'],
468
+ 'durdledoor': ['durdle', 'door'],
469
+ 'eyessquinted': ['eyes', 'squinted'],
470
+ 'colorfullness': 'colorfulness',
471
+ 'christchild': ['christ', 'child'],
472
+ 'wispyness': 'wispiness',
473
+ 'whispiness': 'wispiness',
474
+ 'imaturebut': ['immature', 'but'],
475
+ 'raphealites': 'raphaelites',
476
+ 'late1700': ['late', '1700'],
477
+ 'remnicient': 'reminiscent',
478
+ 'twonsubjecta': ['two', 'subjects'],
479
+ 'awestricken': 'awestruck',
480
+ 'withnumerous': ['with', 'numerous'],
481
+ 'colorsmake': ['colors', 'make'],
482
+ 'vmcolors': ['colors'],
483
+ 'roseyness': 'rosiness',
484
+ 'holdingthe': ['holding', 'the'],
485
+ 'gruesomness': 'gruesomeness',
486
+ 'linedrawing': ['line', 'drawing'],
487
+ 'orangatange': 'orangutan',
488
+ 'naaahhhh': 'nah',
489
+ 'micropattern': ['micro', 'pattern'],
490
+ 'nephilims': 'nephilim',
491
+ 'middleaged': ['middle', 'aged'],
492
+ 'thevnanyvdifferent': ['the', 'many', 'different'],
493
+ 'flirtatously': 'flirtatiously',
494
+ 'nitemare': 'nightmare',
495
+ 'okaaaay': 'ok',
496
+ 'crucufication': 'crucifixion',
497
+ 'manywindow': ['many', 'windows'],
498
+ 'panaroema': 'panorama',
499
+ 'wowwwwwww': 'wow',
500
+ 'theaqua': ['the', 'aqua'],
501
+ 'andexcited': ['and', 'excited'],
502
+ 'frommthe': ['from', 'the'],
503
+ 'thecanal': ['the', 'canal'],
504
+ 'focalpointcof': ['focal', 'point', 'of'],
505
+ 'silouhete': 'silhouette',
506
+ 'physcadelic': 'psychedelic',
507
+ 'tonesmakes': ['tones', 'make'],
508
+ 'reallyenjoying': ['really', 'enjoying'],
509
+ 'disportionate': 'disproportionate',
510
+ 'spidermonkey': ['spider', 'monkey'],
511
+ 'lookswise': ['looks', 'wise'],
512
+ 'wasewas': 'was',
513
+ 'inbthe': ['in', 'the'],
514
+ 'coronvirus': 'coronavirus',
515
+ 'overdramtic': 'overdramatic',
516
+ 'favarouite': 'favorite',
517
+ 'reallyike': ['really', 'like'],
518
+ 'thesoldier': ['the', 'soldier'],
519
+ 'flowerboxes': ['flower', 'boxes'],
520
+ 'envoirment': 'environment',
521
+ 'theirfaces': ['their', 'faces'],
522
+ 'neccasssary': 'necessary',
523
+ 'ghostlyness': 'ghostliness',
524
+ 'trytophobia': 'trypophobia',
525
+ 'tripophobia': 'trypophobia',
526
+ 'woodprinting': ['wood', 'printing'],
527
+ 'roomoom': 'room',
528
+ 'outmyself': ['out', 'myself'],
529
+ 'evildoing': ['evil', 'doing'],
530
+ 'deliousious': 'delicious',
531
+ 'thebfigure': ['the', 'figure'],
532
+ 'sleeptime': ['sleep', 'time'],
533
+ 'isnspooky': ['is', 'spooky'],
534
+ 'comtempory': 'contemporary',
535
+ 'smilingred': ['smiling', 'red'],
536
+ 'ooranateness': 'ornateness',
537
+ 'joyfilled': ['joy', 'filled'],
538
+ 'ghosttown': ['ghost', 'town'],
539
+ 'obvious–that': ['obvious', 'that'],
540
+ 'photobooth': ['photo', 'booth'],
541
+ 'thinknof': ['think', 'of'],
542
+ 'extrodianary': 'extraordinary',
543
+ 'thewindow': ['the', 'window'],
544
+ "'indistinctive": 'indistinctive',
545
+ 'vilianouis': 'villainous',
546
+ 'farmtown': ['farm', 'town'],
547
+ 'handdrawing': ['hand', 'drawing'],
548
+ 'sophisticatedcriminal': ['sophisticated', 'criminal'],
549
+ 'beautifuldepiction': ['beautiful', 'depiction'],
550
+ 'plantetscolliding': ['planets', 'colliding'],
551
+ 'greytones': ['grey', 'tones'],
552
+ 'likepaint': ['like', 'paint'],
553
+ 'leatherworker': ['leather', 'worker'],
554
+ 'cobrownand': ['brown', 'and'],
555
+ 'bluegreens': ['blue', 'greens'],
556
+ 'polkadots': ['polka', 'dots'],
557
+ 'attirewear': ['attire', 'wear'],
558
+ 'disssary': 'disarray',
559
+ 'simplictiness': 'simplicity',
560
+ 'likelord': ['like', 'a', 'lord'],
561
+ 'orbtalking': ['or', 'talking'],
562
+ 'colorscheme': ['color', 'scheme'],
563
+ 'grouchypants': ['grouchy', 'pants'],
564
+ 'renosanse': 'renaissance',
565
+ 'renessciance': 'renaissance',
566
+ 'impaitily': 'impatiently',
567
+ 'hyrogliphic': 'hieroglyphic',
568
+ 'enduresess': 'endures',
569
+ 'orangecand': ['orange', 'and'],
570
+ 'emiotnals': 'emotion',
571
+ 'countryclub': ['country', 'club'],
572
+ 'branchhill': ['branch', 'hill'],
573
+ 'homospiens': ['homo', 'sapiens'],
574
+ 'beautifuland': ['beautiful', 'and'],
575
+ 'birchtree': ['birch', 'tree'],
576
+ 'seemslike': ['seems', 'like'],
577
+ 'beuaktufl': 'beautiful',
578
+ 'appearlike': ['appear', 'like'],
579
+ 'browngrounds': ['brown', 'grounds'],
580
+ 'morningtime': ['morning', 'time'],
581
+ 'jerrsaic': 'jurassic',
582
+ 'feelabout': ['feel','about'],
583
+ 'linestrokes': ['line', 'strokes'],
584
+ 'lifesized': ['life', 'sized'],
585
+ 'thevlower': ['the', 'lower'],
586
+ 'paitngig': 'painting',
587
+ 'handdrawn': ['hand', 'drawn'],
588
+ 'facefrom': ['face', 'from'],
589
+ 'treesmake': ['trees', 'make'],
590
+ 'chesspiece': ['chess', 'piece'],
591
+ 'balletdancer': ['ballet', 'dancer'],
592
+ 'motionblurr': ['motion', 'blur'],
593
+ 'varietyframes': ['variety', 'frames'],
594
+ 'nondetailed': ['non', 'detailed'],
595
+ 'shadowsplus': ['shadows', 'plus'],
596
+ 'bellpeppers': ['bell', 'peppers'],
597
+ 'thebackground': ['the', 'background'],
598
+ 'playwith': ['play', 'with'],
599
+ 'facialmexpression': ['facial', 'expression'],
600
+ 'compositionntells': ['composition', 'tells'],
601
+ 'playfulexpression': ['playful', 'expression'],
602
+ 'somethingforeboding': ['something', 'foreboding'],
603
+ 'everythingnbeing': ['everything', 'being'],
604
+ 'beingbsomseperated': ['being', 'separated'],
605
+ 'nececassry': 'necessary',
606
+ 'oppurnity': 'opportunity',
607
+ 'undistinguishable': 'indistinguishable'
608
+ }
609
+
610
+
611
+ ##
612
+ ## Set, containing words found in ArtEmis but not in Glove. (for the curious reader...)
613
+ ## Curated manually by Panos circa 2020.
614
+ ##
615
+ missing_from_glove_but_are_actual_words = {
616
+ 'agfacolor', 'photobomb', 'modernness', 'altamouras',
617
+ 'invitingness', 'kinkadian', 'unfinishedness',
618
+ 'gainsboro', 'normalness', 'harmoniousness', 'tenebrism',
619
+ 'neckpiece', 'immenseness', 'distastefulness', 'delicateness',
620
+ 'disjointedness', 'midground', 'pulchritudinously', 'maculopathy',
621
+ 'ornateness', 'alienesque', 'bemedaled', 'mundaneness', 'ghoulishness',
622
+ 'hecticness', 'comfortability', 'deathscape', 'snowpiercer', 'acuarela',
623
+ 'pedophillic', 'monochromatically', 'futuristically', 'remnicient',
624
+ 'sereneness', 'tenebrism', 'midground', 'delicateness', 'ornateness',
625
+ 'neckpiece', 'pompousness', 'comfortability', 'contentful', 'disjointedness',
626
+ 'delicateness', 'suitcoat', 'slenderman', 'wispiness', 'realisticness',
627
+ 'splotchiness', 'chubbiness', 'respectfulness', 'chemtrail', 'ominousness',
628
+ 'douchebag', 'naturescape', 'indistinctive', 'hellscape', 'blobbiness',
629
+ 'mountainscape', 'exoticness', 'colorscape', 'overdramatic', 'snowscape',
630
+ 'oceanscape', 'stunningness', 'hyperreligiosity', 'trypophobia', 'treescape',
631
+ 'prayerfulness', 'slothlike', 'tablescape', 'indistinctive', 'imaginativeness',
632
+ 'sincereness', 'rejoicement', 'loyalness', 'hypersexualization', 'solemnness',
633
+ 'boringness', 'hypersexualizing', 'centermost'
634
+ }
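These dictionaries are consumed by the pre-processing utilities; as a sketch, this is how token_spelling_dictionary interacts with manual_tokenized_sentence_spelling (the import path assumes the package is importable as `artemis`):

    from artemis.language.language_preprocessing import manual_tokenized_sentence_spelling

    tokens = ['im', 'in', 'awe', 'of', 'the', 'colour']
    print(manual_tokenized_sentence_spelling(tokens, token_spelling_dictionary))
    # ['i', 'am', 'in', 'awe', 'of', 'the', 'color']  -- list-valued entries are expanded in place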
imageprocessing/artemis/artemis/neural_models/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ The MIT License (MIT)
3
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
4
+ """
imageprocessing/artemis/artemis/neural_models/attention.py ADDED
@@ -0,0 +1,45 @@
1
+ """
2
+ Language-Vision Attention Utilities.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+
10
+ from torch import nn
11
+
12
+
13
+ class AdditiveVisioLinguistic(nn.Module):
14
+ """
15
+ Given a vector summarizing the linguistic information processed by a pipeline
16
+ (e.g. k-th output of RNN) attend to a 2D grid (e.g., image pixels).
17
+ This mechanism *adds* the two sources of information to compute the attention (hence the name additive).
18
+ """
19
+
20
+ def __init__(self, encoder_dim, decoder_dim, attention_dim):
21
+ """
22
+ :param encoder_dim: (int) feature size (last dimension) of encoded images (e.g., [B x H x W] x encoder_dim)
23
+ :param decoder_dim: (int) feature size of decoder's output (summarizing linguistic information)
24
+ :param attention_dim: (int) feature size of the attention space
25
+ """
26
+ super(AdditiveVisioLinguistic, self).__init__()
27
+ self.encoder_att = nn.Linear(encoder_dim, attention_dim) # linear layer to transform encoded image
28
+ self.decoder_att = nn.Linear(decoder_dim, attention_dim) # linear layer to transform decoder's output
29
+ self.full_att = nn.Linear(attention_dim, 1) # linear layer to calculate values to be softmax-ed
30
+ self.relu = nn.ReLU(inplace=True)
31
+ self.softmax = nn.Softmax(dim=1) # softmax layer to calculate weights
32
+
33
+ def __call__(self, encoder_out, decoder_hidden):
34
+ """
35
+ Forward propagation.
36
+ :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
37
+ :param decoder_hidden: previous decoder output, a tensor of dimension (batch_size, decoder_dim)
38
+ :return: attention weighted encoding, weights
39
+ """
40
+ att1 = self.encoder_att(encoder_out) # (batch_size, num_pixels, attention_dim)
41
+ att2 = self.decoder_att(decoder_hidden) # (batch_size, attention_dim)
42
+ att = self.full_att(self.relu(att1 + att2.unsqueeze(1))).squeeze(2) # (batch_size, num_pixels)
43
+ alpha = self.softmax(att) # (batch_size, num_pixels)
44
+ attention_weighted_encoding = (encoder_out * alpha.unsqueeze(2)).sum(dim=1) # (batch_size, encoder_dim)
45
+ return attention_weighted_encoding, alpha
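A small shape sanity-check for the module above (the dimensions are arbitrary and chosen only for illustration):

    import torch

    B, P, E, D, A = 4, 49, 512, 256, 128   # batch, pixels, encoder, decoder, attention dims
    att = AdditiveVisioLinguistic(encoder_dim=E, decoder_dim=D, attention_dim=A)

    encoder_out = torch.rand(B, P, E)       # e.g., a 7x7 ResNet grid flattened to 49 "pixels"
    decoder_hidden = torch.rand(B, D)
    context, alpha = att(encoder_out, decoder_hidden)
    print(context.shape, alpha.shape)       # torch.Size([4, 512]) torch.Size([4, 49])
    assert torch.allclose(alpha.sum(dim=1), torch.ones(B))  # attention weights sum to 1 per image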
imageprocessing/artemis/artemis/neural_models/attentive_decoder.py ADDED
@@ -0,0 +1,696 @@
1
+ """
2
+ Decoding module for a neural speaker (with attention capabilities).
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 06/15/19, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import random
11
+ import time
12
+ import warnings
13
+ import tqdm
14
+ import math
15
+ import numpy as np
16
+ import torch.nn.functional as F
17
+ from torch import nn
18
+ from torch.nn.utils.rnn import pack_padded_sequence
19
+ from torch.nn.utils import clip_grad_norm_
20
+
21
+ from .attention import AdditiveVisioLinguistic
22
+ from ..utils.stats import AverageMeter
23
+
24
+
25
+ class AttentiveDecoder(nn.Module):
26
+ """
27
+ Note: code adapted from: https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning
28
+ implementing a solid version of Show, Attend, and Tell. Many thanks to Sagar and the team.
29
+
30
+ Special (optional) features:
31
+ - use stochastic teacher forcing
32
+ - add auxiliary input data at each decoding step (besides each 'previous' token).
33
+ - tie the weights of the encoder/decoder weight matrices
34
+ """
35
+ def __init__(self, word_embedding, rnn_hidden_dim, encoder_dim, attention_dim,
36
+ vocab, dropout_rate=0, tie_weights=False, teacher_forcing_ratio=1,
37
+ auxiliary_net=None, auxiliary_dim=0):
38
+ """
39
+ :param word_embedding: nn.Embedding
40
+ :param rnn_hidden_dim: hidden (and thus output) dimension of the decoding rnn
41
+ :param encoder_dim: feature dimension of encoded stimulus
42
+ :param attention_dim: feature dimension over which attention is computed
43
+ :param vocab: artemis.utils.vocabulary instance
44
+ :param dropout_rate: dropout rate
45
+ :param tie_weights: (opt, boolean) if True, the hidden-to-word weights are equal (tied) to the word-embeddings,
46
+ see https://arxiv.org/abs/1611.01462 for explanation of why this might be a good idea.
47
+ :param teacher_forcing_ratio:
48
+ :param auxiliary_net: (optional) nn.Module that will be feeding the decoder at each time step
49
+ with some "auxiliary" information (say an emotion label). Obviously, this information is separate than the
50
+ output of the typically used image-encoder.
51
+ :param auxiliary_dim: (int, optional) the output feature-dimension of the auxiliary net.
52
+ """
53
+ super(AttentiveDecoder, self).__init__()
54
+ self.vocab = vocab
55
+ self.vocab_size = len(vocab)
56
+ self.word_embedding = word_embedding
57
+ self.auxiliary_net = auxiliary_net
58
+ self.uses_aux_data = False
59
+
60
+ if auxiliary_dim > 0:
61
+ self.uses_aux_data = True
62
+
63
+ self.decode_step = nn.LSTMCell(word_embedding.embedding_dim + encoder_dim + auxiliary_dim, rnn_hidden_dim)
64
+ self.attention = AdditiveVisioLinguistic(encoder_dim, rnn_hidden_dim, attention_dim)
65
+
66
+ if dropout_rate > 0:
67
+ self.dropout = nn.Dropout(p=dropout_rate, inplace=True)
68
+ else:
69
+ self.dropout = nn.Identity()
70
+
71
+ self.init_h = nn.Linear(encoder_dim, rnn_hidden_dim) # linear layer to find initial hidden state of LSTMCell
72
+ self.init_c = nn.Linear(encoder_dim, rnn_hidden_dim) # linear layer to find initial cell state of LSTMCell
73
+ self.f_beta = nn.Linear(rnn_hidden_dim, encoder_dim) # linear layer to create a sigmoid-activated gate
74
+ self.sigmoid = nn.Sigmoid()
75
+ self.next_word = nn.Linear(rnn_hidden_dim, self.vocab_size) # linear layer to find scores over vocabulary
76
+ self.init_weights()
77
+ self.teacher_forcing_ratio = teacher_forcing_ratio
78
+
79
+ if tie_weights:
80
+ if self.word_embedding.embedding_dim != rnn_hidden_dim:
81
+ raise ValueError('when tying weights, the word-embedding dimension must equal rnn_hidden_dim')
82
+ print('tying weights of encoder/decoder')
83
+ self.next_word.weight = self.word_embedding.weight
84
+
85
+ def init_hidden_state(self, encoder_out):
86
+ """
87
+ Creates the initial hidden and cell states for the decoder's LSTM based on the encoded images.
88
+ :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
89
+ :return: hidden state, cell state
90
+ """
91
+ mean_encoder_out = encoder_out.mean(dim=1)
92
+ h = self.init_h(mean_encoder_out) # (batch_size, decoder_dim)
93
+ c = self.init_c(mean_encoder_out)
94
+ return h, c
95
+
96
+ def init_weights(self, init_range=0.1):
97
+ """ Better initialization """
98
+ self.word_embedding.weight.data.uniform_(-init_range, init_range) # remove if pre-trained model comes up
99
+ self.next_word.bias.data.zero_()
100
+ self.next_word.weight.data.uniform_(-init_range, init_range)
101
+
102
+ def __call__(self, encoder_out, captions, auxiliary_data=None):
103
+ """ Forward propagation.
104
+ :param encoder_out: encoded images, a tensor of dimension (batch_size, enc_image_size, enc_image_size, encoder_dim)
105
+ :param captions: encoded captions, a tensor of dimension (batch_size, max_caption_length)
106
+ :param auxiliary_data: extra information associated with the images (batch_size, some_dim)
107
+ :return: scores for vocabulary, sorted encoded captions, decode lengths, weights, sort indices
108
+ """
109
+ return self.sort_captions_and_forward(encoder_out, captions, auxiliary_data=auxiliary_data)
110
+
111
+ def sort_captions_and_forward(self, encoder_out, captions, auxiliary_data=None):
112
+ """ Feed forward that ...
113
+ :param encoder_out:
114
+ :param captions:
115
+ :return:
116
+ """
117
+ batch_size = encoder_out.size(0)
118
+ encoder_dim = encoder_out.size(-1)
119
+
120
+ # Flatten image
121
+ encoder_out = encoder_out.view(batch_size, -1, encoder_dim) # (batch_size, num_pixels, encoder_dim)
122
+ num_pixels = encoder_out.size(1)
123
+
124
+ decode_lengths = torch.where(captions == self.vocab.eos)[1] # "<sos> I am <eos>" => decode_length = 3
125
+ # we do not feed <eos> as input to generate
126
+ # something after it
127
+
128
+ # Sort input data by decreasing lengths to reduce compute below
129
+ decode_lengths, sort_ind = decode_lengths.sort(dim=0, descending=True)
130
+ encoder_out = encoder_out[sort_ind]
131
+ captions = captions[sort_ind]
132
+
133
+ if auxiliary_data is not None:
134
+ auxiliary_data = auxiliary_data[sort_ind]
135
+ auxiliary_data = self.auxiliary_net(auxiliary_data)
136
+
137
+ # prepare for unravelling
138
+ embeddings = self.word_embedding(captions) # (batch_size, max_caption_length, embed_dim)
139
+ h, c = self.init_hidden_state(encoder_out) # (batch_size, decoder_dim)
140
+ decode_lengths = decode_lengths.tolist()
141
+ device = embeddings.device
142
+
143
+ # Create tensors to hold word prediction logits and attention maps (alphas)
144
+ predictions = torch.zeros(batch_size, max(decode_lengths), self.vocab_size).to(device)
145
+ alphas = torch.zeros(batch_size, max(decode_lengths), num_pixels).to(device)
146
+
147
+ # At each time-step, decode by
148
+ # attention-weighing the encoder's output based on the decoder's previous hidden state output
149
+ # then generate a new word in the decoder with the previous word and the attention weighted encoding
150
+ for t in range(max(decode_lengths)):
151
+ batch_size_t = sum([l > t for l in decode_lengths])
152
+ h = h[:batch_size_t] # effective h
153
+ attention_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_t], h)
154
+ gate = self.sigmoid(self.f_beta(h)) # gating scalar, (batch_size_t, encoder_dim)
155
+ attention_weighted_encoding = gate * attention_weighted_encoding
156
+
157
+ use_teacher_forcing = random.random() < self.teacher_forcing_ratio  # stochastic teacher forcing
158
+
159
+ if use_teacher_forcing or t == 0:
160
+ decoder_lang_input = embeddings[:batch_size_t, t]
161
+ else:
162
+ _, top_pred = preds[:batch_size_t].topk(1)
163
+ top_pred = top_pred.squeeze(-1).detach() # detach from history as input
164
+ decoder_lang_input = self.word_embedding(top_pred)
165
+
166
+ if auxiliary_data is not None:
167
+ auxiliary_data_t = auxiliary_data[:batch_size_t]
168
+ decoder_in = torch.cat([decoder_lang_input, attention_weighted_encoding, auxiliary_data_t], dim=1)
169
+ else:
170
+ decoder_in = torch.cat([decoder_lang_input, attention_weighted_encoding], dim=1)
171
+
172
+ h, c = self.decode_step(decoder_in, (h, c[:batch_size_t])) # (batch_size_t, decoder_dim)
173
+
174
+ preds = self.next_word(self.dropout(h)) # (batch_size_t, vocab_size)
175
+ predictions[:batch_size_t, t] = preds
176
+ alphas[:batch_size_t, t] = alpha
177
+ return predictions, captions, decode_lengths, alphas, sort_ind
178
+
179
+ def attend_and_predict_next_word(self, encoder_out, h, c, tokens, aux_data=None):
180
+ """Given current hidden/memory state of the decoder and the input tokens, guess the next tokens
181
+ and update the hidden/memory states.
182
+ :param encoder_out: the grounding
183
+ :param h: current hidden state
184
+ :param c: current memory state
185
+ :param tokens: current token input to the decoder
186
+ :return: logits over vocabulary distribution, updated h/c
187
+ """
188
+ attention_weighted_encoding, alpha = self.attention(encoder_out, h)
189
+ gate = self.sigmoid(self.f_beta(h)) # gating scalar, (batch_size_t, encoder_dim)
190
+ attention_weighted_encoding = gate * attention_weighted_encoding
191
+ embeddings = self.word_embedding(tokens) # (batch_size, embed_dim)
192
+
193
+ decoder_input = torch.cat([embeddings, attention_weighted_encoding], dim=1)
194
+
195
+ if aux_data is not None:
196
+ aux_feat = self.auxiliary_net(aux_data)
197
+ decoder_input = torch.cat([decoder_input, aux_feat], dim=1)
198
+
199
+ h, c = self.decode_step(decoder_input, (h, c)) # (batch_size_t, decoder_dim)
200
+ logits = self.next_word(h) # (batch_size_t, vocab_size)
201
+ return h, c, logits, alpha
202
+
203
+
204
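For intuition, the step-wise attend_and_predict_next_word API above can drive a very simple greedy sampler. The sketch below is illustrative only: it assumes the vocabulary exposes a `sos` index (mirroring the `eos` used in the forward pass), ignores emotion grounding, and does not stop early at `<eos>`; it is not the repository's actual sampling code.

    @torch.no_grad()
    def greedy_decode_sketch(decoder, encoder_out, max_len=30):
        """Greedily pick the arg-max word at every step, starting from <sos> (illustrative helper)."""
        b = encoder_out.size(0)
        encoder_out = encoder_out.view(b, -1, encoder_out.size(-1))  # flatten spatial grid: (B, P, E)
        h, c = decoder.init_hidden_state(encoder_out)
        tokens = torch.full((b,), decoder.vocab.sos, dtype=torch.long, device=encoder_out.device)
        out = []
        for _ in range(max_len):
            h, c, logits, _ = decoder.attend_and_predict_next_word(encoder_out, h, c, tokens)
            tokens = logits.argmax(dim=-1)  # most likely next word per image
            out.append(tokens)
        return torch.stack(out, dim=1)  # (B, max_len) token ids (may run past <eos>)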
+ def single_epoch_train(train_loader, model, criterion, optimizer, epoch, device, tb_writer=None, **kwargs):
205
+ """ Perform training for one epoch.
206
+ :param train_loader: DataLoader for training data
207
+ :param model: nn.ModuleDict with 'encoder', 'decoder' keys
208
+ :param criterion: loss layer
209
+ :param optimizer: optimizer
210
+ :param epoch: epoch number
211
+ :param device:
212
+ """
213
+ alpha_c = kwargs.get('alpha_c', 1.0) # Weight of doubly stochastic (attention) regularization.
214
+ grad_clip = kwargs.get('grad_clip', 5.0) # Gradient clipping (norm magnitude)
215
+ print_freq = kwargs.get('print_freq', 100)
216
+ use_emotion = kwargs.get('use_emotion', False)
217
+
218
+ batch_time = AverageMeter() # forward prop. + back prop. time
219
+ data_time = AverageMeter() # data loading time
220
+ entropy_loss_meter = AverageMeter() # entropy loss (per word decoded)
221
+ total_loss_meter = AverageMeter()
222
+ start = time.time()
223
+ steps_taken = (epoch-1) * len(train_loader.dataset)
224
+ model.train()
225
+
226
+ for i, batch in enumerate(train_loader):
227
+ imgs = batch['image'].to(device)
228
+ caps = batch['tokens'].to(device)
229
+ b_size = len(imgs)
230
+ data_time.update(time.time() - start)
231
+
232
+ if use_emotion:
233
+ emotion = batch['emotion'].to(device)
234
+ res = model.decoder(model.encoder(imgs), caps, emotion)
235
+ else:
236
+ res = model.decoder(model.encoder(imgs), caps)
237
+ logits, caps_sorted, decode_lengths, alphas, sort_ind = res
238
+
239
+ # Since we decoded starting with <sos>, the targets are all words after <sos>, up to <eos>
240
+ targets = caps_sorted[:, 1:]
241
+
242
+ # Remove time-steps that we didn't decode at, or are pads
243
+ # pack_padded_sequence is an easy trick to do this
244
+ logits = pack_padded_sequence(logits, decode_lengths, batch_first=True)
245
+ targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)
246
+
247
+ # Calculate loss
248
+ ent_loss = criterion(logits.data, targets.data)
249
+ total_loss = ent_loss
250
+
251
+ # Add doubly stochastic attention regularization
252
+ # Note: some implementations simply compute this as: d_atn_loss = alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
253
+ # here we account for the fact that some samples in the same batch have more/fewer tokens than others.
254
+ if alpha_c > 0:
255
+ total_energy = torch.from_numpy(np.array(decode_lengths)) / alphas.shape[-1] # n_tokens / num_pixels
256
+ total_energy.unsqueeze_(-1) # B x 1
257
+ total_energy = total_energy.to(device)
258
+ d_atn_loss = alpha_c * ((total_energy - alphas.sum(dim=1)) ** 2).mean()
259
+ total_loss += d_atn_loss
260
+
261
+ # Back prop.
262
+ optimizer.zero_grad()
263
+ total_loss.backward()
264
+ if grad_clip is not None:
265
+ clip_grad_norm_(model.parameters(), grad_clip)
266
+
267
+ # Update weights
268
+ optimizer.step()
269
+
270
+ # Keep track of metrics
271
+ entropy_loss_meter.update(ent_loss.item(), sum(decode_lengths))
272
+ total_loss_meter.update(total_loss.item(), sum(decode_lengths))
273
+ batch_time.update(time.time() - start)
274
+ start = time.time()
275
+ steps_taken += b_size
276
+
277
+ # Print status
278
+ if print_freq is not None and i % print_freq == 0:
279
+ print('Epoch: [{0}][{1}/{2}]\t'
280
+ 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
281
+ 'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
282
+ 'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(epoch, i, len(train_loader),
283
+ batch_time=batch_time,
284
+ data_time=data_time,
285
+ loss=total_loss_meter))
286
+ if tb_writer is not None:
287
+ tb_writer.add_scalar('training-entropy-loss-with-batch-granularity', entropy_loss_meter.avg, steps_taken)
288
+
289
+ return total_loss_meter.avg
290
+
291
+
292
+ @torch.no_grad()
293
+ def negative_log_likelihood(model, data_loader, device):
294
+ """
295
+ :param model:
296
+ :param data_loader:
297
+ :param device:
298
299
+ :return: the average per-token negative log-likelihood (cross-entropy) over the dataset
300
+ """
301
+ model.eval()
302
+ nll = AverageMeter()
303
+
304
+ aux_data = None
305
+ for batch in data_loader:
306
+ imgs = batch['image'].to(device)
307
+ caps = batch['tokens'].to(device)
308
+
309
+ # TODO Refactor
310
+ if model.decoder.uses_aux_data:
311
+ aux_data = batch['emotion'].to(device)
312
+
313
+ logits, caps_sorted, decode_lengths, alphas, sort_ind = model.decoder(model.encoder(imgs), caps, aux_data)
314
+
315
+ # Since we decoded starting with <sos>, the targets are all words after <sos>, up to <eos>
316
+ targets = caps_sorted[:, 1:]
317
+
318
+ # Remove time-steps that we didn't decode at, or are pads
319
+ # pack_padded_sequence is an easy trick to do this
320
+ logits = pack_padded_sequence(logits, decode_lengths, batch_first=True)
321
+ targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)
322
+
323
+ # Calculate loss
324
+ loss = F.cross_entropy(logits.data, targets.data)
325
+ nll.update(loss.item(), sum(decode_lengths))
326
+ return nll.avg
327
+
328
+
329
+ @torch.no_grad()
330
+ def log_prob_of_caption(model, img, tokens, temperature=1):
331
+ """Given a captioning model, return the log-probability of a caption given an image.
332
+ This version expects a batch of images, each associated with a single caption.
333
+ :param model: encoder/decoder speaker
334
+ :param img: Tensor B x channels x spatial-dims
335
+ :param tokens: Tensor B x max-n-tokens
336
+ :return log_probs: Tensor of size B x max-n-tokens holding the log-probs of each token of each caption
337
+ """
338
+
339
+ encoder = model.encoder
340
+ decoder = model.decoder
341
+
342
+ assert all(tokens[:, 0] == decoder.vocab.sos)
343
+
344
+ max_steps = tokens.shape[1]
345
+ encoder_out = encoder(img)
346
+ batch_size = encoder_out.size(0)
347
+ encoder_dim = encoder_out.size(-1)
348
+ encoder_out = encoder_out.view(batch_size, -1, encoder_dim)
349
+
350
+ # Create tensors to hold log-probs
351
+ log_probs = torch.zeros(batch_size, max_steps).to(tokens.device)
352
+ h, c = decoder.init_hidden_state(encoder_out)
353
+
354
+ for t in range(max_steps - 1):
355
+ h, c, pred_t, _ = decoder.attend_and_predict_next_word(encoder_out, h, c, tokens[:, t])
356
+
357
+ if temperature != 1:
358
+ pred_t /= temperature
359
+
360
+ pred_t = F.log_softmax(pred_t, dim=1)
361
+ log_probs[:, t] = pred_t[torch.arange(batch_size), tokens[:, t+1]] # prob. of guessing next token
362
+
363
+ lens = torch.where(tokens == decoder.vocab.eos)[1] # true tokens + 1 for <eos>
364
+ mask = torch.zeros_like(log_probs)
365
+ mask[torch.arange(mask.shape[0]), lens] = 1
366
+ mask = mask.cumsum(dim=1).to(torch.bool)
367
+ log_probs.masked_fill_(mask, 0) # set to zero all positions after the true size of the caption
368
+ return log_probs, lens
369
+
370
+
371
+ @torch.no_grad()
372
+ def sample_captions(model, loader, max_utterance_len, sampling_rule, device, temperature=1,
373
+ topk=None, drop_unk=True, drop_bigrams=False):
374
+ """
375
+ :param model:
376
+ :param loader:
377
+ :param max_utterance_len: maximum allowed length of captions
378
+ :param sampling_rule: (str) 'argmax' or 'multinomial', or 'topk'
379
+ :return:
380
+ attention_weights: (torch cpu Tensor) N-images x encoded_image_size (e.g., 7 x 7) x max_utterance_len
381
+ attention_weights[:,0] corresponds to the attention map over the <SOS> symbol
382
+ """
383
+ if sampling_rule not in ['argmax', 'multinomial', 'topk']:
384
+ raise ValueError('Unknown sampling rule.')
385
+
386
+ model.eval()
387
+ all_predictions = []
388
+ attention_weights = []
389
+ unk = model.decoder.vocab.unk
390
+
391
+ use_aux_data = model.decoder.uses_aux_data
392
+ aux_data = None
393
+
394
+ for batch in loader:
395
+ imgs = batch['image'].to(device)
396
+
397
+ if use_aux_data:
398
+ aux_data = batch['emotion'].to(device)
399
+
400
+ encoder_out = model.encoder(imgs)
401
+ enc_image_size = encoder_out.size(1)
402
+ batch_size = encoder_out.size(0)
403
+ encoder_dim = encoder_out.size(-1)
404
+
405
+ # Flatten image
406
+ encoder_out = encoder_out.view(batch_size, -1, encoder_dim) # (batch_size, num_pixels, encoder_dim)
407
+
408
+ # Create tensors to hold word predictions
409
+ max_steps = max_utterance_len + 1 # one extra step for EOS marker
410
+ predictions = torch.zeros(batch_size, max_steps).to(device)
411
+
412
+ # Initialize decoder state
413
+ decoder = model.decoder
414
+ h, c = decoder.init_hidden_state(encoder_out) # (batch_size, decoder_dim)
415
+
416
+ # Tensor to store previous words at each step; now they're just <sos>
417
+ prev_words = torch.LongTensor([decoder.vocab.sos] * batch_size).to(device)
418
+
419
+ for t in range(max_steps):
420
+ h, c, pred_t, alpha = decoder.attend_and_predict_next_word(encoder_out, h, c, prev_words, aux_data=aux_data)
421
+ if t > 0: # at t=1 it sees <sos> as the previous word
422
+ alpha = alpha.view(-1, enc_image_size, enc_image_size) # (bsize, enc_image_size, enc_image_size)
423
+ attention_weights.append(alpha.cpu())
424
+
425
+ pred_t /= temperature
426
+
427
+ if drop_unk:
428
+ pred_t[:, unk] = -math.inf
429
+
430
+ if t > 0:
431
+ pred_t[:, prev_words] = -math.inf # avoid repeating the same word twice
432
+
433
+ if t > 1:
434
+ pred_t[:, predictions[:,t-2].long()] = -math.inf # avoid repeating the prev-prev word
435
+
436
+ if drop_bigrams and t > 1:
437
+ prev_usage = predictions[:, :t-1] # earlier occurrences of the previous word (e.g., the first xx in "xx yy xx")
438
+ x, y = torch.where(prev_usage == torch.unsqueeze(prev_words, -1))
439
+ y += 1 # word-after-last-in-prev-usage (e.g., yy in above)
440
+ y = prev_usage[x, y].long()
441
+ pred_t[x, y] = -math.inf
442
+
443
+ if sampling_rule == 'argmax':
444
+ prev_words = torch.argmax(pred_t, 1)
445
+ elif sampling_rule == 'multinomial':
446
+ probability = torch.softmax(pred_t, 1)
447
+ prev_words = torch.multinomial(probability, 1).squeeze_(-1)
448
+ elif sampling_rule == 'topk':
449
+ row_idx = torch.arange(batch_size)
450
+ row_idx = row_idx.view([1, -1]).repeat(topk, 1).t()
451
+ # apply soft-max only over the top-k logits; all other words get zero probability
452
+ val, ind = pred_t.topk(topk, dim=1)
453
+ val = torch.softmax(val, 1)
454
+ probability = torch.zeros_like(pred_t) # only the top-k logits will have non-zero prob.
455
+ probability[row_idx, ind] = val
456
+ prev_words = torch.multinomial(probability, 1).squeeze_(-1)
457
+
458
+ predictions[:, t] = prev_words
459
+ all_predictions.append(predictions.cpu().long())
460
+ all_predictions = torch.cat(all_predictions)
461
+ attention_weights = torch.stack(attention_weights, 1)
462
+ return all_predictions, attention_weights
463
+
464
+
465
+ @torch.no_grad()
466
+ def sample_captions_beam_search(model, data_loader, beam_size, device, temperature=1, max_iter=500,
467
+ drop_unk=True, drop_bigrams=False):
468
+ """
469
+ :param model: nn.ModuleDict with 'encoder' and 'decoder' keys
470
+ :param data_loader:
471
+ :param beam_size:
472
+ :param drop_unk:
473
+ :return:
474
+
475
+ hypotheses_alphas: list carrying the attention maps over the encoded-pixel space for each produced token.
476
+ Note: batch size must be one.
477
+ """
478
+
479
+ if data_loader.batch_size != 1:
480
+ raise ValueError('not implemented for bigger batch-sizes')
481
+
482
+ model.eval()
483
+ decoder = model.decoder
484
+ vocab = model.decoder.vocab
485
+
486
+ captions = list()
487
+ hypotheses_alphas = list()
488
+ caption_log_prob = list()
489
+
490
+ aux_feat = None
491
+ for batch in tqdm.tqdm(data_loader): # For each image (batch-size = 1)
492
+ image = batch['image'].to(device) # (1, 3, H, W)
493
+
494
+ if model.decoder.uses_aux_data:
495
+ aux_data = batch['emotion'].to(device)
496
+ aux_feat = model.decoder.auxiliary_net(aux_data)
497
+
498
+ k = beam_size
499
+ encoder_out = model.encoder(image) # (1, enc_image_size, enc_image_size, encoder_dim)
500
+ enc_image_size = encoder_out.size(1)
501
+ encoder_dim = encoder_out.size(3)
502
+
503
+ # Flatten encoding
504
+ encoder_out = encoder_out.view(1, -1, encoder_dim) # (1, num_pixels, encoder_dim)
505
+ num_pixels = encoder_out.size(1)
506
+
507
+ # We'll treat the problem as having a batch size of k
508
+ encoder_out = encoder_out.expand(k, num_pixels, encoder_dim) # (k, num_pixels, encoder_dim)
509
+
510
+ # Tensor to store top k previous words at each step; now they're just <sos>
511
+ k_prev_words = torch.LongTensor([[vocab.sos]] * k).to(device) # (k, 1)
512
+
513
+ # Tensor to store top k sequences; now they're just <sos>
514
+ seqs = k_prev_words # (k, 1)
515
+
516
+ # Tensor to store top k sequences' scores; now they're just 0
517
+ top_k_scores = torch.zeros(k, 1).to(device) # (k, 1)
518
+
519
+ # Tensor to store top k sequences' alphas; now they're just 1s
520
+ seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(device) # (k, 1, enc_image_size, enc_image_size)
521
+
522
+ # Lists to store completed sequences and scores
523
+ complete_seqs = list()
524
+ complete_seqs_alpha = list()
525
+ complete_seqs_scores = list()
526
+
527
+ # Start decoding
528
+ step = 1
529
+ h, c = decoder.init_hidden_state(encoder_out)
530
+
531
+ # s (below) is a number less than or equal to k, because sequences are removed
532
+ # from this process once they hit <eos>
533
+ while True:
534
+ embeddings = decoder.word_embedding(k_prev_words).squeeze(1) # (s, embed_dim)
535
+ awe, alpha = decoder.attention(encoder_out, h) # (s, encoder_dim), (s, num_pixels)
536
+ alpha = alpha.view(-1, enc_image_size, enc_image_size) # (s, enc_image_size, enc_image_size)
537
+ gate = decoder.sigmoid(decoder.f_beta(h)) # gating scalar, (s, encoder_dim)
538
+ awe = gate * awe
539
+ decoder_input = torch.cat([embeddings, awe], dim=1)
540
+
541
+ if aux_feat is not None:
542
+ af = torch.repeat_interleave(aux_feat, decoder_input.shape[0], dim=0)
543
+ decoder_input = torch.cat([decoder_input, af], dim=1)
544
+
545
+ h, c = decoder.decode_step(decoder_input, (h, c)) # (s, decoder_dim)
546
+ scores = decoder.next_word(h) # (s, vocab_size)
547
+
548
+ if temperature != 1:
549
+ scores /= temperature
550
+
551
+ scores = F.log_softmax(scores, dim=1)
552
+
553
+ if drop_unk:
554
+ scores[:, vocab.unk] = -math.inf
555
+
556
+ if drop_bigrams and step > 2:
557
+ # drop bi-grams with frequency higher than 1.
558
+ prev_usage = seqs[:, :step-1]
559
+ x, y = torch.where(prev_usage == k_prev_words)
560
+ y += 1 # word-after-last-in-prev-usage
561
+ y = seqs[x, y]
562
+ scores[x,y] = -math.inf
563
+
564
+ if step > 2:
565
+ # drop repetitions of the form "x and x"
566
+ and_token = decoder.vocab('and')
567
+ x, y = torch.where(k_prev_words == and_token)
568
+ pre_and_word = seqs[x, step-2]
569
+ scores[x, pre_and_word] = -math.inf
570
+
571
+ # Add log-probabilities
572
+ scores = top_k_scores.expand_as(scores) + scores # (s, vocab_size)
573
+
574
+ # For the first step, all k points will have the same scores (since same k previous words, h, c)
575
+ if step == 1:
576
+ top_k_scores, top_k_words = scores[0].topk(k, 0, True, True) # (s)
577
+ else:
578
+ # Unroll and find top scores, and their unrolled indices
579
+ top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True) # (s)
580
+
581
+ # Convert unrolled indices to actual indices of scores
582
+ prev_word_inds = top_k_words // len(vocab) # (s) integer (floor) division to recover each beam index
583
+ next_word_inds = top_k_words % len(vocab) # (s)
584
+
585
+ # Add new words to sequences
586
+ seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1) # (s, step+1)
587
+ seqs_alpha = torch.cat([seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
588
+ dim=1) # (s, step+1, enc_image_size, enc_image_size)
589
+
590
+ # Which sequences are incomplete (didn't reach <eos>)?
591
+ incomplete_inds = [ind for ind, word in enumerate(next_word_inds) if word != vocab.eos]
592
+ complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))
593
+
594
+ # Set aside complete sequences
595
+ if len(complete_inds) > 0:
596
+ complete_seqs.extend(seqs[complete_inds].tolist())
597
+ complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
598
+ complete_seqs_scores.extend(top_k_scores[complete_inds].tolist())
599
+ k -= len(complete_inds) # reduce beam length accordingly
600
+
601
+ # Proceed with incomplete sequences
602
+ if k == 0:
603
+ break
604
+ seqs = seqs[incomplete_inds]
605
+ seqs_alpha = seqs_alpha[incomplete_inds]
606
+
607
+ h = h[prev_word_inds[incomplete_inds]]
608
+ c = c[prev_word_inds[incomplete_inds]]
609
+ encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
610
+ top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
611
+ k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)
612
+
613
+ # Break if things have been going on too long
614
+ if step > max_iter:
615
+ break
616
+ step += 1
617
+
618
+ s_idx = np.argsort(complete_seqs_scores)[::-1]
619
+ complete_seqs_scores = [complete_seqs_scores[i] for i in s_idx]
620
+ complete_seqs = [complete_seqs[i] for i in s_idx]
621
+ alphas = [complete_seqs_alpha[i] for i in s_idx]
622
+
623
+ captions.append(complete_seqs)
624
+ caption_log_prob.append(complete_seqs_scores)
625
+ hypotheses_alphas.append(alphas)
626
+ return captions, hypotheses_alphas, caption_log_prob
627
+
628
+
629
+ @torch.no_grad()
630
+ def properize_captions(captions, vocab, add_sos=True):
631
+ """
632
+ :param captions: torch Tensor holding M x max_len integers
633
+ :param vocab:
634
+ :param add_sos:
635
+ :return:
636
+ """
637
+ # ensure they end with eos.
638
+
639
+ new_captions = []
640
+ missed_eos = 0
641
+ for caption in captions.cpu():
642
+ ending = torch.where(caption == vocab.eos)[0]
643
+ if len(ending) >= 1: # at least one <eos> symbol is found
644
+ first_eos = ending[0]
645
+ if first_eos < len(caption):
646
+ caption[first_eos+1:] = vocab.pad
647
+ else:
648
+ missed_eos += 1
649
+ caption[-1] = vocab.eos
650
+ new_captions.append(caption)
651
+
652
+ new_captions = torch.stack(new_captions)
653
+
654
+ dummy = torch.unique(torch.where(new_captions == vocab.eos)[0])
655
+ assert len(dummy) == len(new_captions) # assert all have an eos.
656
+
657
+ if add_sos:
658
+ sos = torch.LongTensor([vocab.sos] * len(new_captions)).view(-1, 1)
659
+ new_captions = torch.cat([sos, new_captions], dim=1)
660
+ if missed_eos > 0:
661
+ warnings.warn('{} sentences without <eos> were generated.'.format(missed_eos))
662
+ return new_captions
663
+
664
+
665
+ def log_prob_of_dataset(model, data_loader, device, temperature=1):
666
+ all_log_probs = []
667
+ all_lens = []
668
+ model.eval()
669
+ for batch in data_loader:
670
+ imgs = batch['image'].to(device)
671
+ tokens = batch['tokens'].to(device)
672
+ log_probs, n_tokens = log_prob_of_caption(model, imgs, tokens, temperature=temperature)
673
+ all_log_probs.append(log_probs.cpu())
674
+ all_lens.append(n_tokens.cpu())
675
+
676
+ all_log_probs = torch.cat(all_log_probs, dim=0)
677
+ all_lens = torch.cat(all_lens, dim=0)
678
+ return all_log_probs, all_lens
679
+
680
+
681
+ def perplexity_of_dataset(model, data_loader, device):
682
+ """ For a test corpus, perplexity is 2 ^ {-l}, where l = log_2(prob_of_sentences) / M and M is the number
683
+ of tokens in the dataset.
684
+ :param model:
685
+ :param data_loader:
686
+ :param device:
687
+ :return:
688
+ """
689
+ all_log_probs, all_lens = log_prob_of_dataset(model, data_loader, device)
690
+ log_prob_per_sent = torch.sum(all_log_probs, 1).double() # sum over tokens to get the log_p of each utterance
691
+ prob_per_sent = torch.exp(log_prob_per_sent)
692
+ n_tokens = torch.sum(all_lens).double() # number of words in dataset
693
+ average_log_prob = torch.sum(torch.log2(prob_per_sent)) / n_tokens # log_2 for perplexity
694
+ perplexity = 2.0 ** (-average_log_prob)
695
+ return perplexity, prob_per_sent, all_lens
696
+
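For orientation, here is a minimal, hypothetical sketch of how the training, evaluation and sampling helpers above could be wired together; the model, data loaders and hyper-parameter values are illustrative assumptions, not part of this file:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# assumed: `model` is an nn.ModuleDict({'encoder': ..., 'decoder': ...}) and
# `train_loader`, `val_loader`, `test_loader` yield dicts with 'image'/'tokens' keys
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

for epoch in range(1, 11):
    train_loss = single_epoch_train(train_loader, model, criterion, optimizer, epoch, device,
                                    alpha_c=1.0, grad_clip=5.0, use_emotion=False)
    val_nll = negative_log_likelihood(model, val_loader, device)
    print('epoch {}: train-loss {:.3f} val-NLL {:.3f}'.format(epoch, train_loss, val_nll))

# greedy decoding of at most 30 tokens per test image
hypotheses, attn_maps = sample_captions(model, test_loader, max_utterance_len=30,
                                        sampling_rule='argmax', device=device)
hypotheses = properize_captions(hypotheses, model.decoder.vocab)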
imageprocessing/artemis/artemis/neural_models/distances.py ADDED
@@ -0,0 +1,67 @@
1
+ """
2
+ Utilities for distance measurements on the GPU.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 07/2019, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ from torch.nn.functional import normalize
11
+
12
+ def cdist(x1, x2, epsilon=1e-16):
13
+ """
14
+ :param x1: M x Feat-dim
15
+ :param x2: N x Feat-dim
16
+ :param epsilon:
17
+ :return: M x N matrix of pairwise Euclidean distances
18
+ """
19
+ x1_norm = x1.pow(2).sum(dim=-1, keepdim=True)
20
+ x2_norm = x2.pow(2).sum(dim=-1, keepdim=True)
21
+ inner_prod = torch.mm(x1, x2.t())
22
+ res = x1_norm - 2.0 * inner_prod + x2_norm.t() # You need to transpose for broadcasting to be correct.
23
+ res.clamp_min_(epsilon).sqrt_()
24
+ return res
25
+
26
+
27
+ def exclude_identity_from_neighbor_search(all_pairwise_dists, identities):
28
+ """
29
+ :param all_pairwise_dists: M x N matrix of distances
30
+ :param identities: for the k-th row of all_pairwise_dists, the entry at column identities[k] will be excluded.
31
+ :return:
32
+ """
33
+ all_pairwise_dists[range(all_pairwise_dists.size(0)), identities] = float("Inf")
34
+ return all_pairwise_dists
35
+
36
+
37
+ def k_euclidean_neighbors(k, x1, x2, exclude_identity=False, identities=None):
38
+ """ For each row vector in x1, find the k-nearest neighbors among the rows of x2.
39
+ :param k:
40
+ :param x1: M x Feat-dim
41
+ :param x2: N x Feat-dim
42
+ :param exclude_identity:
43
+ :param identities:
44
+ :return: M x k
45
+ """
46
+ all_cross_pairwise_dists = cdist(x1, x2)
47
+ if exclude_identity:
48
+ all_cross_pairwise_dists = exclude_identity_from_neighbor_search(all_cross_pairwise_dists, identities)
49
+ n_dists, n_ids = all_cross_pairwise_dists.topk(k=k, dim=1, largest=False, sorted=True)
50
+ return n_dists, n_ids
51
+
52
+
53
+ def k_cosine_neighbors(k, x1, x2, exclude_identity=False, identities=None):
54
+ """ For each row vector in x1, find the k-nearest neighbors among the rows of x2.
55
+ :param k:
56
+ :param x1: M x Feat-dim
57
+ :param x2: N x Feat-dim
58
+ :param exclude_identity:
59
+ :param identities:
60
+ :return: M x k
61
+ """
62
+ all_cross_pairwise_dists = torch.mm(normalize(x1, dim=1, p=2), normalize(x2, dim=1, p=2).t())
63
+ all_cross_pairwise_dists = 1.0 - all_cross_pairwise_dists
64
+ if exclude_identity:
65
+ all_cross_pairwise_dists = exclude_identity_from_neighbor_search(all_cross_pairwise_dists, identities)
66
+ n_dists, n_ids = all_cross_pairwise_dists.topk(k=k, dim=1, largest=False, sorted=True)
67
+ return n_dists, n_ids
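For illustration, a small usage sketch of the neighbor utilities above; the feature tensors are random stand-ins:

import torch

feats_a = torch.randn(100, 64)   # M x Feat-dim
feats_b = torch.randn(250, 64)   # N x Feat-dim

# 5 nearest neighbors of every row of feats_a among the rows of feats_b
d_euc, idx_euc = k_euclidean_neighbors(5, feats_a, feats_b)
d_cos, idx_cos = k_cosine_neighbors(5, feats_a, feats_b)

# when searching a set against itself, exclude each row's own entry
identities = torch.arange(len(feats_a))
d_self, idx_self = k_euclidean_neighbors(5, feats_a, feats_a,
                                         exclude_identity=True, identities=identities)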
imageprocessing/artemis/artemis/neural_models/image_emotion_clf.py ADDED
@@ -0,0 +1,75 @@
1
+ """
2
+ Given an image guess a distribution over the emotion labels.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch import nn
12
+ from tqdm.notebook import tqdm as tqdm_notebook
13
+
14
+ from ..utils.stats import AverageMeter
15
+
16
+
17
+ class ImageEmotionClassifier(nn.Module):
18
+ def __init__(self, img_encoder, clf_head):
19
+ super(ImageEmotionClassifier, self).__init__()
20
+ self.img_encoder = img_encoder
21
+ self.clf_head = clf_head
22
+
23
+ def __call__(self, img):
24
+ feat = self.img_encoder(img)
25
+ logits = self.clf_head(feat)
26
+ return logits
27
+
28
+
29
+ def single_epoch_train(model, data_loader, criterion, optimizer, device):
30
+ epoch_loss = AverageMeter()
31
+ model.train()
32
+ for batch in tqdm_notebook(data_loader):
33
+ img = batch['image'].to(device)
34
+ labels = batch['label'].to(device) # emotion_distribution
35
+ logits = model(img)
36
+
37
+ # Calculate loss
38
+ loss = criterion(logits, labels)
39
+
40
+ # Back prop.
41
+ optimizer.zero_grad()
42
+ loss.backward()
43
+ optimizer.step()
44
+
45
+ b_size = len(labels)
46
+ epoch_loss.update(loss.item(), b_size)
47
+ return epoch_loss.avg
48
+
49
+
50
+ @torch.no_grad()
51
+ def evaluate_on_dataset(model, data_loader, criterion, device, detailed=True, kl_div=True):
52
+ epoch_loss = AverageMeter()
53
+ model.eval()
54
+ epoch_confidence = []
55
+ for batch in tqdm_notebook(data_loader):
56
+ img = batch['image'].to(device)
57
+ labels = batch['label'].to(device) # emotion_distribution
58
+ logits = model(img)
59
+
60
+ # Calculate loss
61
+ loss = criterion(logits, labels)
62
+
63
+ if detailed:
64
+ if kl_div:
65
+ epoch_confidence.append(torch.exp(logits).cpu()) # logits are log-soft-max
66
+ else:
67
+ epoch_confidence.append(F.softmax(logits, dim=-1).cpu()) # logits are pure logits
68
+
69
+ b_size = len(labels)
70
+ epoch_loss.update(loss.item(), b_size)
71
+
72
+ if detailed:
73
+ epoch_confidence = torch.cat(epoch_confidence).numpy()
74
+
75
+ return epoch_loss.avg, epoch_confidence
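A hedged sketch of assembling and training the classifier above from other modules of this package (ResnetEncoder, MLP); the 9-way emotion histogram target, the data loaders and the hyper-parameters are assumptions:

import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
img_encoder = ResnetEncoder('resnet34', adapt_image_size=1)              # pooled (B, 512) feature
clf_head = MLP(img_encoder.embedding_dimension(), [100, 9],
               closure=nn.LogSoftmax(dim=-1))                            # log-probs, matching kl_div=True
model = ImageEmotionClassifier(img_encoder, clf_head).to(device)

criterion = nn.KLDivLoss(reduction='batchmean').to(device)               # batch['label'] holds a distribution
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

train_loss = single_epoch_train(model, train_loader, criterion, optimizer, device)
val_loss, val_confidence = evaluate_on_dataset(model, val_loader, criterion, device, kl_div=True)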
imageprocessing/artemis/artemis/neural_models/lstm_encoder.py ADDED
@@ -0,0 +1,95 @@
1
+ """
2
+ Encoding discrete tokens with LSTMs.
3
+
4
+ The MIT License (MIT)
5
+ Originally created at 2019, (updated on January 2020) for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ from torch import nn
11
+ from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
12
+
13
+
14
+ class LSTMEncoder(nn.Module):
15
+ """A feed-forward network that processes discrete tokens via an LSTM."""
16
+
17
+ def __init__(self, n_input, n_hidden, word_embedding, word_transformation=None,
18
+ bidirectional=False, init_h=None, init_c=None, eos_symbol=None, feature_type='last'):
19
+ """
20
+ :param n_input: (int) input dim of LSTM
21
+ :param n_hidden: (int) hidden dim of LSTM
22
+ :param word_embedding: (nn.Embedding) vectors representing words
23
+ :param word_transformation: (opt, nn.Module) to apply some transformation on the word
24
+ embeddings before they are consumed by the LSTM.
25
+ :param bidirectional: boolean, whether to use a bi-RNN
26
+ :param init_h: (opt, nn.Module) for initializing LSTM hidden state
27
+ :param init_c: (opt, nn.Module) for initializing LSTM memory
28
+ :param eos_symbol: (opt, int) integer marking end of sentence
29
+ :param feature_type: (opt, string) how to process the output of the LSTM,
30
+ valid options = ['last', 'max', 'mean', 'all']
31
+ """
32
+
33
+ super().__init__()
34
+ self.word_embedding = word_embedding
35
+ self.n_hidden = n_hidden
36
+ self.eos = eos_symbol
37
+ self.feature_type = feature_type
38
+
39
+ # auxiliary (optional) networks
40
+ self.word_transformation = word_transformation
41
+ self.init_h = init_h
42
+ self.init_c = init_c
43
+
44
+ self.rnn = nn.LSTM(input_size=n_input, hidden_size=n_hidden,
45
+ bidirectional=bidirectional, batch_first=True)
46
+
47
+ def out_dim(self):
48
+ rnn = self.rnn
49
+ mult = 2 if rnn.bidirectional else 1
50
+ return rnn.num_layers * rnn.hidden_size * mult
51
+
52
+ def __call__(self, tokens, grounding=None, len_of_sequence=None):
53
+ """
54
+ :param tokens:
55
+ :param grounding: (Tensor, opt)
56
+ :param len_of_sequence: (Tensor, opt) tensor of shape (B,) carrying the length of each token sequence
57
+ :return: the tokens encoded by the LSTM
58
+ Note: tokens are expected to begin with the <sos> symbol, which is skipped before encoding.
59
+ """
60
+ w_emb = self.word_embedding(tokens[:, 1:]) # skip <sos>
61
+ if self.word_transformation is not None:
62
+ w_emb = self.word_transformation(w_emb)
63
+
64
+ device = w_emb.device
65
+
66
+ if len_of_sequence is None:
67
+ len_of_sequence = torch.where(tokens == self.eos)[1] - 1 # ignore <sos>
68
+
69
+ x_packed = pack_padded_sequence(w_emb, len_of_sequence, enforce_sorted=False, batch_first=True)
70
+
71
+ self.rnn.flatten_parameters()
72
+
73
+ if grounding is not None:
74
+ h0 = self.init_h(grounding).unsqueeze(0) # rep-mat if multiple LSTM cells.
75
+ c0 = self.init_c(grounding).unsqueeze(0)
76
+ rnn_out, _ = self.rnn(x_packed, (h0, c0))
77
+ else:
78
+ rnn_out, _ = self.rnn(x_packed)
79
+
80
+ rnn_out, dummy = pad_packed_sequence(rnn_out, batch_first=True)
81
+
82
+ if self.feature_type == 'last':
83
+ batch_size = len(tokens)
84
+ lang_feat = rnn_out[torch.arange(batch_size), len_of_sequence-1]
85
+ elif self.feature_type == 'max':
86
+ lang_feat = rnn_out.max(1).values
87
+ elif self.feature_type == 'mean':
88
+ lang_feat = rnn_out.sum(1)
89
+ lang_feat /= len_of_sequence.view(-1, 1) # broadcasting
90
+ elif self.feature_type == 'all':
91
+ lang_feat = rnn_out
92
+ else:
93
+ raise ValueError('Unknown LSTM feature requested.')
94
+
95
+ return lang_feat
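A brief sketch of the encoder above on a toy batch; the vocabulary indices (<pad>=0, <sos>=1, <eos>=2) are placeholders:

import torch
from torch import nn

pad, sos, eos = 0, 1, 2
word_emb = nn.Embedding(1000, 100, padding_idx=pad)
encoder = LSTMEncoder(n_input=100, n_hidden=256, word_embedding=word_emb,
                      eos_symbol=eos, feature_type='last')

tokens = torch.tensor([[sos, 10, 11, 12, eos, pad],
                       [sos, 20, 21, eos, pad, pad]])
feat = encoder(tokens)   # (2, 256): hidden state at each sequence's last real token
print(feat.shape)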
imageprocessing/artemis/artemis/neural_models/mlp.py ADDED
@@ -0,0 +1,78 @@
1
+ """
2
+ Multi-Layer Perceptron packaged nicely for convenience.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in late 2019, for Python 3.x. Last updated in 2021.
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ from torch import nn
10
+ import numpy as np
11
+
12
+ def optional_repeat(value, times):
13
+ """ helper function, to repeat a parameter's value many times
14
+ :param value: a single basic python type (int, float, boolean, string), or a list with length equal to times
15
+ :param times: int, how many times to repeat
16
+ :return: a list with length equal to times
17
+ """
18
+ if type(value) is not list:
19
+ value = [value]
20
+
21
+ if len(value) != 1 and len(value) != times:
22
+ raise ValueError('The value should be a singleton, or a list of length equal to times.')
23
+
24
+ if len(value) == times:
25
+ return value # do nothing
26
+
27
+ return np.array(value).repeat(times).tolist()
28
+
29
+
30
+ class MLP(nn.Module):
31
+ """ Multi-layer perceptron. That is, a k-layer deep network where each layer is a fully-connected layer, with
32
+ (optionally) batch-norm, a non-linearity and dropout. The last layer (output) is always a 'pure' linear function.
33
+ """
34
+ def __init__(self, in_feat_dims, out_channels, b_norm=True, dropout_rate=0,
35
+ non_linearity=nn.ReLU(inplace=True), closure=None):
36
+ """Constructor
37
+ :param in_feat_dims: input feature dimensions
38
+ :param out_channels: list of ints, the number of neurons of each hidden/final layer.
39
+ :param b_norm: True/False, or list of booleans
40
+ :param dropout_rate: int, or list of int values
41
+ :param non_linearity: nn.Module
42
+ :param closure: optional nn.Module to use at the end of the MLP
43
+ """
44
+ super(MLP, self).__init__()
45
+ self.hidden_dimensions = out_channels[:-1]
46
+ self.embedding_dimension = out_channels[-1]
47
+
48
+ n_layers = len(out_channels)
49
+ dropout_rate = optional_repeat(dropout_rate, n_layers-1)
50
+ b_norm = optional_repeat(b_norm, n_layers-1)
51
+
52
+ previous_feat_dim = in_feat_dims
53
+ all_ops = []
54
+
55
+ for depth in range(len(out_channels)):
56
+ out_dim = out_channels[depth]
57
+ affine_op = nn.Linear(previous_feat_dim, out_dim, bias=True)
58
+ all_ops.append(affine_op)
59
+
60
+ if depth < len(out_channels) - 1:
61
+ if b_norm[depth]:
62
+ all_ops.append(nn.BatchNorm1d(out_dim))
63
+
64
+ if non_linearity is not None:
65
+ all_ops.append(non_linearity)
66
+
67
+ if dropout_rate[depth] > 0:
68
+ all_ops.append(nn.Dropout(p=dropout_rate[depth]))
69
+
70
+ previous_feat_dim = out_dim
71
+
72
+ if closure is not None:
73
+ all_ops.append(closure)
74
+
75
+ self.net = nn.Sequential(*all_ops)
76
+
77
+ def __call__(self, x):
78
+ return self.net(x)
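A small sketch of the wrapper above; the dimensions are arbitrary examples:

import torch
from torch import nn

# two hidden layers (256, 128) and a 9-way output, with a log-softmax closure
net = MLP(in_feat_dims=512, out_channels=[256, 128, 9],
          b_norm=True, dropout_rate=[0.2, 0.0], closure=nn.LogSoftmax(dim=-1))

x = torch.randn(4, 512)
print(net(x).shape)      # torch.Size([4, 9])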
imageprocessing/artemis/artemis/neural_models/resnet_encoder.py ADDED
@@ -0,0 +1,103 @@
1
+ """
2
+ Resnet Wrapper.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in late 2019, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+
10
+ import torch
11
+ from torch import nn
12
+ from torchvision import models
13
+
14
+
15
+ class ResnetEncoder(nn.Module):
16
+ """Convenience wrapper around resnet models"""
17
+ def __init__(self, backbone, adapt_image_size=None, drop=2, pretrained=True, verbose=False):
18
+ """
19
+ :param backbone: (string) resnet-S, S in [18, 34, 50, 101]
20
+ :param adapt_image_size: (opt, int) if given forward feature has
21
+ [B, adapt_image_size, adapt_image_size, feat-dim]
22
+ :param drop: how many of the last layers/blocks to drop.
23
+ :param pretrained: (Boolean)
24
+ :param verbose: (opt, Boolean) if true print actions taken.
25
+ Note: in total there are 10 layers/blocks. The last two are an adaptive_pooling and an FC, the
26
+ previous layers give rise to convolutional maps of increasing spatial size.
27
+ """
28
+
29
+ if drop == 0 and adapt_image_size is not None:
30
+ raise ValueError('Trying to apply adaptive pooling while keeping the entire model (drop=0).')
31
+
32
+ super(ResnetEncoder, self).__init__()
33
+ backbones = {
34
+ 'resnet18': models.resnet18,
35
+ 'resnet34': models.resnet34,
36
+ 'resnet50': models.resnet50,
37
+ 'resnet101': models.resnet101,
38
+ }
39
+
40
+ self.name = backbone
41
+ self.drop = drop
42
+ self.resnet = backbones[self.name](pretrained=pretrained)
43
+
44
+ # Remove linear and last adaptive pool layer
45
+ if drop > 0:
46
+ modules = list(self.resnet.children())
47
+ if verbose:
48
+ print('Removing the last {} layers of a {}'.format(drop, self.name))
49
+ print(modules[-drop:])
50
+ modules = modules[:-drop]
51
+ self.resnet = nn.Sequential(*modules)
52
+
53
+ self.adaptive_pool = None
54
+ if adapt_image_size is not None:
55
+ self.adaptive_pool = nn.AdaptiveAvgPool2d((adapt_image_size, adapt_image_size))
56
+
57
+ if pretrained:
58
+ for p in self.resnet.parameters():
59
+ p.requires_grad = False
60
+
61
+ def __call__(self, images):
62
+ """Forward prop.
63
+ :param images: images, a tensor of dimensions (batch_size, 3, image_size, image_size)
64
+ :return: encoded images
65
+ """
66
+ out = self.resnet(images) # (B, F, ceil(image_size/32), ceil(image_size/32))
67
+
68
+ if self.adaptive_pool is not None:
69
+ out = self.adaptive_pool(out) # (B, F, adapt_image_size, adapt_image_size)
70
+
71
+ if self.drop > 0: # convolutional-like output
72
+ out = out.permute(0, 2, 3, 1) # bring feature-dim last.
73
+ out = torch.squeeze(torch.squeeze(out, 1), 1) # In case adapt_image_size == 1, remove dimensions
74
+ return out
75
+
76
+ def unfreeze(self, level=5, verbose=False):
77
+ """Allow or prevent the computation of gradients for blocks after level.
78
+ The smaller the level, the less pretrained the resnet will be.
79
+ """
80
+ all_layers = list(self.resnet.children())
81
+
82
+ if verbose:
83
+ ll = len(all_layers)
84
+ print('From {} layers, you are unfreezing the last {}'.format(ll, ll-level))
85
+
86
+ for c in all_layers[level:]:
87
+ for p in c.parameters():
88
+ p.requires_grad = True
89
+ return self
90
+
91
+ def embedding_dimension(self):
92
+ """The feature (channel) dimension of the last layer"""
93
+ if self.drop == 0:
94
+ return 1000 #Imagenet Classes
95
+
96
+ if self.drop == 2:
97
+ return 512 if int(self.name.replace('resnet', '')) < 50 else 2048
98
+
99
+ if self.drop == 3:
100
+ return 256 if int(self.name.replace('resnet', '')) < 50 else 1024
101
+
102
+ raise NotImplementedError
103
+
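A minimal sketch of the wrapper above; the input batch is a random stand-in:

import torch

encoder = ResnetEncoder('resnet34', adapt_image_size=7, drop=2, pretrained=True)
images = torch.randn(2, 3, 224, 224)
features = encoder(images)                   # (2, 7, 7, 512): spatial grid, channels last
print(features.shape, encoder.embedding_dimension())

encoder.unfreeze(level=5, verbose=True)      # allow fine-tuning of the deeper residual blocks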
imageprocessing/artemis/artemis/neural_models/show_attend_tell.py ADDED
@@ -0,0 +1,45 @@
1
+ """
2
+ A custom implementation of Show-Attend-&-Tell for ArtEmis: Affective Language for Visual Art
3
+
4
+ The MIT License (MIT)
5
+ Originally created in early 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ from torch import nn
10
+ from .resnet_encoder import ResnetEncoder
11
+ from .attentive_decoder import AttentiveDecoder
12
+
13
+
14
+ def describe_model(vocab, args):
15
+ """ Describe the architecture of a SAT speaker with a resnet encoder.
16
+ :param vocab:
17
+ :param args:
18
+ :return:
19
+ """
20
+ word_embedding = nn.Embedding(len(vocab), args.word_embedding_dim, padding_idx=vocab.pad)
21
+
22
+ encoder = ResnetEncoder(args.vis_encoder, adapt_image_size=args.atn_spatial_img_size).unfreeze()
23
+ encoder_out_dim = encoder.embedding_dimension()
24
+
25
+ emo_ground_dim = 0
26
+ emo_projection_net = None
27
+ if args.use_emo_grounding:
28
+ emo_in_dim = args.emo_grounding_dims[0]
29
+ emo_ground_dim = args.emo_grounding_dims[1]
30
+ # obviously one could use more complex nets here instead of using a "linear" layer.
31
+ # in my estimate, this is not going to be useful:)
32
+ emo_projection_net = nn.Sequential(*[nn.Linear(emo_in_dim, emo_ground_dim), nn.ReLU()])
33
+
34
+ decoder = AttentiveDecoder(word_embedding,
35
+ args.rnn_hidden_dim,
36
+ encoder_out_dim,
37
+ args.attention_dim,
38
+ vocab,
39
+ dropout_rate=args.dropout_rate,
40
+ teacher_forcing_ratio=args.teacher_forcing_ratio,
41
+ auxiliary_net=emo_projection_net,
42
+ auxiliary_dim=emo_ground_dim)
43
+
44
+ model = nn.ModuleDict({'encoder': encoder, 'decoder': decoder})
45
+ return model
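A hedged sketch of instantiating the speaker above; the hyper-parameter values are illustrative assumptions and `vocab` is assumed to be the package's Vocabulary object:

from argparse import Namespace

args = Namespace(word_embedding_dim=128, vis_encoder='resnet34', atn_spatial_img_size=7,
                 rnn_hidden_dim=512, attention_dim=512, dropout_rate=0.2,
                 teacher_forcing_ratio=1.0, use_emo_grounding=True,
                 emo_grounding_dims=[9, 10])   # assumed: 9-d emotion input projected to a 10-d grounding

model = describe_model(vocab, args)
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('trainable parameters:', n_trainable)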
imageprocessing/artemis/artemis/neural_models/text_emotional_clf.py ADDED
@@ -0,0 +1,94 @@
1
+ """
2
+ Given an utterance (and optionally an image) guess a distribution over the emotion labels.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in 2020, for Python 3.x
6
+ Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch import nn
12
+ from tqdm.notebook import tqdm as tqdm_notebook
13
+
14
+ from ..utils.stats import AverageMeter
15
+
16
+
17
+ class TextEmotionClassifier(nn.Module):
18
+ def __init__(self, text_encoder, clf_head, img_encoder=None):
19
+ super(TextEmotionClassifier, self).__init__()
20
+ self.text_encoder = text_encoder
21
+ self.clf_head = clf_head
22
+ self.img_encoder = img_encoder
23
+
24
+ def __call__(self, text, img=None):
25
+ if img is not None:
26
+ img_feat = self.img_encoder(img)
27
+ feat = self.text_encoder(text, img_feat)
28
+ else:
29
+ feat = self.text_encoder(text)
30
+
31
+ logits = self.clf_head(feat)
32
+ return logits
33
+
34
+
35
+ def single_epoch_train(model, data_loader, use_vision, criterion, optimizer, device):
36
+ epoch_loss = AverageMeter()
37
+ epoch_acc = AverageMeter()
38
+ model.train()
39
+ for batch in tqdm_notebook(data_loader):
40
+ labels = batch['emotion'].to(device)
41
+ tokens = batch['tokens'].to(device)
42
+
43
+ if use_vision:
44
+ img = batch['image'].to(device)
45
+ logits = model(tokens, img)
46
+ else:
47
+ logits = model(tokens)
48
+
49
+ # Calculate loss
50
+ loss = criterion(logits, labels)
51
+ acc = torch.mean((logits.argmax(1) == labels).double())
52
+
53
+ # Back prop.
54
+ optimizer.zero_grad()
55
+ loss.backward()
56
+ optimizer.step()
57
+
58
+ b_size = len(labels)
59
+ epoch_loss.update(loss.item(), b_size)
60
+ epoch_acc.update(acc.item(), b_size)
61
+ return epoch_loss.avg, epoch_acc.avg
62
+
63
+
64
+ @torch.no_grad()
65
+ def evaluate_on_dataset(model, data_loader, use_vision, criterion, device, detailed=True):
66
+ epoch_loss = AverageMeter()
67
+ epoch_acc = AverageMeter()
68
+ model.eval()
69
+ epoch_confidence = []
70
+ for batch in tqdm_notebook(data_loader):
71
+ labels = batch['emotion'].to(device)
72
+ tokens = batch['tokens'].to(device)
73
+ if use_vision:
74
+ img = batch['image'].to(device)
75
+ logits = model(tokens, img)
76
+ else:
77
+ logits = model(tokens)
78
+
79
+ # Calculate loss
80
+ loss = criterion(logits, labels)
81
+ guessed_correct = logits.argmax(1) == labels
82
+ acc = torch.mean(guessed_correct.double())
83
+
84
+ if detailed:
85
+ epoch_confidence.append(F.softmax(logits, dim=-1).cpu())
86
+
87
+ b_size = len(labels)
88
+ epoch_loss.update(loss.item(), b_size)
89
+ epoch_acc.update(acc.item(), b_size)
90
+
91
+ if detailed:
92
+ epoch_confidence = torch.cat(epoch_confidence).numpy()
93
+
94
+ return epoch_loss.avg, epoch_acc.avg, epoch_confidence
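A short sketch of wiring the classifier above to the LSTMEncoder and MLP of this package; `word_emb`, `vocab`, the data loaders and the 9 emotion classes are assumptions:

import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
text_encoder = LSTMEncoder(n_input=100, n_hidden=256, word_embedding=word_emb,
                           eos_symbol=vocab.eos, feature_type='last')
clf_head = MLP(text_encoder.out_dim(), [100, 9])
model = TextEmotionClassifier(text_encoder, clf_head).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

loss, acc = single_epoch_train(model, train_loader, use_vision=False,
                               criterion=criterion, optimizer=optimizer, device=device)
val_loss, val_acc, val_conf = evaluate_on_dataset(model, val_loader, use_vision=False,
                                                  criterion=criterion, device=device)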
imageprocessing/artemis/artemis/neural_models/word_embeddings.py ADDED
@@ -0,0 +1,123 @@
1
+ """
2
+ Utilities to load pretrained word embeddings like those of GloVe.
3
+
4
+ The MIT License (MIT)
5
+ Originally created in late 2019, for Python 3.x - last updated in 2021.
6
+ Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
7
+ """
8
+
9
+ import warnings
10
+ import torch
11
+ import numpy as np
12
+ from collections import Counter
13
+
14
+
15
+ def make_pretrained_embedding(vocab, pretrained_vectors, freeze=True, sigma=1, random_seed=None):
16
+ """ Make a torch.nn.Embedding based for a given vocabulary and a collection of
17
+ pretrained word-embedding vectors.
18
+ :param vocab: speakers_listeners.build_vocab.Vocabulary
19
+ :param pretrained_vectors: dictionary of words mapped to np.array vectors
20
+ (like those returned from ```load_glove_pretrained_embedding```).
21
+ :param freeze, (opt, boolean) if True the embedding is not using gradients to optimize itself (fine-tune).
22
+ :param sigma, (opt, int) standard-deviation of Gaussian used to sample when a word is not in the pretrained_vectors
23
+ :param random_seed (opt, int) to seed the numpy Gaussian
24
+ :return: torch.nn.Embedding
25
+
26
+ Note: this implementation will freeze all words if freeze=True, irrespective of whether the words are in the
27
+ pretrained_vectors collection or not (OOV: Out-of-Vocabulary). If you want to fine-tune the OOV you need to adapt
28
+ like this: https://discuss.pytorch.org/t/updating-part-of-an-embedding-matrix-only-for-out-of-vocab-words/33297
29
+ """
30
+ for ss in vocab.special_symbols:
31
+ if ss in pretrained_vectors:
32
+ warnings.warn('the special symbol {} is found in the pretrained embedding.'.format(ss))
33
+
34
+ # Initialize weight matrix with correct dimensions and all zeros
35
+ random_key = next(iter(pretrained_vectors))
36
+ emb_dim = len(pretrained_vectors[random_key])
37
+ emb_dtype = pretrained_vectors[random_key].dtype
38
+ n_words = len(vocab)
39
+ weights = np.zeros((n_words, emb_dim), dtype=emb_dtype)
40
+
41
+ if random_seed is not None:
42
+ np.random.seed(random_seed)
43
+
44
+ for word, idx in vocab.word2idx.items():
45
+ if word in pretrained_vectors:
46
+ weights[idx] = pretrained_vectors[word]
47
+ else:
48
+ weights[idx] = sigma * np.random.randn(emb_dim)
49
+
50
+ padding_idx = None
51
+ if hasattr(vocab, 'pad'):
52
+ print('using padding symbol of provided vocabulary.')
53
+ padding_idx = vocab.pad
54
+ weights[padding_idx] = np.zeros(emb_dim)
55
+
56
+ embedding = torch.nn.Embedding.from_pretrained(torch.from_numpy(weights), freeze=freeze, padding_idx=padding_idx)
57
+ return embedding
58
+
59
+
60
+ def load_glove_pretrained_embedding(glove_file, dtype=np.float32, only_words=False, verbose=False):
61
+ """
62
+ :param glove_file: file downloaded from Glove website
63
+ :param dtype: how to save the word-embeddings
64
+ :param only_words: do not return the embedding vectors, only the words considered
65
+ :param verbose: print, or not side-information
66
+ :return: dictionary of words mapped to np.array vectors
67
+ """
68
+
69
+ if verbose:
70
+ print("Loading glove word embeddings.")
71
+
72
+ embedding = dict()
73
+ with open(glove_file) as f_in:
74
+ for line in f_in:
75
+ s_line = line.split()
76
+ token = s_line[0]
77
+ if only_words:
78
+ embedding[token] = 0
79
+ else:
80
+ w_embedding = np.array([float(val) for val in s_line[1:]], dtype=dtype)
81
+ embedding[token] = w_embedding
82
+ if only_words:
83
+ embedding = set(list(embedding.keys()))
84
+
85
+ if verbose:
86
+ print("Done.", len(embedding), "words loaded.")
87
+ return embedding
88
+
89
+
90
+ def init_token_bias(encoded_token_list, vocab=None, dtype=np.float32, trainable=True):
91
+ """ Make a bias vector based on the (log) probability of the frequency of each word
92
+ in the training data similar to https://arxiv.org/abs/1412.2306
93
+ This bias can be used to initialize the hidden-to-next-word layer for faster convergence.
94
+ :param encoded_token_list: [[tokens-of-utterance-1-as-ints] [tokens-of-utterance-2]...]
95
+ :param vocab: speakers_listeners.build_vocab.Vocabulary
96
+ :param dtype:
97
+ :param trainable: (opt, bool) permit training or not of the resulting bias vector
98
+ :return: (torch.Parameter) bias vector
99
+ """
100
+ counter = Counter()
101
+ for tokens in encoded_token_list:
102
+ counter.update(tokens)
103
+
104
+ n_items = len(counter)
105
+ if vocab is not None:
106
+ if n_items != len(vocab):
107
+ warnings.warn('init_token_bias: Vobab contains more tokens than given token lists.')
108
+ n_items = max(n_items, len(vocab))
109
+ counter[vocab.sos] = counter[vocab.pad] = min(counter.values())
110
+
111
+ bias_vector = np.ones(n_items, dtype=dtype) # initialize
112
+
113
+ for position, frequency in counter.items():
114
+ bias_vector[position] = frequency
115
+
116
+ # Log probability
117
+ bias_vector /= np.sum(bias_vector)
118
+ bias_vector = np.log(bias_vector)
119
+ bias_vector -= np.max(bias_vector)
120
+
121
+ bias_vector = torch.from_numpy(bias_vector)
122
+ bias_vector = torch.nn.Parameter(bias_vector, requires_grad=trainable)
123
+ return bias_vector