import gradio as gr
import torch
import os
from PIL import Image
import pandas as pd
from transformers import CLIPProcessor, CLIPModel

checkpoint = "vincentclaes/emoji-predictor"

# x_, _, files = next(os.walk("./emojis"))
# no_of_emojis = range(len(files))
# emojis_as_images = [Image.open(f"emojis/{i}.png") for i in no_of_emojis]

# Candidate tags: one adjective per line in adjectives.txt.
adjectives = pd.read_table("./adjectives.txt", header=None)[0].to_list()

# Number of tag suggestions to return.
K = 4

processor = CLIPProcessor.from_pretrained(checkpoint)
model = CLIPModel.from_pretrained(checkpoint)


def concat_images(*images):
    """Generate a 2x2 composite of all supplied images.

    https://stackoverflow.com/a/71315656/1771155
    """
    # Get the widest width.
    width = max(image.width for image in images)
    # Get the tallest height.
    height = max(image.height for image in images)
    # The composite is a 2x2 grid, so twice the max width and height.
    composite = Image.new("RGB", (2 * width, 2 * height))
    assert K == 4, "We expect 4 suggestions, other numbers won't work."
    for i, image in enumerate(images):
        if i == 0:
            composite.paste(image, (0, 0))
        elif i == 1:
            composite.paste(image, (width, 0))
        elif i == 2:
            composite.paste(image, (0, height))
        elif i == 3:
            composite.paste(image, (width, height))
    return composite


def get_tag(emoji, tags, model=model, processor=processor, K=4):
    # Use the user-provided tags if any, otherwise fall back to the adjective list.
    if tags:
        tags = [tag.strip() for tag in tags.strip().split(",")]
    else:
        tags = adjectives
    inputs = processor(
        text=tags, images=emoji, return_tensors="pt", padding=True, truncation=True
    )
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the softmax over the tags to get the label probabilities.
    probs = outputs.logits_per_text.softmax(dim=0)
    probs_formatted = torch.tensor([prob[0] for prob in probs])
    # Never ask for more suggestions than there are tags.
    values, indices = probs_formatted.topk(min(K, len(tags)))
    return "Tag (confidence): " + ", ".join(
        f"{tags[i]} ({round(v.item(), 2)})" for v, i in zip(values, indices)
    )


title = "Tagging an Emoji"
description = """You provide an Emoji and our few-shot fine-tuned CLIP model will suggest some appropriate tags.\n
By default we use the [228 most common adjectives in English](https://grammar.yourdictionary.com/parts-of-speech/adjectives/list-of-adjective-words.html).\n
You can also specify your own custom tags, for example: love,hate,fun,bitterness.
"""

examples = [[f"emojis/{i}.png"] for i in range(32)]

# Unused text input; not wired into the interface below.
# text = gr.inputs.Textbox(
#     placeholder="Enter a text and we will try to predict an emoji..."
# )

gr.Interface(
    fn=get_tag,
    inputs=[
        gr.components.Image(type="pil", label="emoji"),
        gr.components.Textbox(
            label="tags",
            placeholder="Provide a comma-separated list of tags: tag1,tag2,tag3,...",
        ),
    ],
    outputs=gr.components.Textbox(),
    examples=examples,
    examples_per_page=32,
    title=title,
    description=description,
).launch()