File size: 3,376 Bytes
7d1df38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dab50d
7d1df38
4dab50d
 
7d1df38
4dab50d
7d1df38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dab50d
7d1df38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from huggingface_hub import hf_hub_url, cached_download
from PIL import Image
import os
import json
import glob
import random
from typing import Any, Dict, List
import torch
import torchvision

import wordsegment as ws

from virtex.config import Config
from virtex.factories import TokenizerFactory, PretrainingModelFactory
from virtex.utils.checkpointing import CheckpointManager

# File names fetched from the Hugging Face Hub by download_files() and copied
# into the current working directory under the same names.
CONFIG_PATH = "config.yaml"  # read by VirTexModel via Config(CONFIG_PATH)
MODEL_PATH = "checkpoint_last5.pth"  # checkpoint file name (same file VirTexModel loads)
VALID_SUBREDDITS_PATH = "subreddit_list.json"  # JSON loaded into VirTexModel.valid_subs
# Glob pattern expanded by get_samples().
SAMPLES_PATH = "./samples/*.jpg"

class ImageLoader():
    """Convert images into single-image batches for the captioning model.

    Every image goes through the same torchvision pipeline:
    ``Resize(256)`` -> ``CenterCrop(224)`` -> ``ToTensor()``.
    """

    def __init__(self):
        self.transformer = torchvision.transforms.Compose([
            torchvision.transforms.Resize(256),
            torchvision.transforms.CenterCrop(224),
            torchvision.transforms.ToTensor(),
        ])

    def load(self, im_path):
        """Open the image at ``im_path`` and return it as a model input dict.

        Args:
            im_path: Path (or file object) accepted by ``PIL.Image.open``.

        Returns:
            ``{"image": tensor}`` exactly as produced by :meth:`transform`.
        """
        # Image.open is lazy and keeps the underlying file open; the context
        # manager releases the handle once the pixels have been copied into
        # the tensor by the transform pipeline.
        with Image.open(im_path) as image:
            return self.transform(image)

    def transform(self, image):
        """Run the preprocessing pipeline on an already-opened image.

        Args:
            image: Image object accepted by the torchvision pipeline.

        Returns:
            ``{"image": tensor}`` where the tensor is the pipeline output cast
            to float32 with a new leading dimension of size 1.
        """
        # NOTE(review): the image is passed through with whatever channel
        # count its mode has (no RGB conversion) - confirm the model accepts
        # non-RGB inputs or convert upstream.
        # .float() is a no-op when the pipeline already produced float32.
        im = self.transformer(image).float().unsqueeze(0)
        return {"image": im}

class VirTexModel():
    """Load a pretrained VirTex model and generate (subreddit, caption) pairs.

    The constructor reads the config, checkpoint and subreddit list from the
    current working directory and keeps the model on the CPU in eval mode.
    """

    def __init__(self):
        self.config = Config(CONFIG_PATH)
        # Initialise the wordsegment module (not used further inside this
        # class; presumably needed by callers - confirm).
        ws.load()
        self.device = 'cpu'
        self.tokenizer = TokenizerFactory.from_config(self.config)
        self.model = PretrainingModelFactory.from_config(self.config).to(self.device)
        # Use the shared constant so the loaded file always matches the one
        # fetched by download_files().
        CheckpointManager(model=self.model).load(MODEL_PATH)
        self.model.eval()
        # Context manager closes the file handle instead of leaking it.
        with open(VALID_SUBREDDITS_PATH, encoding="utf-8") as subs_file:
            self.valid_subs = json.load(subs_file)

    def predict(self, image_dict, sub_prompt = None, prompt = ""):
        """Generate a subreddit and caption for the given model input.

        Args:
            image_dict: Input dict passed to the model. It is modified in
                place: the key ``"decode_prompt"`` is set to the prompt tensor.
            sub_prompt: Optional token string. When given, its token id is
                used as the decode prompt and the generated subreddit is
                accepted without checking it against ``self.valid_subs``.
                When ``None``, decoding starts from ``self.model.sos_index``
                and generation is repeated until the subreddit is found in
                ``self.valid_subs``.
            prompt: Unused; kept so existing callers keep working.

        Returns:
            A ``(subreddit, rest_of_caption)`` tuple of strings. ``subreddit``
            is the text before the first ``"://"`` with all whitespace
            removed (``""`` if there is no ``"://"``); ``rest_of_caption`` is
            the remaining text.
        """
        if sub_prompt is None:
            prompt_id = self.model.sos_index
        else:
            prompt_id = self.tokenizer.token_to_id(sub_prompt)
        image_dict["decode_prompt"] = torch.tensor([prompt_id], device=self.device).long()

        # Loop-invariant lookup, hoisted out of the retry loop.
        sep_id = self.tokenizer.token_to_id("[SEP]")

        # NOTE(review): this retries until the subreddit is valid. If the
        # model's decoding is deterministic and yields an invalid subreddit,
        # the loop never terminates - confirm decoding is stochastic.
        while True:
            with torch.no_grad():
                caption_ids = self.model(image_dict)["predictions"][0].tolist()

            # Replace the first [SEP] with the "://" token so the decoded
            # text can be split into subreddit and caption.
            if sep_id in caption_ids:
                caption_ids[caption_ids.index(sep_id)] = self.tokenizer.token_to_id("://")

            caption = self.tokenizer.decode(caption_ids)

            # partition splits on the first "://" only, so a caption that
            # contains further "://" occurrences no longer breaks unpacking.
            head, separator, tail = caption.partition("://")
            if separator:
                subreddit = "".join(head.split())
                rest_of_caption = tail.strip()
            else:
                subreddit, rest_of_caption = "", caption

            if sub_prompt is not None or subreddit in self.valid_subs:
                return subreddit, rest_of_caption

def download_files():
    """Fetch the model files from the Hub into the current directory.

    Each of CONFIG_PATH, MODEL_PATH and VALID_SUBREDDITS_PATH is resolved in
    the "zamborg/redcaps" repository via ``hf_hub_url``, fetched with
    ``cached_download``, and the returned local path is copied to
    ``./<filename>``.

    Raises:
        OSError: If copying a downloaded file into the working directory
            fails (the previous shell ``cp`` call ignored such failures).
    """
    # Local import keeps this block self-contained.
    import shutil

    # NOTE(review): cached_download is deprecated in newer huggingface_hub
    # releases in favour of hf_hub_download - confirm the pinned version.
    for filename in (CONFIG_PATH, MODEL_PATH, VALID_SUBREDDITS_PATH):
        cached_path = cached_download(hf_hub_url("zamborg/redcaps", filename=filename))
        # shutil.copy avoids building a shell command from unquoted paths and
        # works on platforms without a `cp` binary.
        shutil.copy(cached_path, os.path.join(".", filename))

def get_samples():
    """Return a list of the paths matching the SAMPLES_PATH glob pattern."""
    matches = list(glob.iglob(SAMPLES_PATH))
    return matches

def get_rand_img(samples):
    """Return one uniformly chosen element of ``samples``.

    Args:
        samples: Non-empty sequence (e.g. the list returned by get_samples()).

    Returns:
        A single element of ``samples``.

    Raises:
        ValueError: If ``samples`` is empty.
    """
    if not samples:
        # Same exception type the original randint(0, -1) call produced, but
        # with a message that names the actual problem.
        raise ValueError("cannot pick a random image from an empty sample list")
    return random.choice(samples)