|
import gradio as gr |
|
from huggingface_hub import hf_hub_download |
|
|
|
import os |
|
import pickle |
|
import torch |
|
|
|
from argparse import Namespace |
|
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline |
|
from io import BytesIO |
|
from src.model import get_model |
|
from src.utils.output_utils import prepare_output |
|
from torchvision import transforms |
|
|
|
|
|
# Hugging Face Hub repository the app downloads its files from, and the
# access token (may be None) read from the environment.
REPO_ID = "Launchpad/inversecooking"
HF_TOKEN = os.environ.get("HF_TOKEN")

# Set to False to force CPU execution even when CUDA is available.
use_gpu = True

if torch.cuda.is_available() and use_gpu:
    device = torch.device('cuda')
    # None leaves checkpoint tensors on the device they were saved from.
    map_loc = None
else:
    device = torch.device('cpu')
    # Remap checkpoint tensors onto the CPU while loading.
    map_loc = 'cpu'
|
|
|
|
|
def _load_pickle_from_hub(filename):
    """Download `filename` from REPO_ID and return the unpickled object.

    The file is opened in a ``with`` block so its handle is closed as soon
    as unpickling finishes instead of being left open.
    """
    local_path = hf_hub_download(REPO_ID, filename, token=HF_TOKEN)
    # NOTE(security): unpickling can execute arbitrary code, so this is only
    # acceptable while REPO_ID points at a repository we trust.
    with open(local_path, 'rb') as fh:
        return pickle.load(fh)


# Vocabulary objects stored in the repo as data/ingr_vocab.pkl and
# data/instr_vocab.pkl.
ingrs_vocab = _load_pickle_from_hub('data/ingr_vocab.pkl')
vocab = _load_pickle_from_hub('data/instr_vocab.pkl')

# Vocabulary sizes, handed to get_model() when the network is built.
ingr_vocab_size = len(ingrs_vocab)
instrs_vocab_size = len(vocab)
|
|
|
|
|
# Settings bundle handed to get_model(). One keyword per line, in the same
# order as before; `ingrs_only` and `maxseqlen` already carry their final
# values (False and 15) instead of being overridden after construction.
args = Namespace(
    aux_data_dir='../data',
    batch_size=128,
    beam=-1,
    crop_size=224,
    decay_lr=True,
    dropout_decoder_i=0.3,
    dropout_decoder_r=0.3,
    dropout_encoder=0.3,
    embed_size=512,
    es_metric='loss',
    eval_split='val',
    finetune_after=-1,
    get_perplexity=False,
    greedy=False,
    image_model='resnet50',
    image_size=256,
    ingrs_only=False,
    label_smoothing_ingr=0.1,
    learning_rate=0.001,
    log_step=10,
    log_term=False,
    loss_weight=[1.0, 0.0, 0.0, 0.0],
    lr_decay_every=1,
    lr_decay_rate=0.99,
    max_eval=4096,
    maxnumims=5,
    maxnuminstrs=10,
    maxnumlabels=20,
    maxseqlen=15,
    model_name='model',
    n_att=8,
    n_att_ingrs=4,
    num_epochs=400,
    num_workers=8,
    numgens=3,
    patience=50,
    project_name='inversecooking',
    recipe1m_dir='path/to/recipe1m',
    recipe_only=False,
    resume=False,
    save_dir='path/to/save/models',
    scale_learning_rate_cnn=0.01,
    suff='',
    temperature=1.0,
    tensorboard=True,
    transf_layers=16,
    transf_layers_ingrs=4,
    transfer_from='',
    use_lmdb=True,
    use_true_ingrs=False,
    weight_decay=0.0,
)
|
|
|
|
|
# Build the network from the settings bundle and the two vocabulary sizes.
model = get_model(args, ingr_vocab_size, instrs_vocab_size)

# Download the checkpoint and load its weights. The loaded state dict is
# passed straight through (not bound to a name) so it can be released once
# the weights have been copied into the model.
_checkpoint_path = hf_hub_download(REPO_ID, 'data/modelbest.ckpt', token=HF_TOKEN)
model.load_state_dict(torch.load(_checkpoint_path, map_location=map_loc))

# Move to the selected device and switch to evaluation mode.
model = model.to(device)
model.eval()

# Clear both flags on the loaded model.
model.ingrs_only = model.recipe_only = False
|
|
|
# Preprocessing applied to each input picture before it reaches the model:
# resize to 256, centre-crop to 224, convert to a tensor, then normalise
# with the given per-channel mean and std triples.
transform_list = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
]
transform = transforms.Compose(transform_list)
|
|
|
|
|
# Sampling settings read by generate_image(): `numgens` samples are drawn,
# and sample i is decoded with greedy[i], beam[i] and the shared temperature.
numgens = 1
temperature = 1.0
greedy = [True] + [False] * 3
beam = [-1] * 4
|
|
|
|
|
# Text-to-image pipeline that renders the final picture. It is placed on the
# same `device` chosen for the recipe model rather than a hard-coded 'cuda',
# so the app still starts on machines without a GPU.
pipe = StableDiffusionPipeline.from_pretrained('CompVis/stable-diffusion-v1-4').to(device)
|
|
|
def generate_image(input_img):
    """Turn a food photo into a newly generated "fancy plating" image.

    The photo is preprocessed with `transform`, `model.sample` predicts
    ingredient and recipe token ids, `prepare_output` decodes them, and the
    decoded title and ingredient list are formatted into a text prompt for
    the `pipe` text-to-image pipeline.

    Args:
        input_img: The uploaded picture, in whatever form `transform`
            accepts (presumably a PIL image -- confirm against the `type`
            of the Gradio input component). ``None`` means nothing was
            uploaded.

    Returns:
        The first image produced by `pipe` for the generated prompt.

    Raises:
        gr.Error: If no picture was provided.
        ValueError: If `numgens` is smaller than 1, so no recipe is sampled.
    """
    if input_img is None:
        # Submitting the form without a picture hands us None; report that
        # to the user instead of failing inside the preprocessing pipeline.
        raise gr.Error("Please upload a picture of food first.")

    # Add a leading batch dimension and move the tensor to the model's device.
    image_tensor = transform(input_img).unsqueeze(0).to(device)

    prompt = None
    # `numgens` is 1 in this file, so exactly one recipe is sampled; if it is
    # raised, the prompt from the last sample is the one that gets rendered.
    for i in range(numgens):
        with torch.no_grad():
            outputs = model.sample(
                image_tensor,
                greedy=greedy[i],
                temperature=temperature,
                beam=beam[i],
                true_ingrs=None,
            )

        ingr_ids = outputs['ingr_ids'].cpu().numpy()
        recipe_ids = outputs['recipe_ids'].cpu().numpy()

        # The second return value of prepare_output is not needed here.
        outs, _ = prepare_output(recipe_ids[0], ingr_ids[0], ingrs_vocab, vocab)

        recipe_name = outs['title']
        ingredients = ', '.join(outs['ingrs'])
        prompt = f"Fancy food plating of {recipe_name} with ingredients {ingredients}"
        print(prompt)

    if prompt is None:
        raise ValueError("numgens must be at least 1")

    return pipe(prompt).images[0]
|
|
|
# Page layout: a header row (logo next to the project description) above a
# row holding the image-to-image demo itself.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(
                "https://www.ocf.berkeley.edu/~launchpad/media/uploads/project_logos/414478903_2298162417059609_260250523028403756_n_yt9pGFm.png",
                elem_id="logo-img",
                show_label=False,
                show_share_button=False,
                show_download_button=False,
            )

        with gr.Column(scale=3):
            gr.Markdown("""Lunchpad is a [Launchpad](https://launchpad.studentorg.berkeley.edu/) project (Spring 2023) that transforms pictures of food to fancy plated versions through a novel transformer architecture and latent diffusion models.
            <br/><br/>
            **Model**: [Inverse Cooking](https://arxiv.org/abs/1812.06164), [Stable-Diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4)
            <br/>
            **Developed by**: Sebastian Zhao, Annabelle Park, Nikhil Pitta, Tanush Talati, Rahul Vijay, Jade Wang, Tony Xin
            """
            )

    with gr.Row():
        # type="pil": the default input type is a numpy array, which the
        # torchvision Resize/CenterCrop steps applied by generate_image do
        # not accept; they need a PIL image (or a tensor).
        gr.Interface(generate_image, gr.Image(type="pil"), "image")


if __name__ == '__main__':
    demo.launch()
|
|