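"""Streamlit app that blends images: each uploaded image is captioned with an
image-to-text model, the captions are joined into a single prompt, and a
Stable Diffusion pipeline generates new images from that blended prompt."""
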
import streamlit as st
import torch
from PIL import Image
from transformers import pipeline
from diffusers import DiffusionPipeline

captions = []
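
# Cached model loaders (an addition, not in the original script):
# st.cache_resource keeps the heavy pipelines alive across Streamlit reruns
# instead of re-initializing the weights on every widget interaction.
@st.cache_resource
def load_captioner(model_name):
    # All four caption choices in the sidebar are image-to-text models.
    return pipeline("image-to-text", model=model_name)

@st.cache_resource
def load_diffusion_pipe(model_name):
    # DiffusionPipeline inspects the checkpoint's model_index.json and picks
    # the matching pipeline class (e.g. StableDiffusionXLPipeline for the
    # SDXL base model, which plain StableDiffusionPipeline cannot load).
    return DiffusionPipeline.from_pretrained(model_name)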

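# Sidebar: model choices and generation parameters.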
with st.sidebar:
    files = st.file_uploader("Upload images to blend", accept_multiple_files=True)
    st.divider()
    caption_model = st.selectbox("Caption Model", [
        "Salesforce/blip-image-captioning-large",
        "nlpconnect/vit-gpt2-image-captioning",
        "microsoft/git-base",
        "ydshieh/vit-gpt2-coco-en"
    ])
    caption_max_tokens = st.number_input("Image Caption: Max Tokens", value=20)
    st.divider()
    caption_concat_joiner = st.text_input("Caption Concatenation Joiner", value=" ")
    st.divider()
    diffusion_model = st.selectbox("Diffusion Model", [
        "stabilityai/stable-diffusion-xl-base-1.0",
        "runwayml/stable-diffusion-v1-5",
        "stabilityai/stable-diffusion-2-1",
        "CompVis/stable-diffusion-v1-4"
    ])
    image_gen_height = st.number_input("Stable Diffusion: Height", value=512)
    image_gen_width = st.number_input("Stable Diffusion: Width", value=512)
    # Explicit slider ranges: Streamlit's float slider defaults to a 0.0-1.0
    # range, which would reject the 7.5 guidance default at render time; a
    # minimum of 1 also prevents a zero-step or zero-image run.
    image_gen_steps = st.slider("Stable Diffusion: Inference Steps", min_value=1, max_value=150, value=50)
    image_gen_guidance = st.slider("Stable Diffusion: Guidance Scale", min_value=1.0, max_value=20.0, value=7.5)
    image_gen_number = st.number_input("Stable Diffusion: Images Generated", min_value=1, value=1)

for file_name in files:
    image = Image.open(file_name)

    with st.spinner('Captioning Provided Image'):
        # The cached loader makes this effectively free after the first image.
        captioner = load_captioner(caption_model)
        caption = captioner(image, max_new_tokens=caption_max_tokens)[0]['generated_text']

    captions.append(caption)
    st.image(image, caption=caption)

if captions:
    st.divider()

    description = caption_concat_joiner.join(captions)

    pipe = load_diffusion_pipe(diffusion_model)
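
    # Assumption (not part of the original app): CPU inference with Stable
    # Diffusion is very slow, so move the pipeline to a CUDA GPU if available.
    if torch.cuda.is_available():
        pipe = pipe.to("cuda")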

    with st.spinner(f'Generating Photo for "{description}"'):
        images = pipe(
            description,
            height=image_gen_height,
            width=image_gen_width,
            num_inference_steps=image_gen_steps,
            guidance_scale=image_gen_guidance,
            num_images_per_prompt=image_gen_number,
        ).images

    for image in images:
        st.image(image, caption=description)