File size: 10,541 Bytes
9b7b0f8
 
9669aec
 
e346b2b
 
9669aec
 
 
 
51264ff
176d22b
8eb54f6
 
 
 
2ce0115
1aad160
 
 
 
9669aec
7bdb7e9
9669aec
 
2a25d35
31287b1
2a25d35
 
b8ee318
 
 
 
b8a1fe8
b8ee318
 
 
8eb54f6
 
0b1f706
 
e112c7f
272b583
8eb54f6
b8ee318
 
 
8eb54f6
b8ee318
 
 
c841331
b8ee318
 
b9c51f8
0b51211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7bdb7e9
 
 
9669aec
7bdb7e9
9669aec
7bdb7e9
9669aec
 
b8ee318
 
 
 
0b51211
 
b8ee318
 
 
9669aec
b8ee318
9669aec
0b51211
9669aec
 
7bdb7e9
9669aec
 
 
 
 
 
 
7bdb7e9
b8ee318
9669aec
 
 
 
 
 
 
 
 
 
7bdb7e9
 
 
 
 
 
 
 
0b51211
 
 
 
 
7bdb7e9
b8ee318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b51211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7bdb7e9
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import gradio as gr
import torch
import os
import glob
import spaces

from datetime import datetime
from PIL import Image
from diffusers.utils import load_image
from diffusers import EulerDiscreteScheduler
from pipline_StableDiffusion_ConsistentID import ConsistentIDStableDiffusionPipeline
from huggingface_hub import hf_hub_download
### Model can be imported from https://github.com/zllrunning/face-parsing.PyTorch?tab=readme-ov-file
### We use the ckpt of 79999_iter.pth: https://drive.google.com/open?id=154JgKpzCPW82qINcVieuPH3fZ2e0P812
### Thanks for the open source of face-parsing model.
from models.BiSeNet.model import BiSeNet

# zero = torch.Tensor([0]).cuda()
# print(zero.device) # <-- 'cpu' 🤔
# device = zero.device # "cuda"
device = "cuda"

# Absolute directory of this script; `process` saves generated images beneath it.
script_directory = os.path.dirname(os.path.realpath(__file__))

# Download the ConsistentID checkpoint (hf_hub_download returns the local file path).
base_model_path = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
consistentID_path = hf_hub_download(repo_id="JackAILab/ConsistentID", filename="ConsistentID-v1.bin", repo_type="model")

### Load base model
pipe = ConsistentIDStableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    safety_checker=None,  # use_safetensors=True,
    variant="fp16",
).to(device)

### Load other pretrained models
## BiSeNet (face-parsing network handed to the ConsistentID loader below)
bise_net_cp_path = hf_hub_download(repo_id="JackAILab/ConsistentID", filename="face_parsing.pth", local_dir="./checkpoints")
bise_net = BiSeNet(n_classes=19)
# `map_location` is an argument of torch.load, not of Module.load_state_dict;
# passing it to load_state_dict raises TypeError. Deserialize the checkpoint
# onto the CPU first, then move the whole module to the GPU.
bise_net.load_state_dict(torch.load(bise_net_cp_path, map_location="cpu"))
bise_net.cuda()

### Load consistentID_model checkpoint
# The loader takes the checkpoint's directory and file name separately, so the
# downloaded path is split into its two components.
pipe.load_ConsistentID_model(
    os.path.dirname(consistentID_path),
    bise_net,
    subfolder="",
    weight_name=os.path.basename(consistentID_path),
    trigger_word="img",
)
# Swap the pipeline's scheduler for Euler, reusing the existing scheduler config.
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)


# @torch.inference_mode()
# def Enhance_prompt(prompt,select_images):
    
#     llva_prompt = f'Please ignore the image. Enhance the following text prompt for me. You can associate more details with the character\'s gesture, environment, and decent clothing:"{prompt}".' 
#     args = type('Args', (), {
#         "model_path": llva_model_path,
#         "model_base": None,
#         "model_name": get_model_name_from_path(llva_model_path),
#         "query": llva_prompt,
#         "conv_mode": None,
#         "image_file": select_images,
#         "sep": ",",
#         "temperature": 0,
#         "top_p": None,
#         "num_beams": 1,
#         "max_new_tokens": 512
#     })() 
#     Enhanced_prompt = eval_model(args, llva_tokenizer, llva_model, llva_image_processor)

#     return Enhanced_prompt
    
@spaces.GPU
def process(inputImage, prompt, negative_prompt):
    """Generate one portrait from a reference image and return the saved file path.

    Args:
        inputImage: Reference portrait from the ``gr.Image`` input, in a form
            accepted by ``PIL.Image.fromarray``; ``None`` when nothing was
            uploaded.
        prompt: Scene description. When empty, a built-in default is used.
        negative_prompt: Things to avoid. When empty, a built-in default is
            used. A fixed list of anatomy/quality terms is always appended.

    Returns:
        Path of the JPEG written under ``<script dir>/images/gradio_outputs``.

    Raises:
        gr.Error: If no reference image was provided.
    """
    if inputImage is None:
        # Without this guard Image.fromarray(None) fails with an opaque
        # AttributeError; gr.Error surfaces a readable message in the UI.
        raise gr.Error("Please upload a reference portrait first.")

    # hyper-parameters
    num_steps = 50
    merge_steps = 30

    select_images = load_image(Image.fromarray(inputImage))

    prompt = (prompt or "").strip()
    if not prompt:
        # Only the last of the original three default assignments ever took
        # effect; the two dead ones are dropped.
        # NOTE(review): the original literal said "in a sowm", which reads as a
        # typo for "snow" - confirm.
        prompt = "A person, in a snow, wearing santa hat and a scarf, with a cottage behind"
    else:
        # prompt = Enhance_prompt(prompt, blank_image)  # TODO
        print(prompt)

    # Extend Prompt
    prompt = "cinematic photo," + prompt + ", 50mm photograph, half-length portrait, film, bokeh, professional, 4k, highly detailed"

    negative_prompt_group = "((((ugly)))), (((duplicate))), ((morbid)), ((mutilated)), [out of frame], extra fingers, mutated hands, ((poorly drawn hands)), ((poorly drawn face)), (((mutation))), (((deformed))), ((ugly)), blurry, ((bad anatomy)), (((bad proportions))), ((extra limbs)), cloned face, (((disfigured))). out of frame, ugly, extra limbs, (bad anatomy), gross proportions, (malformed limbs), ((missing arms)), ((missing legs)), (((extra arms))), (((extra legs))), mutated hands, (fused fingers), (too many fingers), (((long neck)))"
    # Trim stray separators, then join with an explicit ", ". The original
    # concatenated the two parts directly, fusing the last user term with the
    # first group term ("blurry((((ugly))))").
    negative_prompt = (negative_prompt or "").strip(" ,")
    if not negative_prompt:
        negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality, blurry"
    negative_prompt = f"{negative_prompt}, {negative_prompt_group}"

    # A fresh random seed per request; it is also embedded in the file name.
    seed = torch.randint(0, 1000, (1,)).item()
    generator = torch.Generator(device=device).manual_seed(seed)

    image = pipe(
        prompt=prompt,
        width=512,
        height=768,
        input_id_images=select_images,
        negative_prompt=negative_prompt,
        num_images_per_prompt=1,
        num_inference_steps=num_steps,
        start_merge_step=merge_steps,
        generator=generator,
    ).images[0]

    # strftime keeps spaces and colons out of the file name (str(datetime)
    # contains both, and colons are not valid in Windows file names).
    timestamp = datetime.today().strftime("%Y%m%d-%H%M%S-%f")

    output_dir = os.path.join(script_directory, "images", "gradio_outputs")
    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, f"{timestamp}-{seed}.jpg")
    image.save(output_path)

    return output_path

# # Gets the templates
# script_directory = os.path.dirname(os.path.realpath(__file__))
# # preset_template = glob.glob(script_directory+"/images/templates/*.png")
# preset_template = glob.glob("./images/templates/*.png")
# preset_template = preset_template + glob.glob("./images/templates/*.jpg")

# # Use Blocks Create Gradio 
# with gr.Blocks(title="ConsistentID Demo") as demo:
#     gr.Markdown("# ConsistentID Demo")
#     gr.Markdown("\
#         Put the reference figure to be redrawn into the box below (There is a small probability of referensing failure. You can submit it repeatedly)")
#     gr.Markdown("\
#         If you find our work interesting, please leave a star in GitHub for us!<br>\
#         https://github.com/JackAILab/ConsistentID")
#     with gr.Row():
#         with gr.Column():
#             model_selected_tab = gr.State(0)
#             with gr.TabItem("template images") as template_images_tab:
#                 template_gallery_list = [(i, i) for i in preset_template]
#                 gallery = gr.Gallery(template_gallery_list,columns=[4], rows=[2], object_fit="contain", height="auto",show_label=False)
                
#                 def select_function(evt: gr.SelectData):
#                     return preset_template[evt.index]

#                 selected_template_images = gr.Text(show_label=False, visible=False, placeholder="Selected")
#                 gallery.select(select_function, None, selected_template_images)
#             with gr.TabItem("Upload Image") as upload_image_tab:
#                 costum_image = gr.Image(label="Upload Image")

#             model_selected_tabs = [template_images_tab, upload_image_tab]
#             for i, tab in enumerate(model_selected_tabs):
#                 tab.select(fn=lambda tabnum=i: tabnum, inputs=[], outputs=[model_selected_tab])

#             with gr.Column():
#                 prompt_selected_tab = gr.State(0)
#                 with gr.TabItem("template prompts") as template_prompts_tab:
#                     prompt_selected = gr.Dropdown(value="A person, police officer, half body shot", elem_id='dropdown', choices=[
#                         "A woman in a wedding dress",
#                         "A woman, queen, in a gorgeous palace",
#                         "A man sitting at the beach with sunset", 
#                         "A person, police officer, half body shot", 
#                         "A man, sailor, in a boat above ocean",
#                         "A women wearing headphone, listening music", 
#                         "A man, firefighter, half body shot"], label=f"prepared prompts")

#                 with gr.TabItem("custom prompt") as custom_prompt_tab:
#                     prompt = gr.Textbox(label="prompt",placeholder="A man/woman wearing a santa hat")
#                     nagetive_prompt = gr.Textbox(label="negative prompt",placeholder="monochrome, lowres, bad anatomy, worst quality, low quality, blurry")
            
#                 prompt_selected_tabs = [template_prompts_tab, custom_prompt_tab]
#                 for i, tab in enumerate(prompt_selected_tabs):
#                     tab.select(fn=lambda tabnum=i: tabnum, inputs=[], outputs=[prompt_selected_tab])
            
#             retouching = gr.Checkbox(label="face retouching",value=False)
#             width = gr.Slider(label="image width",minimum=256,maximum=768,value=512,step=8)
#             height = gr.Slider(label="image height",minimum=256,maximum=768,value=768,step=8)
#             width.release(lambda x,y: min(1280-x,y), inputs=[width,height], outputs=[height])
#             height.release(lambda x,y: min(1280-y,x), inputs=[width,height], outputs=[width])
#             merge_steps = gr.Slider(label="step starting to merge facial details(30 is recommended)",minimum=10,maximum=50,value=30,step=1)
            
#             btn = gr.Button("Run")
#         with gr.Column():
#             out = gr.Image(label="Output")
#             gr.Markdown('''
#                 N.B.:<br/>
#                 - If the proportion of face in the image is too small, the probability of an error will be slightly higher, and the similarity will also significantly decrease.)
#                 - At the same time, use prompt with \"man\" or \"woman\" instead of \"person\" as much as possible, as that may cause the model to be confused whether the protagonist is male or female.
#                 - Due to insufficient graphics memory on the demo server, there is an upper limit on the resolution for generating samples. We will support the generation of SDXL as soon as possible<br/><br/>
#                 ''')
#         btn.click(fn=process, inputs=[selected_template_images,costum_image,prompt,nagetive_prompt,prompt_selected,retouching
#             ,model_selected_tab,prompt_selected_tab,width,height,merge_steps], outputs=out)

# Build the UI components first (inputs in the order `process` receives them,
# then the single output), and wire them into the Interface afterwards.
reference_image_input = gr.Image(label="Upload Image")
prompt_input = gr.Textbox(label="prompt", placeholder="A man, in a forest, adventuring")
negative_prompt_input = gr.Textbox(
    label="negative prompt",
    placeholder="monochrome, lowres, bad anatomy, worst quality, low quality, blurry",
)
result_image_output = gr.Image(label="Output")

iface = gr.Interface(
    fn=process,
    inputs=[reference_image_input, prompt_input, negative_prompt_input],
    outputs=[result_image_output],
    title="ConsistentID Demo",
    description="Put reference portrait below",
    allow_flagging="never",
)

# Start the Gradio server for the demo.
iface.launch()  # zero.device

# @spaces.GPU
# def greet(n):
#     print(zero.device) # <-- 'cuda:0' 🤗
#     return f"Hello {zero + n} Tensor"

# demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
# demo.launch()