IDM-VTON

Runtime error

App Files Files Community

cocktailpeanut committed on Apr 23, 2024

Commit

ec0e0d5

1 Parent(s): d1b3d39

update

Browse files

Files changed (1) hide show

app.py +155 -72

app.py CHANGED Viewed

@@ -27,6 +27,7 @@ from torchvision.transforms.functional import to_pil_image
 import devicetorch
 def pil_to_binary_mask(pil_image, threshold=0):
     np_image = np.array(pil_image)
@@ -45,10 +46,12 @@ def pil_to_binary_mask(pil_image, threshold=0):
 base_path = 'yisol/IDM-VTON'
 example_path = os.path.join(os.path.dirname(__file__), 'example')
 unet = UNet2DConditionModel.from_pretrained(
     base_path,
     subfolder="unet",
-    torch_dtype=torch.float16,
 )
 unet.requires_grad_(False)
 tokenizer_one = AutoTokenizer.from_pretrained(
@@ -68,28 +71,33 @@ noise_scheduler = DDPMScheduler.from_pretrained(base_path, subfolder="scheduler"
 text_encoder_one = CLIPTextModel.from_pretrained(
     base_path,
     subfolder="text_encoder",
-    torch_dtype=torch.float16,
 )
 text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
     base_path,
     subfolder="text_encoder_2",
-    torch_dtype=torch.float16,
 )
 image_encoder = CLIPVisionModelWithProjection.from_pretrained(
     base_path,
     subfolder="image_encoder",
-    torch_dtype=torch.float16,
     )
 vae = AutoencoderKL.from_pretrained(base_path,
                                     subfolder="vae",
-                                    torch_dtype=torch.float16,
 )
 # "stabilityai/stable-diffusion-xl-base-1.0",
 UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
     base_path,
     subfolder="unet_encoder",
-    torch_dtype=torch.float16,
 )
 parsing_model = Parsing(0)
@@ -119,7 +127,8 @@ pipe = TryonPipeline.from_pretrained(
         tokenizer_2 = tokenizer_two,
         scheduler = noise_scheduler,
         image_encoder=image_encoder,
-        torch_dtype=torch.float16,
 )
 pipe.unet_encoder = UNet_Encoder
@@ -127,14 +136,12 @@ pipe.unet_encoder = UNet_Encoder
 def start_tryon(dict,garm_img,garment_des,is_checked,is_checked_crop,denoise_steps,seed):
     #device = "cuda"
     device = devicetorch.get(torch)
     openpose_model.preprocessor.body_estimation.model.to(device)
     pipe.to(device)
     pipe.unet_encoder.to(device)
     garm_img= garm_img.convert("RGB").resize((768,1024))
-    human_img_orig = dict["background"].convert("RGB")
     if is_checked_crop:
         width, height = human_img_orig.size
         target_width = int(min(width, height * (3 / 4)))
@@ -148,8 +155,6 @@ def start_tryon(dict,garm_img,garment_des,is_checked,is_checked_crop,denoise_ste
         human_img = cropped_img.resize((768,1024))
     else:
         human_img = human_img_orig.resize((768,1024))
     if is_checked:
         keypoints = openpose_model(human_img.resize((384,512)))
         model_parse, _ = parsing_model(human_img.resize((384,512)))
@@ -165,82 +170,161 @@ def start_tryon(dict,garm_img,garment_des,is_checked,is_checked_crop,denoise_ste
     human_img_arg = _apply_exif_orientation(human_img.resize((384,512)))
     human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
     #args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', 'cuda'))
-    args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', device))
     # verbosity = getattr(args, "verbosity", None)
-    pose_img = args.func(args,human_img_arg)
-    pose_img = pose_img[:,:,::-1]
     pose_img = Image.fromarray(pose_img).resize((768,1024))
     with torch.no_grad():
         # Extract the images
-        with torch.autocast(device_type=device):
-        #with torch.cuda.amp.autocast():
-            with torch.no_grad():
-                prompt = "model is wearing " + garment_des
-                negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
-                with torch.inference_mode():
-                    (
-                        prompt_embeds,
-                        negative_prompt_embeds,
-                        pooled_prompt_embeds,
-                        negative_pooled_prompt_embeds,
-                    ) = pipe.encode_prompt(
-                        prompt,
-                        num_images_per_prompt=1,
-                        do_classifier_free_guidance=True,
-                        negative_prompt=negative_prompt,
-                    )
-                    prompt = "a photo of " + garment_des
                     negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
-                    if not isinstance(prompt, List):
-                        prompt = [prompt] * 1
-                    if not isinstance(negative_prompt, List):
-                        negative_prompt = [negative_prompt] * 1
                     with torch.inference_mode():
                         (
-                            prompt_embeds_c,
-                            _,
-                            _,
-                            _,
                         ) = pipe.encode_prompt(
                             prompt,
                             num_images_per_prompt=1,
-                            do_classifier_free_guidance=False,
                             negative_prompt=negative_prompt,
                         )
-                    pose_img =  tensor_transfrom(pose_img).unsqueeze(0).to(device,torch.float16)
-                    garm_tensor =  tensor_transfrom(garm_img).unsqueeze(0).to(device,torch.float16)
-                    generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
-                    images = pipe(
-                        prompt_embeds=prompt_embeds.to(device,torch.float16),
-                        negative_prompt_embeds=negative_prompt_embeds.to(device,torch.float16),
-                        pooled_prompt_embeds=pooled_prompt_embeds.to(device,torch.float16),
-                        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device,torch.float16),
-                        num_inference_steps=denoise_steps,
-                        generator=generator,
-                        strength = 1.0,
-                        pose_img = pose_img.to(device,torch.float16),
-                        text_embeds_cloth=prompt_embeds_c.to(device,torch.float16),
-                        cloth = garm_tensor.to(device,torch.float16),
-                        mask_image=mask,
-                        image=human_img,
-                        height=1024,
-                        width=768,
-                        ip_adapter_image = garm_img.resize((768,1024)),
-                        guidance_scale=2.0,
-                    )[0]
     if is_checked_crop:
-        out_img = images[0].resize(crop_size)
-        human_img_orig.paste(out_img, (int(left), int(top)))
         return human_img_orig, mask_gray
     else:
         return images[0], mask_gray
@@ -311,8 +395,7 @@ with image_blocks as demo:
     try_button.click(fn=start_tryon, inputs=[imgs, garm_img, prompt, is_checked,is_checked_crop, denoise_steps, seed], outputs=[image_out,masked_img], api_name='tryon')
-image_blocks.launch()

 import devicetorch
+torch_dtype = devicetorch.dtype(torch)
 def pil_to_binary_mask(pil_image, threshold=0):
     np_image = np.array(pil_image)
 base_path = 'yisol/IDM-VTON'
 example_path = os.path.join(os.path.dirname(__file__), 'example')
+dtype = devicetorch.dtype(torch)
 unet = UNet2DConditionModel.from_pretrained(
     base_path,
     subfolder="unet",
+    #torch_dtype=torch.float16,
+    torch_dtype=dtype,
 )
 unet.requires_grad_(False)
 tokenizer_one = AutoTokenizer.from_pretrained(
 text_encoder_one = CLIPTextModel.from_pretrained(
     base_path,
     subfolder="text_encoder",
+    #torch_dtype=torch.float16,
+    torch_dtype=dtype,
 )
 text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
     base_path,
     subfolder="text_encoder_2",
+    #torch_dtype=torch.float16,
+    torch_dtype=dtype,
 )
 image_encoder = CLIPVisionModelWithProjection.from_pretrained(
     base_path,
     subfolder="image_encoder",
+    #torch_dtype=torch.float16,
+    torch_dtype=dtype,
     )
 vae = AutoencoderKL.from_pretrained(base_path,
                                     subfolder="vae",
+                                    #torch_dtype=torch.float16,
+                                    torch_dtype=dtype,
 )
 # "stabilityai/stable-diffusion-xl-base-1.0",
 UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
     base_path,
     subfolder="unet_encoder",
+    #torch_dtype=torch.float16,
+    torch_dtype=dtype,
 )
 parsing_model = Parsing(0)
         tokenizer_2 = tokenizer_two,
         scheduler = noise_scheduler,
         image_encoder=image_encoder,
+        #torch_dtype=torch.float16,
+        torch_dtype=dtype,
 )
 pipe.unet_encoder = UNet_Encoder
 def start_tryon(dict,garm_img,garment_des,is_checked,is_checked_crop,denoise_steps,seed):
     #device = "cuda"
     device = devicetorch.get(torch)
     openpose_model.preprocessor.body_estimation.model.to(device)
     pipe.to(device)
     pipe.unet_encoder.to(device)
     garm_img= garm_img.convert("RGB").resize((768,1024))
+    human_img_orig = dict["background"].convert("RGB")
     if is_checked_crop:
         width, height = human_img_orig.size
         target_width = int(min(width, height * (3 / 4)))
         human_img = cropped_img.resize((768,1024))
     else:
         human_img = human_img_orig.resize((768,1024))
     if is_checked:
         keypoints = openpose_model(human_img.resize((384,512)))
         model_parse, _ = parsing_model(human_img.resize((384,512)))
     human_img_arg = _apply_exif_orientation(human_img.resize((384,512)))
     human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
     #args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', 'cuda'))
+    model_device = "cpu"
+    if device == "cuda":
+        model_device = "cuda"
+    args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', model_device))
     # verbosity = getattr(args, "verbosity", None)
+    pose_img = args.func(args,human_img_arg)
+    pose_img = pose_img[:,:,::-1]
     pose_img = Image.fromarray(pose_img).resize((768,1024))
+    #pose_img = Image.fromarray(pose_img).resize((512, 768))
     with torch.no_grad():
         # Extract the images
+        if device == "cuda":
+            with torch.cuda.amp.autocast():
+                with torch.no_grad():
+                    prompt = "model is wearing " + garment_des
                     negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
                     with torch.inference_mode():
                         (
+                            prompt_embeds,
+                            negative_prompt_embeds,
+                            pooled_prompt_embeds,
+                            negative_pooled_prompt_embeds,
                         ) = pipe.encode_prompt(
                             prompt,
                             num_images_per_prompt=1,
+                            do_classifier_free_guidance=True,
                             negative_prompt=negative_prompt,
                         )
+                        prompt = "a photo of " + garment_des
+                        negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+                        if not isinstance(prompt, List):
+                            prompt = [prompt] * 1
+                        if not isinstance(negative_prompt, List):
+                            negative_prompt = [negative_prompt] * 1
+                        with torch.inference_mode():
+                            (
+                                prompt_embeds_c,
+                                _,
+                                _,
+                                _,
+                            ) = pipe.encode_prompt(
+                                prompt,
+                                num_images_per_prompt=1,
+                                do_classifier_free_guidance=False,
+                                negative_prompt=negative_prompt,
+                            )
+                        #pose_img =  tensor_transfrom(pose_img).unsqueeze(0).to(device,torch.float16)
+                        pose_img =  tensor_transfrom(pose_img).unsqueeze(0).to(device,dtype)
+                        #garm_tensor =  tensor_transfrom(garm_img).unsqueeze(0).to(device,torch.float16)
+                        garm_tensor =  tensor_transfrom(garm_img).unsqueeze(0).to(device,dtype)
+                        generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
+                        images = pipe(
+                            prompt_embeds=prompt_embeds.to(device,dtype),
+                            #prompt_embeds=prompt_embeds.to(device,torch.float16),
+                            negative_prompt_embeds=negative_prompt_embeds.to(device,dtype),
+                            #negative_prompt_embeds=negative_prompt_embeds.to(device,torch.float16),
+                            pooled_prompt_embeds=pooled_prompt_embeds.to(device,dtype),
+                            #pooled_prompt_embeds=pooled_prompt_embeds.to(device,torch.float16),
+                            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device,dtype),
+                            #negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device,torch.float16),
+                            num_inference_steps=denoise_steps,
+                            generator=generator,
+                            strength = 1.0,
+                            #pose_img = pose_img.to(device,torch.float16),
+                            pose_img = pose_img.to(device,dtype),
+                            #text_embeds_cloth=prompt_embeds_c.to(device,torch.float16),
+                            text_embeds_cloth=prompt_embeds_c.to(device,dtype),
+                            #cloth = garm_tensor.to(device,torch.float16),
+                            cloth = garm_tensor.to(device,dtype),
+                            mask_image=mask,
+                            image=human_img,
+                            height=1024,
+                            width=768,
+                            ip_adapter_image = garm_img.resize((768,1024)),
+                            guidance_scale=2.0,
+                        )[0]
+        else:
+            prompt = "model is wearing " + garment_des
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+            with torch.inference_mode():
+                (
+                    prompt_embeds,
+                    negative_prompt_embeds,
+                    pooled_prompt_embeds,
+                    negative_pooled_prompt_embeds,
+                ) = pipe.encode_prompt(
+                    prompt,
+                    num_images_per_prompt=1,
+                    do_classifier_free_guidance=True,
+                    negative_prompt=negative_prompt,
+                )
+                prompt = "a photo of " + garment_des
+                negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+                if not isinstance(prompt, List):
+                    prompt = [prompt] * 1
+                if not isinstance(negative_prompt, List):
+                    negative_prompt = [negative_prompt] * 1
+                with torch.inference_mode():
+                    (
+                        prompt_embeds_c,
+                        _,
+                        _,
+                        _,
+                    ) = pipe.encode_prompt(
+                        prompt,
+                        num_images_per_prompt=1,
+                        do_classifier_free_guidance=False,
+                        negative_prompt=negative_prompt,
+                    )
+                #pose_img =  tensor_transfrom(pose_img).unsqueeze(0).to(device,torch.float16)
+                pose_img =  tensor_transfrom(pose_img).unsqueeze(0).to(device,dtype)
+                #garm_tensor =  tensor_transfrom(garm_img).unsqueeze(0).to(device,torch.float16)
+                garm_tensor =  tensor_transfrom(garm_img).unsqueeze(0).to(device,dtype)
+                generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
+                images = pipe(
+                    prompt_embeds=prompt_embeds.to(device,dtype),
+                    #prompt_embeds=prompt_embeds.to(device,torch.float16),
+                    negative_prompt_embeds=negative_prompt_embeds.to(device,dtype),
+                    #negative_prompt_embeds=negative_prompt_embeds.to(device,torch.float16),
+                    pooled_prompt_embeds=pooled_prompt_embeds.to(device,dtype),
+                    #pooled_prompt_embeds=pooled_prompt_embeds.to(device,torch.float16),
+                    negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device,dtype),
+                    #negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device,torch.float16),
+                    num_inference_steps=denoise_steps,
+                    generator=generator,
+                    strength = 1.0,
+                    #pose_img = pose_img.to(device,torch.float16),
+                    pose_img = pose_img.to(device,dtype),
+                    #text_embeds_cloth=prompt_embeds_c.to(device,torch.float16),
+                    text_embeds_cloth=prompt_embeds_c.to(device,dtype),
+                    #cloth = garm_tensor.to(device,torch.float16),
+                    cloth = garm_tensor.to(device,dtype),
+                    mask_image=mask,
+                    image=human_img,
+                    height=1024,
+                    width=768,
+                    ip_adapter_image = garm_img.resize((768,1024)),
+                    guidance_scale=2.0,
+                )[0]
     if is_checked_crop:
+        out_img = images[0].resize(crop_size)
+        human_img_orig.paste(out_img, (int(left), int(top)))
         return human_img_orig, mask_gray
     else:
         return images[0], mask_gray
     try_button.click(fn=start_tryon, inputs=[imgs, garm_img, prompt, is_checked,is_checked_crop, denoise_steps, seed], outputs=[image_out,masked_img], api_name='tryon')
+image_blocks.launch()