Spaces:

aletrn
/

lisa-on-cuda

Paused

App Files Files Community

alessandro trinca tornidor committed on Feb 27, 2024

Commit

1186076

1 Parent(s): 8facf64

[refactor] return the inference function to inject the model

Browse files

Files changed (1) hide show

app.py +133 -131

app.py CHANGED Viewed

@@ -196,138 +196,138 @@ def get_model(args_to_parse):
     return _model, _clip_image_processor, _tokenizer, _transform
-args = parse_args(sys.argv[1:])
-model, clip_image_processor, tokenizer, transform = get_model(args)
-## to be implemented
-def inference(input_str, input_image):
-    ## filter out special chars
-    input_str = nh3.clean(
-        input_str,
-        tags={
-            "a",
-            "abbr",
-            "acronym",
-            "b",
-            "blockquote",
-            "code",
-            "em",
-            "i",
-            "li",
-            "ol",
-            "strong",
-            "ul",
-        },
-        attributes={
-            "a": {"href", "title"},
-            "abbr": {"title"},
-            "acronym": {"title"},
-        },
-        url_schemes={"http", "https", "mailto"},
-        link_rel=None,
-    )
-    print("input_str: ", input_str, "input_image: ", input_image)
-    ## input valid check
-    if not re.match(r"^[A-Za-z ,.!?\'\"]+$", input_str) or len(input_str) < 1:
-        output_str = "[Error] Invalid input: ", input_str
-        # output_image = np.zeros((128, 128, 3))
-        ## error happened
-        output_image = cv2.imread("./resources/error_happened.png")[:, :, ::-1]
-        return output_image, output_str
-    # Model Inference
-    conv = conversation_lib.conv_templates[args.conv_type].copy()
-    conv.messages = []
-    prompt = input_str
-    prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt
-    if args.use_mm_start_end:
-        replace_token = (
-            DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
         )
-        prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
-    conv.append_message(conv.roles[0], prompt)
-    conv.append_message(conv.roles[1], "")
-    prompt = conv.get_prompt()
-    image_np = cv2.imread(input_image)
-    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
-    original_size_list = [image_np.shape[:2]]
-    image_clip = (
-        clip_image_processor.preprocess(image_np, return_tensors="pt")[
-            "pixel_values"
-        ][0]
-        .unsqueeze(0)
-        .cuda()
-    )
-    if args.precision == "bf16":
-        image_clip = image_clip.bfloat16()
-    elif args.precision == "fp16":
-        image_clip = image_clip.half()
-    else:
-        image_clip = image_clip.float()
-    image = transform.apply_image(image_np)
-    resize_list = [image.shape[:2]]
-    image = (
-        preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
-        .unsqueeze(0)
-        .cuda()
-    )
-    if args.precision == "bf16":
-        image = image.bfloat16()
-    elif args.precision == "fp16":
-        image = image.half()
-    else:
-        image = image.float()
-    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
-    input_ids = input_ids.unsqueeze(0).cuda()
-    output_ids, pred_masks = model.evaluate(
-        image_clip,
-        image,
-        input_ids,
-        resize_list,
-        original_size_list,
-        max_new_tokens=512,
-        tokenizer=tokenizer,
-    )
-    output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]
-    text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
-    text_output = text_output.replace("\n", "").replace("  ", " ")
-    text_output = text_output.split("ASSISTANT: ")[-1]
-    print("text_output: ", text_output)
-    save_img = None
-    for i, pred_mask in enumerate(pred_masks):
-        if pred_mask.shape[0] == 0:
-            continue
-        pred_mask = pred_mask.detach().cpu().numpy()[0]
-        pred_mask = pred_mask > 0
-        save_img = image_np.copy()
-        save_img[pred_mask] = (
-            image_np * 0.5
-            + pred_mask[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
-        )[pred_mask]
-    output_str = "ASSITANT: " + text_output  # input_str
-    if save_img is not None:
-        output_image = save_img  # input_image
-    else:
-        ## no seg output
-        output_image = cv2.imread("./resources/no_seg_out.png")[:, :, ::-1]
-    return output_image, output_str
 def server_runner(
@@ -361,8 +361,10 @@ def server_runner(
 if __name__ == '__main__':
     server_runner(
-        inference,
         debug=True,
         server_name="0.0.0.0"
     )

     return _model, _clip_image_processor, _tokenizer, _transform
+def get_inference_model_by_args(args_to_parse):
+    model, clip_image_processor, tokenizer, transform = get_model(args_to_parse)
+    ## to be implemented
+    def inference(input_str, input_image):
+        ## filter out special chars
+        input_str = nh3.clean(
+            input_str,
+            tags={
+                "a",
+                "abbr",
+                "acronym",
+                "b",
+                "blockquote",
+                "code",
+                "em",
+                "i",
+                "li",
+                "ol",
+                "strong",
+                "ul",
+            },
+            attributes={
+                "a": {"href", "title"},
+                "abbr": {"title"},
+                "acronym": {"title"},
+            },
+            url_schemes={"http", "https", "mailto"},
+            link_rel=None,
         )
+        print("input_str: ", input_str, "input_image: ", input_image)
+        ## input valid check
+        if not re.match(r"^[A-Za-z ,.!?\'\"]+$", input_str) or len(input_str) < 1:
+            output_str = "[Error] Invalid input: ", input_str
+            # output_image = np.zeros((128, 128, 3))
+            ## error happened
+            output_image = cv2.imread("./resources/error_happened.png")[:, :, ::-1]
+            return output_image, output_str
+        # Model Inference
+        conv = conversation_lib.conv_templates[args.conv_type].copy()
+        conv.messages = []
+        prompt = input_str
+        prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt
+        if args.use_mm_start_end:
+            replace_token = (
+                DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
+            )
+            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
+        conv.append_message(conv.roles[0], prompt)
+        conv.append_message(conv.roles[1], "")
+        prompt = conv.get_prompt()
+        image_np = cv2.imread(input_image)
+        image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
+        original_size_list = [image_np.shape[:2]]
+        image_clip = (
+            clip_image_processor.preprocess(image_np, return_tensors="pt")[
+                "pixel_values"
+            ][0]
+            .unsqueeze(0)
+            .cuda()
+        )
+        if args.precision == "bf16":
+            image_clip = image_clip.bfloat16()
+        elif args.precision == "fp16":
+            image_clip = image_clip.half()
+        else:
+            image_clip = image_clip.float()
+        image = transform.apply_image(image_np)
+        resize_list = [image.shape[:2]]
+        image = (
+            preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
+            .unsqueeze(0)
+            .cuda()
+        )
+        if args.precision == "bf16":
+            image = image.bfloat16()
+        elif args.precision == "fp16":
+            image = image.half()
+        else:
+            image = image.float()
+        input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
+        input_ids = input_ids.unsqueeze(0).cuda()
+        output_ids, pred_masks = model.evaluate(
+            image_clip,
+            image,
+            input_ids,
+            resize_list,
+            original_size_list,
+            max_new_tokens=512,
+            tokenizer=tokenizer,
+        )
+        output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]
+        text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
+        text_output = text_output.replace("\n", "").replace("  ", " ")
+        text_output = text_output.split("ASSISTANT: ")[-1]
+        print("text_output: ", text_output)
+        save_img = None
+        for i, pred_mask in enumerate(pred_masks):
+            if pred_mask.shape[0] == 0:
+                continue
+            pred_mask = pred_mask.detach().cpu().numpy()[0]
+            pred_mask = pred_mask > 0
+            save_img = image_np.copy()
+            save_img[pred_mask] = (
+                image_np * 0.5
+                + pred_mask[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
+            )[pred_mask]
+        output_str = "ASSITANT: " + text_output  # input_str
+        if save_img is not None:
+            output_image = save_img  # input_image
+        else:
+            ## no seg output
+            output_image = cv2.imread("./resources/no_seg_out.png")[:, :, ::-1]
+        return output_image, output_str
+    return inference
 def server_runner(
 if __name__ == '__main__':
+    args = parse_args(sys.argv[1:])
+    inference_fn = get_inference_model_by_args(args)
     server_runner(
+        inference_fn,
         debug=True,
         server_name="0.0.0.0"
     )