Co-Instruct

Sleeping

File size: 9,433 Bytes

f8ea2c9
848ce1e
 
f8ea2c9
 
8bef29c
 
f8ea2c9
8bef29c
 
 
f8ea2c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bef29c
 
 
c3164df
8bef29c
 
bf92928
8bef29c
 
 
f8ea2c9
8bef29c
 
 
 
 
b898ea7
 
 
 
 
 
 
bf92928
 
 
 
 
 
 
 
 
 
 
 
8bef29c
 
f8ea2c9
8bef29c
 
b898ea7
 
 
 
bf92928
 
 
 
 
 
8132ec4
8bef29c
8132ec4
f8ea2c9
 
 
 
bdb5c5d
 
 
f8ea2c9
 
f8692f6
8132ec4
 
f8692f6
 
8bef29c
f8692f6
 
 
 
 
 
 
8bef29c
848ce1e
 
43ba66f
838b9d1
43ba66f
848ce1e
 
 
8bef29c
 
b898ea7
 
d506e15
bf92928
f8ea2c9
 
1920c80
f8ea2c9
 
 
 
 
 
 
bdb5c5d
f8ea2c9
8bef29c

import os, yaml
import gradio as gr
import requests
import argparse

from PIL import Image

import numpy as np
import torch
from transformers import AutoModelForCausalLM

from huggingface_hub import hf_hub_download


## InstructIR Plugin ##
from insir_models import instructir
from insir_text.models import LanguageModel, LMHead

# Download the two InstructIR checkpoints from the Hub into the working directory.
# The filenames match MODEL_NAME / LM_MODEL below, which are passed to torch.load() later in this file.
hf_hub_download(repo_id="marcosv/InstructIR", filename="im_instructir-7d.pt", local_dir="./")
hf_hub_download(repo_id="marcosv/InstructIR", filename="lm_instructir-7d.pt", local_dir="./")

# YAML file describing the model hyper-parameters (read with yaml.safe_load below).
# NOTE(review): the config is named "5d" while both checkpoints are "7d" — confirm they are meant to be paired.
CONFIG     = "eval5d.yml"
# Checkpoint loaded into the LMHead module.
LM_MODEL   = "lm_instructir-7d.pt"
# Checkpoint loaded into the InstructIR image model.
MODEL_NAME = "im_instructir-7d.pt"

def dict2namespace(config):
    """Recursively turn a nested dict into an ``argparse.Namespace``.

    Every key of ``config`` becomes an attribute of the returned namespace.
    Values that are themselves dicts are converted the same way; any other
    value (lists included) is stored unchanged.
    """
    converted = {
        name: dict2namespace(entry) if isinstance(entry, dict) else entry
        for name, entry in config.items()
    }
    result = argparse.Namespace()
    for name, entry in converted.items():
        setattr(result, name, entry)
    return result


# Parse the YAML config file and expose it with attribute access (cfg.model.width, cfg.llm.model, ...).
# os.path.join() with a single argument returns it unchanged, so this simply opens CONFIG.
with open(os.path.join(CONFIG), "r") as f:
    config = yaml.safe_load(f)

cfg = dict2namespace(config)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Build the InstructIR image-restoration network from the hyper-parameters in cfg.model.
ir_model = instructir.create_model(input_channels =cfg.model.in_ch, width=cfg.model.width, enc_blks = cfg.model.enc_blks, 
                            middle_blk_num = cfg.model.middle_blk_num, dec_blks = cfg.model.dec_blks, txtdim=cfg.model.textdim)
ir_model = ir_model.to(device)
print ("IMAGE MODEL CKPT:", MODEL_NAME)
# The checkpoint is deserialised on the CPU; strict=True makes any key mismatch with the model an error.
# NOTE(review): the model is never switched to eval() here — confirm it has no train-time-only layers.
ir_model.load_state_dict(torch.load(MODEL_NAME, map_location="cpu"), strict=True)

# Set before the language model is constructed.
# Presumably this silences the HF tokenizers parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
LMODEL = cfg.llm.model
# Text encoder used to embed the restoration prompt (called as language_model(prompt) in process_img).
language_model = LanguageModel(model=LMODEL)
# Head applied to the prompt embedding; process_img unpacks its output as (text_embd, deg_pred).
lm_head = LMHead(embedding_dim=cfg.llm.model_dim, hidden_dim=cfg.llm.embd_dim, num_classes=cfg.llm.nclasses)
lm_head = lm_head.to(device)

print("LMHEAD MODEL CKPT:", LM_MODEL)
# Same loading scheme as the image model: CPU deserialisation, strict key matching.
lm_head.load_state_dict(torch.load(LM_MODEL, map_location="cpu"), strict=True)

def process_img(image, prompt=None):
    """Restore a degraded image with InstructIR, guided by a text instruction.

    Args:
        image: PIL image to restore (the Gradio inputs in this file use
            ``type="pil"``).
        prompt: Restoration instruction. When ``None``, the instruction is
            produced by asking ``chat`` how to improve the image and
            appending a fixed request sentence.

    Returns:
        The restored image as a ``PIL.Image.Image`` built from a uint8 array.

    Raises:
        gr.Error: If ``image`` is ``None`` (e.g. submit pressed with no upload).
    """
    if image is None:
        # Without this guard the failure surfaces deep inside chat() as an
        # unrelated-looking error; tell the user what to do instead.
        raise gr.Error("Please upload an image to restore.")

    # Normalise to three channels: a grayscale image would become a 2-D array
    # and break the permute(2, 0, 1) below.
    # NOTE(review): assumes the restoration model is configured for 3 input
    # channels (cfg.model.in_ch) — confirm.
    image = image.convert("RGB")

    if prompt is None:
        prompt = chat("How to improve the quality of the image?", [], image, None, None, None)
        prompt += "Please help me improve its quality!"
        print(prompt)

    # uint8 HxWxC in [0, 255] -> float32 1xCxHxW in [0, 1]
    img = np.array(image)
    img = img / 255.
    img = img.astype(np.float32)
    y = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).to(device)

    lm_embd = language_model(prompt)
    lm_embd = lm_embd.to(device)

    with torch.no_grad():
        # deg_pred is returned by the head but not needed for restoration.
        text_embd, _deg_pred = lm_head(lm_embd)
        x_hat = ir_model(y, text_embd)

    # squeeze(0) drops only the batch axis; a bare squeeze() would also drop
    # a spatial axis of size 1 and break the permute for 1-pixel-wide inputs.
    restored_img = x_hat.squeeze(0).permute(1, 2, 0).clamp(0, 1).cpu().numpy()

    restored_img = (restored_img * 255.0).round().astype(np.uint8)  # float32 to uint8
    return Image.fromarray(restored_img)

## InstructIR Plugin ##
# Load the Co-Instruct chat model used by chat() below.
# - trust_remote_code=True executes model code shipped in the Hub repository.
# - Weights are loaded in float16 with the "eager" attention implementation.
# - device_map pins the whole model to cuda:0; unlike `device` above there is no CPU fallback,
#   so this statement requires a CUDA GPU.
model = AutoModelForCausalLM.from_pretrained("q-future/co-instruct-preview", 
                                             trust_remote_code=True, 
                                             torch_dtype=torch.float16, 
                                             attn_implementation="eager",
                                             device_map={"":"cuda:0"})

# Image-placeholder preambles, indexed by (number of images - 1).
# The wording and spacing are kept verbatim (including the missing space
# before the third/fourth placeholder) because this text is fed to the model.
_IMAGE_PREAMBLES = (
    "The input image: <|image|>",
    "The first image: <|image|>\nThe second image: <|image|>",
    "The first image: <|image|>\nThe second image: <|image|>\nThe third image:<|image|>",
    "The first image: <|image|>\nThe second image: <|image|>\nThe third image:<|image|>\nThe fourth image:<|image|>",
)


def _leading_images(*slots):
    """Return the images from the first slot up to (excluding) the first empty one."""
    images = []
    for slot in slots:
        if slot is None:
            break
        images.append(slot)
    return images


def _build_prompt(message, history, num_images):
    """Assemble the conversation text: past turns followed by the new user message.

    The image preamble is attached to the first user turn only, whether that
    turn comes from ``history`` or is the new ``message`` itself.
    """
    preamble = _IMAGE_PREAMBLES[num_images - 1]
    if not history:
        return "USER: " + preamble + message + " ASSISTANT:"

    parts = []
    for index, turn in enumerate(history):
        opener = "USER: " + preamble if index == 0 else "USER:"
        # Every completed turn is closed with </s>, for any number of images.
        parts.append(opener + turn[0] + " ASSISTANT:" + turn[1] + "</s>")
    parts.append("USER:" + message + " ASSISTANT:")
    return "".join(parts)


def chat(message, history, image_1, image_2, image_3, image_4):
    """Answer ``message`` about the uploaded image(s) with the Co-Instruct model.

    Args:
        message: The new user question.
        history: Previous turns as a sequence of ``[user, assistant]`` pairs
            (indexed as ``turn[0]`` / ``turn[1]``); empty for a fresh chat.
        image_1, image_2, image_3, image_4: Up to four images. Slots are read
            in order and reading stops at the first empty slot, so images
            after a gap are ignored.

    Returns:
        The text the model generated after the last ``ASSISTANT:`` marker.

    Raises:
        gr.Error: If ``image_1`` is missing, since there is nothing to analyse.
    """
    print(history)

    images = _leading_images(image_1, image_2, image_3, image_4)
    if not images:
        raise gr.Error("Please upload at least one image (starting with Image 1).")

    # Each past turn is replayed exactly once and in order, so the model sees
    # the full conversation including the most recent exchange.
    message = _build_prompt(message, history, len(images))

    print(message)

    output_ids = model.chat(message, images, max_new_tokens=600)
    # Ids are clamped into [0, 100000] before decoding.
    # NOTE(review): presumably this maps placeholder / out-of-range ids to decodable ones — confirm.
    decoded = model.tokenizer.batch_decode(output_ids.clamp(0, 100000))
    return decoded[0].split("ASSISTANT:")[-1]

# Example rows offered in the chat UI: [question, image path, optional second image path].
examples = [
    [
        "Which part of the image is relatively clearer, the upper part or the lower part? Please analyze in details.",
        "examples/sausage.jpg",
        None,
    ],
    [
        "Which image is noisy, and which one is with motion blur? Please analyze in details.",
        "examples/211.jpg",
        "examples/frog.png",
    ],
    [
        "What is the problem in this image, and how to fix it? Please answer my questions one by one.",
        "examples/lol_748.png",
        None,
    ],
]

#<h1 align="center"><a href="https://github.com/Q-Future/Q-Instruct"><img src="https://github.com/Q-Future/Q-Instruct/blob/main/q_instruct_logo.png?raw=true", alt="Q-Instruct (mPLUG-Owl-2)" border="0" style="margin: 0 auto; height: 85px;" /></a> </h1>


title = "Co-Instruct-Plus🧑‍🏫🖌️"

# Page layout:
#   - banner (Markdown/HTML)
#   - one row with the four image slots that are forwarded to chat()
#   - one row with the chat panel (left) and the auto-restoration panel (right)
with gr.Blocks(title=title) as demo:
    title_markdown = ("""


<div align="center">Built upon <strong>Q-Instruct: Improving Low-level Visual Abilities for Multi-modality Foundation Models</strong></div>

<div align="center">Built upon Co-Instruct: <strong>Towards Open-ended Visual Quality Comparison</strong></div>

<div align="center">Co-Instruct is the Upgraded Version of Q-Instruct with Multi-image (up to 4, same as GPT-4V) Support! We also support <a href='https://huggingface.co/marcosv/InstructIR'>InstructIR</a> as PLUGIN!</div>
<h5 align="center"> Please find our more accurate visual scoring demo on <a href='https://huggingface.co/spaces/teowu/OneScorer'>[OneScorer]</a>!</h5>
<div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
        <a href='https://github.com/Q-Future/Q-Instruct'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
        <a href="https://Q-Instruct.github.io/Q-Instruct/fig/Q_Instruct_v0_1_preview.pdf"><img src="https://img.shields.io/badge/Technical-Report-red"></a>
        <a href='https://github.com/Q-Future/Q-Instruct/stargazers'><img src='https://img.shields.io/github/stars/Q-Future/Q-Instruct.svg?style=social'></a>
    </div>
</div>
""")
    gr.Markdown(title_markdown)

    # The four image slots are passed to chat() as image_1..image_4, in this order.
    with gr.Row():
        input_img_1 = gr.Image(type='pil', label="Image 1 (First image)")
        input_img_2 = gr.Image(type='pil', label="Image 2 (Second image)")
        input_img_3 = gr.Image(type='pil', label="Image 3 (Third image)")
        input_img_4 = gr.Image(type='pil', label="Image 4 (Fourth image)")

    with gr.Row():
        with gr.Column(scale=2):
            # Chat panel: chat() receives (message, history) plus the four images above.
            gr.ChatInterface(fn = chat, additional_inputs=[input_img_1, input_img_2, input_img_3, input_img_4], theme="Soft", examples=examples)
        with gr.Column(scale=1):
            # Restoration panel: process_img() is called with the image only,
            # so it derives its own instruction via chat().
            input_image_ir = gr.Image(type="pil", label="Image for Auto Restoration")
            output_image_ir = gr.Image(type="pil", label="Output of Auto Restoration")
            gr.Interface(
                fn=process_img,
                inputs=[input_image_ir],
                outputs=[output_image_ir],
                examples=["examples/gopro.png", "examples/noise50.png", "examples/lol_748.png"],
            )
    demo.launch(share=True)