CogVLM-CogAgent

Sleeping

File size: 9,157 Bytes

8d00201
 
 
 
da4f6a0
 
ca06aae
 
da4f6a0
 
 
8d00201
ff1a337
8d00201
6accf0d
 
da4f6a0
6accf0d
da4f6a0
8d00201
 
 
 
 
 
 
ca06aae
 
 
 
8d00201
 
 
 
da4f6a0
8d00201
 
6accf0d
da4f6a0
 
 
 
8d00201
 
 
6accf0d
8d00201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6accf0d
8d00201
 
 
 
 
 
 
da4f6a0
 
 
 
 
 
 
 
 
ca06aae
da4f6a0
 
 
8d00201
da4f6a0
 
 
 
8d00201
da4f6a0
 
 
 
8d00201
 
 
da4f6a0
 
 
8d00201
 
ca06aae
 
 
 
 
8d00201
 
 
 
 
 
 
6accf0d
8d00201
 
 
6accf0d
 
 
 
 
 
 
8d00201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da4f6a0
 
 
8d00201
 
 
 
 
 
 
 
 
6accf0d
 
da4f6a0
6accf0d
da4f6a0
 
 
 
6accf0d
8d00201
da4f6a0
 
 
 
8d00201
da4f6a0
8d00201
 
da4f6a0
 
8d00201
6accf0d
8d00201
6accf0d
8d00201
 
da4f6a0
8d00201
da4f6a0
8d00201
 
 
 
 
 
 
 
03e280c
8d00201

#!/usr/bin/env python

import gradio as gr
import os
import json
import requests
import time
from concurrent.futures import ThreadPoolExecutor
from utils import is_chinese, process_image_without_resize, parse_response, templates_agent_cogagent, template_grounding_cogvlm, postprocess_text

# HTML heading rendered at the top of the page (passed to gr.Markdown in main()).
DESCRIPTION = '''<h2 style='text-align: center'> <a href="https://github.com/THUDM/CogVLM"> CogVLM & CogAgent Chat Demo</a> </h2>'''

# Introductory notes rendered directly below the heading; mixes HTML, Markdown
# and both English and Chinese text.
NOTES = 'This app is adapted from <a href="https://github.com/THUDM/CogVLM">https://github.com/THUDM/CogVLM</a>. It would be recommended to check out the repo if you want to see the detail of our model.\n\n该demo仅作为测试使用，不支持批量请求。如有大批量需求，欢迎联系[智谱AI](mailto:business@zhipuai.cn)。\n\n请注意该Demo目前仅支持英文，<a href="http://36.103.203.44:7861/">备用网页</a>支持中文。'

# Troubleshooting hints rendered below the example inputs.
MAINTENANCE_NOTICE1 = 'Hint 1: If the app report "Something went wrong, connection error out", please turn off your proxy and retry.<br>Hint 2: If you upload a large size of image like 10MB, it may take some time to upload and process. Please be patient and wait.'

# Hint text about grounding prompts. Not rendered by any active code visible in
# this file.
GROUNDING_NOTICE = 'Hint: When you check "Grounding", please use the <a href="https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L344">corresponding prompt</a> or the examples below.'

# Hint text about CogAgent prompts. Not rendered by any active code visible in
# this file.
AGENT_NOTICE = 'Hint: When you check "CogAgent", please use the <a href="https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L761C1-L761C17">corresponding prompt</a> or the examples below.'


# Chat history shown after a reset: a single bot greeting with an empty user
# turn. post() drops entries whose user turn is empty before sending history.
default_chatbox = [("", "Hi, What do you want to know about this image?")]

# Endpoint that post() sends requests to, taken from the "URL" environment
# variable; this is None when the variable is not set.
URL = os.environ.get("URL")


def make_request(URL, headers, data):
    """POST ``data`` to ``URL`` and return the decoded JSON response body.

    Args:
        URL: Endpoint to send the request to.
        headers: HTTP headers to send with the request.
        data: Request body; the caller in this file passes a JSON string.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        ValueError: If the response body is not valid JSON.
    """
    # timeout is (connect, read) in seconds.
    response = requests.post(URL, headers=headers, data=data, timeout=(60, 100))
    # Fail loudly on HTTP errors instead of parsing an error payload as if it
    # were a successful result.
    response.raise_for_status()
    return response.json()

def post(
        input_text,
        temperature,
        top_p,
        top_k,
        image_prompt,
        result_previous,
        hidden_image,
        grounding,
        cogagent,
        grounding_template,
        agent_template
        ):
    """Send one chat turn to the model endpoint and return the updated UI state.

    Args:
        input_text: The user's prompt text.
        temperature: Sampling temperature forwarded to the endpoint.
        top_p: Top-p value forwarded to the endpoint.
        top_k: Top-k value forwarded to the endpoint.
        image_prompt: Value of the image component (configured as a file path
            in main()); falsy when no image is set.
        result_previous: Current chat history as (user, bot) pairs; may be
            None or empty.
        hidden_image: Hash of the image used in the previous turn.
        grounding: Whether the "Grounding" checkbox is ticked.
        cogagent: Whether the "CogAgent" checkbox is ticked.
        grounding_template: Template applied to the prompt in grounding mode.
        agent_template: Template applied to the prompt in agent mode, or
            "do not use template" / None to leave the prompt untouched.

    Returns:
        A ``(textbox_value, history, image_hash)`` tuple matching the three
        outputs wired up in main(). The text box is cleared except when no
        image was supplied, in which case the user's text is kept.
    """
    # Keep only genuine user turns: the greeting has an empty user side and
    # grounding image messages have None, and neither is sent back as history.
    result_text = [
        (ele[0], ele[1])
        for ele in (result_previous or [])
        if ele[0] != "" and ele[0] is not None
    ]
    print(f"history {result_text}")

    is_zh = is_chinese(input_text)

    # Any falsy value (None, "") counts as "no image"; otherwise the grounding
    # branch below would run without a processed image.
    if not image_prompt:
        print("Image empty")
        if is_zh:
            result_text.append((input_text, '图片为空！请上传图片并重试。'))
        else:
            result_text.append((input_text, 'Image empty! Please upload a image and retry.'))
        return input_text, result_text, hidden_image
    if input_text == "":
        print("Text empty")
        result_text.append((input_text, 'Text empty! Please enter text and retry.'))
        return "", result_text, hidden_image

    headers = {
        "Content-Type": "application/json; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    }

    pil_img, encoded_img, image_hash, image_path_grounding = process_image_without_resize(image_prompt)
    print(f"image_hash:{image_hash}, hidden_image_hash:{hidden_image}")

    # A different image starts a new conversation, so the old history is dropped.
    if hidden_image is not None and image_hash != hidden_image:
        print("image has been update")
        result_text = []
    hidden_image = image_hash

    # CogAgent takes precedence over grounding when both boxes are ticked.
    model_use = "vlm_chat"
    if cogagent:
        model_use = "agent_chat"
        if agent_template is not None and agent_template != "do not use template":
            input_text = postprocess_text(agent_template, input_text)
    elif grounding:
        model_use = "vlm_grounding"
        if grounding_template:
            input_text = postprocess_text(grounding_template, input_text)

    prompt = input_text
    if grounding:
        prompt += "(with grounding)"

    print(f'request {model_use} model... with prompt {prompt}, grounding_template {grounding_template}, agent_template {agent_template}')
    data = json.dumps({
        'model_use': model_use,
        'is_grounding': grounding,
        'text': prompt,
        'history': result_text,
        'image': encoded_img,
        'temperature': temperature,
        'top_p': top_p,
        'top_k': top_k,
        'do_sample': True,
        'max_new_tokens': 2048
    })
    try:
        with ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(make_request, URL, headers, data)
            response = future.result()  # Blocks until the request is complete
    except Exception as e:
        # UI boundary: every request failure is logged and reported to the
        # user in the chat instead of propagating into Gradio.
        print("error message", e)
        if is_zh:
            result_text.append((input_text, '超时！请稍等几分钟再重试。'))
        else:
            result_text.append((input_text, 'Timeout! Please wait a few minutes and retry.'))
        return "", result_text, hidden_image
    print('request done...')

    try:
        answer = str(response['result'])
    except (KeyError, TypeError) as e:
        # The endpoint answered, but not with the expected {"result": ...} shape.
        print("malformed response", repr(e))
        if is_zh:
            result_text.append((input_text, '服务器返回异常！请稍后重试。'))
        else:
            result_text.append((input_text, 'Unexpected server response! Please retry later.'))
        return "", result_text, hidden_image

    if grounding:
        # NOTE(review): parse_response presumably renders the grounding result
        # to image_path_grounding - confirm against utils.
        parse_response(pil_img, answer, image_path_grounding)
        new_answer = answer.replace(input_text, "")
        result_text.append((input_text, new_answer))
        # Extra bot-only message carrying the image path as a 1-tuple.
        result_text.append((None, (image_path_grounding,)))
    else:
        result_text.append((input_text, answer))
    print(result_text)
    print('finished')
    return "", result_text, hidden_image


def clear_fn(value):
    """Return reset values for the text box, chat history and image.

    ``value`` is supplied by the Gradio event and is ignored. The three
    returned items are, in order: an empty prompt, the default greeting
    history, and ``None`` to clear the image.
    """
    cleared_prompt = ""
    cleared_image = None
    return cleared_prompt, default_chatbox, cleared_image

def clear_fn2(value):
    """Return the default greeting history; ``value`` is ignored."""
    greeting_history = default_chatbox
    return greeting_history


def main():
    """Build the Gradio UI, wire its events to the handlers, and launch it."""
    gr.close_all()

    # One JSON object per line. The encoding is explicit so the read does not
    # depend on the platform default, and blank lines are skipped so a
    # trailing newline cannot crash json.loads.
    with open("./examples/example_inputs.jsonl", encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    with gr.Blocks(css='style.css') as demo:

        gr.Markdown(DESCRIPTION)
        gr.Markdown(NOTES)

        with gr.Row():
            # Left column: prompt, image and generation controls.
            with gr.Column(scale=4.5):
                with gr.Group():
                    input_text = gr.Textbox(label='Input Text', placeholder='Please enter text prompt below and press ENTER.')
                    with gr.Row():
                        run_button = gr.Button('Generate')
                        clear_button = gr.Button('Clear')

                    image_prompt = gr.Image(type="filepath", label="Image Prompt", value=None)
                with gr.Row():
                    grounding = gr.Checkbox(label="Grounding")
                    cogagent = gr.Checkbox(label="CogAgent")
                with gr.Row():
                    grounding_template = gr.Dropdown(choices=template_grounding_cogvlm, label="Grounding Template", value=template_grounding_cogvlm[0])
                    agent_template = gr.Dropdown(choices=templates_agent_cogagent, label="Agent Template", value=templates_agent_cogagent[0])

                with gr.Row():
                    temperature = gr.Slider(maximum=1, value=0.9, minimum=0, label='Temperature')
                    top_p = gr.Slider(maximum=1, value=0.8, minimum=0, label='Top P')
                    top_k = gr.Slider(maximum=50, value=5, minimum=1, step=1, label='Top K')

            # Right column: conversation history plus a hidden field that
            # carries the image hash between turns.
            with gr.Column(scale=5.5):
                # Start from a copy of the same greeting the clear handlers restore.
                result_text = gr.components.Chatbot(label='Multi-round conversation History', value=list(default_chatbox), height=550)
                hidden_image_hash = gr.Textbox(visible=False)

        gr.Examples(examples=[[example["text"], example["image"], example["grounding"], example["cogagent"]] for example in examples],
                    inputs=[input_text, image_prompt, grounding, cogagent],
                    label="Example Inputs (Click to insert an examplet into the input box)",
                    examples_per_page=6)

        gr.Markdown(MAINTENANCE_NOTICE1)

        print(gr.__version__)

        # The Generate button and pressing ENTER in the text box both run
        # post() with identical wiring; the order must match post()'s signature.
        post_inputs = [input_text, temperature, top_p, top_k, image_prompt, result_text, hidden_image_hash, grounding, cogagent, grounding_template, agent_template]
        post_outputs = [input_text, result_text, hidden_image_hash]
        run_button.click(fn=post, inputs=post_inputs, outputs=post_outputs)
        input_text.submit(fn=post, inputs=post_inputs, outputs=post_outputs)

        # The clear handlers ignore their argument; the button is passed only
        # to satisfy their one-parameter signatures.
        clear_button.click(fn=clear_fn, inputs=clear_button, outputs=[input_text, result_text, image_prompt])
        # Uploading or removing an image resets the chat to the greeting.
        image_prompt.upload(fn=clear_fn2, inputs=clear_button, outputs=[result_text])
        image_prompt.clear(fn=clear_fn2, inputs=clear_button, outputs=[result_text])

    # NOTE(review): the `concurrency_count` argument is believed to exist only
    # in Gradio 3.x - confirm against the pinned Gradio version.
    demo.queue(concurrency_count=10)
    demo.launch()

# Launch the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()