Spaces:

AIDC-AI
/

Ovis1.6-Gemma2-9B

Running on Zero

File size: 6,243 Bytes

42fea26
 
e16319b
 
2005ef8
42fea26
 
e16319b
 
42fea26
 
 
 
 
 
 
 
 
 
e16319b
42fea26
e16319b
42fea26
e16319b
 
 
 
42fea26
 
e16319b
42fea26
 
e16319b
 
 
42fea26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e16319b
 
 
 
 
 
 
 
 
 
 
 
 
135611f
2e22cc7
 
b116de8
2e22cc7
 
 
 
 
b116de8
42fea26
 
 
 
e16319b
 
 
 
46b9cd3
e16319b
 
 
 
f9a2bde
46b9cd3
 
 
 
 
e16319b
46b9cd3
 
 
e16319b
46b9cd3
 
 
e16319b
46b9cd3
 
 
e16319b
46b9cd3
 
 
e16319b
46b9cd3
 
 
e16319b
46b9cd3
 
 
e16319b
46b9cd3
 
42fea26
 
46b9cd3
42fea26
 
 
 
e16319b
b1ee704
 
 
e16319b
42fea26
 
 
e16319b
42fea26
 
 
 
2005ef8
e16319b
 
42fea26
2005ef8

import spaces
import os
import re
import time
import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from transformers import TextIteratorStreamer
from threading import Thread

# Hub repository id of the checkpoint this demo serves.
model_name = 'AIDC-AI/Ovis1.6-Gemma2-9B'

# Values that do not depend on the loaded model.
image_placeholder = '<image>'
cur_dir = os.path.dirname(os.path.abspath(__file__))

# load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    trust_remote_code=True,
)
model = model.to(device='cuda')

# Tokenizers are obtained from the model itself rather than loaded separately.
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)

def submit_chat(chatbot, text_input):
    """Queue the user's message as a new, not-yet-answered chat turn.

    Args:
        chatbot: Chat history as a list of ``[user_message, reply]`` pairs.
            ``None`` is treated as an empty history.
        text_input: The text the user just submitted.

    Returns:
        A ``(chatbot, '')`` tuple: the history with the new turn appended
        (a passed-in list is extended in place) and an empty string that is
        used to clear the prompt textbox.
    """
    if chatbot is None:
        chatbot = []
    # The turn is a list, not a tuple: ovis_chat fills in the reply by
    # assigning to chatbot[-1][1], which requires a mutable pair.
    chatbot.append([text_input, ''])
    return chatbot, ''

@spaces.GPU
def ovis_chat(chatbot, image_input):
    """Stream the model's reply to the newest user turn into ``chatbot``.

    This is a generator: after every chunk of generated text it yields the
    chat history with the reply of the last turn updated to the text
    produced so far.

    Args:
        chatbot: Chat history as a sequence of ``(user_message, reply)``
            pairs. The last pair is the pending turn whose reply is filled
            in here; all earlier pairs are replayed to the model as context.
        image_input: Image to attach to the conversation, or ``None`` for a
            text-only chat. It is passed to ``model.preprocess_inputs`` as-is.

    Yields:
        The same ``chatbot`` list, with its last entry replaced by
        ``[user_message, partial_reply]``.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread is
            re-raised here once streaming has stopped.
    """
    if not chatbot:
        # No pending turn to answer; hand the history back unchanged.
        yield chatbot
        return

    # preprocess inputs
    conversations = []
    for query, answer in chatbot[:-1]:
        conversations.append({
            "from": "human",
            "value": query
        })
        conversations.append({
            "from": "gpt",
            "value": answer
        })
    # Strip any user-typed placeholder; the real one is added below.
    text_input = chatbot[-1][0].replace(image_placeholder, '')
    conversations.append({
        "from": "human",
        "value": text_input
    })
    if image_input is not None:
        # The image placeholder is always attached to the first human turn.
        conversations[0]["value"] = image_placeholder + '\n' + conversations[0]["value"]
    _, input_ids, pixel_values = model.preprocess_inputs(conversations, [image_input])
    attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
    input_ids = input_ids.unsqueeze(0).to(device=model.device)
    attention_mask = attention_mask.unsqueeze(0).to(device=model.device)
    if image_input is None:
        pixel_values = [None]
    else:
        pixel_values = [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)]

    gen_kwargs = dict(
        max_new_tokens=512,
        do_sample=False,
        top_p=None,
        top_k=None,
        temperature=None,
        repetition_penalty=None,
        eos_token_id=model.generation_config.eos_token_id,
        pad_token_id=text_tokenizer.pad_token_id,
        use_cache=True
    )

    # A streamer per request: a single shared streamer would mix the tokens
    # of overlapping requests into one queue.
    request_streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {"inputs": input_ids,
                       "pixel_values": pixel_values,
                       "attention_mask": attention_mask,
                       "streamer": request_streamer,
                       **gen_kwargs}
    worker_errors = []

    def _generate():
        # Run generation in the worker thread and report failures to the consumer.
        try:
            # Grad modes are thread-local, so inference mode has to be
            # entered in the thread that actually runs generate().
            with torch.inference_mode():
                model.generate(**generate_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            # Push the end-of-stream marker so the loop below cannot block forever.
            request_streamer.end()

    response = ""
    thread = Thread(target=_generate)
    thread.start()
    for new_text in request_streamer:
        response += new_text
        # Replace the pair instead of assigning into it so tuple entries work too.
        chatbot[-1] = [chatbot[-1][0], response]
        yield chatbot
    thread.join()
    if worker_errors:
        raise worker_errors[0]
    # debug
    print('*'*60)
    print('*'*60)
    print('OVIS_CONV_START')
    for i, (request, answer) in enumerate(chatbot[:-1], 1):
        print(f'Q{i}:\n {request}')
        print(f'A{i}:\n {answer}')
    print('New_Q:\n', text_input)
    print('New_A:\n', response)
    print('OVIS_CONV_END')

def clear_chat():
    """Return the reset values for the chat history, image and prompt.

    Returns:
        A 3-tuple ``([], None, "")``: a fresh empty history, no image, and
        an empty prompt string, in that order.
    """
    empty_history = []
    no_image = None
    blank_prompt = ""
    return empty_history, no_image, blank_prompt

# Height given to the inline logo; the header text uses the same font size.
font_size = "2.5em"

# Read the logo and, in the same step, add a height and inline-alignment
# style to every opening <svg ...> tag so it sits on the text baseline.
with open(f"{cur_dir}/resource/logo.svg", mode="r", encoding="utf-8") as svg_file:
    svg_content = re.sub(
        r'(<svg[^>]*)(>)',
        rf'\1 height="{font_size}" style="vertical-align: middle; display: inline-block;"\2',
        svg_file.read(),
    )

# Header markup: logo next to the model's short name, then the project links.
html = f"""
<p align="center" style="font-size: {font_size}; line-height: 1;">
    <span style="display: inline-block; vertical-align: middle;">{svg_content}</span>
    <span style="display: inline-block; vertical-align: middle;">{model_name.split('/')[-1]}</span>
</p>
<center><font size=3><b>Ovis</b> has been open-sourced on <a href='https://huggingface.co/{model_name}'>😊 Huggingface</a> and <a href='https://github.com/AIDC-AI/Ovis'>🌟 GitHub</a>. If you find Ovis useful, a like❤️ or a star🌟 would be appreciated.</font></center>
"""

# Delimiter pairs handed to the chat window via its `latex_delimiters`
# argument. Each entry holds the left/right delimiter strings and a
# `display` flag, which is False only for the `\(...\)` pair.
latex_delimiters_set = [
    {"left": opening, "right": closing, "display": is_display}
    for opening, closing, is_display in (
        ("\\(", "\\)", False),
        ("\\begin{equation}", "\\end{equation}", True),
        ("\\begin{align}", "\\end{align}", True),
        ("\\begin{alignat}", "\\end{alignat}", True),
        ("\\begin{gather}", "\\end{gather}", True),
        ("\\begin{CD}", "\\end{CD}", True),
        ("\\[", "\\]", True),
    )
]

# The prompt textbox is created before the Blocks layout so that gr.Examples
# (left column) can reference it; it is placed in the right column later via
# text_input.render().
text_input = gr.Textbox(label="prompt", placeholder="Enter your text here...", lines=1, container=False)
with gr.Blocks(title=model_name.split('/')[-1]) as demo:
    # Header with logo, model name and project links.
    gr.HTML(html)
    with gr.Row():
        # Left column: image upload plus example (image, prompt) pairs.
        with gr.Column(scale=3):
            image_input = gr.Image(label="image", height=350, type="pil")
            gr.Examples(
                examples=[
                    [f"{cur_dir}/examples/case0.png", "Find the area of the shaded region."],
                    [f"{cur_dir}/examples/case1.png", "explain this model to me."],
                    [f"{cur_dir}/examples/case2.png", "What is net profit margin as a percentage of total revenue?"],
                ],
                inputs=[image_input, text_input]
            )
        # Right column: chat window, prompt box and the Send/Clear buttons.
        with gr.Column(scale=7):
            chatbot = gr.Chatbot(label="Ovis", layout="panel", height=600, show_copy_button=True, latex_delimiters=latex_delimiters_set)
            text_input.render()
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # Both the Send button and submitting the textbox run the same two-step
    # chain: submit_chat appends the new turn and clears the prompt, then
    # ovis_chat streams the reply into the chat window.
    send_click_event = send_btn.click(submit_chat, [chatbot, text_input], [chatbot, text_input]).then(ovis_chat,[chatbot, image_input],chatbot)
    submit_event = text_input.submit(submit_chat, [chatbot, text_input], [chatbot, text_input]).then(ovis_chat,[chatbot, image_input],chatbot)
    # Clear resets the chat history, the image and the prompt together.
    clear_btn.click(clear_chat, outputs=[chatbot, image_input, text_input])

# Start the app when this module is executed (there is no __main__ guard).
demo.launch()