Spaces:

not-lain
/

Ovis1.6-Gemma2-9B

Paused

File size: 5,656 Bytes

import spaces
import os
import re
import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from transformers import TextIteratorStreamer
from threading import Thread
from PIL import Image

model_name = 'AIDC-AI/Ovis1.6-Gemma2-9B'

# load model
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             torch_dtype=torch.bfloat16,
                                             multimodal_max_length=8192,
                                             trust_remote_code=True).to(device='cuda')
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
image_placeholder = '<image>'
cur_dir = os.path.dirname(os.path.abspath(__file__))


@spaces.GPU
def ovis_chat(message, history):
    try :
        image_input = Image.open(message["files"][0]).convert("RGB")
    except : 
        image_input = None
    # preprocess inputs
    conversations = []
    response = ""
    text_input = message["text"]
    for msg in history:
        # case history entry pair only has text
        if isinstance(msg[0],str):
            conversations.append({
                "from": "human",
                "value": msg[0]
            })
            conversations.append({
                "from": "gpt",
                "value": msg[1]
            })
        # case history pair has an image
        elif isinstance(msg[0],tuple): 
            # case user did not pass a new image
            # we override the none with the new image
            if image_input is None : 
                # if someone uploads a file and not an image this will break
                image_input = Image.open(msg[0][0]).convert("RGB")
            # we always process the text
            conversations.append({
                "from": "human",
                "value": msg[1][0]
            })
            conversations.append({
                "from": "gpt",
                "value": msg[1][1]
            })

    text_input = text_input.replace(image_placeholder, '')
    conversations.append({
        "from": "human",
        "value": text_input
    })
    if image_input is not None:
        conversations[0]["value"] = image_placeholder + '\n' + conversations[0]["value"]
    prompt, input_ids, pixel_values = model.preprocess_inputs(conversations, [image_input])
    attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
    input_ids = input_ids.unsqueeze(0).to(device=model.device)
    attention_mask = attention_mask.unsqueeze(0).to(device=model.device)
    if image_input is None:
        pixel_values = [None]
    else:
        pixel_values = [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)]

    with torch.inference_mode():
        gen_kwargs = dict(
            max_new_tokens=512,
            do_sample=False,
            top_p=None,
            top_k=None,
            temperature=None,
            repetition_penalty=None,
            eos_token_id=model.generation_config.eos_token_id,
            pad_token_id=text_tokenizer.pad_token_id,
            use_cache=True
        )
    response = ""
    thread = Thread(target=model.generate, 
                kwargs={"inputs": input_ids,
                        "pixel_values": pixel_values,
                        "attention_mask": attention_mask,
                        "streamer": streamer,
                        **gen_kwargs})
    thread.start()
    for new_text in streamer:
        response += new_text
        yield response
    thread.join()

def clear_chat():
    return [], None, ""

with open(f"{cur_dir}/resource/logo.svg", "r", encoding="utf-8") as svg_file:
    svg_content = svg_file.read()
font_size = "2.5em"
svg_content = re.sub(r'(<svg[^>]*)(>)', rf'\1 height="{font_size}" style="vertical-align: middle; display: inline-block;"\2', svg_content)
html = f"""
<p align="center" style="font-size: {font_size}; line-height: 1;">
    <span style="display: inline-block; vertical-align: middle;">{svg_content}</span>
    <span style="display: inline-block; vertical-align: middle;">{model_name.split('/')[-1]}</span>
</p>
<center><font size=3><b>Ovis</b> has been open-sourced on <a href='https://huggingface.co/{model_name}'>😊 Huggingface</a> and <a href='https://github.com/AIDC-AI/Ovis'>🌟 GitHub</a>. If you find Ovis useful, a like❤️ or a star🌟 would be appreciated.</font></center>
"""

latex_delimiters_set = [{
        "left": "\\(",
        "right": "\\)",
        "display": False 
    }, {
        "left": "\\begin{equation}",
        "right": "\\end{equation}",
        "display": True 
    }, {
        "left": "\\begin{align}",
        "right": "\\end{align}",
        "display": True
    }, {
        "left": "\\begin{alignat}",
        "right": "\\end{alignat}",
        "display": True
    }, {
        "left": "\\begin{gather}",
        "right": "\\end{gather}",
        "display": True
    }, {
        "left": "\\begin{CD}",
        "right": "\\end{CD}",
        "display": True
    }, {
        "left": "\\[",
        "right": "\\]",
        "display": True
    }]

#     send_click_event = send_btn.click(submit_chat, [chatbot, text_input], [chatbot, text_input]).then(ovis_chat,[chatbot, image_input],chatbot)
#     submit_event = text_input.submit(submit_chat, [chatbot, text_input], [chatbot, text_input]).then(ovis_chat,[chatbot, image_input],chatbot)
#     clear_btn.click(clear_chat, outputs=[chatbot, image_input, text_input])

demo = gr.ChatInterface(fn=ovis_chat, textbox=gr.MultimodalTextbox(),multimodal=True)
demo.launch(debug=True)