import time

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "microsoft/Phi-3.5-vision-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",  # the Space is CPU-only; load the weights directly on CPU
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    _attn_implementation="eager",  # flash-attention is not available on CPU
)
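
# Note: loading with device_map="auto" and then calling model.to("cpu") can
# fail, since accelerate may refuse to move a dispatched model; loading
# directly on CPU sidesteps that. bfloat16 halves the weight memory, though
# float32 kernels may actually run faster on some CPUs.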

processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
    num_crops=4,
)
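
# The Phi-3.5-vision model card suggests num_crops=4 for multi-frame inputs
# and 16 for single-frame; 4 keeps image preprocessing lighter for this CPU demo.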

user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"
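
# These markers follow the Phi-3.5 chat template; the single-image prompt
# assembled in call_model() expands to:
#   <|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n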

title_html = """
<h2>This Space uses the microsoft/Phi-3.5-vision-instruct model</h2>
"""


def call_model(raw_image=None, text_input=None):
    # Guard against a submit with no image, which would otherwise crash.
    if raw_image is None:
        return "Please upload an image first."
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    image = raw_image.convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cpu")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    # Drop the prompt tokens so only the newly generated answer is decoded.
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response
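
# Example (uses one of the bundled example assets; requires PIL.Image):
#   call_model(Image.open("assets/cats.jpg"), "how many cats are here?")
# returns the decoded answer as a plain string.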


def get_model_memory_footprint(model_):
    footprint = model_.get_memory_footprint()
    return f"Model memory footprint: {footprint / 1e6:.2f} MB"


def process(raw_image, prompt):
    print("start...")
    start_time = time.time()
    memory_usage = get_model_memory_footprint(model)
    model_response = call_model(raw_image=raw_image, text_input=prompt)
    end_time = time.time()
    execution_time = end_time - start_time
    execution_time_min = round(execution_time / 60, 2)
    print(f"Execution time: {execution_time:.4f} seconds")
    print(f"Execution time: {execution_time_min:.2f} min")
    return memory_usage, model_response, execution_time_min
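
# Note: the footprint reflects the loaded weights only; it does not capture
# activation memory allocated during generation.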


with gr.Blocks() as demo:
    gr.HTML(title_html)
    gr.Markdown("""
    NOTES:
    - Performance is low because the model runs on CPU in a free Space; expect each request to take at least 1 minute.
    - If no input text is provided, the model will describe the image, which takes even longer.
    """)

    with gr.Row():
        with gr.Column():
            _raw_image = gr.Image(type='pil')
            user_input = gr.Textbox(label="What do you want to ask?")
            submit_btn = gr.Button(value="Submit")
        with gr.Column():
            memory = gr.Textbox(label="Memory usage")
            results = gr.Textbox(label="Model response")
            exec_time = gr.Textbox(label="Execution time (min)")

    submit_btn.click(
        process, inputs=[_raw_image, user_input], outputs=[memory, results, exec_time]
    )

    gr.Examples(
        examples=[
            ["assets/img.jpg", 'split the image horizontally into 6 rows, then extract all the text into JSON format. Ignore "Au-dessous de Normal" and "Au-dessus de Normal"'],
            ["assets/cats.jpg", 'how many cats are here, and what are they doing?'],
            ["assets/demo.jpg", 'is it night time?'],
        ],
        inputs=[_raw_image, user_input],
        outputs=[memory, results, exec_time],
        fn=process,
        label="Examples",
    )


if __name__ == '__main__':
    demo.launch()