import time

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "microsoft/Phi-3.5-vision-instruct"

# Note: set _attn_implementation='eager' if you don't have flash_attn installed.
# device_map="cpu" pins the whole model to the CPU (this Space runs on free CPU
# hardware); loading with device_map="auto" and then calling model.to("cpu") is
# unreliable once accelerate has already dispatched the weights.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    _attn_implementation="eager",
)

# For best performance, use num_crops=4 for multi-frame tasks and num_crops=16
# for single-frame tasks.
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
    num_crops=4,
)

# Phi-3.5-vision chat-template markers.
user_prompt = "<|user|>\n"
assistant_prompt = "<|assistant|>\n"
prompt_suffix = "<|end|>\n"
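# If GPU hardware were available, the model card's recommended setup could be
# used instead. A minimal, hypothetical sketch, not exercised by this CPU-only
# Space (flash_attention_2 requires the flash_attn package):
#
#   if torch.cuda.is_available():
#       model = AutoModelForCausalLM.from_pretrained(
#           model_id,
#           device_map="cuda",
#           trust_remote_code=True,
#           torch_dtype=torch.bfloat16,
#           _attn_implementation="flash_attention_2",
#       )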

title_html = """
<p>This Space uses the <a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct">microsoft/Phi-3.5-vision-instruct</a> model.</p>
"""

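# Each request is a single-turn, single-image prompt built from the markers
# defined above:
#   <|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n
# The processor replaces the <|image_1|> placeholder with the image tokens.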
""" def call_model(raw_image = None, text_input = None): prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}" image = raw_image.convert("RGB") inputs = processor(prompt, image, return_tensors = "pt").to("cpu:0") generate_ids = model.generate(**inputs, max_new_tokens = 1000, eos_token_id = processor.tokenizer.eos_token_id, ) generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] response = processor.batch_decode(generate_ids, skip_special_tokens = True, clean_up_tokenization_spaces = False)[0] return response def get_model_memory_footprint(model_): footprint = model_.get_memory_footprint() return f"Footprint of the model in MBs: {footprint / 1e+6}Mb" def process(raw_image, prompt): print("start...") start_time = time.time() memory_usage = get_model_memory_footprint(model) model_response = call_model(raw_image = raw_image, text_input = prompt) end_time = time.time() execution_time = end_time - start_time execution_time_min = round((execution_time / 60), 2) print(f"Execution time: {execution_time:.4f} seconds") print(f"Execution time: {execution_time_min:.2f} min") return memory_usage, model_response, execution_time_min with gr.Blocks() as demo: gr.HTML(title_html) gr.Markdown(""" NOTES : - The performance of this model is low since it runs on a CPU and a free space, it takes 1min minimum !. - If the input text in not specified the model will describe the image, that will take more time """) with gr.Row(): with gr.Column(): _raw_image = gr.Image(type = 'pil') user_input = gr.Textbox(label = "What do you want to ask?") submit_btn = gr.Button(value = "Submit") with gr.Column(): memory = gr.Textbox(label = "Memory usage") results = gr.Textbox(label = "Model response") exec_time = gr.Textbox(label = "Execution time (min)") submit_btn.click( process, inputs = [_raw_image, user_input], outputs = [memory, results, exec_time] ) gr.Examples( examples=[ ["assets/img.jpg", 'after you can split horizontally the image into 6 rows, extract all text into JSON format. ignore "Au-dessous de Normal" and "Au-dessus de Normal"'], ["assets/cats.jpg", 'how many cats are here? and what are they doing ?'], ["assets/demo.jpg", 'is it night time ?'], ], inputs=[_raw_image, user_input], outputs=[memory, results, exec_time], fn=process, label="Examples", ) if __name__ == '__main__': demo.launch()