from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from threading import Thread
import gradio as gr
import spaces
from PIL import Image

# Optional: load a ZipNN-compressed checkpoint instead of the plain one.
# from zipnn import zipnn_hf
# zipnn_hf()
# model_id = "royleibov/Llama-3.2-11B-Vision-Instruct-ZipNN-Compressed"
model_id = "unsloth/Llama-3.2-11B-Vision"

model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


@spaces.GPU
def score_it(input_img):
    # Gradio passes the uploaded image as a numpy array; convert it to PIL.
    # No manual resize is needed: the Mllama processor handles resizing and
    # tiling to the resolution the vision encoder expects.
    image = Image.fromarray(input_img.astype("uint8")).convert("RGB")

    # Prompt format for the base (non-instruct) Llama 3.2 Vision model:
    # the <|image|> token marks where the image features are inserted.
    prompt = "<|begin_of_text|><|image|>this script says"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=200)
    # Decode only the newly generated tokens, skipping the echoed prompt
    # and special tokens.
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    return processor.decode(generated_tokens, skip_special_tokens=True)


demo = gr.Interface(
    fn=score_it,
    title="Upload your English script and get the score",
    inputs=[gr.Image()],
    outputs=["text"],
    stop_btn="Stop Generation",
)
demo.launch(debug=True)
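

# --- Optional streaming variant (sketch) -----------------------------------
# The otherwise unused `Thread` import and the "Stop Generation" button
# suggest token streaming was intended. Below is a minimal sketch using
# transformers' TextIteratorStreamer: generate() runs on a background thread
# while a Gradio generator function yields partial text. `score_it_streaming`
# is a hypothetical name, not part of the original app; to use it, define it
# above the gr.Interface call (demo.launch(debug=True) blocks, so code placed
# here never runs while the app is serving) and pass fn=score_it_streaming.
from transformers import TextIteratorStreamer


@spaces.GPU
def score_it_streaming(input_img):
    image = Image.fromarray(input_img.astype("uint8")).convert("RGB")
    prompt = "<|begin_of_text|><|image|>this script says"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

    # The streamer yields decoded text chunks as generate() produces tokens.
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=200),
    )
    thread.start()

    text = ""
    for chunk in streamer:
        text += chunk
        yield text  # Gradio re-renders the output textbox on each yield
    thread.join()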