from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import gradio as gr
import spaces
from PIL import Image

# Optional: load a ZipNN-compressed checkpoint instead (currently disabled)
# from zipnn import zipnn_hf
# zipnn_hf()
# model_id = "royleibov/Llama-3.2-11B-Vision-Instruct-ZipNN-Compressed"
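
# Load the base Llama 3.2 Vision checkpoint in bfloat16, sharded across the
# available devices, along with its processor (image preprocessing + tokenization).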
model_id = "unsloth/Llama-3.2-11B-Vision"

model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
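
# Assumption: the `spaces` import indicates this app runs on a Hugging Face
# ZeroGPU Space; if so, the inference handler needs the @spaces.GPU decorator
# so a GPU is allocated for each call.
@spaces.GPU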
def score_it(input_img):
    # Gradio delivers the upload as a numpy array; convert it to a PIL Image
    pil_image = Image.fromarray(input_img.astype('uint8'))
    image = pil_image.convert("RGB").resize((224, 224))
    # Base (non-instruct) Llama 3.2 Vision prompt format: image token first, then the text
    prompt = "<|image|><|begin_of_text|>this script says"
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=200)
    # Decode the generated ids, dropping special tokens from the displayed text
    return processor.decode(output[0], skip_special_tokens=True)
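
# Minimal Gradio UI: one image input, one text output.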
demo = gr.Interface(
    fn=score_it,
    title="Upload your English script and get the score",
    inputs=[gr.Image()],
    outputs=["text"],
    stop_btn="Stop Generation",
)

demo.launch(debug=True)