from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from threading import Thread
import gradio as gr
import spaces
from PIL import Image

# Optional: load a ZipNN-compressed copy of the checkpoint instead of the stock weights.
# from zipnn import zipnn_hf
# zipnn_hf()
# model_id = "royleibov/Llama-3.2-11B-Vision-Instruct-ZipNN-Compressed"

model_id = "unsloth/Llama-3.2-11B-Vision"
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

@spaces.GPU
def score_it(input_img):
    # Gradio passes the uploaded image as a numpy array; convert it to a PIL Image.
    pil_image = Image.fromarray(input_img.astype("uint8"))
    image = pil_image.convert("RGB").resize((224, 224))
    # Base Llama 3.2 Vision prompt format: the image token comes before the text.
    prompt = "<|image|><|begin_of_text|>this script says"
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=200)
    return processor.decode(output[0])
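
# --- Optional streaming variant (a sketch, not part of the original app) ---
# The file imports Thread but never uses it, which suggests token streaming was
# intended. The sketch below assumes transformers' TextIteratorStreamer plus a
# background generation thread; to try it, pass fn=score_it_streaming to
# gr.Interface below instead of fn=score_it (the function name is hypothetical).
from transformers import TextIteratorStreamer

@spaces.GPU
def score_it_streaming(input_img):
    pil_image = Image.fromarray(input_img.astype("uint8"))
    image = pil_image.convert("RGB").resize((224, 224))
    prompt = "<|image|><|begin_of_text|>this script says"
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)
    # skip_prompt drops the echoed prompt; skip_special_tokens cleans the text.
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=200),
    )
    thread.start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial  # Gradio streams output from generator functions
    thread.join()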

demo = gr.Interface(
    fn=score_it,
    title="Upload your English script and get the score",
    inputs=[gr.Image()],
    outputs=["text"],
    stop_btn="Stop Generation",
)

demo.launch(debug=True)