File size: 1,287 Bytes
1687073
c0d1625
56427b4
5bb878a
9b8bf99
38fb884
d9df1f9
53a41f9
1687073
53a41f9
1687073
53a41f9
1687073
 
5bb878a
1687073
c0d1625
d398fa0
76209e3
c0d1625
 
5bb878a
 
9b8bf99
56427b4
38fb884
 
 
 
 
5bb878a
4d30ab9
56427b4
5bb878a
76209e3
01d243d
 
5bb878a
 
73bdba2
56427b4
 
 
 
5bb878a
56427b4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from transformers import AutoProcessor, AutoModelForImageTextToText, MllamaForConditionalGeneration
import torch
from threading import Thread
import gradio as gr
import spaces
from PIL import Image


# from zipnn import zipnn_hf

# zipnn_hf()

# model_id = "royleibov/Llama-3.2-11B-Vision-Instruct-ZipNN-Compressed"
# Checkpoint to serve: Llama 3.2 11B base vision model (unsloth mirror).
model_id = "unsloth/Llama-3.2-11B-Vision"

# Load in bfloat16 and let accelerate shard/place the weights automatically.
_load_opts = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = AutoModelForImageTextToText.from_pretrained(model_id, **_load_opts)

# Matching processor: handles image preprocessing and text tokenization.
processor = AutoProcessor.from_pretrained(model_id)


@spaces.GPU
def score_it(input_img):
    """Run the vision model on an uploaded script image and return its text.

    Args:
        input_img: Image from the Gradio ``gr.Image()`` widget as a numpy
            array (H x W x C), or ``None`` when the user submits without
            uploading an image.

    Returns:
        str: The full decoded model output (prompt tokens included, as
        before — ``processor.decode`` is called without
        ``skip_special_tokens`` to preserve the original output shape).
    """
    # Gradio passes None when no image was uploaded; the original code
    # crashed here with an AttributeError on .astype.
    if input_img is None:
        return "Please upload an image first."

    # Convert the numpy array to a PIL Image, force 3-channel RGB, and
    # resize to 224x224 (the size this demo was written against).
    pil_image = Image.fromarray(input_img.astype('uint8'))
    image = pil_image.convert("RGB").resize((224, 224))

    # Base (non-instruct) Llama 3.2 Vision prompt format per the model
    # card: the image token comes first and must be properly closed.
    # The original "<|image|" (missing "|>") was tokenized as plain text,
    # so no image token reached the model and the image was ignored.
    prompt = "<|image|><|begin_of_text|>this script says"
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=200)
    return processor.decode(output[0])


# Minimal Gradio UI: a single image input wired to score_it, with the raw
# decoded text shown as the output.
demo = gr.Interface(
    fn=score_it,
    inputs=[gr.Image()],
    outputs=["text"],
    title="Upload your English script and get the score",
    stop_btn="Stop Generation",
)

# debug=True surfaces server-side tracebacks in the console while running.
demo.launch(debug=True)