rockyyyyyy committed on
Commit
613bcbb
1 Parent(s): b54b5e1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -0
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import torch
3
+ from PIL import Image
4
+ import gradio as gr
5
+ from transformers import AutoModelForCausalLM, AutoProcessor
6
+
7
# Checkpoint identifier shared by the model weights and the processor.
model_id_or_path = "rhymes-ai/Aria"

# Both loaders run with trust_remote_code=True, i.e. they are allowed to
# execute Python code shipped inside the checkpoint repository.
model = AutoModelForCausalLM.from_pretrained(
    model_id_or_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(
    model_id_or_path,
    trust_remote_code=True,
)
11
+
12
def _load_image(source, timeout=30):
    """Turn *source* into something the processor can consume.

    Args:
        source: A local file path, an ``http(s)://`` URL, or an
            already-decoded image object.
        timeout: Seconds to wait for a remote image before giving up.

    Returns:
        An RGB ``PIL.Image`` when *source* is a string; otherwise *source*
        itself, untouched.

    Raises:
        requests.RequestException: The URL could not be fetched or the
            server answered with an error status.
        OSError: The file or payload is not a readable image.
    """
    import io  # local import so this block does not depend on the file header

    if not isinstance(source, str):
        # Already-decoded inputs are passed through exactly as before.
        return source

    if source.lower().startswith(("http://", "https://")):
        response = requests.get(source, timeout=timeout)
        response.raise_for_status()
        # Decode from the fully downloaded body rather than the raw socket
        # stream, so the connection is released before decoding starts.
        source = io.BytesIO(response.content)

    # Anything that is not a URL is a local path (this is what the Gradio
    # component configured with type="filepath" hands over).
    with Image.open(source) as img:
        # convert() returns a fully loaded copy, so the file handle can be
        # closed on exit; it also normalises palette/alpha/grayscale to RGB.
        return img.convert("RGB")


def generate_response(image):
    """Ask the model "what is the image?" and return its answer.

    Args:
        image: Local file path, ``http(s)://`` URL, or an already-decoded
            image. ``None`` (nothing uploaded) is rejected.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are sliced off before decoding).

    Raises:
        gr.Error: No image was provided.
        requests.RequestException: A remote image could not be downloaded.
        OSError: The image could not be opened or decoded.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    image = _load_image(image)

    # Single-turn chat: one image placeholder followed by the question.
    messages = [
        {
            "role": "user",
            "content": [
                {"text": None, "type": "image"},
                {"text": "what is the image?", "type": "text"},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=text, images=image, return_tensors="pt")

    # Match the pixel tensor to the model's dtype, then move every tensor to
    # the model's device.
    inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # torch.autocast("cuda", ...) is the non-deprecated spelling of
    # torch.cuda.amp.autocast(...).
    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        output = model.generate(
            **inputs,
            max_new_tokens=500,
            stop_strings=["<|im_end|>"],
            tokenizer=processor.tokenizer,
            do_sample=True,
            temperature=0.9,
        )

    # Keep only the tokens produced after the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    output_ids = output[0][prompt_length:]
    return processor.decode(output_ids, skip_special_tokens=True)
50
+
51
# Gradio interface: one image upload in, one text box out.
iface = gr.Interface(
    fn=generate_response,
    # Use the top-level gr.Image component; the legacy gr.inputs namespace
    # is not available in current Gradio releases. type="filepath" makes the
    # callback receive the upload as a path string.
    inputs=gr.Image(type="filepath"),
    outputs="text",
    title="Image-to-Text Model",
    description="Upload an image, and the model will describe it.",
)

# Launch the app
iface.launch()