Harshithtd committed
Commit 6df7939
1 Parent(s): 33cf200

Create app.py

Files changed (1)
  1. app.py +85 -0
app.py ADDED
@@ -0,0 +1,85 @@
+ from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
+ from PIL import Image
+ import torch
+ from threading import Thread
+ import gradio as gr
+ from gradio import FileData
+ import time
+
+ ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+ # Load model and processor on CPU by default (no .to("cuda") anywhere)
+ model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16)
+ processor = AutoProcessor.from_pretrained(ckpt)
+
+ # CPU-only build: the ZeroGPU @spaces.GPU decorator is intentionally not applied
+ def bot_streaming(message, history, max_new_tokens=250):
+     txt = message["text"]
+     ext_buffer = f"{txt}"
+
+     messages = []
+     images = []
+
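+     # Rebuild earlier chat turns into the chat-template message format, collecting any images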
+     for i, msg in enumerate(history):
+         if isinstance(msg[0], tuple):
+             messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
+             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
+             images.append(Image.open(msg[0][0]).convert("RGB"))
+         elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+             pass
+         elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):
+             messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
+             messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
+
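+     # Attach the current user message, plus the uploaded image if one was provided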
+     if len(message["files"]) == 1:
+         if isinstance(message["files"][0], str):
+             image = Image.open(message["files"][0]).convert("RGB")
+         else:
+             image = Image.open(message["files"][0]["path"]).convert("RGB")
+         images.append(image)
+         messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
+     else:
+         messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
+
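+     # Render the conversation with the processor's chat template before tokenizing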
+     texts = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+     if images == []:
+         inputs = processor(text=texts, return_tensors="pt")  # No .to("cuda"), stays on CPU
+     else:
+         inputs = processor(text=texts, images=images, return_tensors="pt")  # No .to("cuda"), stays on CPU
+
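+     # Generate in a background thread; the streamer yields decoded tokens as they are produced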
+     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
+     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+     generated_text = ""
+
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+
+     for new_text in streamer:
+         buffer += new_text
+         generated_text_without_prompt = buffer
+         time.sleep(0.01)
+         yield buffer
+
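+ # Gradio chat UI: the multimodal textbox accepts text plus an optional image upload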
+ demo = gr.ChatInterface(
+     fn=bot_streaming,
+     title="Multimodal Llama",
+     examples=[],  # No examples provided
+     textbox=gr.MultimodalTextbox(),
+     additional_inputs=[gr.Slider(
+         minimum=10,
+         maximum=500,
+         value=250,
+         step=10,
+         label="Maximum number of new tokens to generate",
+     )],
+     cache_examples=False,
+     description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32).",
+     stop_btn="Stop Generation",
+     fill_height=True,
+     multimodal=True,
+ )
+
+ demo.launch(debug=True)