teowu committed on
Commit
96d7606
1 Parent(s): 5e53de8
Files changed (2)
  1. app.py +179 -63
  2. requirements.txt +10 -1
app.py CHANGED
@@ -1,64 +1,180 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
-
- if __name__ == "__main__":
-     demo.launch()
+ from transformers import AutoModelForCausalLM, AutoProcessor, TextIteratorStreamer
+ from threading import Thread
+ import time
+ import numpy as np
+ import torch
+ from PIL import Image
+ import spaces
+ import subprocess
+ # Install flash-attn at runtime, skipping the CUDA kernel build
+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+ from decord import VideoReader
+ from decord import cpu
+
+ def load_video(video_path, frames=32):
+     """
+     Load a video and extract a specified number of frames as PIL.Image objects.
+
+     Parameters:
+     - video_path (str): Path to the video file.
+     - frames (int): Number of frames to extract.
+
+     Returns:
+     - List[PIL.Image]: A list of PIL.Image objects for the extracted frames.
+     """
+     # Initialize VideoReader
+     vr = VideoReader(video_path, ctx=cpu())
+     total_frames = len(vr)
+
+     # Select frame indices evenly spaced throughout the video
+     frame_indices = np.linspace(0, total_frames - 1, frames, dtype=int)
+
+     # Extract frames and convert to PIL.Images
+     images = []
+     for idx in frame_indices:
+         frame = vr[idx]  # Get the frame as a NumPy array
+         image = Image.fromarray(frame.asnumpy())  # Convert to PIL.Image
+         images.append(image)
+
+     return images
+
+ model_id_or_path = "teowu/Aria-Chat-Preview"
+
+ model = AutoModelForCausalLM.from_pretrained(model_id_or_path, device_map="auto", torch_dtype=torch.bfloat16,
+                                              trust_remote_code=True)
+
+ processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True)
+
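+ # Single-turn chat handler for the Gradio interface: it packs the uploaded images (or sampled
+ # video frames) and the text query into a chat prompt, then streams the model's reply.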
+ @spaces.GPU
+ def model_inference(
+     input_dict, history, decoding_strategy, temperature, max_new_tokens, top_p
+ ):
+     text = input_dict["text"]
+     print(input_dict["files"])
+     # Collect the visual inputs: several images, a single video (sampled into frames), a single image, or none
+     if len(input_dict["files"]) > 1:
+         images = [Image.open(image).convert("RGB") for image in input_dict["files"]]
+     elif len(input_dict["files"]) == 1:
+         if input_dict["files"][0].endswith(".mp4") or input_dict["files"][0].endswith(".avi"):
+             images = load_video(input_dict["files"][0])
+         else:
+             images = [Image.open(input_dict["files"][0]).convert("RGB")]
+     else:
+         images = []
+
+     if text == "" and not images:
+         raise gr.Error("Please input a query and optionally image(s).")
+
+     if text == "" and images:
+         text = "Please provide a detailed description."
+
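+     # Build the chat-format prompt: one image placeholder per image/frame, followed by the text query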
+     resulting_messages = [
+         {
+             "role": "user",
+             "content": [{"type": "image", "text": None} for _ in range(len(images))] + [
+                 {"type": "text", "text": "\n" + text}
+             ]
+         }
+     ]
+     prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
+     inputs = processor(text=prompt, images=images, return_tensors="pt")
+     inputs = {k: v.to("cuda") for k, v in inputs.items()}
+     generation_args = {
+         "max_new_tokens": max_new_tokens,
+     }
+
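+     # Map the selected decoding strategy onto generate() keyword arguments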
+     assert decoding_strategy in [
+         "Greedy",
+         "Top P Sampling",
+     ]
+     if decoding_strategy == "Greedy":
+         generation_args["do_sample"] = False
+     elif decoding_strategy == "Top P Sampling":
+         generation_args["temperature"] = temperature
+         generation_args["do_sample"] = True
+         generation_args["top_p"] = top_p
+
+     generation_args.update(inputs)
+     # Generate in a background thread and stream partial output back to the UI
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_args["streamer"] = streamer
+
+     thread = Thread(target=model.generate, kwargs=generation_args)
+     thread.start()
+
+     yield "..."
+     buffer = ""
+
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer
+
+
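+ # Each example supplies the multimodal message plus values for the additional inputs,
+ # in order: decoding strategy, sampling temperature, max new tokens, and top-p.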
+ examples = [
+     [{"text": "What art era do these artpieces belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}, "Greedy", 0.4, 512, 0.8],
+     [{"text": "I'm planning a visit to this temple, give me travel tips.", "files": ["example_images/examples_wat_arun.jpg"]}, "Greedy", 0.4, 512, 0.8],
+     [{"text": "What is the due date and the invoice date?", "files": ["example_images/examples_invoice.png"]}, "Greedy", 0.4, 512, 0.8],
+     [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}, "Greedy", 0.4, 512, 0.8],
+     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}, "Greedy", 0.4, 512, 0.8],
+ ]
+ demo = gr.ChatInterface(
+     fn=model_inference,
+     title="Aria-Chat: Improved Real-world Abilities for Open-source LMMs on Images and Videos",
+     description="Play with [rhymes-ai/Aria-Chat-Preview](https://huggingface.co/rhymes-ai/Aria-Chat-Preview) in this demo. To get started, upload an image (or a video) and some text, or try one of the examples. This checkpoint works best with single-turn conversations, so clear the conversation after a single turn.",
+     examples=examples,
+     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+     stop_btn="Stop Generation",
+     multimodal=True,
+     additional_inputs=[
+         gr.Radio(
+             ["Top P Sampling", "Greedy"],
+             value="Greedy",
+             label="Decoding strategy",
+             info="Greedy always picks the most likely next token; Top P sampling draws from the smallest set of tokens whose cumulative probability exceeds Top P.",
+         ),
+         gr.Slider(
+             minimum=0.0,
+             maximum=5.0,
+             value=0.4,
+             step=0.1,
+             interactive=True,
+             label="Sampling temperature",
+             info="Higher values will produce more diverse outputs.",
+         ),
+         gr.Slider(
+             minimum=8,
+             maximum=1024,
+             value=512,
+             step=1,
+             interactive=True,
+             label="Maximum number of new tokens to generate",
+         ),
+         gr.Slider(
+             minimum=0.01,
+             maximum=0.99,
+             value=0.8,
+             step=0.01,
+             interactive=True,
+             label="Top P",
+             info="Higher values allow sampling more low-probability tokens.",
+         ),
+     ],
+     cache_examples=False,
+ )
+
+
+ demo.launch(debug=True)
requirements.txt CHANGED
@@ -1 +1,10 @@
- huggingface_hub==0.25.2
+ torch
+ accelerate
+ huggingface_hub
+ gradio
+ transformers
+ spaces
+ decord
+ torchvision
+ sentencepiece
+ grouped_gemm