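"""Gradio demo: chat about images with Multimodal Llama via the Together API."""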

import base64
import os
import time

import gradio as gr
from together import Together

# Fail fast if the Together API key is missing; the client reads it from the
# environment on construction.
if "TOGETHER_API_KEY" not in os.environ:
    raise ValueError("Please set the TOGETHER_API_KEY environment variable")

client = Together()


def encode_image(image_path):
    """Read an image file and return its contents as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def bot_streaming(message, history, max_new_tokens=250):
    txt = message["text"]
    messages = []

    # Rebuild the conversation for the API from Gradio's (user, assistant) history.
    for i, msg in enumerate(history):
        if isinstance(msg[0], tuple):
            # Image turn: this entry holds the file path; the paired text prompt
            # and assistant reply live in the next history entry.
            messages.append({"role": "user", "content": [{"type": "text", "text": history[i + 1][0]}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(msg[0][0])}"}}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i + 1][1]}]})
        elif i > 0 and isinstance(history[i - 1][0], tuple) and isinstance(msg[0], str):
            # Text turn that immediately follows an image turn: already handled above.
            pass
        elif isinstance(msg[0], str):
            # Plain text-only turn.
            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})

    # Attach the current user message, with the uploaded image if there is one.
    if len(message["files"]) == 1:
        if isinstance(message["files"][0], str):  # file path from the examples
            image_path = message["files"][0]
        else:  # regular upload: a dict with a "path" key
            image_path = message["files"][0]["path"]
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(image_path)}"}}]})
    else:
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        messages=messages,
        max_tokens=max_new_tokens,
        stream=True,
    )

    buffer = ""
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            buffer += chunk.choices[0].delta.content
            time.sleep(0.01)  # small delay smooths the streaming animation
            yield buffer
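

# Multimodal chat UI: the textbox accepts text plus image uploads, and the
# slider value is passed to bot_streaming as max_new_tokens.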
demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Multimodal Llama",
    examples=[
        [{"text": "Which era does this piece belong to? Give details about the era.", "files": ["./examples/rococo.jpg"]}, 200],
        [{"text": "Where do the droughts happen according to this diagram?", "files": ["./examples/weather_events.png"]}, 250],
        [{"text": "What happens when you take out the white cat from this chain?", "files": ["./examples/ai2d_test.jpg"]}, 250],
        [{"text": "Which company was this invoice addressed to?", "files": ["./examples/invoice.png"]}, 250],
        [{"text": "Where can I find this monument? Can you give me other recommendations around the area?", "files": ["./examples/wat_arun.jpg"]}, 250],
    ],
    textbox=gr.MultimodalTextbox(),
    additional_inputs=[
        gr.Slider(
            minimum=10,
            maximum=500,
            value=250,
            step=10,
            label="Maximum number of new tokens to generate",
        )
    ],
    cache_examples=False,
    description="Try Multimodal Llama by Meta with the Together API in this demo. Upload an image and start chatting about it, or simply try one of the examples below.",
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True,
)

if __name__ == "__main__":
    demo.launch(debug=True)