import base64
import os
import time

import gradio as gr
from together import Together

# Ensure the API key is set before creating the client
if "TOGETHER_API_KEY" not in os.environ:
    raise ValueError("Please set the TOGETHER_API_KEY environment variable")

# Initialize the Together client (reads TOGETHER_API_KEY from the environment)
client = Together()


def encode_image(image_path):
    """Read an image file and return its base64-encoded contents."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def bot_streaming(message, history, max_new_tokens=250):
    txt = message["text"]
    messages = []

    # Rebuild the conversation so far. In Gradio's history format, an uploaded
    # image arrives as its own [(filepath,), None] entry, immediately followed
    # by a [text, response] entry for the accompanying text turn.
    for i, msg in enumerate(history):
        if isinstance(msg[0], tuple):
            # Image turn: pair the file with the text of the following entry
            messages.append({
                "role": "user",
                "content": [
                    {"type": "text", "text": history[i + 1][0]},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(msg[0][0])}"}},
                ],
            })
            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i + 1][1]}]})
        elif i > 0 and isinstance(history[i - 1][0], tuple) and isinstance(msg[0], str):
            # Text entry that belongs to the image turn above: already handled
            pass
        elif isinstance(msg[0], str):
            # Text-only turn
            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})

    # Append the current message, attaching the uploaded image if there is one
    if len(message["files"]) == 1:
        if isinstance(message["files"][0], str):  # examples pass a plain path
            image_path = message["files"][0]
        else:  # regular uploads pass a file dict
            image_path = message["files"][0]["path"]
        messages.append({
            "role": "user",
            "content": [
                {"type": "text", "text": txt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(image_path)}"}},
            ],
        })
    else:
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        messages=messages,
        max_tokens=max_new_tokens,
        stream=True,
    )

    # Accumulate streamed tokens and yield the growing response
    buffer = ""
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            buffer += chunk.choices[0].delta.content
            time.sleep(0.01)  # small delay for smoother streaming in the UI
            yield buffer


demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Multimodal Llama",
    examples=[
        [{"text": "Which era does this piece belong to? Give details about the era.", "files": ["./examples/rococo.jpg"]}, 200],
        [{"text": "Where do the droughts happen according to this diagram?", "files": ["./examples/weather_events.png"]}, 250],
        [{"text": "What happens when you take out the white cat from this chain?", "files": ["./examples/ai2d_test.jpg"]}, 250],
        [{"text": "Which company was this invoice addressed to?", "files": ["./examples/invoice.png"]}, 250],
        [{"text": "Where can I find this monument? Can you give me other recommendations around the area?", "files": ["./examples/wat_arun.jpg"]}, 250],
    ],
    textbox=gr.MultimodalTextbox(),
    additional_inputs=[
        gr.Slider(
            minimum=10,
            maximum=500,
            value=250,
            step=10,
            label="Maximum number of new tokens to generate",
        )
    ],
    cache_examples=False,
    description="Try Multimodal Llama by Meta with the Together API in this demo. Upload an image and start chatting about it, or simply try one of the examples below.",
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True,
)

if __name__ == "__main__":
    demo.launch(debug=True)
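
# Usage note (assuming the script is saved as app.py and the example images
# exist under ./examples/):
#   pip install gradio together
#   export TOGETHER_API_KEY=...   # your Together AI API key
#   python app.py
# Gradio serves the demo locally, at http://127.0.0.1:7860 by default.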