from threading import Thread
import time

import gradio as gr
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer

# Load the 4-bit quantized Llama 3.2 11B Vision Instruct checkpoint and its processor.
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit"
processor = AutoProcessor.from_pretrained(ckpt)
model = MllamaForConditionalGeneration.from_pretrained(ckpt)


# This demo runs on CPU, so no `spaces.GPU` decorator is used.
def bot_streaming(message, history, max_new_tokens=250):
    txt = message["text"]

    messages = []
    images = []

    # Rebuild the chat history in the format expected by the chat template.
    # Image turns arrive as tuples; plain text turns arrive as strings.
    for i, msg in enumerate(history):
        if isinstance(msg[0], tuple):
            # An image turn: the following history entry carries the text prompt
            # and the assistant reply that belong with this image.
            messages.append({"role": "user", "content": [{"type": "text", "text": history[i + 1][0]}, {"type": "image"}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i + 1][1]}]})
            images.append(Image.open(msg[0][0]).convert("RGB"))
        elif isinstance(history[i - 1], tuple) and isinstance(msg[0], str):
            # Text turn already handled together with the preceding image turn.
            pass
        elif isinstance(history[i - 1][0], str) and isinstance(msg[0], str):
            # A plain text exchange.
            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})

    # Handle the current message, which may carry one uploaded image.
    if len(message["files"]) == 1:
        if isinstance(message["files"][0], str):  # examples pass a plain file path
            image = Image.open(message["files"][0]).convert("RGB")
        else:  # regular uploads pass a FileData dict
            image = Image.open(message["files"][0]["path"]).convert("RGB")
        images.append(image)
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
    else:
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

    texts = processor.apply_chat_template(messages, add_generation_prompt=True)

    if images == []:
        inputs = processor(text=texts, return_tensors="pt")  # no .to("cuda"); stays on CPU
    else:
        inputs = processor(text=texts, images=images, return_tensors="pt")  # no .to("cuda"); stays on CPU

    # Stream tokens from a background generation thread.
    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer


demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Multimodal Llama",
    examples=[],  # no examples provided
    textbox=gr.MultimodalTextbox(),
    additional_inputs=[
        gr.Slider(
            minimum=10,
            maximum=500,
            value=250,
            step=10,
            label="Maximum number of new tokens to generate",
        )
    ],
    cache_examples=False,
    description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32).",
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True,
)

demo.launch(debug=True)
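
# --- Dependencies (a sketch inferred from the imports above, not part of the original script) ---
# A requirements.txt for this Space would roughly need:
#   gradio
#   transformers
#   torch
#   Pillow
#   accelerate
#   bitsandbytes
# Note: the bnb-4bit checkpoint relies on bitsandbytes, which typically expects a
# CUDA device; running it purely on CPU (as the comments above intend) may not work
# with all bitsandbytes versions.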