"""Gradio demo: describe images with moondream2 and merge descriptions via Together.

NOTE(review): this file was reconstructed from a whitespace-collapsed source.
Statement nesting (the ``with gr.*`` layout, the position of the batch
separator ``yield``) and the line structure of the merge prompt were inferred
and should be confirmed against the original.
"""
import functools
import json
import os
import random
import time
from typing import Union

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "vikhyatk/moondream2"
MODEL_REVISION = "2024-05-08"
TOGETHER_MODEL = "meta-llama/Meta-Llama-3-70B"
# Marker emitted by respond_batch after each batch of answers.
BATCH_SEPARATOR = "\n\n\n\n\n\n"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}" if device != "cpu" else "Using CPU")


@functools.lru_cache(maxsize=1)
def _load_model():
    """Load the moondream2 model and tokenizer, returning ``(model, tokenizer)``.

    The result is cached so that constructing several ``MoonDream`` instances
    (one per UI click) does not reload the weights every time.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID, trust_remote_code=True, revision=MODEL_REVISION
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, device_map=device, trust_remote_code=True, revision=MODEL_REVISION
    )
    return (model, tokenizer)


class MoonDream():
    """Callable wrapper that answers a question about each image in a sequence."""

    def __init__(self, model=None, tokenizer=None):
        """Use the supplied model/tokenizer pair, or load the default pair.

        If either argument is missing, both are taken from ``_load_model()``.
        """
        self.model, self.tokenizer = (model, tokenizer)
        if model is None or tokenizer is None:
            self.model, self.tokenizer = _load_model()
        self.device = device
        self.model.to(self.device)

    def __call__(self, question, imgs):
        """Yield one answer string per image in ``imgs`` for ``question``."""
        for img in imgs:
            embeds = self.model.encode_image(img)
            yield self.model.answer_question(
                question=question, image_embeds=embeds, tokenizer=self.tokenizer
            )
        return


def _respond_one(question, img):
    """Yield the accumulated answer text for a single image.

    The previous implementation added a ``str`` to the generator returned by
    ``MoonDream.__call__`` and raised ``TypeError``; the generator is now
    iterated and its answers concatenated.
    """
    txt = ""
    for answer in MoonDream()(question, [img]):
        txt += answer
        yield txt
    return txt


def respond_batch(question, **imgs):
    """Answer ``question`` for every batch of images passed as a keyword.

    Each keyword value is an iterable of images (a single PIL image is also
    accepted and treated as a batch of one); the keyword names themselves are
    ignored. Answers are yielded one by one, and ``BATCH_SEPARATOR`` is
    yielded after each batch.
    """
    md = MoonDream()
    for batch in imgs.values():
        if isinstance(batch, Image.Image):
            batch = [batch]
        yield from md(question, batch)
        yield BATCH_SEPARATOR
    return


# Import-time smoke test on three solid-colour images.
# NOTE(review): this runs model inference on every import/start-up; kept to
# preserve existing behaviour — consider moving it behind a flag.
red = Image.new("RGB", (192, 192), (255, 0, 0))
green = Image.new("RGB", (192, 192), (0, 255, 0))
blue = Image.new("RGB", (192, 192), (0, 0, 255))
res = respond_batch(
    "What color is this? Elaborate upon what emotion registers most strongly with you upon viewing. ",
    imgs=[red, green, blue],
)
for r in res:
    print(r)
    if BATCH_SEPARATOR in r:
        break


def dual_images(img1: Union[Image.Image, None]):
    """Yield a progressively accumulated plain-English description of ``img1``.

    Run once for each image, feeding its respective output box. The output
    should be a detailed string of description / feature extraction /
    interrogation. An empty input (``None``, i.e. no image uploaded) yields
    an empty string instead of failing inside the model.
    """
    if img1 is None:
        yield ""
        return
    md = MoonDream()
    txt = ""
    for r in md("Describe the image in plain english ", [img1]):
        txt += r
        yield txt
    return


# Load the Together API key from disk. A missing key file is tolerated only
# when TOGETHER_KEY is already present in the environment.
try:
    with open("together_key.txt", "r") as f:
        os.environ["TOGETHER_KEY"] = f.read().strip()
    print("Set together key")
except FileNotFoundError:
    if not os.getenv("TOGETHER_KEY"):
        raise


def merge_descriptions_to_prompt(mi, d1, d2):
    """Ask the Together completion model to merge two descriptions into one.

    Args:
        mi: Merge-specific instructions inserted into the prompt.
        d1: First description.
        d2: Second description.

    Returns:
        The completion text up to (not including) the first code fence.
    """
    from together import Together

    tog = Together(api_key=os.getenv("TOGETHER_KEY"))
    # Prompt text is kept exactly as found in the source.
    prompt = (
        "Describe what would result if the following two descriptions were describing one thing. "
        f"### Description 1: ```text {d1} ``` "
        f"### Description 2: ```text {d2} ``` "
        f"Merge-Specific Instructions: ```text {mi} ``` "
        "Ensure you end your output with ```\\n --- Complete Description: ```text"
    )
    res = tog.completions.create(
        prompt=prompt, model=TOGETHER_MODEL, stop=["```"], max_tokens=1024
    )
    return res.choices[0].text.split("```")[0]


def xform_image_description(img, inst):
    """Describe ``img`` and rewrite that description according to ``inst``.

    Args:
        img: Image passed to ``dual_images`` to obtain the base description.
        inst: Instructions inserted into the prompt.

    Returns:
        The completion text up to (not including) the first code fence.
    """
    from together import Together

    # dual_images is a generator of cumulative text; the last item is the
    # full description. (Previously the generator object itself was
    # interpolated into the prompt.)
    desc = ""
    for desc in dual_images(img):
        pass

    tog = Together(api_key=os.getenv("TOGETHER_KEY"))
    prompt = (
        "Describe the image in aggressively verbose detail. \n"
        "I must know every freckle upon a man's brow and each blade of the grass intimately.\n"
        f"Description: ```text\n{desc}\n```\n"
        f"Instructions:\n```text\n{inst}\n```\n\n\n---\n"
        "Detailed Description:\n```text"
    )
    res = tog.completions.create(
        prompt=prompt, model=TOGETHER_MODEL, stop=["```"], max_tokens=1024
    )
    text = res.choices[0].text
    # Only strip the prompt when the response actually echoes it; slicing
    # unconditionally discards the completion when it does not.
    if text.startswith(prompt):
        text = text[len(prompt):]
    return text.split("```")[0]


# NOTE(review): layout nesting reconstructed from a flattened source — confirm.
with gr.Blocks() as demo:
    # Single-image description.
    with gr.Row(visible=True):
        with gr.Column():
            with gr.Row():
                img = gr.Image(label="images", type='pil')
            with gr.Row():
                btn = gr.Button("submit")
            with gr.Row():
                otpt = gr.Textbox(label="output", lines=3, interactive=True)

    # Two-image batch: each image is described into its own textbox.
    with gr.Row():
        with gr.Column():
            im1 = gr.Image(label="image 1", type='pil')
        with gr.Column():
            im2 = gr.Image(label="image 2", type='pil')
    with gr.Row():
        btn2 = gr.Button("submit batch")
    with gr.Row():
        with gr.Column():
            otp2 = gr.Textbox(label="individual batch output (left)", interactive=True)
        with gr.Column():
            otp3 = gr.Textbox(label="individual batch output (right)", interactive=True)

    # Merge the two descriptions into one.
    with gr.Row():
        minst = gr.Textbox(label="Merge Instructions")
    with gr.Row():
        btn_scd = gr.Button("Merge Descriptions to Single Combined Description")
    with gr.Row():
        otp4 = gr.Textbox(label="batch output ( combined )", interactive=True, lines=4)

    btn2.click(dual_images, inputs=[im1], outputs=[otp2])
    btn2.click(dual_images, inputs=[im2], outputs=[otp3])
    btn.click(dual_images, inputs=[img], outputs=[otpt])
    btn_scd.click(merge_descriptions_to_prompt, inputs=[minst, otp2, otp3], outputs=[otp4])

demo.launch(debug=True, share=True)