# Hugging Face Space — status: Running on Zero
import gradio as gr | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
import json | |
import torch | |
import requests | |
import time | |
import random | |
from PIL import Image | |
from typing import Union | |
import os | |
# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# CPU-specific branches elsewhere in this file are actually reachable.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}" if device != "cpu" else "Using CPU")
def _load_model():
    """Load the moondream2 model and tokenizer from one pinned revision.

    The weight dtype follows the module-level ``device``: bfloat16 when it is
    ``"cuda"``, float32 otherwise.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    repo_id = "vikhyatk/moondream2"
    revision = "2024-05-08"
    dtype = torch.bfloat16 if device == "cuda" else torch.float32
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True, revision=revision)
    # torch_dtype is a model-loading option; it was previously handed to the
    # tokenizer, which left the model weights at their default precision.
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        device_map=device,
        trust_remote_code=True,
        revision=revision,
        torch_dtype=dtype,
    )
    return (model, tokenizer)
class MoonDream():
    """Callable wrapper around a moondream model for image question answering.

    When no model/tokenizer is supplied, the pair returned by ``_load_model``
    is loaded once and cached on the class, so further default instances
    reuse it instead of loading the weights again.
    """

    # (model, tokenizer) loaded on the first default construction, then shared.
    _shared_components = None

    def __init__(self, model=None, tokenizer=None):
        """Store the model and tokenizer, falling back to the default pair.

        Args:
            model: Object providing ``encode_image``, ``answer_question`` and
                ``to``. Defaults to the model from ``_load_model``.
            tokenizer: Tokenizer passed through to ``answer_question``.
                Defaults to the tokenizer from ``_load_model``.
        """
        if model is None or tokenizer is None:
            # As before, a partially supplied pair is discarded and both
            # defaults are used together.
            if MoonDream._shared_components is None:
                MoonDream._shared_components = _load_model()
            model, tokenizer = MoonDream._shared_components
        self.model, self.tokenizer = (model, tokenizer)
        self.device = device
        self.model.to(self.device)

    def __call__(self, question, imgs):
        """Yield the model's answer to ``question`` for each image in ``imgs``.

        Args:
            question: Prompt passed to ``answer_question`` for every image.
            imgs: Iterable of images accepted by ``encode_image``.

        Yields:
            One answer per image, in input order.
        """
        for img in imgs:
            embeds = self.model.encode_image(img)
            yield self.model.answer_question(question=question, image_embeds=embeds, tokenizer=self.tokenizer)
def _respond_one(question, img):
    """Stream the answer to ``question`` for a single image.

    Args:
        question: Prompt passed to the model.
        img: The image to ask about.

    Yields:
        The text accumulated so far after each answer chunk.

    Returns:
        The full accumulated text (as the generator's return value).
    """
    txt = ""
    # MoonDream.__call__ is a generator, so it has to be iterated; adding it
    # directly to a str raised TypeError.
    for answer in MoonDream()(question, [img]):
        txt += answer
        yield txt
    return txt
def respond_batch(question, **imgs):
    """Stream answers for several groups of images.

    Args:
        question: Prompt asked about every image.
        **imgs: Each keyword value is an iterable of images forming one group.

    Yields:
        Every answer of a group in order, then a six-newline separator string
        after each group.
    """
    group_separator = "\n\n\n\n\n\n"
    describer = MoonDream()
    for group in imgs.values():
        yield from describer(question, group)
        yield group_separator
# Import-time smoke test: ask the model about three solid-colour swatches and
# print the answers until the first group separator shows up.
_SWATCH_SIZE = (192, 192)
red = Image.new("RGB", _SWATCH_SIZE, (255, 0, 0))
green = Image.new("RGB", _SWATCH_SIZE, (0, 255, 0))
blue = Image.new("RGB", _SWATCH_SIZE, (0, 0, 255))
res = respond_batch(
    "What color is this? Elaborate upon what emotion registers most strongly with you upon viewing. ",
    imgs=[red, green, blue],
)
for r in res:
    print(r)
    if "\n\n\n\n\n\n" in r:
        break
def dual_images(img1: Image.Image):
    """Stream a plain-English description of one image.

    Args:
        img1: The image to describe.

    Yields:
        The description text accumulated so far.
    """
    # Run once for each image, feeding its respective output. The output
    # should be a detailed description / feature extraction of the image.
    md = MoonDream()
    txt = ""
    for r in md("Describe the image in plain english ", [img1]):
        txt += r
        yield txt
import os | |
def merge_descriptions_to_prompt(mi, d1, d2):
    """Ask a Together-hosted Llama 3 model to fuse two descriptions into one.

    Args:
        mi: Merge-specific instructions inserted into the prompt.
        d1: First description.
        d2: Second description.

    Returns:
        The completion text up to, but not including, the first code fence.
    """
    from together import Together

    client = Together(api_key=os.getenv("TOGETHER_KEY"))
    merge_prompt = f"""Describe what would result if the following two descriptions were describing one thing.
### Description 1:
```text
{d1}
```
### Description 2:
```text
{d2}
```
Merge-Specific Instructions:
```text
{mi}
```
Ensure you end your output with ```\\n
---
Complete Description:
```text"""
    completion = client.completions.create(
        prompt=merge_prompt,
        model="meta-llama/Meta-Llama-3-70B",
        stop=["```"],
        max_tokens=1024,
    )
    # Keep only what precedes the first fence, in case the stop sequence leaks.
    merged, _, _ = completion.choices[0].text.partition("```")
    return merged
def xform_image_description(img, inst):
    """Describe an image, then have a Together-hosted LLM expand the description.

    Args:
        img: The image to describe.
        inst: Extra instructions inserted into the prompt.

    Returns:
        The completion text up to, but not including, the first code fence.
    """
    from together import Together

    # dual_images is a generator of progressively longer text; drain it and
    # keep the last value instead of interpolating the generator object.
    desc = ""
    for desc in dual_images(img):
        pass
    tog = Together(api_key=os.getenv("TOGETHER_KEY"))
    prompt = f"""Describe the image in aggressively verbose detail. I must know every freckle upon a man's brow and each blade of the grass intimately.\nDescription: ```text\n{desc}\n```\nInstructions:\n```text\n{inst}\n```\n\n\n---\nDetailed Description:\n```text"""
    res = tog.completions.create(prompt=prompt, model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
    text = res.choices[0].text
    # Strip the prompt only if the service echoed it back; slicing
    # unconditionally cut a prompt's length off the real completion.
    if text.startswith(prompt):
        text = text[len(prompt):]
    return text.split("```")[0]
# Gradio UI: a single-image describer, a two-image batch describer, and a
# button that merges the two batch descriptions into one via the LLM.
# NOTE(review): the nesting of the layout containers below is a best-effort
# reading of the statement order — confirm against the intended layout.
with gr.Blocks() as demo:
    # Single-image section: image input, submit button, streamed description.
    with gr.Row(visible=True):
        with gr.Row():
            img = gr.Image(label="images", type='pil')
        with gr.Row():
            btn = gr.Button("submit")
        with gr.Row():
            otpt = gr.Textbox(label="output", lines=3, interactive=True)
    # NOTE(review): im1/otp2/im2/otp3 are re-bound further down, so the four
    # components created in this row are never referenced by any event handler.
    with gr.Row():
        with gr.Column():
            im1 = gr.Image(label="image 1", type='pil')
            otp2 = gr.Textbox(label="image 1", interactive=True)
        with gr.Column():
            im2 = gr.Image(label="image 2", type='pil')
            otp3 = gr.Textbox(label="image 2")
    with gr.Row():
        minst = gr.Textbox(label="Merge Instructions")
    with gr.Row():
        btn2 = gr.Button("submit batch")
    # Batch section: these are the im1/otp2/im2/otp3 the handlers below use.
    with gr.Row():
        with gr.Column():
            im1 = gr.Image(label="image 1", type='pil')
            otp2 = gr.Textbox(label="individual batch output (left)", interactive=True)
        with gr.Column():
            im2 = gr.Image(label="image 2", type='pil')
            otp3 = gr.Textbox(label="individual batch output (right)", interactive=True)
    with gr.Row():
        otp4 = gr.Textbox(label="batch output ( combined )", interactive=True, lines=4)
    with gr.Row():
        btn_scd = gr.Button("Merge Descriptions to Single Combined Description")
    # One click on "submit batch" registers two handlers, one per image,
    # each writing to its own textbox.
    btn2.click(dual_images, inputs=[im1], outputs=[otp2])
    btn2.click(dual_images, inputs=[im2], outputs=[otp3])
    btn.click(dual_images, inputs=[img], outputs=[otpt])
    # Inputs are passed positionally as (mi, d1, d2).
    btn_scd.click(merge_descriptions_to_prompt, inputs=[minst, otp2, otp3], outputs=[otp4])
demo.launch(debug=True, share=True)