import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import torch
import requests
import time
import random
from PIL import Image
from typing import Union
import os

# Choose the inference device once, at import time: the GPU when torch can
# see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Report the choice on stdout.
print("Using CPU" if device == "cpu" else f"Using {device}")

def _load_model():
    """Load the pinned Moondream2 checkpoint together with its tokenizer.

    The weights are requested in bfloat16 when the module-level ``device`` is
    ``'cuda'`` and in float32 otherwise.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    model_id = "vikhyatk/moondream2"
    revision = "2024-05-08"
    # torch_dtype is a *model* loading option; it has no effect when passed to
    # the tokenizer, so it is handed to the model loader here instead.
    dtype = torch.bfloat16 if device == 'cuda' else torch.float32
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        revision=revision,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=device,
        trust_remote_code=True,
        revision=revision,
        torch_dtype=dtype,
    )
    return (model, tokenizer)

class MoonDream():
    """Callable wrapper that asks one question about each of several images."""

    def __init__(self, model=None, tokenizer=None):
        """Store the model/tokenizer pair, loading both if either is missing.

        Args:
            model: Object exposing ``encode_image`` and ``answer_question``.
                Loaded via ``_load_model()`` when omitted.
            tokenizer: Tokenizer forwarded to ``answer_question``. Loaded via
                ``_load_model()`` when omitted.
        """
        # The two halves must match, so a missing half triggers a reload of
        # both. Compare against None explicitly: truth-testing a tokenizer
        # would call its __len__.
        if model is None or tokenizer is None:
            model, tokenizer = _load_model()
        self.model, self.tokenizer = model, tokenizer
        self.device = device
        self.model.to(self.device)

    def __call__(self, question, imgs):
        """Yield the model's answer to ``question`` for each image in ``imgs``.

        Args:
            question: Prompt passed to ``answer_question`` for every image.
            imgs: Iterable of images accepted by the model's ``encode_image``.

        Yields:
            Whatever ``answer_question`` returns, one item per image, in
            input order.
        """
        for img in imgs:
            embeds = self.model.encode_image(img)
            yield self.model.answer_question(
                question=question,
                image_embeds=embeds,
                tokenizer=self.tokenizer,
            )

# Shared module-level MoonDream instance. Constructing it with no arguments
# loads the model and tokenizer through _load_model() at import time.
md = MoonDream()

def _respond_one(question, img):
    """Stream the answer to ``question`` about a single image.

    Uses the shared module-level ``md`` instance, so the model is not
    reloaded on every call.

    Args:
        question: Prompt to ask about the image.
        img: The image to describe.

    Yields:
        The text accumulated so far, after each answer from the model.

    Returns:
        The final accumulated text (as the generator's return value).
    """
    txt = ""
    # md(...) is a generator, so it has to be consumed; adding the generator
    # object itself to a str raises TypeError.
    for answer in md(question, [img]):
        txt += answer
        yield txt
    return txt

def respond_batch(question, **imgs):
    """Answer ``question`` for every image group passed by keyword.

    Uses the shared module-level ``md`` instance rather than constructing a
    new ``MoonDream`` (and reloading the model) on every call.

    Args:
        question: Prompt asked about every image.
        **imgs: Each keyword's value is an iterable of images; the keyword
            names themselves are ignored.

    Yields:
        Each answer for the images of a group, in order, followed by a
        separator string of six newline characters after every group.
    """
    for group in imgs.values():
        yield from md(question, group)
        yield "\n\n\n\n\n\n"

# Import-time smoke test: ask the model about three solid-colour 192x192
# swatches and print each answer as it arrives.
red, green, blue = (
    Image.new("RGB", (192, 192), rgb)
    for rgb in ((255, 0, 0), (0, 255, 0), (0, 0, 255))
)
res = respond_batch(
    "What color is this? Elaborate upon what emotion registers most strongly with you upon viewing. ",
    imgs=[red, green, blue],
)
for r in res:
    print(r)
    # Stop as soon as the first batch separator has been printed.
    if "\n\n\n\n\n\n" in r:
        break

def dual_images(img1: Image.Image):
    """Stream a plain-English description of ``img1``.

    Meant to be run once per image, each run feeding its own output field.
    The output should be a detailed string of description / feature
    extraction / interrogation. Uses the shared module-level ``md`` instance
    so the model is not reloaded on every call.

    Args:
        img1: The image to describe.

    Yields:
        The description text accumulated so far, after each model answer.
    """
    txt = ""
    for answer in md("Describe the image in plain english ", [img1]):
        txt += answer
        yield txt

import os

def merge_descriptions_to_prompt(mi, d1, d2):
    """Ask a Together-hosted Llama 3 model to fuse two descriptions into one.

    Args:
        mi: Merge-specific instructions inserted into the prompt.
        d1: First description.
        d2: Second description.

    Returns:
        The generated text up to (not including) the first code fence.
    """
    from together import Together

    # API key is read from the TOGETHER_KEY environment variable.
    client = Together(api_key=os.getenv("TOGETHER_KEY"))
    merge_prompt = f"""Describe what would result if the following two descriptions were describing one thing.
### Description 1:
```text
{d1}
```
### Description 2:
```text
{d2}
```
Merge-Specific Instructions:
```text
{mi}
```
Ensure you end your output with ```\\n
---
Complete Description:
```text"""
    completion = client.completions.create(
        prompt=merge_prompt,
        model="meta-llama/Meta-Llama-3-70B",
        stop=["```"],
        max_tokens=1024,
    )
    # Drop anything from the first closing fence onwards.
    merged, _, _ = completion.choices[0].text.partition("```")
    return merged

def xform_image_description(img, inst):
    """Describe ``img`` with Moondream, then have Llama 3 elaborate on it.

    Args:
        img: The image to describe.
        inst: Extra instructions inserted into the elaboration prompt.

    Returns:
        The generated detailed description, up to (not including) the first
        code fence.
    """
    from together import Together

    # dual_images is a generator of progressively longer text, so drain it and
    # keep the last item. Interpolating the generator object itself would put
    # its repr into the prompt instead of the description.
    desc = ""
    for desc in dual_images(img):
        pass
    tog = Together(api_key=os.getenv("TOGETHER_KEY"))
    prompt=f"""Describe the image in aggressively verbose detail. I must know every freckle upon a man's brow and each blade of the grass intimately.\nDescription: ```text\n{desc}\n```\nInstructions:\n```text\n{inst}\n```\n\n\n---\nDetailed Description:\n```text"""
    res = tog.completions.create(prompt=prompt, model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
    # No echo is requested, so the completion text is taken to hold only the
    # generated continuation. Slicing off len(prompt) characters would discard
    # the answer; merge_descriptions_to_prompt reads the text the same way.
    return res.choices[0].text.split("```")[0]

def simple_desc(img, prompt):
    """Answer ``prompt`` about ``img`` and return the answer as one string.

    Args:
        img: The image to ask about, or ``None`` when no image was supplied.
        prompt: The question to ask about the image.

    Returns:
        The model's answer text, or an empty string when ``img`` is ``None``.
    """
    # With no image there is nothing to send to the model.
    if img is None:
        return ""
    # MoonDream.__call__ takes (question, imgs) and is a generator, so pass the
    # arguments in that order and consume the result into a plain string.
    return "".join(md(prompt, [img]))

# Single-image tab: an image and a prompt go in, the model's answer comes out.
with gr.Blocks() as if_simple_description:
    with gr.Row():
        with gr.Column():
            simple_img = gr.Image(type="pil")
        with gr.Column():
            simple_text = gr.Textbox(label="prompt ( Shift+Enter sends )")
            simple_btn = gr.Button("Prompt & Image 2 Text")
            simple_out = gr.Textbox(label="response", interactive=False)
    # Event inputs/outputs must be components. The handler receives the image
    # and the prompt text, and its return value is shown in the response box.
    simple_btn.click(simple_desc, inputs=[simple_img, simple_text], outputs=[simple_out])
    # Wire the textbox's submit event as well, since its label advertises a
    # keyboard shortcut for sending.
    simple_text.submit(simple_desc, inputs=[simple_img, simple_text], outputs=[simple_out])

# Disabled two-image "describe and merge" UI, kept as a bare string literal so
# none of it executes.
# NOTE(review): as written it would not run if re-enabled. `btn`, `img` and
# `otpt` are never defined in it. `im1`, `im2`, `otp2` and `otp3` are each
# assigned twice, so the click handlers would bind only to the second set of
# components.
"""
with gr.Blocks() as demo:
  
  with gr.Row():
    with gr.Column():
      im1 = gr.Image(label="image 1", type='pil')
      otp2 = gr.Textbox(label="image 1", interactive=True)
    with gr.Column():
      im2 = gr.Image(label="image 2", type='pil')
      otp3 = gr.Textbox(label="image 2")
  with gr.Row():
    minst = gr.Textbox(label="Merge Instructions")
  with gr.Row():
    btn2 = gr.Button("submit batch")
  with gr.Row():
    with gr.Column():
      im1 = gr.Image(label="image 1", type='pil')
      otp2 = gr.Textbox(label="individual batch output (left)", interactive=True)
    with gr.Column():
      im2 = gr.Image(label="image 2", type='pil')
      otp3 = gr.Textbox(label="individual batch output (right)", interactive=True)
  with gr.Row():
    otp4 = gr.Textbox(label="batch output ( combined )", interactive=True, lines=4)
  with gr.Row():
    btn_scd = gr.Button("Merge Descriptions to Single Combined Description")
  btn2.click(dual_images, inputs=[im1], outputs=[otp2])
  btn2.click(dual_images, inputs=[im2], outputs=[otp3])
  btn.click(dual_images, inputs=[img], outputs=[otpt])
  btn_scd.click(merge_descriptions_to_prompt, inputs=[minst, otp2, otp3], outputs=[otp4])

  demo.launch(debug=True, share=True)
  """
# Wrap the tab(s) in a tabbed interface and serve it on all network
# interfaces, port 7860, without a public share link.
# TabbedInterface takes a *list* of interfaces, and launch() names the bind
# address `server_name` (it has no `server_host` keyword).
ifc = gr.TabbedInterface([if_simple_description])
ifc.launch(share=False, server_name="0.0.0.0", server_port=7860)