from PIL import Image
from transformers import AutoTokenizer
from pydantic import BaseModel 
from enum import Enum

from moonline import Moonline 

def main() -> None:
    """Describe ``example.png`` with Moonline and print the constrained answer.

    The function loads the tokenizer and the Moonline model for a pinned
    revision, encodes the image, builds an FSM from ``ExampleModel`` via
    ``moonline.generate_fsm`` and passes it to ``answer_question`` together
    with the prompt. The resulting answer is printed to stdout.
    """

    class Mood(Enum):
        sad = "sad"
        happy = "happy"
        angry = "angry"
        neutral = "neutral"

    class ExampleModel(BaseModel):
        description: str
        mood: Mood

    # NOTE(review): interpolating ``__annotations__`` puts Python type reprs
    # (e.g. ``<class 'str'>``) into the prompt rather than a JSON schema.
    # Left unchanged here so the prompt text stays identical; confirm whether
    # a schema rendering was intended.
    prompt = f"""
    Your job is to describe the image.
    Please answer in json with the following format: {ExampleModel.__annotations__}
    """

    image_path = "example.png"

    # Pin the revision so the tokenizer and the model weights stay in sync.
    model_id = "vikhyatk/moondream2"
    revision = "2024-04-02"
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    # NOTE(review): ``.to()`` is called without a device or dtype argument;
    # presumably a target device was meant to be passed here -- confirm.
    moonline = Moonline.from_pretrained(
        model_id,
        revision=revision,
    ).to()
    moonline.eval()

    # Pillow opens files lazily, so encode inside the ``with`` block: the
    # pixel data is read while the file is open, and the handle is closed
    # deterministically afterwards instead of being leaked.
    with Image.open(image_path) as image:
        image_embeds = moonline.encode_image(image)

    fsm = moonline.generate_fsm(ExampleModel, tokenizer)

    answer = moonline.answer_question(image_embeds, prompt, tokenizer, fsm)
    print(f"answer: {answer}")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()