"""Example script: ask a Moonline model a question about an image.

The answer is requested as JSON matching a small pydantic model, and the
generation call is given an FSM built from that same model.
"""

from enum import Enum

from PIL import Image
from pydantic import BaseModel
from transformers import AutoTokenizer

from moonline import Moonline


def main():
    """Describe ``example.png`` with Moonline and print the answer.

    Steps, in order:

    1. Declare the expected answer schema (``ExampleModel``).
    2. Build a prompt that embeds the schema's field annotations.
    3. Load the tokenizer and the Moonline model for the pinned
       ``model_id`` / ``revision``.
    4. Encode the image, build an FSM from the schema and tokenizer, and
       ask the question.

    The result is printed to stdout; nothing is returned.
    """

    class Mood(Enum):
        sad = "sad"
        happy = "happy"
        angry = "angry"
        neutral = "neutral"

    class ExampleModel(BaseModel):
        description: str
        mood: Mood

    # The schema is embedded via ``__annotations__``, i.e. the Python repr of
    # the field-name -> type mapping, not a JSON schema.
    # NOTE(review): the exact line breaks of this prompt were not recoverable
    # from the collapsed source; confirm the intended whitespace.
    prompt = f"""
    Your job is to describe the image.
    Please answer in json with the following format: {ExampleModel.__annotations__}
    """

    image_path = "example.png"
    model_id = "vikhyatk/moondream2"
    revision = "2024-04-02"

    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

    # NOTE(review): ``.to()`` is called without arguments, so no device or
    # dtype is selected here -- pass one explicitly if that was the intent.
    moonline = Moonline.from_pretrained(
        model_id,
        revision=revision,
    ).to()
    moonline.eval()

    # Open the image in a ``with`` block so the underlying file handle is
    # released as soon as the embeddings have been computed.
    with Image.open(image_path) as image:
        image_embeds = moonline.encode_image(image)

    # Presumably the FSM constrains generation to the ExampleModel schema;
    # verify against the moonline documentation.
    fsm = moonline.generate_fsm(ExampleModel, tokenizer)
    answer = moonline.answer_question(image_embeds, prompt, tokenizer, fsm)
    print(f"answer: {answer}")


if __name__ == "__main__":
    main()