from typing import Any, Dict, List

import base64
import io

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq


class EndpointHandler():
    def __init__(self, path=""):
        # Load the model and processor once at startup; fall back to CPU
        # when no GPU is available.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b").to(self.device)
        self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")

    def __call__(self, data: Dict[str, Any]) -> List[str]:
        """
        Example payload:
        {
            "inputs": {
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "What's the difference between these two images?"},
                            {"type": "image"},
                            {"type": "image"},
                        ],
                    }
                ],
                "images": ["<base64-encoded image>", "<base64-encoded image>"]
            }
        }
        """
        # Render the chat template and append the generation prompt so the
        # model continues with an assistant turn.
        text = self.processor.apply_chat_template(
            data["inputs"]["messages"], add_generation_prompt=True
        )
        images = [self.decode_image_base64(img) for img in data["inputs"]["images"]]
        inputs = self.processor(images=images, text=text, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        generated_ids = self.model.generate(**inputs, max_new_tokens=500)
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        return generated_text

    def decode_image_base64(self, encoded_image: str) -> Image.Image:
        """
        Decodes a base64-encoded string back into a PIL image.
        """
        # Decode the base64 string to raw bytes.
        img_data = base64.b64decode(encoded_image)
        # Wrap the bytes in a file-like object and open it with PIL.
        img_io = io.BytesIO(img_data)
        image = Image.open(img_io)
        return image
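

# ---------------------------------------------------------------------------
# Usage sketch (not part of the endpoint contract): a minimal local smoke test
# showing how a client builds the payload described in __call__'s docstring.
# The image paths "cat.png" and "dog.png" are placeholders for illustration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    def encode_image_base64(path: str) -> str:
        """Read an image file and return it as a base64-encoded UTF-8 string."""
        with open(path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    handler = EndpointHandler()
    payload = {
        "inputs": {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's the difference between these two images?"},
                        {"type": "image"},
                        {"type": "image"},
                    ],
                }
            ],
            "images": [encode_image_base64("cat.png"), encode_image_base64("dog.png")],
        }
    }
    print(handler(payload))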