from typing import Dict, Any
from io import BytesIO

import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class EndpointHandler:
    def __init__(self, path=""):
        # Load the BLIP captioning processor and model once at startup.
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        ).to(device)
        self.model.eval()

    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (:obj:):
                includes the input data and the parameters for the inference.
        Return:
            A :obj:`dict`:. The object returned should be a dict of one list like {"captions": ["A hugging face at the office"]} containing :
                - "caption": A string corresponding to the generated caption.
        """
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})
 
        raw_images = [Image.open(BytesIO(_img)) for _img in inputs]
                                     
        processed_image = self.processor(images=raw_images, return_tensors="pt") 
        processed_image["pixel_values"] = processed_image["pixel_values"].to(device)
        processed_image = {**processed_image, **parameters}
        
        with torch.no_grad():
            out = self.model.generate(
                **processed_image
            )
        captions = self.processor.batch_decode(out, skip_special_tokens=True)
        # postprocess the prediction
        return {"captions": captions}