File size: 1,982 Bytes
0546112 a583978 cb05228 f3126f3 cb05228 0546112 a79f819 a583978 0546112 a583978 9d9b5e2 a7a4721 a583978 0546112 2a79ef4 8576dce 2a79ef4 a583978 a7a4721 a583978 0546112 baa2ff5 a583978 e53bc58 baa2ff5 0546112 2a79ef4 0546112 baa2ff5 0546112 a583978 0546112 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# +
from typing import Dict, List, Any
from PIL import Image
import requests
import torch
import base64
import os
from io import BytesIO
from transformers import BlipForConditionalGeneration, BlipProcessor
from models.blip_decoder import blip_decoder
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
# -
# Select the compute device once at import time: use CUDA when torch reports
# it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
class EndpointHandler():
    """Callable request handler that captions images with a BLIP model.

    The processor and model are loaded once in ``__init__``; each call decodes
    the images of one request, runs ``generate`` and returns the decoded
    captions.
    """

    def __init__(self, path=""):
        """Load the BLIP processor and captioning model.

        Args:
            path: Not used by this implementation; the weights are always
                loaded from the "Salesforce/blip-image-captioning-base"
                checkpoint.
        """
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        # Move the model to the target device exactly once.
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        ).to(device)
        # Inference only: switch to evaluation mode.
        self.model.eval()

    @staticmethod
    def _load_image(payload):
        """Turn one request item into a PIL image.

        Accepts an already-decoded PIL image (returned unchanged), a
        base64-encoded string, or raw encoded image bytes.
        """
        if isinstance(payload, Image.Image):
            return payload
        if isinstance(payload, str):
            payload = base64.b64decode(payload)
        return Image.open(BytesIO(payload))

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Generate one caption per input image.

        Args:
            data: Request payload. ``data["inputs"]`` holds the image(s):
                either a single item or a list of items, where an item is raw
                image bytes, a base64 string, or a PIL image. When the
                "inputs" key is absent, ``data`` itself is used as the inputs.
                ``data["parameters"]`` (optional) is forwarded as keyword
                arguments to ``model.generate``. Both keys are popped, so the
                passed dict is modified.

        Returns:
            A dict with a single key, ``"captions"``, mapping to the list of
            generated caption strings (one per image), e.g.
            ``{"captions": ["A hugging face at the office"]}``.
        """
        inputs = data.pop("inputs", data)
        # An explicit ``"parameters": None`` must not break the ``**`` merge below.
        parameters = data.pop("parameters", None) or {}

        # Nothing to caption: answer without touching the processor or the model.
        if isinstance(inputs, (list, tuple)) and not inputs:
            return {"captions": []}

        # A lone image would otherwise be iterated element by element
        # (bytes -> ints, str -> characters), so wrap it into a batch of one.
        if isinstance(inputs, (bytes, bytearray, str, Image.Image)):
            inputs = [inputs]

        raw_images = [self._load_image(item) for item in inputs]
        model_inputs = self.processor(images=raw_images, return_tensors="pt")
        model_inputs["pixel_values"] = model_inputs["pixel_values"].to(device)
        # Caller-supplied generation parameters win on key clashes.
        generate_kwargs = {**model_inputs, **parameters}
        with torch.no_grad():
            out = self.model.generate(**generate_kwargs)
        captions = self.processor.batch_decode(out, skip_special_tokens=True)
        return {"captions": captions}
|