Spaces:

wiusdy
/

VQA_fashion_hvar

Sleeping

File size: 1,744 Bytes

79c1479
6e61211
0a5203f
c5797b7
 
0306e1b
24eb62a
 
9c010ec
80ba3ac
 
c5797b7
79c1479
 
 
 
 
 
 
 
 
 
c5797b7
e727785
24eb62a
e727785
 
 
 
 
24eb62a
e727785
0306e1b

from transformers import BlipProcessor, BlipForQuestionAnswering
from transformers.utils import logging

class Inference:
    """Answer questions about images with fine-tuned BLIP VQA checkpoints.

    Two ``BlipForQuestionAnswering`` checkpoints are loaded once at
    construction time and share a single ``BlipProcessor``. ``inference``
    selects one of them by its option label.
    """

    # Option label accepted by ``inference`` -> (name used in the log message,
    # attribute holding the matching model). Adding a model means adding one
    # entry here and loading it in ``__init__``.
    _MODELS = {
        "Blip Saffal": ("BLIP Saffal", "blip_model_saffal"),
        "Blip Control Net": ("BLIP Control Net", "blip_model_control_net"),
    }

    def __init__(self):
        """Load the shared processor and both models, and set up logging."""
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        self.blip_model_saffal = BlipForQuestionAnswering.from_pretrained("wiusdy/blip_pretrained_saffal_fashion_finetuning")
        self.blip_model_control_net = BlipForQuestionAnswering.from_pretrained("wiusdy/blip_pretrained_control_net_fashion_finetuning")

        # NOTE: this changes the verbosity of the transformers logging
        # utilities as a whole, not only of this instance's logger.
        logging.set_verbosity_info()
        self.logger = logging.get_logger("transformers")

    def inference(self, options, image, text):
        """Run visual question answering with the selected model.

        Args:
            options: Model selector, either ``"Blip Saffal"`` or
                ``"Blip Control Net"``.
            image: Image handed to the processor unchanged.
            text: Question handed to the processor unchanged.

        Returns:
            The decoded answer string, or ``None`` when ``options`` does not
            name a known model (a warning is logged in that case).
        """
        # Only strings can name a model; the isinstance guard also keeps
        # unhashable selectors (e.g. a list) from raising in the dict lookup.
        selection = self._MODELS.get(options) if isinstance(options, str) else None
        if selection is None:
            self.logger.warning(
                "Unknown model option %r; no inference was run", options
            )
            return None

        label, attribute = selection
        self.logger.info(f"Running inference for model {label}")
        return self._answer(getattr(self, attribute), image, text)

    def _answer(self, model, image, text):
        """Encode the inputs, generate with ``model`` and decode the answer."""
        encoding = self.blip_processor(image, text, return_tensors="pt")
        output_ids = model.generate(**encoding)
        # Only the first generated sequence is decoded.
        return self.blip_processor.decode(output_ids[0], skip_special_tokens=True)