bilgeyucel committed · Commit 4397b1e · Parent(s): e5db11f
Update image_captioner.py (+8 -16)
image_captioner.py CHANGED
@@ -37,20 +37,16 @@ class ImageCaptioner:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)
 
-    @component.output_types(
-    def run(self,
+    @component.output_types(caption=str)
+    def run(self, image_file_path: str) -> List[Document]:
 
-
-
-        i_image =
-        if i_image.mode != "RGB":
-            i_image = i_image.convert(mode="RGB")
-
-        images.append(i_image)
+        i_image = Image.open(image_file_path)
+        if i_image.mode != "RGB":
+            i_image = i_image.convert(mode="RGB")
 
         preds = []
         if self.model_name == "nlpconnect/vit-gpt2-image-captioning":
-            pixel_values = self.feature_extractor(images=
+            pixel_values = self.feature_extractor(images=[i_image], return_tensors="pt").pixel_values
             pixel_values = pixel_values.to(self.device)
 
             output_ids = self.model.generate(pixel_values, **self.gen_kwargs)
@@ -59,7 +55,7 @@ class ImageCaptioner:
             preds = [pred.strip() for pred in preds]
         else:
 
-            inputs = self.processor(
+            inputs = self.processor([i_image], return_tensors="pt")
             output_ids = self.model.generate(**inputs)
             preds = self.processor.batch_decode(output_ids, skip_special_tokens=True)
             preds = [pred.strip() for pred in preds]
@@ -68,8 +64,4 @@ class ImageCaptioner:
         # for caption, image_file_path in zip(preds, image_file_paths):
         #     document = Document(content=caption, meta={"image_path": image_file_path})
         #     captions.append(document)
-        return {"
-
-# captioner = ImageCaptioner(model_name="Salesforce/blip-image-captioning-base")
-# result = captioner.run(image_file_paths=["selfie.png"])
-# print(result)
+        return {"caption": preds[0]}
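After this commit, run takes a single image_file_path instead of a list and returns the caption as a string under the "caption" key. A minimal usage sketch, adapted from the commented-out example at the end of the old revision: the BLIP model name and "selfie.png" path are illustrative placeholders, and the import assumes ImageCaptioner lives in this Space's image_captioner.py.

# Usage sketch (assumption: ImageCaptioner is the class patched above;
# "selfie.png" stands in for any local image file).
from image_captioner import ImageCaptioner

captioner = ImageCaptioner(model_name="Salesforce/blip-image-captioning-base")
result = captioner.run(image_file_path="selfie.png")  # returns {"caption": <str>}
print(result["caption"])

Note that the new return value is a plain dict keyed by "caption", matching @component.output_types(caption=str), even though the signature still annotates List[Document].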