Update README.md
README.md CHANGED
@@ -22,7 +22,7 @@ inference:
   max_length: 300
 ---
 
-# LongCap: Finetuned [BLIP](https://huggingface.co/Salesforce/blip-image-captioning-
+# LongCap: Finetuned [BLIP](https://huggingface.co/Salesforce/blip-image-captioning-large) for generating long captions of images, suitable for prompts for text-to-image generation and captioning text-to-image datasets
 
 
 ## Usage
@@ -41,8 +41,8 @@ import requests
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
-processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
-model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap")
+processor = BlipProcessor.from_pretrained("unography/blip-large-long-cap")
+model = BlipForConditionalGeneration.from_pretrained("unography/blip-large-long-cap")
 
 img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
 raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
@@ -51,7 +51,7 @@ inputs = processor(raw_image, return_tensors="pt")
 pixel_values = inputs.pixel_values
 out = model.generate(pixel_values=pixel_values, max_length=250)
 print(processor.decode(out[0], skip_special_tokens=True))
->>> a
+>>> a woman sitting on the beach, wearing a checkered shirt and a dog collar. the woman is interacting with the dog, which is positioned towards the left side of the image. the setting is a beachfront with a calm sea and a golden hue.
 
 ```
 </details>
@@ -68,8 +68,8 @@ import requests
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
-processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
-model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap").to("cuda")
+processor = BlipProcessor.from_pretrained("unography/blip-large-long-cap")
+model = BlipForConditionalGeneration.from_pretrained("unography/blip-large-long-cap").to("cuda")
 
 img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
 raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
@@ -78,7 +78,7 @@ inputs = processor(raw_image, return_tensors="pt").to("cuda")
 pixel_values = inputs.pixel_values
 out = model.generate(pixel_values=pixel_values, max_length=250)
 print(processor.decode(out[0], skip_special_tokens=True))
->>> a
+>>> a woman sitting on the beach, wearing a checkered shirt and a dog collar. the woman is interacting with the dog, which is positioned towards the left side of the image. the setting is a beachfront with a calm sea and a golden hue.
 ```
 </details>
 
@@ -93,8 +93,8 @@ import requests
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
-processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
-model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap", torch_dtype=torch.float16).to("cuda")
+processor = BlipProcessor.from_pretrained("unography/blip-large-long-cap")
+model = BlipForConditionalGeneration.from_pretrained("unography/blip-large-long-cap", torch_dtype=torch.float16).to("cuda")
 
 img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
 raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
@@ -103,6 +103,6 @@ inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
 pixel_values = inputs.pixel_values
 out = model.generate(pixel_values=pixel_values, max_length=250)
 print(processor.decode(out[0], skip_special_tokens=True))
->>> a
+>>> a woman sitting on the beach, wearing a checkered shirt and a dog collar. the woman is interacting with the dog, which is positioned towards the left side of the image. the setting is a beachfront with a calm sea and a golden hue.
 ```
 </details>