Update README.md
README.md CHANGED
@@ -22,7 +22,7 @@ inference:
   max_length: 300
 ---
 
-# LongCap: Finetuned [BLIP](https://huggingface.co/Salesforce/blip-image-captioning-
+# LongCap: Finetuned [BLIP](https://huggingface.co/Salesforce/blip-image-captioning-large) for generating long captions of images, suitable for prompts for text-to-image generation and captioning text-to-image datasets
 
 
 ## Usage
@@ -41,8 +41,8 @@ import requests
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
-processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
-model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap")
+processor = BlipProcessor.from_pretrained("unography/blip-large-long-cap")
+model = BlipForConditionalGeneration.from_pretrained("unography/blip-large-long-cap")
 
 img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
 raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
@@ -51,7 +51,7 @@ inputs = processor(raw_image, return_tensors="pt")
 pixel_values = inputs.pixel_values
 out = model.generate(pixel_values=pixel_values, max_length=250)
 print(processor.decode(out[0], skip_special_tokens=True))
->>> a
+>>> a woman sitting on the beach, wearing a checkered shirt and a dog collar. the woman is interacting with the dog, which is positioned towards the left side of the image. the setting is a beachfront with a calm sea and a golden hue.
 
 ```
 </details>
@@ -68,8 +68,8 @@ import requests
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
-processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
-model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap").to("cuda")
+processor = BlipProcessor.from_pretrained("unography/blip-large-long-cap")
+model = BlipForConditionalGeneration.from_pretrained("unography/blip-large-long-cap").to("cuda")
 
 img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
 raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
@@ -78,7 +78,7 @@ inputs = processor(raw_image, return_tensors="pt").to("cuda")
 pixel_values = inputs.pixel_values
 out = model.generate(pixel_values=pixel_values, max_length=250)
 print(processor.decode(out[0], skip_special_tokens=True))
->>> a
+>>> a woman sitting on the beach, wearing a checkered shirt and a dog collar. the woman is interacting with the dog, which is positioned towards the left side of the image. the setting is a beachfront with a calm sea and a golden hue.
 ```
 </details>
 
@@ -93,8 +93,8 @@ import requests
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
-processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
-model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap", torch_dtype=torch.float16).to("cuda")
+processor = BlipProcessor.from_pretrained("unography/blip-large-long-cap")
+model = BlipForConditionalGeneration.from_pretrained("unography/blip-large-long-cap", torch_dtype=torch.float16).to("cuda")
 
 img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
 raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
@@ -103,6 +103,6 @@ inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
 pixel_values = inputs.pixel_values
 out = model.generate(pixel_values=pixel_values, max_length=250)
 print(processor.decode(out[0], skip_special_tokens=True))
->>> a
+>>> a woman sitting on the beach, wearing a checkered shirt and a dog collar. the woman is interacting with the dog, which is positioned towards the left side of the image. the setting is a beachfront with a calm sea and a golden hue.
 ```
 </details>