Update README.md
README.md
CHANGED
@@ -46,6 +46,95 @@ Lyrics can take images, text, and visual objects as input, and text and spatial
* pytorch 1.12 and above
* CUDA 11.3 and above are recommended (this is for GPU users); a quick environment check is sketched below
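As a quick sanity check of the environment (a minimal sketch that only verifies the versions listed above, nothing model-specific), you can confirm the PyTorch and CUDA setup from Python:

```python
import torch

# Quick check against the requirements above:
# PyTorch 1.12+ and, for GPU users, CUDA 11.3+.
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA runtime:", torch.version.cuda)
    print("GPU:", torch.cuda.get_device_name(0))
```
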
## 使用 Usage
首先加载Ziya-Visual模型:需要注意的是Visual-Ziya的模型仓库只包含视觉模型部分的参数,Ziya LLM部分的参数通过[Ziya-LLaMA-13B-v1](https://huggingface.co/IDEA-CCNL/Ziya-LLaMA-13B-v1)获得。得到这两部分的模型参数后,我们加载模型:
First, load the Ziya-Visual model. Note that the Visual-Ziya repository contains only the parameters of the visual part of the model; the Ziya LLM parameters are obtained from [Ziya-LLaMA-13B-v1](https://huggingface.co/IDEA-CCNL/Ziya-LLaMA-13B-v1). Once both sets of parameters are in place, load the model:
```python
import os

import torch
from PIL import Image
from peft import PeftModel
from torchvision.transforms import Compose, Normalize, Resize, ToTensor
from transformers import InstructBlipProcessor

import fengshen.models.Lyrics.groundingdino.transforms as T
from fengshen.models.Lyrics.modeling_lyrics import LyricsLMForConditionalGeneration

OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
device = "cuda" if torch.cuda.is_available() else "cpu"

_MODEL_PATH = "your_model_path"

# Text/image processor for the InstructBLIP-style inputs (Q-Former + LLM tokenization).
processor = InstructBlipProcessor.from_pretrained(
    os.path.join(_MODEL_PATH, "vicuna-13b_processor"), padding_side="left"
)

# Preprocessing for the Grounding DINO branch.
grounding_transforms = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Preprocessing for the RAM (Recognize Anything) branch.
ram_transforms = Compose([
    Resize((384, 384)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406],
              std=[0.229, 0.224, 0.225]),
])

# Load the base model, then attach the PEFT weights stored in the same directory.
model = LyricsLMForConditionalGeneration.from_pretrained(_MODEL_PATH).to(device).eval().float()
model = PeftModel.from_pretrained(model, _MODEL_PATH).to(device).eval().float()
model.requires_grad_(False)

prompt = [
    "Question A",
    "Question B",
]

image_url = [
    "Img Path A",
    "Img Path B",
]

for image, text in zip(image_url, prompt):
    image = Image.open(image).convert("RGB")
    ram_pixel_values = ram_transforms(image).unsqueeze(0).to(device)
    grounding_pixel_values = [grounding_transforms(image, None)[0]]

    inputs = processor(images=image, text=text, return_tensors="pt").to(device)

    outputs = model.generate(
        pixel_values=inputs.pixel_values,
        ram_pixel_values=ram_pixel_values,
        grounding_pixel_values=grounding_pixel_values,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        qformer_input_ids=inputs.qformer_input_ids,
        qformer_attention_mask=inputs.qformer_attention_mask,
        do_sample=False,
        num_beams=5,
        max_length=256,
        min_length=1,
        length_penalty=1.0,
    )
    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    print(generated_text, "\n")
```
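The call above decodes with beam search (`num_beams=5`, `do_sample=False`). For more varied answers you can pass the standard Hugging Face `generate` sampling arguments instead; the sketch below is only an illustration, and the specific values are not a recommendation from the model authors:

```python
# Sketch: nucleus sampling instead of beam search (illustrative values; tune for your prompts).
outputs = model.generate(
    pixel_values=inputs.pixel_values,
    ram_pixel_values=ram_pixel_values,
    grounding_pixel_values=grounding_pixel_values,
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    qformer_input_ids=inputs.qformer_input_ids,
    qformer_attention_mask=inputs.qformer_attention_mask,
    do_sample=True,
    temperature=0.3,
    top_p=0.1,
    repetition_penalty=1.5,
    max_length=256,
)
```
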
### 零样本图像描述 & 通用视觉问答 (Zero-shot Image Captioning & General VQA)
![](assets/image_caption_vqa.jpg)