LinhIcey committed a813303 (parent: c739eb5)

Update README.md

Files changed: README.md (+89 −0)
* PyTorch 1.12 or above
* CUDA 11.3 or above is recommended (for GPU users)
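
As a quick, illustrative sanity check (not part of the original README), you can confirm the environment meets these requirements:

```python
# Illustrative environment check for the requirements above.
import torch

print(torch.__version__)           # expect 1.12 or above
print(torch.version.cuda)          # expect 11.3 or above for GPU use
print(torch.cuda.is_available())   # True if a usable GPU is present
```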
## Usage

First, load the Ziya-Visual model. Note that the Visual-Ziya repository contains only the parameters of the visual part of the model; the parameters of the Ziya LLM part are obtained from [Ziya-LLaMA-13B-v1](https://huggingface.co/IDEA-CCNL/Ziya-LLaMA-13B-v1).
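
As a minimal sketch (not from the original README), both sets of weights can be fetched with `huggingface_hub`; the visual repo id below is a placeholder for this repository, and Ziya-LLaMA-13B-v1 may require additional conversion steps described in its own model card:

```python
# Hypothetical download sketch; the repo ids and paths are illustrative.
from huggingface_hub import snapshot_download

# Visual-model parameters (this repository; substitute its actual repo id).
visual_path = snapshot_download(repo_id="your_visual_repo_id")

# Ziya LLM parameters.
llm_path = snapshot_download(repo_id="IDEA-CCNL/Ziya-LLaMA-13B-v1")
```

With both sets of parameters in place under `your_model_path`, we load the model and build the image preprocessing pipelines: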
```python
import os

import torch
from PIL import Image
from peft import PeftModel
from torchvision.transforms import Compose, Normalize, Resize, ToTensor
from transformers import InstructBlipProcessor

import fengshen.models.Lyrics.groundingdino.transforms as T
from fengshen.models.Lyrics.modeling_lyrics import LyricsLMForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

_MODEL_PATH = "your_model_path"

# Processor that prepares the image, the LLM prompt, and the Q-Former text.
processor = InstructBlipProcessor.from_pretrained(
    os.path.join(_MODEL_PATH, "vicuna-13b_processor"), padding_side="left"
)

# Preprocessing for the Grounding DINO branch.
grounding_transforms = T.Compose(
    [
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

# Preprocessing for the RAM branch.
ram_transforms = Compose(
    [
        Resize((384, 384)),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Load the base model, then apply the PEFT adapter weights on top of it.
model = LyricsLMForConditionalGeneration.from_pretrained(_MODEL_PATH).to(device).eval().float()
model = PeftModel.from_pretrained(model, _MODEL_PATH).to(device).eval().float()
model.requires_grad_(False)  # inference only

prompt = [
    "Question A",
    "Question B",
]

image_url = [
    "Img Path A",
    "Img Path B",
]

for image_path, text in zip(image_url, prompt):
    image = Image.open(image_path).convert("RGB")
    ram_pixel_values = ram_transforms(image).unsqueeze(0).to(device)
    grounding_pixel_values = [grounding_transforms(image, None)[0]]

    inputs = processor(images=image, text=text, return_tensors="pt").to(device)

    outputs = model.generate(
        pixel_values=inputs.pixel_values,
        ram_pixel_values=ram_pixel_values,
        grounding_pixel_values=grounding_pixel_values,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        qformer_input_ids=inputs.qformer_input_ids,
        qformer_attention_mask=inputs.qformer_attention_mask,
        do_sample=False,   # deterministic beam search
        num_beams=5,
        max_length=256,
        min_length=1,
        length_penalty=1.0,
    )
    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    print(generated_text, "\n")
```
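
The original snippet also imports `gradio`. A minimal, untested sketch of wrapping the pipeline above into an interactive demo (the `answer_question` helper and widget labels are illustrative, not part of this repo) could look like:

```python
# Hypothetical Gradio demo reusing `model`, `processor`, `ram_transforms`,
# and `grounding_transforms` from the snippet above.
import gradio as gr
from PIL import Image

def answer_question(image: Image.Image, question: str) -> str:
    image = image.convert("RGB")
    ram_pixel_values = ram_transforms(image).unsqueeze(0).to(device)
    grounding_pixel_values = [grounding_transforms(image, None)[0]]
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
    outputs = model.generate(
        pixel_values=inputs.pixel_values,
        ram_pixel_values=ram_pixel_values,
        grounding_pixel_values=grounding_pixel_values,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        qformer_input_ids=inputs.qformer_input_ids,
        qformer_attention_mask=inputs.qformer_attention_mask,
        do_sample=False,
        num_beams=5,
        max_length=256,
    )
    return processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

demo = gr.Interface(
    fn=answer_question,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Question")],
    outputs=gr.Textbox(label="Answer"),
)
demo.launch()
```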

### Zero-shot Image Captioning & General VQA

![](assets/image_caption_vqa.jpg)