Update README.md
README.md
CHANGED
@@ -46,6 +46,95 @@ Lyrics can take images, text, and visual objects as input, and text and spatial
* pytorch 1.12 and above
* CUDA 11.3 and above are recommended (this is for GPU users); a quick environment check is sketched below
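As a quick sanity check of the environment (a minimal sketch that only verifies the versions listed above, nothing model-specific), you can confirm the PyTorch and CUDA setup from Python:

```python
import torch

# Quick check against the requirements above:
# PyTorch 1.12+ and, for GPU users, CUDA 11.3+.
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA runtime:", torch.version.cuda)
    print("GPU:", torch.cuda.get_device_name(0))
```
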
## 使用 Usage
首先加载Ziya-Visual模型:需要注意的是Visual-Ziya的模型仓库只包含视觉模型部分的参数,Ziya LLM部分的参数通过[Ziya-LLaMA-13B-v1](https://huggingface.co/IDEA-CCNL/Ziya-LLaMA-13B-v1)获得。得到这两部分的模型参数后,我们加载模型:
First, load the Ziya-Visual model. Note that the Visual-Ziya repository contains only the parameters of the visual part of the model; the Ziya LLM parameters are obtained from [Ziya-LLaMA-13B-v1](https://huggingface.co/IDEA-CCNL/Ziya-LLaMA-13B-v1). Once both sets of parameters are in place, load the model:
```python
import os

import torch
from PIL import Image
from peft import PeftModel
from torchvision.transforms import Compose, Normalize, Resize, ToTensor
from transformers import InstructBlipProcessor

import fengshen.models.Lyrics.groundingdino.transforms as T
from fengshen.models.Lyrics.modeling_lyrics import LyricsLMForConditionalGeneration

OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
device = "cuda" if torch.cuda.is_available() else "cpu"

_MODEL_PATH = "your_model_path"

# Text/image processor for the InstructBLIP-style inputs (Q-Former + LLM tokenization).
processor = InstructBlipProcessor.from_pretrained(
    os.path.join(_MODEL_PATH, "vicuna-13b_processor"), padding_side="left"
)

# Preprocessing for the Grounding DINO branch.
grounding_transforms = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Preprocessing for the RAM (Recognize Anything) branch.
ram_transforms = Compose([
    Resize((384, 384)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406],
              std=[0.229, 0.224, 0.225]),
])

# Load the base model, then attach the PEFT weights stored in the same directory.
model = LyricsLMForConditionalGeneration.from_pretrained(_MODEL_PATH).to(device).eval().float()
model = PeftModel.from_pretrained(model, _MODEL_PATH).to(device).eval().float()
model.requires_grad_(False)

prompt = [
    "Question A",
    "Question B",
]

image_url = [
    "Img Path A",
    "Img Path B",
]

for image, text in zip(image_url, prompt):
    image = Image.open(image).convert("RGB")
    ram_pixel_values = ram_transforms(image).unsqueeze(0).to(device)
    grounding_pixel_values = [grounding_transforms(image, None)[0]]

    inputs = processor(images=image, text=text, return_tensors="pt").to(device)

    outputs = model.generate(
        pixel_values=inputs.pixel_values,
        ram_pixel_values=ram_pixel_values,
        grounding_pixel_values=grounding_pixel_values,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        qformer_input_ids=inputs.qformer_input_ids,
        qformer_attention_mask=inputs.qformer_attention_mask,
        do_sample=False,
        num_beams=5,
        max_length=256,
        min_length=1,
        length_penalty=1.0,
    )
    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    print(generated_text, "\n")
```
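The call above decodes with beam search (`num_beams=5`, `do_sample=False`). For more varied answers you can pass the standard Hugging Face `generate` sampling arguments instead; the sketch below is only an illustration, and the specific values are not a recommendation from the model authors:

```python
# Sketch: nucleus sampling instead of beam search (illustrative values; tune for your prompts).
outputs = model.generate(
    pixel_values=inputs.pixel_values,
    ram_pixel_values=ram_pixel_values,
    grounding_pixel_values=grounding_pixel_values,
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    qformer_input_ids=inputs.qformer_input_ids,
    qformer_attention_mask=inputs.qformer_attention_mask,
    do_sample=True,
    temperature=0.3,
    top_p=0.1,
    repetition_penalty=1.5,
    max_length=256,
)
```
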
### 零样本图像描述 & 通用视觉问答 (Zero-shot Image Captioning & General VQA)
![](assets/image_caption_vqa.jpg)