llava-hf
/

llava-v1.6-mistral-7b-hf

@@ -3,6 +3,9 @@ license: apache-2.0
 tags:
 - vision
 - image-text-to-text
 ---
 # LLaVa-Next, leveraging [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) as LLM
@@ -47,7 +50,20 @@ model.to("cuda:0")
 # prepare image and text prompt, using the appropriate prompt template
 url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
 image = Image.open(requests.get(url, stream=True).raw)
-prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
 inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")

 tags:
 - vision
 - image-text-to-text
+language:
+- en
+pipeline_tag: image-text-to-text
 ---
 # LLaVa-Next, leveraging [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) as LLM
 # prepare image and text prompt, using the appropriate prompt template
 url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
 image = Image.open(requests.get(url, stream=True).raw)
+# Define a chat histiry and use `apply_chat_template` to get correctly formatted prompt
+# Each value in "content" has to be a list of dicts with types ("text", "image")
+conversation = [
+    {
+      "role": "user",
+      "content": [
+          {"type": "text", "text": "What is shown in this image?"},
+          {"type": "image"},
+        ],
+    },
+]
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")