llava-hf/LLaVA-NeXT-Video-7B-hf · Update chat_template.json to incorporate `generation` tag

This change adds the `{% generation %}` / `{% endgeneration %}` tags to chat_template.json so that `return_assistant_tokens_mask` works correctly with the tokenizer (see https://github.com/huggingface/transformers/pull/30650/files). A self-contained test script is pasted below.

from transformers import LlavaNextVideoProcessor

# Hub checkpoint whose processor is loaded for this test.
MODEL_ID = "llava-hf/LLaVA-NeXT-Video-7B-hf"
processor = LlavaNextVideoProcessor.from_pretrained(MODEL_ID)

# Build a chat history to feed to `apply_chat_template`.
# Every message's "content" is a list of dicts, each tagged with a "type";
# this conversation uses the "text" and "video" types.
def _text(value):
    """Return a text content part holding *value*."""
    return {"type": "text", "text": value}


def _message(role, *parts):
    """Return one chat message for *role* whose content is *parts*, in order."""
    return {"role": role, "content": list(parts)}


conversation = [
    _message("system", _text("You are a helpful assistant.")),
    # The video part carries no payload here; it is only a typed marker.
    _message("user", _text("What is shown in this video?"), {"type": "video"}),
    _message("assistant", _text("This is a video about a cat.")),
    _message("user", _text("What is the cat doing?")),
    _message(
        "assistant",
        _text("The cat is sleeping on a sofa. It looks very comfortable."),
    ),
]

# Jinja chat template, assembled from adjacent string literals so each
# directive sits on its own line. For every message it emits:
#   1. the upper-cased role followed by ': ' (skipped for system messages),
#   2. one '<image>' placeholder line per image content part,
#   3. one '<video>' placeholder line per video content part,
#   4. each text part followed by a single space.
# Assistant text is wrapped in {% generation %} ... {% endgeneration %}; per
# the description above, this is what lets `return_assistant_tokens_mask`
# identify assistant tokens. With `add_generation_prompt` set, a trailing
# 'ASSISTANT:' is appended.
template = (
    "{% for message in messages %}"
    "{% if message['role'] != 'system' %}"
    "{{ message['role'].upper() + ': '}}"
    "{% endif %}"
    "{# Render all images first #}"
    "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}"
    "{{ '<image>\n' }}"
    "{% endfor %}"
    # Fixed: this Jinja comment was a copy-paste of the image one above.
    "{# Render all videos next #}"
    "{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}"
    "{{ '<video>\n' }}"
    "{% endfor %}"
    "{# Render all text next #}"
    "{% if message['role'] != 'assistant' %}"
    "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}"
    "{{ content['text'] + ' '}}"
    "{% endfor %}"
    "{% else %}"
    # Only assistant text goes inside the generation block.
    "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}"
    "{% generation %}"
    "{{ content['text'] + ' '}}"
    "{% endgeneration %}"
    "{% endfor %}"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ 'ASSISTANT:' }}"
    "{% endif %}"
)
# Print the template as a single escaped string literal, padded with blank
# lines so it stands out in the console output.
print("\n")
print(repr(template))
print("\n")

# Keyword arguments common to both calls below: use the template defined in
# this script, and do not append the trailing generation prompt.
shared_kwargs = {"chat_template": template, "add_generation_prompt": False}

# Pass 1: render the conversation to text only, for visual inspection.
prompt = processor.apply_chat_template(conversation, tokenize=False, **shared_kwargs)
print(prompt)

# Pass 2: tokenize the same conversation and request the assistant-token mask,
# then print the mask stored under 'assistant_masks' in the returned dict.
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_assistant_tokens_mask=True,
    return_dict=True,
    **shared_kwargs,
)
print(inputs['assistant_masks'])