File size: 3,824 Bytes
3a496ae
 
 
1cb9066
3a496ae
 
 
 
 
 
 
 
 
 
 
 
de447f9
3a496ae
 
 
0ba1d16
b3402e9
42a01c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e711763
de447f9
 
6ab097e
3a496ae
 
 
 
 
 
 
 
cfe5653
3a496ae
 
 
 
 
 
 
 
 
 
 
42a01c3
3a496ae
cfe5653
42a01c3
 
3a496ae
cfe5653
b3402e9
0ba1d16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b85fc6
 
0ba1d16
8b85fc6
a606624
890b2c5
0ba1d16
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import csv
import json
import torch
import argparse
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from collections import defaultdict
from transformers.models.llama.tokenization_llama import LlamaTokenizer
from torch.utils.data import DataLoader
from mplug_owl_video.modeling_mplug_owl import MplugOwlForConditionalGeneration
from mplug_owl_video.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor
from peft import LoraConfig, get_peft_model
from data_utils.xgpt3_dataset import MultiModalDataset
from utils import batchify
from huggingface_hub import hf_hub_download

import gradio as gr
from entailment_inference import get_scores
from nle_inference import VideoCaptionDataset, get_nle

import re

def modify_keys(state_dict):
    """Return a copy of ``state_dict`` with projection weights re-keyed.

    Any key that names the ``.weight`` of a ``q_proj``, ``v_proj``,
    ``k_proj``, ``o_proj``, ``gate_proj``, ``down_proj`` or ``up_proj``
    module located under ``language_model`` gets ``base_layer`` inserted
    before its last component, e.g.
    ``...q_proj.weight`` -> ``...q_proj.base_layer.weight``.
    Every other entry is copied through unchanged.

    Presumably this adapts a checkpoint saved with the older key layout to
    the layout used by the PEFT-wrapped model -- confirm against the
    installed ``peft`` version.

    Args:
        state_dict: Mapping from parameter names to values.

    Returns:
        A new plain ``dict`` holding the same values in the same order; the
        input mapping is not mutated.
    """
    # The final dot is escaped and the whole key must match, so look-alike
    # names such as "q_projXweight" or "q_proj.weight_extra" are left alone.
    pattern = re.compile(
        r'.*language_model.*\.(q_proj|v_proj|k_proj|o_proj|gate_proj|down_proj|up_proj)\.weight'
    )

    new_state_dict = {}
    for key, value in state_dict.items():
        if pattern.fullmatch(key):
            prefix, _, leaf = key.rpartition('.')
            key = f'{prefix}.base_layer.{leaf}'
        new_state_dict[key] = value

    return new_state_dict

# Hub id of the base model; the tokenizer and image processor are loaded
# from the same id below.
pretrained_ckpt = "MAGAer13/mplug-owl-llama-7b-video"
# Fetch "pytorch_model.bin" from the "videocon/owl-con" model repo; the
# returned value is opened as a local file further down.
trained_ckpt = hf_hub_download(repo_id="videocon/owl-con", filename="pytorch_model.bin", repo_type="model")


# Text tokenizer + image processor, combined into a single processor object.
tokenizer = LlamaTokenizer.from_pretrained(pretrained_ckpt)
image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt)
processor = MplugOwlProcessor(image_processor, tokenizer)

# Instantiate the base model in bfloat16 with the whole model mapped to CPU;
# it is moved to the GPU only after the fine-tuned weights are loaded.
model = MplugOwlForConditionalGeneration.from_pretrained(
    pretrained_ckpt,
    torch_dtype=torch.bfloat16,
    device_map={'': 'cpu'}
)

# LoRA configuration targeting the language-model projection modules; the
# module-name alternatives are the same ones modify_keys() matches on.
peft_config = LoraConfig(
    target_modules=r'.*language_model.*\.(q_proj|v_proj|k_proj|o_proj|gate_proj|down_proj|up_proj)', 
    inference_mode=True, 
    r=32, 
    lora_alpha=16, 
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Deserialize the downloaded checkpoint onto CPU, then rename its keys with
# modify_keys() before loading it into the wrapped model.
# NOTE(review): torch.load unpickles the downloaded file. If the installed
# torch version supports it and the checkpoint holds only tensors, consider
# passing weights_only=True -- confirm before changing.
with open(trained_ckpt, 'rb') as f:
    ckpt = torch.load(f, map_location = torch.device("cpu"))
    ckpt = modify_keys(ckpt)
    
# Called with default arguments -- presumably strict, so missing/unexpected
# keys would raise; confirm.
model.load_state_dict(ckpt)
# Move to the hard-coded device "cuda:0" and cast to bfloat16.
model = model.to("cuda:0").to(torch.bfloat16)

def inference(videopath, text):
    """Score a caption against a video and, for low scores, explain why.

    Builds an entailment prompt from ``text``, runs ``get_scores`` on a
    one-off ``MultiModalDataset`` loader, and returns the resulting score.
    When the score is below 0.5, ``get_nle`` is additionally run on a
    ``VideoCaptionDataset`` loader to produce a natural language
    explanation; otherwise a fixed placeholder string is returned instead.

    Args:
        videopath: Video input forwarded to the dataset constructors
            (presumably a path to the video file -- confirm against the
            dataset classes).
        text: Caption inserted into the prompt and passed to
            ``VideoCaptionDataset``.

    Returns:
        A ``(score, nle)`` tuple.
    """
    # Kept as one triple-quoted literal: the leading spaces on its
    # continuation lines are part of the prompt text sent to the model.
    prompt_template = """The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
    Human: <|video|>
    Human: Does this video entail the description: "{caption}"?
    AI: """

    entailment_prompt = prompt_template.format(caption=text)
    entailment_data = MultiModalDataset(
        videopath,
        entailment_prompt,
        tokenizer,
        processor,
        max_length=256,
        loss_objective='sequential',
    )
    entailment_loader = DataLoader(entailment_data, pin_memory=True, collate_fn=batchify)
    score = get_scores(model, tokenizer, entailment_loader)

    # Guard clause: no explanation is generated unless the score is below 0.5.
    if not (score < 0.5):
        return score, "None (NLE is only triggered when entailment score < 0.5)"

    caption_loader = DataLoader(VideoCaptionDataset(videopath, text))
    explanation = get_nle(model, processor, tokenizer, caption_loader)
    return score, explanation

# Example (video, caption) rows offered in the UI: each video is listed with
# two different captions.
_EXAMPLE_ROWS = [
    ["examples/820.mp4", "We see the group making cookies."],
    ["examples/820.mp4", "We see the group eating cookies."],
    ["examples/244.mp4", "She throws a bowling ball while talking on the phone."],
    ["examples/244.mp4", "She throws a baseball while talking on the phone."],
]

# Gradio UI: a video and a caption in; the entailment score and the natural
# language explanation returned by inference() out.
demo = gr.Interface(
    fn=inference,
    title="Owl-Con Demo",
    description="Owl-Con Demo (Code: https://github.com/Hritikbansal/videocon | Paper: https://arxiv.org/abs/2311.10111)",
    inputs=[
        gr.Video(label='input_video'),
        gr.Textbox(label='input_caption'),
    ],
    outputs=[
        gr.Number(label='Entailment Score'),
        gr.Textbox(label='Natural Language Explanation'),
    ],
    examples=_EXAMPLE_ROWS,
)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()