File size: 3,386 Bytes
5fb8331
 
 
92f266c
5fb8331
92f266c
 
 
9871891
 
 
 
c78e00d
9871891
 
 
 
92f266c
9871891
92f266c
 
 
5fb8331
92f266c
 
08ba28a
92f266c
 
 
 
 
9871891
92f266c
 
5fb8331
 
 
c837039
 
 
 
 
 
87cbf92
 
 
 
8a67c3f
5fb8331
 
 
 
e80df4f
5fb8331
 
92f266c
ce45613
92f266c
cbd9557
92f266c
 
 
 
 
 
 
 
 
 
 
 
 
08ba28a
9871891
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import warnings
warnings.filterwarnings("ignore")
import gradio as gr
from src.video_model import describe_video  # Assuming this function processes the video and query

# --- Function to construct the final query --- 
def process_video_and_questions(video, sitting: bool, hands: bool, location: bool, screen: bool):
    """Build the prompt from the ticked options and run it against the video.

    The prompt always starts with a fixed "describe and answer" instruction;
    one follow-up question is appended for every option that is truthy, in
    the order sitting, hands, location, screen.

    Args:
        video: Video input, forwarded unchanged to ``describe_video``.
        sitting: Ask whether the subject is standing or sitting.
        hands: Ask whether the subject's hands are free.
        location: Ask whether the subject is indoors or outdoors.
        screen: Ask whether the subject is interacting with a screen.

    Returns:
        Whatever ``describe_video`` returns for ``video`` and the final query.
    """
    query = "Describe this video in detail and answer the questions"
    optional_questions = (
        (sitting, "Is the subject in the video standing or sitting?"),
        (hands, "Is the subject holding any object in their hands, if so the hands are not free else they are free?"),
        (location, "Is the subject present indoors or outdoors?"),
        (screen, "Is the subject interacting with a screen in the background by facing the screen?"),
    )
    selected = [question for enabled, question in optional_questions if enabled]

    # Join only the parts that exist so the prompt carries no dangling
    # trailing space when no option is ticked.
    final_query = " ".join([query, *selected])
    return describe_video(video, final_query)

# Video and text inputs for the interface
video = gr.Video(label="Video")

# One checkbox per optional question; the captions are listed in the same
# order as the names they are unpacked into.
sitting, hands, location, screen = (
    gr.Checkbox(label=caption)
    for caption in (
        "Sitting/Standing",
        "Hands Free/Not Free",
        "Indoors/Outdoors",
        "Screen Interaction",
    )
)

# Text box that shows the value returned by the handler
response = gr.Textbox(
    label="Predicted answer",
    show_label=True,
    show_copy_button=True,
)

# Examples for the interface
# Each row supplies a value for the first input (the video) only.
# Every clip is listed once; the list previously repeated the
# "Morning_News_at_6AM" clip.
examples = [
    ["videos/2016-01-01_0100_US_KNBC_Channel_4_News_1867.16-1871.38_now.mp4"],
    ["videos/2016-01-01_0200_US_KNBC_Channel_4_News_1329.12-1333.29_tonight.mp4"],
    ["videos/2016-01-01_0830_US_KNBC_Tonight_Show_with_Jimmy_Fallon_725.45-729.76_tonight.mp4"],
    ["videos/2016-01-01_0200_US_KOCE_The_PBS_Newshour_577.03-581.31_tonight.mp4"],
    ["videos/2016-01-01_1400_US_KTTV-FOX_Morning_News_at_6AM_1842.36-1846.68_this_year.mp4"],
    ["videos/2016-01-02_0735_US_KCBS_Late_Show_with_Stephen_Colbert_285.94-290.67_this_year.mp4"],
    ["videos/2016-01-13_2200_US_KTTV-FOX_The_Doctor_Oz_Show_1709.79-1714.17_this_month.mp4"],
    ["videos/2016-01-01_1300_US_KNBC_Today_in_LA_at_5am_12.46-16.95_this_morning.mp4"],
    ["videos/2016-01-05_0200_US_KNBC_Channel_4_News_1561.29-1565.95_next_week.mp4"],
    ["videos/2016-01-28_0700_US_KNBC_Channel_4_News_at_11PM_629.56-633.99_in_the_future.mp4"],
]

# Title, description, and article for the interface
title = "GSoC Super Raid Annotator"
description = "Annotate Videos"

# Centered footer with links to the model's GitHub repo and model page.
# Adjacent literals are concatenated by the parser into a single string.
article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/OpenBMB/MiniCPM-V' target='_blank'>Model GitHub Repo</a>"
    " | "
    "<a href='https://huggingface.co/openbmb/MiniCPM-V-2_6' target='_blank'>Model Page</a>"
    "</p>"
)


# Soft theme with both the primary and the secondary hue set to red
custom_theme = gr.themes.Soft(primary_hue="red", secondary_hue="red")

# Launch the interface
interface = gr.Interface(
    # Wiring: handler, its five inputs (video + four checkboxes), and the output box
    fn=process_video_and_questions,
    inputs=[video, sitting, hands, location, screen],
    outputs=response,
    # Presentation
    title=title,
    description=description,
    article=article,
    theme=custom_theme,
    # Clickable sample rows; flagging is turned off
    examples=examples,
    allow_flagging="never",
)

# Start the app with debug mode off
interface.launch(debug=False)