File size: 3,960 Bytes
0ec47c6
5fb8331
 
 
5443ab4
 
5fb8331
0ec47c6
1b7a970
0ec47c6
9871891
 
 
 
5443ab4
9871891
 
 
 
0ec47c6
9871891
0ec47c6
3fdad44
08ba28a
57a114d
5443ab4
57a114d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60a8041
57a114d
 
 
 
 
 
 
 
 
 
 
 
5443ab4
 
 
 
 
 
 
 
 
 
 
57a114d
92f266c
57a114d
 
 
 
 
 
60a8041
57a114d
 
 
 
 
60a8041
57a114d
5443ab4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# --- main.py (your Gradio app file) ---
import os
import warnings
warnings.filterwarnings("ignore")  # must run before the imports below so their warnings are suppressed

import gradio as gr

from src.video_model import describe_video  # Ensure this function is defined in src.video_model
from src.text_processor import process_description  # Ensure this function is defined in src.text_processor

# --- Function to build the prompt and describe the video ---
def process_video(video, sitting, hands, location, screen):
    """Build the model prompt from the ticked options and describe the video.

    The prompt always starts with a generic "describe this video" instruction;
    one follow-up question is appended for every option that is truthy.

    Args:
        video: Value of the video input; forwarded unchanged to
            ``describe_video``. (Presumably a file path supplied by
            ``gr.Video`` -- confirm against ``describe_video``.)
        sitting: Truthy to ask whether the subject is standing or sitting.
        hands: Truthy to ask whether the subject's hands are free.
        location: Truthy to ask whether the subject is indoors or outdoors.
        screen: Truthy to ask whether the subject is interacting with a screen.

    Returns:
        Whatever ``describe_video`` returns for the video and the built prompt.

    Raises:
        gr.Error: If no video was provided, so the UI shows a readable
            message instead of a failure from inside the model helper.
    """
    if video is None:
        raise gr.Error("Please upload a video before processing.")

    # (flag, question) pairs kept in the order the questions appear in the prompt.
    optional_questions = (
        (sitting, "Is the subject in the video standing or sitting?"),
        (hands, "Is the subject holding any object in their hands, if so, are the hands free?"),
        (location, "Is the subject present indoors or outdoors?"),
        (screen, "Is the subject interacting with a screen in the background by facing the screen?"),
    )

    prompt_parts = ["Describe this video in detail and answer the questions."]
    prompt_parts.extend(question for enabled, question in optional_questions if enabled)

    # A single join avoids the trailing space the prompt used to carry when
    # no option was selected.
    final_query = " ".join(prompt_parts)
    return describe_video(video, final_query)

# --- Function to handle JSON processing ---
def process_and_display_json(video_description):
    """Run ``process_description`` on the description text and return its result.

    Args:
        video_description: Text taken from the "Video Description" textbox.

    Returns:
        The value produced by ``process_description``, shown in the JSON output.
    """
    return process_description(video_description)

# --- Gradio Blocks Interface --- 
with gr.Blocks(title="GSoC Super Raid Annotator", theme=gr.themes.Soft(primary_hue="red", secondary_hue="red")) as demo:
    # Page header: title text, then centered links to the model repo and model page.
    gr.Markdown("Annotate Videos")
    gr.Markdown(
        "<p style='text-align: center'><a href='https://github.com/OpenBMB/MiniCPM-V' target='_blank'>Model GitHub Repo</a> | <a href='https://huggingface.co/openbmb/MiniCPM-V-2_6' target='_blank'>Model Page</a></p>"
    )

    with gr.Row():
        with gr.Column():
            # Input components: the video plus one checkbox per optional question.
            video = gr.Video(label="Video")
            sitting = gr.Checkbox(label="Sitting/Standing")
            hands = gr.Checkbox(label="Hands Free/Not Free")
            location = gr.Checkbox(label="Indoors/Outdoors")
            screen = gr.Checkbox(label="Screen Interaction")

            # Submit buttons
            with gr.Row():
                process_video_btn = gr.Button("Process Video")
                process_json_btn = gr.Button("Process JSON")

        with gr.Column():
            # Output components
            video_description = gr.Textbox(label="Video Description", show_label=True, show_copy_button=True)
            json_output = gr.JSON(label="JSON Output")

    # Examples for the interface (each path listed once).
    example_videos = [
        "videos/2016-01-01_0100_US_KNBC_Channel_4_News_1867.16-1871.38_now.mp4",
        "videos/2016-01-01_0200_US_KNBC_Channel_4_News_1329.12-1333.29_tonight.mp4",
        "videos/2016-01-01_0830_US_KNBC_Tonight_Show_with_Jimmy_Fallon_725.45-729.76_tonight.mp4",
        "videos/2016-01-01_0200_US_KOCE_The_PBS_Newshour_577.03-581.31_tonight.mp4",
        "videos/2016-01-01_1400_US_KTTV-FOX_Morning_News_at_6AM_1842.36-1846.68_this_year.mp4",
        "videos/2016-01-02_0735_US_KCBS_Late_Show_with_Stephen_Colbert_285.94-290.67_this_year.mp4",
        "videos/2016-01-13_2200_US_KTTV-FOX_The_Doctor_Oz_Show_1709.79-1714.17_this_month.mp4",
        "videos/2016-01-01_1300_US_KNBC_Today_in_LA_at_5am_12.46-16.95_this_morning.mp4",
        "videos/2016-01-05_0200_US_KNBC_Channel_4_News_1561.29-1565.95_next_week.mp4",
        "videos/2016-01-28_0700_US_KNBC_Channel_4_News_at_11PM_629.56-633.99_in_the_future.mp4",
    ]

    # gr.Examples takes one row per example and one column per input component.
    # Only files that are actually present are offered, so a missing asset
    # cannot break app start-up.
    examples = [[path] for path in example_videos if os.path.isfile(path)]
    if examples:
        # The list used to be built but never shown; this renders it as
        # clickable samples that fill the video input. Caching is disabled
        # because no function/outputs are attached to the examples.
        gr.Examples(examples=examples, inputs=[video], cache_examples=False)

    # Event handling
    # "Process Video" fills the description textbox from the video and checkboxes.
    process_video_btn.click(
        fn=process_video,
        inputs=[video, sitting, hands, location, screen],
        outputs=video_description,
    )

    # "Process JSON" converts the current description text into the JSON output.
    process_json_btn.click(
        fn=process_and_display_json,
        inputs=video_description,
        outputs=json_output,
    )

# Launch the interface
demo.launch(debug=False)