File size: 3,816 Bytes
0ec47c6
5fb8331
 
 
0ec47c6
 
5fb8331
0ec47c6
1b7a970
0ec47c6
9871891
 
 
 
c78e00d
9871891
 
 
 
0ec47c6
9871891
5fb8331
0ec47c6
 
3fdad44
08ba28a
3fdad44
 
 
 
 
0ec47c6
92f266c
 
 
 
9871891
0ec47c6
 
5fb8331
60a8041
5fb8331
 
c837039
 
 
 
 
 
87cbf92
 
 
 
8a67c3f
5fb8331
 
 
 
e80df4f
5fb8331
 
ce45613
a9d7425
 
d0ede9f
0ec47c6
92f266c
ea0cd9a
92f266c
ea0cd9a
0ec47c6
2b0ca40
92f266c
 
 
 
 
 
ea0cd9a
08ba28a
60a8041
ea0cd9a
 
 
 
 
 
 
 
60a8041
ea0cd9a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# --- main.py (your Gradio app file) ---
import warnings
warnings.filterwarnings("ignore")
import gradio as gr
from src.video_model import describe_video  # Your video processing function
from src.text_processor import process_description

# --- Function to handle both video and text processing ---
def process_video(video, sitting, hands, location, screen):
    """Describe *video* with the model, asking extra questions per checkbox.

    Args:
        video: Path/handle of the uploaded video (passed through to
            ``describe_video``).
        sitting, hands, location, screen: Booleans from the UI checkboxes;
            each enabled flag appends one follow-up question to the prompt.

    Returns:
        The free-text description produced by ``describe_video``.
    """
    query = "Describe this video in detail and answer the questions."

    # Pair each checkbox flag with its follow-up question, then keep only
    # the questions whose flag is set.
    optional_questions = (
        (sitting, "Is the subject in the video standing or sitting?"),
        (hands, "Is the subject holding any object in their hands, if so the hands are not free else they are free?"),
        (location, "Is the subject present indoors or outdoors?"),
        (screen, "Is the subject interacting with a screen in the background by facing the screen?"),
    )
    selected = [question for flag, question in optional_questions if flag]

    # NOTE: mirrors the original formatting — a single space always follows
    # the base query, even when no checkbox is selected.
    final_query = query + " " + " ".join(selected)

    return describe_video(video, final_query)

def process_and_display_json(video_description):
    """Convert the model's free-text description into the structured JSON view.

    Args:
        video_description: Text produced by ``process_video``.

    Returns:
        Whatever ``process_description`` yields for that text (rendered by the
        ``gr.JSON`` component).
    """
    return process_description(video_description)
    
# --- Gradio Interface components ---
# Input widgets: the uploaded video plus four boolean annotation toggles
# that control which follow-up questions are added to the model prompt.
video = gr.Video(label="Video")
sitting = gr.Checkbox(label="Sitting/Standing")
hands = gr.Checkbox(label="Hands Free/Not Free")
location = gr.Checkbox(label="Indoors/Outdoors")
screen = gr.Checkbox(label="Screen Interaction")

# Output widgets: the model's free-text description and its structured form.
video_description = gr.Textbox(label="Video Description", show_label=True, show_copy_button=True)
json_output = gr.JSON(label="JSON Output") 


# Example videos for the interface. Each row supplies only the video input;
# the checkbox inputs fall back to their defaults.
# Fix: the KTTV-FOX "Morning_News_at_6AM ... this_year" clip appeared twice
# in the original list; the duplicate entry has been removed.
examples = [
    ["videos/2016-01-01_0100_US_KNBC_Channel_4_News_1867.16-1871.38_now.mp4"],
    ["videos/2016-01-01_0200_US_KNBC_Channel_4_News_1329.12-1333.29_tonight.mp4"],
    ["videos/2016-01-01_0830_US_KNBC_Tonight_Show_with_Jimmy_Fallon_725.45-729.76_tonight.mp4"],
    ["videos/2016-01-01_0200_US_KOCE_The_PBS_Newshour_577.03-581.31_tonight.mp4"],
    ["videos/2016-01-01_1400_US_KTTV-FOX_Morning_News_at_6AM_1842.36-1846.68_this_year.mp4"],
    ["videos/2016-01-02_0735_US_KCBS_Late_Show_with_Stephen_Colbert_285.94-290.67_this_year.mp4"],
    ["videos/2016-01-13_2200_US_KTTV-FOX_The_Doctor_Oz_Show_1709.79-1714.17_this_month.mp4"],
    ["videos/2016-01-01_1300_US_KNBC_Today_in_LA_at_5am_12.46-16.95_this_morning.mp4"],
    ["videos/2016-01-05_0200_US_KNBC_Channel_4_News_1561.29-1565.95_next_week.mp4"],
    ["videos/2016-01-28_0700_US_KNBC_Channel_4_News_at_11PM_629.56-633.99_in_the_future.mp4"],
]

# Title, description, and article for the interface.
title = "GSoC Super Raid Annotator"
description = "Annotate Videos"
# HTML footer linking to the model's repository and model card.
article = "<p style='text-align: center'><a href='https://github.com/OpenBMB/MiniCPM-V' target='_blank'>Model GitHub Repo</a> | <a href='https://huggingface.co/openbmb/MiniCPM-V-2_6' target='_blank'>Model Page</a></p>"

# Soft theme with a blue accent for the whole app.
custom_theme = gr.themes.Soft(
    primary_hue="blue", 
    secondary_hue="blue",
    neutral_hue="zinc"
)

# --- Create the Interface ---
def _run_pipeline(video_file, sitting_flag, hands_flag, location_flag, screen_flag):
    """Chain both processing steps so one Interface drives both outputs.

    Fix: ``gr.Interface`` has no ``add_component`` method, so the original
    attempt to attach the JSON step after construction raised AttributeError
    at startup and ``json_output`` was never wired. Returning both values
    from a single callback feeds ``video_description`` and ``json_output``
    through the standard multi-output mechanism instead.

    Returns:
        (description_text, json_result) — one value per output component.
    """
    desc = process_video(video_file, sitting_flag, hands_flag, location_flag, screen_flag)
    return desc, process_and_display_json(desc)


interface = gr.Interface(
    fn=_run_pipeline,
    inputs=[video, sitting, hands, location, screen],
    outputs=[video_description, json_output],  # both outputs wired directly
    examples=examples,
    title=title,
    description=description,
    article=article,
    theme=custom_theme,
    allow_flagging="never",
    live=True,  # re-run automatically whenever an input changes
)

interface.launch(debug=False)