import warnings
warnings.filterwarnings("ignore")

import gc
import os
from typing import Dict, List

import gradio as gr
import pandas as pd
import torch

from src.video_model import describe_video
from src.utils import parse_string, parse_annotations
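
# The app annotates short videos with a vision-language model: the single-video
# tab returns the raw <annotation>-tagged response, while the batch tab parses
# each response and collects the annotations into a downloadable CSV.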

# --- Function to process single video ---
def process_video_and_questions(video, standing, hands, location, screen):
    video_name = os.path.basename(video)
    query = f"Answer the questions from the video\n"
    additional_info = []
    if standing:
        additional_info.append("Is the subject in the video standing or sitting?\n")
    if hands:
        additional_info.append("Is the subject holding any object in their hands?\n")
    if location:
        additional_info.append("Is the subject present indoors?\n")
    if screen:
        additional_info.append("Is the subject interacting with a screen in the background by facing the screen?\n")
    
    end_query = """Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present. Follow the below examples\n:
        <annotation>indoors: 0</annotation>
        <annotation>standing: 1</annotation>
        <annotation>hands.free: 0</annotation>
        <annotation>screen.interaction_yes: 0</annotation>
        """

    final_query = query + " " + " ".join(additional_info)
    final_prompt = final_query + " " + end_query
    
    response = describe_video(video, final_prompt)
    final_response = f"<video_name>{video_name}</video_name>" + " \n" + response

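    # For questions the user did not tick, downgrade a spurious "1" from the
    # model to None (a "0" answer is left untouched by this replacement).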
    conditions = {
        'standing': (standing, 'standing: 1', 'standing: None'),
        'hands': (hands, 'hands.free: 1', 'hands.free: None'),
        'location': (location, 'indoors: 1', 'indoors: None'),
        'screen': (screen, 'screen.interaction_yes: 1', 'screen.interaction_yes: None')
    }
    
    for _, (condition, to_replace, replacement) in conditions.items():
        if not condition:
            final_response = final_response.replace(to_replace, replacement)
    
    return final_response 

# Function to save data to a CSV file using pandas
def save_to_csv(observations: List[Dict], output_dir: str = "outputs") -> str:
    os.makedirs(output_dir, exist_ok=True)
    
    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(observations)
    
    # Specify the CSV file path
    csv_file = os.path.join(output_dir, "video_observations.csv")
    
    # Save the DataFrame to a CSV file
    df.to_csv(csv_file, index=False)
    
    return csv_file
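
# Illustrative CSV layout (columns depend on which annotations parse out of the
# response; "clip_01.mp4" is a hypothetical file name):
# video_name,indoors,standing,hands.free,screen.interaction_yes
# clip_01.mp4,1,0,1,None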

# Function to process a single video and return the observation data
def process_single_video(video_path, standing, hands, location, screen) -> Dict:
    video_name = os.path.basename(video_path)  # Extract video name from the path
    query = "Describe this video in detail and answer the questions"
    additional_info = []
    if standing:
        additional_info.append("Is the subject in the video standing or sitting?\n")
    if hands:
        additional_info.append("Is the subject holding any object in their hands?\n")
    if location:
        additional_info.append("Is the subject present indoors?\n")
    if screen:
        additional_info.append("Is the subject interacting with a screen in the background by facing the screen?\n")
    
    end_query = """Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present. Follow the below examples:
        <annotation>indoors: 0</annotation>
        <annotation>standing: 1</annotation>
        <annotation>hands.free: 0</annotation>
        <annotation>screen.interaction_yes: 0</annotation>
        """
    
    final_query = query + " " + " ".join(additional_info)
    final_prompt = final_query + " " + end_query
    
    # Assuming your describe_video function handles the video processing
    response = describe_video(video_path, final_prompt)
    final_response = f"<video_name>{video_name}</video_name>" + " \n" + response

    conditions = {
        'standing': (standing, 'standing: 1', 'standing: None'),
        'hands': (hands, 'hands.free: 1', 'hands.free: None'),
        'location': (location, 'indoors: 1', 'indoors: None'),
        'screen': (screen, 'screen.interaction_yes: 1', 'screen.interaction_yes: None')
    }
    
    for _, (condition, to_replace, replacement) in conditions.items():
        if not condition:
            final_response = final_response.replace(to_replace, replacement)

    # Parse the response to extract video name and annotations
    parsed_content = parse_string(final_response, ["video_name", "annotation"])
    video_name = parsed_content['video_name'][0] if parsed_content['video_name'] else None
    annotations_dict = parse_annotations(parsed_content['annotation']) if parsed_content['annotation'] else {}

    # Return the observation as a dictionary
    return {'video_name': video_name, **annotations_dict}

# Function to process all videos in a folder
def process_multiple_videos(video_files: List[str], standing, hands, location, screen):
    all_observations = []

    for video_path in video_files:
        observation = process_single_video(video_path, standing, hands, location, screen)
        if observation['video_name']:  # Only add valid observations
            all_observations.append(observation)
        else:
            print("Error processing video:", video_path)  # Log any errors

        # Free GPU memory between videos so long batches do not accumulate cache
        torch.cuda.empty_cache()
        gc.collect()

    # Save all observations to a CSV file and return the file path
    csv_file = save_to_csv(all_observations)
    return "Processing completed. Download the CSV file.", csv_file

# Gradio interface
def gradio_interface_single(video, standing, hands, location, screen):
    return process_video_and_questions(video, standing, hands, location, screen)

def gradio_interface_multiple(video_files, standing, hands, location, screen):
    video_file_paths = [video.name for video in video_files]  # Extract file paths from uploaded files
    return process_multiple_videos(video_file_paths, standing, hands, location, screen)
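
# The single-video "Generate CSV" button feeds the raw textbox string into the
# CSV path, while save_to_csv expects a list of observation dicts. This minimal
# helper bridges the two, reusing the same parsing step as process_single_video.
def save_response_to_csv(response_text: str) -> str:
    parsed_content = parse_string(response_text, ["video_name", "annotation"])
    video_name = parsed_content['video_name'][0] if parsed_content['video_name'] else None
    annotations_dict = parse_annotations(parsed_content['annotation']) if parsed_content['annotation'] else {}
    return save_to_csv([{'video_name': video_name, **annotations_dict}])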

with gr.Blocks() as demo:
    with gr.Tab("Single Video Processing"):
        with gr.Row():
            with gr.Column():
                video = gr.Video(label="Video")
                standing = gr.Checkbox(label="Standing")
                hands = gr.Checkbox(label="Hands Free")
                location = gr.Checkbox(label="Indoors")
                screen = gr.Checkbox(label="Screen Interaction")
                submit_btn = gr.Button("Generate Annotations")
                generate_csv_btn = gr.Button("Generate CSV")
            
            with gr.Column():
                response = gr.Textbox(label="Video Description", show_label=True, show_copy_button=True)
                csv_output = gr.File(label="Download CSV", interactive=False)
        
        submit_btn.click(
            fn=gradio_interface_single,
            inputs=[video, standing, hands, location, screen],
            outputs=response
        )
        
        generate_csv_btn.click(
            fn=save_response_to_csv,
            inputs=response,
            outputs=csv_output
        )
    
    with gr.Tab("Batch Video Processing"):
        with gr.Row():
            video_files = gr.File(file_count="multiple", file_types=["video"], label="Upload multiple videos")
            standing = gr.Checkbox(label="Standing")
            hands = gr.Checkbox(label="Hands Free")
            location = gr.Checkbox(label="Indoors")
            screen = gr.Checkbox(label="Screen Interaction")
            submit_btn = gr.Button("Process Videos")
            status = gr.Textbox(label="Status", show_label=True)
            download_link = gr.File(label="Download CSV")
        
        submit_btn.click(
            fn=gradio_interface_multiple,
            inputs=[video_files, standing, hands, location, screen],
            outputs=[status, download_link]
        )

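# debug=True would stream server logs to the console, and share=True (both
# standard gr.Blocks.launch flags) would expose a temporary public URL.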
demo.launch(debug=False)