Spaces:

ManishThota
/

GSoC-Super-Rapid-Annotator

Runtime error

App Files Files Community

GSoC-Super-Rapid-Annotator / both_app.py

ManishThota

Update both_app.py

ae2b237 verified 4 months ago

raw

history blame

7.99 kB

	import warnings
	warnings.filterwarnings("ignore")
	import gradio as gr
	import pandas as pd
	import torch
	import gc
	import os
	from typing import Dict, List
	from src.video_model import describe_video
	from src.utils import parse_string, parse_annotations

	# --- Function to process single video ---
	def process_video_and_questions(video, standing, hands, location, screen):
	    """Ask the video model the selected questions and return its tagged reply.

	    A prompt is assembled from the questions whose flag is truthy and passed,
	    together with ``video``, to ``describe_video``.  The reply is prefixed with
	    a ``<video_name>`` tag holding the base name of ``video``.  For each flag
	    that is falsy, a positive annotation for that attribute (for example
	    ``standing: 1``) found in the text is rewritten to ``None``; any other
	    value is left untouched.

	    Args:
	        video: Path of the video file.
	        standing: Include the standing/sitting question.
	        hands: Include the holding-an-object question.
	        location: Include the indoors question.
	        screen: Include the screen-interaction question.

	    Returns:
	        The ``<video_name>`` header followed by the text returned by
	        ``describe_video`` (after the replacements described above).
	    """
	    clip_name = os.path.basename(video)

	    # One row per optional attribute, in prompt order:
	    # (flag, question, positive annotation, annotation used when not asked).
	    attributes = (
	        (standing,
	         "Is the subject in the video standing or sitting?\n",
	         'standing: 1', 'standing: None'),
	        (hands,
	         "Is the subject holding any object in their hands?\n",
	         'hands.free: 1', 'hands.free: None'),
	        (location,
	         "Is the subject present indoors?\n",
	         'indoors: 1', 'indoors: None'),
	        (screen,
	         "Is the subject interacting with a screen in the background by facing the screen?\n",
	         'screen.interaction_yes: 1', 'screen.interaction_yes: None'),
	    )
	    selected_questions = [question for flag, question, _, _ in attributes if flag]

	    answer_format = """Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present. Follow the below examples\n:
	<annotation>indoors: 0</annotation>
	<annotation>standing: 1</annotation>
	<annotation>hands.free: 0</annotation>
	<annotation>screen.interaction_yes: 0</annotation>
	"""

	    intro = "Answer the questions from the video\n"
	    prompt = intro + " " + " ".join(selected_questions) + " " + answer_format

	    model_reply = describe_video(video, prompt)
	    tagged_reply = f"<video_name>{clip_name}</video_name>" + " \n" + model_reply

	    # Attributes that were not asked about must not be reported as True.
	    for flag, _, positive, not_asked in attributes:
	        if flag:
	            continue
	        tagged_reply = tagged_reply.replace(positive, not_asked)

	    return tagged_reply

	# Function to save data to a CSV file using pandas
	def save_to_csv(observations: List[Dict], output_dir: str = "outputs") -> str:
	    """Write the observations to ``video_observations.csv`` and return its path.

	    Args:
	        observations: One dictionary per video; each becomes one CSV row.
	        output_dir: Directory that receives the CSV.  It is created, together
	            with any missing parent directories, when it does not exist.

	    Returns:
	        Path of the written CSV file (``<output_dir>/video_observations.csv``).
	    """
	    # exist_ok=True avoids the check-then-create race of testing
	    # os.path.exists() first when two requests arrive at the same time.
	    os.makedirs(output_dir, exist_ok=True)

	    csv_file = os.path.join(output_dir, "video_observations.csv")

	    # The DataFrame index carries no information here, so it is left out.
	    pd.DataFrame(observations).to_csv(csv_file, index=False)

	    return csv_file

	# Function to process a single video and return the observation data
	def process_single_video(video_path, standing, hands, location, screen) -> Dict:
	    """Annotate one video and return the result as a flat dictionary.

	    A prompt is assembled from the questions whose flag is truthy and passed,
	    together with ``video_path``, to ``describe_video``.  The reply is prefixed
	    with a ``<video_name>`` tag, positive annotations for attributes that were
	    not asked about are rewritten to ``None``, and the text is then parsed
	    with ``parse_string`` / ``parse_annotations``.

	    Args:
	        video_path: Path of the video file.
	        standing: Include the standing/sitting question.
	        hands: Include the holding-an-object question.
	        location: Include the indoors question.
	        screen: Include the screen-interaction question.

	    Returns:
	        A dictionary with a ``'video_name'`` entry (``None`` when no name was
	        parsed) plus whatever entries ``parse_annotations`` produced.
	    """
	    clip_name = os.path.basename(video_path)  # file name without directories

	    # One row per optional attribute, in prompt order:
	    # (flag, question, positive annotation, annotation used when not asked).
	    attributes = (
	        (standing,
	         "Is the subject in the video standing or sitting?\n",
	         'standing: 1', 'standing: None'),
	        (hands,
	         "Is the subject holding any object in their hands?\n",
	         'hands.free: 1', 'hands.free: None'),
	        (location,
	         "Is the subject present indoors?\n",
	         'indoors: 1', 'indoors: None'),
	        (screen,
	         "Is the subject interacting with a screen in the background by facing the screen?\n",
	         'screen.interaction_yes: 1', 'screen.interaction_yes: None'),
	    )
	    selected_questions = [question for flag, question, _, _ in attributes if flag]

	    answer_format = """Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present. Follow the below examples:
	<annotation>indoors: 0</annotation>
	<annotation>standing: 1</annotation>
	<annotation>hands.free: 0</annotation>
	<annotation>screen.interaction_yes: 0</annotation>
	"""

	    intro = "Describe this video in detail and answer the questions"
	    prompt = intro + " " + " ".join(selected_questions) + " " + answer_format

	    model_reply = describe_video(video_path, prompt)
	    tagged_reply = f"<video_name>{clip_name}</video_name>" + " \n" + model_reply

	    # Attributes that were not asked about must not be reported as True.
	    for flag, _, positive, not_asked in attributes:
	        if flag:
	            continue
	        tagged_reply = tagged_reply.replace(positive, not_asked)

	    # Pull the tagged sections back out of the text.
	    parsed = parse_string(tagged_reply, ["video_name", "annotation"])

	    names = parsed['video_name']
	    parsed_name = names[0] if names else None

	    raw_annotations = parsed['annotation']
	    if raw_annotations:
	        annotations = parse_annotations(raw_annotations)
	    else:
	        annotations = {}

	    return {'video_name': parsed_name, **annotations}

	# Function to process a list of video files and save the results to a CSV
	def process_multiple_videos(video_files: List[str], standing, hands, location, screen):
	    """Annotate every video in ``video_files`` and write the results to one CSV.

	    Each path is handed to ``process_single_video`` with the same question
	    flags.  Observations without a parsed video name are reported on stdout
	    and left out of the CSV.

	    Args:
	        video_files: Paths of the videos to annotate.
	        standing: Include the standing/sitting question.
	        hands: Include the holding-an-object question.
	        location: Include the indoors question.
	        screen: Include the screen-interaction question.

	    Returns:
	        A ``(status message, csv path)`` tuple; the path is the one returned by
	        ``save_to_csv``.
	    """
	    all_observations = []

	    for video_path in video_files:
	        observation = process_single_video(video_path, standing, hands, location, screen)
	        if observation['video_name']:  # Only keep observations with a parsed name
	            all_observations.append(observation)
	        else:
	            print("Error processing video:", video_path)

	        # Free memory between videos.  Run the garbage collector first so that
	        # objects only kept alive by reference cycles are dropped, and only then
	        # ask PyTorch to release its now-unused cached GPU memory.
	        gc.collect()
	        torch.cuda.empty_cache()

	    # Save all observations to a CSV file and return the file path
	    csv_file = save_to_csv(all_observations)
	    return "Processing completed. Download the CSV file.", csv_file

	# Gradio interface
	def gradio_interface_single(video, standing, hands, location, screen):
	    """Gradio callback for the single-video tab.

	    Args:
	        video: Path of the uploaded video, or ``None`` when nothing was uploaded.
	        standing: Include the standing/sitting question.
	        hands: Include the holding-an-object question.
	        location: Include the indoors question.
	        screen: Include the screen-interaction question.

	    Returns:
	        The text produced by ``process_video_and_questions``, or a short notice
	        when no video was provided.
	    """
	    # Without this guard a click with no upload fails inside os.path.basename(None).
	    if not video:
	        return "No video was provided. Please upload a video first."
	    return process_video_and_questions(video, standing, hands, location, screen)

	def gradio_interface_multiple(video_files, standing, hands, location, screen):
	    """Gradio callback for the batch tab.

	    Args:
	        video_files: Uploaded files as delivered by the file component, or
	            ``None`` when nothing was uploaded.  Entries may be objects with a
	            ``.name`` attribute holding the path, or plain path strings.
	        standing: Include the standing/sitting question.
	        hands: Include the holding-an-object question.
	        location: Include the indoors question.
	        screen: Include the screen-interaction question.

	    Returns:
	        A ``(status message, csv path)`` tuple.  The path is ``None`` when no
	        videos were uploaded.
	    """
	    # A click with no upload delivers None, which cannot be iterated.
	    if not video_files:
	        return "No videos were uploaded.", None

	    # Use the ``.name`` path when present; otherwise the entry is already a path.
	    video_file_paths = [getattr(video, "name", video) for video in video_files]
	    return process_multiple_videos(video_file_paths, standing, hands, location, screen)

	def _save_response_to_csv(response_text):
	    """Turn the text shown in the response box into a one-row CSV.

	    The "Generate CSV" button delivers the textbox content as a string, while
	    ``save_to_csv`` expects a list of dictionaries, so the text is parsed here
	    the same way ``process_single_video`` parses a model reply.

	    Returns:
	        The CSV path returned by ``save_to_csv``, or ``None`` when the text is
	        empty or contains neither a video name nor annotations.
	    """
	    if not response_text or not response_text.strip():
	        return None

	    parsed = parse_string(response_text, ["video_name", "annotation"])
	    video_name = parsed['video_name'][0] if parsed['video_name'] else None
	    annotations = parse_annotations(parsed['annotation']) if parsed['annotation'] else {}

	    if video_name is None and not annotations:
	        return None  # nothing worth writing
	    return save_to_csv([{'video_name': video_name, **annotations}])


	# NOTE(review): the layout nesting below is reconstructed; confirm it matches
	# the intended arrangement of rows and columns.
	with gr.Blocks() as demo:
	    with gr.Tab("Single Video Processing"):
	        with gr.Row():
	            with gr.Column():
	                video = gr.Video(label="Video")
	                standing = gr.Checkbox(label="Standing")
	                hands = gr.Checkbox(label="Hands Free")
	                location = gr.Checkbox(label="Indoors")
	                screen = gr.Checkbox(label="Screen Interaction")
	                submit_btn = gr.Button("Generate Annotations")
	                generate_csv_btn = gr.Button("Generate CSV")

	            with gr.Column():
	                response = gr.Textbox(label="Video Description", show_label=True, show_copy_button=True)
	                csv_output = gr.File(label="Download CSV", interactive=False)

	        submit_btn.click(
	            fn=gradio_interface_single,
	            inputs=[video, standing, hands, location, screen],
	            outputs=response
	        )

	        # The textbox yields a string, so it is parsed before being saved;
	        # passing it straight to save_to_csv fails in the DataFrame constructor.
	        generate_csv_btn.click(
	            fn=_save_response_to_csv,
	            inputs=response,
	            outputs=csv_output
	        )

	    with gr.Tab("Batch Video Processing"):
	        with gr.Row():
	            video_files = gr.File(file_count="multiple", file_types=["video"], label="Upload multiple videos")
	            # Distinct names keep the single-tab components from being rebound.
	            batch_standing = gr.Checkbox(label="Standing")
	            batch_hands = gr.Checkbox(label="Hands Free")
	            batch_location = gr.Checkbox(label="Indoors")
	            batch_screen = gr.Checkbox(label="Screen Interaction")
	            batch_submit_btn = gr.Button("Process Videos")
	            # Status belongs on this tab rather than in the single-video textbox.
	            batch_status = gr.Textbox(label="Status", interactive=False)
	            download_link = gr.File(label="Download CSV")

	        batch_submit_btn.click(
	            fn=gradio_interface_multiple,
	            inputs=[video_files, batch_standing, batch_hands, batch_location, batch_screen],
	            outputs=[batch_status, download_link]
	        )

	demo.launch(debug=False)