Spaces: Running on Zero
Commit · 850b849
1 Parent(s): 5c45973
Update app
Files changed:
- app.py (+7 -1)
- preprocess/inference_preprocess.py (+1 -1)
app.py
CHANGED
@@ -1028,18 +1028,21 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
    orig_frames, status = load_video_frames(test_video_25fps)
    if status != "success":
        return None, status
+   print("Successfully loaded the frames")

    # Extract and save the audio file
    orig_wav_file, status = extract_audio(video_path, result_folder)
    if status != "success":
        return None, status
+   print("Successfully loaded the spectrograms")

    # Pre-process and extract per-speaker tracks in each scene
    print("Pre-processing the input video...")
    status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
    if status != 0:
        return None, "Error in pre-processing the input video, please check the input video and try again..."
-
+   print("Successfully preprocessed the video")
+
    # Load the tracks file saved during pre-processing
    with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
        tracks = pickle.load(file)
@@ -1053,6 +1056,7 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
        track_dict[scene_num][i] = {}
        for frame_num, bbox in zip(tracks[scene_num][i]['track']['frame'], tracks[scene_num][i]['track']['bbox']):
            track_dict[scene_num][i][frame_num] = bbox
+   print("Successfully loaded the extracted person-tracks")

    # Get the total number of scenes
    test_scenes = os.listdir("{}/crops".format(result_folder_input))
@@ -1061,6 +1065,7 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
    # Load the trained model
    model = Transformer_RGB()
    model = load_checkpoint(CHECKPOINT_PATH, model)
+   print("Successfully loaded the model")

    # Compute the active speaker in each scene
    output_tracks = {}
@@ -1149,6 +1154,7 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
            output_tracks[frame] = track_dict[scene_num][label][frame]

    # Save the output video
+   print("Generating active-speaker detection output video...")
    video_output, status = save_video(output_tracks, orig_frames.copy(), orig_wav_file, result_folder_output)
    if status != "success":
        return None, status
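Why these prints help: every stage of process_video_activespeaker follows the same contract, returning a (result, status) pair where status is the string "success" on the happy path, except for the pre-processing step, which is an external script invoked via subprocess.call and signals failure through a non-zero exit code. The added prints mark each stage boundary, so a stalled or failed run can be localized from the Space logs. Below is a minimal sketch of that contract; run_pipeline and the stub loader are illustrative names, not part of app.py.

import subprocess

def load_video_frames(path):
    # Illustrative stub following app.py's convention: return (result, status),
    # with status == "success" on the happy path.
    return [f"frame-{i}" for i in range(3)], "success"

def run_pipeline(video_path, script="preprocess/inference_preprocess.py"):
    frames, status = load_video_frames(video_path)
    if status != "success":
        return None, status
    print("Successfully loaded the frames")

    # subprocess.call returns the child's exit code; 0 means success.
    # An argument list (instead of a shell=True command string) sidesteps
    # quoting problems when video_path contains spaces.
    code = subprocess.call(["python", script, f"--data_root={video_path}"])
    if code != 0:
        return None, "Error in pre-processing the input video"
    print("Successfully preprocessed the video")
    return frames, "success"

The list-argument form is only a sketch of an alternative; the committed code keeps the shell=True format string, which works as long as the generated paths contain no spaces or shell metacharacters.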
preprocess/inference_preprocess.py
CHANGED
@@ -197,7 +197,7 @@ def inference_video(opt, padding=0):
        fidx += 1
        yield

-   for _ in tqdm(
+   for _ in tqdm(generate_detections()):
        pass


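The fixed line relies on a common idiom: tqdm wraps any iterable and advances its progress display once per item, and a generator that ends each unit of work with a bare yield produces exactly one item per unit, so draining it with a throwaway for loop runs the work while rendering progress. A self-contained sketch of the idiom follows; process_frames is an illustrative stand-in for generate_detections.

import time
from tqdm import tqdm

def process_frames(n_frames):
    # Stand-in for generate_detections(): do per-frame work, then yield
    # once so the consumer can count progress.
    for _ in range(n_frames):
        time.sleep(0.01)  # placeholder for real per-frame work
        yield

# The loop body is empty on purpose: iterating is what drives the
# generator, and tqdm ticks once per yielded frame.
for _ in tqdm(process_frames(100)):
    pass

Since tqdm cannot know a generator's length in advance, the bar shows a running count and rate rather than a percentage unless a total= argument is supplied.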