Spaces: Running on Zero
Commit · 850b849
1 Parent(s): 5c45973
Update app
Files changed:
- app.py (+7 -1)
- preprocess/inference_preprocess.py (+1 -1)
app.py
CHANGED
@@ -1028,18 +1028,21 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
    orig_frames, status = load_video_frames(test_video_25fps)
    if status != "success":
        return None, status
+   print("Successfully loaded the frames")

    # Extract and save the audio file
    orig_wav_file, status = extract_audio(video_path, result_folder)
    if status != "success":
        return None, status
+   print("Successfully loaded the spectrograms")

    # Pre-process and extract per-speaker tracks in each scene
    print("Pre-processing the input video...")
    status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
    if status != 0:
        return None, "Error in pre-processing the input video, please check the input video and try again..."
-
+   print("Successfully preprocessed the video")
+
    # Load the tracks file saved during pre-processing
    with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
        tracks = pickle.load(file)
@@ -1053,6 +1056,7 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
        track_dict[scene_num][i] = {}
        for frame_num, bbox in zip(tracks[scene_num][i]['track']['frame'], tracks[scene_num][i]['track']['bbox']):
            track_dict[scene_num][i][frame_num] = bbox
+   print("Successfully loaded the extracted person-tracks")

    # Get the total number of scenes
    test_scenes = os.listdir("{}/crops".format(result_folder_input))
@@ -1061,6 +1065,7 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
    # Load the trained model
    model = Transformer_RGB()
    model = load_checkpoint(CHECKPOINT_PATH, model)
+   print("Successfully loaded the model")

    # Compute the active speaker in each scene
    output_tracks = {}
@@ -1149,6 +1154,7 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
            output_tracks[frame] = track_dict[scene_num][label][frame]

    # Save the output video
+   print("Generating active-speaker detection output video...")
    video_output, status = save_video(output_tracks, orig_frames.copy(), orig_wav_file, result_folder_output)
    if status != "success":
        return None, status
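Why these prints help: every stage of process_video_activespeaker follows the same contract, returning a (result, status) pair where status is the string "success" on the happy path, except for the pre-processing step, which is an external script invoked via subprocess.call and signals failure through a non-zero exit code. The added prints mark each stage boundary, so a stalled or failed run can be localized from the Space logs. Below is a minimal sketch of that contract; run_pipeline and the stub loader are illustrative names, not part of app.py.

import subprocess

def load_video_frames(path):
    # Illustrative stub following app.py's convention: return (result, status),
    # with status == "success" on the happy path.
    return [f"frame-{i}" for i in range(3)], "success"

def run_pipeline(video_path, script="preprocess/inference_preprocess.py"):
    frames, status = load_video_frames(video_path)
    if status != "success":
        return None, status
    print("Successfully loaded the frames")

    # subprocess.call returns the child's exit code; 0 means success.
    # An argument list (instead of a shell=True command string) sidesteps
    # quoting problems when video_path contains spaces.
    code = subprocess.call(["python", script, f"--data_root={video_path}"])
    if code != 0:
        return None, "Error in pre-processing the input video"
    print("Successfully preprocessed the video")
    return frames, "success"

The list-argument form is only a sketch of an alternative; the committed code keeps the shell=True format string, which works as long as the generated paths contain no spaces or shell metacharacters.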
preprocess/inference_preprocess.py
CHANGED
@@ -197,7 +197,7 @@ def inference_video(opt, padding=0):
        fidx += 1
        yield

-   for _ in tqdm(
+   for _ in tqdm(generate_detections()):
        pass


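The fixed line relies on a common idiom: tqdm wraps any iterable and advances its progress display once per item, and a generator that ends each unit of work with a bare yield produces exactly one item per unit, so draining it with a throwaway for loop runs the work while rendering progress. A self-contained sketch of the idiom follows; process_frames is an illustrative stand-in for generate_detections.

import time
from tqdm import tqdm

def process_frames(n_frames):
    # Stand-in for generate_detections(): do per-frame work, then yield
    # once so the consumer can count progress.
    for _ in range(n_frames):
        time.sleep(0.01)  # placeholder for real per-frame work
        yield

# The loop body is empty on purpose: iterating is what drives the
# generator, and tqdm ticks once per yielded frame.
for _ in tqdm(process_frames(100)):
    pass

Since tqdm cannot know a generator's length in advance, the bar shows a running count and rate rather than a percentage unless a total= argument is supplied.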