Spaces:

H-Liu1997
/

TANGO

Build error

App Files Files Community

H-Liu1997 committed on Oct 13, 2024

Commit

1ea6e65

1 Parent(s): f4c7aff

fixbugs

Browse files

Files changed (4) hide show

SMPLer-X/app.py +2 -1
app.py +75 -46
create_graph.py +47 -6
requirements.txt +3 -3

SMPLer-X/app.py CHANGED Viewed

@@ -17,7 +17,8 @@ try:
 except:
     os.system('pip install ./main/transformer_utils')
 # hf_hub_download(repo_id="caizhongang/SMPLer-X", filename="smpler_x_h32.pth.tar", local_dir="/home/user/app/pretrained_models")
-os.system('cp -rf ./assets/conversions.py /content/myenv/lib/python3.10/site-packages/torchgeometry/core/conversions.py')
 def extract_frame_number(file_name):
     match = re.search(r'(\d{5})', file_name)

 except:
     os.system('pip install ./main/transformer_utils')
 # hf_hub_download(repo_id="caizhongang/SMPLer-X", filename="smpler_x_h32.pth.tar", local_dir="/home/user/app/pretrained_models")
+# /home/user/.pyenv/versions/3.9.19/lib/python3.9/site-packages/torchgeometry/core/conversions.py
+# os.system('cp -rf ./assets/conversions.py /content/myenv/lib/python3.10/site-packages/torchgeometry/core/conversions.py')
 def extract_frame_number(file_name):
     match = re.search(r'(\d{5})', file_name)

app.py CHANGED Viewed

@@ -21,6 +21,7 @@ from datetime import datetime
 from decord import VideoReader
 from PIL import Image
 import copy
 import importlib
 import torch
@@ -178,6 +179,7 @@ def search_path_dp(graph, audio_low_np, audio_high_np, loop_penalty=0.1, top_k=1
 def test_fn(model, device, iteration, candidate_json_path, test_path, cfg, audio_path, **kwargs):
     torch.set_grad_enabled(False)
     pool_path = candidate_json_path.replace("data_json", "cached_graph").replace(".json", ".pkl")
     graph = igraph.Graph.Read_Pickle(fname=pool_path)
@@ -347,25 +349,25 @@ def test_fn(model, device, iteration, candidate_json_path, test_path, cfg, audio
     res_motion = []
     counter = 0
     for path, is_continue in zip(path_list, is_continue_list):
-        # print(path)
-        # res_motion_current = path_visualization(
-        #   graph, path, is_continue, os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), audio_path=audio_path, return_motion=True, verbose_continue=True
-        # )
-        res_motion_current = path_visualization_v2(
-          graph, path, is_continue, os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), audio_path=audio_path, return_motion=True, verbose_continue=True
-        )
-        video_temp_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
-        video_reader = VideoReader(video_temp_path)
-        video_np = []
-        for i in range(len(video_reader)):
-            if i == 0: continue
-            video_frame = video_reader[i].asnumpy()
-            video_np.append(Image.fromarray(video_frame))
-        adjusted_video_pil = adjust_statistics_to_match_reference([video_np])
-        save_videos_from_pil(adjusted_video_pil[0], os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), fps=30, bitrate=2000000)
         audio_temp_path = audio_path
         lipsync_output_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
@@ -377,6 +379,17 @@ def test_fn(model, device, iteration, candidate_json_path, test_path, cfg, audio
         start_node = path[1].index
         end_node = start_node + 100
     print(f"delete gt-nodes {start_node}, {end_node}")
     nodes_to_delete = list(range(start_node, end_node))
     graph.delete_vertices(nodes_to_delete)
@@ -385,9 +398,9 @@ def test_fn(model, device, iteration, candidate_json_path, test_path, cfg, audio
     res_motion = []
     counter = 1
     for path, is_continue in zip(path_list, is_continue_list):
-        res_motion_current = path_visualization(
-          graph, path, is_continue, os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), audio_path=audio_path, return_motion=True, verbose_continue=True
-        )
         video_temp_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
         video_reader = VideoReader(video_temp_path)
@@ -397,7 +410,7 @@ def test_fn(model, device, iteration, candidate_json_path, test_path, cfg, audio
             video_frame = video_reader[i].asnumpy()
             video_np.append(Image.fromarray(video_frame))
         adjusted_video_pil = adjust_statistics_to_match_reference([video_np])
-        save_videos_from_pil(adjusted_video_pil[0], os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), fps=30, bitrate=2000000)
         audio_temp_path = audio_path
@@ -446,28 +459,41 @@ def prepare_all(yaml_name):
     return config
-def save_first_10_seconds(video_path, output_path="./save_video.mp4"):
-    import cv2
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
         return
     fps = int(cap.get(cv2.CAP_PROP_FPS))
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-    frames_to_save = fps * 10
     frame_count = 0
     while cap.isOpened() and frame_count < frames_to_save:
         ret, frame = cap.read()
         if not ret:
             break
-        out.write(frame)
         frame_count += 1
     cap.release()
@@ -515,9 +541,13 @@ def tango(audio_path, character_name, seed, create_graph=False, video_folder_pat
         data_save_path = "./outputs/tmpdata/"
         json_save_path = "./outputs/save_video.json"
         graph_save_path = "./outputs/save_video.pkl"
-        os.system(f"cd ./SMPLer-X/ && python app.py --video_folder_path {video_folder_path} --data_save_path {data_save_path} --json_save_path {json_save_path} && cd ..")
         os.system(f"python ./create_graph.py --json_save_path {json_save_path} --graph_save_path {graph_save_path}")
         cfg.data.test_meta_paths = json_save_path
     smplx_model = smplx.create(
         "./emage/smplx_models/",
@@ -551,7 +581,7 @@ def tango(audio_path, character_name, seed, create_graph=False, video_folder_pat
     test_path = os.path.join(experiment_ckpt_dir, f"test_{0}")
     os.makedirs(test_path, exist_ok=True)
-    result = test_fn(model, device, 0, cfg.data.test_meta_paths, test_path, cfg, audio_path)
     gc.collect()
     torch.cuda.empty_cache()
     return result
@@ -571,13 +601,11 @@ examples_video = [
 ]
 combined_examples = [
-    ["./datasets/cached_audio/example_male_voice_9_seconds.wav", "./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4", 2024],
-    ["./datasets/cached_audio/example_male_voice_9_seconds.wav", "./datasets/cached_audio/speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4", 2024],
     ["./datasets/cached_audio/example_male_voice_9_seconds.wav", "./datasets/cached_audio/101099-00_18_09-00_18_19.mp4", 2024],
-    ["./datasets/cached_audio/example_female_voice_9_seconds.wav", "./datasets/cached_audio/1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4", 2024],
-    ["./datasets/cached_audio/example_female_voice_9_seconds.wav", "./datasets/cached_audio/speaker8_jjRWaMCWs44_00-00-30.16_00-00-33.32.mp4", 2024],
 ]
 def make_demo():
     with gr.Blocks(analytics_enabled=False) as Interface:
         gr.Markdown(
@@ -651,22 +679,24 @@ def make_demo():
                 file_output_1 = gr.File(label="Download 3D Motion and Visualize in Blender")
                 file_output_2 = gr.File(label="Download 3D Motion and Visualize in Blender")
                 gr.Markdown("""
-                <h4 style="text-align: left;">
                 Details of the low-quality mode:
                 <br>
-                1. Lower resolution.
                 <br>
-                2. More discontinuous graph nodes (causing noticeable "frame jumps").
                 <br>
-                3. Utilizes open-source tools like SMPLerX-s-model, Wav2Lip, and FiLM for faster processing.
                 <br>
-                4. only use first 8 seconds of your input audio.
                 <br>
-                5. custom character for a video up to 10 seconds.
                 <br>
                 <br>
                 Feel free to open an issue on GitHub or contact the authors if this does not meet your needs.
-                </h4>
                 """)
         with gr.Row():
@@ -720,7 +750,6 @@ def make_demo():
 if __name__ == "__main__":
     os.environ["MASTER_ADDR"]='127.0.0.1'
     os.environ["MASTER_PORT"]='8675'
-    # #os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
     demo = make_demo()
-    demo.launch(share=True)

 from decord import VideoReader
 from PIL import Image
 import copy
+import cv2
 import importlib
 import torch
 def test_fn(model, device, iteration, candidate_json_path, test_path, cfg, audio_path, **kwargs):
+    create_graph = kwargs["create_graph"]
     torch.set_grad_enabled(False)
     pool_path = candidate_json_path.replace("data_json", "cached_graph").replace(".json", ".pkl")
     graph = igraph.Graph.Read_Pickle(fname=pool_path)
     res_motion = []
     counter = 0
     for path, is_continue in zip(path_list, is_continue_list):
+        if create_graph:
+            # time is limited if we create graph on hugging face, lets skip blending.
+            res_motion_current = path_visualization(
+              graph, path, is_continue, os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), audio_path=audio_path, return_motion=True, verbose_continue=True
+            )
+            video_temp_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
+        else:
+            res_motion_current = path_visualization_v2(
+              graph, path, is_continue, os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), audio_path=None, return_motion=True, verbose_continue=True
+            )
+            video_temp_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
+            video_reader = VideoReader(video_temp_path)
+            video_np = []
+            for i in range(len(video_reader)):
+                if i == 0: continue
+                video_frame = video_reader[i].asnumpy()
+                video_np.append(Image.fromarray(video_frame))
+            adjusted_video_pil = adjust_statistics_to_match_reference([video_np])
+            save_videos_from_pil(adjusted_video_pil[0], os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), fps=graph.vs[0]['fps'], bitrate=2000000)
         audio_temp_path = audio_path
         lipsync_output_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
         start_node = path[1].index
         end_node = start_node + 100
+    if create_graph:
+        # time is limited if create graph, let us skip the second video
+        result = [
+        os.path.join(save_dir, f"audio_{idx}_retri_0.mp4"),
+        os.path.join(save_dir, f"audio_{idx}_retri_0.mp4"),
+        os.path.join(save_dir, f"audio_{idx}_retri_0.npz"),
+        os.path.join(save_dir, f"audio_{idx}_retri_0.npz")
+        ]
+        return result
     print(f"delete gt-nodes {start_node}, {end_node}")
     nodes_to_delete = list(range(start_node, end_node))
     graph.delete_vertices(nodes_to_delete)
     res_motion = []
     counter = 1
     for path, is_continue in zip(path_list, is_continue_list):
+        res_motion_current = path_visualization_v2(
+              graph, path, is_continue, os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), audio_path=None, return_motion=True, verbose_continue=True
+            )
         video_temp_path = os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4")
         video_reader = VideoReader(video_temp_path)
             video_frame = video_reader[i].asnumpy()
             video_np.append(Image.fromarray(video_frame))
         adjusted_video_pil = adjust_statistics_to_match_reference([video_np])
+        save_videos_from_pil(adjusted_video_pil[0], os.path.join(save_dir, f"audio_{idx}_retri_{counter}.mp4"), fps=graph.vs[0]['fps'], bitrate=2000000)
         audio_temp_path = audio_path
     return config
+def save_first_10_seconds(video_path, output_path="./save_video.mp4", max_length=512):
+    if os.path.exists(output_path):
+        os.remove(output_path)
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
         return
     fps = int(cap.get(cv2.CAP_PROP_FPS))
+    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    # Calculate the aspect ratio and resize dimensions
+    if original_width >= original_height:
+        new_width = max_length
+        new_height = int(original_height * (max_length / original_width))
+    else:
+        new_height = max_length
+        new_width = int(original_width * (max_length / original_height))
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    out = cv2.VideoWriter(output_path, fourcc, fps, (new_width, new_height))
+    frames_to_save = fps * 20
     frame_count = 0
     while cap.isOpened() and frame_count < frames_to_save:
         ret, frame = cap.read()
         if not ret:
             break
+        # Resize the frame while keeping the aspect ratio
+        resized_frame = cv2.resize(frame, (new_width, new_height))
+        # resized_frame = frame
+        out.write(resized_frame)
         frame_count += 1
     cap.release()
         data_save_path = "./outputs/tmpdata/"
         json_save_path = "./outputs/save_video.json"
         graph_save_path = "./outputs/save_video.pkl"
+        os.system(f"cd ./SMPLer-X/ && python app.py --video_folder_path .{video_folder_path} --data_save_path .{data_save_path} --json_save_path .{json_save_path} && cd ..")
+        print(f"cd ./SMPLer-X/ && python app.py --video_folder_path .{video_folder_path} --data_save_path .{data_save_path} --json_save_path .{json_save_path} && cd ..")
         os.system(f"python ./create_graph.py --json_save_path {json_save_path} --graph_save_path {graph_save_path}")
         cfg.data.test_meta_paths = json_save_path
+        gc.collect()
+        torch.cuda.empty_cache()
     smplx_model = smplx.create(
         "./emage/smplx_models/",
     test_path = os.path.join(experiment_ckpt_dir, f"test_{0}")
     os.makedirs(test_path, exist_ok=True)
+    result = test_fn(model, device, 0, cfg.data.test_meta_paths, test_path, cfg, audio_path, create_graph=create_graph)
     gc.collect()
     torch.cuda.empty_cache()
     return result
 ]
 combined_examples = [
     ["./datasets/cached_audio/example_male_voice_9_seconds.wav", "./datasets/cached_audio/101099-00_18_09-00_18_19.mp4", 2024],
+    ["./datasets/cached_audio/example_female_voice_9_seconds.wav", "./datasets/cached_audio/101099-00_18_09-00_18_19.mp4", 2024],
 ]
 def make_demo():
     with gr.Blocks(analytics_enabled=False) as Interface:
         gr.Markdown(
                 file_output_1 = gr.File(label="Download 3D Motion and Visualize in Blender")
                 file_output_2 = gr.File(label="Download 3D Motion and Visualize in Blender")
                 gr.Markdown("""
+                <div style="display: flex; justify-content: center; align-items: center; text-align: left;">
                 Details of the low-quality mode:
                 <br>
+                0. for free users, hugging face zero-gpu has quota, if you see "over quota", please try it later, e.g., after 30 mins. for saving your quota, this project is estimated to run around 120~160s. by the following trade-off.
+                <br>
+                1. lower resolution, video resized as long-side 512 and keep aspect ratio.
                 <br>
+                2. subgraph instead of full-graph, causing noticeable "frame jumps".
                 <br>
+                3. only use the first 8s of your input audio.
                 <br>
+                4. only use the first 20s of your input video for custom character. if you custom character, it will only generate one video result without "smoothing" for saving time.
                 <br>
+                5. use open-source tools like SMPLerX-s-model, Wav2Lip, and FiLM for faster processing.
                 <br>
                 <br>
                 Feel free to open an issue on GitHub or contact the authors if this does not meet your needs.
+                </div>
                 """)
         with gr.Row():
 if __name__ == "__main__":
     os.environ["MASTER_ADDR"]='127.0.0.1'
     os.environ["MASTER_PORT"]='8675'
     demo = make_demo()
+    demo.launch(share=True)

create_graph.py CHANGED Viewed

@@ -18,7 +18,7 @@ import librosa
 import igraph
 import json
 import utils.rotation_conversions as rc
-from moviepy.editor import VideoClip, AudioFileClip
 from tqdm import tqdm
 import imageio
 import tempfile
@@ -263,27 +263,68 @@ def random_walk(graph, walk_length, start_node=None):
         is_continue.append(is_cont)
     return walk, is_continue
 def path_visualization(graph, path, is_continue, save_path, verbose_continue=False, audio_path=None, return_motion=False):
     all_frames = [node['video'] for node in path]
     average_dis_continue = 1 - sum(is_continue) / len(is_continue)
     if verbose_continue:
         print("average_dis_continue:", average_dis_continue)
-    duration = len(all_frames) / graph.vs[0]['fps']
     def make_frame(t):
-        idx = min(int(t * graph.vs[0]['fps']), len(all_frames) - 1)
         return all_frames[idx]
     video_clip = VideoClip(make_frame, duration=duration)
     if audio_path is not None:
         audio_clip = AudioFileClip(audio_path)
-        video_clip = video_clip.set_audio(audio_clip)
-    video_clip.write_videofile(save_path, codec='libx264', fps=graph.vs[0]['fps'], audio_codec='aac')
     if return_motion:
         all_motion = [node['axis_angle'] for node in path]
         all_motion = np.stack(all_motion, 0)
         return all_motion
 def generate_transition_video(frame_start_path, frame_end_path, output_video_path):
     import subprocess
     import os

 import igraph
 import json
 import utils.rotation_conversions as rc
+from moviepy.editor import VideoClip, AudioFileClip, VideoFileClip
 from tqdm import tqdm
 import imageio
 import tempfile
         is_continue.append(is_cont)
     return walk, is_continue
+import subprocess
 def path_visualization(graph, path, is_continue, save_path, verbose_continue=False, audio_path=None, return_motion=False):
     all_frames = [node['video'] for node in path]
     average_dis_continue = 1 - sum(is_continue) / len(is_continue)
     if verbose_continue:
         print("average_dis_continue:", average_dis_continue)
+    fps = graph.vs[0]['fps']
+    duration = len(all_frames) / fps
     def make_frame(t):
+        idx = min(int(t * fps), len(all_frames) - 1)
         return all_frames[idx]
+    video_only_path = 'video_only.mp4'  # Temporary file
     video_clip = VideoClip(make_frame, duration=duration)
+    video_clip.write_videofile(
+        video_only_path,
+        codec='libx264',
+        fps=fps,
+        audio=False
+    )
+    # Optionally, ensure audio and video durations match
     if audio_path is not None:
         audio_clip = AudioFileClip(audio_path)
+        video_duration = video_clip.duration
+        audio_duration = audio_clip.duration
+        if audio_duration > video_duration:
+            # Trim the audio
+            trimmed_audio_path = 'trimmed_audio.aac'
+            audio_clip = audio_clip.subclip(0, video_duration)
+            audio_clip.write_audiofile(trimmed_audio_path)
+            audio_input = trimmed_audio_path
+        else:
+            audio_input = audio_path
+        # Use FFmpeg to combine video and audio
+        ffmpeg_command = [
+            'ffmpeg', '-y',
+            '-i', video_only_path,
+            '-i', audio_input,
+            '-c:v', 'copy',
+            '-c:a', 'aac',
+            '-strict', 'experimental',
+            save_path
+        ]
+        subprocess.check_call(ffmpeg_command)
+        # Clean up temporary files if necessary
+        os.remove(video_only_path)
+        if audio_input != audio_path:
+            os.remove(audio_input)
     if return_motion:
         all_motion = [node['axis_angle'] for node in path]
         all_motion = np.stack(all_motion, 0)
         return all_motion
 def generate_transition_video(frame_start_path, frame_end_path, output_video_path):
     import subprocess
     import os

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
 --extra-index-url https://download.openmmlab.com/mmcv/dist/cu118/torch2.1.0/index.html
 torch==2.1.0
 scikit-image==0.21.0
 scikit-learn==1.3.2
 scipy==1.11.4
@@ -14,7 +14,7 @@ opencv-python==4.8.1.78
 tensorboardx
 filterpy
 cython
-chumpy
 Pillow==9.5.0
 trimesh
 pyrender
@@ -32,7 +32,7 @@ timm
 pyglet
 mmcv==2.1.0
 mmdet==3.2.0
-mmpose
 eval_type_backport
 wget

 --extra-index-url https://download.openmmlab.com/mmcv/dist/cu118/torch2.1.0/index.html
 torch==2.1.0
+numpy==1.23.5
 scikit-image==0.21.0
 scikit-learn==1.3.2
 scipy==1.11.4
 tensorboardx
 filterpy
 cython
+chumpy==0.70.0
 Pillow==9.5.0
 trimesh
 pyrender
 pyglet
 mmcv==2.1.0
 mmdet==3.2.0
+mmpose==0.28.0
 eval_type_backport
 wget