Spaces:

hkchengrex
/

MMAudio

Running on Zero

App Files Community

Rex Cheng committed on Dec 11, 2024

Commit

9ac63db

1 Parent(s): c8ca0bd

zeroGPU

Browse files

Files changed (4) hide show

app.py +15 -16
mmaudio/eval_utils.py +18 -1
mmaudio/utils/download_utils.py +2 -1
requirements.txt +4 -3

app.py CHANGED Viewed

@@ -83,14 +83,15 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
     audio = audios.float().cpu()[0]
     # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    video_save_path = tempfile.mktemp(suffix='.mp4')
     # output_dir.mkdir(exist_ok=True, parents=True)
     # video_save_path = output_dir / f'{current_time_string}.mp4'
-    # make_video(video,
-    #            video_save_path,
-    #            audio,
-    #            sampling_rate=seq_cfg.sampling_rate,
-    #            duration_sec=seq_cfg.duration)
     return video_save_path
@@ -116,11 +117,9 @@ def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int,
                       cfg_strength=cfg_strength)
     audio = audios.float().cpu()[0]
-    # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    # output_dir.mkdir(exist_ok=True, parents=True)
-    # audio_save_path = output_dir / f'{current_time_string}.flac'
-    audio_save_path = tempfile.mktemp(suffix='.flac')
     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
     return audio_save_path
@@ -140,8 +139,8 @@ video_to_audio_tab = gr.Interface(
     title='MMAudio — Video-to-Audio Synthesis',
     examples=[
         [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
-            '',
             '',
             0,
             25,
@@ -185,8 +184,8 @@ video_to_audio_tab = gr.Interface(
             10,
         ],
         [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
-            'waves, seagulls',
             '',
             0,
             25,
@@ -194,8 +193,8 @@ video_to_audio_tab = gr.Interface(
             10,
         ],
         [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
-            'waves, storm',
             '',
             0,
             25,

     audio = audios.float().cpu()[0]
     # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
+    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
     # output_dir.mkdir(exist_ok=True, parents=True)
     # video_save_path = output_dir / f'{current_time_string}.mp4'
+    make_video(video,
+               video_save_path,
+               audio,
+               sampling_rate=seq_cfg.sampling_rate,
+               duration_sec=seq_cfg.duration)
+    log.info(f'Saved video to {video_save_path}')
     return video_save_path
                       cfg_strength=cfg_strength)
     audio = audios.float().cpu()[0]
+    audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
+    log.info(f'Saved audio to {audio_save_path}')
     return audio_save_path
     title='MMAudio — Video-to-Audio Synthesis',
     examples=[
         [
+            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
+            'waves, seagulls',
             '',
             0,
             25,
             10,
         ],
         [
+            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
+            'waves, storm',
             '',
             0,
             25,
             10,
         ],
         [
+            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
+            '',
             '',
             0,
             25,

mmaudio/eval_utils.py CHANGED Viewed

@@ -7,6 +7,7 @@ import torch
 from colorlog import ColoredFormatter
 from torchvision.transforms import v2
 from torio.io import StreamingMediaDecoder, StreamingMediaEncoder
 from mmaudio.model.flow_matching import FlowMatching
 from mmaudio.model.networks import MMAudio
@@ -169,11 +170,13 @@ def load_video(video_path: Path, duration_sec: float) -> tuple[torch.Tensor, tor
     reader = StreamingMediaDecoder(video_path)
     reader.add_basic_video_stream(
         frames_per_chunk=int(_CLIP_FPS * duration_sec),
         frame_rate=_CLIP_FPS,
         format='rgb24',
     )
     reader.add_basic_video_stream(
         frames_per_chunk=int(_SYNC_FPS * duration_sec),
         frame_rate=_SYNC_FPS,
         format='rgb24',
     )
@@ -182,9 +185,14 @@ def load_video(video_path: Path, duration_sec: float) -> tuple[torch.Tensor, tor
     data_chunk = reader.pop_chunks()
     clip_chunk = data_chunk[0]
     sync_chunk = data_chunk[1]
     assert clip_chunk is not None
     assert sync_chunk is not None
     clip_frames = clip_transform(clip_chunk)
     sync_frames = sync_transform(sync_chunk)
@@ -210,17 +218,26 @@ def load_video(video_path: Path, duration_sec: float) -> tuple[torch.Tensor, tor
 def make_video(video_path: Path, output_path: Path, audio: torch.Tensor, sampling_rate: int,
                duration_sec: float):
     approx_max_length = int(duration_sec * 60)
     reader = StreamingMediaDecoder(video_path)
     reader.add_basic_video_stream(
         frames_per_chunk=approx_max_length,
         format='rgb24',
     )
     reader.fill_buffer()
     video_chunk = reader.pop_chunks()[0]
     assert video_chunk is not None
-    fps = int(reader.get_out_stream_info(0).frame_rate)
     if fps > 60:
         log.warning(f'This code supports only up to 60 fps, but the video has {fps} fps')
         log.warning(f'Just change the *60 above me')

 from colorlog import ColoredFormatter
 from torchvision.transforms import v2
 from torio.io import StreamingMediaDecoder, StreamingMediaEncoder
+import av
 from mmaudio.model.flow_matching import FlowMatching
 from mmaudio.model.networks import MMAudio
     reader = StreamingMediaDecoder(video_path)
     reader.add_basic_video_stream(
         frames_per_chunk=int(_CLIP_FPS * duration_sec),
+        buffer_chunk_size=1,
         frame_rate=_CLIP_FPS,
         format='rgb24',
     )
     reader.add_basic_video_stream(
         frames_per_chunk=int(_SYNC_FPS * duration_sec),
+        buffer_chunk_size=1,
         frame_rate=_SYNC_FPS,
         format='rgb24',
     )
     data_chunk = reader.pop_chunks()
     clip_chunk = data_chunk[0]
     sync_chunk = data_chunk[1]
+    print('clip', clip_chunk.shape, clip_chunk.dtype, clip_chunk.max())
+    print('sync', sync_chunk.shape, sync_chunk.dtype, sync_chunk.max())
     assert clip_chunk is not None
     assert sync_chunk is not None
+    for i in range(reader.num_out_streams):
+        print(reader.get_out_stream_info(i))
     clip_frames = clip_transform(clip_chunk)
     sync_frames = sync_transform(sync_chunk)
 def make_video(video_path: Path, output_path: Path, audio: torch.Tensor, sampling_rate: int,
                duration_sec: float):
+    av_video = av.open(video_path)
+    frame_rate = av_video.streams.video[0].guessed_rate
+    print('av frame rate', frame_rate)
     approx_max_length = int(duration_sec * 60)
     reader = StreamingMediaDecoder(video_path)
     reader.add_basic_video_stream(
         frames_per_chunk=approx_max_length,
+        buffer_chunk_size=1,
         format='rgb24',
     )
     reader.fill_buffer()
     video_chunk = reader.pop_chunks()[0]
+    print(video_chunk.shape, video_chunk.dtype, video_chunk.max())
     assert video_chunk is not None
+    # fps = int(reader.get_out_stream_info(0).frame_rate)
+    fps = frame_rate
+    for i in range(reader.num_out_streams):
+        print(reader.get_out_stream_info(i))
     if fps > 60:
         log.warning(f'This code supports only up to 60 fps, but the video has {fps} fps')
         log.warning(f'Just change the *60 above me')

mmaudio/utils/download_utils.py CHANGED Viewed

@@ -30,7 +30,8 @@ links = [
     },
     {
         'name': 'mmaudio_large_44k_v2.pth',
-        'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth',
         'md5': '01ad4464f049b2d7efdaa4c1a59b8dfe'
     },
     {

     },
     {
         'name': 'mmaudio_large_44k_v2.pth',
+        # 'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth',
+        'url': 'https://databank.illinois.edu/datafiles/i1pd9/download',
         'md5': '01ad4464f049b2d7efdaa4c1a59b8dfe'
     },
     {

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
-torch >= 2.5.1
-torchaudio
 torchvision
 python-dotenv
 cython
 gitpython >= 3.1
@@ -23,4 +23,5 @@ hydra_colorlog
 tensordict
 colorlog
 open_clip_torch
-soundfile

+torch == 2.4.0
 torchvision
+torchaudio
 python-dotenv
 cython
 gitpython >= 3.1
 tensordict
 colorlog
 open_clip_torch
+soundfile
+av