crypto-code committed
Commit: e77fc2d
Parent: 686c4ae

Update app.py

Files changed (1):
  app.py  +15 -12
app.py CHANGED

@@ -20,7 +20,6 @@ import torchvision.transforms as transforms
 import av
 import subprocess
 import librosa
-import re
 
 args = {"model": "./ckpts/checkpoint.pth", "llama_type": "7B", "llama_dir": "./ckpts/LLaMA-2",
     "mert_path": "m-a-p/MERT-v1-330M", "vit_path": "google/vit-base-patch16-224", "vivit_path": "google/vivit-b-16x2-kinetics400",
@@ -34,6 +33,8 @@ class dotdict(dict):
 
 args = dotdict(args)
 
+generated_audio_files = []
+
 llama_type = args.llama_type
 llama_ckpt_dir = os.path.join(args.llama_dir, llama_type)
 llama_tokenzier_path = args.llama_dir
@@ -117,6 +118,7 @@ def parse_text(text, image_path, video_path, audio_path):
 
 
 def save_audio_to_local(audio, sec):
+    global generated_audio_files
     if not os.path.exists('temp'):
         os.mkdir('temp')
     filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.wav')
@@ -124,6 +126,7 @@ def save_audio_to_local(audio, sec):
         scipy.io.wavfile.write(filename, rate=16000, data=audio[0])
     else:
         scipy.io.wavfile.write(filename, rate=model.generation_model.config.audio_encoder.sampling_rate, data=audio)
+    generated_audio_files.append(filename)
     return filename
 
 
@@ -159,10 +162,14 @@ def reset_user_input():
 
 
 def reset_dialog():
+    global generated_audio_files
+    generated_audio_files = []
     return [], []
 
 
 def reset_state():
+    global generated_audio_files
+    generated_audio_files = []
     return None, None, None, None, [], [], []
 
 
@@ -209,12 +216,6 @@ def get_video_length(filename):
 def get_audio_length(filename):
     return int(round(librosa.get_duration(path=filename)))
 
-def get_last_audio():
-    for hist in history[::-1]:
-        print(hist)
-        if "<audio controls playsinline>" in hist[1]:
-            return re.search('<audio controls playsinline><source src=\"\.\/file=(.*)\" type="audio\/wav"><\/audio>', hist[1]).group(1)
-    return None
 
 def predict(
     prompt_input,
@@ -227,6 +228,7 @@ def predict(
     history,
     modality_cache,
     audio_length_in_s):
+    global generated_audio_files
     prompts = [llama.format_prompt(prompt_input)]
     prompts = [model.tokenizer(x).input_ids for x in prompts]
     print(image_path, audio_path, video_path)
@@ -244,11 +246,11 @@ def predict(
         container = av.open(video_path)
         indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
         video = read_video_pyav(container=container, indices=indices)
-        generated_audio_file = get_last_audio()
-        if generated_audio_file is not None:
-            audio_length_in_s = get_audio_length(generated_audio_file)
+
+        if len(generated_audio_files) != 0:
+            audio_length_in_s = get_audio_length(generated_audio_files[-1])
             sample_rate = 24000
-            waveform, sr = torchaudio.load(generated_audio_file)
+            waveform, sr = torchaudio.load(generated_audio_files[-1])
             if sample_rate != sr:
                 waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=sample_rate)
             audio = torch.mean(waveform, 0)
@@ -259,6 +261,7 @@ def predict(
         print(f"Video Length: {audio_length_in_s}")
     if audio_path is not None:
        audio_length_in_s = get_audio_length(audio_path)
+        generated_audio_files.append(audio_path)
        print(f"Audio Length: {audio_length_in_s}")
 
     print(image, video, audio)
@@ -350,4 +353,4 @@ with gr.Blocks() as demo:
     ], show_progress=True)
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
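
For context, below is a minimal, self-contained sketch (not code from app.py) of the bookkeeping pattern this commit switches to: a module-level generated_audio_files list that every save path appends to and every reset clears, replacing the removed get_last_audio() regex scrape of the chat-history HTML. The helper names save_wav, latest_audio, and reset_registry are illustrative only.

import os
import tempfile

import numpy as np
import scipy.io.wavfile

# Module-level registry of audio files written during a session,
# mirroring the generated_audio_files list introduced by this commit.
generated_audio_files = []

def save_wav(audio, rate=16000):
    """Write audio to a temp .wav file and record its path (illustrative)."""
    os.makedirs("temp", exist_ok=True)
    # Same temp-name trick app.py uses (tempfile._get_candidate_names()).
    filename = os.path.join("temp", next(tempfile._get_candidate_names()) + ".wav")
    scipy.io.wavfile.write(filename, rate=rate, data=audio)
    generated_audio_files.append(filename)
    return filename

def latest_audio():
    """Most recently written file, or None; replaces the regex lookup."""
    return generated_audio_files[-1] if generated_audio_files else None

def reset_registry():
    """Clear the registry, as reset_dialog()/reset_state() now do."""
    generated_audio_files.clear()

if __name__ == "__main__":
    # One-second 440 Hz test tone, written and then looked up via the registry.
    tone = (np.sin(2 * np.pi * 440 * np.arange(16000) / 16000) * 32767).astype(np.int16)
    print(save_wav(tone))
    print(latest_audio())
    reset_registry()
    print(latest_audio())  # None after a reset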