Draft: separate an audio recording into per-speaker tracks
Browse files
app.py
CHANGED
@@ -1,4 +1,57 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
|
4 |
with gr.Blocks() as demo:
|
|
|
1 |
import os
import subprocess

import gradio as gr
import numpy as np
import soundfile as sf
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# import torch
|
8 |
+
|
9 |
+
SAMPLE_RATE = 8000
|
10 |
+
|
11 |
+
def get_sample_rate(audio_file_path):
    """Return the sample rate (Hz) of an audio file.

    Uses soundfile's metadata reader, which parses only the file header.
    The original implementation called ``sf.read`` and decoded every
    audio frame just to discard them, which is wasteful for long files.

    Args:
        audio_file_path: Path to an audio file readable by libsndfile.

    Returns:
        int: The file's sample rate in Hz.
    """
    # sf.info never loads the sample data, unlike sf.read.
    return sf.info(audio_file_path).samplerate
|
14 |
+
|
15 |
+
def change_sample_rate(input_audio_file_path, output_audio_file_path, sample_rate):
    """Resample an audio file to ``sample_rate`` Hz using ffmpeg.

    Equivalent to: ``ffmpeg -i <input> -ar <sample_rate> <output>``

    Args:
        input_audio_file_path: Source audio file.
        output_audio_file_path: Destination file; ffmpeg infers the
            container/codec from its extension.
        sample_rate: Target sample rate in Hz.

    Note:
        Matching the original ``os.system`` call, the ffmpeg exit status
        is not checked; a failed conversion simply leaves the output
        file absent.
    """
    # List-form subprocess avoids the shell, so paths containing spaces
    # or shell metacharacters cannot break (or inject into) the command.
    subprocess.run([
        'ffmpeg',
        '-i', str(input_audio_file_path),
        '-ar', str(sample_rate),
        str(output_audio_file_path),
    ])
|
18 |
+
|
19 |
+
def audio_is_stereo(audio_file_path):
    """Return True if the audio file has exactly two channels.

    Reads only the file header via ``sf.info``; the original decoded the
    entire file with ``sf.read`` just to inspect the channel count.

    Args:
        audio_file_path: Path to an audio file readable by libsndfile.

    Returns:
        bool: True for two-channel (stereo) audio, False otherwise
        (mono files, and also files with more than two channels, exactly
        as in the original shape check).
    """
    return sf.info(audio_file_path).channels == 2
|
22 |
+
|
23 |
+
def set_mono(input_audio_file_path, output_audio_file_path):
    """Downmix an audio file to a single channel using ffmpeg.

    Equivalent to: ``ffmpeg -i <input> -ac 1 <output>``

    Args:
        input_audio_file_path: Source audio file.
        output_audio_file_path: Destination mono file.

    Note:
        As in the original ``os.system`` call, the ffmpeg exit status is
        ignored.
    """
    # Shell-free invocation: robust against spaces/metacharacters in paths.
    subprocess.run([
        'ffmpeg',
        '-i', str(input_audio_file_path),
        '-ac', '1',
        str(output_audio_file_path),
    ])
|
25 |
+
|
26 |
+
# ---------------------------------------------------------------------------
# Demo pipeline: download a sample recording, normalise it to the 8 kHz mono
# format the MossFormer separation model expects, run speaker separation,
# and write one WAV file per separated speaker.
# ---------------------------------------------------------------------------

# Download the demo recording only once: previously a re-run made wget save
# duplicates (vocals.wav.1, ...) that were never used by the script.
if not os.path.exists('vocals.wav'):
    os.system('wget https://maximofn.com/wp-content/uploads/2023/10/vocals.wav')

# NOTE(review): `input` shadows the builtin; the name is kept because later
# (unseen) code in this module may reference it.
input = "vocals.wav"
input_8k = "vocals_8k.wav"
input_8k_mono = "vocals_8k_mono.wav"

sr = get_sample_rate(input)

# The separation model works at 8 kHz, so resample only when necessary;
# otherwise reuse the original file path.
if sr != SAMPLE_RATE:
    change_sample_rate(input, input_8k, SAMPLE_RATE)
else:
    input_8k = input

# The model expects a single channel: downmix stereo input, reuse otherwise.
if audio_is_stereo(input_8k):
    set_mono(input_8k, input_8k_mono)
else:
    input_8k_mono = input_8k

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
print("Separating...")
result = separation(input_8k_mono)
print("Separated!")

print("Saving...")
# Each entry of output_pcm_list is a raw 16-bit PCM byte stream for one
# separated speaker; reinterpret the bytes as int16 samples and write a WAV.
for i, signal in enumerate(result['output_pcm_list']):
    save_file = f'output_spk{i}.wav'
    sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
print("Saved!")
|
55 |
|
56 |
|
57 |
with gr.Blocks() as demo:
|