Upload 7 files

Files added:
- README.md (+76 -0)
- Step0_transcription.py (+119 -0)
- Step1_data_processing.py (+58 -0)
- Step2_extract_feature.py (+20 -0)
- requirements.txt (+131 -0)
- utils_audio.py (+25 -0)

README.md (ADDED, +76)
# Vietnamese Voice Clone

## Data Preparation

***If you use custom data***

- Configure your custom data to follow this format:
- Create a folder: DATA
- Subfolder DATA/wavs, which contains the <audio_id>.wav files
- DATA/train.txt and DATA/val.txt, where each line follows the format <audio_id><space>transcript (see the example below)
- If you don't have transcripts, use the wav2vec inference script (Step0_transcription.py)
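For example, a minimal DATA/train.txt with two recordings (hypothetical audio ids and sentences) would be:

```
audio_0001 xin chào các bạn
audio_0002 hôm nay trời đẹp
```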
***If you try with VIVOS***

```
wget http://ailab.hcmus.edu.vn/assets/vivos.tar.gz
tar xzf vivos.tar.gz
```

```
mkdir -p DATA/wavs
cp -v vivos/*/waves/*/*.wav DATA/wavs
```

```
cat vivos/test/prompts.txt > DATA/val.txt
cat vivos/test/prompts.txt > DATA/train.txt
cat vivos/train/prompts.txt >> DATA/train.txt
```
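Each line of a VIVOS prompts.txt already follows the `<audio_id><space>transcript` format expected above, roughly like this (recalled from the corpus layout, so treat as illustrative):

```
VIVOSSPK01_R001 KHÁCH SẠN
```

Note that the test prompts end up in both val.txt and train.txt.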
## Install environment

```
conda create -y -n viclone python=3.8
conda activate viclone
conda install cudatoolkit=11.3.1 cudnn=8.2.1
```

```
python -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116
python -m pip install -r requirements.txt
```

```
cd vits/monotonic_align
mkdir monotonic_align
python setup.py build_ext --inplace
```

## Process data

```
python Step1_data_processing.py
```
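After this step, each line of DATA/train.txt and DATA/val.txt has been rewritten from `<audio_id> <transcript>` into the pipe-separated triplet the training code reads. Schematically (the phoneme field is whatever `vi2IPA_split` returns, not shown verbatim):

```
<audio_id>.wav|<audio_id>.npy|<IPA phonemes delimited by "/">
```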
## Extract feature

```
python Step2_extract_feature.py
```

## Train model

```
python train_ms.py -c configs/vivos.json -m vivos
```

## Demo

```
python app.py
```

Then open http://127.0.0.1:7860/ in a browser.
Step0_transcription.py (ADDED, +119)
```python
from transformers.file_utils import cached_path, hf_bucket_url
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
import torch
import kenlm
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
import os, zipfile
import argparse, subprocess, tempfile

def extract_audio(filename, channels=1, rate=16000):
    """Extract audio from an input file to a temporary 16 kHz mono WAV file."""
    # note: the temporary wav is not deleted automatically (delete=False)
    temp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    if not os.path.isfile(filename):
        raise Exception("Invalid filepath: {}".format(filename))

    command = ["ffmpeg", "-y", "-i", filename,
               "-ac", str(channels), "-ar", str(rate),
               "-loglevel", "error", temp.name]
    use_shell = True if os.name == "nt" else False
    subprocess.check_output(command, stdin=subprocess.DEVNULL, shell=use_shell)
    return temp.name, rate

class Wav2Vec:
    def __init__(self):
        self.device = "cuda"
        # Load the pretrained Vietnamese wav2vec 2.0 acoustic model
        cache_dir = './cache/'
        self.processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
        # Download and unpack the 4-gram KenLM language model shipped with the checkpoint
        lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
        lm_file = cached_path(lm_file, cache_dir=cache_dir)
        with zipfile.ZipFile(lm_file, 'r') as zip_ref:
            zip_ref.extractall(cache_dir)
        lm_file = cache_dir + 'vi_lm_4grams.bin'
        self.model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
        self.model.to(self.device)

        # Build the n-gram LM beam-search decoder
        self.ngram_lm_model = self.get_decoder_ngram_model(self.processor.tokenizer, lm_file)

    def get_decoder_ngram_model(self, tokenizer, ngram_lm_path):
        vocab_dict = tokenizer.get_vocab()
        sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
        vocab_list = [x[1] for x in sort_vocab][:-2]
        # blank out the CTC blank (pad) and unknown tokens
        vocab_list[tokenizer.pad_token_id] = ""
        vocab_list[tokenizer.unk_token_id] = ""
        # convert the word delimiter token to a space
        vocab_list[tokenizer.word_delimiter_token_id] = " "
        # specify the CTC blank index; conventionally it is the last entry of the logit matrix
        alphabet = Alphabet.build_alphabet(vocab_list, ctc_token_idx=tokenizer.pad_token_id)
        lm_model = kenlm.Model(ngram_lm_path)
        decoder = BeamSearchDecoderCTC(alphabet, language_model=LanguageModel(lm_model))
        return decoder

    def map_to_array(self, batch):
        # read a sound file into memory
        speech, sampling_rate = sf.read(batch["file"])
        batch["speech"] = speech
        batch["sampling_rate"] = sampling_rate
        return batch

    def inference(self, filename):
        ds = self.map_to_array({"file": filename})

        # run the acoustic model
        input_values = self.processor(ds["speech"], sampling_rate=ds["sampling_rate"], return_tensors="pt").input_values
        input_values = input_values.to(self.device)
        logits = self.model(input_values).logits[0]

        # decode the CTC output; return the LM-rescored beam-search hypothesis
        pred_ids = torch.argmax(logits, dim=-1)
        greedy_search_output = self.processor.decode(pred_ids)
        beam_search_output = self.ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
        return beam_search_output

if __name__ == "__main__":
    import glob, tqdm

    parser = argparse.ArgumentParser()
    parser.add_argument('--wavs', default="DATA/wavs", help="folder of input wav files", type=str)
    parser.add_argument('--train_file', default="DATA/train.txt", help="output training transcript file", type=str)
    parser.add_argument('--val_file', default="DATA/val.txt", help="output validation transcript file", type=str)
    args = parser.parse_args()

    w2v = Wav2Vec()
    os.makedirs(os.path.dirname(args.train_file), exist_ok=True)

    count_val = 0
    fw = open(args.train_file, "w+", encoding="utf-8")
    fw_val = open(args.val_file, "w+", encoding="utf-8")
    for i in tqdm.tqdm(glob.glob(args.wavs + "/*.wav")):
        audio_filename, audio_rate = extract_audio(i)
        output = w2v.inference(audio_filename)
        fw.write(i.split("/")[-1] + " " + output + "\n")

        # copy the first 64 utterances into the validation file as well
        if count_val < 64:
            count_val = count_val + 1
            fw_val.write(i.split("/")[-1] + " " + output + "\n")

    fw.close()
    fw_val.close()
```
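If you have no transcripts, a run with the default paths (these match the argparse defaults above) looks like:

```
python Step0_transcription.py --wavs DATA/wavs --train_file DATA/train.txt --val_file DATA/val.txt
```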
Step1_data_processing.py (ADDED, +58)
```python
from viphoneme import vi2IPA_split
import tqdm, glob
from pydub import AudioSegment

def process_text():
    # Rewrite DATA/train.txt in place: "<id> <transcript>" -> "<wav>|<embedding .npy>|<phonemes>"
    f = open("DATA/train.txt", "r", encoding="utf-8")
    lines = f.read().splitlines()
    f.close()

    norm_lines = []
    for line in tqdm.tqdm(lines):
        file, script = line.split(" ", 1)
        if not file.endswith(".wav"):
            file = file + ".wav"
        phoneme = vi2IPA_split(script.lower(), "/")
        # skip utterances that are too short after phonemization
        if len(phoneme.split(" ")) < 4:
            continue
        norm_lines.append(file + "|" + file.replace("/wavs", "/embedding").replace(".wav", ".npy") + "|" + phoneme)
    with open("DATA/train.txt", "w", encoding="utf-8") as out:
        for item in norm_lines:
            out.write(item + "\n")

    # Same processing for DATA/val.txt, also rewritten in place
    f = open("DATA/val.txt", "r", encoding="utf-8")
    lines = f.read().splitlines()
    f.close()

    norm_lines = []
    for line in tqdm.tqdm(lines):
        file, script = line.split(" ", 1)
        if not file.endswith(".wav"):
            file = file + ".wav"
        phoneme = vi2IPA_split(script.lower(), "/")
        if len(phoneme.split(" ")) < 4:
            continue
        norm_lines.append(file + "|" + file.replace("/wavs", "/embedding").replace(".wav", ".npy") + "|" + phoneme)
    with open("DATA/val.txt", "w", encoding="utf-8") as out:
        for item in norm_lines:
            out.write(item + "\n")

def process_speech():
    # Normalize every wav to mono 22050 Hz, overwriting the file in place
    wavs = glob.glob("DATA/wavs/*.wav")
    for wav_file in tqdm.tqdm(wavs):
        audio = AudioSegment.from_file(wav_file)

        if audio.channels == 2:
            # Convert stereo audio to mono
            audio = audio.set_channels(1)

        if audio.frame_rate != 22050:
            # Convert the audio to a 22050 Hz sample rate
            audio = audio.set_frame_rate(22050)

        audio.export(wav_file, format="wav")

if __name__ == "__main__":
    process_text()
    process_speech()
```
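The train and val blocks in `process_text()` are identical; a minimal deduplicated sketch (the helper name `normalize_transcripts` is hypothetical, not part of the repo) could replace them:

```python
def normalize_transcripts(path):
    # hypothetical helper: the same per-line logic as process_text(), applied to one file
    with open(path, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    norm_lines = []
    for line in tqdm.tqdm(lines):
        file, script = line.split(" ", 1)
        if not file.endswith(".wav"):
            file = file + ".wav"
        phoneme = vi2IPA_split(script.lower(), "/")
        if len(phoneme.split(" ")) < 4:
            continue
        norm_lines.append(file + "|" + file.replace("/wavs", "/embedding").replace(".wav", ".npy") + "|" + phoneme)

    with open(path, "w", encoding="utf-8") as out:
        for item in norm_lines:
            out.write(item + "\n")

def process_text():
    normalize_transcripts("DATA/train.txt")
    normalize_transcripts("DATA/val.txt")
```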
Step2_extract_feature.py (ADDED, +20)
```python
import os
import glob
import numpy as np
import tqdm
from resemblyzer import preprocess_wav, VoiceEncoder

encoder = VoiceEncoder(device='cpu')

def extract_speaker_embedding():
    # Compute a Resemblyzer speaker embedding for every wav in DATA/wavs
    wavs = glob.glob("DATA/wavs/*.wav")

    os.makedirs("DATA/embedding", exist_ok=True)
    for path in tqdm.tqdm(wavs):
        wav = preprocess_wav(path)
        embed = encoder.embed_utterance(wav)  # 256-dim utterance embedding
        np.save(path.replace("wavs", "embedding").replace(".wav", ".npy"), embed)

if __name__ == '__main__':
    extract_speaker_embedding()
```
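A quick sanity check that the embeddings were written correctly (`audio_0001` is the hypothetical id from the data-preparation example):

```python
import numpy as np

emb = np.load("DATA/embedding/audio_0001.npy")
print(emb.shape)  # Resemblyzer utterance embeddings are 256-dimensional
```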
requirements.txt (ADDED, +131)
```
absl-py==2.0.0
aiofiles==23.2.1
altair==5.1.2
annotated-types==0.6.0
anyio==3.7.1
attrs==23.1.0
audioread==3.0.1
Babel==2.13.1
cachetools==4.2.4
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
clldutils==3.20.0
colorama==0.4.6
colorlog==6.7.0
csvw==3.1.3
cycler==0.12.1
Cython==0.29.21
decorator==4.4.2
eng-to-ipa==0.0.2
exceptiongroup==1.1.3
fastapi==0.104.1
ffmpy==0.3.1
filelock==3.13.1
fsspec==2023.10.0
google-auth==2.23.4
google-auth-oauthlib==1.0.0
gradio==4.1.1
gradio_client==0.7.0
grpcio==1.59.2
h11==0.14.0
httpcore==1.0.1
httpx==0.25.1
huggingface-hub==0.18.0
idna==3.4
imageio==2.32.0
imageio-ffmpeg==0.4.9
importlib-metadata==6.8.0
importlib-resources==6.1.0
isodate==0.6.1
Jinja2==3.1.2
joblib==1.3.2
jsonschema==4.19.2
jsonschema-specifications==2023.7.1
kiwisolver==1.4.5
language-tags==1.2.0
lazy_loader==0.3
librosa==0.8.0
llvmlite==0.37.0
lxml==4.9.3
Markdown==3.5.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.3.1
mdurl==0.1.2
moviepy==1.0.3
msgpack==1.0.7
nltk==3.8.1
numba==0.54.0
numpy==1.19.5
oauthlib==3.2.2
orjson==3.9.10
packaging==23.2
pandas==1.1.5
phonemizer==2.2.1
Pillow==9.5.0
pkgutil_resolve_name==1.3.10
platformdirs==3.11.0
pooch==1.8.0
proglog==0.1.10
protobuf==3.20.0
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pydantic==2.4.2
pydantic_core==2.10.1
pydub==0.25.1
Pygments==2.16.1
pylatexenc==2.10
pyparsing==3.1.1
python-crfsuite==0.9.9
python-dateutil==2.8.2
python-multipart==0.0.6
pytz==2023.3.post1
PyYAML==6.0.1
rdflib==7.0.0
referencing==0.30.2
regex==2023.10.3
requests==2.31.0
requests-oauthlib==1.3.1
resampy==0.4.2
rfc3986==1.5.0
rich==13.6.0
rpds-py==0.12.0
rsa==4.9
scikit-learn==1.3.2
scipy==1.10.1
segments==2.2.1
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.0
soundfile==0.12.1
soxr==0.3.7
starlette==0.27.0
tabulate==0.9.0
tensorboard==2.14.0
tensorboard-data-server==0.7.2
tensorboard-plugin-wit==1.8.1
threadpoolctl==3.2.0
tomlkit==0.12.0
toolz==0.12.0
torch==1.12.0+cu116
torchaudio==0.12.0+cu116
torchvision==0.13.0+cu116
tqdm==4.66.1
typer==0.9.0
typing_extensions==4.8.0
tzdata==2023.3
underthesea==6.8.0
underthesea_core==1.0.4
Unidecode==1.1.1
uritemplate==4.1.1
urllib3==2.0.7
uvicorn==0.24.0.post1
vinorm==2.0.7
webrtcvad==2.0.10
websockets==11.0.3
Werkzeug==3.0.1
zipp==3.17.0
```
utils_audio.py (ADDED, +25)
```python
import os
from moviepy.editor import VideoFileClip
from pydub import AudioSegment

def convert_to_wav(input_file):
    # Build the output path from the stem so uppercase extensions (.MP4) work too
    base, extension = os.path.splitext(input_file)
    extension = extension.lower()  # case-insensitive extension check
    output_wav_file = base + ".wav"
    if extension == ".wav":
        return input_file
    if extension == ".mp4":
        video_clip = VideoFileClip(input_file)
        audio_clip = video_clip.audio
        audio_clip.write_audiofile(output_wav_file)
        audio_clip.close()
        video_clip.close()
        print(f"{input_file} (MP4) converted to {output_wav_file}")
        return output_wav_file
    elif extension == ".mp3":
        audio_clip = AudioSegment.from_mp3(input_file)
        audio_clip.export(output_wav_file, format="wav")
        print(f"{input_file} (MP3) converted to {output_wav_file}")
        return output_wav_file
    else:
        print(f"Unsupported file format: {extension}")
        return input_file
```
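Usage sketch with a hypothetical input file; the returned wav path can then go through the transcription and embedding steps:

```python
from utils_audio import convert_to_wav

wav_path = convert_to_wav("recording.mp4")  # writes recording.wav next to the input
```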