fhieni committed on
Commit 98115b8
1 Parent(s): a448668

Upload 7 files
README.md ADDED
@@ -0,0 +1,76 @@
# Vietnamese Voice Clone

## Data Preparation

***If you use custom data***

- Configure your custom data to follow this format:

- Create a folder: DATA

- Subfolder: DATA/wavs, which contains the <audio_id>.wav files

- DATA/train.txt and DATA/val.txt: each line follows the format <audio_id><space>transcript

- If you don't have transcripts, see the wav2vec inference script (Step0_transcription.py); a sample layout is sketched below.
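
For reference, a minimal DATA layout might look like this (the file names and transcripts are illustrative):

```
DATA/
├── wavs/
│   ├── utt_0001.wav
│   └── utt_0002.wav
├── train.txt   # e.g. "utt_0001 xin chào các bạn"
└── val.txt     # e.g. "utt_0002 hẹn gặp lại"
```
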
***If you try with VIVOS***

```
wget http://ailab.hcmus.edu.vn/assets/vivos.tar.gz
tar xzf vivos.tar.gz
```

```
mkdir -p DATA/wavs
cp -v vivos/*/waves/*/*.wav DATA/wavs
```

```
cat vivos/test/prompts.txt > DATA/val.txt
cat vivos/test/prompts.txt > DATA/train.txt
cat vivos/train/prompts.txt >> DATA/train.txt
```

## Install environment

```
conda create -y -n viclone python=3.8
conda activate viclone
conda install cudatoolkit=11.3.1 cudnn=8.2.1
```

```
python -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
python -m pip install -r requirements.txt
```

Build the monotonic alignment extension used by VITS:

```
cd vits/monotonic_align
mkdir monotonic_align
python setup.py build_ext --inplace
```

## Process data

Convert the transcripts to phoneme sequences and normalize all audio to mono 22050 Hz:

```
python Step1_data_processing.py
```

## Extract feature

Extract a 256-dimensional speaker embedding for each utterance with Resemblyzer:

```
python Step2_extract_feature.py
```

## Train model

```
python train_ms.py -c configs/vivos.json -m vivos
```

## Demo

```
python app.py
```

Then open http://127.0.0.1:7860/ in your browser.
Step0_transcription.py ADDED
@@ -0,0 +1,119 @@
from transformers.file_utils import cached_path, hf_bucket_url
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
import torch
import kenlm
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
import os, zipfile
import argparse, subprocess, tempfile
import glob, tqdm

def extract_audio(filename, channels=1, rate=16000):
    """
    Extract audio from an input file to a temporary WAV file.
    """
    temp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    if not os.path.isfile(filename):
        print("The given file does not exist: {}".format(filename))
        raise Exception("Invalid filepath: {}".format(filename))

    command = ["ffmpeg", "-y", "-i", filename,
               "-ac", str(channels), "-ar", str(rate),
               "-loglevel", "error", temp.name]
    use_shell = True if os.name == "nt" else False
    subprocess.check_output(command, stdin=open(os.devnull), shell=use_shell)
    return temp.name, rate

class Wav2Vec:
    def __init__(self):
        self.device = "cuda"
        # Load the pretrained Vietnamese wav2vec 2.0 model
        cache_dir = './cache/'
        self.processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
        lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
        lm_file = cached_path(lm_file, cache_dir=cache_dir)
        with zipfile.ZipFile(lm_file, 'r') as zip_ref:
            zip_ref.extractall(cache_dir)
        lm_file = cache_dir + 'vi_lm_4grams.bin'
        self.model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
        self.model.to(self.device)

        # Load the 4-gram KenLM language model for beam-search decoding
        self.ngram_lm_model = self.get_decoder_ngram_model(self.processor.tokenizer, lm_file)

    def get_decoder_ngram_model(self, tokenizer, ngram_lm_path):
        vocab_dict = tokenizer.get_vocab()
        sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
        vocab_list = [x[1] for x in sort_vocab][:-2]
        # convert the CTC blank character representation
        vocab_list[tokenizer.pad_token_id] = ""
        # replace special characters
        vocab_list[tokenizer.unk_token_id] = ""
        # convert the space character representation
        vocab_list[tokenizer.word_delimiter_token_id] = " "
        # specify the CTC blank char index, since conventionally it is the last entry of the logit matrix
        alphabet = Alphabet.build_alphabet(vocab_list, ctc_token_idx=tokenizer.pad_token_id)
        lm_model = kenlm.Model(ngram_lm_path)
        decoder = BeamSearchDecoderCTC(alphabet, language_model=LanguageModel(lm_model))
        return decoder

    def map_to_array(self, batch):
        # Read a sound file into a float array plus its sampling rate
        speech, sampling_rate = sf.read(batch["file"])
        batch["speech"] = speech
        batch["sampling_rate"] = sampling_rate
        return batch

    def inference(self, filename):
        # Load the audio file
        ds = self.map_to_array({"file": filename})

        # Run the acoustic model
        input_values = self.processor(ds["speech"], sampling_rate=ds["sampling_rate"], return_tensors="pt").input_values
        input_values = input_values.to(self.device)
        logits = self.model(input_values).logits[0]

        # Greedy CTC decoding (kept for reference)
        pred_ids = torch.argmax(logits, dim=-1)
        greedy_search_output = self.processor.decode(pred_ids)
        # Beam-search decoding rescored with the n-gram LM
        beam_search_output = self.ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
        return beam_search_output

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--wavs', default="DATA/wavs", help="folder containing the input .wav files", type=str)
    parser.add_argument('--train_file', default="DATA/train.txt", help="output transcript file for training", type=str)
    parser.add_argument('--val_file', default="DATA/val.txt", help="output transcript file for validation", type=str)
    args = parser.parse_args()

    os.makedirs(os.path.dirname(args.train_file), exist_ok=True)

    w2v = Wav2Vec()
    count_val = 0

    fw = open(args.train_file, "w+", encoding="utf-8")
    fw_val = open(args.val_file, "w+", encoding="utf-8")
    for i in tqdm.tqdm(glob.glob(args.wavs + "/*.wav")):
        audio_filename, audio_rate = extract_audio(i)
        output = w2v.inference(audio_filename)
        fw.write(i.split("/")[-1] + " " + output + "\n")

        # Mirror the first 64 utterances into the validation file
        if count_val < 64:
            count_val = count_val + 1
            fw_val.write(i.split("/")[-1] + " " + output + "\n")

    fw.close()
    fw_val.close()
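
For a single recording, the class above can also be used directly; a minimal sketch (the input path is illustrative, and a CUDA GPU is assumed since the class hard-codes self.device = "cuda"):

```
from Step0_transcription import Wav2Vec, extract_audio

w2v = Wav2Vec()                                 # downloads the acoustic model and 4-gram LM on first run
wav_path, rate = extract_audio("my_clip.mp3")   # any ffmpeg-readable input -> 16 kHz mono WAV
print(w2v.inference(wav_path))                  # beam-search transcript
```
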
Step1_data_processing.py ADDED
@@ -0,0 +1,58 @@
from viphoneme import vi2IPA_split
import tqdm, glob
from pydub import AudioSegment

def normalize_filelist(path):
    # Rewrite "<audio_id> <transcript>" lines into the pipe-separated format
    # "<wav>|<speaker embedding .npy>|<phoneme sequence>" expected downstream.
    with open(path, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    norm_lines = []
    for line in tqdm.tqdm(lines):
        file, script = line.split(" ", 1)
        if not file.endswith(".wav"):
            file = file + ".wav"
        phoneme = vi2IPA_split(script.lower(), "/")
        # Skip utterances whose phoneme sequence is too short to be useful
        if len(phoneme.split(" ")) < 4:
            continue
        norm_lines.append(file + "|" + file.replace("/wavs", "/embedding").replace(".wav", ".npy") + "|" + phoneme)

    with open(path, "w", encoding="utf-8") as f:
        for item in norm_lines:
            f.write(item + "\n")

def process_text():
    normalize_filelist("DATA/train.txt")
    normalize_filelist("DATA/val.txt")

def process_speech():
    wavs = glob.glob("DATA/wavs/*.wav")
    for wav_file in tqdm.tqdm(wavs):
        audio = AudioSegment.from_file(wav_file)

        if audio.channels == 2:
            # Convert stereo audio to mono
            audio = audio.set_channels(1)

        if audio.frame_rate != 22050:
            # Resample to a 22050 Hz sample rate
            audio = audio.set_frame_rate(22050)

        audio.export(wav_file, format="wav")

if __name__ == "__main__":
    process_text()
    process_speech()
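
After this step, each line of DATA/train.txt and DATA/val.txt is a pipe-separated filelist entry. An illustrative before/after (the exact phoneme string depends on the viphoneme version, so it is left as a placeholder):

```
# before (Step0 output):
utt_0001.wav xin chào các bạn
# after (Step1 output):
utt_0001.wav|utt_0001.npy|<"/"-delimited IPA phonemes from vi2IPA_split>
```
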
Step2_extract_feature.py ADDED
@@ -0,0 +1,20 @@
import os
import glob, tqdm
import numpy as np
from resemblyzer import preprocess_wav, VoiceEncoder

encoder = VoiceEncoder(device='cpu')

def extract_speaker_embedding():
    wavs = glob.glob("DATA/wavs/*.wav")

    os.makedirs("DATA/embedding", exist_ok=True)
    for path in tqdm.tqdm(wavs):
        wav = preprocess_wav(path)
        # 256-dimensional d-vector characterizing the speaker
        embed = encoder.embed_utterance(wav)  # shape: (256,)
        np.save(path.replace("wavs", "embedding").replace(".wav", ".npy"), embed)

if __name__ == '__main__':
    extract_speaker_embedding()
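
A quick sanity check on the saved features (the file name is illustrative):

```
import numpy as np

embed = np.load("DATA/embedding/utt_0001.npy")
assert embed.shape == (256,)  # Resemblyzer d-vectors are 256-dimensional
```
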
requirements.txt ADDED
@@ -0,0 +1,131 @@
absl-py==2.0.0
aiofiles==23.2.1
altair==5.1.2
annotated-types==0.6.0
anyio==3.7.1
attrs==23.1.0
audioread==3.0.1
Babel==2.13.1
cachetools==4.2.4
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
clldutils==3.20.0
colorama==0.4.6
colorlog==6.7.0
csvw==3.1.3
cycler==0.12.1
Cython==0.29.21
decorator==4.4.2
eng-to-ipa==0.0.2
exceptiongroup==1.1.3
fastapi==0.104.1
ffmpy==0.3.1
filelock==3.13.1
fsspec==2023.10.0
google-auth==2.23.4
google-auth-oauthlib==1.0.0
gradio==4.1.1
gradio_client==0.7.0
grpcio==1.59.2
h11==0.14.0
httpcore==1.0.1
httpx==0.25.1
huggingface-hub==0.18.0
idna==3.4
imageio==2.32.0
imageio-ffmpeg==0.4.9
importlib-metadata==6.8.0
importlib-resources==6.1.0
isodate==0.6.1
Jinja2==3.1.2
joblib==1.3.2
jsonschema==4.19.2
jsonschema-specifications==2023.7.1
kiwisolver==1.4.5
language-tags==1.2.0
lazy_loader==0.3
librosa==0.8.0
llvmlite==0.37.0
lxml==4.9.3
Markdown==3.5.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.3.1
mdurl==0.1.2
moviepy==1.0.3
msgpack==1.0.7
nltk==3.8.1
numba==0.54.0
numpy==1.19.5
oauthlib==3.2.2
orjson==3.9.10
packaging==23.2
pandas==1.1.5
phonemizer==2.2.1
Pillow==9.5.0
pkgutil_resolve_name==1.3.10
platformdirs==3.11.0
pooch==1.8.0
proglog==0.1.10
protobuf==3.20.0
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pydantic==2.4.2
pydantic_core==2.10.1
pydub==0.25.1
Pygments==2.16.1
pylatexenc==2.10
pyparsing==3.1.1
python-crfsuite==0.9.9
python-dateutil==2.8.2
python-multipart==0.0.6
pytz==2023.3.post1
PyYAML==6.0.1
rdflib==7.0.0
referencing==0.30.2
regex==2023.10.3
requests==2.31.0
requests-oauthlib==1.3.1
resampy==0.4.2
rfc3986==1.5.0
rich==13.6.0
rpds-py==0.12.0
rsa==4.9
scikit-learn==1.3.2
scipy==1.10.1
segments==2.2.1
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.0
soundfile==0.12.1
soxr==0.3.7
starlette==0.27.0
tabulate==0.9.0
tensorboard==2.14.0
tensorboard-data-server==0.7.2
tensorboard-plugin-wit==1.8.1
threadpoolctl==3.2.0
tomlkit==0.12.0
toolz==0.12.0
torch==1.12.0+cu116
torchaudio==0.12.0+cu116
torchvision==0.13.0+cu116
tqdm==4.66.1
typer==0.9.0
typing_extensions==4.8.0
tzdata==2023.3
underthesea==6.8.0
underthesea_core==1.0.4
Unidecode==1.1.1
uritemplate==4.1.1
urllib3==2.0.7
uvicorn==0.24.0.post1
vinorm==2.0.7
webrtcvad==2.0.10
websockets==11.0.3
Werkzeug==3.0.1
zipp==3.17.0
utils_audio.py ADDED
@@ -0,0 +1,25 @@
import os
from moviepy.editor import VideoFileClip
from pydub import AudioSegment

def convert_to_wav(input_file):
    root, extension = os.path.splitext(input_file)
    extension = extension.lower()  # normalize for case-insensitive matching
    output_wav_file = root + ".wav"
    if extension == ".wav":
        return output_wav_file
    if extension == ".mp4":
        # Extract the audio track from the video
        video_clip = VideoFileClip(input_file)
        audio_clip = video_clip.audio
        audio_clip.write_audiofile(output_wav_file)
        audio_clip.close()
        print(f"{input_file} (MP4) converted to {output_wav_file}")
        return output_wav_file
    elif extension == ".mp3":
        audio_clip = AudioSegment.from_mp3(input_file)
        audio_clip.export(output_wav_file, format="wav")
        print(f"{input_file} (MP3) converted to {output_wav_file}")
        return output_wav_file
    else:
        print(f"Unsupported file format: {extension}")
        return input_file
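
A minimal usage sketch (the input path is illustrative):

```
from utils_audio import convert_to_wav

wav_path = convert_to_wav("recording.mp4")  # writes recording.wav next to the input
print(wav_path)
```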