Upload 7 files

Files added:
- README.md (+76 -0)
- Step0_transcription.py (+119 -0)
- Step1_data_processing.py (+58 -0)
- Step2_extract_feature.py (+20 -0)
- requirements.txt (+131 -0)
- utils_audio.py (+25 -0)

README.md (ADDED, +76)
# Vietnamese Voice Clone

## Data Preparation

***If you use custom data***

- Configure your custom data to follow this format:
- Create a folder: DATA
- Subfolder DATA/wavs, which contains the <audio_id>.wav files
- DATA/train.txt and DATA/val.txt, where each line follows the format <audio_id><space>transcript (see the example below)
- If you don't have transcripts, use the wav2vec inference script (Step0_transcription.py)
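For example, a minimal DATA/train.txt with two recordings (hypothetical audio ids and sentences) would be:

```
audio_0001 xin chào các bạn
audio_0002 hôm nay trời đẹp
```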
***If you try with VIVOS***

```
wget http://ailab.hcmus.edu.vn/assets/vivos.tar.gz
tar xzf vivos.tar.gz
```

```
mkdir -p DATA/wavs
cp -v vivos/*/waves/*/*.wav DATA/wavs
```

```
cat vivos/test/prompts.txt > DATA/val.txt
cat vivos/test/prompts.txt > DATA/train.txt
cat vivos/train/prompts.txt >> DATA/train.txt
```
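Each line of a VIVOS prompts.txt already follows the `<audio_id><space>transcript` format expected above, roughly like this (recalled from the corpus layout, so treat as illustrative):

```
VIVOSSPK01_R001 KHÁCH SẠN
```

Note that the test prompts end up in both val.txt and train.txt.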
## Install environment

```
conda create -y -n viclone python=3.8
conda activate viclone
conda install cudatoolkit=11.3.1 cudnn=8.2.1
```

```
python -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116
python -m pip install -r requirements.txt
```

```
cd vits/monotonic_align
mkdir monotonic_align
python setup.py build_ext --inplace
```

## Process data

```
python Step1_data_processing.py
```
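After this step, each line of DATA/train.txt and DATA/val.txt has been rewritten from `<audio_id> <transcript>` into the pipe-separated triplet the training code reads. Schematically (the phoneme field is whatever `vi2IPA_split` returns, not shown verbatim):

```
<audio_id>.wav|<audio_id>.npy|<IPA phonemes delimited by "/">
```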
## Extract feature

```
python Step2_extract_feature.py
```

## Train model

```
python train_ms.py -c configs/vivos.json -m vivos
```

## Demo

```
python app.py
```

Then open http://127.0.0.1:7860/ in a browser.
Step0_transcription.py (ADDED, +119)
```python
from transformers.file_utils import cached_path, hf_bucket_url
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
import torch
import kenlm
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
import os, zipfile
import argparse, subprocess, tempfile

def extract_audio(filename, channels=1, rate=16000):
    """Extract audio from an input file to a temporary 16 kHz mono WAV file."""
    # note: the temporary wav is not deleted automatically (delete=False)
    temp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    if not os.path.isfile(filename):
        raise Exception("Invalid filepath: {}".format(filename))

    command = ["ffmpeg", "-y", "-i", filename,
               "-ac", str(channels), "-ar", str(rate),
               "-loglevel", "error", temp.name]
    use_shell = True if os.name == "nt" else False
    subprocess.check_output(command, stdin=subprocess.DEVNULL, shell=use_shell)
    return temp.name, rate

class Wav2Vec:
    def __init__(self):
        self.device = "cuda"
        # Load the pretrained Vietnamese wav2vec 2.0 acoustic model
        cache_dir = './cache/'
        self.processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
        # Download and unpack the 4-gram KenLM language model shipped with the checkpoint
        lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
        lm_file = cached_path(lm_file, cache_dir=cache_dir)
        with zipfile.ZipFile(lm_file, 'r') as zip_ref:
            zip_ref.extractall(cache_dir)
        lm_file = cache_dir + 'vi_lm_4grams.bin'
        self.model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
        self.model.to(self.device)

        # Build the n-gram LM beam-search decoder
        self.ngram_lm_model = self.get_decoder_ngram_model(self.processor.tokenizer, lm_file)

    def get_decoder_ngram_model(self, tokenizer, ngram_lm_path):
        vocab_dict = tokenizer.get_vocab()
        sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
        vocab_list = [x[1] for x in sort_vocab][:-2]
        # blank out the CTC blank (pad) and unknown tokens
        vocab_list[tokenizer.pad_token_id] = ""
        vocab_list[tokenizer.unk_token_id] = ""
        # convert the word delimiter token to a space
        vocab_list[tokenizer.word_delimiter_token_id] = " "
        # specify the CTC blank index; conventionally it is the last entry of the logit matrix
        alphabet = Alphabet.build_alphabet(vocab_list, ctc_token_idx=tokenizer.pad_token_id)
        lm_model = kenlm.Model(ngram_lm_path)
        decoder = BeamSearchDecoderCTC(alphabet, language_model=LanguageModel(lm_model))
        return decoder

    def map_to_array(self, batch):
        # read a sound file into memory
        speech, sampling_rate = sf.read(batch["file"])
        batch["speech"] = speech
        batch["sampling_rate"] = sampling_rate
        return batch

    def inference(self, filename):
        ds = self.map_to_array({"file": filename})

        # run the acoustic model
        input_values = self.processor(ds["speech"], sampling_rate=ds["sampling_rate"], return_tensors="pt").input_values
        input_values = input_values.to(self.device)
        logits = self.model(input_values).logits[0]

        # decode the CTC output; return the LM-rescored beam-search hypothesis
        pred_ids = torch.argmax(logits, dim=-1)
        greedy_search_output = self.processor.decode(pred_ids)
        beam_search_output = self.ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
        return beam_search_output

if __name__ == "__main__":
    import glob, tqdm

    parser = argparse.ArgumentParser()
    parser.add_argument('--wavs', default="DATA/wavs", help="folder of input wav files", type=str)
    parser.add_argument('--train_file', default="DATA/train.txt", help="output training transcript file", type=str)
    parser.add_argument('--val_file', default="DATA/val.txt", help="output validation transcript file", type=str)
    args = parser.parse_args()

    w2v = Wav2Vec()
    os.makedirs(os.path.dirname(args.train_file), exist_ok=True)

    count_val = 0
    fw = open(args.train_file, "w+", encoding="utf-8")
    fw_val = open(args.val_file, "w+", encoding="utf-8")
    for i in tqdm.tqdm(glob.glob(args.wavs + "/*.wav")):
        audio_filename, audio_rate = extract_audio(i)
        output = w2v.inference(audio_filename)
        fw.write(i.split("/")[-1] + " " + output + "\n")

        # copy the first 64 utterances into the validation file as well
        if count_val < 64:
            count_val = count_val + 1
            fw_val.write(i.split("/")[-1] + " " + output + "\n")

    fw.close()
    fw_val.close()
```
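If you have no transcripts, a run with the default paths (these match the argparse defaults above) looks like:

```
python Step0_transcription.py --wavs DATA/wavs --train_file DATA/train.txt --val_file DATA/val.txt
```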
Step1_data_processing.py (ADDED, +58)
```python
from viphoneme import vi2IPA_split
import tqdm, glob
from pydub import AudioSegment

def process_text():
    # Rewrite DATA/train.txt in place: "<id> <transcript>" -> "<wav>|<embedding .npy>|<phonemes>"
    f = open("DATA/train.txt", "r", encoding="utf-8")
    lines = f.read().splitlines()
    f.close()

    norm_lines = []
    for line in tqdm.tqdm(lines):
        file, script = line.split(" ", 1)
        if not file.endswith(".wav"):
            file = file + ".wav"
        phoneme = vi2IPA_split(script.lower(), "/")
        # skip utterances that are too short after phonemization
        if len(phoneme.split(" ")) < 4:
            continue
        norm_lines.append(file + "|" + file.replace("/wavs", "/embedding").replace(".wav", ".npy") + "|" + phoneme)
    with open("DATA/train.txt", "w", encoding="utf-8") as out:
        for item in norm_lines:
            out.write(item + "\n")

    # Same processing for DATA/val.txt, also rewritten in place
    f = open("DATA/val.txt", "r", encoding="utf-8")
    lines = f.read().splitlines()
    f.close()

    norm_lines = []
    for line in tqdm.tqdm(lines):
        file, script = line.split(" ", 1)
        if not file.endswith(".wav"):
            file = file + ".wav"
        phoneme = vi2IPA_split(script.lower(), "/")
        if len(phoneme.split(" ")) < 4:
            continue
        norm_lines.append(file + "|" + file.replace("/wavs", "/embedding").replace(".wav", ".npy") + "|" + phoneme)
    with open("DATA/val.txt", "w", encoding="utf-8") as out:
        for item in norm_lines:
            out.write(item + "\n")

def process_speech():
    # Normalize every wav to mono 22050 Hz, overwriting the file in place
    wavs = glob.glob("DATA/wavs/*.wav")
    for wav_file in tqdm.tqdm(wavs):
        audio = AudioSegment.from_file(wav_file)

        if audio.channels == 2:
            # Convert stereo audio to mono
            audio = audio.set_channels(1)

        if audio.frame_rate != 22050:
            # Convert the audio to a 22050 Hz sample rate
            audio = audio.set_frame_rate(22050)

        audio.export(wav_file, format="wav")

if __name__ == "__main__":
    process_text()
    process_speech()
```
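The train and val blocks in `process_text()` are identical; a minimal deduplicated sketch (the helper name `normalize_transcripts` is hypothetical, not part of the repo) could replace them:

```python
def normalize_transcripts(path):
    # hypothetical helper: the same per-line logic as process_text(), applied to one file
    with open(path, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    norm_lines = []
    for line in tqdm.tqdm(lines):
        file, script = line.split(" ", 1)
        if not file.endswith(".wav"):
            file = file + ".wav"
        phoneme = vi2IPA_split(script.lower(), "/")
        if len(phoneme.split(" ")) < 4:
            continue
        norm_lines.append(file + "|" + file.replace("/wavs", "/embedding").replace(".wav", ".npy") + "|" + phoneme)

    with open(path, "w", encoding="utf-8") as out:
        for item in norm_lines:
            out.write(item + "\n")

def process_text():
    normalize_transcripts("DATA/train.txt")
    normalize_transcripts("DATA/val.txt")
```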
Step2_extract_feature.py (ADDED, +20)
```python
import os
import glob
import numpy as np
import tqdm
from resemblyzer import preprocess_wav, VoiceEncoder

encoder = VoiceEncoder(device='cpu')

def extract_speaker_embedding():
    # Compute a Resemblyzer speaker embedding for every wav in DATA/wavs
    wavs = glob.glob("DATA/wavs/*.wav")

    os.makedirs("DATA/embedding", exist_ok=True)
    for path in tqdm.tqdm(wavs):
        wav = preprocess_wav(path)
        embed = encoder.embed_utterance(wav)  # 256-dim utterance embedding
        np.save(path.replace("wavs", "embedding").replace(".wav", ".npy"), embed)

if __name__ == '__main__':
    extract_speaker_embedding()
```
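A quick sanity check that the embeddings were written correctly (`audio_0001` is the hypothetical id from the data-preparation example):

```python
import numpy as np

emb = np.load("DATA/embedding/audio_0001.npy")
print(emb.shape)  # Resemblyzer utterance embeddings are 256-dimensional
```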
requirements.txt (ADDED, +131)
```
absl-py==2.0.0
aiofiles==23.2.1
altair==5.1.2
annotated-types==0.6.0
anyio==3.7.1
attrs==23.1.0
audioread==3.0.1
Babel==2.13.1
cachetools==4.2.4
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
clldutils==3.20.0
colorama==0.4.6
colorlog==6.7.0
csvw==3.1.3
cycler==0.12.1
Cython==0.29.21
decorator==4.4.2
eng-to-ipa==0.0.2
exceptiongroup==1.1.3
fastapi==0.104.1
ffmpy==0.3.1
filelock==3.13.1
fsspec==2023.10.0
google-auth==2.23.4
google-auth-oauthlib==1.0.0
gradio==4.1.1
gradio_client==0.7.0
grpcio==1.59.2
h11==0.14.0
httpcore==1.0.1
httpx==0.25.1
huggingface-hub==0.18.0
idna==3.4
imageio==2.32.0
imageio-ffmpeg==0.4.9
importlib-metadata==6.8.0
importlib-resources==6.1.0
isodate==0.6.1
Jinja2==3.1.2
joblib==1.3.2
jsonschema==4.19.2
jsonschema-specifications==2023.7.1
kiwisolver==1.4.5
language-tags==1.2.0
lazy_loader==0.3
librosa==0.8.0
llvmlite==0.37.0
lxml==4.9.3
Markdown==3.5.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.3.1
mdurl==0.1.2
moviepy==1.0.3
msgpack==1.0.7
nltk==3.8.1
numba==0.54.0
numpy==1.19.5
oauthlib==3.2.2
orjson==3.9.10
packaging==23.2
pandas==1.1.5
phonemizer==2.2.1
Pillow==9.5.0
pkgutil_resolve_name==1.3.10
platformdirs==3.11.0
pooch==1.8.0
proglog==0.1.10
protobuf==3.20.0
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pydantic==2.4.2
pydantic_core==2.10.1
pydub==0.25.1
Pygments==2.16.1
pylatexenc==2.10
pyparsing==3.1.1
python-crfsuite==0.9.9
python-dateutil==2.8.2
python-multipart==0.0.6
pytz==2023.3.post1
PyYAML==6.0.1
rdflib==7.0.0
referencing==0.30.2
regex==2023.10.3
requests==2.31.0
requests-oauthlib==1.3.1
resampy==0.4.2
rfc3986==1.5.0
rich==13.6.0
rpds-py==0.12.0
rsa==4.9
scikit-learn==1.3.2
scipy==1.10.1
segments==2.2.1
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.0
soundfile==0.12.1
soxr==0.3.7
starlette==0.27.0
tabulate==0.9.0
tensorboard==2.14.0
tensorboard-data-server==0.7.2
tensorboard-plugin-wit==1.8.1
threadpoolctl==3.2.0
tomlkit==0.12.0
toolz==0.12.0
torch==1.12.0+cu116
torchaudio==0.12.0+cu116
torchvision==0.13.0+cu116
tqdm==4.66.1
typer==0.9.0
typing_extensions==4.8.0
tzdata==2023.3
underthesea==6.8.0
underthesea_core==1.0.4
Unidecode==1.1.1
uritemplate==4.1.1
urllib3==2.0.7
uvicorn==0.24.0.post1
vinorm==2.0.7
webrtcvad==2.0.10
websockets==11.0.3
Werkzeug==3.0.1
zipp==3.17.0
```
utils_audio.py (ADDED, +25)
```python
import os
from moviepy.editor import VideoFileClip
from pydub import AudioSegment

def convert_to_wav(input_file):
    # Build the output path from the stem so uppercase extensions (.MP4) work too
    base, extension = os.path.splitext(input_file)
    extension = extension.lower()  # case-insensitive extension check
    output_wav_file = base + ".wav"
    if extension == ".wav":
        return input_file
    if extension == ".mp4":
        video_clip = VideoFileClip(input_file)
        audio_clip = video_clip.audio
        audio_clip.write_audiofile(output_wav_file)
        audio_clip.close()
        video_clip.close()
        print(f"{input_file} (MP4) converted to {output_wav_file}")
        return output_wav_file
    elif extension == ".mp3":
        audio_clip = AudioSegment.from_mp3(input_file)
        audio_clip.export(output_wav_file, format="wav")
        print(f"{input_file} (MP3) converted to {output_wav_file}")
        return output_wav_file
    else:
        print(f"Unsupported file format: {extension}")
        return input_file
```
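Usage sketch with a hypothetical input file; the returned wav path can then go through the transcription and embedding steps:

```python
from utils_audio import convert_to_wav

wav_path = convert_to_wav("recording.mp4")  # writes recording.wav next to the input
```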