Spaces:

nisheeth
/

SSMTDemo

Build error

App Files Files Community

nisheeth committed on Mar 20

Commit

3fe4d91

•

1 Parent(s): 2783814

Upload 7 files

Browse files

Files changed (7) hide show

.gitattributes +6 -34
.gitignore +163 -0
README.md +6 -5
app.py +317 -0
packages.txt +2 -0
pre-requirements.txt +0 -0
requirements.txt +116 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,7 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.json filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*.pt* filter=lfs diff=lfs merge=lfs -text
+*.ckpt* filter=lfs diff=lfs merge=lfs -text
+*.pl filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+*.ini filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,163 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+# *.so
+Temp_Audios/
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# *.ini
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

README.md CHANGED Viewed

@@ -1,12 +1,13 @@
 ---
-title: SSMTDemo
-emoji: 😻
-colorFrom: yellow
-colorTo: blue
 sdk: gradio
-sdk_version: 4.22.0
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: OcTra
+emoji: 🏆
+colorFrom: blue
+colorTo: red
 sdk: gradio
+sdk_version: 3.39.0
 app_file: app.py
 pinned: false
+license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,317 @@

+# load the libraries for the application
+# -------------------------------------------
+import os
+import re
+import nltk
+import torch
+import librosa
+import tempfile
+import subprocess
+import gradio as gr
+from scipy.io import wavfile
+from nnet import utils, commons
+from transformers import pipeline
+from scipy.io.wavfile import write
+from faster_whisper import WhisperModel
+from nnet.models import SynthesizerTrn as vitsTRN
+from nnet.models_vc import SynthesizerTrn as freeTRN
+from nnet.mel_processing import mel_spectrogram_torch
+from configurations.get_constants import constantConfig
+from speaker_encoder.voice_encoder import SpeakerEncoder
+from df_local.enhance import enhance, init_df, load_audio, save_audio
+from configurations.get_hyperparameters import hyperparameterConfig
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+nltk.download('punkt')
+from nltk.tokenize import sent_tokenize
+# making the FreeVC function
+# ---------------------------------
+class FreeVCModel:
+    def __init__(self, config, ptfile, speaker_model, wavLM_model, device='cpu'):
+        self.hps = utils.get_hparams_from_file(config)
+        self.net_g = freeTRN(
+            self.hps.data.filter_length // 2 + 1,
+            self.hps.train.segment_size // self.hps.data.hop_length,
+            **self.hps.model
+        ).to(hyperparameters.device)
+        _ = self.net_g.eval()
+        _ = utils.load_checkpoint(ptfile, self.net_g, None, True)
+        self.cmodel = utils.get_cmodel(device, wavLM_model)
+        if self.hps.model.use_spk:
+            self.smodel = SpeakerEncoder(speaker_model)
+    def convert(self, src, tgt):
+        fs_src, src_audio = src
+        fs_tgt, tgt_audio = tgt
+        src = f"{constants.temp_audio_folder}/src.wav"
+        tgt = f"{constants.temp_audio_folder}/tgt.wav"
+        out = f"{constants.temp_audio_folder}/cnvr.wav"
+        with torch.no_grad():
+            wavfile.write(tgt, fs_tgt, tgt_audio)
+            wav_tgt, _ = librosa.load(tgt, sr=self.hps.data.sampling_rate)
+            wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+            if self.hps.model.use_spk:
+                g_tgt = self.smodel.embed_utterance(wav_tgt)
+                g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(hyperparameters.device.type)
+            else:
+                wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(hyperparameters.device.type)
+                mel_tgt = mel_spectrogram_torch(
+                    wav_tgt,
+                    self.hps.data.filter_length,
+                    self.hps.data.n_mel_channels,
+                    self.hps.data.sampling_rate,
+                    self.hps.data.hop_length,
+                    self.hps.data.win_length,
+                    self.hps.data.mel_fmin,
+                    self.hps.data.mel_fmax,
+                )
+            wavfile.write(src, fs_src, src_audio)
+            wav_src, _ = librosa.load(src, sr=self.hps.data.sampling_rate)
+            wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(hyperparameters.device.type)
+            c = utils.get_content(self.cmodel, wav_src)
+            if self.hps.model.use_spk:
+                audio = self.net_g.infer(c, g=g_tgt)
+            else:
+                audio = self.net_g.infer(c, mel=mel_tgt)
+            audio = audio[0][0].data.cpu().float().numpy()
+            write(out, 24000, audio)
+            return out
+# Load the system configurations. Both objects are read as module-level globals
+# by the functions and classes in this file.
+constants       = constantConfig()
+hyperparameters = hyperparameterConfig()
+# Load the models once at import time so every request reuses them.
+model, df_state, _  = init_df(hyperparameters.voice_enhacing_model, config_allow_defaults=True) # voice-enhancement model and its state
+stt_model           = WhisperModel(hyperparameters.stt_model, device=hyperparameters.device.type, compute_type="float32") # speech-to-text model
+# Translation model and tokenizer, both looked up through constants.model_name_dict.
+trans_model     = AutoModelForSeq2SeqLM.from_pretrained(constants.model_name_dict[hyperparameters.nllb_model], torch_dtype=torch.bfloat16).to(hyperparameters.device)
+trans_tokenizer = AutoTokenizer.from_pretrained(constants.model_name_dict[hyperparameters.nllb_model])
+# Voice-conversion model (FreeVCModel) built from the text2speech_* and wavlm_model hyper-parameters.
+modelConvertSpeech = FreeVCModel(config=hyperparameters.text2speech_config, ptfile=hyperparameters.text2speech_model,
+                                 speaker_model=hyperparameters.text2speech_encoder, wavLM_model=hyperparameters.wavlm_model,
+                                 device=hyperparameters.device.type)
+# download the language model if doesn't existing
+# ----------------------------------------------------
+def download(lang, lang_directory):
+    if not os.path.exists(f"{lang_directory}/{lang}"):
+        cmd = ";".join([
+                f"wget {constants.language_download_web}/{lang}.tar.gz -O {lang_directory}/{lang}.tar.gz",
+                f"tar zxvf {lang_directory}/{lang}.tar.gz -C {lang_directory}"
+        ])
+        subprocess.check_output(cmd, shell=True)
+    try:
+        os.remove(f"{lang_directory}/{lang}.tar.gz")
+    except:
+        pass
+    return f"{lang_directory}/{lang}"
+def preprocess_char(text, lang=None):
+    """
+    Special treatement of characters in certain languages
+    """
+    if lang == 'ron':
+        text = text.replace("ț", "ţ")
+    return text
+def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
+    txt = preprocess_char(txt, lang=lang)
+    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
+    if is_uroman:
+        txt  = text_mapper.uromanize(txt, f'{uroman_dir}/bin/uroman.pl')
+    txt = txt.lower()
+    txt = text_mapper.filter_oov(txt)
+    return txt
+def detect_language(text,LID):
+    predictions = LID.predict(text)
+    detected_lang_code = predictions[0][0].replace("__label__", "")
+    return detected_lang_code
+# text to speech
+class TextMapper(object):
+    def __init__(self, vocab_file):
+        self.symbols = [x.replace("\n", "") for x in open(vocab_file, encoding="utf-8").readlines()]
+        self.SPACE_ID = self.symbols.index(" ")
+        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
+        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}
+    def text_to_sequence(self, text, cleaner_names):
+        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+        Args:
+        text: string to convert to a sequence
+        cleaner_names: names of the cleaner functions to run the text through
+        Returns:
+        List of integers corresponding to the symbols in the text
+        '''
+        sequence = []
+        clean_text = text.strip()
+        for symbol in clean_text:
+            symbol_id = self._symbol_to_id[symbol]
+            sequence += [symbol_id]
+        return sequence
+    def uromanize(self, text, uroman_pl):
+        with tempfile.NamedTemporaryFile() as tf, \
+             tempfile.NamedTemporaryFile() as tf2:
+            with open(tf.name, "w") as f:
+                f.write("\n".join([text]))
+            cmd = f"perl " + uroman_pl
+            cmd += f" -l xxx "
+            cmd +=  f" < {tf.name} > {tf2.name}"
+            os.system(cmd)
+            outtexts = []
+            with open(tf2.name) as f:
+                for line in f:
+                    line =  re.sub(r"\s+", " ", line).strip()
+                    outtexts.append(line)
+            outtext = outtexts[0]
+        return outtext
+    def get_text(self, text, hps):
+        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
+        if hps.data.add_blank:
+            text_norm = commons.intersperse(text_norm, 0)
+        text_norm = torch.LongTensor(text_norm)
+        return text_norm
+    def filter_oov(self, text):
+        val_chars = self._symbol_to_id
+        txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
+        return txt_filt
+def speech_to_text(audio_file):
+    try:
+        fs, audio = audio_file
+        wavfile.write(constants.input_speech_file, fs, audio)
+        audio0, _ = load_audio(constants.input_speech_file, sr=df_state.sr())
+        # Enhance the SNR of the audio
+        enhanced = enhance(model, df_state, audio0)
+        save_audio(constants.enhanced_speech_file, enhanced, df_state.sr())
+        segments, info = stt_model.transcribe(constants.enhanced_speech_file)
+        speech_text = ''
+        for segment in segments:
+            speech_text = f'{speech_text}{segment.text}'
+        try:
+            source_lang_nllb = [k for k, v in constants.flores_codes_to_tts_codes.items() if v[:2] == info.language][0]
+        except:
+            source_lang_nllb = 'language cant be determined, select manually'
+        # text translation
+        return speech_text, gr.Dropdown.update(value=source_lang_nllb)
+    except:
+        return '', gr.Dropdown.update(value='English')
+# Text tp speech
+def text_to_speech(text, target_lang):
+    txt = text
+    # LANG = get_target_tts_lang(target_lang)
+    LANG = constants.flores_codes_to_tts_codes[target_lang]
+    ckpt_dir = download(LANG, lang_directory=constants.language_directory)
+    vocab_file  = f"{ckpt_dir}/{constants.language_vocab_text}"
+    config_file = f"{ckpt_dir}/{constants.language_vocab_configuration}"
+    hps = utils.get_hparams_from_file(config_file)
+    text_mapper = TextMapper(vocab_file)
+    net_g = vitsTRN(
+        len(text_mapper.symbols),
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        **hps.model)
+    net_g.to(hyperparameters.device)
+    _ = net_g.eval()
+    g_pth = f"{ckpt_dir}/{constants.language_vocab_model}"
+    _ = utils.load_checkpoint(g_pth, net_g, None)
+    txt = preprocess_text(txt, text_mapper, hps, lang=LANG, uroman_dir=constants.uroman_directory)
+    stn_tst = text_mapper.get_text(txt, hps)
+    with torch.no_grad():
+        x_tst = stn_tst.unsqueeze(0).to(hyperparameters.device)
+        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(hyperparameters.device)
+        hyp = net_g.infer(
+            x_tst, x_tst_lengths, noise_scale=.667,
+            noise_scale_w=0.8, length_scale=1.0
+        )[0][0,0].cpu().float().numpy()
+    return hps.data.sampling_rate, hyp
+def translation(audio, text, source_lang_nllb, target_code_nllb, output_type, sentence_mode):
+    target_code    = constants.flores_codes[target_code_nllb]
+    translator     = pipeline('translation', model=trans_model, tokenizer=trans_tokenizer, src_lang=source_lang_nllb, tgt_lang=target_code, device=hyperparameters.device)
+    # output = translator(text, max_length=400)[0]['translation_text']
+    if sentence_mode == "Sentence-wise":
+        sentences = sent_tokenize(text)
+        translated_sentences = []
+        for sentence in sentences:
+            translated_sentence = translator(sentence, max_length=400)[0]['translation_text']
+            translated_sentences.append(translated_sentence)
+        output = ' '.join(translated_sentences)
+    else:
+        output = translator(text, max_length=1024)[0]['translation_text']
+    # get the text to speech
+    fs_out, audio_out = text_to_speech(output, target_code_nllb)
+    if output_type == 'own voice':
+        out_file = modelConvertSpeech.convert((fs_out, audio_out), audio)
+        return output, out_file
+    wavfile.write(constants.text2speech_wavfile, fs_out, audio_out)
+    return output, constants.text2speech_wavfile
+# Build the Gradio interface. Components are laid out row by row in creation order.
+with gr.Blocks(title = "Octopus Translation App") as octopus_translator:
+    # Row 1: microphone recording of the speech to translate.
+    with gr.Row():
+        audio_file = gr.Audio(source="microphone")
+    # Row 2: transcript (editable) and source language; both are filled in by speech_to_text.
+    with gr.Row():
+        input_text  = gr.Textbox(label="Input text")
+        source_language     = gr.Dropdown(list(constants.flores_codes.keys()), value='English', label='Source (Autoselected)', interactive=True)
+    # Row 3: translated text and target language.
+    with gr.Row():
+        output_text = gr.Textbox(label='Translated text')
+        target_language  = gr.Dropdown(list(constants.flores_codes.keys()), value='German', label='Target', interactive=True)
+    # Row 4: synthesized translation and the button that triggers it.
+    with gr.Row():
+        output_speech = gr.Audio(label='Translated speech')
+        translate_button = gr.Button('Translate')
+    # Row 5: options.
+    with gr.Row():
+        # NOTE(review): enhance_audio is not passed to any callback below, so this
+        # choice currently has no effect.
+        enhance_audio       = gr.Radio(['yes', 'no'], value='yes', label='Enhance input voice', interactive=True)
+        input_type          = gr.Radio(['Whole text', 'Sentence-wise'],value='Sentence-wise', label="Translation Mode", interactive=True)
+        # NOTE(review): the label says 'Enhance output voice' although the options
+        # select the output speaker - confirm the intended wording.
+        output_audio_type   = gr.Radio(['standard speaker', 'voice transfer'], value='voice transfer', label='Enhance output voice', interactive=True)
+    # Transcribe whenever the recording changes.
+    audio_file.change(speech_to_text,
+                      inputs=[audio_file],
+                      outputs=[input_text, source_language])
+    # Translate and synthesize on button click; the input order matches the
+    # parameter order of translation().
+    translate_button.click(translation,
+                           inputs=[audio_file, input_text,
+                                   source_language, target_language,
+                                   output_audio_type, input_type],
+                           outputs=[output_text, output_speech])
+# Start the app without creating a public share link.
+octopus_translator.launch(share=False)

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ffmpeg
2	+ rustc

pre-requirements.txt ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,116 @@

+aiofiles==23.2.1
+aiohttp==3.8.5
+aiosignal==1.3.1
+altair==5.0.1
+annotated-types==0.5.0
+anyio==3.7.1
+asttokens==2.2.1
+async-timeout==4.0.3
+attrs==23.1.0
+audioread==3.0.0
+av==10.0.0
+certifi==2022.12.7
+cffi==1.15.1
+charset-normalizer==2.1.1
+click==8.1.6
+colorama==0.4.6
+coloredlogs==15.0.1
+contourpy==1.1.0
+ctranslate2==3.18.0
+cycler==0.11.0
+Cython==3.0.0
+decorator==5.1.1
+DeepFilterLib==0.2.4
+deepfilternet==0.2.4
+executing==1.2.0
+fastapi==0.101.1
+faster-whisper==0.7.1
+ffmpeg-python==0.2.0
+ffmpy==0.3.1
+filelock==3.9.0
+flatbuffers==23.5.26
+fonttools==4.42.0
+frozenlist==1.4.0
+fsspec==2023.6.0
+future==0.18.3
+gradio==3.40.1
+gradio_client==0.4.0
+h11==0.14.0
+httpcore==0.17.3
+httpx==0.24.1
+huggingface-hub==0.16.4
+humanfriendly==10.0
+icecream==2.1.3
+idna==3.4
+importlib-resources==6.0.1
+Jinja2==3.1.2
+joblib==1.3.2
+jsonschema==4.19.0
+jsonschema-specifications==2023.7.1
+kiwisolver==1.4.4
+lazy_loader==0.3
+librosa==0.10.1
+linkify-it-py==2.0.2
+llvmlite==0.40.1
+loguru==0.7.0
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+matplotlib==3.7.2
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+mpmath==1.2.1
+msgpack==1.0.5
+multidict==6.0.4
+networkx==3.0
+nltk==3.8.1
+numba==0.57.1
+numpy==1.24.4
+onnxruntime==1.15.1
+orjson==3.9.5
+packaging==23.1
+pandas==2.0.3
+Pillow==9.3.0
+platformdirs==3.10.0
+pooch==1.7.0
+protobuf==4.24.0
+pycparser==2.21
+pydantic==2.1.1
+pydantic_core==2.4.0
+pydub==0.25.1
+Pygments==2.16.1
+pyparsing==3.0.9
+python-dateutil==2.8.2
+python-multipart==0.0.6
+pytz==2023.3
+PyYAML==6.0.1
+referencing==0.30.2
+regex==2023.8.8
+requests==2.28.1
+rpds-py==0.9.2
+safetensors==0.3.2
+scikit-learn==1.3.0
+scipy==1.11.1
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+soundfile==0.12.1
+soxr==0.3.6
+starlette==0.27.0
+sympy==1.11.1
+threadpoolctl==3.2.0
+tokenizers==0.13.3
+toolz==0.12.0
+--find-links https://download.pytorch.org/whl/torch_stable.html
+torch==2.0.1+cpu
+torchaudio==2.0.2+cpu
+torchvision==0.15.2+cpu
+tqdm==4.66.1
+transformers==4.31.0
+typing_extensions==4.7.1
+tzdata==2023.3
+uc-micro-py==1.0.2
+urllib3==1.26.13
+uvicorn==0.23.2
+webrtcvad==2.0.10
+websockets==11.0.3
+yarl==1.9.2