Spaces:
Running
Running
import os | |
import gradio as gr | |
import numpy as np | |
import torch | |
from pathlib import Path | |
# Pin Gradio to version 3.2 at startup by removing whatever is installed and
# reinstalling through pip. The exit codes are ignored.
# NOTE(review): gradio is already imported earlier in this file, so the running
# process presumably keeps the previously loaded module - confirm that the pin
# actually takes effect for this process.
for pip_command in ("pip uninstall -y gradio", "pip install gradio==3.2"):
    os.system(pip_command)
from demo_inference.demo_tts import DemoTTS | |
from demo_inference.demo_asr import DemoASR | |
from demo_inference.demo_anonymization import DemoAnonymizer | |
def pcm2float(sig, dtype='float32'):
    """Convert an integer PCM signal to floating point in the range [-1, 1).

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: Array-like of signed or unsigned integer samples.
        dtype: Desired floating point type (anything ``np.dtype`` accepts).

    Returns:
        An array of ``dtype`` with the same shape as ``sig``.

    Raises:
        TypeError: If ``sig`` is not an integer array or ``dtype`` is not a
            floating point type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")

    target = np.dtype(dtype)
    if target.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    int_info = np.iinfo(samples.dtype)
    # Half of the integer range; it maps onto a magnitude of 1.0.
    half_range = 2 ** (int_info.bits - 1)
    # Integer value representing silence: 0 for signed, mid-range for unsigned.
    zero_level = int_info.min + half_range

    centered = samples.astype(target) - zero_level
    return centered / half_range
def float2pcm(sig, dtype='int16'):
    """Convert a floating point signal to integer PCM, clipping out-of-range values.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: Array-like of floating point samples; values outside the
            representable range are clipped to the integer limits.
        dtype: Desired integer type (anything ``np.dtype`` accepts).

    Returns:
        An array of ``dtype`` with the same shape as ``sig``.

    Raises:
        TypeError: If ``sig`` is not a float array or ``dtype`` is not an
            integer type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")

    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    int_info = np.iinfo(target)
    # Half of the integer range; a magnitude of 1.0 maps onto it.
    half_range = 2 ** (int_info.bits - 1)
    # Integer value representing silence: 0 for signed, mid-range for unsigned.
    zero_level = int_info.min + half_range

    scaled = samples * half_range + zero_level
    # Clip before the cast so overshoot saturates at the integer limits.
    clipped = scaled.clip(int_info.min, int_info.max)
    return clipped.astype(target)
class VPInterface:
    """Speech anonymization pipeline used as the callback of the Gradio demo.

    Holds one ASR model, one anonymization model and one TTS model. Each of
    them is replaced by a newly constructed instance whenever a request asks
    for a tag different from the one currently held.
    """

    # Sample rate reported alongside the synthesized audio.
    # NOTE(review): presumably the native output rate of the TTS model - confirm.
    OUTPUT_SAMPLE_RATE = 48000

    def __init__(self):
        """Select the device and construct the default models."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')

        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models, model_tag='Libri100',
                                       device=self.device)
        self.asr_model = DemoASR(model_path=self.path_to_asr_model, model_tag='phones', device=self.device)
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model, model_tag='pool',
                                         device=self.device)

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        """Anonymize a recording and return the re-synthesized audio.

        Args:
            recording: ``(sample_rate, samples)`` pair; ``samples`` must be an
                integer PCM array (``pcm2float`` rejects anything else).
            asr_model_tag: Tag of the ASR model to use.
            anon_model_tag: Tag of the anonymization model to use.
            tts_model_tag: Tag of the TTS model to use.

        Returns:
            ``(OUTPUT_SAMPLE_RATE, samples)`` where ``samples`` is the
            synthesized audio converted to integer PCM by ``float2pcm``.

        Raises:
            ValueError: If no recording was supplied.
            TypeError: If the recorded samples are not an integer array.
        """
        # Without this guard a missing recording fails on the unpacking below
        # with an opaque "cannot unpack NoneType" error.
        if recording is None:
            raise ValueError('No recording received; please record a sentence before submitting.')

        sr, audio = recording
        audio = pcm2float(audio)

        # Swap in the requested models before reading the ASR tag below.
        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)

        text_is_phonemes = (self.asr_model.model_tag == 'phones')
        text = self.asr_model.recognize_speech(audio, sr)
        print(text)
        speaker_embedding = self.anon_model.anonymize_embedding(audio, sr)
        print(speaker_embedding)
        syn_audio = self.synthesis_model.read_text(transcription=text, speaker_embedding=speaker_embedding,
                                                   text_is_phonemes=text_is_phonemes)

        return self.OUTPUT_SAMPLE_RATE, float2pcm(syn_audio.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        """Replace every model whose current tag differs from the requested one."""
        if asr_model_tag != self.asr_model.model_tag:
            self.asr_model = DemoASR(model_path=self.path_to_asr_model, model_tag=asr_model_tag, device=self.device)
        if anon_model_tag != self.anon_model.model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model, model_tag=anon_model_tag,
                                             device=self.device)
        if tts_model_tag != self.synthesis_model.model_tag:
            self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models, model_tag=tts_model_tag,
                                           device=self.device)
# Build the pipeline once at import time; the constructor creates the default
# ASR, anonymization and TTS models, and the instance is shared by all requests.
model = VPInterface()
# Markdown text passed to gr.Interface as `article`.
article = """
This demo allows you to anonymize your input speech by defining the single models for ASR, anonymization and TTS. If
you want to know more about each model, please read the paper linked above. Every time you click the *submit* button,
you should receive a new voice.
Note that for *pool* anonymization in this demo, we are using a different scaling approach (
sklearn.preprocessing.StandardScaler instead of sklearn.preprocessing.MinMaxScaler) because we are processing only
one sample at a time and would otherwise always end up with the same voice.
This demo is still work in progress, so please be lenient with possible low quality and errors. Also, be aware that
this Huggingface space runs on CPU which makes the demo quite slow.
For more information about this system, visit our Github page: [https://github.com/DigitalPhonetics/speaker-anonymization](https://github.com/DigitalPhonetics/speaker-anonymization)
"""
# Markdown heading passed to gr.Interface as `description`; links to the paper.
description = """
## Test demo corresponding to the models in our paper [Speaker Anonymization with Phonetic Intermediate Representations](https://arxiv.org/abs/2207.04834)
"""
# Stylesheet that colors elements with class `gr-button-primary` green.
# The two declarations must be separated by `;`; with a comma after
# `!important` the declaration is invalid CSS.
# NOTE(review): `css` is not passed to the gr.Interface call in this file, so
# it currently has no effect - confirm whether it should be wired in.
css = """
.gr-button-primary {background-color: green !important; border-color: green}
"""
# Input widgets, listed in the positional order of VPInterface.read's
# parameters: recording, ASR tag, anonymization tag, TTS tag.
recording_input = gr.inputs.Audio(source='microphone', type='numpy', label='Say a sentence in English.')
asr_choice = gr.inputs.Dropdown(['phones', 'STT', 'TTS'], type='value', default='phones',
                                label='ASR model')
anon_choice = gr.inputs.Dropdown(['pool', 'random', 'pool raw'], type='value', default='pool',
                                 label='Anonymization')
tts_choice = gr.inputs.Dropdown(['Libri100', 'Libri100 + finetuned', 'Libri600', 'Libri600 + finetuned'],
                                type='value', default='Libri100',
                                label='TTS model')

# Output widget for the (sample_rate, samples) pair returned by model.read.
anonymized_output = gr.outputs.Audio(type='numpy', label=None)

iface = gr.Interface(
    fn=model.read,
    inputs=[recording_input, asr_choice, anon_choice, tts_choice],
    outputs=anonymized_output,
    layout='vertical',
    title='IMS Speaker Anonymization',
    description=description,
    theme='default',
    allow_flagging='never',
    article=article,
    allow_screenshot=False,
)

# Start the web app with request queueing enabled.
iface.launch(enable_queue=True)