import os
import re
from num2words import num2words
import gradio as gr
import torch
import torchaudio
from data.tokenizer import (
from edit_utils_en import parse_edit_en
from edit_utils_en import parse_tts_en
from inference_scale import inference_one_sample
import librosa
import soundfile as sf
from models import ssr
import io
import numpy as np
import random
import uuid
import opencc
import spaces
import nltk'punkt')
# Filesystem locations; each one can be overridden through the environment
# variable of the same name.
DEMO_PATH = os.environ.get("DEMO_PATH", "./demo")
TMP_PATH = os.environ.get("TMP_PATH", "./demo/temp")
MODELS_PATH = os.environ.get("MODELS_PATH", "./pretrained_models")

# Use the GPU whenever torch reports one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
def get_random_string():
    """Return a random 32-character lowercase hexadecimal string.

    The value is a freshly generated version-4 UUID rendered without dashes.
    """
    return uuid.uuid4().hex
def seed_everything(seed):
    """Seed the random number generators used by this app.

    Seeds Python's ``random``, NumPy, and torch (CPU and all CUDA devices),
    and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed. The sentinel ``-1`` means "do not seed": nothing
            is touched and the generators keep their current state.
    """
    if seed == -1:
        return
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the running process is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed = int(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Ignored by torch when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
def get_mask_interval(transcribe_state, word_span):
    """Map a span of word indices to a ``(start, end)`` time interval.

    Args:
        transcribe_state: Dict whose ``'segments'`` entry is a list of
            segments, each holding a ``'words'`` list of dicts with
            ``'start'`` and ``'end'`` timestamps.
        word_span: Pair ``(s, e)`` of word indices with ``0 <= s <= e <= N``,
            where ``N`` is the total number of words across all segments.

    Returns:
        Tuple ``(start, end)`` of floats in the units of the word timestamps:
        * ``e == 0``: from ``0.`` to the start of the first word;
        * ``s == N``: both values are the end of the last word;
        * ``s == e``: the gap between word ``s - 1`` and word ``s``;
        * otherwise: from the end of word ``s - 1`` (or the start of word
          ``s`` when ``s == 0``) to the start of word ``e`` (or the end of
          the last word when ``e == N``).

    Raises:
        ValueError: If the state contains no words at all.
        AssertionError: If ``word_span`` violates the bounds above.
    """
    # Flatten the per-segment word lists into one list of (start, end) pairs.
    times = [
        (item['start'], item['end'])
        for segment in transcribe_state['segments']
        for item in segment['words']
    ]
    if not times:
        raise ValueError("transcribe_state contains no aligned words")

    s, e = word_span[0], word_span[1]
    assert s <= e, f"s:{s}, e:{e}"
    assert s >= 0, f"s:{s}"
    assert e <= len(times), f"e:{e}"

    n_words = len(times)
    if e == 0:  # span lies before the first word
        return (0., float(times[0][0]))
    if s == n_words:  # span lies after the last word
        tail = float(times[-1][1])
        return (tail, tail)  # the real end is not known yet
    if s == e:  # pure insertion between two neighbouring words
        return (float(times[s - 1][1]), float(times[s][0]))

    # General case: this must not run for the special cases above, otherwise
    # it would overwrite their results.
    start = float(times[s - 1][1]) if s > 0 else float(times[s][0])
    end = float(times[e][0]) if e < n_words else float(times[-1][1])
    return (start, end)
from whisperx import load_align_model, load_model, load_audio
from whisperx import align as align_func
# Model setup, executed at import time: locate (or download) the checkpoint
# and codec files under MODELS_PATH and collect everything in `ssrspeech_model`.
# NOTE(review): this section appears truncated in this copy of the file: both
# wget commands lack a URL, `encodec_fn` has no file name after the directory,
# and the `ssrspeech_model` dict literal is never closed. Restore the missing
# pieces before relying on it.
ssrspeech_model_name = "English"
text_tokenizer = TextTokenizer(backend="espeak")
# Language code and ASR model name handed to the whisperx loaders in
# transcribe() and align().
language = "en"
transcribe_model_name = "base.en"
# Checkpoint path; fetched with wget only when the file is not already present.
ssrspeech_fn = f"{MODELS_PATH}/{ssrspeech_model_name}.pth"
if not os.path.exists(ssrspeech_fn):
os.system(f"wget{ssrspeech_model_name}/resolve/main/{ssrspeech_model_name}.pth -O " + ssrspeech_fn)
# NOTE(review): torch.load is called without map_location, and no call that
# loads weights into `model` or moves it to `device` is visible here — confirm.
ckpt = torch.load(ssrspeech_fn)
# The checkpoint carries its own model config and phoneme-to-id table.
model = ssr.SSR_Speech(ckpt["config"])
config = model.args
phn2num = ckpt["phn2num"]
# Codec weights path, used as the AudioTokenizer signature below.
encodec_fn = f"{MODELS_PATH}/"
if not os.path.exists(encodec_fn):
os.system(f"wget -O " + encodec_fn)
# Bundle of everything needed for inference; run() checks it for None.
ssrspeech_model = {
"config": config,
"phn2num": phn2num,
"model": model,
"text_tokenizer": text_tokenizer,
"audio_tokenizer": AudioTokenizer(signature=encodec_fn)
def get_transcribe_state(segments):
    """Bundle segments together with their joined transcript.

    Args:
        segments: Sequence of dicts, each with a ``"text"`` entry.

    Returns:
        Dict with ``"segments"`` (the input object, unchanged) and
        ``"transcript"`` (the segment texts joined by single spaces, with one
        leading space removed if present). An empty ``segments`` yields an
        empty transcript instead of raising ``IndexError``.
    """
    transcript = " ".join(segment["text"] for segment in segments)
    # startswith() is safe on the empty string, unlike indexing transcript[0].
    if transcript.startswith(" "):
        transcript = transcript[1:]
    return {
        "segments": segments,
        "transcript": transcript,
    }
def transcribe(audio_path):
    """Transcribe an audio file with whisperx and word-align the result.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A four-item list: the transcript string, the aligned segments, the
        state dict built by ``get_transcribe_state``, and a success message.
    """
    # NOTE(review): the ASR model is loaded afresh on every call.
    asr_options = {
        "suppress_numerals": True,
        "max_new_tokens": None,
        "clip_timestamps": None,
        "hallucination_silence_threshold": None,
    }
    transcribe_model = load_model(
        transcribe_model_name, device, asr_options=asr_options, language=language
    )
    segments = transcribe_model.transcribe(audio_path, batch_size=8)["segments"]
    for segment in segments:
        # Replace any digit runs in the recognised text with spelled-out words.
        segment['text'] = replace_numbers_with_words(segment['text'])

    # align() already returns the state dict for the aligned segments, so it
    # does not need to be rebuilt here.
    state, segments = align(segments, audio_path)
    success_message = "Success: Transcribe completed successfully!"
    return [
        state["transcript"], state['segments'],
        state, success_message,
    ]
def align(segments, audio_path):
    """Word-align transcribed segments against the audio using whisperx.

    Args:
        segments: Segments to align.
        audio_path: Path of the audio file the segments belong to.

    Returns:
        Tuple ``(state, aligned_segments)`` where ``state`` is the dict built
        by ``get_transcribe_state`` from the aligned segments.
    """
    aligner, aligner_meta = load_align_model(language_code=language, device=device)
    waveform = load_audio(audio_path)
    alignment = align_func(
        segments, aligner, aligner_meta, waveform, device,
        return_char_alignments=False,
    )
    aligned_segments = alignment["segments"]
    return get_transcribe_state(aligned_segments), aligned_segments
# Turn the generated audio tensors into WAV data at `codec_audio_sr` using an
# in-memory buffer.
# NOTE(review): both statements below are truncated in this copy — a callee
# name is missing before ", 1)" and before ", result, ...", and no return
# statement is visible even though run() uses this function's return value.
# Restore the missing code before relying on this function.
def get_output_audio(audio_tensors, codec_audio_sr):
result =, 1)
buffer = io.BytesIO(), result, int(codec_audio_sr), format="wav")
def replace_numbers_with_words(sentence):
    """Replace every run of digits in *sentence* with its spelled-out form.

    Digit runs are first padded with a space on each side so that numbers
    glued to letters are still matched as whole words; the padding is left in
    the result. Conversion is done by ``num2words`` with its default settings.

    Args:
        sentence: Input text.

    Returns:
        The text with each digit run replaced by words. A digit run that
        ``num2words`` cannot convert is kept as digits.
    """
    sentence = re.sub(r'(\d+)', r' \1 ', sentence)  # add spaces around numbers

    def replace_with_words(match):
        num = match.group()
        try:
            return num2words(num)  # Convert numbers to words
        except Exception:
            # In case num2words fails (unlikely with digits but just to be safe)
            return num

    return re.sub(r'\b\d+\b', replace_with_words, sentence)
# Run one inference pass on `audio_path` and return (output_audio, success_message).
# In "Edit" mode the changed word spans between the original and target
# transcripts are turned into mask intervals; in "TTS" mode the target text is
# appended to the transcript of a (possibly shortened) prompt. The result of
# inference_one_sample is then post-processed into the output audio.
#
# NOTE(review): this copy of the function has lost its indentation and several
# lines or fragments: bare `info =` assignments, missing `else:` branches, an
# unclosed inference_one_sample(...) call, and a malformed `new_audio = ...`
# statement. The comments describe only what is still visible; restore the
# body before relying on it.
# NOTE(review): `seed` and `silence_tokens` are accepted but never read in the
# visible body, and seed_everything() is never called here.
def run(seed, sub_amount, codec_audio_sr, codec_sr, top_k, top_p, temperature,
stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef, prompt_length,
audio_path, original_transcript, transcript, mode):
# The UI passes aug_text as 0/1; convert it to a bool.
aug_text = True if aug_text == 1 else False
if ssrspeech_model is None:
raise gr.Error("ssrspeech model not loaded")
# resample audio
# The 16 kHz version is written back to the same path, overwriting the input file.
audio, _ = librosa.load(audio_path, sr=16000)
sf.write(audio_path, audio, 16000)
# text normalization
target_transcript = replace_numbers_with_words(transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
orig_transcript = replace_numbers_with_words(original_transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
# The normalised `orig_transcript` above is immediately replaced by a fresh
# transcription of the resampled audio.
[orig_transcript, segments, _, _] = transcribe(audio_path)
orig_transcript = orig_transcript.lower()
target_transcript = target_transcript.lower()
transcribe_state,_ = align(segments, audio_path)
if mode == "TTS":
# NOTE(review): right-hand side missing; `info` must expose num_frames and sample_rate.
info =
duration = info.num_frames / info.sample_rate
cut_length = duration
# Cut long audio for tts
# The cut point is the earliest word end at or after prompt_length.
if duration > prompt_length:
seg_num = len(transcribe_state['segments'])
for i in range(seg_num):
words = transcribe_state['segments'][i]['words']
for item in words:
if item['end'] >= prompt_length:
cut_length = min(item['end'], cut_length)
# Overwrite the file with the shortened prompt, then transcribe and align it again.
audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
sf.write(audio_path, audio, 16000)
[orig_transcript, segments, _, _] = transcribe(audio_path)
orig_transcript = orig_transcript.lower()
target_transcript = target_transcript.lower()
transcribe_state,_ = align(segments, audio_path)
# Keep the first word of the new text; it is compared against the first
# recognised word of the generated audio further down.
target_transcript_copy = target_transcript # for tts cut out
target_transcript_copy = target_transcript_copy.split(' ')[0]
# The model receives the prompt transcript followed by the new text.
target_transcript = orig_transcript + ' ' + target_transcript
if mode == "Edit":
operations, orig_spans = parse_edit_en(orig_transcript, target_transcript)
print("orig_spans: ", orig_spans)
if len(orig_spans) > 3:
raise gr.Error("Current model only supports maximum 3 editings")
starting_intervals = []
ending_intervals = []
# NOTE(review): `start`/`end` are never appended to the two lists in the visible code.
for orig_span in orig_spans:
start, end = get_mask_interval(transcribe_state, orig_span)
print("intervals: ", starting_intervals, ending_intervals)
# NOTE(review): right-hand side missing.
info =
audio_dur = info.num_frames / info.sample_rate
# Merge spans whose gap is at most `threshold`.
# NOTE(review): nothing is appended to `combined_spans` and the else-branch
# of the merge test is missing in the visible code.
def combine_spans(spans, threshold=0.2):
spans.sort(key=lambda x: x[0])
combined_spans = []
current_span = spans[0]
for i in range(1, len(spans)):
next_span = spans[i]
if current_span[1] >= next_span[0] - threshold:
current_span[1] = max(current_span[1], next_span[1])
current_span = next_span
return combined_spans
# Widen each interval by sub_amount on both sides, clamped to [0, audio_dur].
morphed_span = [[max(start - sub_amount, 0), min(end + sub_amount, audio_dur)]
for start, end in zip(starting_intervals, ending_intervals)] # in seconds
morphed_span = combine_spans(morphed_span, threshold=0.2)
print("morphed_spans: ", morphed_span)
# Convert seconds to codec frame indices.
mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
# NOTE(review): the lines below presumably belong to a lost `else:` (non-Edit)
# branch: the mask is an empty interval at the end of the audio — confirm.
info =
audio_dur = info.num_frames / info.sample_rate
morphed_span = [(audio_dur, audio_dur)] # in seconds
mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
print("mask_interval: ", mask_interval)
decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr}
tts = True if mode == "TTS" else False
# NOTE(review): the call below is not closed and may be missing leading arguments.
new_audio = inference_one_sample(
audio_path, orig_transcript, target_transcript, mask_interval,
cfg_coef, aug_text, False, True, tts,
device, decode_config
# NOTE(review): nothing is appended to `audio_tensors` in the visible code.
audio_tensors = []
# save segments for comparison
# NOTE(review): the next statement is malformed; a call name is missing before ", new_audio, codec_audio_sr)".
new_audio = new_audio[0].cpu(), new_audio, codec_audio_sr)
if tts: # remove the start parts
[new_transcript, new_segments, _, _] = transcribe(audio_path)
transcribe_state,_ = align(new_segments, audio_path)
tmp1 = transcribe_state['segments'][0]['words'][0]['word'].lower()
tmp2 = target_transcript_copy.lower()
# NOTE(review): the second `offset` assignment presumably sat under a lost
# `else:`; as written it always overrides the first.
if tmp1 == tmp2:
offset = transcribe_state['segments'][0]['words'][0]['start']
offset = transcribe_state['segments'][0]['words'][1]['start']
# Reload the audio starting at the chosen word offset.
new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
output_audio = get_output_audio(audio_tensors, codec_audio_sr)
success_message = "Success: Inference successfully!"
return output_audio, success_message
# Default value of the "Original transcript" textbox in the UI.
demo_original_transcript = "Gwynplain had besides for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather."

# Example target texts keyed by mode; the "Edit" entry pre-fills the "Text" box.
demo_text = {
    "TTS": {
        "regular": "Gwynplain had besides for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!",
    },
    "Edit": {
        "regular": "Gwynplain had besides for his work and feats of strength, hanging from his neck and shoulders, an esclavine of leather.",
    },
}
# Build and return the Gradio Blocks UI: an input-audio/transcript column, a
# target-text/mode column, an output-audio column, and a collapsed accordion
# holding the generation parameters.
# NOTE(review): this copy has lost its indentation, the call that wrapped the
# intro text right after `gr.Blocks()` (its links are cut off too), and the
# event-binding calls near the end, of which only argument fragments remain.
# Restore them before relying on this function.
def get_app():
with gr.Blocks() as app:
# SSR-Speech: High-quality Speech Editor and Text-to-Speech Synthesizer
Generate and edit speech from text. Adjust advanced settings for more control.
Learn more about 🟣**SSR-Speech** on the [SSR-Speech Homepage](
🚀 The **SSR-Speech (Mandarin)** demo is now live! Try it on [🤗SSR-Speech-Mandarin Space](
with gr.Row():
# Left column: audio to process and its transcript.
with gr.Column(scale=2):
input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
with gr.Group():
original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
info="Use whisperx model to get the transcript.")
transcribe_btn = gr.Button(value="Transcribe")
# Middle column: target text, mode selector and the run button.
with gr.Column(scale=3):
with gr.Group():
transcript = gr.Textbox(label="Text", lines=7, value=demo_text["Edit"]["regular"])
with gr.Row():
mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
run_btn = gr.Button(value="Run")
# Right column: generated audio.
with gr.Column(scale=2):
output_audio = gr.Audio(label="Output Audio")
# Advanced parameters; their names match the parameters of run().
with gr.Row():
with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
info="set to 0 to use less VRAM, but with slower inference")
aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
info="set to 1 to use cfg")
cfg_coef = gr.Number(label="cfg_coef", value=1.5,
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
prompt_length = gr.Number(label="prompt_length", value=3,
info="used for tts prompt, will automatically cut the prompt audio to this length")
sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not change")
top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, do not change')
codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, do not change')
silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
# Status line shared by both button handlers.
success_output = gr.HTML()
# NOTE(review): `semgents` looks like a misspelling of "segments"; renaming it
# must also update the outputs list below.
semgents = gr.State() # not used,
# NOTE(review): the lines below are fragments of two lost event-binding calls
# (presumably transcribe_btn.click and run_btn.click — confirm). The first
# lists three outputs while transcribe() returns four values; the second's
# inputs follow run()'s parameter order, but `mode` is not among the visible ones.
outputs=[original_transcript, semgents, success_output]),
seed, sub_amount,
codec_audio_sr, codec_sr,
top_k, top_p, temperature, stop_repetition, kvcache, silence_tokens,
aug_text, cfg_coef, prompt_length,
input_audio, original_transcript, transcript,
outputs=[output_audio, success_output])
return app
if __name__ == "__main__":
    # Script entry point: parse CLI options, build the UI and launch it.
    import argparse

    # Each path flag defaults to the module-level value (environment variable
    # or built-in default) so that an omitted flag does not silently override
    # a DEMO_PATH / TMP_PATH / MODELS_PATH environment variable.
    parser = argparse.ArgumentParser(description="Ssrspeech gradio app.")
    parser.add_argument("--demo-path", default=DEMO_PATH, help="Path to demo directory")
    parser.add_argument("--tmp-path", default=TMP_PATH, help="Path to tmp directory")
    parser.add_argument("--models-path", default=MODELS_PATH, help="Path to ssrspeech models directory")
    parser.add_argument("--port", default=7860, type=int, help="App port")
    parser.add_argument("--share", action="store_true", help="Launch with public url")

    # Make sure USER is set in the environment, keeping any existing value.
    os.environ["USER"] = os.getenv("USER", "user")
    args = parser.parse_args()

    # NOTE(review): the model file paths were already derived from MODELS_PATH
    # at module level, so --models-path given here does not change which
    # checkpoint was loaded.
    DEMO_PATH = args.demo_path
    TMP_PATH = args.tmp_path
    MODELS_PATH = args.models_path

    app = get_app()
    app.queue().launch(share=args.share, server_port=args.port)