# Gradio demo app for Kotoba-Speech TTS (hosted as a HuggingFace Space, T4 GPU).
import io
import json
import os
import gradio as gr
import requests
import soundfile as sf
# Endpoint of the locally running TTS inference server used by tts().
API_SERVER_URL = "http://127.0.0.1:58003/tts"
# Labels for the three voice-selection modes; their order matters — the code
# below indexes this list to decide which speaker reference to send.
RADIO_CHOICES = ["Preset voices", "Upload target voice", "Record your voice"]
# Input text longer than this is truncated before synthesis (see tts()).
MAX_CHARS = 220
# Preset speaker-reference audio hosted on a CDN, keyed by display name.
PRESET_VOICES = {
    # female
    "Ava": "https://cdn.themetavoice.xyz/speakers/ava.flac",
    "Bria": "https://cdn.themetavoice.xyz/speakers/bria.mp3",
    # male
    "Alex": "https://cdn.themetavoice.xyz/speakers/alex.mp3",
    "Jacob": "https://cdn.themetavoice.xyz/speakers/jacob.wav",
}
def denormalise_top_p(top_p):
    """Map a 0-10 UI slider value onto the model's top_p range [0.9, 1.0]."""
    scaled = 0.9 + top_p / 100
    return round(scaled, 2)
def denormalise_guidance(guidance):
    """Linearly rescale a UI slider value from [1, 5] to the model's guidance range [1, 3]."""
    slider_span = 5 - 1
    target_span = 3 - 1
    return 1 + (guidance - 1) * target_span / slider_span
def _handle_edge_cases(to_say, upload_target):
if not to_say:
raise gr.Error("Please provide text to synthesise")
def _check_file_size(path):
if not path:
return
filesize = os.path.getsize(path)
filesize_mb = filesize / 1024 / 1024
if filesize_mb >= 50:
raise gr.Error(
f"Please upload a sample less than 20MB for voice cloning. Provided: {round(filesize_mb)} MB"
)
_check_file_size(upload_target)
def tts(to_say, top_p, guidance, toggle, preset_dropdown, upload_target, record_target):
    """Synthesise speech by POSTing to the local TTS server.

    Returns a (sample_rate, waveform) tuple for the gr.Audio output, or
    (None, None) if the server responds with a non-200 status.
    """
    # Map the 0-10 / 1-5 UI slider values onto the model's parameter ranges.
    d_top_p = denormalise_top_p(top_p)
    d_guidance = denormalise_guidance(guidance)
    # Raises gr.Error for empty text or an oversized uploaded sample.
    _handle_edge_cases(to_say, upload_target)
    # Truncate over-long input to the supported character budget.
    to_say = to_say if len(to_say) < MAX_CHARS else to_say[:MAX_CHARS]
    # Resolve the speaker reference: a local file path for the upload/record
    # modes; preset voices are passed by URL in the JSON payload instead.
    custom_target_path = None
    if toggle == RADIO_CHOICES[1]:
        custom_target_path = upload_target
    elif toggle == RADIO_CHOICES[2]:
        custom_target_path = record_target
    config = {
        "text": to_say,
        "guidance": d_guidance,
        "top_p": d_top_p,
        "speaker_ref_path": PRESET_VOICES[preset_dropdown] if toggle == RADIO_CHOICES[0] else None,
    }
    # NOTE(review): the synthesis parameters travel in the X-Payload header;
    # the request body carries only the raw reference audio bytes — this is
    # the server's expected protocol, confirm against the server code.
    headers = {"Content-Type": "audio/wav", "X-Payload": json.dumps(config)}
    if not custom_target_path:
        response = requests.post(API_SERVER_URL, headers=headers, data=None)
    else:
        with open(custom_target_path, "rb") as f:
            data = f.read()
        response = requests.post(API_SERVER_URL, headers=headers, data=data)
    wav, sr = None, None
    if response.status_code == 200:
        # Decode the returned audio bytes into a float32 waveform for gr.Audio.
        audio_buffer = io.BytesIO(response.content)
        audio_buffer.seek(0)
        wav, sr = sf.read(audio_buffer, dtype="float32")
    else:
        print(f"Something went wrong. response status code: {response.status_code}")
    return sr, wav
def change_voice_selection_layout(choice):
    """Show only the row that matches the selected voice mode; hide the rest."""
    selected = RADIO_CHOICES.index(choice)
    updates = []
    for position in range(len(RADIO_CHOICES)):
        updates.append(gr.update(visible=(position == selected)))
    return updates
# Page heading and bilingual (Japanese/English) description rendered as
# Markdown above the demo UI. Fixes user-facing typos in the English text:
# missing "release", "committing on" -> "committed to", "out" -> "our".
title = "# TTS by Kotoba-Speech"
description = """
<strong>Kotoba-Speech v0.1</strong>は、1.2Bのトランスフォーマーに基づく音声生成モデルです。
以下の機能をサポートしています:
\n
* 日本語における滑らかなテキスト読み上げ生成
* スピーチプロンプトを通じたOne-shot音声クローニング
Kotoba Technologiesは、公開されたモデルを商用可能なApache 2.0ライセンスで公開します。
推論およびモデルコードは、Meta-Voiceをベースに作られており、学習コードは弊社のGitHubで近日中に公開する予定です。
Kotoba Technologiesは、音声基盤モデルの開発に取り組んでおり、今後もモデルの公開を行なっていきます。是非、[Discord Community](https://discord.gg/qPVFqhGN7Z)に参加してご意見ください!
<strong>Kotoba-Speech v0.1</strong> is a 1.2B Transformer-based speech generative model. It supports the following properties:
\n
* Fluent text-to-speech generation in Japanese
* One-shot voice cloning through speech prompt
We are releasing our model under the Apache 2.0 license. Our inference and model code is adapted from Meta-Voice, and we will release our training code on our GitHub repository shortly.
Kotoba Technologies is committed to developing speech foundation models, and we’ll continue releasing our models. Please join [our discord](https://discord.gg/qPVFqhGN7Z) to contribute to our community.
"""
# Google Analytics bootstrap. The measurement id comes from the environment;
# ga_script is injected into the page <head>, ga_load runs after page load.
ga_id = os.getenv("ga_id")
ga_script = f'<script async src="https://www.googletagmanager.com/gtag/js?id={ga_id}"></script>'
# {{ / }} escape to literal braces in the f-string, so the emitted JS is
# identical to the previous str.format_map template output.
ga_load = f"""
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){{dataLayer.push(arguments);}}
gtag('js', new Date());
gtag('config', '{ga_id}');
</script>
"""
# ---- Gradio UI -------------------------------------------------------------
# Two-column layout: input text + voice selection on the left, synthesised
# audio output on the right. Indentation reconstructed — nesting below is the
# conventional layout for this widget order; confirm against the live Space.
with gr.Blocks(title="TTS by Kotoba-Speech", head=ga_script) as demo:
    gr.Markdown(title)
    with gr.Row():
        gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            to_say = gr.TextArea(
                label="What should I say!?",
                lines=4,
                value="コトバテクノロジーズのミッションは、音声基盤モデルを作ることです。",
            )
            with gr.Row(), gr.Column():
                # voice settings
                # Sliders expose normalised 0-10 / 1-5 ranges; tts() maps them
                # onto the model's real ranges via the denormalise_* helpers.
                top_p = gr.Slider(
                    value=5.0,
                    minimum=0.0,
                    maximum=10.0,
                    step=1.0,
                    label="Speech Stability - improves text following for a challenging speaker",
                )
                guidance = gr.Slider(
                    value=5.0,
                    minimum=1.0,
                    maximum=5.0,
                    step=1.0,
                    label="Speaker similarity - How closely to match speaker identity and speech style.",
                )
                # voice select
                toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])
            # One row per voice mode; only the row matching the radio
            # selection is visible (see change_voice_selection_layout).
            with gr.Row(visible=True) as row_1:
                preset_dropdown = gr.Dropdown(
                    PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
                )
                with gr.Accordion("Preview: Preset voices", open=False):
                    for label, path in PRESET_VOICES.items():
                        gr.Audio(value=path, label=label)
            with gr.Row(visible=False) as row_2:
                upload_target = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload a clean sample to clone. Sample should contain 1 speaker, be between 10-90 seconds and not contain background noise.",
                    min_length=10,
                    max_length=90,
                )
            with gr.Row(visible=False) as row_3:
                record_target = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record your voice with a microphone to clone. Sample should contain 1 speaker, be between 10-90 seconds and not contain background noise.",
                    min_length=10,
                    max_length=90,
                )
            toggle.change(
                change_voice_selection_layout,
                inputs=toggle,
                outputs=[row_1, row_2, row_3],
            )
        with gr.Column():
            speech = gr.Audio(
                type="numpy",
                label="Kotoba-Speech says...",
            )
            submit = gr.Button("Generate Speech")
            submit.click(
                fn=tts,
                inputs=[to_say, top_p, guidance, toggle, preset_dropdown, upload_target, record_target],
                outputs=speech,
            )
    # Fire the Google Analytics snippet once the page has loaded.
    demo.load(None, js=ga_load)
demo.queue(default_concurrency_limit=2)
demo.launch()
# demo.launch(server_name="0.0.0.0", server_port=3000, share=True) # dev
|