Spaces:

projecte-aina
/

transcripcio-fonetica-catala

Running

App Files Files Community

transcripcio-fonetica-catala / app.py

ccoreilly

use filepath

f47653c almost 2 years ago

raw

history blame

3.4 kB

	from engine import Piper
	import tempfile
	from typing import Optional
	from TTS.config import load_config
	import gradio as gr
	import numpy as np
	import os
	import json
	from TTS.utils.manage import ModelManager
	from TTS.utils.synthesizer import Synthesizer

	# Upper bound on the number of input characters that tts() will synthesize;
	# longer input is truncated to this length.
	MAX_TXT_LEN = 100

	# Initial list of speaker identifiers. This binding is replaced further down
	# with the speaker names reported by the loaded BSC model.
	SPEAKERS = ['f_cen_05', 'f_cen_81', 'f_occ_31', 'f_occ_de', 'f_sep_31', 'm_cen_08', 'm_occ_44', 'm_val_89']

	def carrega_bsc():
	    """Build the TTS ``Synthesizer`` for the BSC model.

	    The checkpoint, config and speakers file are looked up under
	    ``models/bsc`` relative to the current working directory. No vocoder
	    checkpoint or vocoder config is supplied.

	    Returns:
	        The constructed ``Synthesizer`` instance.
	    """
	    cwd = os.getcwd()
	    checkpoint = cwd + "/models/bsc/best_model.pth"
	    config = cwd + "/models/bsc/config.json"
	    speakers_file = cwd + "/models/bsc/speakers.pth"

	    # Positional order: checkpoint, config, speakers file, an unused fourth
	    # argument, then the vocoder checkpoint and vocoder config (both None).
	    return Synthesizer(checkpoint, config, speakers_file, None, None, None)

	def carrega_collectivat():
	    """Build the TTS ``Synthesizer`` for the Collectivat model and its vocoder.

	    All four files (model checkpoint, model config, vocoder checkpoint,
	    vocoder config) are looked up under ``models/collectivat`` relative to
	    the current working directory.

	    Returns:
	        The constructed ``Synthesizer`` instance.
	    """
	    cwd = os.getcwd()
	    model_files = (
	        cwd + "/models/collectivat/fast-speech_best_model.pth",
	        cwd + "/models/collectivat/fast-speech_config.json",
	    )
	    vocoder_files = (
	        cwd + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth",
	        cwd + "/models/collectivat/ljspeech--hifigan_v2_config.json",
	    )

	    # The third and fourth positional arguments are left as None, exactly
	    # as in the six-argument call this replaces.
	    return Synthesizer(*model_files, None, None, *vocoder_files)

	def carrega_piper():
	    """Create the ``Piper`` engine from the ONNX file under ``models/piper``.

	    The path is resolved relative to the current working directory.

	    Returns:
	        The constructed ``Piper`` instance.
	    """
	    onnx_path = os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx"
	    return Piper(onnx_path)


	# Load each engine once at module import so every tts() call reuses the
	# same instances.
	model_bsc = carrega_bsc()
	# Replace the hard-coded speaker list with the names exposed by the loaded
	# BSC model's speaker manager.
	SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names

	model_collectivat = carrega_collectivat()

	model_piper = carrega_piper()

	def _wav_to_tempfile(write_audio):
	    """Create a ``.wav`` temporary file, fill it via *write_audio*, return its path.

	    The file is opened with ``delete=False``, so it remains on disk after
	    the handle is closed; nothing in this module removes it afterwards.
	    """
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
	        write_audio(handle)
	        return handle.name


	def tts(text, speaker_idx):
	    """Synthesize *text* with all three engines and return the audio file paths.

	    Args:
	        text: Text to synthesize. Anything beyond ``MAX_TXT_LEN`` characters
	            is discarded and a notice is printed.
	        speaker_idx: Speaker identifier forwarded to the BSC model only.

	    Returns:
	        A 3-tuple of temporary ``.wav`` file paths, in the order
	        (BSC, Collectivat, Piper).
	    """
	    too_long = len(text) > MAX_TXT_LEN
	    if too_long:
	        text = text[:MAX_TXT_LEN]
	        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
	    print(text)

	    # Run every engine first, then persist the results in the same order.
	    bsc_audio = model_bsc.tts(text, speaker_idx)
	    collectivat_audio = model_collectivat.tts(text)
	    piper_audio = model_piper.synthesize(text)

	    return (
	        _wav_to_tempfile(lambda fh: model_bsc.save_wav(bsc_audio, fh)),
	        _wav_to_tempfile(lambda fh: model_collectivat.save_wav(collectivat_audio, fh)),
	        # The Piper result is written to the file as-is.
	        _wav_to_tempfile(lambda fh: fh.write(piper_audio)),
	    )


	# User-facing instructions (in Catalan) passed to the interface as its
	# description: enter the text, pick a voice from the dropdown, enjoy.
	description="""
	1️⃣ Introdueix el text a sintetitzar.

	2️⃣ Selecciona una veu en el desplegable.

	3️⃣ Gaudeix!
	"""
	# Empty article text for the interface.
	article= ""

	# Build the comparison UI: a text box and a speaker selector as inputs,
	# and three audio outputs in the same order as the paths returned by tts().
	iface = gr.Interface(
	    fn=tts,
	    inputs=[
	        gr.Textbox(
	            label="Text",
	            # The initial content is set through `value`; `default` is not a
	            # documented parameter of the component constructors.
	            value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
	        ),
	        gr.Dropdown(
	            label="Selecciona un parlant",
	            choices=SPEAKERS,
	            # Preselect the first available speaker so a request submitted
	            # without touching the dropdown still carries a valid speaker.
	            # The previous default, "ona", is not one of the BSC speakers.
	            value=next(iter(SPEAKERS), None),
	        ),
	    ],
	    outputs=[
	        gr.Audio(label="BSC VITS", type="filepath"),
	        gr.Audio(label="Collectivat Fastspeech", type="filepath"),
	        gr.Audio(label="Piper VITS", type="filepath"),
	    ],
	    title="🗣️ TTS Català Multi Parlant - VITS 🗣️",
	    description=description,
	    article=article,
	    allow_flagging="never",
	    # NOTE(review): `layout` looks deprecated in recent Gradio releases —
	    # confirm against the pinned version before removing it.
	    layout="vertical",
	    live=False,
	)
	# Listen on all interfaces on port 7860.
	iface.launch(server_name="0.0.0.0", server_port=7860)