Spaces:

softcatala
/

comparativa-tts-catala

Running

App Files Files Community

comparativa-tts-catala / app.py

ccoreilly

manté puntuació també als fonemes mostrats

cd62b9b over 1 year ago

raw

history blame

4.51 kB

	from engine import Piper
	import tempfile
	from typing import Optional
	from TTS.config import load_config
	import gradio as gr
	import numpy as np
	import os
	import json
	from TTS.utils.manage import ModelManager
	from TTS.utils.synthesizer import Synthesizer
	from espeak_phonemizer import Phonemizer

	# Maximum number of input characters that are synthesized; tts() cuts
	# longer input down to this length.
	MAX_TXT_LEN = 325

	# Initial speaker ids. This name is rebound further down, once the BSC
	# model is loaded, to the speaker names reported by that model.
	SPEAKERS = [
	    'f_cen_05',
	    'f_cen_81',
	    'f_occ_31',
	    'f_occ_de',
	    'f_sep_31',
	    'm_cen_08',
	    'm_occ_44',
	    'm_val_89',
	]

	# Shared phonemizer instance, constructed with the "ca" argument; tts()
	# uses it to produce the phoneme string shown next to the audio.
	fonemitzador = Phonemizer("ca")

	def carrega_bsc():
	    """Build the ``Synthesizer`` for the BSC model.

	    The checkpoint, config and speakers file are read from ``models/bsc``
	    under the current working directory. No vocoder checkpoint or vocoder
	    config is supplied.

	    Returns:
	        The constructed ``Synthesizer`` instance.
	    """
	    cwd = os.getcwd()
	    return Synthesizer(
	        cwd + "/models/bsc/best_model.pth",  # model checkpoint
	        cwd + "/models/bsc/config.json",  # model config
	        cwd + "/models/bsc/speakers.pth",  # speakers file
	        None,  # fourth positional argument is deliberately left unset
	        None,  # vocoder checkpoint: none
	        None,  # vocoder config: none
	    )

	def carrega_collectivat():
	    """Build the ``Synthesizer`` for the Collectivat model.

	    The model checkpoint/config and the vocoder checkpoint/config are read
	    from ``models/collectivat`` under the current working directory. The
	    third and fourth positional arguments are passed as ``None``.

	    Returns:
	        The constructed ``Synthesizer`` instance.
	    """
	    cwd = os.getcwd()
	    return Synthesizer(
	        cwd + "/models/collectivat/fast-speech_best_model.pth",  # model checkpoint
	        cwd + "/models/collectivat/fast-speech_config.json",  # model config
	        None,
	        None,
	        cwd + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth",  # vocoder checkpoint
	        cwd + "/models/collectivat/ljspeech--hifigan_v2_config.json",  # vocoder config
	    )

	def carrega_piper():
	    """Create the ``Piper`` engine from the ONNX file in ``models/piper``.

	    The path is resolved against the current working directory.
	    """
	    onnx_path = os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx"
	    return Piper(onnx_path)


	# Load every engine once at import time; tts() reads these module-level
	# objects on each call.
	model_bsc = carrega_bsc()
	# Rebind SPEAKERS to the speaker names exposed by the loaded BSC model,
	# replacing the hard-coded list defined near the top of the file.
	SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names

	model_collectivat = carrega_collectivat()

	model_piper = carrega_piper()

	def tts(text, speaker_idx):
	    """Synthesize ``text`` with the three loaded engines.

	    Input longer than ``MAX_TXT_LEN`` characters is cut to that length
	    (a notice is printed). The text that is actually synthesized is
	    printed as well.

	    Args:
	        text: The text to synthesize.
	        speaker_idx: Speaker passed to the BSC model; the other two
	            engines receive only the text.

	    Returns:
	        A 4-tuple ``(phonemes, bsc_path, collectivat_path, piper_path)``:
	        the phonemizer output for ``text`` followed by the names of three
	        ``.wav`` temporary files, one per engine. The files are created
	        with ``delete=False`` and are not removed by this function.
	    """
	    if len(text) > MAX_TXT_LEN:
	        text = text[:MAX_TXT_LEN]
	        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
	    print(text)

	    # Run all three engines before touching the filesystem.
	    audio_bsc = model_bsc.tts(text, speaker_idx)
	    audio_coll = model_collectivat.tts(text)
	    audio_piper = model_piper.synthesize(text)

	    # The two Synthesizer-based engines write through their own save_wav();
	    # the Piper result is written to the file as-is.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_bsc:
	        model_bsc.save_wav(audio_bsc, out_bsc)

	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_coll:
	        model_collectivat.save_wav(audio_coll, out_coll)

	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_piper:
	        out_piper.write(audio_piper)

	    phonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)

	    # .name stays available on the closed temp-file wrappers.
	    return phonemes, out_bsc.name, out_coll.name, out_piper.name


	# User-facing text (Catalan) passed as the ``description`` of the Gradio
	# interface: it lists the three compared models and credits the espeak
	# fork used for phonemization.
	description = """
	Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuronals lliures pel català.

	1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina)
	https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker

	2. Model Fastspeech entrenat per Col·lectivat
	https://github.com/CollectivaT-dev/TTS-API

	3. Model VITS entrenat per Piper/Home Assistant
	https://github.com/rhasspy/piper

	Els dos últims models han estat entrenats amb la veu d'Ona de FestCAT. El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per la comparativa però podeu provar les altres.

	Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
	https://github.com/projecte-aina/espeak-ng

	NOTA: El model de col·lectivat treballa amb grafemes pel que no fa servir espeak com a fonemitzador.
	"""
	# Passed as the ``article`` of the interface; intentionally empty.
	article = ""

	# Input widgets, in the positional order of tts(text, speaker_idx).
	_inputs = [
	    gr.Textbox(
	        label="Text",
	        value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
	    ),
	    gr.Dropdown(
	        label="Selecciona un parlant pel model VITS multi-parlant del BSC",
	        choices=SPEAKERS,
	        value="ona",
	    ),
	]

	# Output widgets, in the order tts() returns its values: the phoneme
	# string first, then one audio file path per engine.
	_outputs = [
	    gr.Markdown(label="Fonemes"),
	    gr.Audio(label="BSC VITS", type="filepath"),
	    gr.Audio(label="Collectivat Fastspeech", type="filepath"),
	    gr.Audio(label="Piper VITS", type="filepath"),
	]

	iface = gr.Interface(
	    fn=tts,
	    inputs=_inputs,
	    outputs=_outputs,
	    title="Comparativa de síntesi lliure en català️",
	    description=description,
	    article=article,
	    allow_flagging="never",
	    layout="vertical",
	    live=False,
	)

	# Start the web server, bound to 0.0.0.0 on port 7860.
	iface.launch(server_name="0.0.0.0", server_port=7860)