|
import numpy as np |
|
from scipy.io import wavfile |
|
import torch |
|
from parler_tts import ParlerTTSForConditionalGeneration |
|
from transformers import AutoTokenizer |
|
import gradio as gr |
|
import re |
|
from num2words import num2words |
|
|
|
|
|
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Half precision is only requested on GPU. On the CPU fallback the weights stay
# in float32, because float16 inference is not reliably supported there.
_model_dtype = torch.float16 if device != "cpu" else torch.float32

# Load the Adia TTS checkpoint and its tokenizer once, at import time, so that
# every request handled by the interface reuses the same instances.
try:
    model = ParlerTTSForConditionalGeneration.from_pretrained(
        "CONCREE/Adia_TTS", torch_dtype=_model_dtype
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained("CONCREE/Adia_TTS")
except Exception as e:
    # Chain the original exception so the root cause stays in the traceback.
    raise RuntimeError(f"Erreur lors du chargement du modèle : {e}") from e
|
|
|
|
|
|
|
|
|
|
|
class EnglishNumberNormalizer:
    """Spell out every run of digits in a text as words.

    Despite the class name, numbers are written in French by default
    (``lang="fr"``), which is the language the original implementation
    hard-coded.

    Args:
        lang: Language code forwarded to the converter.
        to_words: Callable invoked as ``to_words(value, lang=lang)`` that
            returns the spelled-out form of the integer ``value``. When
            omitted, ``num2words`` is used.
    """

    _DIGIT_RUN = re.compile(r"\d+")

    def __init__(self, lang="fr", to_words=None):
        self.lang = lang
        self.to_words = to_words

    def __call__(self, text: str) -> str:
        """Return *text* with each digit run replaced by its spelled-out form."""
        # The default converter is looked up here rather than in __init__ so
        # that a custom ``to_words`` never requires num2words to be present.
        convert = num2words if self.to_words is None else self.to_words

        def spell(match):
            return convert(int(match.group()), lang=self.lang)

        # One regex pass substitutes each match exactly where it stands.
        # Calling str.replace() once per number would also rewrite shorter
        # numbers found inside longer ones (the "1" inside "12").
        return self._DIGIT_RUN.sub(spell, text)


# Shared instance used by the text preprocessing step.
number_normalizer = EnglishNumberNormalizer()
|
|
|
|
|
def preprocess(text):
    """Normalise a prompt before it is tokenised for speech synthesis.

    Steps, in order:

    1. Digit runs are spelled out by ``number_normalizer``.
    2. Hyphens become spaces and surrounding whitespace is trimmed.
    3. A final period is appended unless the text already ends with
       ``.``, ``!`` or ``?``.
    4. Upper-case abbreviations are spelled letter by letter, dropping any
       dots ("ONU" -> "O N U", "U.S.A" -> "U S A").

    Args:
        text: Raw prompt string.

    Returns:
        The normalised prompt string.
    """
    text = number_normalizer(text)

    # Runs after number expansion, so any hyphen introduced by the spelled-out
    # numbers is split as well. Stripping afterwards keeps a leading/trailing
    # hyphen from leaving stray whitespace next to the final punctuation.
    text = text.replace("-", " ").strip()

    if not text.endswith((".", "!", "?")):
        text += "."

    def spell_out(match):
        return " ".join(match.group().replace(".", ""))

    # A single substitution pass rewrites each abbreviation in place. Replacing
    # with str.replace() per match would also corrupt longer abbreviations that
    # contain a shorter one ("US" inside "USA").
    return re.sub(r'\b[A-Z][A-Z\.]+\b', spell_out, text)
|
|
|
|
|
# Prompt pre-filled in the text box when the interface is built.
default_prompt = "Abdoul nena souba dinagnou am reunion pour waxtaan li des"

# Voice description pre-filled in the description box and reused by the
# bundled examples.
default_description = (
    "A crystal clear and distinct voice, with a moderate reading rate that "
    "facilitates understanding. The tone is monotonous, without variations or "
    "inflections, which provides a uniform listening experience. The voice is "
    "free of background noise and allows for continuous reading, without "
    "inappropriate pauses, thus ensuring a constant and pleasant flow."
)
|
|
|
|
|
|
|
|
|
def generate_audio(prompt, description):
    """Synthesise speech for *prompt* using the voice described by *description*.

    Args:
        prompt: Text to be spoken; it is passed through ``preprocess`` first.
        description: Free-text description of the voice; surrounding
            whitespace is stripped before tokenisation.

    Returns:
        A ``(sampling_rate, audio_arr)`` tuple: the sampling rate read from
        the model configuration and the generated waveform as a NumPy array.
        The waveform is scaled so that its largest absolute sample is 1; an
        empty or all-zero waveform is returned unscaled.
    """
    prompt = preprocess(prompt)

    # The description conditions the voice; the prompt is the text to speak.
    description_ids = tokenizer(description.strip(), return_tensors="pt").input_ids.to(device)
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    generation = model.generate(input_ids=description_ids, prompt_input_ids=prompt_ids)

    # Cast to float32 before leaving torch so the normalisation below is not
    # carried out in half precision when the model weights are float16.
    audio_arr = generation.cpu().float().numpy().squeeze()

    sampling_rate = model.config.sampling_rate

    # Peak-normalise, but never divide by zero: a silent waveform would turn
    # into NaNs, and np.max raises on an empty array.
    peak = np.max(np.abs(audio_arr)) if audio_arr.size else 0.0
    if peak > 0:
        audio_arr = audio_arr / peak

    return sampling_rate, audio_arr
|
|
|
|
|
def update_char_counter(text, limit: int = 200) -> str:
    """Return the "characters remaining" label text for the prompt box.

    Args:
        text: Current content of the prompt box; ``None`` is treated as an
            empty string.
        limit: Character budget the count is measured against.

    Returns:
        ``"Caractères restants : N"`` where ``N`` is ``limit - len(text)``.
        ``N`` is negative when the text is longer than ``limit``.
    """
    # NOTE(review): the prompt Textbox is created with max_length=180 while
    # this counter defaults to 200 — confirm which limit is intended.
    remaining_chars = limit - len(text or "")
    return f"Caractères restants : {remaining_chars}"
|
|
|
|
|
def create_interface(
    logo_url="s3://crabby-images/76c7d/76c7d5a90ed23034754b6ab7e9f5bb15c3c69e32",
):
    """Build the Gradio Blocks UI for Adia TTS.

    The page contains, top to bottom: an optional logo, a title and an
    introduction, a row with an input column (prompt box, character counter,
    voice description box, generate button) next to an output column (audio
    player), and a list of clickable examples.

    Args:
        logo_url: Address of the logo image rendered at the top of the page.
            Pass ``None`` or ``""`` to omit the logo.

    Returns:
        The assembled ``gr.Blocks`` object; launching it is left to the caller.
    """
    with gr.Blocks() as demo:
        # The logo line in the source was a malformed string literal (nested,
        # unescaped double quotes); it is rebuilt here as a Markdown image.
        # NOTE(review): the default s3:// address is the only one present in
        # the source and is probably not loadable by a browser — confirm the
        # real logo URL.
        if logo_url:
            gr.Markdown(f"![Logo]({logo_url})")

        gr.Markdown("# 🌟 Bienvenue sur Adia TTS 🌟")
        gr.Markdown("[Adia TTS](https://huggingface.co/CONCREE/Adia_TTS) est un modèle de génération audio en wolof. Cette interface vous permet de générer des fichiers audio à partir de textes en wolof. Vous pouvez choisir une description pour personnaliser la voix générée.")

        with gr.Row():
            # Left column: everything the user fills in.
            with gr.Column():
                # NOTE(review): max_length is 180 here, while
                # update_char_counter counts down from 200 — confirm which
                # limit is intended.
                prompt_input = gr.Textbox(
                    label="Entrez votre texte en wolof",
                    placeholder="Ex: Abdoul nena souba dinagnou am reunion pour waxtaan li des",
                    value=default_prompt,
                    max_length=180,
                )
                char_counter = gr.Label(value=update_char_counter(default_prompt))
                description_input = gr.Textbox(
                    label="Entrez une description pour la voix",
                    value=default_description,
                )
                generate_button = gr.Button("Générer l'audio", variant="primary")

            # Right column: the generated audio.
            with gr.Column():
                audio_output = gr.Audio(label="Audio généré", type="numpy")

        gr.Markdown("## Exemples de textes et descriptions")
        gr.Examples(
            examples=[
                [
                    """Liggéeyukaay ci wàllu mbay mi ci Senegal dafa am solo lool ci wàllu kaaraange dundu ak sos liggéey, ndax dafay boole xeeti liggéey yu bees yu melni agroecologie ak togg ci gox bi.""",
                    default_description,
                ],
                [
                    """Entreprenariat ci Senegal dafa am solo lool ci yokkuteg koom-koom, di gëna yokk liggéey ak indi gis-gis yu bees ci dëkk bi. Ndaw yi am këru liggéey dañuy am xéewal yu amul fenn ndax ecosystem bi dafay màgg, te inisiatiif yu réew mi ak yu prive yi ñoo leen di jàppale.""",
                    default_description,
                ],
            ],
            inputs=[prompt_input, description_input],
            outputs=audio_output,
            fn=generate_audio,
            label="Cliquez sur un exemple pour générer l'audio",
            cache_examples=True,
        )

        # Refresh the remaining-characters label whenever the prompt changes.
        prompt_input.change(fn=update_char_counter, inputs=prompt_input, outputs=char_counter)

        # Run synthesis when the button is clicked.
        generate_button.click(
            fn=generate_audio,
            inputs=[prompt_input, description_input],
            outputs=audio_output,
        )

    return demo
|
|
|
|
|
# Script entry point: build the UI and launch it only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    create_interface().launch()