Very bad result in Italian - a setup problem?
I am testing with the code below; the attached results are impossible to use. I am running on CUDA 12.1 with a GTX 1070, and wanted to ask whether this is the current state of Italian TTS or whether there is something wrong in my code.
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
# Minimal single-shot synthesis script: load the multilingual Parler-TTS
# checkpoint, synthesize one prompt with one voice description, write a WAV.

# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-multilingual-v1.1").to(device)
# The spoken text and the voice description are tokenized with two different
# tokenizers: the checkpoint's own one for the prompt, and the text encoder's
# one for the description.
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-multilingual-v1.1")
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)

# NOTE(review): per the maintainers' guidance the model targets roughly 2-30 s
# of speech per call; this prompt is far longer than that and should be split
# into sentence-sized chunks that are generated separately and concatenated.
prompt = "Cosa sappiamo dell’evoluzione di Homo Heidelbergensis in Europa? Quanto è cambiata la sua morfologia una volta arrivato nel nostro continente? Per rispondere a queste domande disponiamo di una ricca serie di fossili distribuiti in tutta l’Europa: 130 frammenti appartenenti a 7 o 8 individui nel sito di Tautavel nei Pirenei orientali, 3000 frammenti (tra cui dei crani completi) associati ad almeno 28 individui nel sito di Sima de los Huesos in Spagna, un cranio a Aroeira in Portogallo, un osso occipitale a Vértesszöllös in Ungheria, una calotta cranica a Ceprano, qualche dente a Visogliano e un femore a Venosa in Italia, una tibia a Boxgrove in Inghilterra, ed il primo fossile in assoluto di Homo Heidelbergensis: la mandibola di Mauer, in Germania."
description = "Julia's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."

# Keep the full tokenizer outputs so the attention masks can be forwarded to
# generate() together with the token ids.
description_inputs = description_tokenizer(description, return_tensors="pt").to(device)
prompt_inputs = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = description_inputs.input_ids
prompt_input_ids = prompt_inputs.input_ids

generation = model.generate(
    input_ids=input_ids,
    attention_mask=description_inputs.attention_mask,
    prompt_input_ids=prompt_input_ids,
    prompt_attention_mask=prompt_inputs.attention_mask,
)

# Move the waveform to host memory, drop singleton dimensions and save it at
# the sampling rate declared by the model configuration.
audio_arr = generation.cpu().numpy().squeeze()
sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate)
Hey @davide445,
Thanks for your message. The model was trained to generate speech lasting from 2 to 30 s; in other words, it can only handle short prompts that correspond to short audio snippets.
Given the length of your prompt, you will probably have to generate it in multiple passes.
Additionally, you can get inspiration from here: https://huggingface.co/spaces/ai4bharat/indic-parler-tts/blob/main/app.py#L179
It shows both a long-form generation algorithm and proper use of the attention masks (don't forget to add prompt_attention_mask and attention_mask to the generate call).
Hope it helps
This code still generates a bad result:
import torch
import nltk
import numpy as np
import io
import argparse
from pydub import AudioSegment
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
from tqdm import tqdm
import sys
# Default text to synthesize when the user does not type their own.
DEFAULT_TEXT = """
La neve, il freddo e il ghiaccio portano disagi in Lombardia. Un nuovo peggioramento delle condizioni meteorologiche è previsto per i prossimi giorni, annuncia il Centro funzionale monitoraggio rischi della Regione. La Protezione civile ha diramato un'allerta meteo gialla (rischio ordinario) a partire dalle 3 di questa notte e per l'intera giornata di domani.
"""
# Default natural-language description of the voice and recording conditions.
DEFAULT_DESCRIPTION = """
Julia's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise.
"""
class ProgressBar:
    """Single-line console progress indicator for chunked audio generation.

    Progress is tracked as an integer percentage across ``total_chunks``
    equally weighted chunks and is rewritten in place on stdout using a
    carriage return.
    """

    def __init__(self, total_chunks):
        """Start at 0% with ``total_chunks`` chunks still to process."""
        self.total_chunks = total_chunks
        self.current_chunk = 0
        self.current_progress = 0

    def update(self, chunk_progress):
        """Report that the current chunk is ``chunk_progress`` percent done.

        The line is only rewritten when the overall integer percentage
        increases, so repeated or smaller values produce no output.
        """
        if self.total_chunks <= 0:
            # Nothing to measure against; also avoids a division by zero.
            return
        chunk_contribution = 100.0 / self.total_chunks
        total_progress = (self.current_chunk * chunk_contribution) + (chunk_progress * chunk_contribution / 100)
        # Never display more than 100%, even if a caller advances the chunk
        # counter before reporting completion of the last chunk.
        percent = min(int(total_progress), 100)
        if percent > self.current_progress:
            self.current_progress = percent
            sys.stdout.write(f'\rGenerazione audio: {self.current_progress}%')
            sys.stdout.flush()

    def next_chunk(self):
        """Advance to the next chunk."""
        self.current_chunk += 1

    def finish(self):
        """Print the final 100% line and terminate it with a newline."""
        sys.stdout.write('\rGenerazione audio: 100%\n')
        sys.stdout.flush()
def numpy_to_mp3(audio_array, sampling_rate):
    """Wrap a mono waveform in a pydub ``AudioSegment``.

    Despite the name, no MP3 encoding happens here: the caller exports the
    returned segment to the desired format.

    Floating-point input is peak-normalized to the 16-bit range and converted
    to ``int16``. Input of any other dtype is passed through unchanged, with
    the sample width taken from its item size.

    Args:
        audio_array: Mono audio samples as a NumPy array.
        sampling_rate: Sample rate in Hz, used as the segment's frame rate.

    Returns:
        An ``AudioSegment`` with one channel.
    """
    audio_array = np.asarray(audio_array)
    if np.issubdtype(audio_array.dtype, np.floating):
        # np.max raises on an empty array, so treat "no samples" as silence.
        max_val = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        # Skip the scaling for all-zero audio to avoid dividing by zero.
        if max_val > 0:
            audio_array = (audio_array / max_val) * 32767
        audio_array = audio_array.astype(np.int16)
    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )
    return audio_segment
def generate_long_form_audio(text, description, model, tokenizer, description_tokenizer, device, chunk_size=25, language="english"):
    """Synthesize ``text`` chunk by chunk and return one concatenated waveform.

    The text is split into sentences, and consecutive sentences are packed
    greedily into chunks of fewer than ``chunk_size`` whitespace-separated
    words. A single sentence that is longer than ``chunk_size`` is never
    split and becomes a chunk of its own. Each chunk is generated separately
    with the same voice description, with sampling enabled, so the output is
    not deterministic.

    Generation is best effort: a chunk that raises or yields no audio is
    reported on stdout and skipped.

    Args:
        text: Text to speak.
        description: Natural-language description of the voice.
        model: Model exposing ``generate`` as used below.
        tokenizer: Tokenizer for the spoken text.
        description_tokenizer: Tokenizer for the voice description.
        device: Device the tokenized inputs are moved to.
        chunk_size: Word budget per chunk.
        language: Language name passed to ``nltk.sent_tokenize``. The default
            matches NLTK's own default; pass e.g. ``"italian"`` for Italian.

    Returns:
        A 1-D NumPy array holding the audio of all generated chunks in order.

    Raises:
        RuntimeError: If no chunk produced any audio.
    """
    inputs = description_tokenizer(description, return_tensors="pt").to(device)

    sentences = nltk.sent_tokenize(text, language=language)
    curr_sentence = ""
    chunks = []
    for sentence in sentences:
        candidate = " ".join([curr_sentence, sentence]).strip()
        if len(candidate.split()) >= chunk_size:
            # Adding this sentence would reach the budget: close the current
            # chunk (if any) and start a new one with this sentence.
            if curr_sentence:
                chunks.append(curr_sentence)
            curr_sentence = sentence
        else:
            curr_sentence = candidate
    if curr_sentence:
        chunks.append(curr_sentence)

    print(f"\nProcessing {len(chunks)} chunks")
    progress_bar = ProgressBar(len(chunks))

    # str() so that both "cuda:0" strings and torch.device objects work.
    use_autocast = str(device).startswith("cuda")

    all_audio = []
    for i, chunk in enumerate(chunks):
        try:
            prompt = tokenizer(chunk, return_tensors="pt").to(device)
            with torch.autocast(device_type="cuda", enabled=use_autocast):
                generation = model.generate(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    prompt_input_ids=prompt.input_ids,
                    prompt_attention_mask=prompt.attention_mask,
                    do_sample=True,
                    return_dict_in_generate=True,
                )
            if hasattr(generation, 'sequences') and hasattr(generation, 'audios_length'):
                # Trim the first (only) sequence to its reported audio length.
                audio = generation.sequences[0, :generation.audios_length[0]]
                # reshape(-1) always yields a 1-D array, which is what
                # np.concatenate needs even for a single-sample result.
                audio_np = audio.to(torch.float32).cpu().numpy().reshape(-1)
                all_audio.append(audio_np)
            else:
                print(f"\nSkipping chunk {i+1}: generation returned no audio")
        except Exception as e:
            # Deliberate best effort: one bad chunk must not abort the rest.
            print(f"\nError processing chunk {i+1}: {e}")
        # Report the chunk as finished before moving on, so the percentage
        # never overshoots and failed chunks still advance the bar.
        progress_bar.update(100)
        progress_bar.next_chunk()
    progress_bar.finish()

    if not all_audio:
        raise RuntimeError("No audio was generated")
    return np.concatenate(all_audio)
def _prompt_choice(prompt, valid_choices):
    """Ask with ``prompt`` until the stripped answer is in ``valid_choices``."""
    while True:
        answer = input(prompt).strip()
        if answer in valid_choices:
            return answer
        print("Scelta non valida. Riprova.")


def get_user_input():
    """Interactively collect the run settings from the console.

    Asks, in order, for the compute device, the text to synthesize (the
    built-in default or free text terminated by an empty line) and the output
    file name.

    Returns:
        A tuple ``(use_cpu, text, output_file)`` where ``use_cpu`` is True
        when the CPU was selected, ``text`` is the text to speak and
        ``output_file`` is a file name ending in ``.mp3`` (any letter case).
    """
    print("\n=== Parler TTS Audio Generator ===")

    # Device selection: the GPU option is only offered when CUDA is usable.
    print("\nDispositivi disponibili:")
    print("1. CPU")
    if torch.cuda.is_available():
        print("2. GPU (CUDA)")
        valid_choices = ['1', '2']
    else:
        valid_choices = ['1']
        print("GPU non disponibile su questo sistema")
    device_choice = _prompt_choice(
        "\nSeleziona il dispositivo da utilizzare (inserisci il numero): ",
        valid_choices,
    )
    use_cpu = device_choice == '1'

    # Text selection.
    print("\nOpzioni per il testo:")
    print("1. Usa testo predefinito")
    print("2. Inserisci nuovo testo")
    text_choice = _prompt_choice("\nSeleziona l'opzione per il testo (1 o 2): ", ['1', '2'])
    if text_choice == '1':
        text = DEFAULT_TEXT
        print("\nUso il testo predefinito:")
        print(text)
    else:
        print("\nInserisci il tuo testo (premi Invio due volte per terminare):")
        lines = []
        # The first empty line ends the input.
        while True:
            line = input()
            if line == "":
                break
            lines.append(line)
        text = "\n".join(lines)

    # Output file name.
    output_file = input("\nInserisci il nome del file di output (default: output.mp3): ").strip()
    if not output_file:
        output_file = "output.mp3"
    # Compare case-insensitively so "clip.MP3" is not turned into
    # "clip.MP3.mp3".
    if not output_file.lower().endswith('.mp3'):
        output_file += '.mp3'

    return use_cpu, text, output_file
def main():
    """Run the interactive text-to-speech workflow end to end.

    Collects the settings from the user, loads the model and both tokenizers,
    generates the audio chunk by chunk and exports it as an MP3 file.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    try:
        # Collect the settings from the user.
        use_cpu, text, output_file = get_user_input()

        # Device and precision: full precision on CPU, half precision on GPU.
        if use_cpu:
            device = "cpu"
            torch_dtype = torch.float32
            print("\nUsando CPU come richiesto")
        else:
            device = "cuda:0"
            # NOTE(review): if GPU output quality is poor, float32 is worth
            # trying here — unverified for this model/GPU combination.
            torch_dtype = torch.float16
            print(f"\nUsando CUDA con device: {torch.cuda.get_device_name(0)}")

        # Sentence tokenizer data: older NLTK releases load "punkt", newer
        # ones load "punkt_tab", so fetch both.
        for resource in ("punkt", "punkt_tab"):
            nltk.download(resource, quiet=True)

        print("\nCaricamento modelli...")
        # .to(device) alone places the model; passing device_map as well was
        # redundant and needs the optional accelerate package.
        model = ParlerTTSForConditionalGeneration.from_pretrained(
            "parler-tts/parler-tts-mini-multilingual-v1.1",
            torch_dtype=torch_dtype,
        ).to(device)
        tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-multilingual-v1.1")
        description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
        print("Modelli caricati con successo")

        print("\nInizio generazione audio...")
        audio_arr = generate_long_form_audio(
            text,
            DEFAULT_DESCRIPTION,
            model,
            tokenizer,
            description_tokenizer,
            device,
        )

        print(f"\nLunghezza audio generato: {round(len(audio_arr) / model.config.sampling_rate, 2)} secondi")

        audio_segment = numpy_to_mp3(audio_arr, model.config.sampling_rate)
        audio_segment.export(output_file, format="mp3", bitrate="320k")
        print(f"Audio salvato con successo in: {output_file}")
    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate.
        print(f"\nErrore durante l'esecuzione: {e}")
        raise
# Run the interactive generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()