---
license: cc-by-nc-sa-4.0
datasets:
- openslr
- mozilla-foundation/common_voice_13_0
- Lagos-NWU_Yoruba_Speech_Corpus
language:
- yo
library_name: transformers
pipeline_tag: text-to-speech
---
|
|
|
```python
|
# Load the model directly from the Hugging Face Hub and synthesize one utterance.
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from huggingface_hub import hf_hub_download
import torch

# Single source of truth for the repository that hosts the processor, the
# model weights, and the speaker embeddings file.
REPO_ID = "imhotepai/yoruba-tts"

processor = SpeechT5Processor.from_pretrained(REPO_ID)
model = SpeechT5ForTextToSpeech.from_pretrained(REPO_ID)

# hf_hub_download returns the local path of the downloaded file (not a directory).
embeddings_path = hf_hub_download(repo_id=REPO_ID, filename="speaker_embeddings.pt")

# NOTE(security): torch.load unpickles the file, so only load embeddings from
# a repository you trust.
# map_location="cpu" lets the embeddings load on machines without a GPU even
# if they were saved from a CUDA device.
speaker_embeddings = torch.load(embeddings_path, map_location="cpu")

# The input text is lowercased before tokenization
# (presumably to match the training text normalization -- confirm).
text = "Báwó ni".lower()
inputs = processor(text=text, return_tensors="pt")

# The vocoder turns the generated spectrogram into a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

# Audio in notebook
from IPython.display import Audio

# Play the generated waveform at a 16 kHz sample rate.
Audio(speech.numpy(), rate=16000)
|
```