from typing import List, Union

from ..utils import is_torch_available
from .base import Pipeline


if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING
    from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan

DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan"


class TextToAudioPipeline(Pipeline):
""" |
|
Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This |
|
pipeline generates an audio file from an input text and optional other conditional inputs. |
|
|
|
Example: |
|
|
|
```python |
|
>>> from transformers import pipeline |
|
|
|
>>> pipe = pipeline(model="suno/bark-small") |
|
>>> output = pipe("Hey it's HuggingFace on the phone!") |
|
|
|
>>> audio = output["audio"] |
|
>>> sampling_rate = output["sampling_rate"] |
|
``` |
|
|
|
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) |
|
|
|
|
|
This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or |
|
`"text-to-audio"`. |
|
|
|
See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech). |
|
""" |
|
|
|
    def __init__(self, *args, vocoder=None, sampling_rate=None, **kwargs):
        super().__init__(*args, **kwargs)

        if self.framework == "tf":
            raise ValueError("The TextToAudioPipeline is only available in PyTorch.")

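        # Spectrogram models predict a spectrogram rather than a waveform, so they need a
        # vocoder; when the caller does not provide one, fall back to the default SpeechT5HifiGan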
        self.vocoder = None
        if self.model.__class__ in MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING.values():
            self.vocoder = (
                SpeechT5HifiGan.from_pretrained(DEFAULT_VOCODER_ID).to(self.model.device)
                if vocoder is None
                else vocoder
            )

        self.sampling_rate = sampling_rate
        if self.vocoder is not None:
            self.sampling_rate = self.vocoder.config.sampling_rate

        if self.sampling_rate is None:
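            # get the sampling rate from the model config or generation config, whichever defines it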
            config = self.model.config
            gen_config = self.model.__dict__.get("generation_config", None)
            if gen_config is not None:
                config.update(gen_config.to_dict())

            for sampling_rate_name in ["sample_rate", "sampling_rate"]:
                sampling_rate = getattr(config, sampling_rate_name, None)
                if sampling_rate is not None:
                    self.sampling_rate = sampling_rate

    def preprocess(self, text, **kwargs):
        if isinstance(text, str):
            text = [text]

        if self.model.config.model_type == "bark":
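            # Bark expects fixed-size inputs: pad to the maximum semantic input length with an
            # attention mask, and do not add special tokens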
            new_kwargs = {
                "max_length": self.model.generation_config.semantic_config.get("max_input_semantic_length", 256),
                "add_special_tokens": False,
                "return_attention_mask": True,
                "return_token_type_ids": False,
                "padding": "max_length",
            }

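            # user-provided kwargs take priority over these defaults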
            new_kwargs.update(kwargs)

            kwargs = new_kwargs

        output = self.tokenizer(text, **kwargs, return_tensors="pt")

        return output

    def _forward(self, model_inputs, **kwargs):
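        # some kwargs may be additional tensors (e.g. speaker embeddings) that must live on the
        # same device as the model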
        kwargs = self._ensure_tensor_on_device(kwargs, device=self.device)

        if self.model.can_generate():
            output = self.model.generate(**model_inputs, **kwargs)
        else:
            output = self.model(**model_inputs, **kwargs)[0]

        if self.vocoder is not None:
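            # in that case, the output is a spectrogram that needs to be converted into a waveform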
            output = self.vocoder(output)

        return output

    def __call__(self, text_inputs: Union[str, List[str]], **forward_params):
        """
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

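        Example (a minimal sketch; `"suno/bark-small"` and `do_sample` are placeholders for any
        supported checkpoint and any kwarg its generation/forward method accepts):

        ```python
        >>> from transformers import pipeline

        >>> pipe = pipeline("text-to-speech", model="suno/bark-small")
        >>> # generation kwargs are passed through the `forward_params` dict
        >>> output = pipe("Hello, world!", forward_params={"do_sample": True})
        >>> audio, sampling_rate = output["audio"], output["sampling_rate"]
        ```
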
        Args:
            text_inputs (`str` or `List[str]`):
                The text(s) to generate.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method.

        Return:
            A `dict` or a list of `dict`: The dictionaries have two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
        """
        return super().__call__(text_inputs, **forward_params)

    def _sanitize_parameters(
        self,
        preprocess_params=None,
        forward_params=None,
    ):
        if preprocess_params is None:
            preprocess_params = {}
        if forward_params is None:
            forward_params = {}
        postprocess_params = {}

        return preprocess_params, forward_params, postprocess_params

    def postprocess(self, waveform):
        output_dict = {}

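        # move the waveform off the accelerator and cast to float32 so it converts cleanly to numpy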
output_dict["audio"] = waveform.cpu().float().numpy() |
|
output_dict["sampling_rate"] = self.sampling_rate |
|
|
|
return output_dict |
|
|