AkhilTolani committed
Commit e658e7c
Parent(s): 09acd34

Update app.py

Files changed (1)
  1. app.py +48 -177
app.py CHANGED
@@ -1,27 +1,23 @@
- import math
- from queue import Queue
- from threading import Thread
- from typing import Optional
-
- import numpy as np
  import spaces
  import gradio as gr
  import torch
+ from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
+ from string import punctuation
+ import re
+

  from parler_tts import ParlerTTSForConditionalGeneration
  from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
- from transformers.generation.streamers import BaseStreamer

- device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
- torch_dtype = torch.float16 if device != "cpu" else torch.float32
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"

  custom_repo_id = "AkhilTolani/vocals-english"

- custom_model = ParlerTTSForConditionalGeneration.from_pretrained(custom_repo_id).to(device)
+ model = ParlerTTSForConditionalGeneration.from_pretrained(custom_repo_id).to(device)

  tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")

- SEED = 456
+ SEED = 42

  default_text = "Raindrops on the window pane, mirroring my tears again. Autumn leaves are falling down, just like my world without you around. They called you different, a love forbidden, but in your eyes I saw my future written."
  default_description = "A man delivers his speech in a quiet, enclosed space with exceptional clarity, maintaining a very monotone tone of voice, at a relatively slow pace. His pitch is slightly low."
@@ -29,167 +25,44 @@ default_description = "A man delivers his speech in a quiet, enclosed space with
  examples = [
      [
          "Raindrops on the window pane, mirroring my tears again. Autumn leaves are falling down, just like my world without you around. They called you different, a love forbidden, but in your eyes I saw my future written.",
-         "'A woman speaks with a somewhat monotone tone, delivering her words at a moderate pace, in a recording that sounds quite clear but slightly confined. Her voice has a slightly high pitch.'",
-         10.0,
-     ],
+         "A woman speaks with a somewhat monotone tone, delivering her words at a moderate pace, in a recording that sounds quite clear but slightly confined. Her voice has a slightly high pitch.",
+     ]
  ]

-
- class ParlerTTSStreamer(BaseStreamer):
-     def __init__(
-         self,
-         model: ParlerTTSForConditionalGeneration,
-         device: Optional[str] = None,
-         play_steps: Optional[int] = 10,
-         stride: Optional[int] = None,
-         timeout: Optional[float] = None,
-     ):
-         """
-         Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
-         useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
-         Gradio demo).
-         Parameters:
-             model (`ParlerTTSForConditionalGeneration`):
-                 The Parler-TTS model used to generate the audio waveform.
-             device (`str`, *optional*):
-                 The torch device on which to run the computation. If `None`, will default to the device of the model.
-             play_steps (`int`, *optional*, defaults to 10):
-                 The number of generation steps with which to return the generated audio array. Using fewer steps will
-                 mean the first chunk is ready faster, but will require more codec decoding steps overall. This value
-                 should be tuned to your device and latency requirements.
-             stride (`int`, *optional*):
-                 The window (stride) between adjacent audio samples. Using a stride between adjacent audio samples reduces
-                 the hard boundary between them, giving smoother playback. If `None`, will default to a value equivalent to
-                 play_steps // 6 in the audio space.
-             timeout (`int`, *optional*):
-                 The timeout for the audio queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
-                 in `.generate()`, when it is called in a separate thread.
-         """
-         self.decoder = model.decoder
-         self.audio_encoder = model.audio_encoder
-         self.generation_config = model.generation_config
-         self.device = device if device is not None else model.device
-
-         # variables used in the streaming process
-         self.play_steps = play_steps
-         if stride is not None:
-             self.stride = stride
-         else:
-             hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
-             self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
-         self.token_cache = None
-         self.to_yield = 0
-
-         # variables used in the thread process
-         self.audio_queue = Queue()
-         self.stop_signal = None
-         self.timeout = timeout
-
-     def apply_delay_pattern_mask(self, input_ids):
-         # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler)
-         _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
-             input_ids[:, :1],
-             bos_token_id=self.generation_config.bos_token_id,
-             pad_token_id=self.generation_config.decoder_start_token_id,
-             max_length=input_ids.shape[-1],
-         )
-         # apply the pattern mask to the input ids
-         input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
-
-         # revert the pattern delay mask by filtering the pad token id
-         mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
-         input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
-         # append the frame dimension back to the audio codes
-         input_ids = input_ids[None, ...]
-
-         # send the input_ids to the correct device
-         input_ids = input_ids.to(self.audio_encoder.device)
-
-         decode_sequentially = (
-             self.generation_config.bos_token_id in input_ids
-             or self.generation_config.pad_token_id in input_ids
-             or self.generation_config.eos_token_id in input_ids
-         )
-         if not decode_sequentially:
-             output_values = self.audio_encoder.decode(
-                 input_ids,
-                 audio_scales=[None],
-             )
-         else:
-             sample = input_ids[:, 0]
-             sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
-             sample = sample[:, :, sample_mask]
-             output_values = self.audio_encoder.decode(sample[None, ...], [None])
-
-         audio_values = output_values.audio_values[0, 0]
-         return audio_values.cpu().float().numpy()
-
-     def put(self, value):
-         batch_size = value.shape[0] // self.decoder.num_codebooks
-         if batch_size > 1:
-             raise ValueError("ParlerTTSStreamer only supports batch size 1")
-
-         if self.token_cache is None:
-             self.token_cache = value
-         else:
-             self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
-
-         if self.token_cache.shape[-1] % self.play_steps == 0:
-             audio_values = self.apply_delay_pattern_mask(self.token_cache)
-             self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
-             self.to_yield += len(audio_values) - self.to_yield - self.stride
-
-     def end(self):
-         """Flushes any remaining cache and appends the stop symbol."""
-         if self.token_cache is not None:
-             audio_values = self.apply_delay_pattern_mask(self.token_cache)
-         else:
-             audio_values = np.zeros(self.to_yield)
-
-         self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
-
-     def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
-         """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
-         self.audio_queue.put(audio, timeout=self.timeout)
-         if stream_end:
-             self.audio_queue.put(self.stop_signal, timeout=self.timeout)
-
-     def __iter__(self):
-         return self
-
-     def __next__(self):
-         value = self.audio_queue.get(timeout=self.timeout)
-         if not isinstance(value, np.ndarray) and value == self.stop_signal:
-             raise StopIteration()
-         else:
-             return value
-
-
- sampling_rate = custom_model.audio_encoder.config.sampling_rate
- frame_rate = custom_model.audio_encoder.config.frame_rate
+ number_normalizer = EnglishNumberNormalizer()
+
+ def preprocess(text):
+     text = number_normalizer(text).strip()
+     text = text.replace("-", " ")
+     if text[-1] not in punctuation:
+         text = f"{text}."
+
+     abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'
+
+     def separate_abb(chunk):
+         chunk = chunk.replace(".", "")
+         print(chunk)
+         return " ".join(chunk)
+
+     abbreviations = re.findall(abbreviations_pattern, text)
+     for abv in abbreviations:
+         if abv in text:
+             text = text.replace(abv, separate_abb(abv))
+     return text

  @spaces.GPU
- def generate_base(text, description, play_steps_in_s=2.0):
-     play_steps = int(frame_rate * play_steps_in_s)
-     streamer = ParlerTTSStreamer(custom_model, device=device, play_steps=play_steps)
-
+ def gen_tts(text, description):
      inputs = tokenizer(description, return_tensors="pt").to(device)
-     prompt = tokenizer(text, return_tensors="pt").to(device)
+     prompt = tokenizer(preprocess(text), return_tensors="pt").to(device)

-     generation_kwargs = dict(
-         input_ids=inputs.input_ids,
-         prompt_input_ids=prompt.input_ids,
-         streamer=streamer,
-         min_length=20,
+     set_seed(SEED)
+     generation = model.generate(
+         input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, do_sample=True, temperature=1.0
      )
+     audio_arr = generation.cpu().numpy().squeeze()

-     set_seed(SEED)
-     thread = Thread(target=custom_model.generate, kwargs=generation_kwargs)
-     thread.start()
-
-     for new_audio in streamer:
-         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-         yield sampling_rate, new_audio
+     return model.audio_encoder.config.sampling_rate, audio_arr  # gr.Audio(type="numpy") expects (sample_rate, data)


  css = """
  #share-btn-container {
@@ -256,20 +129,18 @@ with gr.Blocks(css=css) as block:
          </p>
          """
      )
-     with gr.Tab("Vocals"):
-         with gr.Row():
-             with gr.Column():
-                 input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
-                 description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
-                 play_seconds = gr.Slider(3.0, 15.0, value=10.0, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps")
-                 run_button = gr.Button("Generate Audio", variant="primary")
-             with gr.Column():
-                 audio_out = gr.Audio(label="Parler-TTS + Vocals", type="numpy", elem_id="audio_out", streaming=True, autoplay=True)
-
-         inputs = [input_text, description, play_seconds]
-         outputs = [audio_out]
-         gr.Examples(examples=examples, fn=generate_base, inputs=inputs, outputs=outputs, cache_examples=False)
-         run_button.click(fn=generate_base, inputs=inputs, outputs=outputs, queue=True)
+     with gr.Row():
+         with gr.Column():
+             input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
+             description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
+             run_button = gr.Button("Generate Audio", variant="primary")
+         with gr.Column():
+             audio_out = gr.Audio(label="Parler-TTS + Vocals", type="numpy", elem_id="audio_out")
+
+     inputs = [input_text, description]
+     outputs = [audio_out]
+     gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
+     run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)

  block.queue()
  block.launch(share=True)
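
A note on the delay pattern that the removed apply_delay_pattern_mask handled: as its inline comment says, each codebook's prediction is offset by one step relative to the previous codebook, and decoding reverts the offset by filtering out the placeholder tokens. A small runnable sketch of that layout, using -1 as a stand-in for the BOS/pad placeholder (illustrative values, not the model's real token ids):

import numpy as np

# Codebook k is shifted right by k steps; -1 marks the placeholder slots that
# the real code fills with BOS/pad ids and strips again before codec decoding.
num_codebooks, steps = 4, 6
tokens = np.arange(steps)
pattern = np.full((num_codebooks, steps), -1)
for k in range(num_codebooks):
    pattern[k, k:] = tokens[: steps - k]
print(pattern)
# [[ 0  1  2  3  4  5]
#  [-1  0  1  2  3  4]
#  [-1 -1  0  1  2  3]
#  [-1 -1 -1  0  1  2]]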
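For context on the removed streaming path: ParlerTTSStreamer implements the BaseStreamer contract from transformers, in which generate() pushes each new token batch through put() and calls end() when finished, while a consumer iterates the queue until a stop sentinel appears. A minimal sketch of that producer/consumer pattern (MinimalStreamer and the string chunks are illustrative, not part of the code above):

from queue import Queue
from threading import Thread

class MinimalStreamer:
    def __init__(self, timeout=None):
        self.queue = Queue()
        self.stop_signal = None  # sentinel marking end of stream
        self.timeout = timeout

    def put(self, value):
        # called by generate() for every newly generated token batch
        self.queue.put(value, timeout=self.timeout)

    def end(self):
        # called by generate() once generation is complete
        self.queue.put(self.stop_signal, timeout=self.timeout)

    def __iter__(self):
        return self

    def __next__(self):
        value = self.queue.get(timeout=self.timeout)
        if value is self.stop_signal:
            raise StopIteration()
        return value

# The producer runs in a background thread, as generate() did in the removed
# code; the main thread consumes chunks as they arrive.
streamer = MinimalStreamer()

def produce():
    for chunk in ("chunk-0", "chunk-1", "chunk-2"):
        streamer.put(chunk)
    streamer.end()

Thread(target=produce).start()
for chunk in streamer:
    print(chunk)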
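The preprocess() helper added in this commit normalizes numbers to words, pads terminal punctuation, and spaces out all-caps abbreviations. A worked trace on a hypothetical input string (illustrative, not from the commit):

# Tracing preprocess("NASA launched 2 rockets") step by step:
#   number normalization: "NASA launched 2 rockets"   -> "NASA launched two rockets"
#   terminal punctuation: "NASA launched two rockets" -> "NASA launched two rockets."
#   abbreviation spacing: "NASA" matches r'\b[A-Z][A-Z\.]+\b' and is split out
print(preprocess("NASA launched 2 rockets"))
# expected: "N A S A launched two rockets."  (separate_abb also prints "NASA" to stdout)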
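As a usage sketch, gen_tts() can also be called outside the Gradio UI; the (sample_rate, waveform) pair it returns writes straight to disk. The soundfile dependency and the output filename are assumptions for illustration, not part of this commit:

import soundfile as sf

# gen_tts() returns (sample_rate, waveform); save the result as a WAV file.
sample_rate, audio = gen_tts(default_text, default_description)
sf.write("vocals_sample.wav", audio, sample_rate)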