magic-8-ball

Running on Zero

App Files Files Community

freddyaboulton HF staff committed on Aug 21

Commit

2acc3a1

•

1 Parent(s): f3f7cbd

Add code

Browse files

Files changed (3) hide show

README.md +2 -2
app.py +38 -181
streamer.py +133 -0

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
-title: Bedtime Story Reader
-emoji: 🌈
 colorFrom: red
 colorTo: indigo
 sdk: gradio

 ---
+title: Magic 8 Ball
+emoji: 🎱
 colorFrom: red
 colorTo: indigo
 sdk: gradio

app.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import io
 import math
-from queue import Queue
 from threading import Thread
-from typing import Optional
 import numpy as np
 import spaces
@@ -12,10 +11,8 @@ import torch
 from parler_tts import ParlerTTSForConditionalGeneration
 from pydub import AudioSegment
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
-from transformers.generation.streamers import BaseStreamer
 from huggingface_hub import InferenceClient
-import nltk
-nltk.download('punkt')
 device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
@@ -38,135 +35,6 @@ SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
-class ParlerTTSStreamer(BaseStreamer):
-    def __init__(
-        self,
-        model: ParlerTTSForConditionalGeneration,
-        device: Optional[str] = None,
-        play_steps: Optional[int] = 10,
-        stride: Optional[int] = None,
-        timeout: Optional[float] = None,
-    ):
-        """
-        Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
-        useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
-        Gradio demo).
-        Parameters:
-            model (`ParlerTTSForConditionalGeneration`):
-                The Parler-TTS model used to generate the audio waveform.
-            device (`str`, *optional*):
-                The torch device on which to run the computation. If `None`, will default to the device of the model.
-            play_steps (`int`, *optional*, defaults to 10):
-                The number of generation steps with which to return the generated audio array. Using fewer steps will
-                mean the first chunk is ready faster, but will require more codec decoding steps overall. This value
-                should be tuned to your device and latency requirements.
-            stride (`int`, *optional*):
-                The window (stride) between adjacent audio samples. Using a stride between adjacent audio samples reduces
-                the hard boundary between them, giving smoother playback. If `None`, will default to a value equivalent to
-                play_steps // 6 in the audio space.
-            timeout (`int`, *optional*):
-                The timeout for the audio queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
-                in `.generate()`, when it is called in a separate thread.
-        """
-        self.decoder = model.decoder
-        self.audio_encoder = model.audio_encoder
-        self.generation_config = model.generation_config
-        self.device = device if device is not None else model.device
-        # variables used in the streaming process
-        self.play_steps = play_steps
-        if stride is not None:
-            self.stride = stride
-        else:
-            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
-            self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
-        self.token_cache = None
-        self.to_yield = 0
-        # varibles used in the thread process
-        self.audio_queue = Queue()
-        self.stop_signal = None
-        self.timeout = timeout
-    def apply_delay_pattern_mask(self, input_ids):
-        # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler)
-        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
-            input_ids[:, :1],
-            bos_token_id=self.generation_config.bos_token_id,
-            pad_token_id=self.generation_config.decoder_start_token_id,
-            max_length=input_ids.shape[-1],
-        )
-        # apply the pattern mask to the input ids
-        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
-        # revert the pattern delay mask by filtering the pad token id
-        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
-        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
-        # append the frame dimension back to the audio codes
-        input_ids = input_ids[None, ...]
-        # send the input_ids to the correct device
-        input_ids = input_ids.to(self.audio_encoder.device)
-        decode_sequentially = (
-            self.generation_config.bos_token_id in input_ids
-            or self.generation_config.pad_token_id in input_ids
-            or self.generation_config.eos_token_id in input_ids
-        )
-        if not decode_sequentially:
-            output_values = self.audio_encoder.decode(
-                input_ids,
-                audio_scales=[None],
-            )
-        else:
-            sample = input_ids[:, 0]
-            sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
-            sample = sample[:, :, sample_mask]
-            output_values = self.audio_encoder.decode(sample[None, ...], [None])
-        audio_values = output_values.audio_values[0, 0]
-        return audio_values.cpu().float().numpy()
-    def put(self, value):
-        batch_size = value.shape[0] // self.decoder.num_codebooks
-        if batch_size > 1:
-            raise ValueError("ParlerTTSStreamer only supports batch size 1")
-        if self.token_cache is None:
-            self.token_cache = value
-        else:
-            self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
-        if self.token_cache.shape[-1] % self.play_steps == 0:
-            audio_values = self.apply_delay_pattern_mask(self.token_cache)
-            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
-            self.to_yield += len(audio_values) - self.to_yield - self.stride
-    def end(self):
-        """Flushes any remaining cache and appends the stop symbol."""
-        if self.token_cache is not None:
-            audio_values = self.apply_delay_pattern_mask(self.token_cache)
-        else:
-            audio_values = np.zeros(self.to_yield)
-        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
-    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
-        """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
-        self.audio_queue.put(audio, timeout=self.timeout)
-        if stream_end:
-            self.audio_queue.put(self.stop_signal, timeout=self.timeout)
-    def __iter__(self):
-        return self
-    def __next__(self):
-        value = self.audio_queue.get(timeout=self.timeout)
-        if not isinstance(value, np.ndarray) and value == self.stop_signal:
-            raise StopIteration()
-        else:
-            return value
 def numpy_to_mp3(audio_array, sampling_rate):
     # Normalize audio_array if it's floating-point
     if np.issubdtype(audio_array.dtype, np.floating):
@@ -195,75 +63,64 @@ def numpy_to_mp3(audio_array, sampling_rate):
 sampling_rate = model.audio_encoder.config.sampling_rate
 frame_rate = model.audio_encoder.config.frame_rate
-import random
-import datetime
 @spaces.GPU
-def generate_base(subject, setting):
-    messages = [{"role": "sytem", "content": ("You are an award-winning children's bedtime story author lauded for your inventive stories."
-                                              "You want to write a bed time story for your child. They will give you the subject and setting "
-                                              "and you will write the entire story. It should be targetted at children 5 and younger and take about "
-                                              "a minute to read")},
-                {"role": "user", "content": f"Please tell me a story about a {subject} in {setting}"}]
-    gr.Info("Generating story", duration=3)
     response = client.chat_completion(messages, max_tokens=1024, seed=random.randint(1, 5000))
-    gr.Info("Story Generated", duration=3)
-    story = response.choices[0].message.content
-    model_input = story.replace("\n", " ").strip()
-    model_input_tokens = nltk.sent_tokenize(model_input)
-    play_steps_in_s = 3.0
     play_steps = int(frame_rate * play_steps_in_s)
     description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
     description_tokens = tokenizer(description, return_tensors="pt").to(device)
-    for i, sentence in enumerate(model_input_tokens):
-        streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
-        print("SENTENCE", sentence)
-        prompt = tokenizer(sentence, return_tensors="pt").to(device)
-        generation_kwargs = dict(
-            input_ids=description_tokens.input_ids,
-            prompt_input_ids=prompt.input_ids,
-            streamer=streamer,
-            do_sample=True,
-            temperature=1.0,
-            min_new_tokens=10,
-        )
-        set_seed(SEED)
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        for new_audio in streamer:
-            if i == 0:
-                gr.Info("Reading story", duration=3)
-            print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-            yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 with gr.Blocks() as block:
     gr.HTML(
         f"""
-        <h1> Bedtime Story Reader 😴🔊 </h1>
-        <p> Powered by <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a>
         """
     )
     with gr.Group():
         with gr.Row():
-            subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess", "Dog", "Cat"], label="Subject")
-            setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom", "Jungle", "Underwater", "Pirate Ship"], label="Setting")
         with gr.Row():
-            run_button = gr.Button("Generate Story", variant="primary")
-    with gr.Row():
-        with gr.Group():
-            audio_out = gr.Audio(label="Bed time story",  streaming=True, autoplay=True)
-            story = gr.Textbox(label="Story")
-    inputs = [subject, setting]
-    outputs = [story, audio_out]
-    run_button.click(fn=generate_base, inputs=inputs, outputs=outputs)
 block.launch()

 import io
 import math
 from threading import Thread
+import random
 import numpy as np
 import spaces
 from parler_tts import ParlerTTSForConditionalGeneration
 from pydub import AudioSegment
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 from huggingface_hub import InferenceClient
+from streamer import ParlerTTSStreamer
 device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
 SEED = 42
 def numpy_to_mp3(audio_array, sampling_rate):
     # Normalize audio_array if it's floating-point
     if np.issubdtype(audio_array.dtype, np.floating):
 sampling_rate = model.audio_encoder.config.sampling_rate
 frame_rate = model.audio_encoder.config.frame_rate
 @spaces.GPU
+def generate_base(audio):
+    question = client.audtomatic_speech_recognition(audio)
+    messages = [{"role": "sytem", "content": ("You are a magic 8 ball."
+                                              "Someone will present to you a situation or question and your job "
+                                              "is to answer with a cryptic addage or proverb such as "
+                                              "'curiosity killed the cat' or 'The early bird gets the worm'.")},
+                {"role": "user", "content": f"Please tell me what to do about {question}"}]
     response = client.chat_completion(messages, max_tokens=1024, seed=random.randint(1, 5000))
+    response = response.choices[0].message.content
+    play_steps_in_s = 1.0
     play_steps = int(frame_rate * play_steps_in_s)
     description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
     description_tokens = tokenizer(description, return_tensors="pt").to(device)
+    streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
+    prompt = tokenizer(sentence, return_tensors="pt").to(device)
+    generation_kwargs = dict(
+        input_ids=description_tokens.input_ids,
+        prompt_input_ids=prompt.input_ids,
+        streamer=streamer,
+        do_sample=True,
+        temperature=1.0,
+        min_new_tokens=10,
+    )
+    set_seed(SEED)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    for new_audio in streamer:
+        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
+        yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
+css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
+                      .my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""
 with gr.Blocks() as block:
     gr.HTML(
         f"""
+        <h1 style='text-align: center;'> Magic 8 Ball 🎱 </h1>
+        <p style='text-align: center;'> Powered by <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a>
         """
     )
     with gr.Group():
         with gr.Row():
+            audio_out = gr.Audio(visble=False, streaming=True)
+            answer = gr.Textbox(label="Answer")
         with gr.Row():
+            audio_in = gr.Audio(label="Speak you question", sources="microphone", format="filepath")
+    audio_in.stop_recording(fn=generate_base, inputs=audio_in, outputs=[answer, audio_out])
 block.launch()

streamer.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import math
+from queue import Queue
+from typing import Optional
+import numpy as np
+import torch
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers.generation.streamers import BaseStreamer
+class ParlerTTSStreamer(BaseStreamer):
+    def __init__(
+        self,
+        model: ParlerTTSForConditionalGeneration,
+        device: Optional[str] = None,
+        play_steps: Optional[int] = 10,
+        stride: Optional[int] = None,
+        timeout: Optional[float] = None,
+    ):
+        """
+        Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
+        useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
+        Gradio demo).
+        Parameters:
+            model (`ParlerTTSForConditionalGeneration`):
+                The Parler-TTS model used to generate the audio waveform.
+            device (`str`, *optional*):
+                The torch device on which to run the computation. If `None`, will default to the device of the model.
+            play_steps (`int`, *optional*, defaults to 10):
+                The number of generation steps with which to return the generated audio array. Using fewer steps will
+                mean the first chunk is ready faster, but will require more codec decoding steps overall. This value
+                should be tuned to your device and latency requirements.
+            stride (`int`, *optional*):
+                The window (stride) between adjacent audio samples. Using a stride between adjacent audio samples reduces
+                the hard boundary between them, giving smoother playback. If `None`, will default to a value equivalent to
+                play_steps // 6 in the audio space.
+            timeout (`float`, *optional*):
+                The timeout for the audio queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
+                in `.generate()`, when it is called in a separate thread.
+        """
+        self.decoder = model.decoder
+        self.audio_encoder = model.audio_encoder
+        self.generation_config = model.generation_config
+        self.device = device if device is not None else model.device
+        # variables used in the streaming process
+        self.play_steps = play_steps
+        if stride is not None:
+            self.stride = stride
+        else:
+            # audio samples per codec frame, used to express the default stride in samples
+            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
+            self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
+        # all token ids received so far (None until the first `put`)
+        self.token_cache = None
+        # number of decoded audio samples already handed to the queue
+        self.to_yield = 0
+        # variables used in the thread process
+        self.audio_queue = Queue()
+        # sentinel placed on the queue to mark the end of the stream
+        self.stop_signal = None
+        self.timeout = timeout
+    def apply_delay_pattern_mask(self, input_ids):
+        """
+        Revert the decoder's delay pattern on `input_ids` and decode the resulting audio codes with the audio encoder.
+        Returns `audio_values[0, 0]` of the decoder output as a float32 NumPy array on the CPU.
+        """
+        # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler)
+        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
+            input_ids[:, :1],
+            bos_token_id=self.generation_config.bos_token_id,
+            pad_token_id=self.generation_config.decoder_start_token_id,
+            max_length=input_ids.shape[-1],
+        )
+        # apply the pattern mask to the input ids
+        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
+        # revert the pattern delay mask by filtering the pad token id
+        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
+        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
+        # append the frame dimension back to the audio codes
+        input_ids = input_ids[None, ...]
+        # send the input_ids to the correct device
+        input_ids = input_ids.to(self.audio_encoder.device)
+        # special tokens still present in the codes cannot be decoded as-is, so take the filtered path below
+        decode_sequentially = (
+            self.generation_config.bos_token_id in input_ids
+            or self.generation_config.pad_token_id in input_ids
+            or self.generation_config.eos_token_id in input_ids
+        )
+        if not decode_sequentially:
+            output_values = self.audio_encoder.decode(
+                input_ids,
+                audio_scales=[None],
+            )
+        else:
+            sample = input_ids[:, 0]
+            # keep only the time steps where every code is below the codebook size
+            sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
+            sample = sample[:, :, sample_mask]
+            output_values = self.audio_encoder.decode(sample[None, ...], [None])
+        audio_values = output_values.audio_values[0, 0]
+        return audio_values.cpu().float().numpy()
+    def put(self, value):
+        """
+        Append newly generated token ids to the cache. Whenever the cached length is a multiple of `play_steps`, decode
+        the whole cache and enqueue the audio that has not been emitted yet, holding back the last `stride` samples.
+        Raises a `ValueError` if the batch size is greater than 1.
+        """
+        batch_size = value.shape[0] // self.decoder.num_codebooks
+        if batch_size > 1:
+            raise ValueError("ParlerTTSStreamer only supports batch size 1")
+        if self.token_cache is None:
+            self.token_cache = value
+        else:
+            self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
+        if self.token_cache.shape[-1] % self.play_steps == 0:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
+            # net effect: to_yield becomes len(audio_values) - stride, i.e. the end of the slice just emitted
+            self.to_yield += len(audio_values) - self.to_yield - self.stride
+    def end(self):
+        """Flushes any remaining cache and appends the stop symbol."""
+        if self.token_cache is not None:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+        else:
+            # nothing was generated: the slice below is then empty
+            audio_values = np.zeros(self.to_yield)
+        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
+    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
+        """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
+        self.audio_queue.put(audio, timeout=self.timeout)
+        if stream_end:
+            self.audio_queue.put(self.stop_signal, timeout=self.timeout)
+    def __iter__(self):
+        """Return the streamer itself; audio chunks are pulled through `__next__`."""
+        return self
+    def __next__(self):
+        """Return the next queued audio chunk, waiting up to `timeout`; raise `StopIteration` on the stop signal."""
+        value = self.audio_queue.get(timeout=self.timeout)
+        # the isinstance check keeps NumPy arrays out of the `==` comparison with the sentinel
+        if not isinstance(value, np.ndarray) and value == self.stop_signal:
+            raise StopIteration()
+        else:
+            return value