Fabrice-TIERCELIN committed
Commit 39767da
Parent: c75a416

Merge the contributions from Tango 2 full and the changes on Tango 2


Hi @soujanyaporia,

With this PR, this Space keeps the new pipeline and also gains the new features from _Tango 2 full_, both sketched below:
- Simultaneous generations (1 to 3 output files per run)
- Seed parameter (with optional randomization)
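
In short, the two features boil down to the following minimal sketch (`pipe` here stands for the `Tango2Pipeline` instance built in app.py; the exact code is in the diff below):

```python
import random
import torch

max_64_bit_int = 2**63 - 1

def generate(prompt, steps, guidance, output_number, seed=None):
    # Seed parameter: a fixed seed makes a run reproducible; None falls
    # back to a random draw (the "Randomize seed" checkbox in the UI).
    if seed is None:
        seed = random.randint(0, max_64_bit_int)
    random.seed(seed)
    torch.manual_seed(seed)
    # Simultaneous generations: `samples` asks the pipeline for
    # 1 to 3 waveforms in a single call.
    # `pipe` is assumed: the Tango2Pipeline instance built in app.py.
    return pipe(prompt, steps, guidance, samples=output_number)
```

The UI then chains `update_seed` → `check` → `update_output` → `gradio_generate`, so the prompt is validated and the right number of audio players is shown before generation starts.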

_Successfully tested_

Regards,
Fabrice

Files changed (1)
  1. app.py +185 -54
app.py CHANGED
````diff
@@ -1,4 +1,5 @@
 import gradio as gr
+import random
 import json
 import torch
 import wavio
@@ -23,7 +24,6 @@ from tqdm import tqdm
 
 
 
-
 class Tango2Pipeline(DiffusionPipeline):
 
 
@@ -169,6 +169,7 @@ class Tango2Pipeline(DiffusionPipeline):
 
         return AudioPipelineOutput(audios=wave)
 
+max_64_bit_int = 2**63 - 1
 
 # Automatic device detection
 if torch.cuda.is_available():
@@ -249,21 +250,73 @@ pipe = Tango2Pipeline(vae=tango.vae,
                       scheduler=tango.scheduler
                       )
 
+
+def update_seed(is_randomize_seed, seed):
+    if is_randomize_seed:
+        return random.randint(0, max_64_bit_int)
+    return seed
+
+def check(
+    prompt,
+    output_format,
+    output_number,
+    steps,
+    guidance,
+    is_randomize_seed,
+    seed
+):
+    if prompt is None or prompt == "":
+        raise gr.Error("Please provide a prompt input.")
+    if output_number not in [1, 2, 3]:
+        raise gr.Error("Please ask for 1, 2 or 3 output files.")
+
+def update_output(output_format, output_number):
+    return [
+        gr.update(format = output_format),
+        gr.update(format = output_format, visible = (2 <= output_number)),
+        gr.update(format = output_format, visible = (output_number == 3))
+    ]
+
+def generate_output(output_wave, output_format, output_number, output_index):
+    if output_number < output_index:
+        return gr.update(format = output_format, visible = False)
+
+    output_wave = output_wave.audios[output_index - 1]
+    output_filename = "tmp" + str(output_index) + ".wav"
+    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
+
+    if output_format == "mp3":
+        AudioSegment.from_wav(output_filename).export("tmp" + str(output_index) + ".mp3", format = "mp3")
+        output_filename = "tmp" + str(output_index) + ".mp3"
+
+    return gr.update(value = output_filename, format = output_format, visible = True)
 
 @spaces.GPU(duration=60)
-def gradio_generate(prompt, output_format, steps, guidance):
-    output_wave = pipe(prompt,steps,guidance) ## Using pipeliine automatically uses flash attention for torch2.0 above
+def gradio_generate(
+    prompt,
+    output_format,
+    output_number,
+    steps,
+    guidance,
+    is_randomize_seed,
+    seed
+):
+    if seed is None:
+        seed = random.randint(0, max_64_bit_int)
+
+    random.seed(seed)
+    torch.manual_seed(seed)
+
+    output_wave = pipe(prompt, steps, guidance, samples = output_number) ## Using the pipeline automatically uses flash attention on torch 2.0 and above
+
     #output_wave = tango.generate(prompt, steps, guidance)
     # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
-    output_wave = output_wave.audios[0]
-    output_filename = "temp.wav"
-    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
-
-    if (output_format == "mp3"):
-        AudioSegment.from_wav("temp.wav").export("temp.mp3", format = "mp3")
-        output_filename = "temp.mp3"
 
-    return output_filename
+    return [
+        generate_output(output_wave, output_format, output_number, 1),
+        generate_output(output_wave, output_format, output_number, 2),
+        generate_output(output_wave, output_format, output_number, 3)
+    ]
 
 # description_text = """
 # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
@@ -285,53 +338,131 @@ def gradio_generate(prompt, output_format, steps, guidance):
 # <p/>
 # """
 description_text = """
+<h1><center>Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization</center></h1>
 <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
 Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>
 <br/><br/> This is the demo for Tango2 for text to audio generation: <a href="https://arxiv.org/abs/2404.09956">Read our paper.</a>
 <p/>
 """
-# Gradio input and output components
-input_text = gr.Textbox(lines=2, label="Prompt")
-output_format = gr.Radio(label = "Output format", info = "The file you can dowload", choices = ["mp3", "wav"], value = "wav")
-output_audio = gr.Audio(label="Generated Audio", type="filepath")
-denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
-guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
 
 # Gradio interface
-gr_interface = gr.Interface(
-    fn=gradio_generate,
-    inputs=[input_text, output_format, denoising_steps, guidance_scale],
-    outputs=[output_audio],
-    title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
-    description=description_text,
-    allow_flagging=False,
-    examples=[
-        ["Quiet speech and then and airplane flying away"],
-        ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing"],
-        ["Ducks quack and water splashes with some animal screeching in the background"],
-        ["Describe the sound of the ocean"],
-        ["A woman and a baby are having a conversation"],
-        ["A man speaks followed by a popping noise and laughter"],
-        ["A cup is filled from a faucet"],
-        ["An audience cheering and clapping"],
-        ["Rolling thunder with lightning strikes"],
-        ["A dog barking and a cat mewing and a racing car passes by"],
-        ["Gentle water stream, birds chirping and sudden gun shot"],
-        ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone."],
-        ["A dog barking"],
-        ["A cat meowing"],
-        ["Wooden table tapping sound while water pouring"],
-        ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
-        ["two gunshots followed by birds flying away while chirping"],
-        ["Whistling with birds chirping"],
-        ["A person snoring"],
-        ["Motor vehicles are driving with loud engines and a person whistles"],
-        ["People cheering in a stadium while thunder and lightning strikes"],
-        ["A helicopter is in flight"],
-        ["A dog barking and a man talking and a racing car passes by"],
-    ],
-    cache_examples="lazy", # Turn on to cache.
-)
-
-# Launch Gradio app
-gr_interface.queue(10).launch()
+with gr.Blocks() as interface:
+    gr.HTML(description_text)
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(lines=2, label="Prompt")
+            output_format = gr.Radio(label = "Output format", info = "The file you can download", choices = ["mp3", "wav"], value = "wav")
+            output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 1, step = 1, interactive = True)
+            denoising_steps = gr.Slider(minimum=10, maximum=200, value=100, step=1, label="Steps", interactive=True)
+            guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
+            randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, the result is always different")
+            seed = gr.Slider(minimum = 0, maximum = max_64_bit_int, step = 1, randomize = True, label = "Seed")
+
+            submit = gr.Button("Generate", variant = "primary")
+
+        with gr.Column():
+            output_audio_1 = gr.Audio(label = "Generated Audio #1/3", format = "wav", type = "numpy")
+            output_audio_2 = gr.Audio(label = "Generated Audio #2/3", format = "wav", type = "numpy")
+            output_audio_3 = gr.Audio(label = "Generated Audio #3/3", format = "wav", type = "numpy")
+
+    submit.click(fn = update_seed, inputs = [
+        randomize_seed,
+        seed
+    ], outputs = [
+        seed
+    ], queue = False, show_progress = False).then(fn = check, inputs = [
+        input_text,
+        output_format,
+        output_number,
+        denoising_steps,
+        guidance_scale,
+        randomize_seed,
+        seed
+    ], outputs = [], queue = False, show_progress = False).success(fn = update_output, inputs = [
+        output_format,
+        output_number
+    ], outputs = [
+        output_audio_1,
+        output_audio_2,
+        output_audio_3
+    ], queue = False, show_progress = False).success(fn = gradio_generate, inputs = [
+        input_text,
+        output_format,
+        output_number,
+        denoising_steps,
+        guidance_scale,
+        randomize_seed,
+        seed
+    ], outputs = [
+        output_audio_1,
+        output_audio_2,
+        output_audio_3
+    ], scroll_to_output = True)
+
+    gr.Examples(
+        fn = gradio_generate,
+        inputs = [
+            input_text,
+            output_format,
+            output_number,
+            denoising_steps,
+            guidance_scale,
+            randomize_seed,
+            seed
+        ],
+        outputs = [
+            output_audio_1,
+            output_audio_2,
+            output_audio_3
+        ],
+        examples = [
+            ["Quiet speech and then an airplane flying away", "wav", 3, 200, 3, False, 123],
+            ["A bicycle pedaling on dirt and gravel followed by a man speaking then laughing", "wav", 3, 200, 3, False, 123],
+            ["Ducks quack and water splashes with some animal screeching in the background", "wav", 3, 200, 3, False, 123],
+            ["Describe the sound of the ocean", "wav", 3, 200, 3, False, 123],
+            ["A woman and a baby are having a conversation", "wav", 3, 200, 3, False, 123],
+            ["A man speaks followed by a popping noise and laughter", "wav", 3, 200, 3, False, 123],
+            ["A cup is filled from a faucet", "wav", 3, 200, 3, False, 123],
+            ["An audience cheering and clapping", "wav", 3, 200, 3, False, 123],
+            ["Rolling thunder with lightning strikes", "wav", 3, 200, 3, False, 123],
+            ["A dog barking and a cat mewing and a racing car passes by", "wav", 3, 200, 3, False, 123],
+            ["Gentle water stream, birds chirping and sudden gun shot", "wav", 3, 200, 3, False, 123],
+            ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone.", "wav", 3, 200, 3, False, 123],
+            ["A dog barking", "wav", 3, 200, 3, False, 123],
+            ["A cat meowing", "wav", 3, 200, 3, False, 123],
+            ["Wooden table tapping sound while water pouring", "wav", 3, 200, 3, False, 123],
+            ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker", "wav", 3, 200, 3, False, 123],
+            ["two gunshots followed by birds flying away while chirping", "wav", 3, 200, 3, False, 123],
+            ["Whistling with birds chirping", "wav", 3, 200, 3, False, 123],
+            ["A person snoring", "wav", 3, 200, 3, False, 123],
+            ["Motor vehicles are driving with loud engines and a person whistles", "wav", 3, 200, 3, False, 123],
+            ["People cheering in a stadium while thunder and lightning strikes", "wav", 3, 200, 3, False, 123],
+            ["A helicopter is in flight", "wav", 3, 200, 3, False, 123],
+            ["A dog barking and a man talking and a racing car passes by", "wav", 3, 200, 3, False, 123],
+        ],
+        cache_examples = "lazy",
+    )
+
+    gr.Markdown(
+        """
+        ## How to prompt your sound
+        You can use round brackets to increase the importance of a part:
+        ```
+        Peaceful and (calming) ambient music with singing bowl and other instruments
+        ```
+        You can use several levels of round brackets to increase the importance of a part even more:
+        ```
+        (Peaceful) and ((calming)) ambient music with singing bowl and other instruments
+        ```
+        You can use a number instead of several round brackets:
+        ```
+        (Peaceful:1.5) and ((calming)) ambient music with singing bowl and other instruments
+        ```
+        You can do the same thing with square brackets to decrease the importance of a part:
+        ```
+        (Peaceful:1.5) and ((calming)) ambient music with [singing:2] bowl and other instruments
+        ```
+        """
+    )
+
+interface.queue(10).launch()
````