Nuno-Tome committed on
Commit 7f9420f · 1 Parent(s): f16f614

no message

Files changed (3)
  1. app.py +108 -44
  2. app_new.py +86 -0
  3. requirements.txt +9 -1
app.py CHANGED
@@ -1,38 +1,68 @@
  import gradio as gr
- from gradio_client import Client
-
-
- DEBUG_MODE = True
- SAS_SWITCH = True
-
- '''
- Function to get the speech from the text
- @params: text: str: The text to be converted to speech
- @params: voice: str: The voice to be used for the speech
- @return: result: str: The speech from the text
- '''
- def get_speech(text, voice):
-
-     '''
-     For now we are using external space to get the result.
-     In future we will use our own model to get be more independent
-     '''
-     client = Client("https://collabora-whisperspeech.hf.space/")
-     result = client.predict(
-         # str in 'Enter multilingual text📝' Textbox component
-         text,
-         # filepath in 'Upload or Record Speaker Audio (optional)🌬️💬' Audio component
-         voice,
-         "",  # str in 'alternatively, you can paste in an audio file URL:' Textbox component
-         14,  # float (numeric value between 10 and 15) in 'Tempo (in characters per second)' Slider component
-         api_name="/whisper_speech_demo"
-     )
-     if DEBUG_MODE:
-         print(result)
-     return result

  def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
      if isinstance(speaker, (str, Path)): speaker = pipe.extract_spk_emb(speaker)
      elif speaker_url: speaker = pipe.extract_spk_emb(speaker_url)
@@ -45,24 +75,58 @@ def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
      audio = pipe.vocoder.decode(atoks)
      return audio.cpu()

- with gr.Blocks() as demo:
-     with gr.Row():
-         text_input = gr.Textbox(label="Enter multilingual text📝")
-         cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
-                         label="Speed (in characters per second)")
-
-     with gr.Row(equal_height=True):
-         speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
                                   sources=["upload", "microphone"],
                                   type='filepath')
-         url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
-         gr.Markdown(" \n ")  # fixes the bottom overflow from Audio
-         generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
-     with gr.Column(scale=1):
-         output_audio = gr.Audio(label="WhisperSpeech says…")
-
- demo.launch(server_port=46007)
+ import spaces
  import gradio as gr
+ import io
+ import os
+ import re
+ import torch
+ import torchaudio
+ from pathlib import Path
+ from whisperspeech.pipeline import Pipeline
+
+ DEVEL=os.environ.get('DEVEL', False)
+
+ title = """
+ <picture>
+   <source srcset="https://huggingface.co/spaces/collabora/whisperspeech/resolve/main/dark-banner.png" media="(prefers-color-scheme: dark)" />
+   <img alt="WhisperSpeech banner with Collabora and LAION logos" src="https://huggingface.co/spaces/collabora/whisperspeech/resolve/main/light-banner.png" style="width: 60%; margin: 0 auto;" />
+ </picture>
+ # Welcome to Collabora's WhisperSpeech
+ WhisperSpeech is an Open Source text-to-speech system built by Collabora and LAION by inverting Whisper.
+ The model is fully open and you can run it on your local hardware. It's like **Stable Diffusion but for speech**
+ – both powerful and easily customizable.
+ [You can contribute to WhisperSpeech on Github.](https://github.com/collabora/WhisperSpeech)
+ You can also join the discussion on Discord [![](https://dcbadge.vercel.app/api/server/FANw4rHD5E)](https://discord.gg/FANw4rHD5E)
+ Huge thanks to [Tonic](https://huggingface.co/Tonic) who helped build this Space for WhisperSpeech.
+ ### How to Use It
+ Write your text in the box; you can use language tags (`<en>` or `<pl>`) to create multilingual speech.
+ Optionally you can upload a speech sample or give it a file URL to clone an existing voice. Check out the
+ examples at the bottom of the page for inspiration.
+ """
+
+ footer = """
+ ### How to use it locally
+ ```
+ pip install -U WhisperSpeech
+ ```
+ Afterwards:
+ ```
+ from whisperspeech.pipeline import Pipeline
+ pipe = Pipeline(torch_compile=True)
+ pipe.generate_to_file("output.wav", "Hello from WhisperSpeech.")
+ ```
+ """
+
+
+ text_examples = [
+     ["This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.", None],
+     ["World War II or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis.", "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg"],
+     ["<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze <en>Jewels.", None],
+     ["<en> WhisperSpeech is an Open Source library that helps you convert text to speech. <pl>Teraz także po Polsku! <en>I think I just tried saying \"now also in Polish\", don't judge me...", None],
+     # ["<de> WhisperSpeech is multi-lingual <es> y puede cambiar de idioma <hi> मध्य वाक्य में"],
+     ["<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.", None],
+     # ["<en> The big difference between Europe <fr> et les Etats Unis <pl> jest to, że mamy tak wiele języków <uk> тут, в Європі"]
+ ]
+
+ def parse_multilingual_text(input_text):
+     pattern = r"(?:<(\w+)>)|([^<]+)"
+     cur_lang = 'en'
+     segments = []
+     for i, (lang, txt) in enumerate(re.findall(pattern, input_text)):
+         if lang: cur_lang = lang
+         else: segments.append((cur_lang, f" {txt} "))  # add spaces to give it some time to switch languages
+     if not segments: return [("en", "")]
+     return segments
+
+ @spaces.GPU(enable_queue=True)
  def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
      if isinstance(speaker, (str, Path)): speaker = pipe.extract_spk_emb(speaker)
      elif speaker_url: speaker = pipe.extract_spk_emb(speaker_url)

      audio = pipe.vocoder.decode(atoks)
      return audio.cpu()
+
+ def whisper_speech_demo(multilingual_text, speaker_audio=None, speaker_url="", cps=14):
+     if len(multilingual_text) == 0:
+         raise gr.Error("Please enter some text for me to speak!")
+
+     segments = parse_multilingual_text(multilingual_text)
+
+     audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)
+
+     return (24000, audio.T.numpy())
+
+     # Did not work for me in Safari:
+     # mp3 = io.BytesIO()
+     # torchaudio.save(mp3, audio, 24000, format='mp3')
+     # return mp3.getvalue()
+
+ pipe = Pipeline(torch_compile=not DEVEL)
+ # warmup will come from regenerating the examples
+
+ with gr.Blocks() as demo:
+     gr.Markdown(title)
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=2):
+
+             text_input = gr.Textbox(label="Enter multilingual text💬📝",
+                                     value=text_examples[0][0],
+                                     info="You can use `<en>` for English and `<pl>` for Polish, see examples below.")
+
+             cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
+                             label="Tempo (in characters per second)")
+
+
+             with gr.Row(equal_height=True):
+                 speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
                                           sources=["upload", "microphone"],
                                           type='filepath')
+                 url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
+             gr.Markdown(" \n ")  # fixes the bottom overflow from Audio
+             generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
+         with gr.Column(scale=1):
+             output_audio = gr.Audio(label="WhisperSpeech says…")

+     with gr.Column():
+         gr.Markdown("### Try these examples to get started !🌟🌬️")
+         gr.Examples(
+             examples=text_examples,
+             inputs=[text_input, url_input],
+             outputs=[output_audio],
+             fn=whisper_speech_demo,
+             cache_examples=not DEVEL,
+         )

+     generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input, url_input, cps], outputs=output_audio)
+     gr.Markdown(footer)

+ demo.launch(server_port=3000 if DEVEL else None)
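
The most interesting new logic in app.py is `parse_multilingual_text`, which turns `<en>`/`<pl>`-style tags into per-language segments for the model. Here is a minimal standalone sketch of the same regex-based splitting, lifted from the diff above and runnable without the WhisperSpeech model:

```python
import re

def parse_multilingual_text(input_text):
    # Alternating matches: either a <lang> tag or the run of text that follows it.
    pattern = r"(?:<(\w+)>)|([^<]+)"
    cur_lang = 'en'
    segments = []
    for lang, txt in re.findall(pattern, input_text):
        if lang:
            cur_lang = lang
        else:
            # Pad with spaces to give the model some time to switch languages.
            segments.append((cur_lang, f" {txt} "))
    if not segments:
        return [("en", "")]
    return segments

print(parse_multilingual_text("<pl>To jest test <en>WhisperSpeech"))
# [('pl', ' To jest test  '), ('en', ' WhisperSpeech ')]  (note the padding spaces)
```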
app_new.py ADDED
@@ -0,0 +1,86 @@
+ import gradio as gr
+ from gradio_client import Client
+
+
+ DEBUG_MODE = True
+ SAS_SWITCH = True
+
+ '''
+ Function to get the speech from the text
+ @params: text: str: The text to be converted to speech
+ @params: voice: str: The voice to be used for the speech
+ @return: result: str: The speech from the text
+ '''
+ def get_speech(text, voice):
+
+     '''
+     For now we are using an external space to get the result.
+     In the future we will use our own model, to be more independent.
+     '''
+     client = Client("https://collabora-whisperspeech.hf.space/")
+     result = client.predict(
+         # str in 'Enter multilingual text📝' Textbox component
+         text,
+         # filepath in 'Upload or Record Speaker Audio (optional)🌬️💬' Audio component
+         voice,
+         "",  # str in 'alternatively, you can paste in an audio file URL:' Textbox component
+         14,  # float (numeric value between 10 and 15) in 'Tempo (in characters per second)' Slider component
+         api_name="/whisper_speech_demo"
+     )
+     if DEBUG_MODE:
+         print(result)
+     return result
+
+ '''
+
+ '''
+ def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
+
+     # - If the speaker is a string and is a file path,
+     #   then we will extract the speaker embedding from the file
+     # - else, if the speaker_url is provided, then we
+     #   will extract the speaker embedding from the URL
+     # - else, we will use the default speaker
+     if isinstance(speaker, (str, Path)):
+         speaker = pipe.extract_spk_emb(speaker)
+     elif speaker_url:
+         speaker = pipe.extract_spk_emb(speaker_url)
+     else: speaker = pipe.default_speaker
+
+
+     langs, texts = [list(x) for x in zip(*segments)]
+     print(texts, langs)
+
+     stoks = pipe.t2s.generate(texts, cps=cps, lang=langs)
+     stoks = stoks[stoks!=512]
+     atoks = pipe.s2a.generate(stoks, speaker.unsqueeze(0))
+     audio = pipe.vocoder.decode(atoks)
+
+     return audio.cpu()
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         text_input = gr.Textbox(label="Enter multilingual text📝")
+         cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
+                         label="Speed (in characters per second)")
+
+     with gr.Row(equal_height=True):
+         speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
+                                  sources=["upload", "microphone"],
+                                  type='filepath')
+         url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
+         gr.Markdown(" \n ")  # fixes the bottom overflow from Audio
+         generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
+     with gr.Column(scale=1):
+         output_audio = gr.Audio(label="WhisperSpeech says…")
+
+
+ demo.launch(server_port=46007)
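
One gap worth noting: app_new.py builds the Blocks UI but never connects `generate_button` to a handler, so pressing the button does nothing. A minimal self-contained sketch of the missing wiring, under the assumption that `get_speech` is the intended handler (component names follow the file; the remote call is the same one `get_speech` already makes):

```python
import gradio as gr
from gradio_client import Client

def get_speech(text, voice):
    # Delegate synthesis to the hosted WhisperSpeech Space, as app_new.py does.
    client = Client("https://collabora-whisperspeech.hf.space/")
    return client.predict(text, voice, "", 14, api_name="/whisper_speech_demo")

with gr.Blocks() as demo:
    text_input = gr.Textbox(label="Enter multilingual text📝")
    speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
                             sources=["upload", "microphone"], type='filepath')
    generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
    output_audio = gr.Audio(label="WhisperSpeech says…")
    # The binding app_new.py leaves out: route the button press through get_speech.
    generate_button.click(get_speech, inputs=[text_input, speaker_input],
                          outputs=output_audio)

demo.launch()
```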
requirements.txt CHANGED
@@ -1,3 +1,11 @@
  gradio
  transformers
- scipy
+ scipy
+ spaces
+ io
+ os
+ re
+ torch
+ torchaudio
+ pathlib
+ whisperspeech.pipeline
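
A caveat on the new requirements.txt: `io`, `os`, `re`, and `pathlib` ship with the Python standard library, and `whisperspeech.pipeline` is an import path rather than a PyPI package name, so `pip install -r requirements.txt` will fail on those entries. A plausible corrected file, assuming only the PyPI distributions are needed (the package is published as `whisperspeech`, matching the footer's `pip install -U WhisperSpeech`):

```
gradio
transformers
scipy
spaces
torch
torchaudio
whisperspeech
```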