Nuno-Tome committed on
Commit 7f9420f · 1 Parent(s): f16f614

no message

Files changed (3)
  1. app.py +108 -44
  2. app_new.py +86 -0
  3. requirements.txt +9 -1
app.py CHANGED
@@ -1,38 +1,68 @@
  import gradio as gr
- from gradio_client import Client
-
-
- DEBUG_MODE = True
- SAS_SWITCH = True
-
- '''
- Function to get the speech from the text
- @params: text: str: The text to be converted to speech
- @params: voice: str: The voice to be used for the speech
- @return: result: str: The speech from the text
- '''
- def get_speech(text, voice):
-
-     '''
-     For now we are using external space to get the result.
-     In future we will use our own model to get be more independent
-     '''
-     client = Client("https://collabora-whisperspeech.hf.space/")
-     result = client.predict(
-         # str in 'Enter multilingual text📝' Textbox component
-         text,
-         # filepath in 'Upload or Record Speaker Audio (optional)🌬️💬' Audio component
-         voice,
-         "",  # str in 'alternatively, you can paste in an audio file URL:' Textbox component
-         14,  # float (numeric value between 10 and 15) in 'Tempo (in characters per second)' Slider component
-         api_name="/whisper_speech_demo"
-     )
-     if DEBUG_MODE:
-         print(result)
-     return result

  def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
      if isinstance(speaker, (str, Path)): speaker = pipe.extract_spk_emb(speaker)
      elif speaker_url: speaker = pipe.extract_spk_emb(speaker_url)
@@ -45,24 +75,58 @@ def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
      audio = pipe.vocoder.decode(atoks)
      return audio.cpu()

- with gr.Blocks() as demo:
-     with gr.Row():
-         text_input = gr.Textbox(label="Enter multilingual text📝")
-         cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
-                         label="Speed (in characters per second)")
-
-     with gr.Row(equal_height=True):
-         speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
                                   sources=["upload", "microphone"],
                                   type='filepath')
-         url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
-         gr.Markdown(" \n ")  # fixes the bottom overflow from Audio
-         generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
-     with gr.Column(scale=1):
-         output_audio = gr.Audio(label="WhisperSpeech says…")
-
- demo.launch(server_port=46007)
+ import spaces
  import gradio as gr
+ import io
+ import os
+ import re
+ import torch
+ import torchaudio
+ from pathlib import Path
+ from whisperspeech.pipeline import Pipeline
+
+ DEVEL=os.environ.get('DEVEL', False)
+
+ title = """
+ <picture>
+   <source srcset="https://huggingface.co/spaces/collabora/whisperspeech/resolve/main/dark-banner.png" media="(prefers-color-scheme: dark)" />
+   <img alt="WhisperSpeech banner with Collabora and LAION logos" src="https://huggingface.co/spaces/collabora/whisperspeech/resolve/main/light-banner.png" style="width: 60%; margin: 0 auto;" />
+ </picture>
+ # Welcome to Collabora's WhisperSpeech
+ WhisperSpeech is an Open Source text-to-speech system built by Collabora and LAION by inverting Whisper.
+ The model is fully open and you can run it on your local hardware. It's like **Stable Diffusion but for speech**
+ – both powerful and easily customizable.
+ [You can contribute to WhisperSpeech on Github.](https://github.com/collabora/WhisperSpeech)
+ You can also join the discussion on Discord [![](https://dcbadge.vercel.app/api/server/FANw4rHD5E)](https://discord.gg/FANw4rHD5E)
+ Huge thanks to [Tonic](https://huggingface.co/Tonic) who helped build this Space for WhisperSpeech.
+ ### How to Use It
+ Write your text in the box; you can use language tags (`<en>` or `<pl>`) to create multilingual speech.
+ Optionally you can upload a speech sample or give it a file URL to clone an existing voice. Check out the
+ examples at the bottom of the page for inspiration.
+ """
+
+ footer = """
+ ### How to use it locally
+ ```
+ pip install -U WhisperSpeech
+ ```
+ Afterwards:
+ ```
+ from whisperspeech.pipeline import Pipeline
+ pipe = Pipeline(torch_compile=True)
+ pipe.generate_to_file("output.wav", "Hello from WhisperSpeech.")
+ ```
+ """
+
+
+ text_examples = [
+     ["This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.", None],
+     ["World War II or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis.", "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg"],
+     ["<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze <en>Jewels.", None],
+     ["<en> WhisperSpeech is an Open Source library that helps you convert text to speech. <pl>Teraz także po Polsku! <en>I think I just tried saying \"now also in Polish\", don't judge me...", None],
+     # ["<de> WhisperSpeech is multi-lingual <es> y puede cambiar de idioma <hi> मध्य वाक्य में"],
+     ["<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.", None],
+     # ["<en> The big difference between Europe <fr> et les Etats Unis <pl> jest to, że mamy tak wiele języków <uk> тут, в Європі"]
+ ]
+
+ def parse_multilingual_text(input_text):
+     pattern = r"(?:<(\w+)>)|([^<]+)"
+     cur_lang = 'en'
+     segments = []
+     for i, (lang, txt) in enumerate(re.findall(pattern, input_text)):
+         if lang: cur_lang = lang
+         else: segments.append((cur_lang, f" {txt} "))  # add spaces to give it some time to switch languages
+     if not segments: return [("en", "")]
+     return segments
+
+ @spaces.GPU(enable_queue=True)
  def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
      if isinstance(speaker, (str, Path)): speaker = pipe.extract_spk_emb(speaker)
      elif speaker_url: speaker = pipe.extract_spk_emb(speaker_url)

      audio = pipe.vocoder.decode(atoks)
      return audio.cpu()
+
+ def whisper_speech_demo(multilingual_text, speaker_audio=None, speaker_url="", cps=14):
+     if len(multilingual_text) == 0:
+         raise gr.Error("Please enter some text for me to speak!")
+
+     segments = parse_multilingual_text(multilingual_text)
+
+     audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)
+
+     return (24000, audio.T.numpy())
+
+     # Did not work for me in Safari:
+     # mp3 = io.BytesIO()
+     # torchaudio.save(mp3, audio, 24000, format='mp3')
+     # return mp3.getvalue()
+
+ pipe = Pipeline(torch_compile=not DEVEL)
+ # warmup will come from regenerating the examples
+
+ with gr.Blocks() as demo:
+     gr.Markdown(title)
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=2):
+
+             text_input = gr.Textbox(label="Enter multilingual text💬📝",
+                                     value=text_examples[0][0],
+                                     info="You can use `<en>` for English and `<pl>` for Polish, see examples below.")
+
+             cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
+                             label="Tempo (in characters per second)")
+
+
+             with gr.Row(equal_height=True):
+                 speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
                                           sources=["upload", "microphone"],
                                           type='filepath')
+                 url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
+             gr.Markdown(" \n ")  # fixes the bottom overflow from Audio
+             generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
+         with gr.Column(scale=1):
+             output_audio = gr.Audio(label="WhisperSpeech says…")

+     with gr.Column():
+         gr.Markdown("### Try these examples to get started !🌟🌬️")
+         gr.Examples(
+             examples=text_examples,
+             inputs=[text_input, url_input],
+             outputs=[output_audio],
+             fn=whisper_speech_demo,
+             cache_examples=not DEVEL,
+         )

+     generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input, url_input, cps], outputs=output_audio)
+     gr.Markdown(footer)

+ demo.launch(server_port=3000 if DEVEL else None)
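
The most interesting new logic in app.py is `parse_multilingual_text`, which turns `<en>`/`<pl>`-style tags into per-language segments for the model. Here is a minimal standalone sketch of the same regex-based splitting, lifted from the diff above and runnable without the WhisperSpeech model:

```python
import re

def parse_multilingual_text(input_text):
    # Alternating matches: either a <lang> tag or the run of text that follows it.
    pattern = r"(?:<(\w+)>)|([^<]+)"
    cur_lang = 'en'
    segments = []
    for lang, txt in re.findall(pattern, input_text):
        if lang:
            cur_lang = lang
        else:
            # Pad with spaces to give the model some time to switch languages.
            segments.append((cur_lang, f" {txt} "))
    if not segments:
        return [("en", "")]
    return segments

print(parse_multilingual_text("<pl>To jest test <en>WhisperSpeech"))
# [('pl', ' To jest test  '), ('en', ' WhisperSpeech ')]  (note the padding spaces)
```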
app_new.py ADDED
@@ -0,0 +1,86 @@
+ import gradio as gr
+ from gradio_client import Client
+
+
+ DEBUG_MODE = True
+ SAS_SWITCH = True
+
+ '''
+ Function to get the speech from the text
+ @params: text: str: The text to be converted to speech
+ @params: voice: str: The voice to be used for the speech
+ @return: result: str: The speech from the text
+ '''
+ def get_speech(text, voice):
+
+     '''
+     For now we are using an external space to get the result.
+     In the future we will use our own model, to be more independent.
+     '''
+     client = Client("https://collabora-whisperspeech.hf.space/")
+     result = client.predict(
+         # str in 'Enter multilingual text📝' Textbox component
+         text,
+         # filepath in 'Upload or Record Speaker Audio (optional)🌬️💬' Audio component
+         voice,
+         "",  # str in 'alternatively, you can paste in an audio file URL:' Textbox component
+         14,  # float (numeric value between 10 and 15) in 'Tempo (in characters per second)' Slider component
+         api_name="/whisper_speech_demo"
+     )
+     if DEBUG_MODE:
+         print(result)
+     return result
+
+ '''
+
+ '''
+ def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
+
+     # - If the speaker is a string and is a file path,
+     #   then we will extract the speaker embedding from the file
+     # - else, if the speaker_url is provided, then we
+     #   will extract the speaker embedding from the URL
+     # - else, we will use the default speaker
+     if isinstance(speaker, (str, Path)):
+         speaker = pipe.extract_spk_emb(speaker)
+     elif speaker_url:
+         speaker = pipe.extract_spk_emb(speaker_url)
+     else: speaker = pipe.default_speaker
+
+
+     langs, texts = [list(x) for x in zip(*segments)]
+     print(texts, langs)
+
+     stoks = pipe.t2s.generate(texts, cps=cps, lang=langs)
+     stoks = stoks[stoks!=512]
+     atoks = pipe.s2a.generate(stoks, speaker.unsqueeze(0))
+     audio = pipe.vocoder.decode(atoks)
+
+     return audio.cpu()
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         text_input = gr.Textbox(label="Enter multilingual text📝")
+         cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
+                         label="Speed (in characters per second)")
+
+     with gr.Row(equal_height=True):
+         speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
+                                  sources=["upload", "microphone"],
+                                  type='filepath')
+         url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
+         gr.Markdown(" \n ")  # fixes the bottom overflow from Audio
+         generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
+     with gr.Column(scale=1):
+         output_audio = gr.Audio(label="WhisperSpeech says…")
+
+
+ demo.launch(server_port=46007)
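
One gap worth noting: app_new.py builds the Blocks UI but never connects `generate_button` to a handler, so pressing the button does nothing. A minimal self-contained sketch of the missing wiring, under the assumption that `get_speech` is the intended handler (component names follow the file; the remote call is the same one `get_speech` already makes):

```python
import gradio as gr
from gradio_client import Client

def get_speech(text, voice):
    # Delegate synthesis to the hosted WhisperSpeech Space, as app_new.py does.
    client = Client("https://collabora-whisperspeech.hf.space/")
    return client.predict(text, voice, "", 14, api_name="/whisper_speech_demo")

with gr.Blocks() as demo:
    text_input = gr.Textbox(label="Enter multilingual text📝")
    speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
                             sources=["upload", "microphone"], type='filepath')
    generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
    output_audio = gr.Audio(label="WhisperSpeech says…")
    # The binding app_new.py leaves out: route the button press through get_speech.
    generate_button.click(get_speech, inputs=[text_input, speaker_input],
                          outputs=output_audio)

demo.launch()
```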
requirements.txt CHANGED
@@ -1,3 +1,11 @@
  gradio
  transformers
- scipy
+ scipy
+ spaces
+ io
+ os
+ re
+ torch
+ torchaudio
+ pathlib
+ whisperspeech.pipeline
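
A caveat on the new requirements.txt: `io`, `os`, `re`, and `pathlib` ship with the Python standard library, and `whisperspeech.pipeline` is an import path rather than a PyPI package name, so `pip install -r requirements.txt` will fail on those entries. A plausible corrected file, assuming only the PyPI distributions are needed (the package is published as `whisperspeech`, matching the footer's `pip install -U WhisperSpeech`):

```
gradio
transformers
scipy
spaces
torch
torchaudio
whisperspeech
```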