import gradio as gr
from gradio_client import Client
from pathlib import Path


DEBUG_MODE = True   # print intermediate results when True
SAS_SWITCH = True   # presumably toggles between the hosted Space and the local pipeline; not used below yet

'''
    Convert text to speech by calling the hosted WhisperSpeech Space.
    @param      text: str: the text to be converted to speech
    @param      voice: str: path to a speaker audio file whose voice is cloned
    @return     result: str: path to the generated speech audio file
'''
def get_speech(text, voice):

    '''
        For now we use an external Space to get the result.
        In the future we will run our own model to be more independent.
    '''
    client = Client("https://collabora-whisperspeech.hf.space/")
    result = client.predict(
        text,    # str  in 'Enter multilingual text📝' Textbox component
        voice,   # filepath  in 'Upload or Record Speaker Audio (optional)🌬️💬' Audio component
        "",      # str  in 'alternatively, you can paste in an audio file URL:' Textbox component
        14,      # float (between 10 and 15) in 'Tempo (in characters per second)' Slider component
        api_name="/whisper_speech_demo"
    )
    if DEBUG_MODE:
        print(result)
    return result
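
# Illustrative usage (an assumption, not part of the original file): the Space
# returns a path to the generated audio file, e.g.
#   audio_path = get_speech("Hello from WhisperSpeech!", "speaker_sample.wav")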

'''
    Generate audio locally with a WhisperSpeech pipeline instead of the hosted Space.
    @param      pipe: WhisperSpeech pipeline providing the t2s, s2a and vocoder models
    @param      segments: list of (language, text) pairs to synthesize
    @param      speaker: path to a speaker audio file (or an already extracted embedding)
    @param      speaker_url: URL of a speaker audio file, used when no file is given
    @param      cps: float: speaking speed in characters per second
    @return     audio: the generated waveform as a CPU tensor
'''
def generate_audio(pipe, segments, speaker, speaker_url, cps=14):

    # Pick the speaker embedding:
    # - if `speaker` is a file path, extract the embedding from that file
    # - else, if `speaker_url` is provided, extract the embedding from the URL
    # - otherwise fall back to the pipeline's default speaker
    if isinstance(speaker, (str, Path)):
        speaker = pipe.extract_spk_emb(speaker)
    elif speaker_url:
        speaker = pipe.extract_spk_emb(speaker_url)
    else:
        speaker = pipe.default_speaker

    # Split the (language, text) pairs into parallel lists
    langs, texts = [list(x) for x in zip(*segments)]
    if DEBUG_MODE:
        print(texts, langs)

    # Text -> semantic tokens -> acoustic tokens -> waveform
    stoks = pipe.t2s.generate(texts, cps=cps, lang=langs)
    stoks = stoks[stoks != 512]   # drop padding tokens
    atoks = pipe.s2a.generate(stoks, speaker.unsqueeze(0))
    audio = pipe.vocoder.decode(atoks)

    return audio.cpu()
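
# Illustrative sketch (an assumption, not part of the original file): generate_audio
# expects a WhisperSpeech pipeline object. With the whisperspeech package it could be
# built roughly like this; the import path and model reference are assumptions:
#
#   from whisperspeech.pipeline import Pipeline
#   pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')
#   audio = generate_audio(pipe, [('en', 'Hello world!')], speaker=None, speaker_url=None)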







with gr.Blocks() as demo:
    with gr.Row():
        text_input = gr.Textbox(label="Enter multilingual text📝")
        cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
                            label="Speed (in characters per second)")

        with gr.Row(equal_height=True):
            speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬", 
                                     sources=["upload", "microphone"],
                                     type='filepath')
            url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
        gr.Markdown("  \n  ") # fixes the bottom overflow from Audio
        generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
    with gr.Column(scale=1):
        output_audio = gr.Audio(label="WhisperSpeech says…")
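
    # Assumed wiring (not in the original file): connect the button to the hosted-Space
    # path. As written, get_speech only forwards the text and the speaker audio; the
    # cps slider and url_input values stay at the defaults hard-coded in get_speech.
    generate_button.click(
        fn=get_speech,
        inputs=[text_input, speaker_input],
        outputs=output_audio,
    )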



demo.launch(server_port=46007)