File size: 4,306 Bytes
1b629bf
 
9cd9556
 
 
 
 
 
1b629bf
9cd9556
 
1b629bf
9cd9556
 
1b629bf
9cd9556
 
 
 
 
1b629bf
e64aa56
 
9cd9556
e64aa56
9cd9556
 
 
 
 
 
 
 
 
 
 
 
 
d6146a6
9cd9556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6146a6
9cd9556
6426430
9cd9556
 
 
 
 
 
 
 
 
 
 
 
 
 
6426430
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import gradio as gr
import os
import httpx
import numpy as np
import base64
import torch
import torchaudio
import io

# Remote inference endpoint and its API key. Both are required: a missing
# variable raises KeyError at import time, which is intentional fail-fast.
URL = os.environ['TEMP_HOSTING_URL']
API_KEY = os.environ['TEMP_CALLING_KEY']

def inference(reference_audio, text, reference_text, ras_K, ras_t_r, top_p, quality_prefix, clone_method):
    """Synthesize `text` in the voice of `reference_audio` via the remote TTS API.

    Args:
        reference_audio: Gradio audio tuple ``(sample_rate, np.ndarray)``.
            Samples are assumed to be int16 PCM (Gradio's default) —
            shape ``(n,)`` mono or ``(n, channels)``.
        text: Text to synthesize.
        reference_text: Transcript of the reference audio, or None/"" to let
            the service transcribe it automatically.
        ras_K, ras_t_r, top_p: Sampling hyperparameters forwarded verbatim.
        quality_prefix: Quality prefix string forwarded verbatim.
        clone_method: One of 'deep-clone', 'shallow-clone',
            'follow-on deep-clone'.

    Returns:
        Gradio audio tuple ``(sample_rate, np.ndarray)`` with samples in
        ``(n, channels)`` layout.

    Raises:
        ValueError: If `clone_method` is not a recognized option.
        httpx.HTTPStatusError: If the remote service returns a non-200 status.
    """
    _sr, _wav = reference_audio

    # Gradio delivers int16 PCM by default; scale to [-1, 1] floats.
    # NOTE(review): if Gradio ever hands us float input this would
    # over-attenuate — confirmed int16 is the configured default here.
    wav = torch.from_numpy(_wav).float() / 32768.0
    if wav.dim() == 1:
        wav = wav[None]  # (n,) -> (1, n)
    else:
        # (n, channels) -> mono (1, n) by averaging channels.
        wav = wav.mean(dim=-1)[None]

    # The service expects 24 kHz reference audio.
    wav = torchaudio.functional.resample(wav, _sr, 24000)

    # Serialize to an in-memory WAV and base64-encode it for the JSON payload.
    buf = io.BytesIO()
    torchaudio.save(buf, wav, sample_rate=24000, format='wav')
    buf.seek(0)
    encoded_str = base64.b64encode(buf.read()).decode("utf-8")

    # Map the UI label to the API's deep_clone_mode value.
    # BUG FIX: the original if/elif chain left `dlc` undefined (NameError)
    # for any unrecognized label; fail with a clear error instead.
    dlc_by_method = {
        'deep-clone': 'fixed-ref',
        'shallow-clone': 'none',
        'follow-on deep-clone': 'per-chunk',
    }
    try:
        dlc = dlc_by_method[clone_method]
    except KeyError:
        raise ValueError(f"Unknown clone_method: {clone_method!r}") from None

    data = {
        "text": text,
        "reference_audio": encoded_str, # reference audio, b64 encoded. Should be <=15s.
        "reference_text": reference_text if reference_text is not None and len(reference_text) > 0 else None,
        "language": 'en-us',
        "inference_settings": {'top_p': top_p, "prefix": quality_prefix, 'ras_K': ras_K, 'ras_t_r': ras_t_r, 'deep_clone_mode': dlc},
    }
    print(f"Calling with payload {data['inference_settings']}")

    # Send the POST request (long timeout: synthesis can be slow).
    headers = {"Authorization": f"Api-Key {API_KEY}"}
    response = httpx.post(URL, headers=headers, json=data, timeout=300)
    if response.status_code == 200:
        print("Request successful!")
    else:
        # BUG FIX: the original only printed the failure and then tried to
        # decode a payload that isn't there; fail loudly at the real cause.
        print("Request failed with status code", response.status_code)
        response.raise_for_status()

    # The service returns the synthesized WAV as a base64 string.
    full_audio_bytes = base64.b64decode(response.json()['output'])

    wav, sr = torchaudio.load(io.BytesIO(full_audio_bytes))
    # Gradio expects (n, channels); torchaudio gives (channels, n), so transpose.
    return (sr, wav.numpy().T)

# Gradio UI. NOTE: Blocks layout is order-sensitive — each `with gr.Row()` /
# `with gr.Accordion()` context determines component nesting, so statement
# order here *is* the layout.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("## Reference Audio")
    with gr.Row():
        # max_length caps the reference clip at 16 s (API wants <=15 s audio).
        reference_audio = gr.Audio(label="Drop Audio Here", max_length=16)
    with gr.Row():
        gr.Markdown("## Text to Generate")
    with gr.Row():
        text_input = gr.Textbox(label="Text to Generate")
    with gr.Row():
        synthesize_button = gr.Button("Synthesize", variant="primary")
    # Advanced sampling controls, collapsed by default. Defaults mirror the
    # values sent in `inference`'s `inference_settings` payload.
    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            reference_text = gr.Textbox(label="Reference Text", 
                                         info="Leave blank to automatically transcribe the reference audio. Inference will be slightly faster if you specify the correct reference transcript below.")
        with gr.Row():
            ras_K = gr.Slider(minimum=1, maximum=20, step=1, value=10, label="RAS_K", info="RAS sampling K value")
        with gr.Row():
            ras_t_r = gr.Slider(minimum=0.001, maximum=1, step=0.001, value=0.09, label="RAS_t_r", info="RAS sampling t_r value")
        with gr.Row():
            top_p = gr.Slider(minimum=0.001, maximum=1, step=0.001, value=0.2, label="top_p", info="top-p sampling value")
        with gr.Row():
            quality_prefix = gr.Textbox('48000', label="quality_prefix", info="quality prefix string to append to generation", lines=1)
        with gr.Row():
            gr.Markdown("Cloning method to use. Deep clone and shallow clone use the method described in the paper, " +
                        "while `follow-on deep clone` uses deep cloning, but always using the previous generated segment as the deep clone conditioning. " +
                        "This only makes a difference for long text inputs where the text is internally chunked up and generated in chunks.")
            # Choices must match the labels `inference` maps to deep_clone_mode.
            clone_method = gr.Radio(choices=['deep-clone', 'shallow-clone', 'follow-on deep-clone'], value='deep-clone', label="cloning method", info="cloning method to use")


    with gr.Row():
        gr.Markdown("## Synthesized Audio")
    with gr.Row():
        audio_output = gr.Audio(label="Synthesized Audio")

    # Wire the button to the API call; input order must match `inference`'s
    # positional parameters.
    synthesize_button.click(
        inference, 
        inputs=[reference_audio, text_input, reference_text, ras_K, ras_t_r, top_p, quality_prefix, clone_method], 
        outputs=[audio_output]
    )

if __name__ == "__main__":
    # share=False: serve locally only, no public Gradio tunnel.
    demo.launch(share=False)