baas-cambai committed on
Commit
9cd9556
·
1 Parent(s): 1b629bf
Files changed (2) hide show
  1. app.py +94 -4
  2. requirements.txt +4 -0
app.py CHANGED
@@ -1,10 +1,100 @@
1
  import gradio as gr
2
  import os
 
 
 
 
 
 
3
 
4
- def greet(name):
5
- return "Hello " + name + "!!" + os.environ['testo']
6
 
7
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 
8
 
 
 
 
 
 
9
 
10
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import os
3
+ import httpx
4
+ import numpy as np
5
+ import base64
6
+ import torch
7
+ import torchaudio
8
+ import io
9
 
10
+ URL = os.environ['TEMP_HOSTING_URL']
11
+ API_KEY = os.environ['TEMP_CALLING_KEY']
12
 
13
def inference(reference_audio, text, reference_text, ras_K, ras_t_r, top_p, quality_prefix, clone_method):
    """Synthesize ``text`` in the voice of ``reference_audio`` via the hosted endpoint.

    Args:
        reference_audio: ``(sample_rate, samples)`` tuple coming from the
            ``gr.Audio`` input, or ``None`` when the user supplied no audio.
        text: Text to synthesize.
        reference_text: Optional transcript of the reference audio. ``None``
            or an empty string is sent to the service as ``None``.
        ras_K: Forwarded as ``inference_settings['ras_K']``.
        ras_t_r: Forwarded as ``inference_settings['ras_t_r']``.
        top_p: Forwarded as ``inference_settings['top_p']``.
        quality_prefix: Forwarded as ``inference_settings['prefix']``.
        clone_method: One of ``'deep-clone'``, ``'shallow-clone'`` or
            ``'follow-on deep-clone'``.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is the decoded response
        audio as a NumPy array, transposed from the layout returned by
        ``torchaudio.load``.

    Raises:
        gr.Error: If no reference audio was given, the cloning method is
            unknown, the HTTP request fails, or the response has no usable
            ``output`` field.
    """
    # Validate inputs up front so the user gets a readable message instead of
    # a TypeError (None audio) or UnboundLocalError (unmapped clone method).
    if reference_audio is None:
        raise gr.Error("Please provide a reference audio clip.")

    clone_modes = {
        'deep-clone': 'fixed-ref',
        'shallow-clone': 'none',
        'follow-on deep-clone': 'per-chunk',
    }
    if clone_method not in clone_modes:
        raise gr.Error(f"Unknown cloning method: {clone_method!r}")
    dlc = clone_modes[clone_method]

    _sr, _wav = reference_audio

    wav = torch.from_numpy(_wav).float()
    # Scale integer PCM to [-1, 1) using the dtype's own range (32768 for
    # int16, as before). Non-integer input is left unscaled.
    # NOTE(review): assumes float input is already in [-1, 1] - confirm.
    if np.issubdtype(_wav.dtype, np.integer):
        wav = wav / (float(np.iinfo(_wav.dtype).max) + 1.0)
    # Add a leading channel axis; multi-dimensional input is averaged over its
    # last axis first (presumably the channel axis - verify against gr.Audio).
    if wav.dim() == 1:
        wav = wav[None]
    else:
        wav = wav.mean(dim=-1)[None]

    # Serialize to an in-memory WAV file and base64-encode it for the JSON body.
    io_data = io.BytesIO()
    torchaudio.save(io_data, wav, sample_rate=_sr, format='wav')
    encoded_str = base64.b64encode(io_data.getvalue()).decode("utf-8")

    data = {
        "text": text,
        "reference_audio": encoded_str,  # reference audio, b64 encoded. Should be <=15s.
        "reference_text": reference_text if reference_text else None,
        "language": 'en-us',
        "inference_settings": {
            'top_p': top_p,
            "prefix": quality_prefix,
            'ras_K': ras_K,
            'ras_t_r': ras_t_r,
            'deep_clone_mode': dlc,
        },
    }
    print(f"Calling with payload {data['inference_settings']}")

    # Send the POST request.
    headers = {"Authorization": f"Api-Key {API_KEY}"}
    try:
        response = httpx.post(URL, headers=headers, json=data, timeout=300)
    except httpx.HTTPError as err:
        raise gr.Error(f"Request failed: {err}") from err

    # Stop on a non-200 status instead of trying to parse an error body.
    if response.status_code != 200:
        print("Request failed with status code", response.status_code)
        raise gr.Error(f"Request failed with status code {response.status_code}")
    print("Request successful!")

    try:
        full_audio_bytes = base64.b64decode(response.json()['output'])
    except (KeyError, TypeError, ValueError) as err:
        raise gr.Error("The service response did not contain valid audio output.") from err

    wav, sr = torchaudio.load(io.BytesIO(full_audio_bytes))
    wav = wav.numpy()

    return (sr, wav.T)
57
+
58
# Gradio front end: reference audio + text in, synthesized audio out.
# NOTE(review): the nesting below was reconstructed from a flattened diff
# view; confirm it against the real file, in particular that the Radio shares
# a Row with its description and that the sliders sit inside the Accordion.
_REFERENCE_TEXT_INFO = (
    "Leave blank to automatically transcribe the reference audio. "
    "Inference will be slightly faster if you specify the correct reference transcript below."
)
_CLONE_METHOD_HELP = (
    "Cloning method to use. Deep clone and shallow clone use the method described in the paper, "
    "while follow-on deep clone uses deep cloning, but always using the previous generated segment as the deep clone conditioning. "
    "This only makes a difference for long text inputs where the text is internally chunked up and generated in chunks."
)

with gr.Blocks() as demo:
    # --- Inputs -----------------------------------------------------------
    with gr.Row():
        gr.Markdown("## Reference Audio")
    with gr.Row():
        reference_audio = gr.Audio(label="Drop Audio Here", max_length=16)
    with gr.Row():
        gr.Markdown("## Text to Generate")
    with gr.Row():
        text_input = gr.Textbox(label="Text to Generate")
    with gr.Row():
        synthesize_button = gr.Button("Synthesize", variant="primary")

    # --- Optional tuning knobs, collapsed by default ------------------------
    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            reference_text = gr.Textbox(
                label="Reference Text",
                info=_REFERENCE_TEXT_INFO,
            )
        with gr.Row():
            ras_K = gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=10,
                label="RAS_K",
                info="RAS sampling K value",
            )
        with gr.Row():
            ras_t_r = gr.Slider(
                minimum=0.001,
                maximum=1,
                step=0.001,
                value=0.09,
                label="RAS_t_r",
                info="RAS sampling t_r value",
            )
        with gr.Row():
            top_p = gr.Slider(
                minimum=0.001,
                maximum=1,
                step=0.001,
                value=0.2,
                label="top_p",
                info="top-p sampling value",
            )
        with gr.Row():
            quality_prefix = gr.Textbox(
                value='48000',
                label="quality_prefix",
                info="quality prefix string to append to generation",
                lines=1,
            )
        with gr.Row():
            gr.Markdown(_CLONE_METHOD_HELP)
            clone_method = gr.Radio(
                choices=['deep-clone', 'shallow-clone', 'follow-on deep-clone'],
                value='follow-on deep-clone',
                label="cloning method",
                info="cloning method to use",
            )

    # --- Output -------------------------------------------------------------
    with gr.Row():
        gr.Markdown("## Synthesized Audio")
    with gr.Row():
        audio_output = gr.Audio(label="Synthesized Audio")

    # The input order must match the positional parameters of `inference`.
    synthesize_button.click(
        fn=inference,
        inputs=[
            reference_audio,
            text_input,
            reference_text,
            ras_K,
            ras_t_r,
            top_p,
            quality_prefix,
            clone_method,
        ],
        outputs=[audio_output],
    )

if __name__ == "__main__":
    demo.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ httpx
2
+ regex
3
+ torch
4
+ torchaudio