Spaces:

CAMB-AI
/

mars6-turbo-demo

Running

App Files Files Community

baas-cambai committed 24 days ago

Commit

9cd9556

1 Parent(s): 1b629bf

Add app

Browse files

Files changed (2) hide show

app.py +94 -4
requirements.txt +4 -0

app.py CHANGED Viewed

@@ -1,10 +1,100 @@
 import gradio as gr
 import os
-def greet(name):
-    return "Hello " + name + "!!" + os.environ['testo']
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
 import os
+import httpx
+import numpy as np
+import base64
+import torch
+import torchaudio
+import io
+# Endpoint and credential for the hosted synthesis service, read from the environment at import time.
+# A missing variable raises KeyError here, before the UI is built.
+URL = os.environ['TEMP_HOSTING_URL']  # target of the POST request made in inference()
+API_KEY = os.environ['TEMP_CALLING_KEY']  # sent as "Authorization: Api-Key <key>" in inference()
+def inference(reference_audio, text, reference_text, ras_K, ras_t_r, top_p, quality_prefix, clone_method):
+    """Synthesize `text` in the voice of `reference_audio` using the hosted endpoint.
+
+    Args:
+        reference_audio: ``(sample_rate, samples)`` tuple where ``samples`` is a
+            numpy array; 1-D is treated as mono, otherwise the last axis is
+            averaged away (presumably ``(samples, channels)`` as Gradio supplies).
+        text: Text to synthesize.
+        reference_text: Transcript of the reference audio; ``None`` or an empty
+            string is sent as ``None``.
+        ras_K, ras_t_r, top_p, quality_prefix: Sampling settings forwarded
+            unchanged inside ``inference_settings``.
+        clone_method: One of ``'deep-clone'``, ``'shallow-clone'`` or
+            ``'follow-on deep-clone'``.
+
+    Returns:
+        ``(sample_rate, samples)`` tuple with ``samples`` transposed from the
+        layout returned by ``torchaudio.load``.
+
+    Raises:
+        gr.Error: If no reference audio is given, the cloning method is
+            unknown, the request fails, or the response has no audio payload.
+    """
+    if reference_audio is None:
+        raise gr.Error("Please provide a reference audio clip.")
+
+    # Map the UI label to the API's deep_clone_mode; reject anything else up
+    # front instead of failing later with an unbound variable.
+    clone_modes = {
+        'deep-clone': 'fixed-ref',
+        'shallow-clone': 'none',
+        'follow-on deep-clone': 'per-chunk',
+    }
+    if clone_method not in clone_modes:
+        raise gr.Error(f"Unknown cloning method: {clone_method!r}")
+    dlc = clone_modes[clone_method]
+
+    _sr, _wav = reference_audio
+    wav = torch.from_numpy(_wav).float()
+    # Scale integer PCM to [-1, 1) by the dtype's full-scale value (32768 for
+    # int16, as before); floating-point input is left unscaled.
+    if np.issubdtype(_wav.dtype, np.integer):
+        wav = wav / float(np.iinfo(_wav.dtype).max + 1)
+    # Downmix to a single channel and add the leading channel dimension.
+    if wav.dim() == 1:
+        wav = wav[None]
+    else:
+        wav = wav.mean(dim=-1)[None]
+
+    # Serialize the clip as WAV in memory and base64-encode it for the JSON body.
+    io_data = io.BytesIO()
+    torchaudio.save(io_data, wav, sample_rate=_sr, format='wav')
+    encoded_str = base64.b64encode(io_data.getvalue()).decode("utf-8")
+
+    data = {
+        "text": text,
+        "reference_audio": encoded_str,  # reference audio, b64 encoded. Should be <=15s.
+        "reference_text": reference_text if reference_text else None,
+        "language": 'en-us',
+        "inference_settings": {'top_p': top_p, "prefix": quality_prefix, 'ras_K': ras_K, 'ras_t_r': ras_t_r, 'deep_clone_mode': dlc},
+    }
+    print(f"Calling with payload {data['inference_settings']}")
+
+    # Send the POST request; surface transport failures and timeouts in the UI.
+    headers = {"Authorization": f"Api-Key {API_KEY}"}
+    try:
+        response = httpx.post(URL, headers=headers, json=data, timeout=300)
+    except httpx.HTTPError as err:
+        print("Request failed:", err)
+        raise gr.Error("Could not reach the synthesis service. Please try again.") from err
+
+    # Stop on a failed request rather than trying to decode an error body as audio.
+    if response.status_code != 200:
+        print("Request failed with status code", response.status_code)
+        raise gr.Error(f"Synthesis request failed with status code {response.status_code}.")
+    print("Request successful!")
+
+    try:
+        encoded_audio = response.json()['output']
+    except (ValueError, KeyError, TypeError) as err:
+        raise gr.Error("Synthesis service returned an unexpected response.") from err
+
+    full_audio_bytes = base64.b64decode(encoded_audio)
+    wav, sr = torchaudio.load(io.BytesIO(full_audio_bytes))
+    return (sr, wav.numpy().T)
+# Demo page layout: reference clip, text prompt, advanced sampling controls, output player.
+with gr.Blocks() as demo:
+    # --- Inputs ---
+    with gr.Row():
+        gr.Markdown("## Reference Audio")
+    with gr.Row():
+        reference_audio = gr.Audio(
+            label="Drop Audio Here",
+            max_length=16,
+        )
+    with gr.Row():
+        gr.Markdown("## Text to Generate")
+    with gr.Row():
+        text_input = gr.Textbox(label="Text to Generate")
+    with gr.Row():
+        synthesize_button = gr.Button("Synthesize", variant="primary")
+
+    # --- Optional tuning knobs, collapsed by default ---
+    with gr.Accordion("Advanced Settings", open=False):
+        with gr.Row():
+            reference_text = gr.Textbox(
+                label="Reference Text",
+                info="Leave blank to automatically transcribe the reference audio. Inference will be slightly faster if you specify the correct reference transcript below.",
+            )
+        with gr.Row():
+            ras_K = gr.Slider(
+                label="RAS_K",
+                info="RAS sampling K value",
+                minimum=1,
+                maximum=20,
+                step=1,
+                value=10,
+            )
+        with gr.Row():
+            ras_t_r = gr.Slider(
+                label="RAS_t_r",
+                info="RAS sampling t_r value",
+                minimum=0.001,
+                maximum=1,
+                step=0.001,
+                value=0.09,
+            )
+        with gr.Row():
+            top_p = gr.Slider(
+                label="top_p",
+                info="top-p sampling value",
+                minimum=0.001,
+                maximum=1,
+                step=0.001,
+                value=0.2,
+            )
+        with gr.Row():
+            quality_prefix = gr.Textbox(
+                value="48000",
+                label="quality_prefix",
+                info="quality prefix string to append to generation",
+                lines=1,
+            )
+        # The explanatory note and the selector share one row.
+        with gr.Row():
+            gr.Markdown(
+                "Cloning method to use. Deep clone and shallow clone use the method described in the paper, "
+                "while follow-on deep clone uses deep cloning, but always using the previous generated segment as the deep clone conditioning. "
+                "This only makes a difference for long text inputs where the text is internally chunked up and generated in chunks."
+            )
+            clone_method = gr.Radio(
+                choices=["deep-clone", "shallow-clone", "follow-on deep-clone"],
+                value="follow-on deep-clone",
+                label="cloning method",
+                info="cloning method to use",
+            )
+
+    # --- Output ---
+    with gr.Row():
+        gr.Markdown("## Synthesized Audio")
+    with gr.Row():
+        audio_output = gr.Audio(label="Synthesized Audio")
+
+    # The input order must match the positional parameters of inference().
+    synthesize_button.click(
+        inference,
+        inputs=[
+            reference_audio,
+            text_input,
+            reference_text,
+            ras_K,
+            ras_t_r,
+            top_p,
+            quality_prefix,
+            clone_method,
+        ],
+        outputs=[audio_output],
+    )
+
+# Launch only when run as a script; no public share link is requested.
+if __name__ == "__main__":
+    demo.launch(share=False)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+httpx
+regex
+torch
+torchaudio