import io
import math
import time

import gradio as gr
import numpy as np
from pydub import AudioSegment


def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a numpy audio array as MP3 and return the encoded bytes.

    Args:
        audio_array: Audio samples, either 1-D ``(samples,)`` for mono or
            2-D ``(samples, channels)``. Floating-point input is
            peak-normalized and converted to 16-bit integers; integer input
            is encoded as-is using its own item size as the sample width.
        sampling_rate: Sampling rate of ``audio_array`` in Hz.

    Returns:
        The MP3-encoded audio as ``bytes``.
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        # Peak-normalize to the 16-bit range. An all-zero (silent) or empty
        # chunk has no peak to normalize by; dividing by zero would produce
        # NaNs that turn into garbage samples when cast to int16.
        peak = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        if peak > 0:
            audio_array = audio_array / peak
        audio_array = (audio_array * 32767).astype(np.int16)

    # A 2-D (samples, channels) array serializes row by row, i.e. as
    # interleaved frames, so it only needs the matching channel count.
    channels = 1 if audio_array.ndim == 1 else audio_array.shape[1]

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=channels,
    )

    # Export to an in-memory buffer; use a high bitrate to maximise quality.
    with io.BytesIO() as mp3_io:
        audio_segment.export(mp3_io, format="mp3", bitrate="320k")
        return mp3_io.getvalue()


def stream(audio, chunk_length_s):
    """Yield ``audio`` as successive MP3 chunks together with timing stats.

    Args:
        audio: Tuple ``(sampling_rate, array)`` as produced by a
            ``gr.Audio`` component with ``type="numpy"``.
        chunk_length_s: Length of each streamed chunk in seconds.

    Yields:
        Tuples ``(chunk_mp3, first_time, run_time)``: the MP3 bytes of the
        current chunk, the seconds elapsed until the first chunk was ready,
        and the seconds elapsed until the current chunk was ready.
    """
    start_time = time.time()
    sampling_rate, array = audio
    chunk_length = int(chunk_length_s * sampling_rate)
    # Sleep for half the chunk duration so that chunks are always produced
    # faster than they take to play back.
    time_length = chunk_length_s / 2
    audio_length = len(array)
    num_batches = math.ceil(audio_length / chunk_length)

    first_chunk_time = None
    for idx in range(num_batches):
        time.sleep(time_length)  # stand-in for per-chunk processing time
        start_pos = idx * chunk_length
        end_pos = min(start_pos + chunk_length, audio_length)
        chunk_mp3 = numpy_to_mp3(
            array[start_pos:end_pos], sampling_rate=sampling_rate
        )

        elapsed = round(time.time() - start_time, 2)
        if first_chunk_time is None:
            first_chunk_time = elapsed
        yield chunk_mp3, first_chunk_time, elapsed


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(
                value="librispeech.wav", sources=["upload"], type="numpy"
            )
            chunk_length = gr.Slider(
                minimum=2, maximum=10, value=2, step=2, label="Chunk length (s)"
            )
            run_button = gr.Button("Stream audio")
        with gr.Column():
            audio_out = gr.Audio(
                streaming=True, autoplay=True, format="mp3", label="mp3"
            )
            first_time = gr.Textbox(label="Time to first chunk (s)")
            run_time = gr.Textbox(label="Time to current chunk (s)")

    # Each tuple yielded by `stream` updates the three output components.
    run_button.click(
        fn=stream,
        inputs=[audio_in, chunk_length],
        outputs=[audio_out, first_time, run_time],
    )

demo.launch()