File size: 3,763 Bytes
2ddf3fe 0412962 50a5992 2ddf3fe 3b8e519 0412962 a00d114 3b8e519 8e14b4c 3b8e519 a00d114 8e14b4c 50a5992 2ddf3fe 50a5992 4b0b51b 0412962 50a5992 2ddf3fe 8e14b4c 50a5992 865b8d5 50a5992 2ddf3fe b865c6d 3b8e519 a00d114 2ddf3fe 50a5992 3b8e519 2ddf3fe 865b8d5 3b8e519 0412962 50a5992 2ddf3fe 8e14b4c 50a5992 2ddf3fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import gradio as gr
import numpy as np
from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
import torch
import pandas as pd
import gdown
def process_audio(audio_input, window_size_samples):
    """Compute speech probabilities for an audio file and plot them.

    Args:
        audio_input: Audio source handed to ``read_audio``; the UI's audio
            component is configured with ``type="filepath"``.
        window_size_samples: Number of samples per analysis window, forwarded
            to ``get_speech_probs`` and used to derive the plot's time step.

    Returns:
        A tuple ``(figure, probs, audio_length_samples)``: the result of
        ``make_visualization``, the values returned by ``get_speech_probs``,
        and ``len()`` of the loaded waveform.
    """
    sampling_rate = 16_000

    wav = read_audio(audio_input, sampling_rate=sampling_rate)
    audio_length_samples = len(wav)
    probs = get_speech_probs(
        wav,
        window_size_samples=window_size_samples,
        sampling_rate=sampling_rate,
    )

    # The step used to be hard-coded as 512 / 16_000, which only matches the
    # default window; derive it from the window size actually used.
    # NOTE(review): assumes make_visualization's second argument is the time
    # step in seconds between consecutive probabilities — confirm in vad_utils.
    step_seconds = window_size_samples / sampling_rate
    return make_visualization(probs, step_seconds), probs, audio_length_samples
def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
    """Turn speech probabilities into a table of speech timestamps.

    The two duration arguments are multiplied by 1000 before being forwarded
    as ``min_speech_duration_ms`` / ``min_silence_duration_ms`` (the UI labels
    those inputs in seconds despite the ``_ms`` parameter names).
    ``speech_pad_ms`` is forwarded unchanged.

    Args:
        probs: Speech probabilities passed to ``probs2speech_timestamps``.
        audio_length_samples: Length of the analysed audio in samples.
        threshold: Threshold forwarded to ``probs2speech_timestamps``.
        min_speech_duration_ms: Minimum speech duration, scaled by 1000 here.
        min_silence_duration_ms: Minimum silence duration, scaled by 1000 here.
        window_size_samples: Window size forwarded to ``probs2speech_timestamps``.
        speech_pad_ms: Padding forwarded to ``probs2speech_timestamps``.

    Returns:
        A tuple ``("timestamps.txt", df)``: the path of the tab-separated file
        written to the working directory (no header, no index) and the
        DataFrame it was written from, which carries an extra empty
        ``note`` column.
    """
    vad_options = {
        "threshold": threshold,
        "min_speech_duration_ms": min_speech_duration_ms * 1000,
        "min_silence_duration_ms": min_silence_duration_ms * 1000,
        "window_size_samples": window_size_samples,
        "speech_pad_ms": speech_pad_ms,
        "return_seconds": True,
        "rounding": 3,
    }
    segments = probs2speech_timestamps(probs, audio_length_samples, **vad_options)

    table = pd.DataFrame(segments)
    table["note"] = ""  # empty column written alongside each timestamp row

    out_path = "timestamps.txt"
    table.to_csv(out_path, sep="\t", header=False, index=False)
    return out_path, table
def download_gdrive(id):
    """Download a shared Google Drive file to the local file ``audio.wav``.

    Args:
        id: Google Drive file id taken from the share link. Surrounding
            whitespace is ignored. (The name shadows the builtin ``id`` but
            is kept so existing callers are unaffected.)

    Returns:
        The local path the file was saved to, always ``"audio.wav"``.

    Raises:
        gr.Error: If no id was supplied, or if ``gdown.download`` returns
            ``None`` instead of an output path.
    """
    file_id = (id or "").strip()
    if not file_id:
        raise gr.Error("Please enter a Google Drive file ID.")

    # The download is always stored under this fixed name in the working
    # directory, overwriting any previous download.
    output_file = "audio.wav"
    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", output_file)

    # NOTE(review): some gdown versions signal failure by returning None rather
    # than raising; without this check a missing or stale audio.wav would be
    # returned as if the download had succeeded.
    if downloaded is None:
        raise gr.Error(
            "Could not download the file. Check the file ID and that the file "
            "is shared with anyone who has the link."
        )
    return output_file
def main():
    """Build the Gradio interface for the VAD demo and launch it.

    The page wires three actions:

    * "Download Audio" calls ``download_gdrive`` and loads the result into the
      audio component.
    * "Compute Speech Probabilities" calls ``process_audio`` and stores its
      probabilities and audio length in ``gr.State`` holders.
    * "Compute Speech Timestamps" calls ``process_parameters`` with those
      stored values plus the numeric parameters, producing a downloadable
      file and a table.
    """
    # NOTE(review): the Row/Column nesting below is a reconstruction; confirm
    # it matches the intended page layout.
    with gr.Blocks() as demo:
        # Holders that carry process_audio's outputs over to process_parameters.
        probs = gr.State()
        audio_length_samples = gr.State()

        with gr.Row():
            info = (
                "Input the Google Drive file id from the shared link.\n"
                "It comes after https://drive.google.com/file/d/ <id here.\n"
                "For example the link https://drive.google.com/file/d/15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8/view?usp=drive_link has id 15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8"
            )
            gdrive_str = gr.Text(label="File ID", info=info)
            download_button = gr.Button("Download Audio")

        with gr.Row():
            audio_input = gr.Audio(type="filepath")
            with gr.Column():
                gr.Markdown("[Parameter Documentation](https://github.com/snakers4/silero-vad/blob/master/utils_vad.py#L198)")
                window_size_samples = gr.Dropdown(label="Window Size (samples)", choices=[512, 1024, 1536], value=512)
                button1 = gr.Button("Compute Speech Probabilities")

        figure = gr.Plot()

        download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)
        button1.click(
            process_audio,
            inputs=[audio_input, window_size_samples],
            outputs=[figure, probs, audio_length_samples],
        )

        with gr.Row():
            threshold = gr.Number(label="Threshold", value=0.6, minimum=0.0, maximum=1.0)
            # These two inputs are entered in seconds; process_parameters
            # multiplies them by 1000 before use.
            min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (s)", value=10.5)
            min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (s)", value=5.5)
            speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)

        button2 = gr.Button("Compute Speech Timestamps")
        output_file = gr.File()

        with gr.Row():
            output_df = gr.DataFrame()

        # The input order must match process_parameters' positional parameters.
        button2.click(
            process_parameters,
            inputs=[
                probs,
                audio_length_samples,
                threshold,
                min_speech_duration_ms,
                min_silence_duration_ms,
                window_size_samples,
                speech_pad_ms,
            ],
            outputs=[output_file, output_df],
        )

    demo.launch()
# Build and launch the demo only when this file is run as a script.
if __name__ == "__main__":
    main()
|