File size: 3,763 Bytes
2ddf3fe
 
0412962
 
50a5992
 
2ddf3fe
3b8e519
0412962
a00d114
3b8e519
8e14b4c
 
 
3b8e519
 
a00d114
 
 
 
 
8e14b4c
 
 
50a5992
 
 
 
2ddf3fe
50a5992
 
 
 
4b0b51b
0412962
50a5992
2ddf3fe
8e14b4c
 
50a5992
865b8d5
 
 
 
50a5992
 
2ddf3fe
b865c6d
3b8e519
 
 
 
a00d114
2ddf3fe
50a5992
 
3b8e519
2ddf3fe
 
865b8d5
3b8e519
 
0412962
50a5992
 
 
 
2ddf3fe
8e14b4c
50a5992
2ddf3fe
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import gradio as gr
import numpy as np
from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
import torch
import pandas as pd
import gdown

def process_audio(audio_input, window_size_samples):
    """Compute speech probabilities for an audio file and plot them.

    Args:
        audio_input: Audio source handed to ``read_audio`` (the UI passes a
            file path).
        window_size_samples: Window size, in samples, forwarded to
            ``get_speech_probs``.

    Returns:
        A 3-tuple ``(figure, probs, audio_length_samples)``: the result of
        ``make_visualization``, the values returned by ``get_speech_probs``,
        and ``len()`` of the loaded waveform.
    """
    sampling_rate = 16_000
    wav = read_audio(audio_input, sampling_rate=sampling_rate)
    probs = get_speech_probs(
        wav,
        window_size_samples=window_size_samples,
        sampling_rate=sampling_rate,
    )
    # The plot step must follow the selected window size; a fixed 512-sample
    # step mislabels the time axis whenever a 1024 or 1536 window is chosen.
    # NOTE(review): assumes get_speech_probs yields one probability per
    # window_size_samples-sized window -- confirm against vad_utils.
    step_seconds = window_size_samples / sampling_rate
    return make_visualization(probs, step_seconds), probs, len(wav)

def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
    """Turn speech probabilities into timestamps and export them to a file.

    The two duration arguments are multiplied by 1000 before being passed to
    ``probs2speech_timestamps`` under their ``*_ms`` keyword names; the other
    arguments are forwarded unchanged.

    Returns:
        A 2-tuple ``(path, df)``: the path of the tab-separated file that was
        written (no header, no index) and the DataFrame it was written from,
        which carries an extra, empty ``"note"`` column.
    """
    out_path = "timestamps.txt"

    # Scale the durations up front so the call below only forwards values.
    vad_options = dict(
        threshold=threshold,
        min_speech_duration_ms=min_speech_duration_ms * 1000,
        min_silence_duration_ms=min_silence_duration_ms * 1000,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
        return_seconds=True,
        rounding=3,
    )
    segments = probs2speech_timestamps(probs, audio_length_samples, **vad_options)

    table = pd.DataFrame(segments)
    table["note"] = ""  # blank column appended to every row of the export
    table.to_csv(out_path, sep="\t", header=False, index=False)
    return out_path, table

def download_gdrive(id):
    """Download a Google Drive file by its id and return the local path.

    The file is always saved as ``audio.wav`` in the current working
    directory, and that path is returned regardless of what
    ``gdown.download`` itself returns.
    """
    destination = "audio.wav"
    url = f"https://drive.google.com/uc?id={id}"
    gdown.download(url, destination)
    return destination

def main():
    """Build the Gradio interface, wire up its callbacks, and launch it.

    The page has three stages: download an audio file from Google Drive,
    compute speech probabilities for the loaded audio, and convert those
    probabilities into speech timestamps shown as a table and offered as a
    downloadable file.
    """
    with gr.Blocks() as demo:
        # State shared between the two computation steps.
        probs = gr.State()
        audio_length_samples = gr.State()
        with gr.Row():
            info = """Input the Google Drive file id from the shared link.
            It comes after https://drive.google.com/file/d/ <id here.
            For example the link https://drive.google.com/file/d/15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8/view?usp=drive_link has id 15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8"""
            gdrive_str = gr.Text(label="File ID", info = info)
            download_button = gr.Button("Download Audio")

        with gr.Row():
            audio_input = gr.Audio(type="filepath")
            with gr.Column():
                # Rendered for display only; nothing refers to it afterwards.
                gr.Markdown("[Parameter Documentation](https://github.com/snakers4/silero-vad/blob/master/utils_vad.py#L198)")
                window_size_samples = gr.Dropdown(label="Window Size (samples)", choices=[512, 1024, 1536], value=512)
                button1 = gr.Button("Compute Speech Probabilities")
            figure = gr.Plot()

        # The downloaded file's path is loaded into the audio component.
        download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)

        button1.click(process_audio, inputs=[audio_input, window_size_samples], outputs=[figure, probs, audio_length_samples])

        with gr.Row():
            threshold = gr.Number(label="Threshold", value=0.6, minimum=0.0, maximum=1.0)
            # These two fields are entered in seconds despite the "_ms"
            # variable names; process_parameters multiplies them by 1000.
            min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (s)", value=10.5)
            min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (s)", value=5.5)
            speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
            button2 = gr.Button("Compute Speech Timestamps")
            output_file = gr.File()
        with gr.Row():
            output_df = gr.DataFrame()

        button2.click(process_parameters, inputs=[probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms],
                      outputs=[output_file, output_df])

    demo.launch()

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()