import os
import shutil
from pathlib import Path

import gradio as gr
import pandas as pd
from datasets import Audio, Dataset

from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.transcribers import Wav2Vec2Transcriber
from speechline.utils.tokenizer import WordTokenizer
max_textboxes = 5  # maximum number of (transcript, audio) output pairs shown in the UI

def preprocess(audio_path, transcriber):
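    """Wrap a single audio file in a `datasets.Dataset` and cast it to the transcriber's sampling rate."""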
    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column("audio", Audio(sampling_rate=transcriber.sampling_rate))
    return dataset

def transcribe(audio_path, transcriber):
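    """Transcribe the audio file and return the predictions together with their time offsets."""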
    dataset = preprocess(audio_path, transcriber)
    output_offsets = transcriber.predict(dataset, output_offsets=True)
    return output_offsets

def segmentation_interface(choice):
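    """Toggle the silence-duration slider and ground-truth textbox based on the chosen segmentation method."""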
    if choice == "silence":
        return gr.update(visible=True), gr.update(visible=False)
    elif choice == "word_overlap":
        return gr.update(visible=False), gr.update(visible=True)
    else:
        return gr.update(visible=False), gr.update(visible=False)


def process(audio_path, model, segmentation_type, silence_duration, ground_truth):
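    """Transcribe the uploaded audio, chunk it with the selected segmenter,
    and return Gradio updates that reveal one (transcript, audio) pair per chunk."""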
    output_dir = "./audio_chunks"

    transcriber = Wav2Vec2Transcriber(model)
    output_offsets = transcribe(audio_path, transcriber)

    if segmentation_type == "silence":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "word_overlap":
        segmenter = WordOverlapSegmenter()

    tokenizer = WordTokenizer()

    # Remove chunks left over from a previous run.
    if os.path.exists(f"{output_dir}/tmp"):
        shutil.rmtree(f"{output_dir}/tmp")

    # Chunk the audio with the chosen segmenter; one .wav/.tsv pair per chunk
    # is written under {output_dir}/tmp.
    segmenter.chunk_audio_segments(
        audio_path,
        output_dir,
        output_offsets[0],
        minimum_chunk_duration=0,
        silence_duration=silence_duration,
        ground_truth=tokenizer(ground_truth),
    )
    
    # Collect a (transcript dataframe, audio player) update for every chunk.
    # sorted() yields each chunk's .tsv before its .wav, matching the
    # Dataframe/Audio ordering of the output components.
    outputs = []
    idx = 0
    for path in sorted(Path(f"{output_dir}/tmp").iterdir()):
        if path.suffix == ".tsv":
            gt = pd.read_csv(path, sep="\t", names=["start_offset", "end_offset", "text"])
            outputs.append(gr.Dataframe.update(value=gt, visible=True))
            idx += 1
        elif path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))

    # Keep the unused output slots hidden.
    for _ in range(max_textboxes - idx):
        outputs.append(gr.Dataframe.update(visible=False))
        outputs.append(gr.Audio.update(visible=False))
    outputs.append(gr.Column.update(visible=True))
    return outputs

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
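            # Input controls: audio upload, segmentation method, ASR model, and method-specific options.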
            audio = gr.Audio(type="filepath")
            radio = gr.Radio(["silence", "word_overlap"], label="Select Segmentation Method")
            model = gr.Dropdown(["facebook/wav2vec2-base-960h", "bookbot/wav2vec-en", "bookbot/wav2vec-id"], value="facebook/wav2vec2-base-960h", label="Select Model")
            slider = gr.Slider(0, 100, value=3, step=0.1, label="Silence Duration", visible=False)
            gt = gr.Textbox(label="Ground Truth", placeholder="Enter Ground Truth Text", interactive=True, visible=False)
        
            radio.change(fn=segmentation_interface, inputs=radio, outputs=[slider, gt])
            
            inputs = [audio, model, radio, slider, gt]
            transcribe_btn = gr.Button("Transcribe")    
        

        with gr.Column(visible=False) as output_col:
            # One hidden (Dataframe, Audio) pair per possible chunk; process() returns
            # updates that fill and reveal only as many pairs as there are chunks.
            outputs = []
            for _ in range(max_textboxes):
                outputs.append(gr.Dataframe(visible=False))
                outputs.append(gr.Audio(visible=False))
            outputs.append(output_col)

            transcribe_btn.click(fn=process, inputs=inputs, outputs=outputs)

demo.queue().launch()