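"""Gradio demo for SpeechLine: transcribe an uploaded audio file with a
Wav2Vec2 model, segment it into chunks by silence or by word overlap, and
display each chunk's audio alongside its offset transcript."""
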
from pathlib import Path
import os
import shutil

import gradio as gr
import pandas as pd
from datasets import Dataset, Audio

from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.transcribers import Wav2Vec2Transcriber
from speechline.utils.tokenizer import WordTokenizer

max_textboxes = 5

def preprocess(audio_path, transcriber):
    """Wrap a single audio file in a `datasets.Dataset` resampled to the transcriber's rate."""
    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column(
        "audio", Audio(sampling_rate=transcriber.sampling_rate)
    )
    return dataset

def transcribe(audio_path, transcriber):
    """Transcribe a single audio file and return its word-level offsets."""
    dataset = preprocess(audio_path, transcriber)
    output_offsets = transcriber.predict(dataset, output_offsets=True)
    return output_offsets
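
# `predict(..., output_offsets=True)` is expected to return one list of
# word-level offset dicts per input row, e.g. (illustrative values):
#   [[{"text": "her", "start_time": 0.0, "end_time": 0.24}, ...]]
# `process` below passes the first (and only) row's offsets to the segmenter.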

def segmentation_interface(choice):
    """Show the silence-duration slider for silence segmentation, the ground-truth box for word overlap."""
    if choice == "silence":
        return gr.update(visible=True), gr.update(visible=False)
    elif choice == "word_overlap":
        return gr.update(visible=False), gr.update(visible=True)
    else:
        return gr.update(visible=False), gr.update(visible=False)

def process(audio_path, model, segmentation_type, silence_duration, ground_truth):
    output_dir = "./audio_chunks"
    transcriber = Wav2Vec2Transcriber(model)
    output_offsets = transcribe(audio_path, transcriber)

    if segmentation_type == "silence":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "word_overlap":
        segmenter = WordOverlapSegmenter()
    else:
        # Guard against an unselected radio button rather than leaving
        # `segmenter` unbound.
        raise ValueError(f"Unknown segmentation method: {segmentation_type}")

    tokenizer = WordTokenizer()
    # Clear chunks left over from a previous run.
    if os.path.exists(f"{output_dir}/tmp"):
        shutil.rmtree(f"{output_dir}/tmp")

    segmenter.chunk_audio_segments(
        audio_path,
        output_dir,
        output_offsets[0],
        minimum_chunk_duration=0,
        silence_duration=silence_duration,
        ground_truth=tokenizer(ground_truth),
    )
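
    # `chunk_audio_segments` is expected to write paired files into
    # `{output_dir}/tmp`: a `.wav` chunk plus a matching `.tsv` transcript
    # (start_offset, end_offset, text) per segment, which the loop below
    # collects in sorted, i.e. paired, order.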
    outputs = []
    idx = 0
    for path in sorted(Path(f"{output_dir}/tmp").iterdir()):
        if path.suffix == ".tsv":
            if idx >= max_textboxes:
                break  # no UI slots left for further chunks
            gt = pd.read_csv(
                path, sep="\t", names=["start_offset", "end_offset", "text"]
            )
            outputs.append(gr.Dataframe.update(value=gt, visible=True))
            idx += 1
        elif path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))

    # Hide any remaining, unused (dataframe, audio) slots.
    for _ in range(max_textboxes - idx):
        outputs.append(gr.Dataframe.update(visible=False))
        outputs.append(gr.Audio.update(visible=False))

    # Finally, reveal the output column itself.
    outputs.append(gr.Column.update(visible=True))
    return outputs

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(type="filepath")
            radio = gr.Radio(
                ["silence", "word_overlap"], label="Select Segmentation Method"
            )
            model = gr.Dropdown(
                [
                    "facebook/wav2vec2-base-960h",
                    "bookbot/wav2vec-en",
                    "bookbot/wav2vec-id",
                ],
                value="facebook/wav2vec2-base-960h",
                label="Select Model",
            )
            slider = gr.Slider(
                0, 100, value=3, step=0.1, label="Silence Duration", visible=False
            )
            gt = gr.Textbox(
                label="Ground Truth",
                placeholder="Enter Ground Truth Text",
                interactive=True,
                visible=False,
            )
            radio.change(fn=segmentation_interface, inputs=radio, outputs=[slider, gt])
            inputs = [audio, model, radio, slider, gt]
            transcribe_btn = gr.Button("Transcribe")
        with gr.Column(visible=False) as output_col:
            # One hidden (transcript dataframe, audio player) pair per
            # displayable chunk; `process` fills them in and toggles visibility.
            outputs = []
            for _ in range(max_textboxes):
                outputs.append(gr.Dataframe(visible=False))
                outputs.append(gr.Audio(visible=False))
    outputs.append(output_col)
    transcribe_btn.click(fn=process, inputs=inputs, outputs=outputs)

demo.queue().launch()
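
# `queue()` routes events through Gradio's queue so long transcriptions are not
# cut short by request timeouts; passing `share=True` to `launch()` would also
# expose a temporary public link when running outside of Spaces.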