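# Gradio demo: transcribe an uploaded audio file with a Wav2Vec2 model, then
# split it into chunks either by silence or by word overlap against a
# ground-truth transcript. Assumes the `speechline` package (transcribers,
# segmenters, tokenizer) is installed alongside gradio, datasets, and pandas.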
import os
import shutil
from pathlib import Path

import gradio as gr
import pandas as pd
from datasets import Audio, Dataset

from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.transcribers import Wav2Vec2Transcriber
from speechline.utils.tokenizer import WordTokenizer
max_textboxes = 5  # number of (transcript, audio) output slot pairs in the UI
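

# Wrap a single audio file in a `datasets.Dataset`, resampled to the
# transcriber's expected sampling rate.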
def preprocess(audio_path, transcriber):
    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column("audio", Audio(sampling_rate=transcriber.sampling_rate))
    return dataset
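

# Run the transcriber over the audio and return the predictions with
# word-level time offsets, one entry per dataset row.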
def transcribe(audio_path, transcriber):
    dataset = preprocess(audio_path, transcriber)
    output_offsets = transcriber.predict(dataset, output_offsets=True)
    return output_offsets
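

# Toggle the conditional inputs: the silence-duration slider is only relevant
# to "silence" segmentation, the ground-truth textbox to "word_overlap".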
def segmentation_interface(choice):
    if choice == "silence":
        return gr.update(visible=True), gr.update(visible=False)
    elif choice == "word_overlap":
        return gr.update(visible=False), gr.update(visible=True)
    else:
        return gr.update(visible=False), gr.update(visible=False)
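

# End-to-end pipeline: transcribe the audio, segment it into chunks under
# ./audio_chunks/tmp, then return Gradio updates that reveal one
# (transcript dataframe, audio player) pair per chunk.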
def process(audio_path, model, segmentation_type, silence_duration, ground_truth):
    output_dir = "./audio_chunks"
    transcriber = Wav2Vec2Transcriber(model)
    output_offsets = transcribe(audio_path, transcriber)
    if segmentation_type == "silence":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "word_overlap":
        segmenter = WordOverlapSegmenter()
    else:
        raise ValueError(f"Unknown segmentation type: {segmentation_type}")
    tokenizer = WordTokenizer()
    # Clear chunks left over from a previous run.
    if os.path.exists(f"{output_dir}/tmp"):
        shutil.rmtree(f"{output_dir}/tmp")
    segmenter.chunk_audio_segments(
        audio_path,
        output_dir,
        output_offsets[0],
        minimum_chunk_duration=0,
        silence_duration=silence_duration,
        ground_truth=tokenizer(ground_truth),
    )
    outputs = []
    idx = 0
    # Chunks are written as (<stem>.tsv, <stem>.wav) pairs; ".tsv" sorts before
    # ".wav" for the same stem, so each transcript update is immediately
    # followed by its matching audio update.
    for path in sorted(Path(f"{output_dir}/tmp").iterdir()):
        if path.suffix == ".tsv":
            if idx >= max_textboxes:
                break  # the UI only has max_textboxes output slot pairs
            gt = pd.read_csv(path, sep="\t", names=["start_offset", "end_offset", "text"])
            outputs.append(gr.Dataframe.update(value=gt, visible=True))
            idx += 1
        elif path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))
    # Hide any unused slots, then reveal the output column.
    for _ in range(max_textboxes - idx):
        outputs.append(gr.Dataframe.update(visible=False))
        outputs.append(gr.Audio.update(visible=False))
    outputs.append(gr.Column.update(visible=True))
    return outputs
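

# UI: an input column (audio upload, segmentation method, model, conditional
# controls) next to an initially hidden output column of chunk slots.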
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(type="filepath")
            radio = gr.Radio(["silence", "word_overlap"], label="Select Segmentation Method")
            model = gr.Dropdown(
                ["facebook/wav2vec2-base-960h", "bookbot/wav2vec-en", "bookbot/wav2vec-id"],
                value="facebook/wav2vec2-base-960h",
                label="Select Model",
            )
            # Shown only for "silence" segmentation.
            slider = gr.Slider(0, 100, value=3, step=0.1, label="Silence Duration", visible=False)
            # Shown only for "word_overlap" segmentation.
            gt = gr.Textbox(label="Ground Truth", placeholder="Enter Ground Truth Text", interactive=True, visible=False)
            radio.change(fn=segmentation_interface, inputs=radio, outputs=[slider, gt])
            inputs = [audio, model, radio, slider, gt]
            transcribe_btn = gr.Button("Transcribe")
        with gr.Column(visible=False) as output_col:
            # One hidden (transcript, audio) slot pair per possible chunk;
            # process() reveals the filled ones and keeps the rest hidden.
            outputs = []
            for _ in range(max_textboxes):
                outputs.append(gr.Dataframe(visible=False))
                outputs.append(gr.Audio(visible=False))
    outputs.append(output_col)
    transcribe_btn.click(fn=process, inputs=inputs, outputs=outputs)

demo.queue().launch()