# Scraped file-viewer header (not part of the program): "Update app.py" by Mihaj, commit 45e3e7f (verified), 5.19 kB.
import gradio as gr
from transformers import pipeline, Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
import os
import soundfile as sf
from pyannote.audio import Pipeline
import torch
from pydub import AudioSegment
from pydub.playback import play
from datetime import datetime
import time
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
# --- Module-level configuration and model loading ---------------------------

# Access token read from the environment; it is handed to the pyannote
# pipeline loader below as its auth token.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Target sample rate (Hz) and channel count that `preprocess` converts to.
sr = 16000
channels = 1

# NOTE(review): `model_name` is not referenced anywhere in the visible code;
# only `bond005_model` is actually loaded.
model_name = "Mihaj/wav2vec2-large-xls-r-300m-ruOH-alphav"
bond005_model = "bond005/wav2vec2-large-ru-golos-with-lm"

# Speech recogniser: CTC acoustic model plus the language-model decoder that
# ships with the processor.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(bond005_model)
# Kept under its own name so it is no longer shadowed by the VAD model below.
asr_model = Wav2Vec2ForCTC.from_pretrained(bond005_model)
pipe = pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    tokenizer=processor,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)

# `model` is the Silero VAD model; `transcribe` passes it to
# `get_speech_timestamps`, so this name must stay bound to the VAD model.
model = load_silero_vad()

# Speaker-diarisation pipeline used by the "SlowButHighQuality" option.
pipeline_dia = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HF_TOKEN,
)

# Fixed output path that `preprocess` writes the converted WAV file to.
temp_path = "temp.wav"
def preprocess(audio_path):
    """Convert an audio file to the WAV layout used by the rest of the app.

    The input is decoded, resampled to ``sr`` Hz, converted to ``channels``
    channel(s) and exported as WAV to ``temp_path``.

    Args:
        audio_path: Path of the audio file to convert.

    Returns:
        ``temp_path``, the path of the converted WAV file.
    """
    print("PREPROCESSING STARTED")
    # Do not force ``format="mp3"``: the input is not guaranteed to be MP3
    # (e.g. WAV recordings or other uploaded containers). Without an explicit
    # format pydub/ffmpeg detects the container from the file itself.
    sound = AudioSegment.from_file(audio_path)
    sound = sound.set_frame_rate(sr)
    sound = sound.set_channels(channels)
    # NOTE(review): ``temp_path`` is a single fixed file, so every call
    # overwrites the previous result -- confirm this is acceptable if requests
    # can overlap.
    sound.export(temp_path, format="wav")
    print("PREPROCESSING ENDED")
    return temp_path
def _srt_timestamp(seconds):
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``)."""
    # Work in integer milliseconds: every field keeps its fixed zero-padded
    # width, rounding can never produce a "60,000" seconds field, and the
    # result does not depend on the machine's local time zone.
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _recognise_segments(samples, sample_rate, segments):
    """Transcribe ``(start, end, label)`` sample ranges and return SRT cues."""
    cues = []
    for i, (start, end, label) in enumerate(segments):
        start_time_srt = _srt_timestamp(start / sample_rate)
        end_time_srt = _srt_timestamp(end / sample_rate)
        speaker_note = "" if label is None else f" SPEAKER_{label}"
        print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt}{speaker_note}")
        trans = pipe(samples[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
        # Only diarised segments carry a speaker label in front of the text.
        speaker_prefix = "" if label is None else f"[{label}] "
        cues.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{speaker_prefix}{trans}\n")
        print("RECOGNISING ENDED")
        print(f"LINE RESULT {trans}")
    return cues


def transcribe(diarise, how_diarise, audio):
    """Transcribe an audio file, optionally split into SRT-style cues.

    Args:
        diarise: When truthy, split the audio into segments and return one
            numbered, time-stamped cue per segment; otherwise recognise the
            whole file in one pass.
        how_diarise: ``"SlowButHighQuality"`` segments with the pyannote
            diarisation pipeline (cues carry a speaker label); any other value
            segments with Silero VAD (no speaker labels).
        audio: Path of the input audio file.

    Returns:
        The recognised text: plain text when ``diarise`` is falsy, otherwise
        the cues joined with blank lines between them.
    """
    audio = preprocess(audio)
    # Local ``sr`` is the sample rate of the preprocessed file.
    y, sr = sf.read(audio)
    print(diarise)

    if not diarise:
        print("RECOGNISING FULL AUDIO")
        res = pipe(y, chunk_length_s=10, stride_length_s=(4, 2))
        print("RECOGNISING FULL AUDIO ENDED")
        return res["text"]

    print("DIARISING")
    if how_diarise == "SlowButHighQuality":
        dia = pipeline_dia(audio)
        segments = []
        # Each non-empty line is "<start_seconds> <end_seconds> <label>".
        for line in dia.to_lab().split('\n'):
            if line.strip() != "":
                res = line.split(" ")
                # Convert seconds to sample indices so ``y`` can be sliced.
                segments.append((int(float(res[0]) * sr), int(float(res[1]) * sr), res[2]))
    else:
        wav = read_audio(audio)  # backend (sox, soundfile, or ffmpeg) required!
        speech_timestamps = get_speech_timestamps(wav, model, speech_pad_ms=80, min_silence_duration_ms=150, window_size_samples=256)
        # Entries already hold sample offsets; VAD gives no speaker label.
        segments = [(ts['start'], ts['end'], None) for ts in speech_timestamps]
    print("DIARISING ENDED")

    lines = _recognise_segments(y, sr, segments)
    return "\n".join(lines)
# Web UI: a checkbox to enable segmentation, a radio button to pick the
# segmentation backend, and an audio input delivered to `transcribe` as a
# file path. The inputs are listed in the order of `transcribe`'s parameters.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Checkbox(
            label="Diarise",
            info="Do you want subtitles?",
        ),
        gr.Radio(
            ["FastButLowQuality", "SlowButHighQuality", "-"],
            label="Diarise_Variant",
            info="You can choose separating on smaller pieces by faster yet low quality variant (Silero VAD), or slower yet high quality variant (Pyannote.Diarization, this option will detect different speakers)",
        ),
        gr.Audio(type="filepath"),
    ],
    outputs="text",
    title="Wav2Vec2 RuOH",
    description=r"Realtime demo for Russian Oral History recognition using several diarizations method (Silero VAD, Pyannote) and a Wav2Vec large model from bond005. https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm",
)

# Start the app server.
iface.launch()