import gradio as gr
from transformers import pipeline, Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
import os
import soundfile as sf
from pyannote.audio import Pipeline
from pydub import AudioSegment
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

HF_TOKEN = os.environ.get("HF_TOKEN")
sr = 16000  # sample rate (Hz) expected by the ASR model and Silero VAD
channels = 1  # mono

# Alternative checkpoint, currently unused.
model_name = "Mihaj/wav2vec2-large-xls-r-300m-ruOH-alphav"
bond005_model = "bond005/wav2vec2-large-ru-golos-with-lm"
processor = Wav2Vec2ProcessorWithLM.from_pretrained(bond005_model)
model = Wav2Vec2ForCTC.from_pretrained(bond005_model)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)
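
# The pipeline accepts a 1-D float array at 16 kHz and returns a dict whose
# transcript comes from the LM-backed beam-search decoder, e.g.:
#   pipe(samples, chunk_length_s=10, stride_length_s=(4, 2))["text"]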
# The VAD gets its own name so it does not clobber the ASR model above.
vad_model = load_silero_vad()

# pyannote/speaker-diarization-3.1 is a gated model: the token must belong to
# an account that has accepted the model's terms on the Hugging Face Hub.
pipeline_dia = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HF_TOKEN,
)

temp_path = "temp.wav"

def preprocess(audio_path):
    """Convert the input file to the 16 kHz mono WAV the models expect."""
    print("PREPROCESSING STARTED")
    sound = AudioSegment.from_file(audio_path)  # let pydub/ffmpeg infer the format
    sound = sound.set_frame_rate(sr)
    sound = sound.set_channels(channels)
    sound.export(temp_path, format="wav")
    print("PREPROCESSING ENDED")
    return temp_path

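# Minimal helper (not part of any upstream library): format a sample offset as
# an SRT timestamp. Plain integer arithmetic keeps the result independent of
# the host timezone, unlike datetime.fromtimestamp().
def srt_timestamp(sample_index, sample_rate):
    total_ms = int(round(sample_index / sample_rate * 1000))
    hours, rest = divmod(total_ms, 3_600_000)
    minutes, rest = divmod(rest, 60_000)
    seconds, millis = divmod(rest, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
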
def transcribe(diarise, how_diarise, audio):
    audio = preprocess(audio)
    y, sr = sf.read(audio)  # already 16 kHz mono after preprocess()
    print(diarise)
    if diarise:
        if how_diarise == "SlowButHighQuality":
            print("DIARISING")
            dia = pipeline_dia(audio)
            print("DIARISING ENDED")
            lines = []
            for i, line in enumerate(dia.to_lab().split("\n")):
                if line.strip() != "":
                    # Each .lab line is "<start_sec> <end_sec> <speaker_label>".
                    res = line.split(" ")
                    start = int(float(res[0]) * sr)
                    end = int(float(res[1]) * sr)
                    label = res[2]
                    start_time_srt = srt_timestamp(start, sr)
                    end_time_srt = srt_timestamp(end, sr)
                    print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt} SPEAKER_{label}")
                    trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
                    # SRT-style cue: index, time range, then "[speaker] text".
                    lines.append(f"{i + 1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
                    print(f"LINE RESULT {trans}")
            print("RECOGNISING ENDED")
        else:
            print("DIARISING")
            wav = read_audio(audio)
            speech_timestamps = get_speech_timestamps(
                wav,
                vad_model,
                speech_pad_ms=80,
                min_silence_duration_ms=150,
                window_size_samples=256,
            )
            print("DIARISING ENDED")
            lines = []
            for i, line in enumerate(speech_timestamps):
                start = line["start"]  # Silero VAD boundaries are in samples
                end = line["end"]
                start_time_srt = srt_timestamp(start, sr)
                end_time_srt = srt_timestamp(end, sr)
                print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt}")
                trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
                lines.append(f"{i + 1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
                print(f"LINE RESULT {trans}")
            print("RECOGNISING ENDED")
        text = "\n".join(lines)
    else:
        print("RECOGNISING FULL AUDIO")
        res = pipe(y, chunk_length_s=10, stride_length_s=(4, 2))
        print("RECOGNISING FULL AUDIO ENDED")
        text = res["text"]
    return text

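# Quick local check (hypothetical file path; the Gradio UI below is the real
# entry point):
#   print(transcribe(True, "FastButLowQuality", "sample.wav"))
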
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Checkbox(label="Diarise", info="Do you want subtitles?"),
        gr.Radio(
            ["FastButLowQuality", "SlowButHighQuality", "-"],
            label="Diarise_Variant",
            info=(
                "Choose how the audio is split into segments: the faster but "
                "lower-quality variant (Silero VAD), or the slower but "
                "higher-quality variant (pyannote speaker diarization, which "
                "also tells speakers apart)."
            ),
        ),
        gr.Audio(type="filepath"),
    ],
    outputs="text",
    title="Wav2Vec2 RuOH",
    description=(
        "Realtime demo for Russian oral-history recognition using two "
        "segmentation methods (Silero VAD, pyannote speaker diarization) and a "
        "large Wav2Vec2 model from bond005: "
        "https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm"
    ),
)

iface.launch()