import gradio as gr
from transformers import pipeline, Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
import os
import soundfile as sf
from pyannote.audio import Pipeline
import torch
from pydub import AudioSegment
from pydub.playback import play
from datetime import datetime, timedelta
import time
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
# The code below calls an undefined `translator`; googletrans is assumed here,
# since its Translator.translate(text, src=..., dest=...).text matches those calls.
from googletrans import Translator
HF_TOKEN = os.environ.get("HF_TOKEN")
sr = 16000
channels = 1
model_name = "Mihaj/wav2vec2-large-xls-r-300m-ruOH-alphav"
bond005_model = "bond005/wav2vec2-large-ru-golos-with-lm"
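# ASR: bond005's Russian wav2vec2 model with an LM-backed CTC decoder, wrapped in a transformers ASR pipeline.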
processor = Wav2Vec2ProcessorWithLM.from_pretrained(bond005_model)
model = Wav2Vec2ForCTC.from_pretrained(bond005_model)
pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor, feature_extractor=processor.feature_extractor, decoder=processor.decoder)
vad_model = load_silero_vad()  # Silero VAD model, used by the "Fast" segmentation option
pipeline_dia = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
                                        use_auth_token=HF_TOKEN)
translator = Translator()  # assumed googletrans instance for the optional English translation
temp_path = "temp.wav"
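
# Small helper used by fast_transcribe below to format segment boundaries.
def to_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    hours, rem = divmod(seconds, 3600)
    minutes, secs = divmod(rem, 60)
    return f"{int(hours):02d}:{int(minutes):02d}:{secs:06.3f}".replace('.', ',')
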
def preprocess(audio_path):
    print("PREPROCESSING STARTED")
    sound = AudioSegment.from_file(audio_path)  # let pydub/ffmpeg detect the input format instead of forcing mp3
    sound = sound.set_frame_rate(sr)
    sound = sound.set_channels(channels)
    sound.export(temp_path, format="wav")
    print("PREPROCESSING ENDED")
    return temp_path
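
# Transcribe an uploaded recording: optionally split it into segments (pyannote speaker
# diarization for "Accurate", Silero VAD for "Fast"), run each segment through the ASR
# pipeline, and return either plain text or SRT-style subtitle blocks.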
def fast_transcribe(diarise, how_diarise, translate, audio):
    audio = preprocess(audio)
    y, sr = sf.read(audio)
    if diarise:
        if how_diarise == "Accurate":
            print("DIARISING")
            dia = pipeline_dia(audio)
            print("DIARISING ENDED")
            lines = []
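            # Each non-empty line of dia.to_lab() is parsed as "start end speaker", with times in seconds.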
            for i, line in enumerate(dia.to_lab().split('\n')):
                if line.strip() != "":
                    res = line.split(" ")
                    start = int(float(res[0]) * sr)
                    start_time_srt = to_srt_time(start / sr)
                    end = int(float(res[1]) * sr)
                    end_time_srt = to_srt_time(end / sr)
                    label = res[2]
                    print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt} SPEAKER_{label}")
                    trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
                    if not translate:
                        lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
                    else:
                        print("TRANSLATION STARTED")
                        trans_eng = translator.translate(trans, src='ru', dest="en").text
                        print(f"TRANSLATION ENDED RESULT {trans_eng}")
                        lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n[{label}] {trans_eng}\n")
                    print("RECOGNISING ENDED")
                    print(f"LINE RESULT {trans}")
        else:
            print("DIARISING")
            wav = read_audio(audio)  # backend (sox, soundfile, or ffmpeg) required!
            speech_timestamps = get_speech_timestamps(wav, vad_model, speech_pad_ms=80, min_silence_duration_ms=150, window_size_samples=256)
            print("DIARISING ENDED")
            lines = []
            for i, line in enumerate(speech_timestamps):
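                # Silero VAD yields segments as dicts with 'start'/'end' given in samples.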
                start = line['start']
                print(start)
                start_time_srt = to_srt_time(start / sr)
                print(start_time_srt)
                end = line['end']
                end_time_srt = to_srt_time(end / sr)
                print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt}")
                trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
                print("RECOGNISING ENDED")
                if not translate:
                    lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
                else:
                    print("TRANSLATION STARTED")
                    trans_eng = translator.translate(trans, src='ru', dest="en").text
                    print(f"TRANSLATION ENDED RESULT {trans_eng}")
                    lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n{trans_eng}\n")
                print(f"LINE RESULT {trans}")
        text = "\n".join(lines)
    else:
        print("RECOGNISING FULL AUDIO")
        res = pipe(y, chunk_length_s=10, stride_length_s=(4, 2))
        print("RECOGNISING FULL AUDIO ENDED")
        text = res["text"]
    return text
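
# Gradio UI: checkboxes for subtitles and translation, a radio button for the
# segmentation method, a file-upload audio input, and a textbox for the result.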
with gr.Blocks() as demo:
    gr.Markdown("""
# Wav2Vec2 RuOH
Realtime demo for Russian Oral History recognition using several diarization methods (Silero VAD, Pyannote) and a large Wav2Vec2 model from bond005: https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm
""")
    with gr.Tab("Fast Transcription"):
        with gr.Row():
            with gr.Column():
                fast_diarize_input = gr.Checkbox(label="Subtitles", info="Do you want subtitles?")
                fast_diarize_radio_input = gr.Radio(["Fast", "Accurate", "-"], label="Subtitle segmentation method", info="Split the audio into smaller pieces with a faster but lower-quality method (Silero VAD) or a slower but higher-quality one (Pyannote diarization, which also distinguishes speakers).")
                fast_translate_input = gr.Checkbox(label="Translate", info="Do you want a translation to English?")
                fast_audio_input = gr.Audio(type="filepath")
                fast_output = gr.Textbox()
                fast_inputs = [fast_diarize_input, fast_diarize_radio_input, fast_translate_input, fast_audio_input]
                fast_recognize_button = gr.Button("Run")
                fast_recognize_button.click(fast_transcribe, inputs=fast_inputs, outputs=fast_output)
if __name__ == "__main__":
    demo.launch()