import os

import gradio as gr
import soundfile as sf
from googletrans import Translator  # assumed: the original used `translator` without importing it; googletrans matches the call signature below
from pyannote.audio import Pipeline
from pydub import AudioSegment
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
from transformers import pipeline, Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC

HF_TOKEN = os.environ.get("HF_TOKEN")
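# The acoustic model expects 16 kHz mono audio; preprocess() converts every upload to this format.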
sr = 16000
channels = 1

model_name = "Mihaj/wav2vec2-large-xls-r-300m-ruOH-alphav"  # alternative checkpoint (not used below)
bond005_model = "bond005/wav2vec2-large-ru-golos-with-lm"
# Wav2Vec2 CTC acoustic model with an n-gram LM decoder, wrapped in an ASR pipeline.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(bond005_model)
model = Wav2Vec2ForCTC.from_pretrained(bond005_model)
pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor, feature_extractor=processor.feature_extractor, decoder=processor.decoder)
# Silero VAD model for the fast segmentation option, named so it does not shadow the Wav2Vec2 model above.
vad_model = load_silero_vad()
# The original referenced `translator` without defining it; googletrans (assumed) matches the calls below.
translator = Translator()

# Pyannote speaker diarization pipeline, used for the "Accurate" segmentation option.
pipeline_dia = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
                                        use_auth_token=HF_TOKEN)


temp_path = "temp.wav"

def preprocess(audio_path):
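  """Convert the uploaded file to 16 kHz mono WAV and return the path to the temporary copy."""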
  print("PREPROCESSING STARTED")
  sound = AudioSegment.from_file(audio_path)  # let ffmpeg auto-detect the container instead of assuming mp3
  sound = sound.set_frame_rate(sr)
  sound = sound.set_channels(channels)
  sound.export(temp_path, format="wav")  
  print("PREPROCESSING ENDED")
  return temp_path
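
# srt_timestamp is a small helper introduced to replace the original timezone-dependent
# datetime.fromtimestamp(...) - timedelta(hours=1) arithmetic, which only produced correct
# subtitle timestamps on machines whose local timezone is UTC+1.
def srt_timestamp(sample, sample_rate):
  """Convert a sample index into an SRT-style HH:MM:SS,mmm timestamp."""
  total_seconds = sample / sample_rate
  hours, rem = divmod(int(total_seconds), 3600)
  minutes, seconds = divmod(rem, 60)
  millis = int((total_seconds - int(total_seconds)) * 1000)
  return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"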

def fast_transcribe(diarise, how_diarise, translate, audio):
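    """Transcribe `audio`, optionally splitting it into subtitle segments (Silero VAD or pyannote
    diarization) and translating each segment to English. Returns plain text or SRT-style subtitles."""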
    audio = preprocess(audio)
    y, sr = sf.read(audio)
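    # Three paths: pyannote diarization ("Accurate"), Silero VAD segmentation (any other choice), or, without subtitles, a single pass over the whole file.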
    if diarise:
        if how_diarise=="Accurate":
            print("DIARISING")
            dia = pipeline_dia(audio)
            print("DIARISING ENDED")
            lines = []
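            # dia.to_lab() yields one "start end speaker" line per segment, with times in seconds.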
            for i, line in enumerate(dia.to_lab().split('\n')):
                if line.strip() != "":
                    res = line.split(" ")
                    start = int(float(res[0]) * sr)
                    end = int(float(res[1]) * sr)
                    start_time_srt = srt_timestamp(start, sr)
                    end_time_srt = srt_timestamp(end, sr)
                    label = res[2]
                    print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt} SPEAKER_{label}")
                    trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
                    if not translate:
                      lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
                    else:
                      print("TRANSLATION STARTED")
                      trans_eng = translator.translate(trans, src='ru', dest="en").text
                      print(f"TRANSLATION ENDED RESULT {trans_eng}")
                      lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n[{label}] {trans_eng}\n")
                    print("RECOGNISING ENDED")
                    print(f"LINE RESULT {trans}")
        else:
            print("DIARISING")
            wav = read_audio(audio)  # a backend (sox, soundfile, or ffmpeg) is required
            speech_timestamps = get_speech_timestamps(wav, vad_model, speech_pad_ms=80, min_silence_duration_ms=150, window_size_samples=256)
            print("DIARISING ENDED")
            lines = []
            for i, line in enumerate(speech_timestamps):
                start = line['start']
                end = line['end']
                start_time_srt = srt_timestamp(start, sr)
                end_time_srt = srt_timestamp(end, sr)
                print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt}")
                trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
                print("RECOGNISING ENDED")
                if not translate:
                  lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{trans}\n")
                else:
                  print("TRANSLATION STARTED")
                  trans_eng = translator.translate(trans, src='ru', dest="en").text
                  print(f"TRANSLATION ENDED RESULT {trans_eng}")
                  lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n{trans_eng}\n")
                
                print(f"LINE RESULT {trans}")
        text = "\n".join(lines)
    else:
        print("RECOGNISING FULL AUDIO")
        res = pipe(y, chunk_length_s=10, stride_length_s=(4, 2))
        print("RECOGNISING FULL AUDIO ENDED")
        text = res["text"]
    return text
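
# Gradio UI: a single tab with the segmentation and translation options, an audio input, and a textbox for the result.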

with gr.Blocks() as demo:
    gr.Markdown("""
    # Wav2Vec2 RuOH
    Realtime demo of Russian Oral History recognition using several segmentation methods (Silero VAD, pyannote) and a large Wav2Vec2 model from bond005: https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm
    """)
    with gr.Tab("Fast Translation"):

        with gr.Row():
          with gr.Column():
            fast_diarize_input = gr.Checkbox(label="Subtitles", info="Do you want subtitles?")
            fast_diarize_radio_input = gr.Radio(["Fast", "Accurate", "-"], label="Subtitle segmentation option", info="Choose how the audio is split into smaller pieces: a faster but lower-quality variant (Silero VAD), or a slower but higher-quality variant (pyannote diarization, which also detects different speakers)")
            fast_translate_input = gr.Checkbox(label="Translate", info="Do you want translation to English?")
            fast_audio_input = gr.Audio(type="filepath")

          fast_output = gr.Textbox()

        fast_inputs = [fast_diarize_input, fast_diarize_radio_input, fast_translate_input, fast_audio_input]
        fast_recognize_button = gr.Button("Run")


    fast_recognize_button.click(fast_transcribe, inputs=fast_inputs, outputs=fast_output)

if __name__ == "__main__":
    demo.launch()