Mihaj committed on
Commit
45e3e7f
·
verified ·
1 Parent(s): 72e979a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -23
app.py CHANGED
@@ -41,27 +41,7 @@ def transcribe(diarise, how_diarise, audio):
41
  y, sr = sf.read(audio)
42
  print(diarise)
43
  if diarise:
44
- if how_diarise=="FastButLowQuality" or how_diarise=="-":
45
- print("DIARISING")
46
- wav = read_audio(audio) # backend (sox, soundfile, or ffmpeg) required!
47
- speech_timestamps = get_speech_timestamps(wav, model, speech_pad_ms=80, min_silence_duration_ms=150, window_size_samples=256)
48
- print("DIARISING ENDED")
49
- lines = []
50
- for i, line in enumerate(speech_timestamps):
51
- start = line['start']
52
- start_time = str(datetime.fromtimestamp(start / sr)).split()[1]
53
- start_time_prts = start_time.split(":")
54
- start_time_srt = f"{start_time_prts[0]}:{start_time_prts[1]}:{float(start_time_prts[2]):.3f}".replace('.', ',')
55
- end = line['end']
56
- end_time = str(datetime.fromtimestamp(end / sr)).split()[1]
57
- end_time_prts = end_time.split(":")
58
- end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
59
- print(f"RECOGNISING LINE_{i} T_START{start_time} T_END{end_time}")
60
- trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
61
- lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
62
- print("RECOGNISING ENDED")
63
- print(f"LINE RESULT {trans}")
64
- elif how_diarise=="SlowButHighQuality":
65
  print("DIARISING")
66
  dia = pipeline_dia(audio)
67
  print("DIARISING ENDED")
@@ -78,11 +58,33 @@ def transcribe(diarise, how_diarise, audio):
78
  end_time_prts = end_time.split(":")
79
  end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
80
  label = res[2]
81
- print(f"RECOGNISING LINE_{i} T_START{res[0]} T_END{res[1]} SPEAKER_{label}")
82
  trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
83
  lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
84
  print("RECOGNISING ENDED")
85
  print(f"LINE RESULT {trans}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  text = "\n".join(lines)
87
  else:
88
  print("RECOGNISING FULL AUDIO")
@@ -93,7 +95,7 @@ def transcribe(diarise, how_diarise, audio):
93
 
94
  iface = gr.Interface(
95
  fn=transcribe,
96
- inputs=[gr.Checkbox(label="Diarise", info="Do you want subtitles?"), gr.Radio(["FastButLowQuality", "SlowButHighQuality", "-"], label="Diarise_Variant", info="You can choose separating on smaller pieces by faster yet low quality variant (Silero VAD), or slower variant yet high quality variant (Pyannote.Diarization, this variant will detect different speakers)"), gr.Audio(type="filepath")],
97
  outputs="text",
98
  title="Wav2Vec2 RuOH",
99
  description=r"Realtime demo for Russian Oral History recognition using several diarizations method (Silero VAD, Pyannote) and a Wav2Vec large model from bond005. https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm",
 
41
  y, sr = sf.read(audio)
42
  print(diarise)
43
  if diarise:
44
+ if how_diarise=="SlowButHighQuality":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  print("DIARISING")
46
  dia = pipeline_dia(audio)
47
  print("DIARISING ENDED")
 
58
  end_time_prts = end_time.split(":")
59
  end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
60
  label = res[2]
61
+ print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt} SPEAKER_{label}")
62
  trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
63
  lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
64
  print("RECOGNISING ENDED")
65
  print(f"LINE RESULT {trans}")
66
+ else:
67
+ print("DIARISING")
68
+ wav = read_audio(audio) # backend (sox, soundfile, or ffmpeg) required!
69
+ speech_timestamps = get_speech_timestamps(wav, model, speech_pad_ms=80, min_silence_duration_ms=150, window_size_samples=256)
70
+ print("DIARISING ENDED")
71
+ lines = []
72
+ for i, line in enumerate(speech_timestamps):
73
+ start = line['start']
74
+ print(start)
75
+ start_time = str(datetime.fromtimestamp(start / sr)).split()[1]
76
+ start_time_prts = start_time.split(":")
77
+ start_time_srt = f"{start_time_prts[0]}:{start_time_prts[1]}:{float(start_time_prts[2]):.3f}".replace('.', ',')
78
+ print(start_time_srt)
79
+ end = line['end']
80
+ end_time = str(datetime.fromtimestamp(end / sr)).split()[1]
81
+ end_time_prts = end_time.split(":")
82
+ end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
83
+ print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt}")
84
+ trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
85
+ lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
86
+ print("RECOGNISING ENDED")
87
+ print(f"LINE RESULT {trans}")
88
  text = "\n".join(lines)
89
  else:
90
  print("RECOGNISING FULL AUDIO")
 
95
 
96
  iface = gr.Interface(
97
  fn=transcribe,
98
+ inputs=[gr.Checkbox(label="Diarise", info="Do you want subtitles?"), gr.Radio(["FastButLowQuality", "SlowButHighQuality", "-"], label="Diarise_Variant", info="You can choose separating on smaller pieces by faster yet low quality variant (Silero VAD), or slower yet high quality variant (Pyannote.Diarization, this option will detect different speakers)"), gr.Audio(type="filepath")],
99
  outputs="text",
100
  title="Wav2Vec2 RuOH",
101
  description=r"Realtime demo for Russian Oral History recognition using several diarizations method (Silero VAD, Pyannote) and a Wav2Vec large model from bond005. https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm",