salmanmapkar committed · Commit b927090 · 1 parent: f848bd7
Update app.py

app.py CHANGED
@@ -22,14 +22,38 @@ from pyannote.audio import Audio
 from pyannote.core import Segment
 import wave
 import contextlib
-from sklearn.cluster import
-
+from sklearn.cluster import AgglomerativeClustering
 import numpy as np
 import json
 from datetime import timedelta
 
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+
 __FILES = set()
-
+wispher_models = list(whisper._MODELS.keys())
+
+def correct_grammar(input_text,num_return_sequences=1):
+    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    tokenizer = T5Tokenizer.from_pretrained('deep-learning-analytics/GrammarCorrector')
+    model = T5ForConditionalGeneration.from_pretrained('deep-learning-analytics/GrammarCorrector').to(torch_device)
+    batch = tokenizer([input_text],truncation=True,padding='max_length',max_length=len(input_text), return_tensors="pt").to(torch_device)
+    results = model.generate(**batch,max_length=len(input_text),num_beams=2, num_return_sequences=num_return_sequences, temperature=1.5)
+    generated_sequences = []
+    for generated_sequence_idx, generated_sequence in enumerate(results):
+        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
+        generated_sequences.append(text)
+    generated_text = "".join(generated_sequences)
+    _generated_text = ""
+    for idx, _sentence in enumerate(generated_text.split('.'), 0):
+        if not idx:
+            _generated_text+=_sentence+'.'
+        elif _sentence[:1]!=' ':
+            _generated_text+=' '+_sentence+'.'
+        elif _sentence[:1]=='':
+            pass
+        else:
+            _generated_text+=_sentence+'.'
+    return _generated_text
 
 def CreateFile(filename):
     __FILES.add(filename)
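The hunk above wires in deep-learning-analytics/GrammarCorrector (a T5 checkpoint) as a post-processing step. A minimal sketch of driving the same checkpoint directly, assuming transformers and torch are installed, with a fixed max_length and a made-up sample sentence instead of the helper's len(input_text) heuristic:

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the same grammar-correction checkpoint the new helper uses.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = T5Tokenizer.from_pretrained('deep-learning-analytics/GrammarCorrector')
model = T5ForConditionalGeneration.from_pretrained('deep-learning-analytics/GrammarCorrector').to(device)

sample = "he go to school every days"  # hypothetical input sentence
batch = tokenizer([sample], truncation=True, padding='max_length',
                  max_length=64, return_tensors="pt").to(device)
outputs = model.generate(**batch, max_length=64, num_beams=2, num_return_sequences=1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))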
@@ -140,14 +164,16 @@ def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
     return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
 
 
-def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
-    model = whisper.load_model(
+def Transcribe_V2(model, num_speakers, speaker_names, audio="temp_audio.wav"):
+    model = whisper.load_model(model)
     # embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
+
     embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
         "speechbrain/spkrec-ecapa-voxceleb",
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     )
     SPEAKER_DICT = {}
+    default_speaker_names = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
     SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
     def GetSpeaker(sp):
         speaker = sp
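Transcribe_V2 now receives the Whisper checkpoint name as its first argument, and wispher_models is built from whisper._MODELS.keys(). A short sketch of that lookup and load, assuming the openai-whisper package:

import whisper

# The same list the new wispher_models variable exposes to the UI dropdowns.
print(list(whisper._MODELS.keys()))  # e.g. ['tiny.en', 'tiny', 'base.en', 'base', 'small', ...]

model = whisper.load_model("base")           # what Transcribe_V2 now does with the chosen size
result = model.transcribe("temp_audio.wav")  # the temp file the app prepares via ffmpeg
print(result["text"])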
@@ -156,6 +182,10 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
                 t = SPEAKERS.pop(0)
                 SPEAKER_DICT[sp] = t
                 speaker = SPEAKER_DICT[sp]
+            elif len(default_speaker_names):
+                t = default_speaker_names.pop(0)
+                SPEAKER_DICT[sp] = t
+                speaker = SPEAKER_DICT[sp]
         else:
             speaker = SPEAKER_DICT[sp]
         return speaker
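This hunk adds a fallback so diarized speakers beyond the user-supplied names still get readable labels. A simplified, self-contained sketch of the mapping behaviour (the speaker keys here are hypothetical):

# Simplified version of GetSpeaker's lookup order: user names first, then 'A', 'B', ...
SPEAKER_DICT = {}
SPEAKERS = ["Alice"]  # e.g. the user only named one speaker
default_speaker_names = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

def get_speaker(sp):
    if sp not in SPEAKER_DICT:
        if SPEAKERS:
            SPEAKER_DICT[sp] = SPEAKERS.pop(0)
        elif default_speaker_names:
            SPEAKER_DICT[sp] = default_speaker_names.pop(0)
    return SPEAKER_DICT.get(sp, sp)

print(get_speaker("SPEAKER_00"))  # Alice
print(get_speaker("SPEAKER_01"))  # A
print(get_speaker("SPEAKER_00"))  # Alice (cached)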
@@ -168,6 +198,9 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
         return s
     as_audio = AudioSegment.from_wav(audio)
     DEMO_FILE = {'uri': 'blabal', 'audio': audio}
+    hparams = pipeline.parameters(instantiated=True)
+    hparams["segmentation"]["min_duration_off"] -= 0.25
+    pipeline.instantiate(hparams)
     if num_speakers:
         dz = pipeline(DEMO_FILE, num_speakers=num_speakers)
     else:
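The three added lines lower the diarization pipeline's min_duration_off before it runs, so short pauses are less likely to be smoothed over. A hedged sketch of the same round-trip, assuming a pyannote.audio 2.x pipeline (loading the pretrained pipeline may require a Hugging Face token):

from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
hparams = pipeline.parameters(instantiated=True)     # nested dict of current hyper-parameters
hparams["segmentation"]["min_duration_off"] -= 0.25  # shrink the non-speech gap threshold
pipeline.instantiate(hparams)                        # re-apply the tweaked values
dz = pipeline({"uri": "demo", "audio": "temp_audio.wav"}, num_speakers=2)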
@@ -201,6 +234,8 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
         # conversation.append([GetSpeaker(segment["speaker"]), segment["text"][1:]]) # segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
         # conversation[-1][1] += segment["text"][1:]
         # return output
+        for idx in range(len(conversation)):
+            conversation[idx][3] = correct_grammar(conversation[idx][3])
         return ("".join([f"[{start}] - {speaker} \n{text}\n" for start, end, speaker, text in conversation])), ({ "data": [{"start": start, "end":end, "speaker": speaker, "text": text} for start, end, speaker, text in conversation]})
 
     def get_duration(path):
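Each conversation row built in this function is [start, end, speaker, text], so the added loop rewrites the text field through correct_grammar before the transcript is joined. A tiny sketch with hypothetical rows, assuming the helper defined at the top of the file:

# Post-processing pass over conversation rows ([start, end, speaker, text]).
conversation = [
    ["0:00:01", "0:00:04", "Speaker 1", "he go to school every days"],  # hypothetical
    ["0:00:05", "0:00:09", "Speaker 2", "me too , i be going often"],   # hypothetical
]
for idx in range(len(conversation)):
    conversation[idx][3] = correct_grammar(conversation[idx][3])  # grammar-correct the text field
print("".join(f"[{start}] - {speaker} \n{text}\n" for start, end, speaker, text in conversation))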
@@ -224,7 +259,7 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
         return embedding_model(waveform[None])
 
     def add_speaker_labels(segments, embeddings, num_speakers):
-        clustering =
+        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
         labels = clustering.labels_
         for i in range(len(segments)):
             segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
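With the clustering call restored, add_speaker_labels groups the per-segment speaker embeddings into num_speakers clusters and names segments after their cluster. A self-contained sketch of the same call on dummy data (192 dimensions to match the ECAPA embeddings used above):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

num_speakers = 2
embeddings = np.random.rand(10, 192)             # one embedding per transcript segment (dummy)
clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
labels = clustering.labels_                      # cluster id per segment
print(['SPEAKER ' + str(label + 1) for label in labels])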
@@ -236,9 +271,9 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     if duration > 4 * 60 * 60:
         return "Audio duration too long"
 
-    print(json.dumps(diarization(audio)))
+    # print(json.dumps(diarization(audio)))
     result = model.transcribe(audio)
-    print(json.dumps(result))
+    # print(json.dumps(result))
 
     segments = result["segments"]
 
@@ -251,7 +286,7 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     return get_output(segments)
     # return output
 
-def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
+def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5, model='base'):
     print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
     if retries:
         # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
@@ -262,11 +297,11 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5)
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
-        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")
 
-def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
+def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5, model='base'):
     if retries:
         try:
             clip = mp.VideoFileClip(video)
@@ -278,12 +313,11 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5)
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
-        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
-        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
 
-def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
+def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5, model='base'):
     if retries:
         if "youtu" not in URL.lower():
             raise gr.Error(f"{URL} is not a valid youtube URL.")
@@ -305,42 +339,28 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries =
         stream = ffmpeg.input('temp_audio.m4a')
         stream = ffmpeg.output(stream, 'temp_audio.wav')
         RemoveFile("temp_audio.m4a")
-        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error(f"Unable to get video from {URL}")
 
-ut = gr.Interface(
-    fn=YoutubeTranscribe,
-    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
-    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
-)
-vt = gr.Interface(
-    fn=VideoTranscribe,
-    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'video'],
-    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
-)
-at = gr.Interface(
-    fn=AudioTranscribe,
-    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'audio'],
-    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
-)
 
-# demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
-# demo.launch()
 with gr.Blocks() as yav_ui:
     with gr.Row():
         with gr.Column():
             with gr.Tab("Youtube", id=1):
+                ysz = gr.Dropdown(label="Model Size", choices=wispher_models , value='base')
                 yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
                 yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
                 yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
                 ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
             with gr.Tab("Video", id=2):
+                vsz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
                 vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
                 vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
                 vinput = gr.Video(label="Video")
                 vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
             with gr.Tab("Audio", id=3):
+                asz = gr.Dropdown(label="Model Size", choices=wispher_models , value='base')
                 ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
                 ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
                 ainput = gr.Audio(label="Audio", type="filepath")
@@ -352,17 +372,17 @@ with gr.Blocks() as yav_ui:
             output_json = gr.JSON(label="Transcribed JSON")
     ybutton_transcribe.click(
         fn=YoutubeTranscribe,
-        inputs=[yinput_nos,yinput_sn,yinput],
+        inputs=[yinput_nos,yinput_sn,yinput, ysz],
         outputs=[output_textbox,output_json]
     )
     abutton_transcribe.click(
         fn=AudioTranscribe,
-        inputs=[ainput_nos,ainput_sn,ainput],
+        inputs=[ainput_nos,ainput_sn,ainput, asz],
         outputs=[output_textbox,output_json]
     )
     vbutton_transcribe.click(
         fn=VideoTranscribe,
-        inputs=[vinput_nos,vinput_sn,vinput],
+        inputs=[vinput_nos,vinput_sn,vinput, vsz],
         outputs=[output_textbox,output_json]
     )
 yav_ui.launch(debug=True)
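Taken together, the UI hunks add a Model Size dropdown to each tab and append it (ysz, vsz, asz) to the corresponding click inputs, which is how the Whisper checkpoint choice reaches the transcribe functions. A stripped-down sketch of that wiring pattern with a hypothetical stub handler:

import gradio as gr
import whisper

def transcribe_stub(num_speakers, speaker_names, url, model_size='base'):
    # Hypothetical stand-in for YoutubeTranscribe and friends.
    return f"would run the '{model_size}' model on {url} for {num_speakers} speakers"

with gr.Blocks() as demo:
    size = gr.Dropdown(label="Model Size", choices=list(whisper._MODELS.keys()), value='base')
    nos = gr.Number(label="Number of Speakers")
    names = gr.Textbox(label="Name of the Speakers")
    link = gr.Textbox(label="Youtube Link")
    out = gr.Textbox(label="Transcribed Text")
    gr.Button("Transcribe").click(fn=transcribe_stub, inputs=[nos, names, link, size], outputs=[out])

demo.launch()

One thing to watch: Gradio passes inputs positionally, and in the updated signatures the new model keyword sits after retries, so the dropdown value is received as the retries argument rather than model unless the parameter order (or a keyword-style wrapper) accounts for it.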