Spaces:
Runtime error
Runtime error
Commit
·
2cce248
1
Parent(s):
ee9f03e
fixed srt subtitles
Browse files- README.md +3 -2
- app.py +13 -13
- transcription.py +6 -6
README.md
CHANGED
@@ -44,6 +44,7 @@ The user will log in using a password and username specified by me. That user and p
|
|
44 |
- [ ] Introduce SRT as output
|
45 |
- [ ] Obtain txt with Diarization.
|
46 |
- [ ] Obtain plain txt with segments.
|
47 |
-
- [ ] Introduce POS
|
48 |
|
49 |
-
|
|
|
|
44 |
- [ ] Introduce SRT as output
|
45 |
- [ ] Obtain txt with Diarization.
|
46 |
- [ ] Obtain plain txt with segments.
|
47 |
+
- [ ] Introduce POS.
|
48 |
|
49 |
+
|
50 |
+
Introduce a tab for analysis including POS. Maybe it would be great to have a visualizer with the timestamps and other features in Streamlit. Perhaps also corrections.
|
app.py
CHANGED
@@ -47,7 +47,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
|
|
47 |
vocal_path = mp3_to_wav(vocal_path, "vocal")
|
48 |
|
49 |
#result = fast_transcription(vocal_path, model, "es")
|
50 |
-
|
51 |
|
52 |
#out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]
|
53 |
|
@@ -58,17 +58,17 @@ def transcribeWhisperX(audiofile, model, language, patiente,
|
|
58 |
##########################################################################
|
59 |
import whisperx
|
60 |
from pathlib import Path
|
61 |
-
device = "cuda"
|
62 |
-
model_a, metadata = whisperx.load_align_model(
|
63 |
-
|
64 |
-
)
|
65 |
-
result_aligned = whisperx.align(
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
)
|
72 |
import datetime
|
73 |
fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
74 |
|
@@ -88,7 +88,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
|
|
88 |
# write_srt(result_aligned["segments"], file=srt)
|
89 |
###########################################################################
|
90 |
|
91 |
-
return audio_path, audio_normalized_path, vocal_path, novocal_path, vocal_path, guardar_dataframe_en_csv(diarize_segments), json.dumps(
|
92 |
|
93 |
|
94 |
transcribeI = gr.Interface(
|
|
|
47 |
vocal_path = mp3_to_wav(vocal_path, "vocal")
|
48 |
|
49 |
#result = fast_transcription(vocal_path, model, "es")
|
50 |
+
result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model)
|
51 |
|
52 |
#out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]
|
53 |
|
|
|
58 |
##########################################################################
|
59 |
import whisperx
|
60 |
from pathlib import Path
|
61 |
+
# device = "cuda"
|
62 |
+
# model_a, metadata = whisperx.load_align_model(
|
63 |
+
# language_code="es", device=device
|
64 |
+
# )
|
65 |
+
# result_aligned = whisperx.align(
|
66 |
+
# result["segments"],
|
67 |
+
# model_a,
|
68 |
+
# metadata,
|
69 |
+
# vocal_path,
|
70 |
+
# device=device,
|
71 |
+
# )
|
72 |
import datetime
|
73 |
fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
74 |
|
|
|
88 |
# write_srt(result_aligned["segments"], file=srt)
|
89 |
###########################################################################
|
90 |
|
91 |
+
return audio_path, audio_normalized_path, vocal_path, novocal_path, vocal_path, guardar_dataframe_en_csv(diarize_segments), json.dumps(result_speakers)
|
92 |
|
93 |
|
94 |
transcribeI = gr.Interface(
|
transcription.py
CHANGED
@@ -32,8 +32,8 @@ import psutil
|
|
32 |
import whisperx
|
33 |
import gc
|
34 |
|
35 |
-
def doWhisperX(audio_file, whisper_model="large-v2"):
|
36 |
-
device = "cuda"
|
37 |
#audio_file = "audio.mp3"
|
38 |
batch_size = 16 # reduce if low on GPU mem
|
39 |
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
|
@@ -42,7 +42,7 @@ def doWhisperX(audio_file, whisper_model="large-v2"):
|
|
42 |
model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
|
43 |
|
44 |
audio = whisperx.load_audio(audio_file)
|
45 |
-
result = model.transcribe(audio, batch_size=batch_size)
|
46 |
#print(result["segments"]) # before alignment
|
47 |
|
48 |
# delete model if low on GPU resources
|
@@ -50,7 +50,7 @@ def doWhisperX(audio_file, whisper_model="large-v2"):
|
|
50 |
|
51 |
# 2. Align whisper output
|
52 |
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
|
53 |
-
|
54 |
|
55 |
#print(result["segments"]) # after alignment
|
56 |
|
@@ -64,10 +64,10 @@ def doWhisperX(audio_file, whisper_model="large-v2"):
|
|
64 |
diarize_segments = diarize_model(audio)
|
65 |
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
|
66 |
|
67 |
-
|
68 |
#print(diarize_segments)
|
69 |
#print(result["segments"]) # segments are now assigned speaker IDs
|
70 |
-
return
|
71 |
|
72 |
embedding_model = PretrainedSpeakerEmbedding(
|
73 |
"speechbrain/spkrec-ecapa-voxceleb",
|
|
|
32 |
import whisperx
|
33 |
import gc
|
34 |
|
35 |
+
def doWhisperX(audio_file, whisper_model="large-v2", language="es"):
|
36 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
37 |
#audio_file = "audio.mp3"
|
38 |
batch_size = 16 # reduce if low on GPU mem
|
39 |
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
|
|
|
42 |
model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
|
43 |
|
44 |
audio = whisperx.load_audio(audio_file)
|
45 |
+
result = model.transcribe(audio, language=language, batch_size=batch_size)
|
46 |
#print(result["segments"]) # before alignment
|
47 |
|
48 |
# delete model if low on GPU resources
|
|
|
50 |
|
51 |
# 2. Align whisper output
|
52 |
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
|
53 |
+
result_aligned = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
|
54 |
|
55 |
#print(result["segments"]) # after alignment
|
56 |
|
|
|
64 |
diarize_segments = diarize_model(audio)
|
65 |
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
|
66 |
|
67 |
+
result_speakers = whisperx.assign_word_speakers(diarize_segments, result_aligned)
|
68 |
#print(diarize_segments)
|
69 |
#print(result["segments"]) # segments are now assigned speaker IDs
|
70 |
+
return result_aligned, result_speakers, diarize_segments
|
71 |
|
72 |
embedding_model = PretrainedSpeakerEmbedding(
|
73 |
"speechbrain/spkrec-ecapa-voxceleb",
|