katospiegel committed
Commit 2cce248 · Parent: ee9f03e

fixed SRT subtitles

Files changed (3)
  1. README.md +3 -2
  2. app.py +13 -13
  3. transcription.py +6 -6
README.md CHANGED
@@ -44,6 +44,7 @@ The user will logging using a password and user specified by me. That user and p
 - [ ] Introduce SRT as output
 - [ ] Obtain txt with Diarization.
 - [ ] Obtain plain txt with segments.
-- [ ] Introduce POS
+- [ ] Introduce POS.
 
-Introduce a segment in the app for POS analysis. Perhaps also corrections.
+
+Introduce a tab for analysis including POS. Maybe it would be great to have a visualizer with the timestamps and other features in Streamlit. Perhaps also corrections.
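The new note plans a POS-analysis tab. For reference, a minimal sketch of what that step could look like, assuming spaCy and its Spanish pipeline es_core_news_sm (neither is part of this commit), applied to one transcript segment:

# Hypothetical POS sketch; assumes spaCy and its Spanish model are installed:
#   pip install spacy && python -m spacy download es_core_news_sm
import spacy

nlp = spacy.load("es_core_news_sm")  # Spanish pipeline, not used anywhere in this repo yet

def pos_tags(segment_text):
    """Return (token, coarse POS tag) pairs for one transcript segment."""
    return [(tok.text, tok.pos_) for tok in nlp(segment_text)]

# pos_tags("Hola, ¿cómo estás?") -> e.g. [("Hola", "INTJ"), (",", "PUNCT"), ...]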
app.py CHANGED
@@ -47,7 +47,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
     vocal_path = mp3_to_wav(vocal_path, "vocal")
 
     #result = fast_transcription(vocal_path, model, "es")
-    result, diarize_segments = doWhisperX(vocal_path, whisper_model="large-v2")
+    result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model)
 
     #out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]
 
@@ -58,17 +58,17 @@ def transcribeWhisperX(audiofile, model, language, patiente,
     ##########################################################################
     import whisperx
     from pathlib import Path
-    device = "cuda"
-    model_a, metadata = whisperx.load_align_model(
-        language_code="es", device=device
-    )
-    result_aligned = whisperx.align(
-        result["segments"],
-        model_a,
-        metadata,
-        vocal_path,
-        device=device,
-    )
+    # device = "cuda"
+    # model_a, metadata = whisperx.load_align_model(
+    #     language_code="es", device=device
+    # )
+    # result_aligned = whisperx.align(
+    #     result["segments"],
+    #     model_a,
+    #     metadata,
+    #     vocal_path,
+    #     device=device,
+    # )
     import datetime
     fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
 
@@ -88,7 +88,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
     # write_srt(result_aligned["segments"], file=srt)
     ###########################################################################
 
-    return audio_path, audio_normalized_path, vocal_path, novocal_path, vocal_path, guardar_dataframe_en_csv(diarize_segments), json.dumps(result)
+    return audio_path, audio_normalized_path, vocal_path, novocal_path, vocal_path, guardar_dataframe_en_csv(diarize_segments), json.dumps(result_speakers)
 
 
 transcribeI = gr.Interface(
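The write_srt call in the last hunk is still commented out. Since this commit is about SRT output, here is a minimal sketch of what such a helper could look like, written against the start/end/text keys that WhisperX aligned segments carry; this is an illustration, not the project's actual write_srt:

def format_timestamp(seconds):
    """Format seconds as an SRT timestamp, HH:MM:SS,mmm."""
    ms = int(round(seconds * 1000))
    hours, ms = divmod(ms, 3_600_000)
    minutes, ms = divmod(ms, 60_000)
    secs, ms = divmod(ms, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"

def write_srt(segments, file):
    """Write aligned segments (dicts with start/end/text keys) as numbered SRT cues."""
    for i, seg in enumerate(segments, start=1):
        print(f"{i}\n{format_timestamp(seg['start'])} --> {format_timestamp(seg['end'])}\n{seg['text'].strip()}\n", file=file)

# Hypothetical usage mirroring the commented-out call (file name is a placeholder):
# with open("transcription.srt", "w", encoding="utf-8") as srt:
#     write_srt(result_aligned["segments"], file=srt)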
transcription.py CHANGED
@@ -32,8 +32,8 @@ import psutil
 import whisperx
 import gc
 
-def doWhisperX(audio_file, whisper_model="large-v2"):
-    device = "cuda"
+def doWhisperX(audio_file, whisper_model="large-v2", language="es"):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     #audio_file = "audio.mp3"
     batch_size = 16 # reduce if low on GPU mem
     compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
@@ -42,7 +42,7 @@ def doWhisperX(audio_file, whisper_model="large-v2"):
     model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
 
     audio = whisperx.load_audio(audio_file)
-    result = model.transcribe(audio, batch_size=batch_size)
+    result = model.transcribe(audio, language=language, batch_size=batch_size)
     #print(result["segments"]) # before alignment
 
     # delete model if low on GPU resources
@@ -50,7 +50,7 @@ def doWhisperX(audio_file, whisper_model="large-v2"):
 
     # 2. Align whisper output
     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+    result_aligned = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
 
     #print(result["segments"]) # after alignment
 
@@ -64,10 +64,10 @@ def doWhisperX(audio_file, whisper_model="large-v2"):
     diarize_segments = diarize_model(audio)
     # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
 
-    result = whisperx.assign_word_speakers(diarize_segments, result)
+    result_speakers = whisperx.assign_word_speakers(diarize_segments, result_aligned)
     #print(diarize_segments)
     #print(result["segments"]) # segments are now assigned speaker IDs
-    return result, diarize_segments
+    return result_aligned, result_speakers, diarize_segments
 
 embedding_model = PretrainedSpeakerEmbedding(
     "speechbrain/spkrec-ecapa-voxceleb",