katospiegel committed on
Commit 36d2951 · 1 parent: fe5ca43

Adding a fix for exporting the language

Files changed (3)
  1. README.md +1 -0
  2. app.py +2 -1
  3. transcription.py +7 -7
README.md CHANGED
@@ -46,6 +46,7 @@ The user will log in using a password and user specified by me. That user and p
 - [ ] Obtain plain txt with segments.
 - [ ] Introduce POS.
 - [ ] Optional Preprocessing
+- [ ] Transcription box showing the text as it is being written.


 Introduce Tab for analysis including POS. Maybe it would be great to have a visualizer with the timestamps and other features in Streamlit. Perhaps corrections.
app.py CHANGED
@@ -48,7 +48,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
     vocal_path = mp3_to_wav(vocal_path, "vocal")

     #result = fast_transcription(vocal_path, model, "es")
-    result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model, language=language)
+    result_whisper, result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model, language=language)

     #out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]

@@ -79,6 +79,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
     file_path = Path(nombre_archivo)
     writter_args = {"highlight_words": None, "max_line_count": None, "max_line_width": None}
     srt_writer = get_writer("srt", Path("."))
+    result_aligned["language"] = language
     srt_writer(result_aligned, str(file_path.stem), writter_args)

     # with open(
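
The second hunk is the fix named in the commit message: whisperx.align() returns a result that carries only segment data, so the "language" key the subtitle writer reads is missing until it is re-attached. Below is a minimal sketch of the pattern, assuming get_writer comes from whisperx.utils (the import is outside this diff) and wrapping the steps in a hypothetical export_srt helper:

from pathlib import Path

from whisperx.utils import get_writer  # assumed source of get_writer; not shown in this diff


def export_srt(result_aligned: dict, language: str, nombre_archivo: str) -> None:
    # Hypothetical helper mirroring the commit's fix: the aligned result has
    # no "language" key, so re-attach it before the writer dereferences it.
    result_aligned["language"] = language
    writter_args = {"highlight_words": None, "max_line_count": None, "max_line_width": None}
    srt_writer = get_writer("srt", ".")
    srt_writer(result_aligned, Path(nombre_archivo).stem, writter_args)

One design note: assigning result_whisper["language"] instead of the UI-selected value would also cover the "Cualquiera" (any language) case, where the string passed in is not a valid language code; returning result_whisper from doWhisperX, as the next diff does, makes that possible.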
transcription.py CHANGED
@@ -35,7 +35,7 @@ import gc
 def doWhisperX(audio_file, whisper_model="large-v2", language="es"):
     if language == "Cualquiera":
         language = None
-
+
     device = "cuda" if torch.cuda.is_available() else "cpu"
     #audio_file = "audio.mp3"
     batch_size = 16 # reduce if low on GPU mem
@@ -45,17 +45,17 @@ def doWhisperX(audio_file, whisper_model="large-v2", language="es"):
     model = whisperx.load_model(whisper_model, device, compute_type=compute_type)

     audio = whisperx.load_audio(audio_file)
-    result = model.transcribe(audio, language=language, batch_size=batch_size)
-    #print(result["segments"]) # before alignment
+    result_whisper = model.transcribe(audio, language=language, batch_size=batch_size)
+    print(result_whisper["segments"]) # before alignment

     # delete model if low on GPU resources
     # import gc; gc.collect(); torch.cuda.empty_cache(); del model

     # 2. Align whisper output
-    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result_aligned = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+    model_a, metadata = whisperx.load_align_model(language_code=result_whisper["language"], device=device)
+    result_aligned = whisperx.align(result_whisper["segments"], model_a, metadata, audio, device, return_char_alignments=False)

-    #print(result["segments"]) # after alignment
+    print(result_aligned) # after alignment

     # delete model if low on GPU resources
     # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
@@ -70,7 +70,7 @@ def doWhisperX(audio_file, whisper_model="large-v2", language="es"):
     result_speakers = whisperx.assign_word_speakers(diarize_segments, result_aligned)
     #print(diarize_segments)
     #print(result["segments"]) # segments are now assigned speaker IDs
-    return result_aligned, result_speakers, diarize_segments
+    return result_whisper, result_aligned, result_speakers, diarize_segments

 embedding_model = PretrainedSpeakerEmbedding(
     "speechbrain/spkrec-ecapa-voxceleb",