Spaces:
Runtime error
Runtime error
Commit
·
2cce248
1
Parent(s):
ee9f03e
fixed srt subtitles
Browse files- README.md +3 -2
- app.py +13 -13
- transcription.py +6 -6
README.md
CHANGED
@@ -44,6 +44,7 @@ The user will log in using a password and username specified by me. That user and p
|
|
44 |
- [ ] Introduce SRT as output
|
45 |
- [ ] Obtain txt with Diarization.
|
46 |
- [ ] Obtain plain txt with segments.
|
47 |
-
- [ ] Introduce POS
|
48 |
|
49 |
-
|
|
|
|
44 |
- [ ] Introduce SRT as output
|
45 |
- [ ] Obtain txt with Diarization.
|
46 |
- [ ] Obtain plain txt with segments.
|
47 |
+
- [ ] Introduce POS.
|
48 |
|
49 |
+
|
50 |
+
Introduce a tab for analysis including POS. Maybe it would be great to have a visualizer with the timestamps and other features in Streamlit. Perhaps also corrections.
|
app.py
CHANGED
@@ -47,7 +47,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
|
|
47 |
vocal_path = mp3_to_wav(vocal_path, "vocal")
|
48 |
|
49 |
#result = fast_transcription(vocal_path, model, "es")
|
50 |
-
|
51 |
|
52 |
#out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]
|
53 |
|
@@ -58,17 +58,17 @@ def transcribeWhisperX(audiofile, model, language, patiente,
|
|
58 |
##########################################################################
|
59 |
import whisperx
|
60 |
from pathlib import Path
|
61 |
-
device = "cuda"
|
62 |
-
model_a, metadata = whisperx.load_align_model(
|
63 |
-
|
64 |
-
)
|
65 |
-
result_aligned = whisperx.align(
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
)
|
72 |
import datetime
|
73 |
fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
74 |
|
@@ -88,7 +88,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
|
|
88 |
# write_srt(result_aligned["segments"], file=srt)
|
89 |
###########################################################################
|
90 |
|
91 |
-
return audio_path, audio_normalized_path, vocal_path, novocal_path, vocal_path, guardar_dataframe_en_csv(diarize_segments), json.dumps(
|
92 |
|
93 |
|
94 |
transcribeI = gr.Interface(
|
|
|
47 |
vocal_path = mp3_to_wav(vocal_path, "vocal")
|
48 |
|
49 |
#result = fast_transcription(vocal_path, model, "es")
|
50 |
+
result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model)
|
51 |
|
52 |
#out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]
|
53 |
|
|
|
58 |
##########################################################################
|
59 |
import whisperx
|
60 |
from pathlib import Path
|
61 |
+
# device = "cuda"
|
62 |
+
# model_a, metadata = whisperx.load_align_model(
|
63 |
+
# language_code="es", device=device
|
64 |
+
# )
|
65 |
+
# result_aligned = whisperx.align(
|
66 |
+
# result["segments"],
|
67 |
+
# model_a,
|
68 |
+
# metadata,
|
69 |
+
# vocal_path,
|
70 |
+
# device=device,
|
71 |
+
# )
|
72 |
import datetime
|
73 |
fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
74 |
|
|
|
88 |
# write_srt(result_aligned["segments"], file=srt)
|
89 |
###########################################################################
|
90 |
|
91 |
+
return audio_path, audio_normalized_path, vocal_path, novocal_path, vocal_path, guardar_dataframe_en_csv(diarize_segments), json.dumps(result_speakers)
|
92 |
|
93 |
|
94 |
transcribeI = gr.Interface(
|
transcription.py
CHANGED
@@ -32,8 +32,8 @@ import psutil
|
|
32 |
import whisperx
|
33 |
import gc
|
34 |
|
35 |
-
def doWhisperX(audio_file, whisper_model="large-v2"):
|
36 |
-
device = "cuda"
|
37 |
#audio_file = "audio.mp3"
|
38 |
batch_size = 16 # reduce if low on GPU mem
|
39 |
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
|
@@ -42,7 +42,7 @@ def doWhisperX(audio_file, whisper_model="large-v2"):
|
|
42 |
model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
|
43 |
|
44 |
audio = whisperx.load_audio(audio_file)
|
45 |
-
result = model.transcribe(audio, batch_size=batch_size)
|
46 |
#print(result["segments"]) # before alignment
|
47 |
|
48 |
# delete model if low on GPU resources
|
@@ -50,7 +50,7 @@ def doWhisperX(audio_file, whisper_model="large-v2"):
|
|
50 |
|
51 |
# 2. Align whisper output
|
52 |
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
|
53 |
-
|
54 |
|
55 |
#print(result["segments"]) # after alignment
|
56 |
|
@@ -64,10 +64,10 @@ def doWhisperX(audio_file, whisper_model="large-v2"):
|
|
64 |
diarize_segments = diarize_model(audio)
|
65 |
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
|
66 |
|
67 |
-
|
68 |
#print(diarize_segments)
|
69 |
#print(result["segments"]) # segments are now assigned speaker IDs
|
70 |
-
return
|
71 |
|
72 |
embedding_model = PretrainedSpeakerEmbedding(
|
73 |
"speechbrain/spkrec-ecapa-voxceleb",
|
|
|
32 |
import whisperx
|
33 |
import gc
|
34 |
|
35 |
+
def doWhisperX(audio_file, whisper_model="large-v2", language="es"):
|
36 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
37 |
#audio_file = "audio.mp3"
|
38 |
batch_size = 16 # reduce if low on GPU mem
|
39 |
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
|
|
|
42 |
model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
|
43 |
|
44 |
audio = whisperx.load_audio(audio_file)
|
45 |
+
result = model.transcribe(audio, language=language, batch_size=batch_size)
|
46 |
#print(result["segments"]) # before alignment
|
47 |
|
48 |
# delete model if low on GPU resources
|
|
|
50 |
|
51 |
# 2. Align whisper output
|
52 |
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
|
53 |
+
result_aligned = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
|
54 |
|
55 |
#print(result["segments"]) # after alignment
|
56 |
|
|
|
64 |
diarize_segments = diarize_model(audio)
|
65 |
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
|
66 |
|
67 |
+
result_speakers = whisperx.assign_word_speakers(diarize_segments, result_aligned)
|
68 |
#print(diarize_segments)
|
69 |
#print(result["segments"]) # segments are now assigned speaker IDs
|
70 |
+
return result_aligned, result_speakers, diarize_segments
|
71 |
|
72 |
embedding_model = PretrainedSpeakerEmbedding(
|
73 |
"speechbrain/spkrec-ecapa-voxceleb",
|