Commit 36d2951
Parent(s): fe5ca43

adding fix for exporting language

- README.md +1 -0
- app.py +2 -1
- transcription.py +7 -7
README.md
CHANGED
@@ -46,6 +46,7 @@ The user will log in using a password and user specified by me. That user and p
 - [ ] Obtain plain txt with segments.
 - [ ] Introduce POS.
 - [ ] Optional Preprocessing
+- [ ] Transcription box that updates as the text is being written.
 
 
 Introduce a Tab for analysis including POS. Maybe it would be great to have a visualizer with the timestamps and other features in Streamlit. Perhaps corrections.
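The "plain txt with segments" and POS to-dos above compose naturally once doWhisperX returns aligned segments: each segment's text can be exported and tagged independently. A minimal sketch, assuming spaCy with its Spanish es_core_news_sm model (an assumption; neither ships with this Space):

# Hypothetical sketch: plain-txt export and POS tags from WhisperX segments.
# spaCy and es_core_news_sm are assumed installed:
#   pip install spacy && python -m spacy download es_core_news_sm
import spacy

nlp = spacy.load("es_core_news_sm")

def segments_to_txt(segments):
    # One line per segment: "start<TAB>text", the plain-txt export.
    return "\n".join(f'{s["start"]:.2f}\t{s["text"].strip()}' for s in segments)

def pos_tag_segments(segments):
    # Attach (token, POS) pairs to each segment for the analysis tab.
    return [
        {"start": s["start"], "text": s["text"],
         "pos": [(tok.text, tok.pos_) for tok in nlp(s["text"])]}
        for s in segments
    ]

Something like pos_tag_segments(result_aligned["segments"]) could then feed the Streamlit visualizer with timestamps mentioned above.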
app.py
CHANGED
@@ -48,7 +48,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
     vocal_path = mp3_to_wav(vocal_path, "vocal")
 
     #result = fast_transcription(vocal_path, model, "es")
-    result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model, language=language)
+    result_whisper, result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model, language=language)
 
     #out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]
 
@@ -79,6 +79,7 @@ def transcribeWhisperX(audiofile, model, language, patiente,
     file_path = Path(nombre_archivo)
     writter_args = {"highlight_words": None, "max_line_count": None, "max_line_width": None}
     srt_writer = get_writer("srt", Path("."))
+    result_aligned["language"] = language
    srt_writer(result_aligned, str(file_path.stem), writter_args)
 
     # with open(
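The one-line addition in the second hunk is the commit's point: whisperx.align returns only segment data, and the srt writer obtained from get_writer evidently expects a "language" key on the result it receives, so the user-selected language is stamped back on before writing. A hedged sketch of that export step in isolation (the whisperx.utils import and the default file name are assumptions; app.py's own imports are not shown in this diff):

# Hypothetical sketch of the fixed SRT export path in app.py.
# get_writer is assumed to come from whisperx.utils; "salida.srt"
# is a placeholder name, not a value defined in this diff.
from pathlib import Path
from whisperx.utils import get_writer

def export_srt(result_aligned, language, nombre_archivo="salida.srt"):
    file_path = Path(nombre_archivo)
    writter_args = {"highlight_words": None, "max_line_count": None,
                    "max_line_width": None}
    srt_writer = get_writer("srt", Path("."))
    # The fix: alignment output carries no "language" key of its own,
    # so restore it before the writer runs.
    result_aligned["language"] = language
    srt_writer(result_aligned, str(file_path.stem), writter_args)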
transcription.py
CHANGED
@@ -35,7 +35,7 @@ import gc
 def doWhisperX(audio_file, whisper_model="large-v2", language="es"):
     if language == "Cualquiera":
         language = None
-
+
     device = "cuda" if torch.cuda.is_available() else "cpu"
     #audio_file = "audio.mp3"
     batch_size = 16 # reduce if low on GPU mem
@@ -45,17 +45,17 @@ def doWhisperX(audio_file, whisper_model="large-v2", language="es"):
     model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
 
     audio = whisperx.load_audio(audio_file)
-
-
+    result_whisper = model.transcribe(audio, language=language, batch_size=batch_size)
+    print(result_whisper["segments"]) # before alignment
 
     # delete model if low on GPU resources
     # import gc; gc.collect(); torch.cuda.empty_cache(); del model
 
     # 2. Align whisper output
-    model_a, metadata = whisperx.load_align_model(language_code=
-    result_aligned = whisperx.align(
+    model_a, metadata = whisperx.load_align_model(language_code=result_whisper["language"], device=device)
+    result_aligned = whisperx.align(result_whisper["segments"], model_a, metadata, audio, device, return_char_alignments=False)
 
-
+    print(result_aligned) # after alignment
 
     # delete model if low on GPU resources
     # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
@@ -70,7 +70,7 @@ def doWhisperX(audio_file, whisper_model="large-v2", language="es"):
     result_speakers = whisperx.assign_word_speakers(diarize_segments, result_aligned)
     #print(diarize_segments)
     #print(result["segments"]) # segments are now assigned speaker IDs
-    return result_aligned, result_speakers, diarize_segments
+    return result_whisper, result_aligned, result_speakers, diarize_segments
 
 embedding_model = PretrainedSpeakerEmbedding(
     "speechbrain/spkrec-ecapa-voxceleb",
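After this commit doWhisperX also hands back the raw transcription, so callers unpack four values instead of three; app.py above was updated accordingly. A minimal usage sketch (the wav path is a placeholder, and the cleanup mirrors the commented gc/torch hints inside the function):

# Hypothetical sketch: consuming the new 4-tuple from doWhisperX.
# "entrevista.wav" is a placeholder path, not a file in this repo.
import gc
import torch
from transcription import doWhisperX

result_whisper, result_aligned, result_speakers, diarize_segments = doWhisperX(
    "entrevista.wav", whisper_model="large-v2", language="es"
)

# Raw (pre-alignment) segments vs word-aligned segments.
for seg in result_whisper["segments"]:
    print(f'{seg["start"]:.2f}s {seg["text"]}')
print(len(result_aligned["segments"]), "aligned segments")

# Free GPU memory once results are extracted, per the in-code hints.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()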