Commit f036671 · Parent(s): f3782dc

Whisper X implementation

Files changed:
- app.py +53 -5
- requirements.txt +1 -0
- transcription.py +41 -0
app.py
CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 
-from transcription import fast_transcription, speech_to_text
+from transcription import fast_transcription, speech_to_text, doWhisperX
 from audio import normalizeAudio, separateVoiceInstrumental, mp3_to_wav, stereo_to_mono, cutaudio, compose_audio
 from audio import overlay_audios, compose_audio, total_duration, append_wav_files
 from helpers import guardar_en_archivo
@@ -29,7 +29,29 @@ def transcribe(audiofile, model):
     #Archivo
     nombre_archivo = guardar_en_archivo(out)
 
-    return audio_path, audio_normalized_path, vocal_path, novocal_path, str(result), nombre_archivo
+    return audio_path, audio_normalized_path, vocal_path, novocal_path, out, str(result), nombre_archivo
+
+def transcribeWhisperX(audiofile, model):
+
+    audio_path = audiofile[0].name
+
+    audio_normalized_path = normalizeAudio(audio_path, ".wav")
+
+    novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
+
+    novocal_path = mp3_to_wav(novocal_path, "novocal")
+    vocal_path = mp3_to_wav(vocal_path, "vocal")
+
+    #result = fast_transcription(vocal_path, model, "es")
+    result, diarize_segments = doWhisperX(vocal_path, whisper_model="large-v2")
+
+    #out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]
+
+    #transcript = "\n".join(out)
+    #Archivo
+    #nombre_archivo = guardar_en_archivo(out)
+
+    return audio_path, audio_normalized_path, vocal_path, novocal_path, str(result), str(diarize_segments)
 
 
 transcribeI = gr.Interface(
@@ -43,12 +65,38 @@ transcribeI = gr.Interface(
              gr.Audio(type="filepath", label="vocal"),
              gr.Audio(type="filepath", label="no_vocal"),
              gr.TextArea(label="Transcription"),
+             gr.JSON(label="JSON Output"),
              gr.File(label="Archivo generado")
              ],
    theme="huggingface",
-   title="Transcripción",
+   title="Transcripción con Whisper",
+   description=(
+       "Esta página realiza una transcripción de audio utilizando Whisper. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y ass.\n"
+       "Paste a link to a youtube video\n"
+   ),
+   allow_flagging="never",
+   #examples=[[None, "COSER-4004-01-00_5m.wav", "large-v2"]]
+
+)
+
+transcribeII = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.File(label="Upload Files", file_count="multiple"),
+        gr.Radio(["base", "small", "medium", "large-v2"], label="Models", value="large-v2"),
+    ],
+    outputs=[gr.Audio(type="filepath", label="original"),
+             gr.Audio(type="filepath", label="normalized"),
+             gr.Audio(type="filepath", label="vocal"),
+             gr.Audio(type="filepath", label="no_vocal"),
+             gr.JSON(label="JSON Output"),
+             gr.JSON(label="JSON Output"),
+             #gr.File(label="Archivo generado")
+             ],
+    theme="huggingface",
+    title="Transcripción con WshiperX",
    description=(
-       "
+       "Esta página realiza una transcripción de audio utilizando Whisper. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y ass.\n"
        "Paste a link to a youtube video\n"
    ),
    allow_flagging="never",
@@ -59,7 +107,7 @@ transcribeI = gr.Interface(
 demo = gr.Blocks()
 with demo:
     gr.Markdown("# Dubbing")
-    gr.TabbedInterface([transcribeI], ["
+    gr.TabbedInterface([transcribeI, transcribeII], ["Transcripción con Whisper", "Transcripción y diarización con WhisperX"])
 
     #demo.queue(concurrency_count=1).launch(enable_queue=True, auth=(os.environ['USER'], os.environ['PASSWORD']))
     demo.launch(enable_queue=True)
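The two gr.JSON outputs added to transcribeII receive str(result) and str(diarize_segments), so the JSON components end up rendering a single Python-repr string. A minimal sketch of how structured data could be fed to them instead; segments_to_rows is a hypothetical helper (not in this commit), and it assumes the usual WhisperX output shape, i.e. result["segments"] is a list of dicts with "start", "end", "text" and, after diarization, a "speaker" key:

# Hypothetical helper, not part of the commit: convert WhisperX segments into
# JSON-serializable rows for a gr.JSON output.
def segments_to_rows(result):
    rows = []
    for seg in result.get("segments", []):
        rows.append({
            "start": round(float(seg["start"]), 2),
            "end": round(float(seg["end"]), 2),
            "speaker": seg.get("speaker", "UNKNOWN"),  # absent when diarization assigned no speaker
            "text": seg["text"].strip(),
        })
    return rows

transcribeWhisperX could then return segments_to_rows(result) in place of str(result), giving one JSON entry per segment.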
requirements.txt
CHANGED
@@ -3,6 +3,7 @@ torch
 yt-dlp
 openai
 pydub
+whisperx
 faster-whisper
 scikit-learn
 pandas
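whisperx is the only new dependency. A quick standalone check (hypothetical, not in the repo) that the package installs and can load a model; note that the new doWhisperX function in transcription.py below hard-codes device = "cuda", so the CPU fallback here is an assumption for local testing rather than the Space's behaviour:

# Sanity-check sketch for the new whisperx dependency (not part of the commit).
import torch
import whisperx

device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 needs a GPU; int8 keeps the check usable on CPU-only machines.
compute_type = "float16" if device == "cuda" else "int8"

model = whisperx.load_model("base", device, compute_type=compute_type)
print("whisperx model loaded on", device)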
transcription.py
CHANGED
@@ -29,10 +29,51 @@ import contextlib
 from transformers import pipeline
 import psutil
 
+import whisperx
+import gc
+
+def doWhisperX(audio_file, whisper_model="large-v2"):
+    device = "cuda"
+    #audio_file = "audio.mp3"
+    batch_size = 16 # reduce if low on GPU mem
+    compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
+
+    # 1. Transcribe with original whisper (batched)
+    model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
+
+    audio = whisperx.load_audio(audio_file)
+    result = model.transcribe(audio, batch_size=batch_size)
+    #print(result["segments"]) # before alignment
+
+    # delete model if low on GPU resources
+    # import gc; gc.collect(); torch.cuda.empty_cache(); del model
+
+    # 2. Align whisper output
+    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+
+    #print(result["segments"]) # after alignment
+
+    # delete model if low on GPU resources
+    # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
+
+    # 3. Assign speaker labels
+    diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.environ['HF_TOKEN'], device=device)
+
+    # add min/max number of speakers if known
+    diarize_segments = diarize_model(audio)
+    # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
+
+    result = whisperx.assign_word_speakers(diarize_segments, result)
+    #print(diarize_segments)
+    #print(result["segments"]) # segments are now assigned speaker IDs
+    return result, diarize_segments
+
 embedding_model = PretrainedSpeakerEmbedding(
     "speechbrain/spkrec-ecapa-voxceleb",
     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
+
 def fast_transcription(audio_file, whisper_model, language):
     """
     # Transcribe youtube link using OpenAI Whisper
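The interface description promises conversion of the result to .csv, .srt and .ass, but this commit stops at returning the raw result and diarize_segments from doWhisperX. A possible follow-up sketch, assuming the WhisperX segment format (start/end in seconds, text, and the speaker key added by assign_word_speakers); segments_to_srt is a hypothetical helper, not part of the commit:

# Hypothetical export helper: write aligned, speaker-labelled segments to SRT.
def segments_to_srt(result, srt_path):
    def fmt(t):
        # SRT timestamps are HH:MM:SS,mmm
        ms = int(round(t * 1000))
        h, ms = divmod(ms, 3600000)
        m, ms = divmod(ms, 60000)
        s, ms = divmod(ms, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    lines = []
    for i, seg in enumerate(result["segments"], start=1):
        speaker = seg.get("speaker", "SPEAKER")
        lines.append(str(i))
        lines.append(f"{fmt(seg['start'])} --> {fmt(seg['end'])}")
        lines.append(f"{speaker}: {seg['text'].strip()}")
        lines.append("")  # blank line terminates each subtitle block
    with open(srt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return srt_path

Usage would follow the existing pipeline: result, diarize_segments = doWhisperX(vocal_path), then segments_to_srt(result, "transcript.srt").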