Commit f036671 · Parent(s): f3782dc

Whisper X implementation

Files changed:
- app.py +53 -5
- requirements.txt +1 -0
- transcription.py +41 -0
app.py
CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 
-from transcription import fast_transcription, speech_to_text
+from transcription import fast_transcription, speech_to_text, doWhisperX
 from audio import normalizeAudio, separateVoiceInstrumental, mp3_to_wav, stereo_to_mono, cutaudio, compose_audio
 from audio import overlay_audios, compose_audio, total_duration, append_wav_files
 from helpers import guardar_en_archivo
@@ -29,7 +29,29 @@ def transcribe(audiofile, model):
     #Archivo
     nombre_archivo = guardar_en_archivo(out)
 
-    return audio_path, audio_normalized_path, vocal_path, novocal_path, str(result), nombre_archivo
+    return audio_path, audio_normalized_path, vocal_path, novocal_path, out, str(result), nombre_archivo
+
+def transcribeWhisperX(audiofile, model):
+
+    audio_path = audiofile[0].name
+
+    audio_normalized_path = normalizeAudio(audio_path, ".wav")
+
+    novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
+
+    novocal_path = mp3_to_wav(novocal_path, "novocal")
+    vocal_path = mp3_to_wav(vocal_path, "vocal")
+
+    #result = fast_transcription(vocal_path, model, "es")
+    result, diarize_segments = doWhisperX(vocal_path, whisper_model="large-v2")
+
+    #out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]
+
+    #transcript = "\n".join(out)
+    #Archivo
+    #nombre_archivo = guardar_en_archivo(out)
+
+    return audio_path, audio_normalized_path, vocal_path, novocal_path, str(result), str(diarize_segments)
 
 
 transcribeI = gr.Interface(
@@ -43,12 +65,38 @@ transcribeI = gr.Interface(
              gr.Audio(type="filepath", label="vocal"),
              gr.Audio(type="filepath", label="no_vocal"),
              gr.TextArea(label="Transcription"),
+             gr.JSON(label="JSON Output"),
              gr.File(label="Archivo generado")
              ],
    theme="huggingface",
-   title="Transcripción",
+   title="Transcripción con Whisper",
+   description=(
+       "Esta página realiza una transcripción de audio utilizando Whisper. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y ass.\n"
+       "Paste a link to a youtube video\n"
+   ),
+   allow_flagging="never",
+   #examples=[[None, "COSER-4004-01-00_5m.wav", "large-v2"]]
+
+)
+
+transcribeII = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.File(label="Upload Files", file_count="multiple"),
+        gr.Radio(["base", "small", "medium", "large-v2"], label="Models", value="large-v2"),
+    ],
+    outputs=[gr.Audio(type="filepath", label="original"),
+             gr.Audio(type="filepath", label="normalized"),
+             gr.Audio(type="filepath", label="vocal"),
+             gr.Audio(type="filepath", label="no_vocal"),
+             gr.JSON(label="JSON Output"),
+             gr.JSON(label="JSON Output"),
+             #gr.File(label="Archivo generado")
+             ],
+    theme="huggingface",
+    title="Transcripción con WshiperX",
    description=(
-       "
+       "Esta página realiza una transcripción de audio utilizando Whisper. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y ass.\n"
        "Paste a link to a youtube video\n"
    ),
    allow_flagging="never",
@@ -59,7 +107,7 @@ transcribeI = gr.Interface(
 demo = gr.Blocks()
 with demo:
     gr.Markdown("# Dubbing")
-    gr.TabbedInterface([transcribeI], ["
+    gr.TabbedInterface([transcribeI, transcribeII], ["Transcripción con Whisper", "Transcripción y diarización con WhisperX"])
 
     #demo.queue(concurrency_count=1).launch(enable_queue=True, auth=(os.environ['USER'], os.environ['PASSWORD']))
     demo.launch(enable_queue=True)
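The two gr.JSON outputs added to transcribeII receive str(result) and str(diarize_segments), so the JSON components end up rendering a single Python-repr string. A minimal sketch of how structured data could be fed to them instead; segments_to_rows is a hypothetical helper (not in this commit), and it assumes the usual WhisperX output shape, i.e. result["segments"] is a list of dicts with "start", "end", "text" and, after diarization, a "speaker" key:

# Hypothetical helper, not part of the commit: convert WhisperX segments into
# JSON-serializable rows for a gr.JSON output.
def segments_to_rows(result):
    rows = []
    for seg in result.get("segments", []):
        rows.append({
            "start": round(float(seg["start"]), 2),
            "end": round(float(seg["end"]), 2),
            "speaker": seg.get("speaker", "UNKNOWN"),  # absent when diarization assigned no speaker
            "text": seg["text"].strip(),
        })
    return rows

transcribeWhisperX could then return segments_to_rows(result) in place of str(result), giving one JSON entry per segment.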
requirements.txt
CHANGED
@@ -3,6 +3,7 @@ torch
 yt-dlp
 openai
 pydub
+whisperx
 faster-whisper
 scikit-learn
 pandas
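whisperx is the only new dependency. A quick standalone check (hypothetical, not in the repo) that the package installs and can load a model; note that the new doWhisperX function in transcription.py below hard-codes device = "cuda", so the CPU fallback here is an assumption for local testing rather than the Space's behaviour:

# Sanity-check sketch for the new whisperx dependency (not part of the commit).
import torch
import whisperx

device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 needs a GPU; int8 keeps the check usable on CPU-only machines.
compute_type = "float16" if device == "cuda" else "int8"

model = whisperx.load_model("base", device, compute_type=compute_type)
print("whisperx model loaded on", device)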
transcription.py
CHANGED
@@ -29,10 +29,51 @@ import contextlib
 from transformers import pipeline
 import psutil
 
+import whisperx
+import gc
+
+def doWhisperX(audio_file, whisper_model="large-v2"):
+    device = "cuda"
+    #audio_file = "audio.mp3"
+    batch_size = 16 # reduce if low on GPU mem
+    compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
+
+    # 1. Transcribe with original whisper (batched)
+    model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
+
+    audio = whisperx.load_audio(audio_file)
+    result = model.transcribe(audio, batch_size=batch_size)
+    #print(result["segments"]) # before alignment
+
+    # delete model if low on GPU resources
+    # import gc; gc.collect(); torch.cuda.empty_cache(); del model
+
+    # 2. Align whisper output
+    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+
+    #print(result["segments"]) # after alignment
+
+    # delete model if low on GPU resources
+    # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
+
+    # 3. Assign speaker labels
+    diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.environ['HF_TOKEN'], device=device)
+
+    # add min/max number of speakers if known
+    diarize_segments = diarize_model(audio)
+    # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
+
+    result = whisperx.assign_word_speakers(diarize_segments, result)
+    #print(diarize_segments)
+    #print(result["segments"]) # segments are now assigned speaker IDs
+    return result, diarize_segments
+
 embedding_model = PretrainedSpeakerEmbedding(
     "speechbrain/spkrec-ecapa-voxceleb",
     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
+
 def fast_transcription(audio_file, whisper_model, language):
     """
     # Transcribe youtube link using OpenAI Whisper
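The interface description promises conversion of the result to .csv, .srt and .ass, but this commit stops at returning the raw result and diarize_segments from doWhisperX. A possible follow-up sketch, assuming the WhisperX segment format (start/end in seconds, text, and the speaker key added by assign_word_speakers); segments_to_srt is a hypothetical helper, not part of the commit:

# Hypothetical export helper: write aligned, speaker-labelled segments to SRT.
def segments_to_srt(result, srt_path):
    def fmt(t):
        # SRT timestamps are HH:MM:SS,mmm
        ms = int(round(t * 1000))
        h, ms = divmod(ms, 3600000)
        m, ms = divmod(ms, 60000)
        s, ms = divmod(ms, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    lines = []
    for i, seg in enumerate(result["segments"], start=1):
        speaker = seg.get("speaker", "SPEAKER")
        lines.append(str(i))
        lines.append(f"{fmt(seg['start'])} --> {fmt(seg['end'])}")
        lines.append(f"{speaker}: {seg['text'].strip()}")
        lines.append("")  # blank line terminates each subtitle block
    with open(srt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return srt_path

Usage would follow the existing pipeline: result, diarize_segments = doWhisperX(vocal_path), then segments_to_srt(result, "transcript.srt").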