katospiegel committed
Commit f036671
1 Parent(s): f3782dc

Whisper X implementation

Files changed (3)
  1. app.py +53 -5
  2. requirements.txt +1 -0
  3. transcription.py +41 -0
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 
-from transcription import fast_transcription, speech_to_text
+from transcription import fast_transcription, speech_to_text, doWhisperX
 from audio import normalizeAudio, separateVoiceInstrumental, mp3_to_wav, stereo_to_mono, cutaudio, compose_audio
 from audio import overlay_audios, compose_audio, total_duration, append_wav_files
 from helpers import guardar_en_archivo
@@ -29,7 +29,29 @@ def transcribe(audiofile, model):
     #Archivo
     nombre_archivo = guardar_en_archivo(out)
 
-    return audio_path, audio_normalized_path, vocal_path, novocal_path, str(result), nombre_archivo
+    return audio_path, audio_normalized_path, vocal_path, novocal_path, out, str(result), nombre_archivo
+
+def transcribeWhisperX(audiofile, model):
+
+    audio_path = audiofile[0].name
+
+    audio_normalized_path = normalizeAudio(audio_path, ".wav")
+
+    novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
+
+    novocal_path = mp3_to_wav(novocal_path, "novocal")
+    vocal_path = mp3_to_wav(vocal_path, "vocal")
+
+    #result = fast_transcription(vocal_path, model, "es")
+    result, diarize_segments = doWhisperX(vocal_path, whisper_model="large-v2")
+
+    #out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]
+
+    #transcript = "\n".join(out)
+    #Archivo
+    #nombre_archivo = guardar_en_archivo(out)
+
+    return audio_path, audio_normalized_path, vocal_path, novocal_path, str(result), str(diarize_segments)
 
 
 transcribeI = gr.Interface(
@@ -43,12 +65,38 @@ transcribeI = gr.Interface(
         gr.Audio(type="filepath", label="vocal"),
         gr.Audio(type="filepath", label="no_vocal"),
         gr.TextArea(label="Transcription"),
+        gr.JSON(label="JSON Output"),
         gr.File(label="Archivo generado")
     ],
     theme="huggingface",
-    title="Transcripción",
+    title="Transcripción con Whisper",
+    description=(
+        "Esta página realiza una transcripción de audio utilizando Whisper. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y .ass.\n"
+        "Paste a link to a youtube video\n"
+    ),
+    allow_flagging="never",
+    #examples=[[None, "COSER-4004-01-00_5m.wav", "large-v2"]]
+
+)
+
+transcribeII = gr.Interface(
+    fn=transcribeWhisperX,
+    inputs=[
+        gr.File(label="Upload Files", file_count="multiple"),
+        gr.Radio(["base", "small", "medium", "large-v2"], label="Models", value="large-v2"),
+    ],
+    outputs=[gr.Audio(type="filepath", label="original"),
+             gr.Audio(type="filepath", label="normalized"),
+             gr.Audio(type="filepath", label="vocal"),
+             gr.Audio(type="filepath", label="no_vocal"),
+             gr.JSON(label="JSON Output"),
+             gr.JSON(label="Diarization Output"),
+             #gr.File(label="Archivo generado")
+             ],
+    theme="huggingface",
+    title="Transcripción con WhisperX",
     description=(
-        "Sound extraction, processing, and dialogue transcription.\n"
+        "Esta página realiza una transcripción de audio utilizando Whisper. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y .ass.\n"
         "Paste a link to a youtube video\n"
     ),
     allow_flagging="never",
@@ -59,7 +107,7 @@ transcribeI = gr.Interface(
 demo = gr.Blocks()
 with demo:
     gr.Markdown("# Dubbing")
-    gr.TabbedInterface([transcribeI], ["transcribeI"])
+    gr.TabbedInterface([transcribeI, transcribeII], ["Transcripción con Whisper", "Transcripción y diarización con WhisperX"])
 
 #demo.queue(concurrency_count=1).launch(enable_queue=True, auth=(os.environ['USER'], os.environ['PASSWORD']))
 demo.launch(enable_queue=True)
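The wiring above relies on Gradio's positional convention: each value returned by the interface callback fills the output component at the same index, so transcribeWhisperX's six return values map one-to-one onto transcribeII's six output components. A minimal self-contained sketch of that pattern, with a stub callback standing in for the heavy audio pipeline (fake_whisperx and its canned data are illustrative, not part of this commit):

    import gradio as gr

    def fake_whisperx(files, model):
        # Stub standing in for transcribeWhisperX: one return value per
        # output component, in the same order as the outputs list below.
        result = {"segments": [{"start": 0.0, "end": 1.2,
                                "text": "hola", "speaker": "SPEAKER_00"}]}
        diarize = {"SPEAKER_00": [[0.0, 1.2]]}
        return result, diarize

    tab = gr.Interface(
        fn=fake_whisperx,
        inputs=[
            gr.File(label="Upload Files", file_count="multiple"),
            gr.Radio(["base", "small", "medium", "large-v2"], label="Models", value="large-v2"),
        ],
        outputs=[gr.JSON(label="JSON Output"), gr.JSON(label="Diarization Output")],
    )

    with gr.Blocks() as demo:
        gr.TabbedInterface([tab], ["WhisperX stub"])

    demo.launch()

Returning dicts to gr.JSON, as the stub does, is the safer choice: the component may try to parse a plain string as JSON, and a Python repr like str(result) is not valid JSON.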
requirements.txt CHANGED
@@ -3,6 +3,7 @@ torch
 yt-dlp
 openai
 pydub
+whisperx
 faster-whisper
 scikit-learn
 pandas
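One note on this dependency: whisperx pulls in faster-whisper as its inference backend and pyannote.audio for diarization, and the gated pyannote models need an authenticated Hugging Face token at runtime (transcription.py reads HF_TOKEN). A small startup guard is one way to surface a missing token early; this is a sketch, not part of the commit:

    import os
    import sys

    # doWhisperX reads os.environ['HF_TOKEN'] for the diarization pipeline;
    # failing fast at startup beats a mid-request KeyError.
    if "HF_TOKEN" not in os.environ:
        sys.exit("HF_TOKEN is not set; the pyannote diarization models require it.")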
transcription.py CHANGED
@@ -29,10 +29,51 @@ import contextlib
 from transformers import pipeline
 import psutil
 
+import whisperx
+import gc
+
+def doWhisperX(audio_file, whisper_model="large-v2"):
+    device = "cuda"
+    #audio_file = "audio.mp3"
+    batch_size = 16  # reduce if low on GPU mem
+    compute_type = "float16"  # change to "int8" if low on GPU mem (may reduce accuracy)
+
+    # 1. Transcribe with original whisper (batched)
+    model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
+
+    audio = whisperx.load_audio(audio_file)
+    result = model.transcribe(audio, batch_size=batch_size)
+    #print(result["segments"])  # before alignment
+
+    # delete model if low on GPU resources
+    # import gc; gc.collect(); torch.cuda.empty_cache(); del model
+
+    # 2. Align whisper output
+    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+
+    #print(result["segments"])  # after alignment
+
+    # delete model if low on GPU resources
+    # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
+
+    # 3. Assign speaker labels
+    diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.environ['HF_TOKEN'], device=device)
+
+    # add min/max number of speakers if known
+    diarize_segments = diarize_model(audio)
+    # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
+
+    result = whisperx.assign_word_speakers(diarize_segments, result)
+    #print(diarize_segments)
+    #print(result["segments"])  # segments are now assigned speaker IDs
+    return result, diarize_segments
+
 embedding_model = PretrainedSpeakerEmbedding(
     "speechbrain/spkrec-ecapa-voxceleb",
     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
+
 def fast_transcription(audio_file, whisper_model, language):
     """
     # Transcribe youtube link using OpenAI Whisper
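doWhisperX returns the aligned, speaker-labelled WhisperX result plus the raw diarization segments. A minimal sketch of how a caller might flatten that into per-speaker lines (the start/end/text/speaker keys follow the WhisperX segment schema; the file name and formatting are illustrative, not part of the commit):

    # Assumes a CUDA machine and HF_TOKEN in the environment, as doWhisperX does.
    result, diarize_segments = doWhisperX("vocal.wav", whisper_model="large-v2")

    lines = []
    for seg in result["segments"]:
        # Segments that diarization could not attribute may lack a speaker key.
        speaker = seg.get("speaker", "UNKNOWN")
        lines.append(f'{seg["start"]:.2f}-{seg["end"]:.2f} {speaker}: {seg["text"].strip()}')

    print("\n".join(lines))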