import datetime
import json
from pathlib import Path

import gradio as gr
from whisperx.utils import get_writer

from audio import normalizeAudio, separateVoiceInstrumental, mp3_to_wav
from helpers import guardar_en_archivo, guardar_dataframe_en_csv
from transcription import fast_transcription, doWhisperX
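
# This Space exposes two tabs:
#   1. transcribe          - plain Whisper transcription of the vocal track
#   2. transcribeWhisperX  - WhisperX transcription with alignment and speaker diarization
# Both share the same preprocessing: audio normalization, vocal/instrumental
# separation, and conversion of both stems to WAV.
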
def transcribe(audiofile, model):
    """Plain Whisper pipeline: preprocess the upload and transcribe its vocal track."""
    audio_path = audiofile[0].name
    audio_normalized_path = normalizeAudio(audio_path, ".wav")
    novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
    novocal_path = mp3_to_wav(novocal_path, "novocal")
    vocal_path = mp3_to_wav(vocal_path, "vocal")

    result = fast_transcription(vocal_path, model, "es")
    out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]

    # Save the line-per-segment transcript to a file and return its name
    nombre_archivo = guardar_en_archivo(out)

    return audio_path, audio_normalized_path, vocal_path, novocal_path, nombre_archivo, out, json.dumps(result)
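
# fast_transcription is assumed to return a Whisper-style result dict, e.g.
#   {"segments": [{"start": 0.0, "end": 3.2, "text": " ..."}, ...], "language": "es"},
# which is why transcribe() indexes s["start"] and s["text"] on each segment.
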
def transcribeWhisperX(audiofile, model, language, patience,
                       initial_prompt, condition_on_previous_text, temperature,
                       compression, logprob, no_speech_threshold):
    """WhisperX pipeline: preprocess the upload, then transcribe, align and diarize it."""
    # NOTE: language, patience, initial_prompt, condition_on_previous_text,
    # temperature, compression, logprob and no_speech_threshold are collected
    # from the UI but not yet forwarded to doWhisperX.
    audio_path = audiofile[0].name
    audio_normalized_path = normalizeAudio(audio_path, ".wav")
    novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
    novocal_path = mp3_to_wav(novocal_path, "novocal")
    vocal_path = mp3_to_wav(vocal_path, "vocal")

    result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model)

    # Name the SRT output with a timestamp, e.g. transcription_2024-01-31_12-00-00.srt
    fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    nombre_archivo = f"transcription_{fecha_actual}.srt"
    file_path = Path(nombre_archivo)

    # whisperx's writer appends the .srt extension itself, so pass the bare stem
    writer_args = {"highlight_words": None, "max_line_count": None, "max_line_width": None}
    srt_writer = get_writer("srt", Path("."))
    srt_writer(result_aligned, str(file_path.stem), writer_args)

    # One return value per Interface output: four audio paths, the SRT file,
    # the diarization CSV, and the speaker-annotated result as JSON
    return (audio_path, audio_normalized_path, vocal_path, novocal_path,
            str(file_path), guardar_dataframe_en_csv(diarize_segments),
            json.dumps(result_speakers))
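
# Sketch (an assumption, not wired into doWhisperX): the decoding parameters the
# WhisperX tab collects could be packed into an options dict and forwarded to the
# backend, e.g. via whisperx.load_model(..., asr_options=...) when whisperx runs on
# the faster-whisper backend. Key names follow faster-whisper's transcribe()
# signature; adjust them to whatever doWhisperX actually accepts.
def build_asr_options(patience, initial_prompt, condition_on_previous_text,
                      temperature, compression, logprob, no_speech_threshold):
    return {
        "patience": patience,
        "initial_prompt": initial_prompt or None,  # empty textbox -> no prompt
        "condition_on_previous_text": bool(condition_on_previous_text),
        "temperatures": [temperature],  # faster-whisper expects a sequence
        "compression_ratio_threshold": compression,
        "log_prob_threshold": logprob,
        "no_speech_threshold": no_speech_threshold,
    }
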
transcribeI = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.File(label="Upload Files", file_count="multiple"),
        gr.Radio(["base", "small", "medium", "large-v2"], label="Model", value="large-v2"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="original"),
        gr.Audio(type="filepath", label="normalized"),
        gr.Audio(type="filepath", label="vocal"),
        gr.Audio(type="filepath", label="no_vocal"),
        gr.File(label="Generated file"),
        gr.TextArea(label="Transcription"),
        gr.JSON(label="JSON Output"),
    ],
    theme="huggingface",
    title="Transcription with Whisper",
    description=(
        "This page transcribes audio with Whisper and adds several improvements and "
        "utilities: a) audio preprocessing and ambient-noise cleanup, b) conversion of "
        "the audio files to a Whisper-compatible format, c) word-level timestamps, "
        "d) a confidence score for the transcription, e) export of the result to "
        ".csv, .srt and .ass.\n"
    ),
    allow_flagging="never",
    # One example value per input: file upload, then model choice
    examples=[["Espana 04 - Video 01 - extracto 2 min.wav", "large-v2"]],
)
transcribeII = gr.Interface(
    fn=transcribeWhisperX,
    inputs=[
        gr.File(label="Upload Files", file_count="multiple"),
        gr.Radio(["base", "small", "medium", "large-v2"], label="Model", value="large-v2"),
        gr.Dropdown(["Any", "es", "en", "fr", "pt"], label="Language", value="Any"),
        gr.Slider(minimum=0, maximum=1, label="Patience (Whisper parameter)", value=0.5,
                  info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424; the default (1.0) is equivalent to conventional beam search"),
        gr.Textbox(label="Initial Prompt (Whisper parameter)", value=""),
        gr.Textbox(label="Condition on previous text (Whisper parameter)", value=""),
        gr.Slider(minimum=0, maximum=1, label="Temperature (Whisper parameter)", value=0.5,
                  info="Temperature to use for sampling"),
        gr.Slider(minimum=0, maximum=1, label="Compression Ratio Threshold (Whisper parameter)", value=0.5),
        gr.Slider(minimum=0, maximum=1, label="Logprob Threshold (Whisper parameter)", value=0.5),
        gr.Slider(minimum=0, maximum=1, label="No Speech Threshold (Whisper parameter)", value=0.5),
    ],
    outputs=[
        gr.Audio(type="filepath", label="original"),
        gr.Audio(type="filepath", label="normalized"),
        gr.Audio(type="filepath", label="vocal"),
        gr.Audio(type="filepath", label="no_vocal"),
        gr.File(label="Generated SRT file"),
        gr.File(label="Generated diarization CSV"),
        gr.JSON(label="JSON Output"),
    ],
    theme="huggingface",
    title="Transcription with WhisperX",
    description=(
        "This page transcribes audio with WhisperX and adds several improvements and "
        "utilities: a) audio preprocessing and ambient-noise cleanup, b) conversion of "
        "the audio files to a Whisper-compatible format, c) word-level timestamps, "
        "d) a confidence score for the transcription, e) export of the result to "
        ".csv, .srt and .ass.\n"
    ),
    allow_flagging="never",
    # examples=[[None, "COSER-4004-01-00_5m.wav", "large-v2"]]
)
demo = gr.Blocks()
with demo:
    gr.Markdown("# Amanuensis. Audio transcription based on OpenAI Whisper.")
    gr.TabbedInterface(
        [transcribeI, transcribeII],
        ["Transcription with Whisper", "Transcription and diarization with WhisperX"],
    )

# demo.queue(concurrency_count=1).launch(enable_queue=True, auth=(os.environ['USER'], os.environ['PASSWORD']))
demo.launch(enable_queue=True)
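
# Run locally with `python app.py`; Gradio serves on http://localhost:7860 by default.
# The commented launch() call above shows how to add HTTP basic auth from
# environment variables (it additionally requires `import os`).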