# NOTE(review): the three lines here were Hugging Face Spaces page residue
# ("Spaces:" / "Runtime error"), not source code; kept only as this comment
# so the module parses.
import json
import os

import gradio as gr
import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from whisperx.utils import get_writer

from audio import normalizeAudio, separateVoiceInstrumental, mp3_to_wav, stereo_to_mono, cutaudio, compose_audio
from helpers import guardar_en_archivo, guardar_dataframe_en_csv, generar_transcripcion
from helpers import crear_diccionario, generar_html_palabras
from transcription import fast_transcription, speech_to_text, doWhisperX
def transcribe(audiofile, model, preprocesamiento):
    """Transcribe an uploaded audio file with Whisper and emit TXT/SRT files.

    Args:
        audiofile: gradio ``File`` object; only its ``.name`` (path) is used.
        model: Whisper model size ("base", "small", "medium", "large-v2").
        preprocesamiento: when equal to "Pre-procesamiento del audio", the
            audio is normalized and split into vocal/instrumental tracks;
            any other value uses the raw file for every stage.

    Returns:
        Tuple matching the gradio outputs: (original path, normalized path,
        vocal path, no-vocal path, TXT filename, SRT filename, plain-text
        transcript, JSON string with the first 10 segments).
    """
    audio_path = audiofile.name

    if preprocesamiento == "Pre-procesamiento del audio":
        # Normalize loudness, then separate voice from background; the
        # isolated vocal track is what gets transcribed.
        audio_normalized_path = normalizeAudio(audio_path, ".wav")
        novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
        novocal_path = mp3_to_wav(novocal_path, "novocal")
        vocal_path = mp3_to_wav(vocal_path, "vocal")
    else:
        audio_normalized_path = audio_path
        novocal_path = audio_path
        vocal_path = audio_path

    # Language is hard-coded to Spanish for this tab.
    result = fast_transcription(vocal_path, model, "es")
    out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]

    # Plain-text transcript file.
    nombre_archivo = guardar_en_archivo(out)

    # SRT subtitle file; timestamped name avoids collisions between runs.
    from pathlib import Path
    import datetime

    fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    nombre_archivo_srt = f"transcription_{fecha_actual}.srt"
    file_path = Path(nombre_archivo_srt)

    writter_args = {"highlight_words": None, "max_line_count": None, "max_line_width": None}
    srt_writer = get_writer("srt", Path("."))
    result["language"] = "es"
    srt_writer(result, file_path, writter_args)

    # JSON preview limited to the first 10 segments. Copy the dict first:
    # the original code truncated `result["segments"]` through an alias,
    # mutating the full result as a side effect.
    results_short = dict(result)
    results_short["segments"] = results_short["segments"][0:10]

    return (audio_path, audio_normalized_path, vocal_path, novocal_path,
            nombre_archivo, nombre_archivo_srt, "\n".join(out),
            json.dumps(results_short))
def transcribeWhisperX(audiofile, model, language, preprocesamiento):
    """Transcribe with WhisperX: word-level timestamps plus speaker diarization.

    Args:
        audiofile: gradio ``File`` object; only its ``.name`` (path) is used.
        model: Whisper model size ("base", "small", "medium", "large-v2").
        language: language code ("es", "en", ...) or "Cualquiera" as offered
            by the UI dropdown; passed straight through to ``doWhisperX``.
        preprocesamiento: when equal to "Pre-procesamiento del audio", the
            audio is normalized and split into vocal/instrumental tracks;
            any other value uses the raw file for every stage.

    Returns:
        Tuple matching the gradio outputs: (original path, normalized path,
        vocal path, no-vocal path, transcript text, confidence HTML, TXT
        filename, SRT filename, diarization CSV filename, JSON string with
        the first 10 segments).
    """
    audio_path = audiofile.name

    if preprocesamiento == "Pre-procesamiento del audio":
        # Same preprocessing pipeline as `transcribe`.
        audio_normalized_path = normalizeAudio(audio_path, ".wav")
        novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
        novocal_path = mp3_to_wav(novocal_path, "novocal")
        vocal_path = mp3_to_wav(vocal_path, "vocal")
    else:
        audio_normalized_path = audio_path
        novocal_path = audio_path
        vocal_path = audio_path

    result_whisper, result_aligned, result_speakers, diarize_segments = doWhisperX(
        vocal_path, whisper_model=model, language=language)

    # SRT subtitle file; timestamped name avoids collisions between runs.
    from pathlib import Path
    import datetime

    fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    nombre_archivo = f"transcription_{fecha_actual}.srt"
    file_path = Path(nombre_archivo)

    writter_args = {"highlight_words": None, "max_line_count": None, "max_line_width": None}
    srt_writer = get_writer("srt", Path("."))
    result_aligned["language"] = language
    srt_writer(result_aligned, file_path, writter_args)

    # Speaker-labelled plain-text transcript.
    lineas_txt, nombre_file_txt = generar_transcripcion(result_speakers)
    lineas_txt_string = "\n".join(lineas_txt)

    # Per-word confidence scores rendered as colored HTML.
    dout = crear_diccionario(result_speakers)
    htmlout = generar_html_palabras(dout["word"], dout["score"])

    # JSON preview limited to the first 10 segments. Copy the dict first:
    # the original code truncated `result_speakers["segments"]` through an
    # alias, mutating the full result as a side effect.
    results_short = dict(result_speakers)
    results_short["segments"] = results_short["segments"][0:10]

    return (audio_path, audio_normalized_path, vocal_path, novocal_path,
            lineas_txt_string, htmlout, nombre_file_txt, str(file_path),
            guardar_dataframe_en_csv(diarize_segments), json.dumps(results_short))
# Tab 1: plain Whisper transcription (Spanish only).
# NOTE(review): the Spanish strings below were mojibake in the original
# (UTF-8 decoded as GBK, e.g. "Transcripci贸n"); restored to proper accents.
transcribeI = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.File(label="Upload Files"),  # , file_count="multiple"
        gr.Radio(["base", "small", "medium", "large-v2"], label="Models", value="large-v2"),
        gr.Radio(["Audio Original", "Pre-procesamiento del audio"], label="Mejora del audio", value="Pre-procesamiento del audio"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="original"),
        gr.Audio(type="filepath", label="normalized"),
        gr.Audio(type="filepath", label="vocal"),
        gr.Audio(type="filepath", label="no_vocal"),
        gr.File(label="Archivo TXT generado"),
        gr.File(label="Archivo SRT generado"),
        gr.TextArea(label="Transcripción. (Tiempo en segundos)"),
        gr.JSON(label="JSON Output"),
    ],
    theme="huggingface",
    title="Transcripción con Whisper",
    description=(
        "Esta página realiza una transcripción de audio utilizando Whisper. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y ass.\n"
    ),
    allow_flagging="never",
    examples=[["Espana 04 - Video 01 - extracto 2 min.wav", "large-v2", "Pre-procesamiento del audio"]],
)
# Tab 2: WhisperX transcription with word-level timing and speaker turns.
# NOTE(review): mojibake in the Spanish strings restored (see transcribeI);
# commented-out Whisper tuning sliders from the original were dropped as
# dead code — transcribeWhisperX never accepted them.
transcribeII = gr.Interface(
    fn=transcribeWhisperX,
    inputs=[
        gr.File(label="Upload Files"),
        gr.Radio(["base", "small", "medium", "large-v2"], label="Modelo", value="large-v2"),
        gr.Dropdown(["Cualquiera", "es", "en", "fr", "pt"], label="Lenguaje", value="Cualquiera"),
        gr.Radio(["Audio Original", "Pre-procesamiento del audio"], label="Mejora del audio", value="Pre-procesamiento del audio"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="original"),
        gr.Audio(type="filepath", label="normalized"),
        gr.Audio(type="filepath", label="vocal"),
        gr.Audio(type="filepath", label="no_vocal"),
        gr.TextArea(label="Transcripción"),
        gr.HTML(label="Scoring color mapping"),
        gr.File(label="Archivo TXT generado"),
        gr.File(label="Archivo SRT generado con turno de palabra"),
        gr.File(label="Archivo CSV generado con turno de palabra"),
        gr.JSON(label="Resultados estructurados en JSON palabra por palabra"),
    ],
    theme="huggingface",
    title="Transcripción con WhisperX",
    description=(
        "Esta página realiza una transcripción de audio utilizando Whisper. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y ass.\n"
    ),
    allow_flagging="never",
    examples=[[
        "Espana 04 - Video 01 - extracto 2 min.wav",
        "large-v2",
        "Cualquiera",
        "Pre-procesamiento del audio",
    ]],
)
# Assemble both interfaces into a tabbed app and launch it.
# NOTE(review): mojibake in the markdown restored; "whiserX" typo fixed.
demo = gr.Blocks()
with demo:
    gr.Markdown("# Amanuensis. Transcripción de audios basada en OpenAI Whisper.")
    gr.Markdown(""" ## Muestras
    - Bajar muestra corta aquí: https://drive.google.com/file/d/1dP45fLKHoj8_MfUFF1H9uP11QEskdRf3/view?usp=share_link
    - Bajar muestra larga aquí: https://drive.google.com/file/d/1Nd7Ho3qfsAo33fth4lKGHd7EzuYZ6ZTo/view?usp=share_link
    """)
    gr.TabbedInterface([transcribeI, transcribeII], ["Transcripción con Whisper", "Transcripción y turno de palabra con WhisperX"])
    gr.Markdown("Interfaz desarrollada por Carlos Vivar Rios usando las tecnologías de código abierto whisper, whisperX, y. Si quieres utilizar esta herramienta o una solución personalizada: carlosvivarrios@gmail.com")
    gr.Markdown("Echa un ojo a mis otros proyectos: carlosvivarrios.com")

# Basic-auth credentials come from the environment; launch fails with a
# KeyError if USER/PASSWORD are not set, which is intentional for a Space.
demo.queue(concurrency_count=2).launch(enable_queue=True, auth=(os.environ['USER'], os.environ['PASSWORD']))