import torch

import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

from transcription import fast_transcription, speech_to_text, doWhisperX
from whisperx.utils import get_writer
from audio import normalizeAudio, separateVoiceInstrumental, mp3_to_wav, stereo_to_mono, cutaudio, compose_audio
from helpers import guardar_en_archivo, guardar_dataframe_en_csv

import json
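
# Gradio app with two tabs: plain Whisper transcription (transcribe) and
# WhisperX transcription with alignment and speaker diarization (transcribeWhisperX).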


def transcribe(audiofile, model):
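    """Transcribe an uploaded audio file with Whisper (via fast_transcription).

    Steps: normalize the audio, separate the vocal stem from the background,
    convert both stems to WAV, transcribe the vocal stem in Spanish, and save
    the timestamped transcript to a text file.
    """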

    audio_path = audiofile[0].name

    audio_normalized_path = normalizeAudio(audio_path, ".wav")

    novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)

    novocal_path = mp3_to_wav(novocal_path, "novocal")
    vocal_path = mp3_to_wav(vocal_path, "vocal")

    result = fast_transcription(vocal_path, model, "es")

    # One line per segment: "<start time> <text>"
    out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]

    #transcript = "\n".join(out)
    # Save the transcript to a text file
    nombre_archivo = guardar_en_archivo(out)

    return audio_path, audio_normalized_path, vocal_path, novocal_path, nombre_archivo, "\n".join(out), json.dumps(result)

def transcribeWhisperX(audiofile, model, language, patience, 
                       initial_prompt, condition_on_previous_text, temperature,
                       compression, logprob, no_speech_threshold):
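    """Transcribe and diarize an uploaded audio file with WhisperX.

    Uses the same preprocessing as transcribe(); doWhisperX then returns the
    aligned transcript, a speaker-labelled transcript and the diarization
    segments. The aligned result is written to a timestamped .srt file and the
    diarization segments to a .csv.

    Note: the decoding parameters coming from the UI (patience, initial_prompt,
    condition_on_previous_text, temperature, compression, logprob,
    no_speech_threshold) are accepted but not yet forwarded to doWhisperX.
    """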

    audio_path = audiofile[0].name

    audio_normalized_path = normalizeAudio(audio_path, ".wav")

    novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)

    novocal_path = mp3_to_wav(novocal_path, "novocal")
    vocal_path = mp3_to_wav(vocal_path, "vocal")

    #result = fast_transcription(vocal_path, model, "es")
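    # Run WhisperX on the vocal stem: transcription, word-level alignment and
    # speaker diarization.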
    result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model)

    #out = [str(s["start"]) + " " + s["text"] for s in result["segments"]]

    #transcript = "\n".join(out)
    # Save to file
    #nombre_archivo = guardar_en_archivo(out)

    ##########################################################################
    import whisperx
    from pathlib import Path
    # device = "cuda"
    # model_a, metadata = whisperx.load_align_model(
    #     language_code="es", device=device
    # )
    # result_aligned = whisperx.align(
    #     result["segments"],
    #     model_a,
    #     metadata,
    #     vocal_path,
    #     device=device,
    # )
    import datetime
    fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    
    # Generate the output filename from the current timestamp
    nombre_archivo = f"transcription_{fecha_actual}.srt"

    file_path = Path(nombre_archivo)
    writer_args = {"highlight_words": None, "max_line_count": None, "max_line_width": None}
    srt_writer = get_writer("srt", Path("."))
    srt_writer(result_aligned, str(file_path.stem), writer_args)
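    # The SRT writer derives its output filename from the stem passed as the
    # second argument, so the subtitles should end up at ./<stem>.srt (file_path).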

    # with open(
    #    nombre_archivo,
    #     "w",
    #     encoding="utf-8",
    # ) as srt:
    #     write_srt(result_aligned["segments"], file=srt)
    ###########################################################################

    return audio_path, audio_normalized_path, vocal_path, novocal_path, str(file_path), guardar_dataframe_en_csv(diarize_segments), diarize_segments, json.dumps(result_speakers)


transcribeI = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.File(label="Upload Files", file_count="multiple"),
        gr.Radio(["base", "small", "medium", "large-v2"], label="Models", value="large-v2"),
    ],
    outputs=[gr.Audio(type="filepath", label="original"),
             gr.Audio(type="filepath", label="normalized"),
             gr.Audio(type="filepath", label="vocal"),
             gr.Audio(type="filepath", label="no_vocal"),
             gr.File(label="Archivo generado"),
             gr.TextArea(label="Transcripción"),
             gr.JSON(label="JSON Output")
        ],
    theme="huggingface",
    title="Transcripción con Whisper",
    description=(
        "Esta página realiza una transcripción de audio utilizando Whisper. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y .ass.\n"
    ),
    allow_flagging="never",
    examples=[["Espana 04 - Video 01 - extracto 2 min.wav", "large-v2"]]

)
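
# Second tab: WhisperX with speaker diarization and extra Whisper decoding
# controls (currently not forwarded to the backend, see transcribeWhisperX).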

transcribeII = gr.Interface(
    fn=transcribeWhisperX,
    inputs=[
        gr.File(label="Upload Files", file_count="multiple"),
        gr.Radio(["base", "small", "medium", "large-v2"], label="Modelo", value="large-v2"),
        gr.Dropdown(["Cualquiera","es","en","fr","pt"], label="Lenguaje", value="Cualquiera"),
        gr.Slider(minimum=0, maximum=2, label="Patience (Whisper parameter)", value=1.0, info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424; the default (1.0) is equivalent to conventional beam search"),
        gr.Textbox(label="Initial Prompt (Whisper parameter)", value=""),
        gr.Checkbox(label="Condition on previous text (Whisper parameter)", value=True),
        gr.Slider(minimum=0, maximum=1, label="Temperature (Whisper parameter)", value=0.5, info="Temperature to use for sampling"),
        gr.Slider(minimum=0, maximum=10, label="Compression Ratio Threshold (Whisper parameter)", value=2.4),
        gr.Slider(minimum=-20, maximum=0, label="Logprob Threshold (Whisper parameter)", value=-1.0),
        gr.Slider(minimum=0, maximum=1, label="No Speech Threshold (Whisper parameter)", value=0.6),
    ],
    outputs=[gr.Audio(type="filepath", label="original"),
             gr.Audio(type="filepath", label="normalized"),
             gr.Audio(type="filepath", label="vocal"),
             gr.Audio(type="filepath", label="no_vocal"),
             gr.File(label="Archivo SRT generado"),
             gr.File(label="Archivo CSV generado"),
             gr.Dataframe(label="Tabla con diarización generada"),
             gr.JSON(label="JSON Output"),
             #gr.JSON(label="JSON Output"),
             #gr.File(label="Archivo generado")
        ],
    theme="huggingface",
    title="Transcripción con WhisperX",
    description=(
        "Esta página realiza una transcripción y diarización de audio utilizando WhisperX. Además añade varias mejoras y utilidades: a) Preprocesamiento del audio y limpieza de ruido ambiental, b) Conversión de los archivos de audio a un formato compatible con Whisper, c) Cálculo de la marca temporal palabra por palabra, d) Cálculo del nivel de seguridad de la transcripción, e) Conversión del resultado a .csv, .srt y .ass, f) Diarización de los hablantes.\n"
    ),
    allow_flagging="never",
    #examples=[[None, "COSER-4004-01-00_5m.wav", "large-v2"]]

)

demo = gr.Blocks()
with demo:
    gr.Markdown("# Amanuensis. Transcripción de audios basada en OpenAI Whisper.")
    gr.TabbedInterface([transcribeI, transcribeII], ["Transcripción con Whisper", "Transcripción y diarización con WhisperX"])

#demo.queue(concurrency_count=1).launch(enable_queue=True, auth=(os.environ['USER'], os.environ['PASSWORD']))
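# Note: re-enabling the authenticated launch above also requires `import os`
# and USER/PASSWORD environment variables.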
demo.launch(enable_queue=True)