File size: 2,464 Bytes
6b9c021
 
c3ed104
6b9c021
 
ea2f95f
81a412a
c27a48e
6b9c021
ea2f95f
ce037b1
6b9c021
ea2f95f
 
 
863d084
a0291f0
ea2f95f
a0291f0
a61a05c
c27a48e
 
 
ff8667c
a4340f7
 
 
c27a48e
a4340f7
c27a48e
 
dc74cbe
 
5721619
 
6b9c021
 
a4340f7
6b9c021
 
ea2f95f
863d084
ea2f95f
6b9c021
 
 
 
 
 
a4340f7
863d084
a4340f7
6b9c021
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import gradio as gr
import torch
from transformers import pipeline


# Title shared by both Gradio tabs.
title = "Transcribe speech in several languages"
# Prefer the first CUDA GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# German-only wav2vec2 CTC model used when the user explicitly selects German.
# NOTE(review): no `device=` is passed here, so this pipeline stays on CPU even
# when a GPU is available — confirm whether that is intentional.
asr_pipe_audio2Text_Ge = pipeline(task="automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-german")
# Multilingual Whisper model: handles language auto-detection ("transcribe")
# and translation to English ("translate").
asr_pipe_whisper = pipeline(task="automatic-speech-recognition", model="openai/whisper-medium", device=device)

def transcribeFile(inputlang, audio_path : str) -> str:
    """Transcribe an audio file in the selected source language.

    Parameters
    ----------
    inputlang : str
        Radio-button value; "German" selects the dedicated German wav2vec2
        model, anything else (including "Auto Detect") uses Whisper, which
        detects the language itself.
    audio_path : str
        Filesystem path to the uploaded audio file.

    Returns
    -------
    str
        The transcribed text.
    """
    if inputlang == "German":
        transcription = asr_pipe_audio2Text_Ge(audio_path, chunk_length_s=10, stride_length_s=(4, 2), batch_size=32)
    else:
        # Fallback branch: previously only "Auto Detect" was handled and any
        # other value left `transcription` unbound (UnboundLocalError).
        # Whisper auto-detects the language, so it is a safe default.
        transcription = asr_pipe_whisper(audio_path, chunk_length_s=10, stride_length_s=(4, 2), generate_kwargs={"task":"transcribe"}, batch_size=32)
    return transcription["text"]

def translateAudio(audio_path):
    """Translate the speech in *audio_path* to English via Whisper.

    Returns the raw pipeline output dict; callers read its "text" key.
    """
    result = asr_pipe_whisper(
        audio_path,
        max_new_tokens=256,
        generate_kwargs={"task":"translate"},
    )
    return result

def transcribeFileMulti(inputlang, audio_path : str) -> str:
    """Transcribe microphone audio; for German, also append an English translation.

    Parameters
    ----------
    inputlang : str
        Radio-button value. "German" uses the German wav2vec2 model and
        appends a Whisper translation; any other value (the UI offers
        "Auto Detect") is transcribed directly with Whisper.
    audio_path : str
        Filesystem path to the recorded audio.

    Returns
    -------
    str
        Transcribed text, plus the English translation for German input.
    """
    if inputlang == "German":
        transcription = asr_pipe_audio2Text_Ge(audio_path)
        translation = translateAudio(audio_path)
        # Separator added: the original concatenated the two texts with no
        # space between them.
        return transcription["text"] + " " + translation["text"]
    # Previously this function tested for "English" while the UI supplies
    # "Auto Detect", and even the matched branch never assigned `output`,
    # so every non-German call raised UnboundLocalError. Whisper handles
    # all non-German cases directly.
    transcription = asr_pipe_whisper(audio_path)
    return transcription["text"]


    
# Tab 1: transcribe an uploaded audio file.
upload_lang_choice = gr.Radio(
    ["Auto Detect", "German"],
    value="Auto Detect",
    label="Source Language",
    info="Select the language of the speech you want to transcribe",
)
upload_audio_input = gr.Audio(source="upload", type="filepath", label="Upload audio file")
app1 = gr.Interface(
    fn=transcribeFile,
    inputs=[upload_lang_choice, upload_audio_input],
    outputs="text",
    title=title,
)


# Tab 2: transcribe live microphone input.
mic_lang_choice = gr.Radio(
    ["Auto Detect", "German"],
    value="Auto Detect",
    label="Source Language",
    info="Select the language of the speech you want to transcribe",
)
mic_audio_input = gr.Audio(source="microphone", type="filepath")
app2 = gr.Interface(
    fn=transcribeFileMulti,
    inputs=[mic_lang_choice, mic_audio_input],
    outputs="text",
    title=title,
)


# Combine the two interfaces into a single tabbed application.
demo = gr.TabbedInterface(
    [app1, app2],
    ["Audio File", "Microphone"],
)

if __name__ == "__main__":
    # Launch the Gradio server only when run as a script.
    demo.launch()