import torch
import gradio as gr
from transformers import pipeline
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datetime import datetime
import time
import psutil
from mtranslate import translate


MODEL_NAME = "cahya/whisper-medium-id"  # this always needs to stay on line 8 :D sorry for the hackiness
lang = "id"
title = "Indonesian Whisperer"
description = "Cross Language Speech to Speech using OpenAI Whisper and Coqui TTS"
info = "More info at [Indonesian Whisperer](https://github.com/cahya-wirawan/indonesian-whisperer)"
badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"

device = 0 if torch.cuda.is_available() else "cpu"

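# Whisper ASR pipeline; chunk_length_s=30 lets it transcribe audio longer than 30 seconds in chunks.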
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

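# Force the decoder to transcribe in Indonesian rather than auto-detect the language or translate.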
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

def transcribe(microphone, file_upload):
    warn_output = ""
    if (microphone is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    file = microphone if microphone is not None else file_upload

    text = pipe(file)["text"]

    return warn_output + text


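# Languages supported by the Coqui TTS plugin; English is preselected in the UI.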
LANGUAGES = list(CoquiTTS.langs.keys())
default_lang = "en"

coquiTTS = CoquiTTS()


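# Full speech-to-speech path: transcribe the Indonesian audio, translate the text, then synthesize it with Coqui TTS.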
def tts(language: str, audio_file: str):
    print(f"### {datetime.now()} TTS", language, audio_file)
    transcribed = transcribe(None, audio_file)
    print(f"### {datetime.now()} transcribed:", transcribed)
    translation = translate(transcribed, language, "id")
    # synthesize the translated text into a temporary WAV file and return its path
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
        print(f"### {datetime.now()} fp.name:", fp.name)
        return fp.name


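# Gradio UI: audio upload, target-language selection, and the synthesized audio output.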
with gr.Blocks() as blocks:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
                + title
                + "</h1>")
    gr.Markdown(description)
    with gr.Row():# equal_height=False
        with gr.Column():# variant="panel"
            upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
            print("upload:", upload)
            radio = gr.Radio(
                label="Language",
                choices=LANGUAGES,
                value=default_lang
            )
            with gr.Row(): # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
        audio = gr.Audio(label="Output", interactive=False)
    memory = psutil.virtual_memory()
    gr.Markdown(info)
    system_status = (
        f"*Memory: {memory.total/(1024*1024*1024):.2f}GB, "
        f"used: {memory.percent}%, "
        f"available: {memory.available/(1024*1024*1024):.2f}GB*"
    )
    gr.Markdown(system_status)
    gr.Markdown("<center>"
                +f'<img src={badge} alt="visitors badge"/>'
                +"</center>")

    # actions
    submit.click(
        tts,
        [radio, upload],
        [audio],
    )
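    # Selecting a language returns that language's sample sentence from the Coqui plugin; no output component is wired to it here.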
    radio.change(lambda lang: CoquiTTS.langs[lang]["sentence"], radio)

blocks.launch()