import torch
import spaces
import gradio as gr

from transformers import pipeline

MODEL_NAME = "openai/whisper-small"

device = 0 if torch.cuda.is_available() else "cpu"
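# Whisper ASR pipeline; chunk_length_s=30 transcribes long recordings in
# 30-second chunks, since Whisper itself only attends to 30 seconds of audio at a time.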
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

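# Pin the decoder prompt to the transcription task so Whisper transcribes in the
# spoken language instead of translating to English.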
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(task="transcribe")

@spaces.GPU(duration=240)
def transcribe(audio):
    # The gr.Audio component below passes a single filepath, whether the recording
    # came from the microphone or an uploaded file.
    if audio is None:
        raise gr.Error("Please record or upload an audio file first.")
    return pipe(audio)["text"]


#---------------------------------------------------------------
import ctranslate2
from huggingface_hub import snapshot_download
from sentencepiece import SentencePieceProcessor

model_name = "santhosh/madlad400-3b-ct2"
model_path = snapshot_download(model_name)
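# Load the SentencePiece tokenizer shipped with the checkpoint and a CTranslate2
# translator over the converted MADLAD-400 3B weights (CPU by default; a device
# argument can be passed to ctranslate2.Translator to run on GPU).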

tokenizer = SentencePieceProcessor()
tokenizer.load(f"{model_path}/sentencepiece.model")
translator = ctranslate2.Translator(model_path)
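# The first few hundred vocabulary pieces include MADLAD's target-language tokens
# such as "<2en>"; strip the "<2...>" wrapper to build the dropdown's language codes.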
tokens = [tokenizer.decode(i) for i in range(460)]
lang_codes = [token[2:-1] for token in tokens if token.startswith("<2")]


@spaces.GPU(duration=240)
def translate(input_text, target_language):
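    # MADLAD-400 expects the target language as a "<2xx>" token prefixed to the input text.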
    input_tokens = tokenizer.encode(f"<2{target_language}> {input_text}", out_type=str)
    results = translator.translate_batch(
        [input_tokens],
        batch_type="tokens",
        beam_size=1,
        no_repeat_ngram_size=1,
    )
    translated_sentence = tokenizer.decode(results[0].hypotheses[0])
    return translated_sentence


def translate_interface(input_text, target_language):
    # Thin wrapper wired to the Translate button; translate() already reserves
    # the GPU via @spaces.GPU, so the decorator is not repeated here.
    return translate(input_text, target_language)


with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
        """
       
        <div style="text-align: left;">
            <a href='https://huggingface.co/PhuongPhan'><img style='display: inline-block; margin: 0; padding: 0;' src='https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg' alt='Follow me on HF'></a>
            <a href='https://github.com/PhuongFX'><img style='display: inline-block; margin: 0; padding: 0;' src='https://img.shields.io/badge/GitHub%20Pages-121013?logo=github&logoColor=white' alt='GitHub Pages'></a>
        </div>

        """ )

        gr.Markdown("<h1 style='text-align: center;'>🎀 Speech to Text & Translation πŸ—£οΈ</h1>")

        gr.HTML(
            "<p style='text-align: center'>"
                "🐀 <a href='https://huggingface.co/openai/whisper-small' target='_blank'>OpenAI Whisper</a>  | "
                "πŸ§‘β€πŸ’» <a href='https://huggingface.co/google/madlad400-3b-mt' target='_blank'>Google Madlad</a>"
            "</p>")
        
        gr.Markdown("<p style='text-align: center;'><i>Upload an audio file or use your microphone to transcribe speech and then translate it to different languages.</i></p>")    

    
    with gr.Row():
        # First interface for transcription
        gr.Markdown("## πŸŽ™οΈ Transcribe Audio ") 
        gr.Markdown("---")
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath")
        transcribe_button = gr.Button("Transcribe")
        transcribed_output = gr.Textbox(label="Transcribed Text")
        transcribe_button.click(transcribe, inputs=audio_input, outputs=transcribed_output)

    with gr.Row():
        # Second interface for translation
        gr.Markdown("## 🌐 Translate Text 🌐")
        gr.Markdown("---") 
        lang_dropdown = gr.Dropdown(lang_codes, value="en", label="Target Language")
        translate_button = gr.Button("Translate")
        translated_output = gr.Textbox(label="Translated Text")
        translate_button.click(translate_interface, inputs=[transcribed_output, lang_dropdown], outputs=translated_output)


    gr.Examples(
            examples=[
                "Speech_samples/consumer4.wav", 
                "Speech_samples/samples_audio-files_05-gettysburg-address-2min.wav",
                "Speech_samples/samples_audio-files_12-jfk-speech-12sec.wav",
                "Speech_samples/harvard_3mins.wav",
            ],
            inputs=audio_input,
            label="Try these examples"
        )

    
    gr.Markdown("---")
    with gr.Accordion("See Details", open=False):

        gr.Markdown("---")
        gr.Markdown('''

## Description 📝

> Uses the OpenAI Whisper Small model to transcribe audio files into text and the Google MADLAD model to translate the transcribed text into multiple languages.
> Lets users convert spoken words into written text.
> Supports various use cases, including transcription of audio files, detection of phrases, speech-to-text generation, and translation of text.

## How it Works 🫶

- Upload an audio file or record a new one directly in the app.
- Transcribe the audio into text; the result can be copied for further use.
- Optionally translate the transcribed text into other languages.

## Usage 🤗

1. Transcribe audio files for note-taking, research, or content creation
2. Detect phrases or keywords in audio recordings for data analysis or market research
3. Generate text from speech for speech-to-text applications such as subtitles, closed captions, or voice assistants
4. Use the app for language learning by transcribing audio files in a foreign language and practicing pronunciation
5. Translate the transcribed text into multiple languages for global communication

## Disclaimer 🙅‍♂️

> This app is for personal use only and should not be used for commercial purposes.
> The OpenAI Whisper Small and Google MADLAD models are pre-trained and may not always produce accurate results.''')

    demo.queue(max_size=20)
    demo.launch()