File size: 3,166 Bytes
a48dac6
3915d32
a48dac6
 
 
 
 
4bcd948
a48dac6
 
 
 
4bcd948
a48dac6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ca4d96
a48dac6
ff7533e
a48dac6
 
ff7533e
 
4bcd948
7538ca1
65eb9eb
e2afe6d
 
 
65eb9eb
3d69495
7d18ad5
a48dac6
ff7533e
aea591d
a48dac6
 
 
 
c66ab14
2f2f365
7d18ad5
d40a14a
a48dac6
 
 
 
 
 
 
71b3330
e2afe6d
a48dac6
 
 
792ee14
 
 
 
 
 
 
 
 
 
71b3330
e2afe6d
792ee14
 
a67cf5b
792ee14
 
a48dac6
792ee14
 
3915d32
792ee14
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import torch
import gradio as gr
import pytube as pt
from transformers import pipeline
from huggingface_hub import model_info
import time
import unicodedata
# from gradio.themes.utils.theme_dropdown import create_theme_dropdown

# Hub id of the checkpoint loaded into the ASR pipeline below.
MODEL_NAME = "SakshiRathi77/wav2vec2-large-xlsr-300m-hi-kagglex"
# Language code; not referenced anywhere else in the visible part of this file.
lang = "hi"

# my_theme = gr.Theme.from_hub('freddyaboulton/dracula_revamped')
# Use CUDA device 0 when a GPU is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Shared speech-recognition pipeline used by both transcription callbacks.
pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, device=device)

def transcribe(microphone, file_upload):
    """Transcribe a recorded or uploaded audio clip with the shared ASR pipeline.

    Args:
        microphone: Audio from the microphone input, or ``None`` if unused.
        file_upload: Audio from the upload input, or ``None`` if unused.

    Returns:
        The transcription text. It is prefixed with a warning line when both
        inputs were supplied; an error message is returned instead when
        neither was.
    """
    has_recording = microphone is not None
    has_upload = file_upload is not None

    # Nothing to transcribe: report the problem as the output text.
    if not (has_recording or has_upload):
        return "ERROR: You have to either use the microphone or upload an audio file"

    notice = ""
    if has_recording and has_upload:
        notice = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes priority over the uploaded file.
    chosen = microphone if has_recording else file_upload
    result = pipe(chosen)
    return notice + result["text"]


def rt_transcribe(audio, state=""):
    """Transcribe one audio chunk and append it to the running transcript.

    Args:
        audio: Audio input forwarded to the ASR pipeline. ``None`` means there
            is nothing to transcribe for this call.
        state: Transcript accumulated by earlier calls. ``None`` is treated as
            an empty transcript.

    Returns:
        A ``(transcript, transcript)`` tuple: the same string twice, one value
        for each of the two outputs this callback feeds.
    """
    transcript = state or ""

    # NOTE(review): a live/streaming callback can presumably fire without any
    # audio (e.g. after the recording is cleared); return the transcript as-is
    # instead of sleeping and then crashing inside the pipeline.
    if audio is None:
        return transcript, transcript

    # Fixed 2 s pause kept from the original; presumably throttles live updates.
    time.sleep(2)
    text = pipe(audio)["text"]
    # NFC-normalize so composed/decomposed character sequences are consistent.
    transcript += unicodedata.normalize("NFC", text) + " "

    return transcript, transcript



# Root Blocks container: the tabbed UI is created inside it (`with demo:`) and it is the object that gets launched.
demo = gr.Blocks()
# Sample clips offered as examples; each inner list is one example row.
examples = [
    ["examples/example1.mp3"],
    ["examples/example2.mp3"],
    ["examples/example3.mp3"],
]

# Heading text shared by both interfaces.
title = """
HindiSpeechPro: WAV2VEC-Powered ASR Interface
"""

# HTML blurb shared by both interfaces. The <img> tag must end in a plain ">";
# a stray ";" before it is malformed markup.
description = """
<p>
<center>
Welcome to the HindiSpeechPro, a cutting-edge interface powered by a fine-tuned version of facebook/wav2vec2-xls-r-300m on the common_voice dataset.
<img src="https://huggingface.co/spaces/SakshiRathi77/SakshiRathi77-Wav2Vec2-hi-kagglex/resolve/main/Images/main_image2.png" alt="logo">
</center>
</p>
"""


# article = "<p style='text-align: center'><a href='https://github.com/SakshiRathi77/ASR' target='_blank'>Source Code on Github</a></p><p style='text-align: center'><a href='https://huggingface.co/blog/fine-tune-xlsr-wav2vec2' target='_blank'>Reference</a></p><p style='text-align: center'><a href='https://forms.gle/hjfc3F1P7m3weQVAA' target='_blank'><img src='https://e7.pngegg.com/pngimages/794/310/png-clipart-customer-review-feedback-user-service-others-miscellaneous-text-thumbnail.png' alt='Feedback Form' ;></a></p>"


# Interface for one-shot transcription from a microphone recording or an
# uploaded file; both inputs are handed to `transcribe` as file paths.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        # Use gr.Audio directly (as the streaming interface in this file does)
        # rather than the deprecated gr.inputs.Audio wrapper.
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs="text",
    # theme="huggingface",
    title=title,
    description=description,
    allow_flagging="never",
    examples=examples,
)

# Live interface: streams microphone audio into the `rt_transcribe` callback and
# carries the running transcript between calls through the "state" slot.
# NOTE(review): this assignment rebinds the name `rt_transcribe` from the
# callback function to the Interface object; the function is captured via
# `fn=` before the rebinding takes effect.
rt_transcribe = gr.Interface(
    fn=rt_transcribe,
    inputs=[gr.Audio(source="microphone", type="filepath", streaming=True), "state"],
    outputs=["textbox", "state"],
    live=True,
    # theme="huggingface",
    title=title,
    description=description,
    allow_flagging="never",
)


# Mount both interfaces as tabs inside the root Blocks container.
with demo:
    gr.TabbedInterface(
        interface_list=[mf_transcribe, rt_transcribe],
        tab_names=["Transcribe Audio", "Transcribe Realtime Voice"],
    )

# Start the app, requesting a public share link.
demo.launch(share=True)