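"""Gradio demo for Konkani speech recognition using a fine-tuned Whisper-small model."""

# --- Legacy variant kept for reference: tabbed gr.Blocks UI (microphone + file upload) ---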
# from transformers import WhisperTokenizer
# import os
# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small") #, language="marathi", task="transcribe"

# from transformers import pipeline
# import gradio as gr
# import torch 

# pipe = pipeline(model="thak123/gom-stt-v3", #"thak123/whisper-small-LDC-V1", #"thak123/whisper-small-gom",
#                 task="automatic-speech-recognition", tokenizer=tokenizer)  # change to "your-username/the-name-you-picked"

# # pipe.model.config.forced_decoder_ids = (
# #         pipe.tokenizer.get_decoder_prompt_ids(
# #             language="marathi", task="transcribe"
# #         )
# #     )

# def transcribe_speech(filepath):
#     output = pipe(
#         filepath,
#         max_new_tokens=256,
#         generate_kwargs={
#             "task": "transcribe",
#             "language": "konkani",
#         },  # update with the language you've fine-tuned on
#         chunk_length_s=30,
#         batch_size=8,
#         padding=True
#     )
#     return output["text"]


# demo = gr.Blocks()

# mic_transcribe = gr.Interface(
#     fn=transcribe_speech,
#     inputs=gr.Audio(sources="microphone", type="filepath"),
#     outputs=gr.components.Textbox(),
# )

# file_transcribe = gr.Interface(
#     fn=transcribe_speech,
#     inputs=gr.Audio(sources="upload", type="filepath"),
#     outputs=gr.components.Textbox(),
# )
# with demo:
#     gr.TabbedInterface(
#         [mic_transcribe, file_transcribe],
#         ["Transcribe Microphone", "Transcribe Audio File"],
#     )

# demo.launch(debug=True)

# --- Current app: single-screen Gradio demo ---
from transformers import WhisperTokenizer, pipeline
import gradio as gr

# Whisper has no dedicated Konkani language token; Marathi is the closest
# supported Devanagari-script language, so its tokenizer settings act as a proxy.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="marathi", task="transcribe")

pipe = pipeline(model="thak123/gom-stt-v3", task="automatic-speech-recognition", tokenizer=tokenizer)
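
# Optional (kept disabled, as in the legacy block above): pin the decoder prompt
# so every generation transcribes via the Marathi language token.
# pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
#     language="marathi", task="transcribe"
# )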

def transcribe(audio):
    # For a single input the ASR pipeline returns a dict, e.g. {"text": "..."},
    # not a list, so index by key rather than by position.
    result = pipe(audio)
    text = result["text"]
    print("op", text)  # debug log
    return text

iface = gr.Interface(
    fn=transcribe,
    # type="filepath" hands the pipeline a path on disk, which it can decode;
    # the default numpy tuple from gr.Audio is not a format the pipeline accepts.
    inputs=[gr.Audio(sources=["microphone", "upload"], type="filepath")],
    outputs="text",
    examples=[
        ["audio/chalyaami.mp3"],
        ["audio/ekdonteen.flac"],
        ["audio/heyatachadjaale.mp3"],
    ],
    title="Whisper Konkani",
    description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()
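
# Quick smoke test without the web UI (a sketch using one of the example clips above):
# print(transcribe("audio/chalyaami.mp3"))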