from transformers import WhisperTokenizer, pipeline
import gradio as gr

# Base Whisper-small tokenizer; language/task could also be pinned here,
# e.g. language="marathi", task="transcribe".
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")

# Fine-tuned Whisper-small checkpoint for Konkani speech recognition.
# Earlier checkpoints: "thak123/whisper-small-LDC-V1", "thak123/whisper-small-gom".
pipe = pipeline(
    task="automatic-speech-recognition",
    model="thak123/gom-stt-v3",
    tokenizer=tokenizer,
)

# pipe.model.config.forced_decoder_ids = (
#         pipe.tokenizer.get_decoder_prompt_ids(
#             language="marathi", task="transcribe"
#         )
#     )
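# (forced_decoder_ids would pin language/task at generation time; this app
# passes them per call via generate_kwargs in transcribe_speech instead.)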

def transcribe_speech(filepath):
    """Transcribe the audio file at `filepath` and return the recognized text."""
    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "konkani",
        },  # update with the language you've fine-tuned on
        chunk_length_s=30,  # split long audio into 30-second chunks
        batch_size=8,  # decode chunks in batches of 8
    )
    return output["text"]
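
# Minimal sanity check outside the UI (hypothetical path; assumes a local sample
# clip such as audio/chalyaami.mp3 from this repo):
#     print(transcribe_speech("audio/chalyaami.mp3"))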


demo = gr.Blocks()

# Two interfaces share the same transcription function: one records from the
# microphone, the other accepts an uploaded audio file.
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.components.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.components.Textbox(),
)

# Present the two interfaces as tabs.
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)
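
# When running outside a Hugging Face Space, a public link can be requested with
# demo.launch(debug=True, share=True).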


# from transformers import WhisperTokenizer, pipeline
# import gradio as gr
# import os

# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="marathi", task="transcribe")

# pipe = pipeline(model="thak123/gom-stt-v3", task="automatic-speech-recognition", tokenizer=tokenizer)

# def transcribe(audio):
#     result = pipe(audio)
#     text = result["text"]  # the pipeline returns a dict, not a list
#     print("op", text)
#     return text

# iface = gr.Interface(
#     fn=transcribe,
#     inputs=[gr.Audio(sources=["microphone", "upload"])],
#     outputs="text",
#     examples=[
#         [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
#         [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
#         [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
#     ],
#     title="Whisper Konkani",
#     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# )

# iface.launch()