"""Gradio demo for speech-to-text with a fine-tuned Whisper model.

The app exposes two tabs -- one transcribes a microphone recording, the other
an uploaded audio file. Both tabs call :func:`transcribe_speech`, which runs
the audio through a Hugging Face automatic-speech-recognition pipeline.
"""

import os  # noqa: F401  (not used by the active code; kept from the original)

import gradio as gr
import torch  # noqa: F401  (not used by the active code; kept from the original)
import torchaudio  # noqa: F401  (only needed if manual resampling is re-enabled)
from transformers import WhisperTokenizer, pipeline

# Fine-tuned checkpoint used for recognition. Earlier checkpoints that were
# tried: "thak123/whisper-small-LDC-V1", "thak123/whisper-small-gom".
# Change to "your-username/the-name-you-picked" to serve a different model.
MODEL_ID = "thak123/gom-stt-v3"

# The tokenizer is taken from the base Whisper checkpoint, not from MODEL_ID.
TOKENIZER_ID = "openai/whisper-small"

# NOTE: no language is pinned, either on the tokenizer or through
# `forced_decoder_ids`; earlier experiments with language="marathi" were left
# disabled. Only the task ("transcribe") is fixed.
tokenizer = WhisperTokenizer.from_pretrained(TOKENIZER_ID, task="transcribe")

pipe = pipeline(
    model=MODEL_ID,
    task="automatic-speech-recognition",
    tokenizer=tokenizer,
)


def transcribe_speech(filepath):
    """Transcribe the audio file at ``filepath`` and return the text.

    Args:
        filepath: Path to an audio file, as provided by a ``gr.Audio``
            component configured with ``type="filepath"``. Gradio passes
            ``None`` when the user submits without recording or uploading.

    Returns:
        The ``"text"`` entry of the pipeline output.

    Raises:
        gr.Error: If no audio was provided, so the UI shows a readable
            message instead of an opaque pipeline failure.
    """
    if not filepath:
        raise gr.Error("No audio received. Please record or upload a clip first.")

    output = pipe(
        filepath,
        # Add "language": ... here to pin the language you fine-tuned on.
        generate_kwargs={"task": "transcribe"},
        chunk_length_s=30,
        batch_size=8,
    )
    return output["text"]


demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.components.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.components.Textbox(),
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )


if __name__ == "__main__":
    # Only start the web server when run as a script, so importing this
    # module (e.g. to reuse `transcribe_speech` or `demo`) does not block.
    demo.launch(debug=True)