import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import soundfile as sf

# Load the Wav2Vec2 processor (feature extractor + tokenizer) and CTC model
# once at start-up so every request reuses them.
# NOTE(review): "facebook/wav2vec2-large-xlsr-53" appears to be the
# pretrained-only XLSR checkpoint; it may not ship a tokenizer / fine-tuned
# CTC head. Confirm that this is the intended checkpoint for transcription.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")


def _prepare_waveform(waveform, sample_rate, target_rate):
    """Return *waveform* as a mono float32 tensor sampled at *target_rate*.

    Args:
        waveform: Samples as returned by ``soundfile.read``: 1-D for mono,
            ``(frames, channels)`` for multi-channel audio.
        sample_rate: Sampling rate of *waveform* in Hz.
        target_rate: Sampling rate the model expects, in Hz.

    Returns:
        A 1-D ``torch.float32`` tensor.
    """
    samples = torch.as_tensor(waveform, dtype=torch.float32)
    if samples.ndim > 1:
        # Downmix multi-channel audio by averaging over the channel axis.
        samples = samples.mean(dim=1)
    if sample_rate != target_rate and samples.numel() > 0:
        # Simple linear-interpolation resampler (no anti-aliasing filter);
        # it avoids a new dependency. Swap in a dedicated resampler if
        # higher fidelity is needed.
        new_length = max(1, round(samples.numel() * target_rate / sample_rate))
        samples = torch.nn.functional.interpolate(
            samples.view(1, 1, -1),
            size=new_length,
            mode="linear",
            align_corners=False,
        ).view(-1)
    return samples


def transcribe_audio(audio):
    """Transcribe an uploaded audio file with the Wav2Vec2 CTC model.

    Args:
        audio: Either a filesystem path (``str``) or a file-like object
            exposing the path as ``.name``. ``None`` (nothing uploaded) is
            tolerated.

    Returns:
        The decoded transcription, or an empty string when there is no
        audio to transcribe.
    """
    if audio is None:
        return ""

    # Accept both a plain path and a temp-file wrapper with a ``.name``.
    audio_path = audio if isinstance(audio, str) else audio.name
    waveform, sample_rate = sf.read(audio_path)

    # The feature extractor is configured for a single sampling rate; feeding
    # audio at any other rate silently yields meaningless transcriptions.
    target_rate = processor.feature_extractor.sampling_rate
    samples = _prepare_waveform(waveform, sample_rate, target_rate)
    if samples.numel() == 0:
        return ""

    input_values = processor(
        samples.numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
    ).input_values

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]


# Gradio interface that lets users upload an audio file for transcription.
# NOTE(review): ``source=`` and ``type="file"`` belong to older Gradio
# releases; newer releases may require ``sources=["upload"]`` and
# ``type="filepath"``. Confirm against the pinned Gradio version.
# ``transcribe_audio`` accepts either input form.
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(source="upload", type="file"),
    outputs="text",
    title="Voice Login System",
    description="Upload an audio file for transcription using Wav2Vec2 model.",
)

iface.launch()