import streamlit as st
import torch
import torchaudio
import requests
from io import BytesIO
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Speech-recognition checkpoint on the Hugging Face Hub. The base model
# "facebook/wav2vec2-large-xlsr-53" ships without a tokenizer or CTC head,
# so a community checkpoint fine-tuned from it for English ASR is used here.
MODEL_NAME = "jonatasgrosman/wav2vec2-large-xlsr-53-english"


@st.cache_resource
def load_model():
    # Load (and cache) the processor and model so they are downloaded only once.
    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
    model.eval()
    return processor, model


def transcribe(waveform, sample_rate):
    # Run the model on a waveform tensor and return the decoded text.
    processor, model = load_model()
    # Mix down to mono and resample to the 16 kHz rate the model expects.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16_000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16_000)
    inputs = processor(waveform.squeeze(0).numpy(), sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    # Greedy CTC decoding: take the most likely token at each frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]


def transcribe_audio(url):
    # Download the audio file from the URL and transcribe it.
    response = requests.get(url)
    response.raise_for_status()
    waveform, sample_rate = torchaudio.load(BytesIO(response.content))
    return transcribe(waveform, sample_rate)


# Define the Streamlit app.
st.title("Speech Recognition with Hugging Face")

# Add a file uploader to allow the user to upload an audio file.
audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
if audio_file is not None:
    waveform, sample_rate = torchaudio.load(audio_file)
    transcription = transcribe(waveform, sample_rate)
    # Display the transcription.
    st.write("Transcription:")
    st.write(transcription)

# Add a text input to allow the user to enter a URL of an audio file.
url = st.text_input("Enter the URL of an audio file")
if url:
    # Transcribe the audio from the URL using the model.
    transcription = transcribe_audio(url)
    # Display the transcription.
    st.write("Transcription:")
    st.write(transcription)
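
# A minimal sketch of how this app might be run locally, assuming the packages
# above are not yet installed (exact versions and audio backends may differ):
#
#   pip install streamlit torch torchaudio transformers requests
#   streamlit run app.py
#
# Note: decoding mp3 uploads depends on torchaudio having an audio backend
# (e.g. ffmpeg or sox) available on the host system.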