import ffmpeg
import streamlit as st
import torch
import torchaudio
import torchaudio.functional as F
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

st.set_page_config(
    page_title="Swedish Speech-to-Text",
    page_icon="🎙️",
)
st.image(
    "https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/320/apple/325/studio-microphone_1f399-fe0f.png",
    width=100,
)
st.markdown("""
# Swedish high-quality transcription

Upload an audio file to generate a downloadable Swedish transcript with a
high-quality speech-to-text model. The model is KBLab's wav2vec 2.0 large
VoxRex Swedish (C) with a 4-gram language model, which you can access
[here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
""")

model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)


def run_inference(file):
    """Transcribe the uploaded audio file and return the transcript text."""
    waveform, sample_rate = torchaudio.load(file)

    # The model expects 16 kHz audio; resample if needed and keep channel 0.
    if sample_rate == 16_000:
        waveform = waveform[0]
    else:
        waveform = F.resample(waveform, sample_rate, 16_000)[0]

    inputs = processor(
        waveform,
        sampling_rate=16_000,
        return_tensors="pt",
        padding=True,
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Decode on CPU with the 4-gram language model.
    return processor.batch_decode(logits.cpu().numpy()).text[0].lower()


uploaded_file = st.file_uploader("Choose a file", type=[".wav"])

if uploaded_file is not None:
    if uploaded_file.type != "audio/wav":
        # TODO: convert to wav. A sketch with ffmpeg-python, piping the
        # uploaded bytes through ffmpeg's stdin (currently unreachable,
        # since the uploader only accepts .wav files):
        # data = uploaded_file.getvalue()
        # stream = ffmpeg.input("pipe:").output("tmp.wav", format="wav")
        # ffmpeg.run(stream, input=data, overwrite_output=True)
        pass

    transcript = run_inference(uploaded_file)

    st.download_button(
        "Download transcript",
        transcript,
        f"{uploaded_file.name}-swedish-transcript.txt",
    )

    with st.expander("Transcript", expanded=True):
        st.write(transcript)
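
# Note: Streamlit re-runs this whole script on every widget interaction, so the
# model and processor above are reloaded on each rerun. On recent Streamlit
# versions (>= 1.18) the load can be cached across reruns with
# st.cache_resource. A minimal sketch, using a hypothetical load_model helper
# that is not part of the original app:
#
# @st.cache_resource
# def load_model():
#     model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
#     processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
#     return model, processor
#
# model, processor = load_model()
#
# Run the app locally with `streamlit run app.py` (assuming this file is
# saved as app.py).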