Spaces:

viktor-enzell
/

wav2vec2-large-voxrex-swedish-4gram

Runtime error

viktor-enzell committed on Jun 5, 2022

Commit

4dce433

•

1 Parent(s): 091b848

Caching inference function.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -40,13 +40,18 @@ class ASR:
         return self.processor.batch_decode(logits.cpu().numpy()).text[0].lower()
-@st.cache(allow_output_mutation=True, show_spinner=True)
 def load_model():
     asr = ASR()
     asr.load_model()
     return asr
 if __name__ == "__main__":
     st.set_page_config(
         page_title="Swedish Speech-to-Text",
@@ -57,12 +62,13 @@ if __name__ == "__main__":
         width=100,
     )
     st.markdown("""
-    # Swedish high-quality transcription
-    Generate Swedish transcripts for download from an audio file with this high-quality speech-to-text model. The model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
     """)
-    asr = load_model()
     uploaded_file = st.file_uploader("Choose a file", type=[".wav"])
     if uploaded_file is not None:
@@ -74,7 +80,8 @@ if __name__ == "__main__":
             # audio_output = ffmpeg.output(audio_input, "tmp.wav", format="wav")
             # ffmpeg.run(audio_output)
-        transcript = asr.run_inference(uploaded_file)
         st.download_button("Download transcript", transcript, "transcript.txt")

         return self.processor.batch_decode(logits.cpu().numpy()).text[0].lower()
+@st.cache(allow_output_mutation=True, show_spinner=False)
 def load_model():
     asr = ASR()
     asr.load_model()
     return asr
+@st.cache(allow_output_mutation=True, hash_funcs={ASR: lambda _: None}, show_spinner=False)
+def run_inference(asr, file):
+    return asr.run_inference(file)
 if __name__ == "__main__":
     st.set_page_config(
         page_title="Swedish Speech-to-Text",
         width=100,
     )
     st.markdown("""
+    # Swedish Speech-to-text
+    Generate and download high-quality Swedish transcripts for your audio files. The speech-to-text model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
     """)
+    with st.spinner(text="Loading model..."):
+        asr = load_model()
     uploaded_file = st.file_uploader("Choose a file", type=[".wav"])
     if uploaded_file is not None:
             # audio_output = ffmpeg.output(audio_input, "tmp.wav", format="wav")
             # ffmpeg.run(audio_output)
+        with st.spinner(text="Transcribing..."):
+            transcript = run_inference(asr, uploaded_file)
         st.download_button("Download transcript", transcript, "transcript.txt")