Kr08 committed on
Commit b815c4a
1 Parent(s): aa348cd

Generate French texts for now


First transcription commit
To Do:
1. Predict the language from the uploaded files (see the first sketch after the diff).
2. Add an audio player with temporally fused text (see the second sketch after the diff).

Files changed (1)
app.py +37 -7
app.py CHANGED
@@ -1,11 +1,25 @@
-import torchaudio as ta
+import torch
 import streamlit as st
+import torchaudio as ta
 
 from io import BytesIO
-from transformers import AutoProcessor, SeamlessM4TModel
+from transformers import AutoProcessor, SeamlessM4TModel, WhisperProcessor, WhisperForConditionalGeneration
+
+if torch.cuda.is_available():
+    device = "cuda:0"
+    torch_dtype = torch.float16
+else:
+    device = "cpu"
+    torch_dtype = torch.float32
 
-processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium", use_fast=False)
-model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
+SAMPLING_RATE = 16000
+task = "transcribe"
+
+print(f"{device} Active!")
+
+# Load the Whisper model and processor
+processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
 
 # Title of the app
 st.title("Audio Player with Live Transcription")
@@ -30,12 +44,12 @@ submit_button = st.sidebar.button("Submit")
 # return f"Could not request results; {e}"
 
 
-if submit_button and uploaded_files:
+if submit_button and uploaded_files is not None:
    st.write("Files uploaded successfully!")
 
    for uploaded_file in uploaded_files:
        # Display the file name and an audio player
-        print(uploaded_file)
+
        st.write(f"**File name**: {uploaded_file.name}")
        st.audio(uploaded_file, format=uploaded_file.type)
 
@@ -44,8 +58,24 @@ if submit_button and uploaded_files:
 
        # Read the uploaded file data (ta.load needs a file-like object, hence BytesIO)
        waveform, sampling_rate = ta.load(BytesIO(uploaded_file.getvalue()))
+        resampled_inp = ta.functional.resample(waveform, orig_freq=sampling_rate, new_freq=SAMPLING_RATE)
 
+        input_features = processor(resampled_inp[0], sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features
+
+
+
+        # NOTE: the language is hardcoded to French for now (see to-do 1)
+        forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="translate")
+
+
+        if task == "translate":
+            predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
+        else:
+            predicted_ids = model.generate(input_features)
+        # Decode token ids to text
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        st.write(transcription)
+        # print(waveform, sampling_rate)
        # Run transcription function and display
        # import pdb;pdb.set_trace()
        # st.write(audio_data.getvalue())
-
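
A minimal sketch of to-do 1 (predicting the language from the uploaded files), assuming the same openai/whisper-small checkpoint loaded above. detect_language is a hypothetical helper, not part of app.py, and it relies on Whisper emitting a language token first when no language is forced, which can vary across transformers versions:

import torch
import torchaudio as ta
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

def detect_language(waveform: torch.Tensor, sampling_rate: int) -> str:
    # Hypothetical helper. Resample to Whisper's expected 16 kHz, as app.py does.
    audio = ta.functional.resample(waveform, orig_freq=sampling_rate, new_freq=16000)
    input_features = processor(audio[0], sampling_rate=16000, return_tensors="pt").input_features
    # With no language forced, the first token generated after
    # <|startoftranscript|> is a language token such as <|fr|>.
    predicted_ids = model.generate(input_features, max_new_tokens=1)
    tokens = processor.tokenizer.convert_ids_to_tokens(predicted_ids[0].tolist())
    return tokens[-1]  # e.g. "<|fr|>"

The detected token could then replace the hardcoded language="french" above.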
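For to-do 2 (temporally fused text), chunk-level timestamps from transformers' automatic-speech-recognition pipeline are one option; return_timestamps=True is part of the pipeline API, while transcribe_with_timestamps and the commented loop below are illustrative, not part of app.py:

import streamlit as st
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")

def transcribe_with_timestamps(audio_bytes: bytes):
    # return_timestamps=True makes the pipeline return (start, end) pairs,
    # in seconds, for each decoded chunk of text (raw bytes input needs ffmpeg).
    return asr(audio_bytes, return_timestamps=True)["chunks"]

# Possible usage inside the existing upload loop:
# for chunk in transcribe_with_timestamps(uploaded_file.getvalue()):
#     start, end = chunk["timestamp"]
#     st.write(f"[{start:.1f}s - {end:.1f}s] {chunk['text']}")

Note that st.audio does not report playback position, so truly fusing the text with the player (highlighting the current chunk) would need a custom Streamlit component on top of these timestamps.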