Spaces:

piecurus
/

speech_to_text

Runtime error

App Files Files Community

piecurus committed on Feb 2, 2022

Commit

cab3a0b

•

1 Parent(s): da0005f

Added functionality for long text; removed comments

Browse files

Files changed (1) hide show

app.py +6 -74

app.py CHANGED Viewed

@@ -4,13 +4,6 @@
 # In[ ]:
-# conver mp3 to wav
-# ffmpeg -i test_5.mp3 -b:a 16000 test_5.wav
-# In[1]:
 #Importing all the necessary packages
 import nltk
 import librosa
@@ -21,7 +14,7 @@ from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
 nltk.download("punkt")
-# In[2]:
 #Loading the model and the tokenizer
@@ -32,7 +25,7 @@ tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)#omdel_name
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
-# In[3]:
 def load_data(input_file):
@@ -50,7 +43,7 @@ def load_data(input_file):
     return speech
-# In[4]:
 def correct_casing(input_sentence):
@@ -60,7 +53,7 @@ def correct_casing(input_sentence):
     return (' '.join([s.replace(s[0],s[0].capitalize(),1) for s in sentences]))
-# In[5]:
 def asr_transcript(input_file):
@@ -80,7 +73,7 @@ def asr_transcript(input_file):
     return transcription
-# In[6]:
 def asr_transcript_long(input_file,tokenizer=tokenizer, model=model ):
@@ -112,32 +105,7 @@ def asr_transcript_long(input_file,tokenizer=tokenizer, model=model ):
     return transcript
-from pydub import AudioSegment
-from pydub.silence import split_on_silence
-from pydub.playback import play
-sound = AudioSegment.from_file("./test_2.wav", format="wav")
-chunks = split_on_silence(
-    sound,
-    # split on silences longer than 1000ms (1 sec)
-    min_silence_len=5000,
-    # anything under -16 dBFS is considered silence
-    silence_thresh=-32,
-    # keep 200 ms of leading/trailing silence
-    keep_silence=500
-)
-#read the file
-speech, sample_rate = librosa.load('./test_2.wav')
-#make it 1-D
-if len(speech.shape) > 1:
-    speech = speech[:,0] + speech[:,1]
-#Resampling at 16KHz since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 KHz.
-if sample_rate !=16000:
-    speech = librosa.resample(speech, sample_rate,16000)
-part_of_speech = librosa.effects.split(speech)
-idx = -1
-IPython.display.Audio(data=speech[part_of_speech[idx,0]:part_of_speech[idx,1]], rate=16000)
 # In[ ]:
@@ -149,39 +117,3 @@ gr.Interface(asr_transcript_long,
              description = "This application displays transcribed text for given audio input",
              examples = [["Test_File1.wav"], ["Test_File2.wav"], ["Test_File3.wav"]], theme="grass").launch()
-# In[ ]:
-# In[ ]:
-# In[ ]:
-# In[7]:
-#temp = asr_transcript_long('./test_2.wav')
-# In[ ]:
-# In[ ]:

 # In[ ]:
 #Importing all the necessary packages
 import nltk
 import librosa
 nltk.download("punkt")
+# In[ ]:
 #Loading the model and the tokenizer
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
+# In[ ]:
 def load_data(input_file):
     return speech
+# In[ ]:
 def correct_casing(input_sentence):
     return (' '.join([s.replace(s[0],s[0].capitalize(),1) for s in sentences]))
+# In[ ]:
 def asr_transcript(input_file):
     return transcription
+# In[ ]:
 def asr_transcript_long(input_file,tokenizer=tokenizer, model=model ):
     return transcript
 # In[ ]:
              description = "This application displays transcribed text for given audio input",
              examples = [["Test_File1.wav"], ["Test_File2.wav"], ["Test_File3.wav"]], theme="grass").launch()