bofenghuang committed on
Commit
e927cf5
•
1 Parent(s): a356f8e

fix waveform type

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -2
  2. run_demo.py +12 -7
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
  transformers
2
- torch
3
- torchaudio
4
  pyctcdecode
5
  pypi-kenlm
 
1
  transformers
2
+ librosa
 
3
  pyctcdecode
4
  pypi-kenlm
run_demo.py CHANGED
@@ -2,7 +2,8 @@ import logging
2
  import warnings
3
 
4
  import gradio as gr
5
- import torchaudio
 
6
  from transformers import pipeline
7
  from transformers.utils.logging import disable_progress_bar
8
 
@@ -24,13 +25,17 @@ logger.info("ASR pipeline has been initialized")
24
 
25
 
26
  def process_audio_file(audio_file):
27
- waveform, sample_rate = torchaudio.load(audio_file)
28
- waveform = waveform.squeeze(axis=0) # mono
29
-
 
 
 
 
 
30
  # resample
31
  if sample_rate != SAMPLE_RATE:
32
- resampler = torchaudio.transforms.Resample(sample_rate, SAMPLE_RATE)
33
- waveform = resampler(waveform)
34
 
35
  return waveform
36
 
@@ -52,7 +57,7 @@ def transcribe(microphone_audio_file, uploaded_audio_file):
52
 
53
  audio_data = process_audio_file(audio_file)
54
 
55
- # text = pipe(audio, chunk_length_s=30, stride_length_s=5)["text"]
56
  text = pipe(audio_data)["text"]
57
  logger.info(f"Transcription for {audio_file}: {text}")
58
 
 
2
  import warnings
3
 
4
  import gradio as gr
5
+ import librosa
6
+ # import torchaudio
7
  from transformers import pipeline
8
  from transformers.utils.logging import disable_progress_bar
9
 
 
25
 
26
 
27
  def process_audio_file(audio_file):
28
+ # waveform, sample_rate = torchaudio.load(audio_file)
29
+ # waveform = waveform.squeeze(axis=0) # mono
30
+ # # resample
31
+ # if sample_rate != SAMPLE_RATE:
32
+ # resampler = torchaudio.transforms.Resample(sample_rate, SAMPLE_RATE)
33
+ # waveform = resampler(waveform)
34
+
35
+ waveform, sample_rate = librosa.load(audio_file, mono=True)
36
  # resample
37
  if sample_rate != SAMPLE_RATE:
38
+ waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
 
39
 
40
  return waveform
41
 
 
57
 
58
  audio_data = process_audio_file(audio_file)
59
 
60
+ # text = pipe(audio_data, chunk_length_s=30, stride_length_s=5)["text"]
61
  text = pipe(audio_data)["text"]
62
  logger.info(f"Transcription for {audio_file}: {text}")
63