amritsar committed on
Commit
461f105
·
verified ·
1 Parent(s): 00f6a88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  from transformers import Wav2Vec2Processor, Wav2Vec2BertForCTC
3
  import torch
4
  import librosa
 
5
 
6
  # Load the correct processor and model
7
  model_id = "kdcyberdude/w2v-bert-punjabi"
@@ -11,8 +12,15 @@ model = Wav2Vec2BertForCTC.from_pretrained(model_id)
11
  def transcribe_audio(audio_file):
12
  try:
13
  # Load and preprocess the audio
14
- audio, rate = librosa.load(audio_file, sr=16000)
15
- chunk_size = 30 * rate # 30-second chunks
 
 
 
 
 
 
 
16
  transcription = []
17
 
18
  for i in range(0, len(audio), chunk_size):
@@ -23,7 +31,7 @@ def transcribe_audio(audio_file):
23
  with torch.no_grad():
24
  logits = model(input_values).logits
25
 
26
- # Decode the predicted ids to text
27
  predicted_ids = torch.argmax(logits, dim=-1)
28
  transcription.append(processor.batch_decode(predicted_ids)[0])
29
 
 
2
  from transformers import Wav2Vec2Processor, Wav2Vec2BertForCTC
3
  import torch
4
  import librosa
5
+ import numpy as np
6
 
7
  # Load the correct processor and model
8
  model_id = "kdcyberdude/w2v-bert-punjabi"
 
12
  def transcribe_audio(audio_file):
13
  try:
14
  # Load and preprocess the audio
15
+ audio, rate = librosa.load(audio_file, sr=16000) # Resample to 16 kHz
16
+ if len(audio.shape) > 1: # If stereo, convert to mono
17
+ audio = np.mean(audio, axis=1)
18
+
19
+ # Normalize audio to match expected input range [-1, 1]
20
+ audio = librosa.util.normalize(audio)
21
+
22
+ # Split into manageable chunks (30 seconds each)
23
+ chunk_size = int(30 * rate) # 30 seconds in samples
24
  transcription = []
25
 
26
  for i in range(0, len(audio), chunk_size):
 
31
  with torch.no_grad():
32
  logits = model(input_values).logits
33
 
34
+ # Decode predicted IDs to text
35
  predicted_ids = torch.argmax(logits, dim=-1)
36
  transcription.append(processor.batch_decode(predicted_ids)[0])
37