Update audio_processing.py
audio_processing.py  +56 -2

audio_processing.py  CHANGED
@@ -98,6 +98,51 @@ class AudioProcessor:
 
         return translation
 
+    def preprocess_audio(self, audio):
+        """
+        Create overlapping chunks with improved timing logic
+        """
+        chunk_samples = int(self.chunk_size * self.sample_rate)
+        overlap_samples = int(self.overlap * self.sample_rate)
+
+        chunks_with_times = []
+        start_idx = 0
+
+        while start_idx < len(audio):
+            end_idx = min(start_idx + chunk_samples, len(audio))
+
+            # Add padding for first chunk
+            if start_idx == 0:
+                chunk = audio[start_idx:end_idx]
+                padding = torch.zeros(int(1 * self.sample_rate))
+                chunk = torch.cat([padding, chunk])
+            else:
+                # Include overlap from previous chunk
+                actual_start = max(0, start_idx - overlap_samples)
+                chunk = audio[actual_start:end_idx]
+
+            # Pad if necessary
+            if len(chunk) < chunk_samples:
+                chunk = torch.nn.functional.pad(chunk, (0, chunk_samples - len(chunk)))
+
+            # Adjust time ranges to account for overlaps
+            chunk_start_time = max(0, (start_idx / self.sample_rate) - self.overlap)
+            chunk_end_time = min((end_idx / self.sample_rate) + self.overlap, len(audio) / self.sample_rate)
+
+            chunks_with_times.append({
+                'chunk': chunk,
+                'start_time': start_idx / self.sample_rate,
+                'end_time': end_idx / self.sample_rate,
+                'transcribe_start': chunk_start_time,
+                'transcribe_end': chunk_end_time
+            })
+
+            # Move to next chunk with smaller step size for better continuity
+            start_idx += (chunk_samples - overlap_samples)
+
+        return chunks_with_times
+
+
     @spaces.GPU(duration=60)
     def process_audio(self, audio_path, translate=False):
         """Main processing function"""
@@ -106,10 +151,19 @@ class AudioProcessor:
         waveform, sample_rate = torchaudio.load(audio_path)
         if waveform.shape[0] > 1:
             waveform = torch.mean(waveform, dim=0)
-
+        else:
+            waveform = waveform.squeeze(0)
+
         # Resample if necessary
         if sample_rate != self.sample_rate:
-
+            resampler = torchaudio.transforms.Resample(
+                orig_freq=sample_rate,
+                new_freq=self.sample_rate
+            )
+            waveform = resampler(waveform)
+
+        # if sample_rate != self.sample_rate:
+        #     waveform = torchaudio.transforms.Resample(sample_rate, self.sample_rate)(waveform)
 
         # Load models
         models = self.load_models()
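For orientation, here is a minimal sketch of the stride arithmetic behind the new `preprocess_audio` loop. The `chunk_size`, `overlap`, and `sample_rate` values below are illustrative assumptions; the real ones are attributes set on `AudioProcessor` and are not part of this diff.

```python
import math

# Assumed illustrative values; the real chunk_size, overlap and sample_rate
# are attributes of AudioProcessor and are not shown in this diff.
sample_rate = 16000      # Hz
chunk_size = 30          # seconds per chunk
overlap = 5              # seconds shared between neighbouring chunks

chunk_samples = int(chunk_size * sample_rate)   # 480000 samples
overlap_samples = int(overlap * sample_rate)    # 80000 samples
step = chunk_samples - overlap_samples          # 400000 samples = 25 s advance

# For a 70-second recording the while-loop yields chunks starting at
# 0 s, 25 s and 50 s, each sharing `overlap` seconds with its neighbour.
total_samples = 70 * sample_rate
start_times = [i / sample_rate for i in range(0, total_samples, step)]
print(start_times)                       # [0.0, 25.0, 50.0]
print(math.ceil(total_samples / step))   # 3 chunks
```

Each chunk also records a `transcribe_start`/`transcribe_end` window widened by `overlap` seconds on both sides (clamped to the file bounds), presumably so that words cut at a chunk boundary can still be recovered when neighbouring results are merged. In the second hunk, `torchaudio.load` returns a `(channels, samples)` tensor, so multi-channel audio is averaged down to mono and single-channel audio is squeezed to 1-D before the explicit `Resample` transform is applied.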