File size: 1,665 Bytes
1eb4ae1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import librosa
import numpy as np
from typing import Dict, Tuple

class AudioProcessor:
    """Load an audio file and reduce it to summary MFCC, pitch and energy features.

    Attributes:
        sample_rate: Sampling rate (Hz) passed to ``librosa.load`` and to the
            feature extractors.
        n_mfcc: Number of MFCC coefficients requested from librosa.
        n_mels: Number of mel bands used for the spectrogram underlying the MFCCs.
    """

    def __init__(self, sample_rate: int = 16000, n_mfcc: int = 13,
                 n_mels: int = 128):
        """Store the extraction settings.

        The defaults reproduce the previously hard-coded configuration, so
        ``AudioProcessor()`` behaves as before.
        """
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.n_mels = n_mels

    def process_audio(self, audio_path: str) -> Tuple[np.ndarray, Dict]:
        """Load ``audio_path`` and extract its features.

        Args:
            audio_path: Path of the audio file handed to ``librosa.load``.

        Returns:
            A ``(waveform, features)`` tuple. ``waveform`` is the array returned
            by ``librosa.load`` at ``self.sample_rate``; ``features`` is a dict
            with the keys ``'mfcc'``, ``'pitch'`` and ``'energy'``.
        """
        # Load and preprocess audio
        waveform, _ = librosa.load(audio_path, sr=self.sample_rate)

        # Extract features
        features = {
            'mfcc': self._extract_mfcc(waveform),
            'pitch': self._extract_pitch(waveform),
            'energy': self._extract_energy(waveform)
        }

        return waveform, features

    @staticmethod
    def _summarize(values: np.ndarray, skip_nan: bool = False) -> Dict[str, float]:
        """Return mean/std/max/min of ``values`` as plain Python floats.

        Args:
            values: Array-like of numbers.
            skip_nan: When true, NaN entries are dropped before the statistics
                are computed.

        Returns:
            Dict with the keys ``'mean'``, ``'std'``, ``'max'`` and ``'min'``.
            Every value is NaN when no sample is left to summarise.
        """
        data = np.asarray(values)
        if skip_nan:
            data = data[~np.isnan(data)]

        if data.size == 0:
            # Nothing to reduce: report NaN explicitly rather than letting
            # NumPy emit "empty slice" RuntimeWarnings (or raise on max/min).
            nan = float('nan')
            return {'mean': nan, 'std': nan, 'max': nan, 'min': nan}

        return {
            'mean': float(np.mean(data)),
            'std': float(np.std(data)),
            'max': float(np.max(data)),
            'min': float(np.min(data))
        }

    def _extract_mfcc(self, waveform: np.ndarray) -> np.ndarray:
        """Return the MFCCs of ``waveform`` averaged along axis 1.

        NOTE(review): axis 1 is presumably the frame (time) axis for a mono
        waveform, giving one value per coefficient - confirm if multi-channel
        audio is ever passed in.
        """
        mfccs = librosa.feature.mfcc(
            y=waveform,
            sr=self.sample_rate,
            n_mfcc=self.n_mfcc,
            # Forwarded to the mel spectrogram; previously the attribute was
            # stored but never used (128 is also librosa's default).
            n_mels=self.n_mels
        )
        return mfccs.mean(axis=1)

    def _extract_pitch(self, waveform: np.ndarray) -> Dict:
        """Summarise the pYIN fundamental-frequency track of ``waveform``.

        The search range is C2 to C7. NaN entries in the track (which pYIN uses
        for frames it considers unvoiced) are ignored.

        Returns:
            Dict with ``'mean'``, ``'std'``, ``'max'`` and ``'min'`` of the
            non-NaN f0 values; all NaN when the track has no such value.
        """
        f0, _voiced_flag, _voiced_probs = librosa.pyin(
            waveform,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sample_rate
        )

        return self._summarize(f0, skip_nan=True)

    def _extract_energy(self, waveform: np.ndarray) -> Dict:
        """Summarise the frame-wise RMS energy of ``waveform``.

        Returns:
            Dict with ``'mean'``, ``'std'``, ``'max'`` and ``'min'`` of the
            first row of ``librosa.feature.rms``.
        """
        rms = librosa.feature.rms(y=waveform)[0]

        return self._summarize(rms)