Ajay Karthick Senthil Kumar committed · Commit 381c43b · 1 Parent(s): d643e7f

add src
Browse files

- src/__init__.py +0 -0
- src/features/__init__.py +12 -0
- src/features/extraction/README.md +72 -0
- src/features/extraction/__init__.py +2 -0
- src/features/extraction/features_list.py +69 -0
- src/features/extraction/high_level_features_extractor.py +159 -0
- src/features/extraction/low_level_features_extractor.py +112 -0
- src/features/extraction/prosodic_features.py +127 -0
- src/features/extraction/spectral_features.py +117 -0
- src/features/extraction/stat_measures.py +49 -0
- src/features/extraction/voice_quality_features.py +272 -0
- src/models/__init__.py +0 -0
- src/models/predict.py +36 -0
src/__init__.py
ADDED
File without changes
src/features/__init__.py
ADDED
@@ -0,0 +1,12 @@
from .extraction import (
    LowLevelFeatureExtractor,
    HighLevelFeatureExtractor
)
from .visualization import (
    plot_low_level_feature_dist,
    plot_high_level_feature_dist,
    perform_pca_and_plot,
    perform_kernel_pca_and_plot,
    perform_pca_and_plot_3d
)
src/features/extraction/README.md
ADDED
@@ -0,0 +1,72 @@
# Features Extraction Module

This module is an essential part of a system designed to identify deepfake audio recordings. It contains all the necessary tools for extracting a wide range of audio features that help differentiate real audio from synthetic audio.

## Purpose

The main purpose of this module is to provide a reliable framework for extracting various audio features from raw sound data. These features cover spectral, prosodic, and voice quality aspects, each offering valuable insight into the characteristics of an audio signal. By examining these features, the system can identify patterns and irregularities that are commonly found in deepfake audio, improving the effectiveness of detection methods.

## Modular Design

The module's design is highly modular, making it easy to integrate and scale. Researchers and developers can expand existing feature categories or add new ones as audio-analysis techniques evolve. This flexibility lets the system adapt without major changes, supporting ongoing improvements in deepfake detection technology.

## Module Overview

### Directory Structure

- `__init__.py`: Initializes the `features.extraction` package, allowing classes and functions to be imported.
- `features_list.py`: Manages the lists of feature names used throughout the extraction modules.
- `high_level_features_extractor.py`: Computes statistical summary features from the detailed low-level features.
- `low_level_features_extractor.py`: Extracts detailed low-level features from raw audio data.
- `prosodic_features.py`: Focuses on features related to the rhythm and intonation of speech.
- `spectral_features.py`: Extracts features that describe the audio spectrum.
- `voice_quality_features.py`: Collects features that characterize the quality and traits of the voice.
- `stat_measures.py`: Provides statistical tools to compute measures such as mean, variance, and skewness over the extracted low-level features.

## Workflow

### 1. Input

The workflow starts with raw audio data, which is preprocessed to get it ready for feature extraction.

### 2. Detailed Low-Level Feature Extraction

This phase extracts various frame-level audio features:

- **Spectral Features**: Such as MFCCs, spectral flux, and spectral centroids.
- **Temporal Features**: Like zero-crossing rate and RMS energy.
- **Prosodic Features**: Including measures like speaking rate and pitch (F0).
- **Voice Quality Features**: Evaluating quality through metrics like jitter and shimmer.

### 3. Transformation and Aggregation

After extraction, the feature arrays are summarized statistically and segmented (sometimes using rolling windows). This concentrates the most informative parts of each feature and reduces the amount of data.

### 4. High-Level Feature Extraction

The summarized data is then used to compute higher-order statistical features such as mean, standard deviation, skewness, and kurtosis, producing a fixed-length summary suitable for machine learning models.

### 5. Output

The end result is a structured set of high-level features for each audio sample, ready for further analysis or direct use in machine learning algorithms.

## Usage

```python
from features.extraction.low_level_features_extractor import LowLevelFeatureExtractor
from features.extraction.high_level_features_extractor import HighLevelFeatureExtractor

# Initialize extractors
low_level_extractor = LowLevelFeatureExtractor()
high_level_extractor = HighLevelFeatureExtractor()

# Process one audio example; extract_features also reads 'audio_id' and 'real_or_fake'
audio_data = {
    'audio_id': 'example',
    'audio_arr': your_audio_array,
    'srate': your_sampling_rate,
    'real_or_fake': 'real'
}
low_level_features = low_level_extractor.extract_features(audio_data)
high_level_features = high_level_extractor.compute_high_level_features(low_level_features)

print(high_level_features)
```

## Future Integration

The module is designed for easy integration with data preprocessing pipelines and machine learning frameworks. It supports simple updates, such as adding new feature categories or improving existing ones, keeping the system current with the state of audio analysis. This flexibility is particularly important for building a reliable audio deepfake detection system.
src/features/extraction/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .low_level_features_extractor import LowLevelFeatureExtractor
from .high_level_features_extractor import HighLevelFeatureExtractor
src/features/extraction/features_list.py
ADDED
@@ -0,0 +1,69 @@
# Spectral features
DEFAULT_SPECTRAL_FEATURES = [
    'spectral_centroid',
    'spectral_bandwidth',
    'spectral_contrast',
    'spectral_flatness',
    'spectral_rolloff',
    'zero_crossing_rate',
    'mfccs',
    'chroma_stft',
    'spectral_flux'
]
ALL_SPECTRAL_FEATURES = [
    'spectral_centroid',
    'spectral_bandwidth',
    'spectral_contrast',
    'spectral_flatness',
    'spectral_rolloff',
    'zero_crossing_rate',
    'mfccs',
    'chroma_stft',
    'spectral_flux'
]

# Prosodic features
DEFAULT_PROSODIC_FEATURES = ['f0', 'energy', 'speaking_rate', 'pauses', 'formants']
ALL_PROSODIC_FEATURES = ['f0', 'energy', 'speaking_rate', 'pauses', 'formants']

# Voice quality features
DEFAULT_VOICE_QUALITY_FEATURES = [
    'jitter',
    'shimmer',
    'hnr',
    'speech_rate'
]
ALL_VOICE_QUALITY_FEATURES = [
    'jitter_local',
    'jitter_rap',
    'jitter_ppq5',
    'shimmer_local',
    'shimmer_apq3',
    'shimmer_apq5',
    'shimmer_dda',
    'hnr',
    'voicedcount',
    'npause',
    'intensity_duration',
    'speakingrate',
    'articulationrate',
    'asd',
    'totalpauseduration'
]


# Default features to extract
DEFAULT_FEATURES = {
    'spectral': DEFAULT_SPECTRAL_FEATURES,
    'prosodic': DEFAULT_PROSODIC_FEATURES,
    'voice_quality': DEFAULT_VOICE_QUALITY_FEATURES
}

# All features to extract
ALL_FEATURES = {
    'spectral': ALL_SPECTRAL_FEATURES,
    'prosodic': ALL_PROSODIC_FEATURES,
    'voice_quality': ALL_VOICE_QUALITY_FEATURES
}
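These lists are what the extractors consult when no explicit selection is passed in. A minimal configuration sketch, assuming the package is importable as `features.extraction` (the path used in the README):

```python
from features.extraction import LowLevelFeatureExtractor
from features.extraction.features_list import DEFAULT_FEATURES, ALL_FEATURES

# include_only=['spectral'] turns the prosodic and voice-quality groups off;
# the explicit list then narrows the spectral group further.
extractor = LowLevelFeatureExtractor(
    include_only=['spectral'],
    spectral_features=['spectral_centroid', 'mfccs'],
)

print(DEFAULT_FEATURES['prosodic'])        # ['f0', 'energy', 'speaking_rate', 'pauses', 'formants']
print(len(ALL_FEATURES['voice_quality']))  # 15 individual voice-quality measures
```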
src/features/extraction/high_level_features_extractor.py
ADDED
@@ -0,0 +1,159 @@
import numpy as np
from .stat_measures import StatisticalMeasures
from .features_list import ALL_FEATURES


class HighLevelFeatureExtractor:
    """
    A class to extract high-level statistical measures from low-level audio features, including
    spectral, prosodic, and voice quality characteristics.

    Attributes:
        measures (list of str): List of statistical measures to compute for each feature.

    Methods:
        compute_high_level_features(feature_dict): Computes high-level features for a given set of audio features.
        high_level_feature_generator(low_level_gen): Generates high-level features from a generator of low-level features.
    """
    def __init__(self, measures=None):
        """
        Initializes the HighLevelFeatureExtractor with a list of statistical measures.

        Args:
            measures (list of str, optional): A list of statistical measures to apply. Default measures include
                mean, standard deviation, variance, min, max, range, percentiles, skewness, and kurtosis.
        """
        self.measures = measures if measures is not None else ['mean', 'std', 'var', 'min', 'max', 'range', '25th_percentile', '50th_percentile', '75th_percentile', 'skew', 'kurtosis']

    def compute_high_level_features(self, feature_dict):
        """
        Computes high-level features for a dictionary of extracted low-level features.

        Args:
            feature_dict (dict): Dictionary containing low-level audio feature arrays.

        Returns:
            dict: A dictionary containing high-level statistical features.
        """
        features = {
            'audio_id': feature_dict['audio_id'],
            'real_or_fake': feature_dict['real_or_fake']
        }

        # Compute high-level spectral, prosodic, and voice quality features
        features.update(self._compute_spectral_features(feature_dict, ALL_FEATURES['spectral']))
        features.update(self._compute_prosodic_features(feature_dict, ALL_FEATURES['prosodic']))
        features.update(self._compute_voice_quality_features(feature_dict, ALL_FEATURES['voice_quality']))

        return features

    def high_level_feature_generator(self, low_level_gen):
        """
        Generator to process each set of low-level features and compute high-level features.

        Args:
            low_level_gen (generator): Generator yielding dictionaries of low-level features.

        Yields:
            dict: High-level features computed from each low-level feature set.
        """
        for low_level_features in low_level_gen:
            yield self.compute_high_level_features(feature_dict=low_level_features)

    ##################################################################
    ### Additional private methods to compute each type of feature ###
    ##################################################################
    def _compute_spectral_features(self, feature_dict, spectral_features):
        """
        Computes high-level statistical features for spectral features.
        """
        spectral_features_dict = {}
        for feature_name in spectral_features:
            if feature_name in ['mfccs', 'chroma_stft']:
                continue  # handled separately below, coefficient by coefficient
            feature_array = feature_dict.get(feature_name)
            if feature_array is not None:
                stats = StatisticalMeasures.compute_statistical_measures(feature_array, self.measures)
                spectral_features_dict.update({f"{feature_name}_{key}": value for key, value in stats.items()})

        # Compute MFCC features
        if 'mfccs' in ALL_FEATURES['spectral'] and 'mfccs' in feature_dict:
            spectral_features_dict.update(self._compute_mfcc_features(feature_dict['mfccs']))

        # Compute chroma features
        if 'chroma_stft' in ALL_FEATURES['spectral'] and 'chroma_stft' in feature_dict:
            spectral_features_dict.update(self._compute_chroma_features(feature_dict['chroma_stft']))

        return spectral_features_dict

    def _compute_mfcc_features(self, mfccs_flat):
        """
        MFCC features are computed from a flattened array of MFCC coefficients.
        """
        # The low-level extractor flattens a (13, n_frames) array in C order,
        # so reshape back to (13, -1) and summarize each coefficient's time series.
        mfccs = mfccs_flat.reshape(13, -1)
        mfcc_features_dict = {}
        for i in range(mfccs.shape[0]):
            feature_array = mfccs[i, :]
            stats = StatisticalMeasures.compute_statistical_measures(feature_array, self.measures)
            mfcc_features_dict.update({f"mfcc_{i+1}_{key}": value for key, value in stats.items()})
        return mfcc_features_dict

    def _compute_chroma_features(self, chroma_flat):
        """
        Chroma features are computed from a flattened array of chroma coefficients.
        """
        # As with the MFCCs, restore the original (12, n_frames) shape before summarizing.
        chroma = chroma_flat.reshape(12, -1)
        chroma_features_dict = {}
        for i in range(chroma.shape[0]):
            feature_array = chroma[i, :]
            stats = StatisticalMeasures.compute_statistical_measures(feature_array, self.measures)
            chroma_features_dict.update({f"chroma_{i+1}_{key}": value for key, value in stats.items()})
        return chroma_features_dict

    def _compute_prosodic_features(self, feature_dict, prosodic_features):
        """
        Computes high-level statistical features for prosodic features.
        """
        prosodic_features_dict = {}
        for feature_name in prosodic_features:
            if feature_name in ['speaking_rate', 'pauses', 'formants']:
                continue  # non-array features, handled below
            feature_array = feature_dict.get(feature_name)
            if isinstance(feature_array, dict):
                raise TypeError(f"Expected array for {feature_name}, but got {type(feature_array).__name__}")
            if feature_array is not None:
                stats = StatisticalMeasures.compute_statistical_measures(feature_array, self.measures)
                prosodic_features_dict.update({f"{feature_name}_{key}": value for key, value in stats.items()})

        if 'speaking_rate' in prosodic_features and 'speaking_rate' in feature_dict:
            prosodic_features_dict['speaking_rate'] = feature_dict['speaking_rate']

        if 'pauses' in prosodic_features and 'pauses' in feature_dict:
            pauses = feature_dict['pauses']
            if pauses:
                pause_durations = np.array([end - start for start, end in pauses])
                pause_stats = StatisticalMeasures.compute_statistical_measures(pause_durations, self.measures)
                prosodic_features_dict.update({f"pause_{key}": value for key, value in pause_stats.items()})
            else:
                for measure in self.measures:
                    prosodic_features_dict[f'pause_{measure}'] = np.nan

        if 'formants' in prosodic_features and 'formants' in feature_dict:
            formant_values = feature_dict['formants']
            if formant_values:
                for key, value in formant_values.items():
                    prosodic_features_dict[key] = value

        return prosodic_features_dict

    def _compute_voice_quality_features(self, feature_dict, voice_quality_features):
        """
        Computes high-level statistical features for voice quality features.
        """
        voice_quality_features_dict = {}
        for feature_name in voice_quality_features:
            feature_value = feature_dict.get(feature_name)
            if feature_value is not None:
                voice_quality_features_dict[feature_name] = feature_value
        return voice_quality_features_dict
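To see what `compute_high_level_features` expects and produces, here is a small sketch with synthetic low-level features. The key names follow the code above; the `mfccs`/`chroma_stft` entries are flattened `(13, T)` and `(12, T)` arrays as produced by the low-level extractor, and the scalar values are stand-ins:

```python
import numpy as np
from features.extraction import HighLevelFeatureExtractor

T = 100  # number of analysis frames
feature_dict = {
    'audio_id': 'demo-001',
    'real_or_fake': 'real',
    'spectral_centroid': np.random.rand(T),
    'mfccs': np.random.rand(13 * T),        # flattened (13, T) array
    'chroma_stft': np.random.rand(12 * T),  # flattened (12, T) array
    'f0': np.random.rand(T) * 200,
    'energy': np.random.rand(T),
    'speaking_rate': 4.2,                   # scalar, copied through as-is
    'pauses': [(0.5, 0.9), (1.4, 1.7)],     # (start, end) pairs -> pause_* statistics
    'formants': {'F1_mean': 500.0, 'F1_stdev': 40.0},
    'jitter_local': 0.01,
    'hnr': 15.0,
}

hlf = HighLevelFeatureExtractor()
summary = hlf.compute_high_level_features(feature_dict)
print(len(summary))  # one flat dict of named statistics per audio example
print(summary['spectral_centroid_mean'], summary['pause_mean'])
```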
src/features/extraction/low_level_features_extractor.py
ADDED
@@ -0,0 +1,112 @@
import librosa
from tqdm import tqdm

from .spectral_features import SpectralFeatureExtractor
from .prosodic_features import ProsodicFeatureExtractor
from .voice_quality_features import VoiceQualityFeatureExtractor
from .features_list import DEFAULT_FEATURES


class LowLevelFeatureExtractor:
    """
    A class to orchestrate the extraction of low-level audio features across the spectral, prosodic,
    and voice quality domains.

    Attributes:
        target_sr (int): The target sampling rate for audio resampling.
        include_spectral (bool): Flag to include spectral feature extraction.
        include_prosodic (bool): Flag to include prosodic feature extraction.
        include_voice_quality (bool): Flag to include voice quality feature extraction.
        spectral_features (list): List of spectral features to extract.
        prosodic_features (list): List of prosodic features to extract.
        voice_quality_features (list): List of voice quality features to extract.

    Methods:
        resample_audio(audio_arr, orig_sr): Resamples the audio to the target sampling rate.
        extract_features(row): Extracts all configured features for a single audio example.
        low_level_feature_generator(df): Generator that processes a DataFrame of audio examples.
    """
    def __init__(self, target_sr=16000, include_only=None, spectral_features=None, prosodic_features=None, voice_quality_features=None):
        """
        Initializes the LowLevelFeatureExtractor with optional feature lists and inclusion flags.

        Args:
            target_sr (int): Desired sampling rate for the analysis.
            include_only (list of str, optional): Which feature groups to include; defaults to all.
            spectral_features (list of str): Specific spectral features to extract.
            prosodic_features (list of str): Specific prosodic features to extract.
            voice_quality_features (list of str): Specific voice quality features to extract.
        """
        self.target_sr = target_sr
        self.include_spectral = True
        self.include_prosodic = True
        self.include_voice_quality = True
        self.spectral_features = spectral_features if spectral_features is not None else DEFAULT_FEATURES['spectral']
        self.prosodic_features = prosodic_features if prosodic_features is not None else DEFAULT_FEATURES['prosodic']
        self.voice_quality_features = voice_quality_features if voice_quality_features is not None else DEFAULT_FEATURES['voice_quality']

        if include_only is not None:
            self.include_spectral = 'spectral' in include_only
            self.include_prosodic = 'prosodic' in include_only
            self.include_voice_quality = 'voice_quality' in include_only

        if not self.include_spectral:
            self.spectral_features = []

        if not self.include_prosodic:
            self.prosodic_features = []

        if not self.include_voice_quality:
            self.voice_quality_features = []

    def resample_audio(self, audio_arr, orig_sr):
        """
        Resamples the given audio array from its original sampling rate to the target rate.
        """
        return librosa.resample(audio_arr, orig_sr=orig_sr, target_sr=self.target_sr)

    def extract_features(self, row):
        """
        Extracts features from a single row of audio data, which includes the audio id, array, and other metadata.
        """
        audio_id = row['audio_id']
        audio_arr = row['audio_arr']
        orig_sr = row['srate']
        real_or_fake = row['real_or_fake']

        y = self.resample_audio(audio_arr, orig_sr)

        features = {}

        if self.include_spectral:
            spectral_extractor = SpectralFeatureExtractor(y, self.target_sr)
            features.update(spectral_extractor.extract(self.spectral_features))

        if self.include_prosodic:
            prosodic_extractor = ProsodicFeatureExtractor(y, self.target_sr, audio_arr, orig_sr)
            features.update(prosodic_extractor.extract(self.prosodic_features))

        if self.include_voice_quality:
            voice_quality_extractor = VoiceQualityFeatureExtractor(audio_arr, orig_sr)
            features.update(voice_quality_extractor.extract(self.voice_quality_features))

        features = {**{'audio_id': audio_id, 'real_or_fake': real_or_fake}, **features}

        return features

    def low_level_feature_generator(self, df):
        """
        A generator that processes a DataFrame of audio examples to extract features.

        Args:
            df (pandas.DataFrame): DataFrame containing columns with audio data and metadata.

        Yields:
            dict: A dictionary of extracted features for each audio file.
        """
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing Audios"):
            yield self.extract_features(row)
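A sketch of the intended end-to-end flow, chaining the two generators over a DataFrame; the column names match what `extract_features` reads. Random noise stands in for real recordings here, so the Praat-based measures may come back as NaN:

```python
import numpy as np
import pandas as pd

from features.extraction import LowLevelFeatureExtractor, HighLevelFeatureExtractor

# Toy DataFrame with the required columns; in practice audio_arr would hold
# real recordings loaded with e.g. librosa.load.
df = pd.DataFrame({
    'audio_id': ['a1', 'a2'],
    'audio_arr': [np.random.randn(16000).astype(np.float32) for _ in range(2)],
    'srate': [16000, 16000],
    'real_or_fake': ['real', 'fake'],
})

low = LowLevelFeatureExtractor(target_sr=16000)
high = HighLevelFeatureExtractor()

# Low-level frame features in, one flat dict of statistics out per example.
rows = list(high.high_level_feature_generator(low.low_level_feature_generator(df)))
features_df = pd.DataFrame(rows)
print(features_df.shape)
```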
src/features/extraction/prosodic_features.py
ADDED
@@ -0,0 +1,127 @@
import librosa
import parselmouth
from parselmouth.praat import call

import numpy as np


class ProsodicFeatureExtractor:
    """
    A class for extracting various prosodic features from audio data.

    Attributes:
        y (numpy.array): Audio time series.
        sr (int): Sampling rate of the audio time series.
        audio_arr (numpy.array): Original audio array for parselmouth processing.
        orig_sr (int): Original sampling rate of the audio array.

    Methods:
        extract(features_to_extract=None): Extracts specified prosodic features from audio.
        extract_f0(): Extracts the fundamental frequency (F0) from audio.
        extract_energy(): Extracts energy from audio.
        extract_speaking_rate(): Estimates the speaking rate from audio.
        extract_pauses(): Detects pauses in audio.
        extract_formants(): Extracts formant frequencies from audio.
    """
    def __init__(self, y, sr, audio_arr, orig_sr):
        """
        Initializes the ProsodicFeatureExtractor with audio data.
        """
        self.y = y
        self.sr = sr
        self.audio_arr = audio_arr
        self.orig_sr = orig_sr

    def extract(self, features_to_extract=None):
        """
        Extracts the specified prosodic features.

        Args:
            features_to_extract (list, optional): List of feature names to extract.
                Defaults to all available features if None.

        Returns:
            dict: A dictionary containing the extracted features.
        """
        feature_funcs = {
            'f0': self.extract_f0,
            'energy': self.extract_energy,
            'speaking_rate': self.extract_speaking_rate,
            'pauses': self.extract_pauses,
            'formants': self.extract_formants
        }

        if features_to_extract is None:
            features_to_extract = feature_funcs.keys()

        features = {}
        for feature in features_to_extract:
            if feature in feature_funcs:
                result = feature_funcs[feature]()
                if isinstance(result, tuple):
                    features.update(result)
                else:
                    features[feature] = result

        return features

    def extract_f0(self):
        """
        Extracts the fundamental frequency (F0) using the pYIN algorithm.
        """
        f0, voiced_flag, voiced_probs = librosa.pyin(self.y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
        f0 = np.nan_to_num(f0)  # unvoiced frames become 0
        return f0

    def extract_energy(self):
        """
        Extracts the root-mean-square (RMS) energy from the audio.
        """
        return librosa.feature.rms(y=self.y)[0]

    def extract_speaking_rate(self):
        """
        Estimates the speaking rate as the number of syllable-like intensity frames per second.
        """
        try:
            snd = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
            total_duration = snd.get_total_duration()
            intensity = snd.to_intensity()
            intensity_values = intensity.values.T
            threshold = 0.3 * max(intensity_values)
            syllable_count = len([1 for i in intensity_values if i > threshold])
            speaking_rate = syllable_count / total_duration
            return speaking_rate
        except Exception as e:
            print(f'Error extracting speaking rate: {e}')
            return None

    def extract_pauses(self):
        """
        Identifies and timestamps pauses (silent intervals) in the audio.
        """
        try:
            snd = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
            silences = call(snd, "To TextGrid (silences)", 100, 0, -25, 0.1, 0.1, "silent", "sounding")
            pauses = [
                (call(silences, "Get start time of interval", 1, i), call(silences, "Get end time of interval", 1, i))
                for i in range(1, call(silences, "Get number of intervals", 1) + 1)
                if call(silences, "Get label of interval", 1, i) == "silent"
            ]
            return pauses
        except Exception as e:
            print(f'Error extracting pauses: {e}')
            return None

    def extract_formants(self):
        """
        Extracts the first three formant frequencies using the Burg method.
        """
        try:
            snd = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
            formant = call(snd, "To Formant (burg)", 0.025, 5, 5500, 0.025, 50)
            formant_values = {}
            for i in range(1, 4):  # the first three formants
                formant_values[f'F{i}_mean'] = call(formant, "Get mean", i, 0, 0, "Hertz")
                formant_values[f'F{i}_stdev'] = call(formant, "Get standard deviation", i, 0, 0, "Hertz")
            return formant_values
        except Exception as e:
            print(f'Error extracting formants: {e}')
            return {}
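A quick sanity check of the librosa-based methods on a synthetic 220 Hz tone (the Praat-based methods need speech-like input to produce meaningful values):

```python
import numpy as np
from features.extraction.prosodic_features import ProsodicFeatureExtractor

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 220 * t)  # 1 s of a 220 Hz tone

extractor = ProsodicFeatureExtractor(tone, sr, tone, sr)
feats = extractor.extract(['f0', 'energy'])

f0 = feats['f0']
print(np.median(f0[f0 > 0]))   # close to 220 Hz on voiced frames
print(feats['energy'].mean())  # mean RMS energy per frame
```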
src/features/extraction/spectral_features.py
ADDED
@@ -0,0 +1,117 @@
import librosa
import numpy as np


class SpectralFeatureExtractor:
    """
    A class to extract various spectral features from audio data using the librosa library.

    Attributes:
        y (numpy.array): Audio time series.
        sr (int): Sampling rate of the audio time series.

    Methods:
        extract(features_to_extract=None): Extracts specified spectral features from audio.
        spectral_centroid(): Computes the spectral centroid of the audio.
        spectral_bandwidth(): Computes the spectral bandwidth of the audio.
        spectral_contrast(): Computes the spectral contrast of the audio.
        spectral_flatness(): Computes the spectral flatness of the audio.
        spectral_rolloff(): Computes the spectral rolloff of the audio.
        zero_crossing_rate(): Computes the zero-crossing rate of the audio.
        mfccs(): Computes the Mel-frequency cepstral coefficients (MFCCs) of the audio.
        chroma_stft(): Computes the chromagram from a waveform or power spectrogram.
        spectral_flux(): Computes the spectral flux of the audio.
    """
    def __init__(self, y, sr):
        """
        Initializes the SpectralFeatureExtractor with audio data.
        """
        self.y = y
        self.sr = sr

    def extract(self, features_to_extract=None):
        """
        Extracts the specified spectral features.

        Args:
            features_to_extract (list of str, optional): A list of feature names to extract.
                Defaults to extracting all available features if None.

        Returns:
            dict: A dictionary containing the extracted features.
        """
        feature_funcs = {
            'spectral_centroid': self.spectral_centroid,
            'spectral_bandwidth': self.spectral_bandwidth,
            'spectral_contrast': self.spectral_contrast,
            'spectral_flatness': self.spectral_flatness,
            'spectral_rolloff': self.spectral_rolloff,
            'zero_crossing_rate': self.zero_crossing_rate,
            'mfccs': self.mfccs,
            'chroma_stft': self.chroma_stft,
            'spectral_flux': self.spectral_flux
        }

        if features_to_extract is None:
            features_to_extract = feature_funcs.keys()

        features = {}
        for feature in features_to_extract:
            if feature in feature_funcs:
                features[feature] = feature_funcs[feature]()
        return features

    def spectral_centroid(self):
        """
        Computes the spectral centroid of the audio.
        """
        return librosa.feature.spectral_centroid(y=self.y, sr=self.sr).flatten()

    def spectral_bandwidth(self):
        """
        Computes the spectral bandwidth of the audio.
        """
        return librosa.feature.spectral_bandwidth(y=self.y, sr=self.sr).flatten()

    def spectral_contrast(self):
        """
        Computes the spectral contrast of the audio.
        """
        return librosa.feature.spectral_contrast(y=self.y, sr=self.sr).flatten()

    def spectral_flatness(self):
        """
        Computes the spectral flatness of the audio.
        """
        return librosa.feature.spectral_flatness(y=self.y).flatten()

    def spectral_rolloff(self):
        """
        Computes the spectral rolloff point of the audio.
        """
        return librosa.feature.spectral_rolloff(y=self.y, sr=self.sr).flatten()

    def zero_crossing_rate(self):
        """
        Computes the zero-crossing rate of the audio.
        """
        return librosa.feature.zero_crossing_rate(self.y).flatten()

    def mfccs(self):
        """
        Computes the Mel-frequency cepstral coefficients (MFCCs) of the audio.
        """
        return librosa.feature.mfcc(y=self.y, sr=self.sr, n_mfcc=13).flatten()

    def chroma_stft(self):
        """
        Computes the chromagram from a waveform or power spectrogram.
        """
        return librosa.feature.chroma_stft(y=self.y, sr=self.sr).flatten()

    def spectral_flux(self):
        """
        Computes the spectral flux of the audio, indicating the rate of change in the power spectrum.
        """
        S = np.abs(librosa.stft(self.y))
        return np.sqrt(np.sum(np.diff(S, axis=1)**2, axis=0))
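For orientation, each method returns a flattened per-frame array; a minimal sketch on one second of noise:

```python
import numpy as np
from features.extraction.spectral_features import SpectralFeatureExtractor

sr = 22050
y = np.random.randn(sr).astype(np.float32)  # one second of white noise

ext = SpectralFeatureExtractor(y, sr)
feats = ext.extract(['spectral_centroid', 'mfccs', 'spectral_flux'])

print(feats['spectral_centroid'].shape)  # (n_frames,)
print(feats['mfccs'].shape)              # (13 * n_frames,), flattened
print(feats['spectral_flux'].shape)      # (n_frames - 1,): one value per frame transition
```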
src/features/extraction/stat_measures.py
ADDED
@@ -0,0 +1,49 @@
import numpy as np
from scipy.stats import skew, kurtosis


class StatisticalMeasures:
    """
    A class dedicated to computing various statistical measures on a given feature array.
    This utility class provides a method to compute a predefined set of statistical measures
    which are commonly used in data analysis and feature extraction.

    Methods:
    --------
    compute_statistical_measures(feature_array, measures=None):
        Computes selected statistical measures from a provided numerical array.
    """
    @staticmethod
    def compute_statistical_measures(feature_array, measures=None):

        if measures is None:
            measures = ['mean', 'std', 'var', 'min', 'max', 'range', '25th_percentile', '50th_percentile', '75th_percentile', 'skew', 'kurtosis']

        stats = {}
        if 'mean' in measures:
            stats['mean'] = np.mean(feature_array)
        if 'std' in measures:
            stats['std'] = np.std(feature_array)
        if 'var' in measures:
            stats['var'] = np.var(feature_array)
        if 'min' in measures:
            stats['min'] = np.min(feature_array)
        if 'max' in measures:
            stats['max'] = np.max(feature_array)
        if 'range' in measures:
            stats['range'] = np.ptp(feature_array)
        if '25th_percentile' in measures:
            stats['25th_percentile'] = np.percentile(feature_array, 25)
        if '50th_percentile' in measures:
            stats['50th_percentile'] = np.percentile(feature_array, 50)
        if '75th_percentile' in measures:
            stats['75th_percentile'] = np.percentile(feature_array, 75)
        # Skew and kurtosis are undefined for constant arrays, so fall back to NaN;
        # only emit these keys when they were actually requested.
        if 'skew' in measures:
            stats['skew'] = skew(feature_array) if len(np.unique(feature_array)) > 1 else np.nan
        if 'kurtosis' in measures:
            stats['kurtosis'] = kurtosis(feature_array) if len(np.unique(feature_array)) > 1 else np.nan
        return stats
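Usage is a single static call; measures not requested are simply omitted from the result:

```python
import numpy as np
from features.extraction.stat_measures import StatisticalMeasures

arr = np.array([1.0, 2.0, 2.0, 3.0, 10.0])
stats = StatisticalMeasures.compute_statistical_measures(arr, ['mean', 'range', 'skew'])
print(stats)  # {'mean': 3.6, 'range': 9.0, 'skew': ...}
```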
src/features/extraction/voice_quality_features.py
ADDED
@@ -0,0 +1,272 @@
import parselmouth
from parselmouth.praat import call
import numpy as np
import math


class VoiceQualityFeatureExtractor:
    """
    A class to extract various voice quality features from audio data.

    Attributes:
        audio_arr (numpy.array): The audio array used for processing.
        orig_sr (int): The original sampling rate of the audio.

    Methods:
        extract(features_to_extract=None): Main method to extract specified voice quality features.
        extract_jitter(): Extracts measures of frequency variation (jitter).
        extract_shimmer(): Extracts measures of amplitude variation (shimmer).
        extract_hnr(): Extracts the Harmonics-to-Noise Ratio (HNR).
        extract_speech_rate(): Calculates various speech rate metrics.
        measure_speech_rate(voiceID): Helper method to perform detailed speech rate analysis.
    """
    def __init__(self, audio_arr, orig_sr):
        """
        Initializes the VoiceQualityFeatureExtractor with audio data.
        """
        self.audio_arr = audio_arr
        self.orig_sr = orig_sr

    def extract(self, features_to_extract=None):
        """
        Extracts specified voice quality features from the audio data.

        Args:
            features_to_extract (list of str, optional): A list of feature names to extract.
                Defaults to extracting all available features if None.

        Returns:
            dict: A dictionary containing the extracted features.
        """
        feature_funcs = {
            'jitter': self.extract_jitter,
            'shimmer': self.extract_shimmer,
            'hnr': self.extract_hnr,
            'speech_rate': self.extract_speech_rate
        }

        if features_to_extract is None:
            features_to_extract = feature_funcs.keys()

        features = {}
        for feature in features_to_extract:
            if feature in feature_funcs:
                feature_values = feature_funcs[feature]()
                if isinstance(feature_values, dict):
                    features.update(feature_values)
                else:
                    features[feature] = feature_values
        return features

    def extract_jitter(self):
        """
        Extracts jitter measures from the audio data.
        """
        try:
            snd = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
            point_process = call(snd, "To PointProcess (periodic, cc)", 75, 500)
            jitter_local = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
            jitter_rap = call(point_process, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
            jitter_ppq5 = call(point_process, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
            return {
                'jitter_local': jitter_local,
                'jitter_rap': jitter_rap,
                'jitter_ppq5': jitter_ppq5
            }
        except Exception as e:
            print(f'Error extracting jitter: {e}')
            return {
                'jitter_local': np.nan,
                'jitter_rap': np.nan,
                'jitter_ppq5': np.nan
            }

    def extract_shimmer(self):
        """
        Extracts shimmer measures from the audio data.
        """
        try:
            snd = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
            point_process = call(snd, "To PointProcess (periodic, cc)", 75, 500)
            shimmer_local = call([snd, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
            shimmer_apq3 = call([snd, point_process], "Get shimmer (apq3)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
            shimmer_apq5 = call([snd, point_process], "Get shimmer (apq5)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
            shimmer_dda = call([snd, point_process], "Get shimmer (dda)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
            return {
                'shimmer_local': shimmer_local,
                'shimmer_apq3': shimmer_apq3,
                'shimmer_apq5': shimmer_apq5,
                'shimmer_dda': shimmer_dda
            }
        except Exception as e:
            print(f'Error extracting shimmer: {e}')
            return {
                'shimmer_local': np.nan,
                'shimmer_apq3': np.nan,
                'shimmer_apq5': np.nan,
                'shimmer_dda': np.nan
            }

    def extract_hnr(self):
        """
        Extracts the Harmonics-to-Noise Ratio (HNR) from the audio data.
        """
        try:
            snd = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
            harmonicity = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
            hnr = call(harmonicity, "Get mean", 0, 0)
            return {'hnr': hnr}
        except Exception as e:
            print(f'Error extracting HNR: {e}')
            return {'hnr': np.nan}

    def extract_speech_rate(self):
        """
        Calculates and extracts various metrics related to speech rate.
        """
        try:
            sound = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
            (voicedcount, npause, originaldur, intensity_duration, speakingrate, articulationrate, asd, totalpauseduration) = self.measure_speech_rate(sound)
            return {
                'voicedcount': voicedcount,
                'npause': npause,
                'originaldur': originaldur,
                'intensity_duration': intensity_duration,
                'speakingrate': speakingrate,
                'articulationrate': articulationrate,
                'asd': asd,
                'totalpauseduration': totalpauseduration
            }
        except Exception as e:
            print(f'Error extracting speech rate: {e}')
            return {
                'voicedcount': np.nan,
                'npause': np.nan,
                'originaldur': np.nan,
                'intensity_duration': np.nan,
                'speakingrate': np.nan,
                'articulationrate': np.nan,
                'asd': np.nan,
                'totalpauseduration': np.nan
            }

    def measure_speech_rate(self, voiceID):
        """
        Performs a detailed analysis to measure various speech rate metrics from the given audio.

        This method calculates metrics like the number of voiced segments, number of pauses,
        the total original duration of the audio, the duration of voiced segments, speaking rate,
        articulation rate, average syllable duration, and the total duration of pauses.
        """
        silencedb = -25
        mindip = 2
        minpause = 0.3

        sound = parselmouth.Sound(voiceID)
        originaldur = sound.get_total_duration()
        intensity = sound.to_intensity(50)
        start = call(intensity, "Get time from frame number", 1)
        nframes = call(intensity, "Get number of frames")
        end = call(intensity, "Get time from frame number", nframes)
        min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
        max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")

        # Use the 0.99 quantile as the maximum (without the influence of non-speech sound bursts)
        max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)

        # Estimate the intensity threshold
        threshold = max_99_intensity + silencedb
        threshold2 = max_intensity - max_99_intensity
        threshold3 = silencedb - threshold2
        if threshold < min_intensity:
            threshold = min_intensity

        # Get pauses (silences) and speaking time
        textgrid = call(intensity, "To TextGrid (silences)", threshold3, minpause, 0.1, "silent", "sounding")
        silencetier = call(textgrid, "Extract tier", 1)

        silencetable = call(silencetier, "Down to TableOfReal", "sounding")
        npauses = call(silencetable, "Get number of rows")

        speakingtot = 0
        for ipause in range(npauses):
            pause = ipause + 1
            beginsound = call(silencetable, "Get value", pause, 1)
            endsound = call(silencetable, "Get value", pause, 2)
            speakingdur = endsound - beginsound
            speakingtot += speakingdur
        total_pause_duration = originaldur - speakingtot

        intensity_matrix = call(intensity, "Down to Matrix")
        sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
        intensity_duration = call(sound_from_intensity_matrix, "Get total duration")
        intensity_max = call(sound_from_intensity_matrix, "Get maximum", 0, 0, "Parabolic")
        point_process = call(sound_from_intensity_matrix, "To PointProcess (extrema)", "Left", "yes", "no", "Sinc70")
        numpeaks = call(point_process, "Get number of points")
        t = [call(point_process, "Get time from index", i + 1) for i in range(numpeaks)]

        # Keep only the intensity peaks above the threshold
        timepeaks = []
        peakcount = 0
        intensities = []
        for i in range(numpeaks):
            value = call(sound_from_intensity_matrix, "Get value at time", t[i], "Cubic")
            if value > threshold:
                peakcount += 1
                intensities.append(value)
                timepeaks.append(t[i])

        # Keep only peaks that are followed by a sufficiently deep intensity dip
        validpeakcount = 0
        currenttime = timepeaks[0]
        currentint = intensities[0]
        validtime = []

        for p in range(peakcount - 1):
            following = p + 1
            followingtime = timepeaks[p + 1]
            dip = call(intensity, "Get minimum", currenttime, timepeaks[p + 1], "None")
            diffint = abs(currentint - dip)
            if diffint > mindip:
                validpeakcount += 1
                validtime.append(timepeaks[p])
            currenttime = timepeaks[following]
            currentint = call(intensity, "Get value at time", timepeaks[following], "Cubic")

        # Keep only peaks that are voiced according to the pitch track
        pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
        voicedcount = 0
        voicedpeak = []

        for time in range(validpeakcount):
            querytime = validtime[time]
            whichinterval = call(textgrid, "Get interval at time", 1, querytime)
            whichlabel = call(textgrid, "Get label of interval", 1, whichinterval)
            value = pitch.get_value_at_time(querytime)
            if not math.isnan(value):
                if whichlabel == "sounding":
                    voicedcount += 1
                    voicedpeak.append(validtime[time])

        # Insert the voiced peaks as syllable points into the TextGrid
        timecorrection = originaldur / intensity_duration
        call(textgrid, "Insert point tier", 1, "syllables")
        for i in range(len(voicedpeak)):
            position = (voicedpeak[i] * timecorrection)
            call(textgrid, "Insert point", 1, position, "")

        speakingrate = voicedcount / originaldur

        # Handle division by zero for articulationrate
        if speakingtot != 0:
            articulationrate = voicedcount / speakingtot
        else:
            articulationrate = float('nan')

        # Handle division by zero for asd (average syllable duration)
        if voicedcount != 0:
            asd = speakingtot / voicedcount
        else:
            asd = float('nan')

        npause = npauses - 1

        return voicedcount, npause, originaldur, intensity_duration, speakingrate, articulationrate, asd, total_pause_duration
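A minimal sketch on a synthetic 150 Hz tone; a pure tone is a crude stand-in for voiced speech, so expect near-zero jitter and NaNs wherever Praat needs real speech:

```python
import numpy as np
from features.extraction.voice_quality_features import VoiceQualityFeatureExtractor

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
voiced = 0.5 * np.sin(2 * np.pi * 150 * t)  # stand-in for a voiced sound

ext = VoiceQualityFeatureExtractor(voiced, sr)
print(ext.extract(['jitter', 'hnr']))
# {'jitter_local': ..., 'jitter_rap': ..., 'jitter_ppq5': ..., 'hnr': ...}
```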
src/models/__init__.py
ADDED
File without changes
src/models/predict.py
ADDED
@@ -0,0 +1,36 @@
import joblib
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def select_features(features_df: pd.DataFrame):
    """Keep only the features the trained model was fitted on."""
    selected_features = [
        'spectral_contrast_var', 'spectral_contrast_range', 'spectral_contrast_mean', 'F3_mean', 'F2_stdev',
        'F3_stdev', 'F1_stdev', 'mfcc_13_std', 'F2_mean', 'mfcc_6_75th_percentile', 'mfcc_12_75th_percentile',
        'mfcc_9_75th_percentile', 'mfcc_3_75th_percentile', 'mfcc_12_50th_percentile', 'mfcc_9_50th_percentile',
        'mfcc_2_50th_percentile', 'mfcc_5_50th_percentile', 'mfcc_7_50th_percentile', 'f0_skew', 'pause_std',
        'asd', 'pause_75th_percentile', 'chroma_11_50th_percentile', 'chroma_3_50th_percentile',
        'chroma_6_50th_percentile', 'spectral_flux_skew', 'mfcc_12_25th_percentile', 'mfcc_6_25th_percentile',
        'mfcc_2_25th_percentile', 'spectral_bandwidth_min', 'zero_crossing_rate_skew', 'chroma_1_range',
        'speaking_rate', 'chroma_12_range', 'chroma_2_range', 'chroma_3_range', 'chroma_5_range',
        'chroma_10_range', 'spectral_flatness_skew', 'chroma_6_range', 'chroma_8_range', 'chroma_7_range',
        'chroma_9_range', 'f0_kurtosis', 'chroma_11_range', 'spectral_bandwidth_kurtosis', 'chroma_6_max',
        'chroma_10_max', 'chroma_2_max', 'chroma_12_max', 'chroma_5_max', 'chroma_7_max', 'chroma_4_max',
        'chroma_1_max', 'chroma_11_max', 'chroma_4_std', 'chroma_6_std', 'chroma_7_std', 'chroma_3_max',
        'chroma_12_std', 'chroma_11_std', 'chroma_2_std', 'chroma_10_std', 'chroma_3_std', 'chroma_9_std',
        'chroma_8_std', 'chroma_5_std', 'chroma_1_std', 'zero_crossing_rate_range', 'mfcc_1_skew',
        'spectral_rolloff_range', 'f0_25th_percentile', 'pause_skew', 'chroma_9_min', 'mfcc_13_mean',
        'mfcc_11_mean', 'zero_crossing_rate_min', 'spectral_bandwidth_max', 'mfcc_10_max', 'f0_75th_percentile',
        'mfcc_5_max', 'mfcc_6_mean', 'mfcc_3_max', 'jitter_local', 'spectral_flux_25th_percentile',
        'spectral_flatness_min', 'energy_min', 'shimmer_local', 'spectral_flatness_range'
    ]
    print(f"Number of features: {len(selected_features)}")
    features_df = features_df[selected_features]
    return features_df

def impute_missing_values(features_df: pd.DataFrame):
    """Impute missing values and standardize the feature set.

    Note that the imputer and scaler are fitted on the data being predicted,
    not loaded from the training pipeline.
    """
    imputer = SimpleImputer(strategy='mean')
    features_imputed = imputer.fit_transform(features_df)
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features_imputed)
    return features_scaled

def preprocess(features_df: pd.DataFrame):
    features_df = select_features(features_df)
    features_df = impute_missing_values(features_df)
    return features_df

def load_model():
    model_path = './models/logistic_regression_model.pkl'
    model = joblib.load(model_path)
    return model

def predict(features_df: pd.DataFrame):
    features_df = preprocess(features_df)
    model = load_model()
    predictions = model.predict(features_df)
    # Prediction probabilities
    prediction_probabilities = model.predict_proba(features_df)
    return predictions, prediction_probabilities
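A usage sketch, assuming `src` is on the import path and the pickled model exists at the relative path hard-coded in `load_model()`; the input file name here is hypothetical, but the DataFrame must contain the columns listed in `select_features()`:

```python
import pandas as pd
from models.predict import predict  # assumes 'src' is on sys.path

features_df = pd.read_csv('high_level_features.csv')  # hypothetical features file
predictions, probabilities = predict(features_df)
print(predictions[:5])
print(probabilities[:5])  # class probabilities from the logistic regression model
```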