import math
import os

import librosa
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

|
class DataProcessor:
    """Wrap a pretrained Wav2Vec2 feature extractor for raw-audio preprocessing."""

    def __init__(self, sampling_rate, wav2vec_model_path):
        # Load the feature extractor from a local checkpoint only (no Hub download).
        self._processor = Wav2Vec2FeatureExtractor.from_pretrained(
            wav2vec_model_path, local_files_only=True
        )
        self._sampling_rate = sampling_rate

    def extract_feature(self, audio_path):
        # Load and resample the audio to the target rate, run it through the
        # Wav2Vec2 feature extractor, and drop the singleton batch dimension.
        speech_array, sampling_rate = librosa.load(audio_path, sr=self._sampling_rate)
        input_value = np.squeeze(
            self._processor(speech_array, sampling_rate=sampling_rate).input_values
        )
        return input_value

|
def prepare_audio_feature(wav_file, fps=30, sampling_rate=16000, wav2vec_model_path=None):
    data_preprocessor = DataProcessor(sampling_rate, wav2vec_model_path)

    input_value = data_preprocessor.extract_feature(wav_file)
    # Number of video frames spanned by the clip: duration in seconds
    # (samples / sampling_rate) times the target frame rate, rounded up.
    seq_len = math.ceil(len(input_value) / sampling_rate * fps)
    return {
        "audio_feature": input_value,
        "seq_len": seq_len,
    }
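

# Minimal usage sketch. The wav path and the local Wav2Vec2 checkpoint directory
# below are placeholder assumptions, not files shipped with this module.
if __name__ == "__main__":
    sample = prepare_audio_feature(
        "example.wav",  # hypothetical input audio file
        fps=30,
        sampling_rate=16000,
        wav2vec_model_path="./pretrained/wav2vec2-base-960h",  # hypothetical local checkpoint
    )
    print(sample["audio_feature"].shape, sample["seq_len"])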