Spaces:

701w
/

speech_defects

Sleeping

App Files Files Community

yufii committed on Oct 24, 2024

Commit

6dee894

verified ·

1 Parent(s): dba1e8a

Upload 10 files

Browse files

Files changed (5) hide show

.gitattributes +2 -0
app.py +19 -11
best_model.keras +3 -0
best_model_2.keras +3 -0
utils_api.py +129 -27

.gitattributes CHANGED Viewed

@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 cnn_1_v6_final_model.h5 filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 cnn_1_v6_final_model.h5 filter=lfs diff=lfs merge=lfs -text
+best_model_2.keras filter=lfs diff=lfs merge=lfs -text
+best_model.keras filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -9,10 +9,11 @@ import librosa
 import numpy as np
 import re
 import Levenshtein
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-from utils import get_features
 #вывод в консоль для просмотра на hugging face
@@ -43,10 +44,12 @@ os.makedirs(cache_dir, exist_ok=True)
 whisper_model = whisper.load_model("tiny", download_root=cache_dir)
 # загрузка параметров модели
-filepath = os.path.abspath("best_model.h5")
 if not os.path.exists(filepath):
-    raise FileNotFoundError(f"Model file not found at {filepath}")
 # Контекстный менеджер для временных аудио файлов
 @contextmanager
 def temporary_audio_file(audio_bytes):
@@ -68,8 +71,6 @@ def temporary_audio_file(audio_bytes):
 async def read_root():
     return {"message": "Welcome to the Defects_model API"}
-model = keras.models.load_model(filepath, compile=False)
 # Endpoint для сохранения аудио файлов
 @app.post("/save-audio")
 async def save_audio(file: UploadFile = File(...)):
@@ -142,21 +143,28 @@ async def process_audio(
                 raise ValueError("Empty or invalid audio data.")
             # Извлечение признаков из аудио
-            features = get_features(tmp_filename) # here data already in form
             # features = np.expand_dims(features, axis=0)  # Add batch dimension
             logging.info(f"Features extracted: shape = {features.shape}")
             # Получение предсказания от модели
-            class_weights = {0: 0.5460790960451978, 1: 1.0068333333333332, 2: 4}
             prediction = model.predict(features)
             logging.info(f"Prediction shape: {prediction.shape}")
-            # Iterate over columns (classes)
             for j in range(prediction.shape[1]):
-                prediction[0, j] *= class_weights.get(j, 1.0)  # Access elements using [0, j]
             logging.info(f"Prediction: {prediction}")
             # Транскрибация аудио с помощью Whisper
             transcription_result = whisper_model.transcribe(tmp_filename, language="russian")
@@ -178,7 +186,7 @@ async def process_audio(
             # Возврат результатов
             return {
-                "prediction": prediction.tolist(),
                 "match_phrase": match_phrase
             }

 import numpy as np
 import re
 import Levenshtein
+import tensorflow as tf
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
+from utils_api import get_features
 #вывод в консоль для просмотра на hugging face
 whisper_model = whisper.load_model("tiny", download_root=cache_dir)
 # загрузка параметров модели
# Load the trained classifier once at import time so requests can reuse it.
filepath = "best_model.keras"
if not os.path.exists(filepath):
    # Fail fast with the offending path rather than an opaque loader error.
    # (No trailing backslash here: a line continuation would glue this
    # statement to the next one and make the module unparsable.)
    raise FileNotFoundError(f"Model file not found at {filepath}")
model = tf.keras.models.load_model(filepath, compile=False)
# summary() prints its report and returns None, so route the text through
# the logger via print_fn instead of logging the return value. Extra keyword
# arguments some Keras versions pass to print_fn are accepted and ignored.
model.summary(print_fn=lambda line, **_: logging.info(line))
 # Контекстный менеджер для временных аудио файлов
 @contextmanager
 def temporary_audio_file(audio_bytes):
 async def read_root():
     return {"message": "Welcome to the Defects_model API"}
 # Endpoint для сохранения аудио файлов
 @app.post("/save-audio")
 async def save_audio(file: UploadFile = File(...)):
                 raise ValueError("Empty or invalid audio data.")
             # Извлечение признаков из аудио
+            features = get_features(tmp_filename)
             # features = np.expand_dims(features, axis=0)  # Add batch dimension
             logging.info(f"Features extracted: shape = {features.shape}")
             # Получение предсказания от модели
+            class_weights = {0: 0.5460790960451978, 1: 1.0068333333333332, 2: 1000.696369636963697}
             prediction = model.predict(features)
             logging.info(f"Prediction shape: {prediction.shape}")
+            #умножаем предикт на веса классов
             for j in range(prediction.shape[1]):
+                prediction[0, j] *= class_weights.get(j, 1.0)
+                prediction[0, j] *= 10
             logging.info(f"Prediction: {prediction}")
+            response_answer = np.argmax(prediction)
+            if (response_answer == 0):
+                response_answer = 1
+            else:
+                response_answer = 0
+            logging.info(f"Right or with defects: 1 or 0: {response_answer}")
             # Транскрибация аудио с помощью Whisper
             transcription_result = whisper_model.transcribe(tmp_filename, language="russian")
             # Возврат результатов
             return {
+                "prediction": response_answer,
                 "match_phrase": match_phrase
             }

best_model.keras ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6044b8558b0f84731adc10353ae525c7a76b710127bf4de33aeecd620d8bc4be
+size 34790773

best_model_2.keras ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40e263882edea1f1f1517a3ed580af0d994d0b2529bec20b04afa3801233a3ab
+size 34790755

utils_api.py CHANGED Viewed

@@ -7,59 +7,161 @@ from tensorflow.keras.models import Sequential
 import tensorflow_hub as hub
 import soundfile as sf
 import tensorflow as tf
-# Load the embedding model globally to avoid reloading it every time
 embedding_model = hub.load("https://www.kaggle.com/models/google/speech-embedding/TensorFlow1/speech-embedding/1")
 def load_audio(audio_file_path):
     try:
         audio_samples, sample_rate = librosa.load(audio_file_path, sr=None)
         audio_samples = audio_samples.astype(np.float32)
-        audio_samples /= np.max(np.abs(audio_samples))
         return audio_samples, sample_rate
     except Exception as e:
-        print(f"Error loading {audio_file_path}: {e}")
         return None, None
 def pad_or_trim(audio, sr, target_length=5):
     target_samples = int(target_length * sr)
     return librosa.util.fix_length(audio, size=target_samples) if len(audio) < target_samples else audio[:target_samples]
 def get_features(path, duration=5):
     try:
         data, sample_rate = load_audio(path)
-        # Model need 16000 sample rate
-        if sample_rate != 16000:
-            data = librosa.resample(data, orig_sr=sample_rate, target_sr=16000)
-            sample_rate = 16000
         data = pad_or_trim(data, sample_rate)
     except Exception as e:
-        print(f"Error loading {path}: {e}")
         return None
-    embeddings = extract_embeddings(np.expand_dims(data, axis=0))  # Add batch dimension
-    if embeddings is not None:
-        return embeddings
-    else:
-        return None # Fail
 def extract_embeddings(audio_samples):
-    """Extract embeddings from audio samples."""
     try:
-        # Convert audio samples to tensor and extract embeddings
         embeddings = embedding_model.signatures['default'](tf.convert_to_tensor(audio_samples))
-        return embeddings['default'].numpy().flatten()
     except Exception as e:
-        print(f"Error extracting embeddings: {e}")
-        return None  # Return None if there's an error
-def pad_or_trim(feature, target_shape):
-    """Pad or trim feature array to ensure a consistent shape."""
-    if len(feature) > target_shape:
-        feature = feature[:target_shape]
-    elif len(feature) < target_shape:
-        feature = np.pad(feature, (0, target_shape - len(feature)), mode='constant')
-    return feature

 import tensorflow_hub as hub
 import soundfile as sf
 import tensorflow as tf
+from scipy.signal import resample, butter, lfilter
+import logging
+# Настройка логирования для отслеживания ошибок и процесса выполнения
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler()]
+)
+# Загрузка модели для извлечения эмбеддингов из речи
 embedding_model = hub.load("https://www.kaggle.com/models/google/speech-embedding/TensorFlow1/speech-embedding/1")
+# Константы для фильтрации звука
+LOWCUT = 400    # Нижняя частота среза в Гц
+HIGHCUT = 5000  # Верхняя частота среза в Гц
+ORDER = 5       # Порядок фильтра
 def load_audio(audio_file_path):
     """
     Load an audio file and peak-normalize its amplitude.

     Args:
         audio_file_path: path to the audio file

     Returns:
         tuple (audio samples as float32, sample rate), or (None, None)
         if loading or normalization fails
     """
     try:
         # sr=None asks librosa to keep the file's native sample rate
         audio_samples, sample_rate = librosa.load(audio_file_path, sr=None)
         audio_samples = audio_samples.astype(np.float32)
         peak = np.max(np.abs(audio_samples))
         # An all-zero (silent) signal has a zero peak; dividing by it would
         # fill the array with NaN, so leave such a signal unscaled.
         if peak > 0:
             audio_samples /= peak
         return audio_samples, sample_rate
     except Exception as e:
         logging.exception(f"Error loading {audio_file_path}: {e}")
         return None, None
 def pad_or_trim(audio, sr, target_length=5):
     """
     Force the audio to a fixed duration by truncating or extending it.

     Args:
         audio: audio samples
         sr: sample rate
         target_length: desired duration in seconds

     Returns:
         the processed audio samples
     """
     wanted = int(target_length * sr)
     if len(audio) >= wanted:
         # Long enough already: keep only the leading samples
         return audio[:wanted]
     # Too short: let librosa bring it up to the requested size
     return librosa.util.fix_length(audio, size=wanted)
 def get_features(path, duration=5):
     """
     Extract embedding features from an audio file.

     Args:
         path: path to the audio file
         duration: duration in seconds the audio is padded/trimmed to

     Returns:
         the audio embeddings, or None if any stage fails
     """
     try:
         data, sample_rate = load_audio(path)
         if data is None:
             # load_audio has already logged the cause; stop here instead of
             # letting the enhancement step fail on None.
             return None
         data, sample_rate = upgrade_sound(data, sample_rate)  # audio enhancement
         # Honor the caller's duration rather than the helper's default
         data = pad_or_trim(data, sample_rate, target_length=duration)
     except Exception as e:
         logging.exception(f"Error loading {path}: {e}")
         return None
     data = np.array(data, dtype=np.float32)  # ensure float32 samples
     # Add the batch dimension; extract_embeddings itself yields None on failure
     return extract_embeddings(np.expand_dims(data, axis=0))
 def extract_embeddings(audio_samples):
     """
     Run the pretrained embedding model on audio samples.

     Args:
         audio_samples: audio data

     Returns:
         embeddings reshaped to (1, n_features), or None on failure
     """
     try:
         # Hand the samples to the model's default signature as a tensor
         input_tensor = tf.convert_to_tensor(audio_samples)
         model_output = embedding_model.signatures['default'](input_tensor)
         # Collapse the embedding tensor into a single row
         return model_output['default'].numpy().reshape((1, -1))
     except Exception as e:
         logging.exception(f"Error extracting embeddings: {e}")
         return None
def butter_bandpass(lowcut, highcut, sr, order=5):
    """
    Design a Butterworth band-pass filter.

    Args:
        lowcut: lower cutoff frequency in Hz
        highcut: upper cutoff frequency in Hz
        sr: sample rate in Hz
        order: filter order

    Returns:
        filter coefficients (b, a)

    Raises:
        ValueError: if the band does not satisfy 0 < lowcut < highcut < sr / 2
    """
    nyquist = 0.5 * sr
    # Reject an impossible band up front with a message that names the
    # offending values instead of relying on the filter designer's error.
    if not 0 < lowcut < highcut < nyquist:
        raise ValueError(
            f"Band-pass edges must satisfy 0 < lowcut < highcut < Nyquist "
            f"({nyquist} Hz); got lowcut={lowcut}, highcut={highcut}"
        )
    # The cutoffs are passed as fractions of the Nyquist frequency
    low = lowcut / nyquist
    high = highcut / nyquist
    return butter(order, [low, high], btype='band')
def apply_bandpass_filter(y, sr, lowcut=LOWCUT, highcut=HIGHCUT, order=ORDER):
    """
    Band-pass filter an audio signal.

    Args:
        y: audio samples
        sr: sample rate
        lowcut: lower cutoff frequency
        highcut: upper cutoff frequency
        order: filter order

    Returns:
        the filtered audio samples
    """
    # Design the filter for this sample rate, then run the signal through it
    numerator, denominator = butter_bandpass(lowcut, highcut, sr, order)
    filtered = lfilter(numerator, denominator, y)
    return filtered
def resample_audio(y, sr, target_sr=16000):
    """
    Resample audio to a target sample rate.

    Args:
        y: audio samples
        sr: original sample rate
        target_sr: target sample rate

    Returns:
        tuple (resampled samples, new sample rate); the input is returned
        unchanged when it is already at the target rate
    """
    if sr == target_sr:
        # Nothing to do: hand the input back untouched
        return y, sr
    # Scale the sample count by the ratio of the two rates
    new_length = round(len(y) * float(target_sr) / sr)
    return resample(y, new_length), target_sr
def upgrade_sound(y, sr):
    """
    Run the full audio clean-up chain.

    Args:
        y: audio samples
        sr: sample rate

    Returns:
        tuple (enhanced audio samples, sample rate)

    Steps, in order:
    1. Resampling (resample_audio's default target is 16 kHz)
    2. Amplitude normalization
    3. Band-pass filtering
    4. Pre-emphasis
    """
    # 1. resample; the returned rate is used by the later stages
    signal, sr = resample_audio(y, sr)
    # 2. amplitude normalization
    signal = librosa.util.normalize(signal)
    # 3. band-pass filtering with the module's default cutoffs
    signal = apply_bandpass_filter(signal, sr)
    # 4. pre-emphasis
    enhanced = librosa.effects.preemphasis(signal)
    return enhanced, sr