yufii committed on
Commit ec8fa7d · verified · 1 Parent(s): 476003d

Upload 9 files
Files changed (8)
  1. .gitattributes +36 -36
  2. .gitignore +4 -4
  3. Dockerfile +23 -23
  4. README.md +10 -10
  5. app.py +9 -15
  6. best_model.h5 +3 -0
  7. requirements.txt +11 -10
  8. utils.py +46 -96
.gitattributes CHANGED
@@ -1,36 +1,36 @@
All 36 lines are removed and re-added with identical content (apparently a whitespace or line-ending-only change). The file after the commit:

*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
cnn_1_v6_final_model.h5 filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,5 +1,5 @@
Lines 1-4 are removed and re-added with identical content (apparently a whitespace or line-ending-only change); line 5 is unchanged context. The file after the commit:

venv
ol.py
server.zip
whisper_cahce
others
Dockerfile CHANGED
@@ -1,23 +1,23 @@
All 23 lines are removed and re-added with identical content (apparently a whitespace or line-ending-only change). The file after the commit:

FROM python:3.11

WORKDIR /app

COPY . /app

RUN apt-get update && apt-get install -y ffmpeg && apt-get clean

RUN groupadd -r user && useradd -r -g user -m user

RUN mkdir -p /tmp/whisper_cache && chmod 777 /tmp/whisper_cache

RUN chown -R user:user /app

ENV NUMBA_CACHE_DIR /tmp

RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r requirements.txt

USER user

EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -1,10 +1,10 @@
All 10 lines are removed and re-added with identical content (apparently a whitespace or line-ending-only change). The file after the commit:

---
title: speech_defects
emoji: 🐳
colorFrom: purple
colorTo: gray
sdk: docker
app_port: 8000
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -9,13 +9,11 @@ import librosa
 import numpy as np
 import re
 import Levenshtein
-from fastapi.responses import JSONResponse
 
+from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-from utils import (
-    extract_features,
-    pad_or_trim,
-)
+from utils import get_features
+
 
 # log to the console so output is visible on Hugging Face
 logging.basicConfig(
@@ -45,7 +43,7 @@ os.makedirs(cache_dir, exist_ok=True)
 whisper_model = whisper.load_model("tiny", download_root=cache_dir)
 
 # load the model parameters
-filepath = os.path.abspath("cnn_1_v6_final_model.h5")
+filepath = os.path.abspath("best_model.h5")
 if not os.path.exists(filepath):
     raise FileNotFoundError(f"Model file not found at {filepath}")
 
@@ -70,9 +68,7 @@ def temporary_audio_file(audio_bytes):
 async def read_root():
     return {"message": "Welcome to the Defects_model API"}
 
-
 model = keras.models.load_model(filepath, compile=False)
-target_shape = (32, 200)
 
 # Endpoint for saving audio files
 @app.post("/save-audio")
@@ -146,16 +142,14 @@ async def process_audio(
             raise ValueError("Empty or invalid audio data.")
 
         # Extract features from the audio
-        features = extract_features(audio_data, sample_rate)
+        features = get_features(tmp_filename)  # the data is already in the right shape here
+        # features = np.expand_dims(features, axis=0)  # Add batch dimension
         logging.info(f"Features extracted: shape = {features.shape}")
 
-        # Prepare the data for the model
-        target_shape = (1, model.input_shape[1])
-        features = pad_or_trim(features, target_shape[1])
-        features = np.expand_dims(features, axis=0)
-
        # Get the model's prediction
+        class_weights = {0: 0.5460790960451978, 1: 1.0068333333333332, 2: 10.696369636963697}
         prediction = model.predict(features)
+        prediction = prediction * class_weights
         logging.info(f"Prediction: {prediction}")
 
         # Transcribe the audio with Whisper
@@ -184,4 +178,4 @@ async def process_audio(
 
     except Exception as e:
         logging.exception(f"Error processing audio: {e}")
-        raise HTTPException(status_code=500, detail="Internal server error")
+        raise HTTPException(status_code=500, detail="Internal server error")
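A note on the new class-weighting step in this diff: `model.predict` returns a NumPy array, and multiplying it by a Python dict (`prediction * class_weights`) raises a TypeError at runtime. Note also that `get_features` returns None on failure, so the `features.shape` log line can itself fail. A minimal sketch of the apparent intent, assuming the dict keys 0, 1, 2 correspond to the model's three output columns:

import numpy as np

# Hypothetical fix sketch: turn the dict into a vector ordered by class index,
# then let NumPy broadcast it across the (batch, 3) prediction.
class_weights = {0: 0.5460790960451978, 1: 1.0068333333333332, 2: 10.696369636963697}
weight_vector = np.array([class_weights[i] for i in sorted(class_weights)])

features = get_features(tmp_filename)
if features is None:
    raise ValueError("Feature extraction failed.")

prediction = model.predict(features)       # assumed shape: (1, 3); may still need a
                                           # batch dimension, as the commented-out
                                           # expand_dims in the diff suggests
prediction = prediction * weight_vector    # element-wise, broadcast over the batch axis
predicted_class = int(np.argmax(prediction, axis=1)[0])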
best_model.h5 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d65809a43e770cf2498633341312aef94ad9c7940bbe1a17009503305e1d4d1e
+size 20930136
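These three lines are a Git LFS pointer rather than the model itself: the actual 20,930,136-byte weights blob lives in LFS storage, matched by the `*.h5` rule in .gitattributes above.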
requirements.txt CHANGED
@@ -1,10 +1,11 @@
Lines 1-10 are removed and re-added with identical content (apparently a line-ending-only change); the only substantive change is the new final line:

 fastapi
 uvicorn
 torch
 librosa
 tensorflow
 keras
 python-multipart
 numba
 openai-whisper
 Levenshtein
+tensorflow_hub
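The new `tensorflow_hub` entry supports the embedding pipeline introduced in utils.py below, although the committed code actually loads the model from a local `speech-embedding` directory with `tf.saved_model.load` and leaves the `hub.load` call commented out. Both loading paths, as they appear in the commit (the local directory is assumed to contain the exported SavedModel):

import tensorflow as tf
import tensorflow_hub as hub

# Path taken in the commit: a TF1-style SavedModel in a local directory.
# tags=[] selects the MetaGraph that carries no tags.
embedding_model = tf.saved_model.load("speech-embedding", tags=[])

# Path left commented out in the commit: fetching the model via tensorflow_hub.
# embedding_model = hub.load("https://www.kaggle.com/models/google/speech-embedding/TensorFlow1/speech-embedding/1")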
utils.py CHANGED
@@ -1,97 +1,59 @@
 import librosa
 import numpy as np
 from keras import layers, models
-
-def create_cnn_model(input_shape):
-    model = models.Sequential()
-
-    # First Convolutional Layer
-    model.add(layers.Conv1D(32, 3, activation='relu', input_shape=input_shape))
-    model.add(layers.MaxPooling1D(pool_size=2))
-
-    # Second Convolutional Layer
-    model.add(layers.Conv1D(64, 3, activation='relu'))
-    model.add(layers.MaxPooling1D(pool_size=2))
-
-    # Flatten layer
-    model.add(layers.Flatten())
-
-    # Dense layers
-    model.add(layers.Dense(128, activation='relu', input_shape=input_shape))
-    model.add(layers.Dense(256, activation='relu', input_shape=input_shape))
-    model.add(layers.Dense(512, activation='relu', input_shape=input_shape))
-    model.add(layers.Dense(512, activation='relu', input_shape=input_shape))
-    model.add(layers.Dense(256, activation='relu', input_shape=input_shape))
-    model.add(layers.Dense(128, activation='relu', input_shape=input_shape))
-
-    # Output layer
-    model.add(layers.Dense(1, activation='sigmoid'))
-
-    return model
-
-
-def get_features(path, duration=6):
+from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization, Dense, Dropout, Reshape, Input, GlobalAveragePooling1D
+from tensorflow.keras.regularizers import l2
+from tensorflow.keras.models import Sequential
+import tensorflow_hub as hub
+import tensorflow as tf
+import logging
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler()]
+)
+
+# Load the embedding model globally to avoid reloading it every time
+#embedding_model = hub.load("https://www.kaggle.com/models/google/speech-embedding/TensorFlow1/speech-embedding/1")
+embedding_model = tf.saved_model.load("speech-embedding", tags=[])
+def get_features(path, duration=5):
     try:
         # Load audio file with specific duration and offset to handle silent parts
-        data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
-    except Exception as e:
-        print(f"Error loading {path}: {e}")
-        return None  # Skip the file if there's an error
-
-    # Without augmentation
-    res1 = extract_features(data, sample_rate)
-    result = np.array(res1)
-
-    # With noise
-    noise_data = noise(data)
-    res2 = extract_features(noise_data, sample_rate)
-    result = np.vstack((result, res2))
-
-    # Stretching and pitching
-    new_data = stretch(data)
-    data_stretch_pitch = pitch(new_data, sample_rate)
-    res3 = extract_features(data_stretch_pitch, sample_rate)
-    result = np.vstack((result, res3))
+        data, sample_rate = librosa.load(path, duration=duration, offset=0.6)
+        data = pad_or_trim(data, sample_rate * 5)
 
-    return result
-
-
-def extract_features(data, sample_rate, target_shape=40):
-    result = np.array([])
-
-    # ZCR
-    zcr = librosa.feature.zero_crossing_rate(y=data)
-    zcr = np.mean(zcr.T, axis=0)
-    zcr = pad_or_trim(zcr, target_shape)
-    result = np.hstack((result, zcr))
-
-    # Chroma_stft
-    stft = np.abs(librosa.stft(data))
-    chroma_stft = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
-    chroma_stft = np.mean(chroma_stft.T, axis=0)
-    chroma_stft = pad_or_trim(chroma_stft, target_shape)
-    result = np.hstack((result, chroma_stft))
-
-    # MFCC
-    mfcc = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=13)
-    mfcc = np.mean(mfcc.T, axis=0)
-    mfcc = pad_or_trim(mfcc, target_shape)
-    result = np.hstack((result, mfcc))
+        # The model needs a 16000 Hz sample rate
+        if sample_rate != 16000:
+            data = librosa.resample(data, orig_sr=sample_rate, target_sr=16000)
+            sample_rate = 16000
+    except Exception as e:
+        logging.info(f"Error loading {path}: {e}")
+        return None
 
-    # Root Mean Square Value
-    rms = librosa.feature.rms(y=data)
-    rms = np.mean(rms.T, axis=0)
-    rms = pad_or_trim(rms, target_shape)
-    result = np.hstack((result, rms))
+    embeddings = extract_embeddings(np.expand_dims(data, axis=0))  # Add batch dimension
 
-    # MelSpectrogram
-    mel = librosa.feature.melspectrogram(y=data, sr=sample_rate)
-    mel = np.mean(mel.T, axis=0)
-    mel = pad_or_trim(mel, target_shape)
-    result = np.hstack((result, mel))
+    if embeddings is not None:
+        try:
+            result = embedding_model.signatures['default'](tf.convert_to_tensor(data))
+            result = embeddings['default'].numpy().flatten()
+        except Exception as e:
+            logging.info(f"Error extracting embedding from {path}: {e}")
+            return None
+    else:
+        return None  # Fail
 
     return result
 
+def extract_embeddings(audio_samples):
+    """Extract embeddings from audio samples."""
+    try:
+        # Convert audio samples to tensor and extract embeddings
+        embeddings = embedding_model.signatures['default'](tf.convert_to_tensor(audio_samples))
+        return embeddings['default'].numpy().flatten()
+    except Exception as e:
+        logging.info(f"Error extracting embeddings: {e}")
+        return None  # Return None if there's an error
 
 def pad_or_trim(feature, target_shape):
     """Pad or trim feature array to ensure a consistent shape."""
@@ -99,16 +61,4 @@ def pad_or_trim(feature, target_shape):
         feature = feature[:target_shape]
     elif len(feature) < target_shape:
         feature = np.pad(feature, (0, target_shape - len(feature)), mode='constant')
-    return feature
-
-
-def noise(data, noise_factor=0.005):
-    noise_amp = noise_factor * np.random.uniform() * np.amax(data)
-    data = data + noise_amp * np.random.normal(size=data.shape[0])
-    return data
-
-def stretch(data, rate=0.8):
-    return librosa.effects.time_stretch(data, rate=rate)
-
-def pitch(data, sample_rate, pitch_factor=0.7):
-    return librosa.effects.pitch_shift(data, sr=sample_rate, n_steps=pitch_factor)
+    return feature
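A likely bug in the new `get_features`: it calls `extract_embeddings` (which already returns a flattened NumPy array or None), then recomputes the signature on the raw `data`, and finally indexes the flattened array with `embeddings['default']`, which would fail on a NumPy array. A hedged sketch of the apparent intent, reusing the file's own helpers and computing the embedding exactly once:

def get_features(path, duration=5):
    """Sketch: load the audio, pad to 5 s, resample to 16 kHz, embed it once."""
    try:
        data, sample_rate = librosa.load(path, duration=duration, offset=0.6)
        data = pad_or_trim(data, sample_rate * 5)
        if sample_rate != 16000:
            data = librosa.resample(data, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000
    except Exception as e:
        logging.info(f"Error loading {path}: {e}")
        return None

    # extract_embeddings already returns a flattened NumPy array (or None on
    # failure), so no further ['default'] indexing is needed here.
    return extract_embeddings(np.expand_dims(data, axis=0))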