Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- .gitattributes +36 -36
- .gitignore +4 -4
- Dockerfile +23 -23
- README.md +10 -10
- app.py +9 -15
- best_model.h5 +3 -0
- requirements.txt +11 -10
- utils.py +46 -96
.gitattributes
CHANGED
@@ -1,36 +1,36 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
-
cnn_1_v6_final_model.h5 filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
cnn_1_v6_final_model.h5 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
venv
|
2 |
-
ol.py
|
3 |
-
server.zip
|
4 |
-
whisper_cahce
|
5 |
others
|
|
|
1 |
+
venv
|
2 |
+
ol.py
|
3 |
+
server.zip
|
4 |
+
whisper_cahce
|
5 |
others
|
Dockerfile
CHANGED
@@ -1,23 +1,23 @@
|
|
1 |
-
FROM python:3.11
|
2 |
-
|
3 |
-
WORKDIR /app
|
4 |
-
|
5 |
-
COPY . /app
|
6 |
-
|
7 |
-
RUN apt-get update && apt-get install -y ffmpeg && apt-get clean
|
8 |
-
|
9 |
-
RUN groupadd -r user && useradd -r -g user -m user
|
10 |
-
|
11 |
-
RUN mkdir -p /tmp/whisper_cache && chmod 777 /tmp/whisper_cache
|
12 |
-
|
13 |
-
RUN chown -R user:user /app
|
14 |
-
|
15 |
-
ENV NUMBA_CACHE_DIR /tmp
|
16 |
-
|
17 |
-
RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r requirements.txt
|
18 |
-
|
19 |
-
USER user
|
20 |
-
|
21 |
-
EXPOSE 8000
|
22 |
-
|
23 |
-
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
|
1 |
+
FROM python:3.11
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
COPY . /app
|
6 |
+
|
7 |
+
RUN apt-get update && apt-get install -y ffmpeg && apt-get clean
|
8 |
+
|
9 |
+
RUN groupadd -r user && useradd -r -g user -m user
|
10 |
+
|
11 |
+
RUN mkdir -p /tmp/whisper_cache && chmod 777 /tmp/whisper_cache
|
12 |
+
|
13 |
+
RUN chown -R user:user /app
|
14 |
+
|
15 |
+
ENV NUMBA_CACHE_DIR /tmp
|
16 |
+
|
17 |
+
RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r requirements.txt
|
18 |
+
|
19 |
+
USER user
|
20 |
+
|
21 |
+
EXPOSE 8000
|
22 |
+
|
23 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
-
---
|
2 |
-
title: speech_defects
|
3 |
-
emoji: 🐳
|
4 |
-
colorFrom: purple
|
5 |
-
colorTo: gray
|
6 |
-
sdk: docker
|
7 |
-
app_port: 8000
|
8 |
-
---
|
9 |
-
|
10 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
title: speech_defects
|
3 |
+
emoji: 🐳
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: gray
|
6 |
+
sdk: docker
|
7 |
+
app_port: 8000
|
8 |
+
---
|
9 |
+
|
10 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -9,13 +9,11 @@ import librosa
|
|
9 |
import numpy as np
|
10 |
import re
|
11 |
import Levenshtein
|
12 |
-
from fastapi.responses import JSONResponse
|
13 |
|
|
|
14 |
from fastapi.middleware.cors import CORSMiddleware
|
15 |
-
from utils import
|
16 |
-
|
17 |
-
pad_or_trim,
|
18 |
-
)
|
19 |
|
20 |
#вывод в консоль для просмотри на hugging face
|
21 |
logging.basicConfig(
|
@@ -45,7 +43,7 @@ os.makedirs(cache_dir, exist_ok=True)
|
|
45 |
whisper_model = whisper.load_model("tiny", download_root=cache_dir)
|
46 |
|
47 |
# загрузка параметров модели
|
48 |
-
filepath = os.path.abspath("
|
49 |
if not os.path.exists(filepath):
|
50 |
raise FileNotFoundError(f"Model file not found at {filepath}")
|
51 |
|
@@ -70,9 +68,7 @@ def temporary_audio_file(audio_bytes):
|
|
70 |
async def read_root():
|
71 |
return {"message": "Welcome to the Defects_model API"}
|
72 |
|
73 |
-
|
74 |
model = keras.models.load_model(filepath, compile=False)
|
75 |
-
target_shape = (32, 200)
|
76 |
|
77 |
# Endpoint для сохранения аудио файлов
|
78 |
@app.post("/save-audio")
|
@@ -146,16 +142,14 @@ async def process_audio(
|
|
146 |
raise ValueError("Empty or invalid audio data.")
|
147 |
|
148 |
# Извлечение признаков из аудио
|
149 |
-
features =
|
|
|
150 |
logging.info(f"Features extracted: shape = {features.shape}")
|
151 |
|
152 |
-
# Подготовка данных для модели
|
153 |
-
target_shape = (1, model.input_shape[1])
|
154 |
-
features = pad_or_trim(features, target_shape[1])
|
155 |
-
features = np.expand_dims(features, axis=0)
|
156 |
-
|
157 |
# Получение предсказания от модели
|
|
|
158 |
prediction = model.predict(features)
|
|
|
159 |
logging.info(f"Prediction: {prediction}")
|
160 |
|
161 |
# Транскрибация аудио с помощью Whisper
|
@@ -184,4 +178,4 @@ async def process_audio(
|
|
184 |
|
185 |
except Exception as e:
|
186 |
logging.exception(f"Error processing audio: {e}")
|
187 |
-
raise HTTPException(status_code=500, detail="Internal server error")
|
|
|
9 |
import numpy as np
|
10 |
import re
|
11 |
import Levenshtein
|
|
|
12 |
|
13 |
+
from fastapi.responses import JSONResponse
|
14 |
from fastapi.middleware.cors import CORSMiddleware
|
15 |
+
from utils import get_features
|
16 |
+
|
|
|
|
|
17 |
|
18 |
#вывод в консоль для просмотри на hugging face
|
19 |
logging.basicConfig(
|
|
|
43 |
whisper_model = whisper.load_model("tiny", download_root=cache_dir)
|
44 |
|
45 |
# загрузка параметров модели
|
46 |
+
filepath = os.path.abspath("best_model.h5")
|
47 |
if not os.path.exists(filepath):
|
48 |
raise FileNotFoundError(f"Model file not found at {filepath}")
|
49 |
|
|
|
68 |
async def read_root():
|
69 |
return {"message": "Welcome to the Defects_model API"}
|
70 |
|
|
|
71 |
model = keras.models.load_model(filepath, compile=False)
|
|
|
72 |
|
73 |
# Endpoint для сохранения аудио файлов
|
74 |
@app.post("/save-audio")
|
|
|
142 |
raise ValueError("Empty or invalid audio data.")
|
143 |
|
144 |
# Извлечение признаков из аудио
|
145 |
+
features = get_features(tmp_filename) # here data already in form
|
146 |
+
# features = np.expand_dims(features, axis=0) # Add batch dimension
|
147 |
logging.info(f"Features extracted: shape = {features.shape}")
|
148 |
|
|
|
|
|
|
|
|
|
|
|
149 |
# Получение предсказания от модели
|
150 |
+
class_weights = {0: 0.5460790960451978, 1: 1.0068333333333332, 2: 10.696369636963697}
|
151 |
prediction = model.predict(features)
|
152 |
+
prediction = prediction * class_weights
|
153 |
logging.info(f"Prediction: {prediction}")
|
154 |
|
155 |
# Транскрибация аудио с помощью Whisper
|
|
|
178 |
|
179 |
except Exception as e:
|
180 |
logging.exception(f"Error processing audio: {e}")
|
181 |
+
raise HTTPException(status_code=500, detail="Internal server error")
|
best_model.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d65809a43e770cf2498633341312aef94ad9c7940bbe1a17009503305e1d4d1e
|
3 |
+
size 20930136
|
requirements.txt
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
-
fastapi
|
2 |
-
uvicorn
|
3 |
-
torch
|
4 |
-
librosa
|
5 |
-
tensorflow
|
6 |
-
keras
|
7 |
-
python-multipart
|
8 |
-
numba
|
9 |
-
openai-whisper
|
10 |
-
Levenshtein
|
|
|
|
1 |
+
fastapi
|
2 |
+
uvicorn
|
3 |
+
torch
|
4 |
+
librosa
|
5 |
+
tensorflow
|
6 |
+
keras
|
7 |
+
python-multipart
|
8 |
+
numba
|
9 |
+
openai-whisper
|
10 |
+
Levenshtein
|
11 |
+
tensorflow_hub
|
utils.py
CHANGED
@@ -1,97 +1,59 @@
|
|
1 |
import librosa
|
2 |
import numpy as np
|
3 |
from keras import layers, models
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
model.add(layers.Dense(256, activation='relu', input_shape=input_shape))
|
22 |
-
model.add(layers.Dense(512, activation='relu', input_shape=input_shape))
|
23 |
-
model.add(layers.Dense(512, activation='relu', input_shape=input_shape))
|
24 |
-
model.add(layers.Dense(256, activation='relu', input_shape=input_shape))
|
25 |
-
model.add(layers.Dense(128, activation='relu', input_shape=input_shape))
|
26 |
-
|
27 |
-
# Output layer
|
28 |
-
model.add(layers.Dense(1, activation='sigmoid'))
|
29 |
-
|
30 |
-
return model
|
31 |
-
|
32 |
-
|
33 |
-
def get_features(path, duration=6):
|
34 |
try:
|
35 |
# Load audio file with specific duration and offset to handle silent parts
|
36 |
-
data, sample_rate = librosa.load(path, duration=
|
37 |
-
|
38 |
-
print(f"Error loading {path}: {e}")
|
39 |
-
return None # Skip the file if there's an error
|
40 |
-
|
41 |
-
# Without augmentation
|
42 |
-
res1 = extract_features(data, sample_rate)
|
43 |
-
result = np.array(res1)
|
44 |
-
|
45 |
-
# With noise
|
46 |
-
noise_data = noise(data)
|
47 |
-
res2 = extract_features(noise_data, sample_rate)
|
48 |
-
result = np.vstack((result, res2))
|
49 |
-
|
50 |
-
# Stretching and pitching
|
51 |
-
new_data = stretch(data)
|
52 |
-
data_stretch_pitch = pitch(new_data, sample_rate)
|
53 |
-
res3 = extract_features(data_stretch_pitch, sample_rate)
|
54 |
-
result = np.vstack((result, res3))
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
zcr = librosa.feature.zero_crossing_rate(y=data)
|
64 |
-
zcr = np.mean(zcr.T, axis=0)
|
65 |
-
zcr = pad_or_trim(zcr, target_shape)
|
66 |
-
result = np.hstack((result, zcr))
|
67 |
-
|
68 |
-
# Chroma_stft
|
69 |
-
stft = np.abs(librosa.stft(data))
|
70 |
-
chroma_stft = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
|
71 |
-
chroma_stft = np.mean(chroma_stft.T, axis=0)
|
72 |
-
chroma_stft = pad_or_trim(chroma_stft, target_shape)
|
73 |
-
result = np.hstack((result, chroma_stft))
|
74 |
-
|
75 |
-
# MFCC
|
76 |
-
mfcc = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=13)
|
77 |
-
mfcc = np.mean(mfcc.T, axis=0)
|
78 |
-
mfcc = pad_or_trim(mfcc, target_shape)
|
79 |
-
result = np.hstack((result, mfcc))
|
80 |
|
81 |
-
#
|
82 |
-
rms = librosa.feature.rms(y=data)
|
83 |
-
rms = np.mean(rms.T, axis=0)
|
84 |
-
rms = pad_or_trim(rms, target_shape)
|
85 |
-
result = np.hstack((result, rms))
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
92 |
|
93 |
return result
|
94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
def pad_or_trim(feature, target_shape):
|
97 |
"""Pad or trim feature array to ensure a consistent shape."""
|
@@ -99,16 +61,4 @@ def pad_or_trim(feature, target_shape):
|
|
99 |
feature = feature[:target_shape]
|
100 |
elif len(feature) < target_shape:
|
101 |
feature = np.pad(feature, (0, target_shape - len(feature)), mode='constant')
|
102 |
-
return feature
|
103 |
-
|
104 |
-
|
105 |
-
def noise(data, noise_factor=0.005):
|
106 |
-
noise_amp = noise_factor * np.random.uniform() * np.amax(data)
|
107 |
-
data = data + noise_amp * np.random.normal(size=data.shape[0])
|
108 |
-
return data
|
109 |
-
|
110 |
-
def stretch(data, rate=0.8):
|
111 |
-
return librosa.effects.time_stretch(data, rate=rate)
|
112 |
-
|
113 |
-
def pitch(data, sample_rate, pitch_factor=0.7):
|
114 |
-
return librosa.effects.pitch_shift(data, sr=sample_rate, n_steps=pitch_factor)
|
|
|
1 |
import librosa
|
2 |
import numpy as np
|
3 |
from keras import layers, models
|
4 |
+
from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization, Dense, Dropout, Reshape, Input, GlobalAveragePooling1D
|
5 |
+
from tensorflow.keras.regularizers import l2
|
6 |
+
from tensorflow.keras.models import Sequential
|
7 |
+
import tensorflow_hub as hub
|
8 |
+
import tensorflow as tf
|
9 |
+
import logging
|
10 |
+
|
11 |
+
logging.basicConfig(
|
12 |
+
level=logging.INFO,
|
13 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
14 |
+
handlers=[logging.StreamHandler()]
|
15 |
+
)
|
16 |
+
|
17 |
+
# Load the embedding model globally to avoid reloading it every time
|
18 |
+
#embedding_model = hub.load("https://www.kaggle.com/models/google/speech-embedding/TensorFlow1/speech-embedding/1")
|
19 |
+
embedding_model = tf.saved_model.load("speech-embedding",tags=[])
|
20 |
+
def get_features(path, duration=5):
    """Load an audio file and return its speech-embedding feature vector.

    The audio is loaded with librosa, padded or trimmed to a fixed length,
    resampled to 16 kHz, and passed through the module-level embedding model
    via ``extract_embeddings``.

    Args:
        path: Path of the audio file to load.
        duration: Number of seconds of audio to read, forwarded to
            ``librosa.load``.

    Returns:
        The flattened embedding array produced by ``extract_embeddings``,
        or ``None`` if loading, resampling, or embedding extraction fails.
    """
    try:
        # Skip the first 0.6 s of the recording to step over leading silence.
        data, sample_rate = librosa.load(path, duration=duration, offset=0.6)
        # NOTE(review): the target length is fixed at 5 seconds of samples
        # regardless of `duration` -- confirm this is intended.
        data = pad_or_trim(data, sample_rate * 5)

        # The embedding model needs a 16000 Hz sample rate.
        if sample_rate != 16000:
            data = librosa.resample(data, orig_sr=sample_rate, target_sr=16000)
    except Exception as e:
        logging.error(f"Error loading {path}: {e}")
        return None

    # extract_embeddings already returns the flattened 'default' output as a
    # NumPy array (or None on failure). The previous code indexed that array
    # with ['default'] again, which always raised and made this function
    # return None, and it also ran the model a second time for nothing.
    features = extract_embeddings(np.expand_dims(data, axis=0))  # add batch dimension
    if features is None:
        logging.error(f"Error extracting embedding from {path}")
        return None

    return features
|
47 |
|
48 |
+
def extract_embeddings(audio_samples):
    """Extract embeddings from audio samples.

    Args:
        audio_samples: Array of audio samples; it is converted to a tensor
            and passed to the embedding model's ``'default'`` signature.
            Presumably shaped (batch, samples) at 16 kHz -- confirm against
            the caller.

    Returns:
        The ``'default'`` output of the embedding model flattened to a 1-D
        NumPy array, or ``None`` if the model call fails.
    """
    try:
        # Convert audio samples to a tensor and run the embedding signature.
        outputs = embedding_model.signatures['default'](
            tf.convert_to_tensor(audio_samples)
        )
        return outputs['default'].numpy().flatten()
    except Exception as e:
        # Log at ERROR level with the traceback so inference failures are
        # visible in the console instead of being buried at INFO level.
        logging.exception(f"Error extracting embeddings: {e}")
        return None  # Signal failure to the caller.
|
57 |
|
58 |
def pad_or_trim(feature, target_shape):
|
59 |
"""Pad or trim feature array to ensure a consistent shape."""
|
|
|
61 |
feature = feature[:target_shape]
|
62 |
elif len(feature) < target_shape:
|
63 |
feature = np.pad(feature, (0, target_shape - len(feature)), mode='constant')
|
64 |
+
return feature
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|