yufii committed
Commit 476003d · Parent: d1dcb47

added comments

Files changed (10)
  1. .gitignore +2 -1
  2. __pycache__/app.cpython-311.pyc +0 -0
  3. app.py +39 -13
  4. audio.mp3 +0 -0
  5. forms.py +0 -9
  6. main.py +0 -85
  7. models.py +0 -56
  8. server.log +0 -0
  9. test.py +0 -62
  10. test_audio.mp3 +0 -0
.gitignore CHANGED
@@ -1,4 +1,5 @@
 venv
 ol.py
 server.zip
-whisper_cahce
+whisper_cahce
+others
__pycache__/app.cpython-311.pyc CHANGED
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
 
app.py CHANGED
@@ -17,16 +17,20 @@ from utils import (
     pad_or_trim,
 )

+# Console logging so the output is visible on Hugging Face
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
     handlers=[logging.StreamHandler()]
 )

+# Set a temporary directory for the Numba cache
 os.environ['NUMBA_CACHE_DIR'] = '/tmp'

+# Initialize the FastAPI application
 app = FastAPI(port=8000)

+# Configure CORS (Cross-Origin Resource Sharing) to handle requests from other domains
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -35,18 +39,22 @@ app.add_middleware(
     allow_headers=["*"],
 )

-filepath = os.path.abspath("cnn_1_v6_final_model.h5")
-if not os.path.exists(filepath):
-    raise FileNotFoundError(f"Model file not found at {filepath}")
-
+# Initialize and load the Whisper model for speech recognition
 cache_dir = "/tmp/whisper_cache"
 os.makedirs(cache_dir, exist_ok=True)
 whisper_model = whisper.load_model("tiny", download_root=cache_dir)

-model = keras.models.load_model(filepath, compile=False)
+# Load the model parameters
+filepath = os.path.abspath("cnn_1_v6_final_model.h5")
+if not os.path.exists(filepath):
+    raise FileNotFoundError(f"Model file not found at {filepath}")

+# Context manager for temporary audio files
 @contextmanager
 def temporary_audio_file(audio_bytes):
+    """
+    Creates a temporary file to hold the audio data and automatically removes it after use
+    """
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         tmp_file.write(audio_bytes)
         tmp_file.flush()
@@ -57,20 +65,21 @@ def temporary_audio_file(audio_bytes):
         if os.path.exists(tmp_filename):
             os.remove(tmp_filename)

+# Root endpoint
 @app.get("/")
 async def read_root():
     return {"message": "Welcome to the Defects_model API"}

-filepath = os.path.abspath("cnn_1_v6_final_model.h5")
-if not os.path.exists(filepath):
-    raise FileNotFoundError(f"Model file not found at {filepath}")

 model = keras.models.load_model(filepath, compile=False)
 target_shape = (32, 200)

-
+# Endpoint for saving audio files
 @app.post("/save-audio")
 async def save_audio(file: UploadFile = File(...)):
+    """
+    Handler for saving uploaded audio files
+    """
     if not file.content_type.startswith("audio/"):
         raise HTTPException(status_code=400, detail="Invalid file type")

@@ -87,27 +96,36 @@ async def save_audio(file: UploadFile = File(...)):
     except Exception as e:
         return JSONResponse(content={"error": str(e)}, status_code=500)

+# Path for the log file
 log_file_path = os.path.join("/tmp", "server.log")

+# Logging configuration for monitoring the server
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
     handlers=[logging.StreamHandler()]
 )

-
-
+# Main endpoint for audio processing
 @app.post("/process-audio")
 async def process_audio(
     audio: UploadFile = File(...),
     phrase: str = Form(...)
 ):
+    """
+    Main handler for analyzing audio files:
+    - Makes a prediction with the model
+    - Runs the audio through openai-whisper to check the phrase
+    - Compares the recognized text with the expected phrase
+    """
+    # Check the file format
     if audio.content_type != "audio/mpeg":
         raise HTTPException(
             status_code=400, detail="Invalid file type. Only MP3 files are supported."
         )

     try:
+        # Read the audio file
         audio_bytes = await audio.read()

         if not audio_bytes:
@@ -115,9 +133,11 @@ async def process_audio(

         logging.info(f"Received audio bytes: {len(audio_bytes)} bytes")

+        # Process the audio in a temporary file
         with temporary_audio_file(audio_bytes) as tmp_filename:
             logging.info(f"Temporary file created: {tmp_filename}")

+            # Load the audio data
             audio_data, sample_rate = librosa.load(tmp_filename, sr=None)
             logging.info(
                 f"Audio loaded: sample rate = {sample_rate}, data shape = {audio_data.shape}"
@@ -125,32 +145,38 @@ async def process_audio(
             if not audio_data.any() or sample_rate == 0:
                 raise ValueError("Empty or invalid audio data.")

+            # Extract features from the audio
             features = extract_features(audio_data, sample_rate)
             logging.info(f"Features extracted: shape = {features.shape}")

+            # Prepare the data for the model
             target_shape = (1, model.input_shape[1])
             features = pad_or_trim(features, target_shape[1])
             features = np.expand_dims(features, axis=0)

+            # Get the model's prediction
             prediction = model.predict(features)
             logging.info(f"Prediction: {prediction}")

+            # Transcribe the audio with Whisper
             transcription_result = whisper_model.transcribe(tmp_filename, language="russian")
             transcribed_text = transcription_result["text"].lower().strip()

-            # Remove punctuation from the transcribed text
+            # Clean up the transcribed text
             transcribed_text_clean = re.sub(r'[^\w\s]', '', transcribed_text)
             logging.info(f"Transcribed text (cleaned): {transcribed_text_clean}")

-            # Compute the edit distance
+            # Compare with the expected phrase
             lev_distance = Levenshtein.distance(transcribed_text_clean, phrase.lower().strip())
             phrase_length = max(len(transcribed_text_clean), len(phrase))

+            # Determine the acceptable Levenshtein distance
             max_acceptable_distance = 0.5 * phrase_length
             match_phrase = lev_distance <= max_acceptable_distance

             logging.info(f"Expected phrase: {phrase}, Is correct: {match_phrase}, Transcribed text: {transcribed_text_clean}, Levenshtein distance: {lev_distance}")

+            # Return the results
             return {
                 "prediction": prediction.tolist(),
                 "match_phrase": match_phrase
audio.mp3 DELETED
Binary file (190 kB)
 
forms.py DELETED
@@ -1,9 +0,0 @@
1
- from pydantic import BaseModel
2
-
3
- class UserRegistration(BaseModel):
4
- login: str
5
- password: str
6
-
7
- class UserLoginForm(BaseModel):
8
- login: str
9
- password: str
 
main.py DELETED
@@ -1,85 +0,0 @@
1
- from fastapi import FastAPI, File, UploadFile, HTTPException
2
- from models import User, Course, connection
3
- from forms import UserRegistration, UserLoginForm
4
- from fastapi.responses import JSONResponse
5
- from utils import create_cnn_model, get_features, extract_features, pad_or_trim, noise, stretch, pitch
6
- from peewee import *
7
- import numpy as np
8
- import tensorflow as tf
9
- import keras
10
- import requests
11
- import io
12
- import os
13
-
14
- from fastapi.middleware.cors import CORSMiddleware
15
-
16
- app = FastAPI()
17
-
18
- app.add_middleware(
19
- CORSMiddleware,
20
- allow_origins=["*"],
21
- allow_credentials=True,
22
- allow_methods=["*"],
23
- allow_headers=["*"],
24
- )
25
-
26
- UPLOAD_DIR = 'audio'
27
- os.makedirs(UPLOAD_DIR, exist_ok=True)
28
-
29
- MODEL_SERVER_URL = "http://model-server-url/predict"
30
-
31
- @app.post("/save-audio")
32
- async def save_audio(file: UploadFile = File(...)):
33
- if not file.content_type.startswith('audio/'):
34
- raise HTTPException(status_code=400, detail="Invalid file type")
35
-
36
- file_path = os.path.join(UPLOAD_DIR, file.filename)
37
-
38
- try:
39
- with open(file_path, "wb") as f:
40
- content = await file.read()
41
- f.write(content)
42
- return JSONResponse(content={"message": "File saved successfully", "filePath": file_path}, status_code=200)
43
- except Exception as e:
44
- return JSONResponse(content={"error": str(e)}, status_code=500)
45
-
46
-
47
- model = tf.keras.models.load_model("cnn_1_v6_final_model.keras", compile=False)
48
-
49
- @app.post("/process-audio")
50
- async def process_audio(audio: UploadFile = File(...)):
51
- if audio.content_type != "audio/mpeg":
52
- raise HTTPException(status_code=400, detail="Invalid file type. Please upload an MP3 file.")
53
-
54
- audio_bytes = await audio.read()
55
-
56
- features = get_features(audio_bytes)
57
-
58
- if features is None:
59
- raise HTTPException(status_code=400, detail="Invalid audio file. Please upload a valid MP3 file.")
60
-
61
- prediction = model.predict(np.expand_dims(features, axis=0))
62
-
63
- return {"prediction": prediction}
64
-
65
-
66
- '''
67
- @router.post("/login")
68
- async def login(user_data: UserLoginForm):
69
- user = User.get(User.login == user_data.login)
70
- if not user or user_data.password != user.password:
71
- return {"message": "Invalid login or password"}
72
- token_content = {"user_id": user.user_id}
73
- jwt_token = jwt.encode(token_content, SECRET_KEY, algorithm=ALGORITHM)
74
- return {"token": jwt_token}
75
-
76
-
77
- @router.post("/registration")
78
- async def registration(user_data: UserRegistration):
79
- try:
80
- new_user = User.create(login=user_data.login, password=user_data.password)
81
- new_user.save()
82
- return {"message": "User registered successfully"}
83
- except IntegrityError:
84
- return {"message": "User with this login already exists"}
85
- '''
 
models.py DELETED
@@ -1,56 +0,0 @@
1
- from peewee import *
2
-
3
- connection = SqliteDatabase('database.db')
4
-
5
-
6
-
7
- class BaseModel(Model):
8
- class Meta:
9
- database = connection
10
-
11
- class User(BaseModel):
12
- user_id = AutoField()
13
- login = CharField(unique=True)
14
- password = CharField()
15
-
16
- class Meta:
17
- db_table = 'Users'
18
- order_by = ('user_id',)
19
-
20
-
21
- class Course(BaseModel):
22
- course_id = AutoField()
23
- name = CharField()
24
- progress = IntegerField()
25
-
26
- class Meta:
27
- db_table = 'Courses'
28
- order_by = ('course_id',)
29
- from peewee import *
30
-
31
- connection = SqliteDatabase('database.db')
32
-
33
-
34
-
35
- class BaseModel(Model):
36
- class Meta:
37
- database = connection
38
-
39
- class User(BaseModel):
40
- user_id = AutoField()
41
- login = CharField(unique=True)
42
- password = CharField()
43
-
44
- class Meta:
45
- db_table = 'Users'
46
- order_by = ('user_id',)
47
-
48
-
49
- class Course(BaseModel):
50
- course_id = AutoField()
51
- name = CharField()
52
- progress = IntegerField()
53
-
54
- class Meta:
55
- db_table = 'Courses'
56
- order_by = ('course_id',)
 
server.log DELETED
File without changes
test.py DELETED
@@ -1,62 +0,0 @@
1
- import os
2
- import numpy as np
3
- import keras
4
- import httpx
5
- import librosa
6
- import whisper
7
-
8
- from utils import (
9
- extract_features,
10
- pad_or_trim,
11
- )
12
-
13
- def test_get_answer(audio_file_path: str):
14
- url = "http://127.0.0.1:8000/process-audio"
15
- headers = {
16
- "accept": "application/json",
17
- }
18
-
19
- with open(audio_file_path, "rb") as audio_file:
20
- files = {
21
- "audio": ("test.mp3", audio_file, "audio/mp3")
22
- }
23
- response = httpx.post(url, headers=headers, files=files)
24
- print("Status Code:", response.status_code)
25
- print("Response JSON:", response.json())
26
-
27
-
28
- audio_file_path = "test_audio.mp3"
29
- if not os.path.exists(audio_file_path):
30
- raise FileNotFoundError(f"Audio file not found at {audio_file_path}")
31
-
32
- audio_data, sample_rate = librosa.load(audio_file_path)
33
-
34
- features = extract_features(audio_data, sample_rate)
35
-
36
- target_shape = (32, 200)
37
- features = pad_or_trim(features, target_shape[1])
38
-
39
-
40
- features = np.expand_dims(features, axis=0)
41
-
42
- filepath = os.path.abspath("cnn_1_v6_final_model.h5")
43
- if not os.path.exists(filepath):
44
- raise FileNotFoundError(f"Model file not found at {filepath}")
45
-
46
- model = keras.models.load_model(filepath, compile=False)
47
-
48
-
49
- prediction = model.predict(features)
50
- print(f"Prediction: {prediction.tolist()}")
51
-
52
-
53
-
54
- def transcribe_russian(audio_file, model_name="tiny"):
55
- model = whisper.load_model(model_name)
56
- result = model.transcribe(audio_file, language="russian")
57
- return result["text"]
58
-
59
- # Example usage:
60
- audio_file = "audio.mp3"
61
- text = transcribe_russian(audio_file)
62
- print(text)
 
test_audio.mp3 DELETED
Binary file (2.71 kB)