import os

# Backend/env flags must be set before keras and tensorflow are imported
os.environ["KERAS_BACKEND"] = "jax"
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import io
import logging
import traceback

import numpy as np
import librosa
import tensorflow_hub as hub
from flask import Flask, render_template, request, jsonify, session
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import keras
import torch
# Configure logging to print to the terminal only
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)
class AudioProcessor:
    """Singleton that loads every model exactly once per process."""
    _instance = None
    _initialized = False

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        if not AudioProcessor._initialized:
            # Use the GPU with float16 when available; fall back to CPU/float32
            self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
            self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            self.initialize_models()
            AudioProcessor._initialized = True
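    # Note: thanks to the __new__/_initialized pattern above, repeated
    # construction returns the same fully-initialized object:
    #   AudioProcessor() is AudioProcessor()  # -> True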
    def initialize_models(self):
        try:
            logger.info("Initializing models...")

            # Transcription model: Distil-Whisper large-v3
            model_id = "distil-whisper/distil-large-v3"
            self.transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
            )
            self.transcription_model.to(self.device)
            self.processor = AutoProcessor.from_pretrained(model_id)

            # Classification model: Keras model hosted on the Hugging Face Hub
            self.classification_model = keras.saving.load_model(
                "hf://datasciencesage/attentionaudioclassification"
            )

            # ASR pipeline with chunked long-form decoding (25 s chunks, 16 per batch)
            self.pipe = pipeline(
                "automatic-speech-recognition",
                model=self.transcription_model,
                tokenizer=self.processor.tokenizer,
                feature_extractor=self.processor.feature_extractor,
                max_new_tokens=128,
                chunk_length_s=25,
                batch_size=16,
                torch_dtype=self.torch_dtype,
                device=self.device,
            )

            # YAMNet model for audio embeddings
            self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
            logger.info("Models initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            raise
    def load_wav_16k_mono(self, audio_data):
        try:
            # Decode from an in-memory bytes buffer rather than a file on disk
            wav, sr = librosa.load(io.BytesIO(audio_data), mono=True, sr=None)
            if sr != 16000:
                wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
            return wav
        except Exception as e:
            logger.error(f"Error loading audio data: {str(e)}")
            raise
    def get_features_yamnet_extract_embedding(self, wav_data):
        try:
            # YAMNet returns (scores, embeddings, spectrogram); average the
            # per-frame embeddings into a single clip-level vector
            scores, embeddings, spectrogram = self.yamnet_model(wav_data)
            return np.mean(embeddings.numpy(), axis=0)
        except Exception as e:
            logger.error(f"Error extracting YAMNet embeddings: {str(e)}")
            raise
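
# Shape sketch of the classify path (illustrative, matching the code above):
# YAMNet emits one 1024-dim embedding per audio frame; np.mean over axis 0
# collapses these into a single (1024,) clip vector, which is reshaped to
# (1, 1024) before being passed to classification_model.predict in /upload.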
# Initialize Flask application
app = Flask(__name__)
# Read the session secret from the environment, with an explicitly-unsafe
# development fallback instead of a hard-coded production value
app.secret_key = os.environ.get('SECRET_KEY', 'dev-only-insecure-key')
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # Cap uploads at 16 MB

# Initialize audio processor (happens only once thanks to the singleton)
audio_processor = AudioProcessor()
@app.route('/')
def index():
    session.clear()
    return render_template('terminal.html')
@app.route('/process', methods=['POST'])
def process():
    try:
        # Tolerate missing or malformed JSON bodies instead of raising
        data = request.get_json(silent=True) or {}
        command = data.get('command', '').strip().lower()
        if command in ['classify', 'transcribe']:
            session['operation'] = command
            return jsonify({
                'result': f'root@math:~$ Upload a .mp3 file for {command} operation.',
                'upload': True
            })
        else:
            return jsonify({
                'result': 'root@math:~$ Please specify an operation: "classify" or "transcribe".'
            })
    except Exception as e:
        logger.error(f"Error in process route: {str(e)}\n{traceback.format_exc()}")
        session.pop('operation', None)
        return jsonify({'result': f'root@math:~$ Error: {str(e)}'})
@app.route('/upload', methods=['POST'])
def upload():
    try:
        operation = session.get('operation')
        if not operation:
            return jsonify({
                'result': 'root@math:~$ Please specify an operation first: "classify" or "transcribe".'
            })
        if 'file' not in request.files:
            return jsonify({'result': 'root@math:~$ No file uploaded.'})
        file = request.files['file']
        if file.filename == '' or not file.filename.lower().endswith('.mp3'):
            return jsonify({'result': 'root@math:~$ Please upload a valid .mp3 file.'})

        # Read the upload into memory; nothing is written to disk
        audio_data = file.read()
        wav_data = audio_processor.load_wav_16k_mono(audio_data)

        if operation == 'classify':
            embeddings = audio_processor.get_features_yamnet_extract_embedding(wav_data)
            embeddings = np.reshape(embeddings, (-1, 1024))
            result = int(np.argmax(audio_processor.classification_model.predict(embeddings)))
        elif operation == 'transcribe':
            # The ASR pipeline accepts raw bytes (decoded via ffmpeg), not a
            # BytesIO object, so pass the bytes directly
            result = audio_processor.pipe(audio_data)['text']
        else:
            result = 'Invalid operation'

        return jsonify({
            'result': f'root@math:~$ Result is: {result}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".',
            'upload': False
        })
    except Exception as e:
        logger.error(f"Error in upload route: {str(e)}\n{traceback.format_exc()}")
        return jsonify({
            'result': f'root@math:~$ Error: {str(e)}\nroot@math:~$ Please specify an operation: "classify" or "transcribe".'
        })
    finally:
        session.pop('operation', None)
# Uncomment to run the Flask development server directly (e.g. for local testing):
# if __name__ == '__main__':
#     app.run(host='0.0.0.0', port=7860)
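
# A minimal sketch of exercising the two-step API with curl (assumes the app
# is listening locally on port 7860; adjust host/port and the sample filename
# as needed). The chosen operation lives in the session, so the cookie jar
# must be shared between the two requests:
#
#   curl -c cookies.txt -H 'Content-Type: application/json' \
#        -d '{"command": "transcribe"}' http://localhost:7860/process
#   curl -b cookies.txt -F 'file=@sample.mp3' http://localhost:7860/upload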