invincible-jha committed on
Commit 822dda9
1 Parent(s): 78a3bb0

Upload 9 files

Files changed (9)
  1. analyzer.py +61 -0
  2. app.py +62 -0
  3. audio-processor.py +55 -0
  4. gpu-optimizer.py +30 -0
  5. model-cache.py +18 -0
  6. model-manager.py +79 -0
  7. readme.md +38 -0
  8. requirements.txt +9 -0
  9. visualizer.py +74 -0
analyzer.py ADDED
@@ -0,0 +1,61 @@
+ from .model_manager import ModelManager
+ from .audio_processor import AudioProcessor
+ from typing import Dict
+
+ class Analyzer:
+     def __init__(self, model_manager: ModelManager, audio_processor: AudioProcessor):
+         self.model_manager = model_manager
+         self.audio_processor = audio_processor
+         self.model_manager.load_models()
+
+     def analyze(self, audio_path: str) -> Dict:
+         # Process audio
+         waveform, features = self.audio_processor.process_audio(audio_path)
+
+         # Get transcription
+         transcription = self.model_manager.transcribe(waveform)
+
+         # Analyze emotions
+         emotions = self.model_manager.analyze_emotions(transcription)
+
+         # Analyze mental health indicators
+         mental_health = self.model_manager.analyze_mental_health(transcription)
+
+         # Combine analysis with audio features
+         mental_health = self._combine_analysis(mental_health, features)
+
+         return {
+             'transcription': transcription,
+             'emotions': {
+                 'scores': emotions,
+                 'dominant_emotion': max(emotions.items(), key=lambda x: x[1])[0]
+             },
+             'mental_health_indicators': mental_health,
+             'audio_features': features
+         }
+
+     def _combine_analysis(self, mental_health: Dict, features: Dict) -> Dict:
+         """Combine mental health analysis with audio features"""
+         # Heuristic adjustment of risk scores based on audio features.
+         # RMS energy is typically within [0, 1] for float audio; pitch std is in Hz,
+         # so it is scaled by an assumed ~100 Hz range before mixing.
+         energy_level = min(features['energy']['mean'], 1.0)
+         pitch_variability = min(features['pitch']['std'] / 100.0, 1.0)
+
+         # Simple risk score adjustment based on audio features, clamped to [0, 1]
+         mental_health['depression_risk'] = min(1.0, max(0.0,
+             mental_health['depression_risk'] * 0.7 +
+             (1 - energy_level) * 0.3  # Lower energy may indicate depression
+         ))
+
+         mental_health['anxiety_risk'] = min(1.0, max(0.0,
+             mental_health['anxiety_risk'] * 0.7 +
+             pitch_variability * 0.3  # Higher pitch variability may indicate anxiety
+         ))
+
+         # Add confidence scores
+         mental_health['confidence'] = {
+             'depression': 0.8,  # Example confidence scores
+             'anxiety': 0.8,
+             'stress': 0.7
+         }
+
+         return mental_health
app.py ADDED
@@ -0,0 +1,62 @@
+ import gradio as gr
+ from src.models import ModelManager, AudioProcessor, Analyzer
+ from src.utils import visualizer, GPUOptimizer, ModelCache
+
+ # Initialize components
+ optimizer = GPUOptimizer()
+ optimizer.optimize()
+
+ model_manager = ModelManager()
+ audio_processor = AudioProcessor()
+ analyzer = Analyzer(model_manager, audio_processor)
+ cache = ModelCache()
+
+ def process_audio(audio_file):
+     try:
+         # Check cache
+         with open(audio_file, 'rb') as f:
+             cache_key = cache.get_cache_key(f.read())
+
+         cached_result = cache.cache_result(cache_key, None)
+         if cached_result:
+             return cached_result
+
+         # Process audio
+         results = analyzer.analyze(audio_file)
+
+         # Format outputs
+         outputs = (
+             results['transcription'],
+             visualizer.create_emotion_plot(results['emotions']['scores']),
+             _format_indicators(results['mental_health_indicators'])
+         )
+
+         # Cache results
+         cache.cache_result(cache_key, outputs)
+
+         return outputs
+
+     except Exception as e:
+         return str(e), "Error in analysis", "Error in analysis"
+
+ def _format_indicators(indicators):
+     return f"""
+ ### Mental Health Indicators
+ - Depression Risk: {indicators['depression_risk']:.2f}
+ - Anxiety Risk: {indicators['anxiety_risk']:.2f}
+ - Stress Level: {indicators['stress_level']:.2f}
+ """
+
+ interface = gr.Interface(
+     fn=process_audio,
+     # Gradio 4.x uses `sources` (a list) rather than the old `source` argument
+     inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+     outputs=[
+         gr.Textbox(label="Transcription"),
+         gr.HTML(label="Emotion Analysis"),
+         gr.Markdown(label="Mental Health Indicators")
+     ],
+     title="Vocal Biomarker Analysis",
+     description="Analyze voice for emotional and mental health indicators"
+ )
+
+ interface.launch()
audio-processor.py ADDED
@@ -0,0 +1,55 @@
+ import librosa
+ import numpy as np
+ from typing import Dict, Tuple
+
+ class AudioProcessor:
+     def __init__(self):
+         self.sample_rate = 16000
+         self.n_mfcc = 13
+         self.n_mels = 128
+
+     def process_audio(self, audio_path: str) -> Tuple[np.ndarray, Dict]:
+         # Load and preprocess audio
+         waveform, sr = librosa.load(audio_path, sr=self.sample_rate)
+
+         # Extract features
+         features = {
+             'mfcc': self._extract_mfcc(waveform),
+             'pitch': self._extract_pitch(waveform),
+             'energy': self._extract_energy(waveform)
+         }
+
+         return waveform, features
+
+     def _extract_mfcc(self, waveform: np.ndarray) -> np.ndarray:
+         mfccs = librosa.feature.mfcc(
+             y=waveform,
+             sr=self.sample_rate,
+             n_mfcc=self.n_mfcc
+         )
+         return mfccs.mean(axis=1)
+
+     def _extract_pitch(self, waveform: np.ndarray) -> Dict:
+         f0, voiced_flag, voiced_probs = librosa.pyin(
+             waveform,
+             fmin=librosa.note_to_hz('C2'),
+             fmax=librosa.note_to_hz('C7'),
+             sr=self.sample_rate
+         )
+
+         # pyin returns NaN for unvoiced frames; guard against fully unvoiced audio
+         if np.all(np.isnan(f0)):
+             return {'mean': 0.0, 'std': 0.0, 'max': 0.0, 'min': 0.0}
+
+         return {
+             'mean': float(np.nanmean(f0)),
+             'std': float(np.nanstd(f0)),
+             'max': float(np.nanmax(f0)),
+             'min': float(np.nanmin(f0))
+         }
+
+     def _extract_energy(self, waveform: np.ndarray) -> Dict:
+         rms = librosa.feature.rms(y=waveform)[0]
+
+         return {
+             'mean': float(np.mean(rms)),
+             'std': float(np.std(rms)),
+             'max': float(np.max(rms)),
+             'min': float(np.min(rms))
+         }
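For reference, a minimal sketch of what `process_audio` returns, assuming the package layout imported by app.py; the input file name is hypothetical:

```python
from src.models import AudioProcessor

processor = AudioProcessor()
# "clip.wav" is a hypothetical recording; librosa resamples it to 16 kHz on load
waveform, features = processor.process_audio("clip.wav")

print(features['mfcc'].shape)  # (13,) -- per-coefficient mean MFCCs
print(features['pitch'])       # {'mean', 'std', 'max', 'min'} in Hz
print(features['energy'])      # RMS statistics of the waveform
```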
gpu-optimizer.py ADDED
@@ -0,0 +1,30 @@
+ import torch
+ import gc
+
+ class GPUOptimizer:
+     def __init__(self):
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     def optimize(self):
+         if torch.cuda.is_available():
+             # Clear cache
+             torch.cuda.empty_cache()
+             gc.collect()
+
+             # Set memory fraction
+             torch.cuda.set_per_process_memory_fraction(0.9)
+
+             # Enable TF32 for better performance
+             torch.backends.cuda.matmul.allow_tf32 = True
+             torch.backends.cudnn.allow_tf32 = True
+
+             # Note: mixed precision is not enabled globally here. torch autocast
+             # is a context manager and only takes effect when it wraps the
+             # actual model calls.
+
+     def get_memory_usage(self):
+         if torch.cuda.is_available():
+             return {
+                 'allocated': torch.cuda.memory_allocated() / 1024**2,  # MB
+                 'reserved': torch.cuda.memory_reserved() / 1024**2     # MB
+             }
+         return {'allocated': 0, 'reserved': 0}
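Because a bare `torch.cuda.amp.autocast(enabled=True)` statement has no effect, mixed precision has to be applied where inference actually happens. A minimal sketch, assuming a loaded `model` and tokenized `inputs` (both hypothetical here):

```python
import torch

def run_inference(model, inputs):
    # autocast only applies as a context manager around the forward pass;
    # it is disabled when CUDA is unavailable.
    device_type = "cuda" if torch.cuda.is_available() else "cpu"
    with torch.autocast(device_type=device_type, enabled=device_type == "cuda"):
        with torch.no_grad():
            return model(**inputs)
```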
model-cache.py ADDED
@@ -0,0 +1,18 @@
+ import hashlib
+ from collections import OrderedDict
+
+ class ModelCache:
+     def __init__(self, cache_size=128):
+         self.cache_size = cache_size
+         self._cache = OrderedDict()
+
+     def cache_result(self, input_key, result):
+         # Look up when result is None; otherwise store and return the result
+         if result is None:
+             return self._cache.get(input_key)
+         if len(self._cache) >= self.cache_size:
+             self._cache.popitem(last=False)  # evict the oldest entry
+         self._cache[input_key] = result
+         return result
+
+     def get_cache_key(self, audio_data):
+         # Create hash of audio data for cache key
+         return hashlib.md5(audio_data).hexdigest()
+
+     def clear_cache(self):
+         self._cache.clear()
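The lookup/store contract that app.py relies on, shown in isolation; the file path and placeholder outputs below are hypothetical:

```python
from src.utils import ModelCache

cache = ModelCache(cache_size=64)

with open("recording.wav", "rb") as f:   # hypothetical audio file
    key = cache.get_cache_key(f.read())

hit = cache.cache_result(key, None)      # lookup: None means a cache miss
if hit is None:
    outputs = ("transcript", "<div>emotion plot</div>", "indicators")  # placeholder results
    cache.cache_result(key, outputs)     # store for the next request
```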
model-manager.py ADDED
@@ -0,0 +1,79 @@
+ from transformers import (
+     WhisperProcessor, WhisperForConditionalGeneration,
+     AutoModelForSequenceClassification, AutoTokenizer
+ )
+ import torch
+
+ class ModelManager:
+     def __init__(self):
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.models = {}
+         self.tokenizers = {}
+         self.processors = {}
+
+     def load_models(self):
+         # Load Whisper for speech recognition
+         self.processors['whisper'] = WhisperProcessor.from_pretrained("openai/whisper-base")
+         self.models['whisper'] = WhisperForConditionalGeneration.from_pretrained(
+             "openai/whisper-base"
+         ).to(self.device)
+
+         # Load EmoRoBERTa for emotion detection
+         self.tokenizers['emotion'] = AutoTokenizer.from_pretrained("arpanghoshal/EmoRoBERTa")
+         self.models['emotion'] = AutoModelForSequenceClassification.from_pretrained(
+             "arpanghoshal/EmoRoBERTa"
+         ).to(self.device)
+
+         # Load ClinicalBERT for analysis. Bio_ClinicalBERT ships without a fine-tuned
+         # classification head, so a 3-label head is initialized here; its scores are
+         # placeholders until the head is trained.
+         self.tokenizers['clinical'] = AutoTokenizer.from_pretrained(
+             "emilyalsentzer/Bio_ClinicalBERT"
+         )
+         self.models['clinical'] = AutoModelForSequenceClassification.from_pretrained(
+             "emilyalsentzer/Bio_ClinicalBERT",
+             num_labels=3
+         ).to(self.device)
+
+     def transcribe(self, audio_input):
+         inputs = self.processors['whisper'](
+             audio_input,
+             sampling_rate=16000,  # matches AudioProcessor.sample_rate
+             return_tensors="pt"
+         ).input_features.to(self.device)
+
+         generated_ids = self.models['whisper'].generate(inputs)
+         transcription = self.processors['whisper'].batch_decode(
+             generated_ids,
+             skip_special_tokens=True
+         )[0]
+         return transcription
+
+     def analyze_emotions(self, text):
+         inputs = self.tokenizers['emotion'](
+             text,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=512
+         ).to(self.device)
+
+         outputs = self.models['emotion'](**inputs)
+         probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+
+         # Use the model's own label mapping rather than a hard-coded emotion list
+         labels = self.models['emotion'].config.id2label
+         return {labels[i]: float(prob) for i, prob in enumerate(probs[0])}
+
+     def analyze_mental_health(self, text):
+         inputs = self.tokenizers['clinical'](
+             text,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=512
+         ).to(self.device)
+
+         outputs = self.models['clinical'](**inputs)
+         scores = torch.sigmoid(outputs.logits)
+
+         return {
+             'depression_risk': float(scores[0][0]),
+             'anxiety_risk': float(scores[0][1]),
+             'stress_level': float(scores[0][2])
+         }
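A short sketch of exercising `ModelManager` on its own, assuming a 16 kHz mono recording loaded with librosa; the file path is hypothetical and the first call downloads the model weights:

```python
import librosa
from src.models import ModelManager

manager = ModelManager()
manager.load_models()  # loads Whisper, EmoRoBERTa and Bio_ClinicalBERT

waveform, _ = librosa.load("clip.wav", sr=16000)  # hypothetical recording
text = manager.transcribe(waveform)
emotions = manager.analyze_emotions(text)

print(text)
print(sorted(emotions.items(), key=lambda kv: kv[1], reverse=True)[:3])  # top-3 emotions
```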
readme.md ADDED
@@ -0,0 +1,38 @@
+ ---
+ title: Vocal Biomarker Analysis
+ emoji: 🎤
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 4.12.0
+ python_version: 3.10
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ # Vocal Biomarker Analysis
+
+ This application analyzes voice recordings to detect emotional and mental health indicators using AI models.
+
+ ## Features
+ - Speech-to-text transcription
+ - Emotion detection
+ - Mental health risk assessment
+ - Real-time visualization
+
+ ## Models
+ - Whisper Base (Speech Recognition)
+ - EmoRoBERTa (Emotion Detection)
+ - ClinicalBERT (Analysis)
+
+ ## Usage
+ 1. Record audio or upload a file
+ 2. Click Analyze
+ 3. View the results:
+    - Transcription
+    - Emotion analysis
+    - Mental health indicators
+
+ ## License
+ MIT License
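Outside the Gradio UI, the same pipeline can be driven directly. A minimal sketch, assuming the module layout imported by app.py; the WAV path is hypothetical:

```python
from src.models import ModelManager, AudioProcessor, Analyzer

model_manager = ModelManager()
audio_processor = AudioProcessor()
analyzer = Analyzer(model_manager, audio_processor)  # loads all models on construction

results = analyzer.analyze("sample_recording.wav")   # hypothetical mono recording
print(results['transcription'])
print(results['emotions']['dominant_emotion'])
print(results['mental_health_indicators'])
```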
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio==4.12.0
+ torch==2.1.0
+ transformers==4.36.0
+ librosa==0.10.1
+ numpy==1.24.3
+ plotly==5.18.0
+ scipy==1.11.3
+ soundfile==0.12.1
+ pandas==2.1.1
visualizer.py ADDED
@@ -0,0 +1,74 @@
+ import plotly.graph_objects as go
+ from typing import Dict
+
+ def create_emotion_plot(emotions: Dict[str, float]) -> str:
+     """Create emotion distribution plot"""
+     fig = go.Figure()
+
+     # Add bar plot
+     fig.add_trace(go.Bar(
+         x=list(emotions.keys()),
+         y=list(emotions.values()),
+         marker_color='rgb(55, 83, 109)'
+     ))
+
+     # Update layout
+     fig.update_layout(
+         title='Emotion Distribution',
+         xaxis_title='Emotion',
+         yaxis_title='Score',
+         yaxis_range=[0, 1],
+         template='plotly_white',
+         height=400
+     )
+
+     return fig.to_html(include_plotlyjs=True)
+
+ def create_pitch_plot(pitch_data: Dict) -> str:
+     """Create pitch analysis plot"""
+     fig = go.Figure()
+
+     # Add box plot
+     fig.add_trace(go.Box(
+         y=[pitch_data['min'], pitch_data['mean'], pitch_data['max']],
+         name='Pitch Distribution',
+         boxpoints='all'
+     ))
+
+     # Update layout
+     fig.update_layout(
+         title='Pitch Analysis',
+         yaxis_title='Frequency (Hz)',
+         template='plotly_white',
+         height=400
+     )
+
+     return fig.to_html(include_plotlyjs=True)
+
+ def create_energy_plot(energy_data: Dict) -> str:
+     """Create energy analysis plot"""
+     fig = go.Figure()
+
+     # Add indicator
+     fig.add_trace(go.Indicator(
+         mode='gauge+number',
+         value=energy_data['mean'],
+         title={'text': 'Voice Energy Level'},
+         gauge={
+             'axis': {'range': [0, 1]},
+             'bar': {'color': 'darkblue'},
+             'steps': [
+                 {'range': [0, 0.3], 'color': 'lightgray'},
+                 {'range': [0.3, 0.7], 'color': 'gray'},
+                 {'range': [0.7, 1], 'color': 'darkgray'}
+             ]
+         }
+     ))
+
+     # Update layout
+     fig.update_layout(
+         height=300,
+         template='plotly_white'
+     )
+
+     return fig.to_html(include_plotlyjs=True)
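app.py only renders `create_emotion_plot`; a sketch of how the pitch and energy figures could also be surfaced, assuming the `audio_features` dictionary returned by `Analyzer.analyze` (the extra HTML outputs are an assumption, not part of this commit):

```python
from src.utils import visualizer

def build_plots(results):
    # results is the dictionary returned by Analyzer.analyze
    features = results['audio_features']
    return (
        visualizer.create_emotion_plot(results['emotions']['scores']),
        visualizer.create_pitch_plot(features['pitch']),
        visualizer.create_energy_plot(features['energy']),
    )

# In the Gradio interface these would map onto three gr.HTML outputs, e.g.
# gr.HTML(label="Emotions"), gr.HTML(label="Pitch"), gr.HTML(label="Energy").
```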