Boltz79 committed on
Commit 786ea23 · verified · 1 Parent(s): 7ac7ed0

Update app.py

Files changed (1)
  1. app.py +45 -160
app.py CHANGED
@@ -1,168 +1,53 @@
  import gradio as gr
- import numpy as np
- import torch
- from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
- import librosa
- import os
- import warnings
- warnings.filterwarnings("ignore")

- class EmotionRecognizer:
-     def __init__(self):
-         # Initialize the model and feature extractor
-         self.model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
-         self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
-         self.feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_name)
-         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-         self.model.to(self.device)
-         self.sample_rate = 16000
-
-         # Define emotion labels
-         self.labels = ['angry', 'happy', 'sad', 'neutral', 'fearful']
-
-     def process_audio(self, audio):
-         """Process audio and return emotions with confidence scores"""
-         try:
-             # Check if audio is a tuple (new Gradio audio format)
-             if isinstance(audio, tuple):
-                 sample_rate, audio_data = audio
-             else:
-                 return "Error: Invalid audio format", None
-
-             # Resample if necessary
-             if sample_rate != self.sample_rate:
-                 audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=self.sample_rate)
-
-             # Convert to float32 if not already
-             audio_data = audio_data.astype(np.float32)
-
-             # Extract features
-             inputs = self.feature_extractor(
-                 audio_data,
-                 sampling_rate=self.sample_rate,
-                 return_tensors="pt",
-                 padding=True
-             ).to(self.device)
-
-             # Get model predictions
-             with torch.no_grad():
-                 outputs = self.model(**inputs)
-                 predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
-
-             # Process results
-             scores = predictions[0].cpu().numpy()
-             results = [
-                 {"label": label, "score": float(score)}
-                 for label, score in zip(self.labels, scores)
-             ]
-
-             # Sort by confidence
-             results.sort(key=lambda x: x["score"], reverse=True)

-             # Format results for display
-             output_text = "Emotion Analysis Results:\n\n"
-             output_text += "\n".join([
-                 f"{result['label'].title()}: {result['score']*100:.2f}%"
-                 for result in results
-             ])

-             # Prepare plot data
-             plot_data = {
-                 "labels": [r["label"].title() for r in results],
-                 "values": [r["score"] * 100 for r in results]
-             }
-
-             return output_text, plot_data
-
-         except Exception as e:
-             return f"Error processing audio: {str(e)}", None
-
- def create_interface():
-     # Initialize the emotion recognizer
-     recognizer = EmotionRecognizer()
-
-     # Define processing function for Gradio
-     def process_audio_file(audio):
-         if audio is None:
-             return "Please provide an audio input.", None
-
-         output_text, plot_data = recognizer.process_audio(audio)
-
-         if plot_data is not None:
-             return (
-                 output_text,
-                 gr.BarPlot.update(
-                     value=plot_data,
-                     x="labels",
-                     y="values",
-                     title="Emotion Confidence Scores",
-                     x_title="Emotions",
-                     y_title="Confidence (%)"
-                 )
-             )
-         return output_text, None
-
-     # Create the Gradio interface
-     with gr.Blocks(title="Audio Emotion Recognition") as interface:
-         gr.Markdown("# 🎭 Audio Emotion Recognition")
-         gr.Markdown("""
-         Upload an audio file or record directly to analyze the emotional content.
-         The model will detect emotions like angry, happy, sad, neutral, and fearful.
-         """)
-
-         with gr.Row():
-             with gr.Column():
-                 # Input audio component (updated format)
-                 audio_input = gr.Audio(
-                     label="Upload or Record Audio",
-                     type="numpy",
-                     sources=["microphone", "upload"]
-                 )
-
-                 # Process button
-                 process_btn = gr.Button("Analyze Emotion", variant="primary")
-
-             with gr.Column():
-                 # Output components
-                 output_text = gr.Textbox(
-                     label="Analysis Results",
-                     lines=6
-                 )
-                 output_plot = gr.BarPlot(
-                     title="Emotion Confidence Scores",
-                     x_title="Emotions",
-                     y_title="Confidence (%)"
-                 )

-         # Set up event handler
-         process_btn.click(
-             fn=process_audio_file,
-             inputs=[audio_input],
-             outputs=[output_text, output_plot]
-         )

-         gr.Markdown("""
-         ### Usage Instructions:
-         1. Click the microphone button to record audio or upload an audio file
-         2. Click "Analyze Emotion" to process the audio
-         3. View the results and confidence scores
-
-         ### Notes:
-         - For best results, ensure clear audio with minimal background noise
-         - Speak naturally and clearly when recording
-         - The model works best with speech in English
-         """)
-
-     return interface
-
- def main():
-     # Create and launch the interface
-     interface = create_interface()
-     interface.launch(
-         share=True,
-         server_name="0.0.0.0",
-         server_port=7860
      )

- if __name__ == "__main__":
-     main()
  import gradio as gr
+ from transformers import pipeline

+ # Load Whisper for speech-to-text
+ whisper = pipeline("automatic-speech-recognition", model="openai/whisper-medium")

+ # Load a sentiment analysis model
+ sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

+ # Function to process audio and analyze tone
+ def analyze_call(audio_file):
+     try:
+         # Step 1: Transcribe audio to text using Whisper
+         transcription = whisper(audio_file)["text"]

+         # Step 2: Analyze sentiment of the transcription
+         sentiment_result = sentiment_analyzer(transcription)[0]

+         # Prepare the output
+         output = {
+             "transcription": transcription,
+             "sentiment": sentiment_result["label"],
+             "confidence": round(sentiment_result["score"], 4)
+         }
+         return output
+     except Exception as e:
+         return {"error": str(e)}
+
+ # Gradio Interface
+ def gradio_interface(audio):
+     if audio is None:
+         return "Please record or upload an audio file."
+     result = analyze_call(audio)
+     if "error" in result:
+         return f"Error: {result['error']}"
+     return (
+         f"**Transcription:** {result['transcription']}\n\n"
+         f"**Sentiment:** {result['sentiment']}\n\n"
+         f"**Confidence:** {result['confidence']}"
      )

+ # Create Gradio app
+ interface = gr.Interface(
+     fn=gradio_interface,
+     inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
+     outputs=gr.Textbox(label="Analysis Result", lines=5),
+     title="Real-Time Call Analysis",
+     description="Record or upload audio to analyze tone and sentiment in real time.",
+     live=False  # Set to False to avoid constant re-runs
+ )
+
+ # Launch the app
+ interface.launch()
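
A minimal sketch of driving the new pipeline directly, without the Gradio UI, may help when smoke-testing this commit. It assumes the same checkpoints that app.py loads, that ffmpeg is available for Whisper's audio decoding, and uses a hypothetical local file name (sample_call.wav); note that the cardiffnlp/twitter-roberta-base-sentiment checkpoint typically reports coarse labels such as LABEL_0/LABEL_1/LABEL_2, so the sketch maps them to readable names.

# Sketch only: exercises the same checkpoints that app.py loads, outside the Gradio UI.
# "sample_call.wav" is a hypothetical path; ffmpeg must be installed for Whisper decoding.
from transformers import pipeline

whisper = pipeline("automatic-speech-recognition", model="openai/whisper-medium")
sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

# Assumption: this checkpoint reports LABEL_0/LABEL_1/LABEL_2 rather than named classes.
LABEL_NAMES = {"LABEL_0": "negative", "LABEL_1": "neutral", "LABEL_2": "positive"}

transcription = whisper("sample_call.wav")["text"]
sentiment = sentiment_analyzer(transcription)[0]

print("Transcription:", transcription)
print("Sentiment:", LABEL_NAMES.get(sentiment["label"], sentiment["label"]))
print("Confidence:", round(sentiment["score"], 4))

On CPU, openai/whisper-medium is slow to download and run; a smaller checkpoint such as openai/whisper-small may be a more practical choice for quick checks.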