Hasti11 committed on
Commit 90b19b1 · verified · 1 Parent(s): 0d11193

Create app.py

Files changed (1)
  1. app.py +190 -0
app.py ADDED
@@ -0,0 +1,190 @@
import os

import cv2
import gradio as gr
import moviepy.editor as mp
import numpy as np
import speech_recognition as sr
from groq import Groq
from gtts import gTTS
from PIL import Image
from transformers import pipeline

# Read the Groq API key from the environment instead of hard-coding a secret in the source
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Initialize the emotion models: facial expression, speech emotion, and text emotion
image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)

# Running log of user inputs, detected emotions, and AI responses
conversation_history = []

def process_input(video_stream):
    # Gradio passes the recorded webcam clip as a file path
    if not isinstance(video_stream, str):
        return "No video received. Please record a clip and try again.", None, display_history()
    video_file_path = video_stream

    # Process video frames
    image_features_list = []
    audio_emotion = ""
    text_input = ""
    text_emotions = ""

    cap = cv2.VideoCapture(video_file_path)
    frame_count = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert the BGR OpenCV frame to an RGB PIL image
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Classify the facial expression in this frame
        try:
            image_analysis = image_pipeline(pil_image)
            if image_analysis:
                image_features_list.append(image_analysis[0]['label'])
        except Exception as e:
            print(f"Error processing image data: {e}")

        frame_count += 1

    cap.release()

    # Combine the per-frame expression labels into a single string
    image_features = ', '.join(image_features_list)
    print("Image features:", image_features)

    # Process audio data and get the emotion label
    try:
        # Extract the audio track from the video file
        video_clip = mp.VideoFileClip(video_file_path)
        audio_file_path = os.path.join("/tmp", "audio.wav")
        video_clip.audio.write_audiofile(audio_file_path)
        video_clip.close()

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file_path) as source:
            audio = recognizer.record(source)

        # Convert the 16-bit PCM samples to a float32 array scaled to [-1, 1]
        audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
        audio_data = audio_data.astype(np.float32) / 32768.0

        # Provide the sampling rate so the pipeline can resample to the model's expected rate
        audio_emotions = audio_pipeline({"raw": audio_data, "sampling_rate": audio.sample_rate})
        if audio_emotions:
            audio_emotion = audio_emotions[0]['label']
            print("Audio emotion:", audio_emotion)

        # Transcribe the speech for the text-emotion model and the prompt
        text_input = recognizer.recognize_google(audio)
        print("User said:", text_input)
    except Exception as e:
        print(f"Error processing audio data: {e}")

    # Process text data and get the emotion labels
    text_emotions = ""
    try:
        if text_input:
            text_analysis = text_pipeline(text_input)
            print("text analysis:", text_analysis)

            # The pipeline returns a nested list for a single string, so flatten it
            if isinstance(text_analysis, list) and text_analysis and isinstance(text_analysis[0], list):
                text_analysis = [item for sublist in text_analysis for item in sublist]

            # Keep only the predicted emotion labels
            text_emotions_list = [item['label'] for item in text_analysis
                                  if isinstance(item, dict) and 'label' in item]

            if text_emotions_list:
                text_emotions = ', '.join(text_emotions_list)
            else:
                text_emotions = "No significant emotions detected in the text."
    except Exception as e:
        print(f"Error processing text data: {e}")

    print("Text emotions:", text_emotions)

    # Record this turn's multimodal signals
    conversation_history.append({
        "user_input": text_input,
        "image_features": image_features,
        "audio_emotion": audio_emotion,
        "text_emotions": text_emotions
    })

    # Build the prompt for the language model from the detected signals
    prompt = "User said: " + text_input
    if image_features:
        # image_features is already a comma-separated string, so use it as-is
        prompt += "\nImage features: " + image_features
    if audio_emotion:
        prompt += "\nAudio emotion: " + audio_emotion
    if text_emotions:
        prompt += "\nText emotions: " + text_emotions

    print("Image features:", image_features)
    print("Audio emotion:", audio_emotion)
    print("Text emotions:", text_emotions)

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system",
             "content": "As a mental health therapist, you're speaking to a user who is seeking guidance and support. They may be experiencing various challenges and are looking for solutions to improve their mental well-being. Your responses should be empathetic, supportive, and offer practical advice tailored to the user's specific issues. Remember to maintain a positive and non-judgmental tone throughout the interaction."
            },
            {"role": "user",
             "content": prompt
            }
        ],
        model="llama3-70b-8192",
        temperature=0.5,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
    )

    ai_response = chat_completion.choices[0].message.content
    conversation_history.append({"ai_response": ai_response})
    print(ai_response)

    # Convert the AI response to speech; gTTS writes MP3 audio
    tts = gTTS(text=ai_response, lang='en')
    audio_file_path = "/tmp/ai_response.mp3"
    tts.save(audio_file_path)

    # Return the text reply, the spoken reply, and the updated history
    return ai_response, audio_file_path, display_history()

def display_history():
    # Show only the therapist's responses in the history box
    history_str = ""
    for turn in conversation_history:
        # if "user_input" in turn:
        #     history_str += f"User: {turn['user_input']}\n"
        if "ai_response" in turn:
            history_str += f"{turn['ai_response']}\n\n"
    return history_str

# Create the Gradio interface
input_video = gr.Video(sources=["webcam"], label="Your Video", include_audio=True)
output_text = gr.Textbox(label="Therapist Response")
output_audio = gr.Audio(autoplay=True, visible=False)
history_text = gr.Textbox(display_history(), label="Conversation History", placeholder="")

iface = gr.Interface(
    fn=process_input,
    inputs=input_video,
    outputs=[output_text, output_audio, history_text],
    title="Mental Health Therapist",
    description="Speak to the AI through video input and get responses.",
    theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"),
    allow_flagging="auto",
)
iface.launch(debug=True)
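
A quick way to sanity-check the text-emotion half of this app without recording a video is a small standalone sketch like the one below. It is illustrative only: the sample sentence is made up, and it assumes nothing beyond the same transformers text pipeline that app.py already loads.

# Hypothetical sanity check for the go_emotions text model used in app.py.
# It skips the video/audio steps and mirrors how app.py builds its prompt.
from transformers import pipeline

text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)

text_input = "I have been feeling anxious about work lately."  # illustrative sentence
analysis = text_pipeline(text_input)

# The result may be nested depending on the transformers version, so flatten defensively
flat = analysis[0] if analysis and isinstance(analysis[0], list) else analysis
labels = [d["label"] for d in flat if isinstance(d, dict) and "label" in d]

prompt = "User said: " + text_input + "\nText emotions: " + ", ".join(labels)
print(prompt)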