Spaces:

Hasti11
/

MultiModal-Mental-Health-Therapist

Running

App Files Files Community

Hasti11 committed on Apr 28, 2024

Commit

90b19b1

verified ·

1 Parent(s): 0d11193

Create app.py

Browse files

Files changed (1) hide show

app.py +190 -0

app.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import os
+import numpy as np
+from transformers import pipeline
+import speech_recognition as sr
+import gradio as gr
+import cv2
+from PIL import Image
+import moviepy.editor as mp
+from gtts import gTTS
+from groq import Groq
+client = Groq(
+    api_key="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",
+)
+# Initialize pipelines
+image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
+audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
+text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)
+conversation_history = []
+def process_input(video_stream):
+    if isinstance(video_stream, str):
+        video_file_path = video_stream
+    # Process video frames
+    image_features_list = []
+    audio_emotion = ""
+    text_input = ""
+    text_emotions = ""
+    cap = cv2.VideoCapture(video_file_path)
+    frame_count = 0
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        # Convert frame to PIL image
+        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        # Analyze the image
+        try:
+            image_analysis = image_pipeline(pil_image)
+            if image_analysis:
+                image_features_list.append(image_analysis[0]['label'])
+        except Exception as e:
+            print(f"Error processing image data: {e}")
+        # Increment frame count
+        frame_count += 1
+    # Combine image features into a single string
+    image_features = ', '.join(image_features_list)
+    print("Image features:", image_features)
+    # Process audio data and get the emotion label
+    try:
+        # Extract audio from the video file
+        video_clip = mp.VideoFileClip(video_file_path)
+        audio_file_path = os.path.join("/tmp", "audio.wav")
+        video_clip.audio.write_audiofile(audio_file_path)
+        recognizer = sr.Recognizer()
+        with sr.AudioFile(audio_file_path) as source:
+            audio = recognizer.record(source)
+        # Convert audio data to numpy array
+        audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
+        audio_data = audio_data.astype(np.float32)  # Convert to float32
+        audio_emotions = audio_pipeline(audio_data)
+        if audio_emotions:
+            audio_emotion = audio_emotions[0]['label']
+            print("Audio emotion:", audio_emotion)
+            # Recognize audio
+            text_input = recognizer.recognize_google(audio)
+            print("User said:", text_input)
+    except Exception as e:
+        print(f"Error processing audio data: {e}")
+    # Process text data and get the emotion label
+    text_emotions = ""
+    try:
+        # Initialize text_input in case it's not set
+        if not text_input:
+            text_input = ""
+        text_analysis = text_pipeline(text_input)
+        print("text analysis:", text_analysis)
+        if isinstance(text_analysis, list):
+            # Flatten the list of lists
+            text_analysis = [item for sublist in text_analysis for item in sublist]
+            # Initialize an empty list to store the text emotions
+            text_emotions_list = []
+            # Iterate through each item in the flattened list
+            for item in text_analysis:
+                # Ensure each item is a dictionary and contains the 'label' key
+                if isinstance(item, dict) and 'label' in item:
+                    # Append the 'label' value to the text_emotions_list
+                    text_emotions_list.append(item['label'])
+            # Check if text_emotions_list is empty
+            if text_emotions_list:
+                # Convert the text_emotions_list to a comma-separated string
+                text_emotions = ', '.join(text_emotions_list)
+                print("Text emotions:", text_emotions)
+            else:
+                text_emotions = "No significant emotions detected in the text."
+    except Exception as e:
+        print(f"Error processing text data: {e}")
+    print("Text emotions:", text_emotions)
+    conversation_history.append({
+                "user_input": text_input,
+                "image_features": image_features,
+                "audio_emotion": audio_emotion,
+                "text_emotions": text_emotions
+            })
+    prompt = "User said: " + text_input
+    if image_features:
+        prompt += "\nImage features: " + ', '.join(image_features)
+    if audio_emotion:
+        prompt += "\nAudio emotion: " + audio_emotion
+    if text_emotions:
+        prompt += "\nText emotions: " + text_emotions
+    print("image_feature",image_features)
+    print("Audio",audio_emotion)
+    print("text emotions",text_emotions)
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {"role": "system",
+                "content": "As a mental health therapist, you're speaking to a user who is seeking guidance and support. They may be experiencing various challenges and are looking for solutions to improve their mental well-being. Your responses should be empathetic, supportive, and offer practical advice tailored to the user's specific issues. Remember to maintain a positive and non-judgmental tone throughout the interaction."
+                },
+            {"role": "user",
+                "content": prompt
+            }
+        ],
+        model="llama3-70b-8192",
+        temperature=0.5,
+        max_tokens=1024,
+        top_p=1,
+        stop=None,
+        stream=False,
+    )
+    ai_response = chat_completion.choices[0].message.content
+    conversation_history.append({"ai_response": ai_response})
+    print(ai_response)
+    # Convert AI response to audio
+    tts = gTTS(text=ai_response, lang='en')
+    audio_file_path = "/tmp/ai_response.wav"
+    tts.save(audio_file_path)
+    return ai_response,audio_file_path,display_history()  # Return the generated response
+def display_history():
+    history_str = ""
+    for i, turn in enumerate(conversation_history):
+        # if "user_input" in turn:
+        #     history_str += f"User: {turn['user_input']}\n"
+        if "ai_response" in turn:
+            history_str += f"{turn['ai_response']}\n\n"
+    return history_str
+# Create the Gradio interface
+input_video = gr.Video(sources="webcam",label="Your Video", include_audio=True)
+output_text = gr.Textbox(label="Therapist Response")
+output_audio=gr.Audio(autoplay=True,visible=False)
+history_text = gr.Textbox(display_history(), label="Conversation History", placeholder="")
+iface = gr.Interface(fn=process_input, inputs=input_video, outputs=[output_text,output_audio,history_text], title="Mental Health Therapist", description="Speak to the AI through video input and get responses.",theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"),allow_flagging="auto")
+iface.launch(debug=True)