Update app.py
app.py
CHANGED
@@ -10,22 +10,65 @@ from datetime import datetime
 import json
 from collections import deque
 from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-import torch # Import torch

 class BERTopicChatbot:

+    #Initialize chatbot with a Hugging Face dataset
+    #dataset_name: name of the dataset on Hugging Face (e.g., 'vietnam/legal')
+    #text_column: name of the column containing the text data
+    #split: which split of the dataset to use ('train', 'test', 'validation')
+    #max_samples: maximum number of samples to use (to manage memory)
+
     def __init__(self, dataset_name, text_column, split="train", max_samples=10000):
         # Initialize BERT sentence transformer
         self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

-        #
-        self.
-
-
-
+        # Add label mapping
+        self.label_mapping = {
+            0: 'BPD',
+            1: 'bipolar',
+            2: 'depression',
+            3: 'Anxiety',
+            4: 'schizophrenia',
+            5: 'mentalillness'
+        }
+
+        # Add comfort responses
+        self.comfort_responses = {
+            'BPD': [
+                "I understand BPD can be overwhelming. You're not alone in this journey.",
+                "Your feelings are valid. BPD is challenging, but there are people who understand.",
+                "Taking things one day at a time with BPD is okay. You're showing great strength."
+            ],
+            'bipolar': [
+                "Bipolar disorder can feel like a roller coaster. Remember, stability is possible.",
+                "You're so strong for managing bipolar disorder. Take it one day at a time.",
+                "Both the highs and lows are temporary. You've gotten through them before."
+            ],
+            'depression': [
+                "Depression is heavy, but you don't have to carry it alone.",
+                "Even small steps forward are progress. You're doing better than you think.",
+                "This feeling won't last forever. You've made it through difficult times before."
+            ],
+            'Anxiety': [
+                "Your anxiety doesn't define you. You're stronger than your fears.",
+                "Remember to breathe. You're safe, and this feeling will pass.",
+                "It's okay to take things at your own pace. You're handling this well."
+            ],
+            'schizophrenia': [
+                "You're not your diagnosis. You're a person first, and you matter.",
+                "Managing schizophrenia takes incredible strength. You're doing well.",
+                "There's support available, and you deserve all the help you need."
+            ],
+            'mentalillness': [
+                "Mental health challenges don't define your worth. You are valuable.",
+                "Recovery isn't linear, and that's okay. Every step counts.",
+                "You're not alone in this journey. There's a community that understands."
+            ]
+        }

-
+
+        # Load dataset from Hugging Face
         try:
             dataset = load_dataset(dataset_name, split=split)
             # Convert to pandas DataFrame and sample if necessary
@@ -62,13 +105,10 @@ class BERTopicChatbot:
                 'total_documents': len(self.documents),
                 'topics_found': len(set(self.topics))
             }
+
         except Exception as e:
             st.error(f"Error loading dataset: {str(e)}")
             raise
-
-        #Load fine-tuned BARTpho model
-        self.bartpho_model = AutoModelForSeq2SeqLM.from_pretrained("./bartpho_chatbot").to("cuda" if torch.cuda.is_available() else "cpu")
-        self.bartpho_model.eval()

     def get_metrics_visualizations(self):
         """Generate visualizations for chatbot metrics"""
@@ -142,34 +182,48 @@ class BERTopicChatbot:
     def get_response(self, user_query):
         try:
             start_time = datetime.now()
-
-            #
-
-
-
-
-
-
-
+
+            # Get most similar documents
+            similar_docs, similarities = self.get_most_similar_document(user_query)
+
+            # Get the label from the most similar document
+            most_similar_index = similarities.argmax()
+            label_index = int(self.df['label'].iloc[most_similar_index]) # Convert to int
+            condition = self.label_mapping[label_index] # Map the integer label to condition name
+
+            # Get comfort response
+            comfort_messages = self.comfort_responses[condition]
+            comfort_response = np.random.choice(comfort_messages)
+
+            # Calculate query topic for metrics
+            query_topic, _ = self.topic_model.transform([user_query])
+
+            # Combine information and comfort response
+            if max(similarities) < 0.5:
+                response = f"I sense you might be dealing with {condition}. {comfort_response}"
+            else:
+                response = f"{similar_docs[0]}\n\n{comfort_response}"
+
+            # Track metrics
             end_time = datetime.now()
             metrics = {
-                'similarity':
+                'similarity': float(max(similarities)),
                 'response_time': (end_time - start_time).total_seconds(),
                 'tokens': len(response.split()),
-                'topic':
-                'detected_condition':
+                'topic': str(query_topic[0]),
+                'detected_condition': condition
             }
-
+
             # Update metrics history
             self.metrics_history['similarities'].append(metrics['similarity'])
             self.metrics_history['response_times'].append(metrics['response_time'])
            self.metrics_history['token_counts'].append(metrics['tokens'])
-            topic_id =
+            topic_id = str(query_topic[0])
             self.metrics_history['topics_accessed'][topic_id] = \
                 self.metrics_history['topics_accessed'].get(topic_id, 0) + 1
-
+
             return response, metrics
-
+
         except Exception as e:
             return f"Error processing query: {str(e)}", {'error': str(e)}

@@ -191,7 +245,7 @@ class BERTopicChatbot:
         'dataset_info': None,
         'metrics': None
     }
-
+
 @st.cache_resource
 def initialize_chatbot(dataset_name, text_column, split="train", max_samples=10000):
     return BERTopicChatbot(dataset_name, text_column, split, max_samples)
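The new get_response path relies on get_most_similar_document, which is defined elsewhere in app.py and is not touched by this commit. A minimal sketch of what such a helper typically looks like, assuming the class also stores self.documents and precomputed self.document_embeddings (the attribute name and the top_k parameter are illustrative guesses, not taken from this diff):

from sklearn.metrics.pairwise import cosine_similarity

def get_most_similar_document(self, query, top_k=3):
    # Embed the query with the same sentence transformer used in __init__
    query_embedding = self.sentence_model.encode([query])
    # Compare against precomputed document embeddings (assumed attribute)
    similarities = cosine_similarity(query_embedding, self.document_embeddings)[0]
    # Return the top matches plus the full similarity vector,
    # so callers can both read similar_docs[0] and call similarities.argmax()
    top_indices = similarities.argsort()[::-1][:top_k]
    return [self.documents[i] for i in top_indices], similarities

And a rough sketch of how the cached factory and the response path might be wired together in the Streamlit UI; the dataset id, column name, and prompt text below are placeholders, and the dataset is assumed to carry an integer 'label' column that matches self.label_mapping:

# Hypothetical wiring; dataset id and column name are placeholders
chatbot = initialize_chatbot(
    dataset_name="username/mental-health-posts",  # placeholder dataset id
    text_column="text",                           # placeholder column name
    split="train",
    max_samples=10000,
)

user_query = st.text_input("How are you feeling today?")
if user_query:
    response, metrics = chatbot.get_response(user_query)
    st.write(response)
    st.caption(f"Detected condition: {metrics.get('detected_condition', 'n/a')} "
               f"(similarity {metrics.get('similarity', 0):.2f})")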