# FomoFix / xgb_mental_health.py
# ASledziewska
# Update xgb_mental_health.py
# 8c1f1b5
import os.path
import pickle
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
class MentalHealthClassifier:
    """Text classifier that predicts a mental-health category with XGBoost.

    Texts are split into tokens by a Hugging Face tokenizer, the tokens are
    re-joined and counted by a ``CountVectorizer``, and the resulting count
    vectors are fed to an ``XGBClassifier``.

    Only the classifier itself is pickled to ``model_path``.  The label
    encoder is refitted from the CSV in ``__init__`` and the vectorizer is
    refitted in ``preprocess_data``, so ``preprocess_data`` must be called
    before ``predict_category`` even when the model was loaded from disk.
    """

    def __init__(self, data_path: str, model_path: str) -> None:
        """Load the dataset, encode its labels and load or create the model.

        Args:
            data_path: CSV file containing at least the columns ``text``,
                ``clean_text`` and ``category``.
            model_path: Location of the pickled classifier.  If the file
                exists it is loaded; otherwise a fresh ``XGBClassifier`` is
                created (and can later be written there by ``save_model``).
        """
        self.data = pd.read_csv(data_path, skip_blank_lines=True)
        # The dataset contains a misspelled label; fold it into the real one.
        self.data['category'] = self.data['category'].replace('axienty', 'anxiety')
        # Rows lacking either the raw or the cleaned text are unusable.
        self.data = self.data.dropna(subset=['text', 'clean_text'])

        self.data_selected = self.data[['clean_text', 'category']]
        # Work on an explicit copy so that adding 'category_encoded' below
        # never writes through to (or warns about) the frame it came from.
        self.df = self.data_selected.copy()

        self.label_encoder = LabelEncoder()
        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])

        # Set later by initialize_tokenizer().
        self.tokenizer = None
        self.vectorizer = CountVectorizer()

        self.model_path = model_path
        self.model = self.load_model() if os.path.exists(model_path) else XGBClassifier()

    def _require_tokenizer(self) -> None:
        """Raise ``ValueError`` if ``initialize_tokenizer`` was not called."""
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized. Call 'initialize_tokenizer' first.")

    def _tokens_as_document(self, text):
        """Tokenize *text* and return its tokens joined by single spaces."""
        tokens = self.tokenizer.tokenize(text, padding=True, truncation=True)
        return ' '.join(tokens)

    def preprocess_data(self):
        """Fit the vectorizer on the dataset and build the training matrix.

        Returns:
            A tuple ``(X, y)`` where ``X`` is the dense token-count matrix
            for every ``clean_text`` row and ``y`` is the
            ``category_encoded`` column.

        Raises:
            ValueError: If the tokenizer has not been initialized.
        """
        self._require_tokenizer()
        documents = [self._tokens_as_document(text) for text in self.df['clean_text']]
        # Dense on purpose: predict_category() feeds the model dense rows too.
        X = self.vectorizer.fit_transform(documents).toarray()
        return X, self.df['category_encoded']

    def train_model(self, X, y):
        """Fit the classifier on an 80/20 split and predict the held-out part.

        Args:
            X: Feature matrix as returned by ``preprocess_data``.
            y: Encoded labels aligned with ``X``.

        Returns:
            A tuple ``(y_test, y_pred)`` with the held-out labels and the
            model's predictions for them.
        """
        # Fixed seed keeps the split reproducible between runs.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        return y_test, y_pred

    def predict_category(self, input_text: str):
        """Predict the category label for a single piece of text.

        Args:
            input_text: The text to classify.

        Returns:
            The predicted label, decoded back through the label encoder.

        Raises:
            ValueError: If the tokenizer has not been initialized.
        """
        self._require_tokenizer()
        document = self._tokens_as_document(input_text)
        # transform() (not fit_transform) so the vocabulary fitted in
        # preprocess_data() is reused.
        input_feature_vector = self.vectorizer.transform([document]).toarray()
        predicted_category_encoded = self.model.predict(input_feature_vector)
        predicted_category = self.label_encoder.inverse_transform(predicted_category_encoded)
        return predicted_category[0]

    def initialize_tokenizer(self, model_name: str) -> None:
        """Load the pretrained tokenizer identified by *model_name*."""
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model(self) -> None:
        """Pickle the classifier to ``self.model_path``."""
        print("saving model...to pickle...")
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        """Unpickle and return the classifier stored at ``self.model_path``.

        The loaded object is returned, not assigned to ``self.model``.
        """
        print("loading model...from pickle...")
        # SECURITY: unpickling can execute arbitrary code. Only point
        # model_path at files this application wrote itself.
        with open(self.model_path, 'rb') as f:
            return pickle.load(f)
if __name__ == "__main__":
    # Script entry point: train and save the classifier on first run, reuse
    # the pickled classifier on later runs.
    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    data_path = 'data/data.csv'
    model_path = 'mental_health_model.pkl'

    # The constructor already loads the pickled model when model_path exists,
    # so no explicit load_model() call is needed afterwards.
    mental_classifier = MentalHealthClassifier(data_path, model_path)

    # The tokenizer is required on both paths (training and reuse).
    mental_classifier.initialize_tokenizer(tokenizer_model_name)

    if not os.path.exists(model_path):
        X, y = mental_classifier.preprocess_data()
        y_test, y_pred = mental_classifier.train_model(X, y)
        mental_classifier.save_model()
    else:
        # Only the classifier is pickled; refit the vectorizer on the dataset
        # so predict_category() can transform new text.
        mental_classifier.preprocess_data()

    # Example usage:
    # input_text = "I feel anxiety whenever i am doing nothing."
    # predicted_category = mental_classifier.predict_category(input_text)
    # print("Predicted mental health condition:", predicted_category)