File size: 4,211 Bytes
bd9870c 2c17cd5 bd9870c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
###
# - Author: Jaelin Lee
# - Date: Mar 23, 2024
# - Description: XGBoost mental health classifier [depression, adhd, anxiety, social_isolation, cyberbullying, social_media_addiction]. Incorporates the updated code from Aleksandra Śledziewska that fixed the token size issue. The model is now loaded from a pickle file if one has already been saved, which saves time on each prediction by avoiding retraining the model.
###
import os.path
import pickle
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from clean_text_model import TextCleaner
class MentalHealthClassifier:
    """XGBoost text classifier for mental-health categories.

    The constructor reads the labelled CSV, encodes the ``category`` column
    and either loads a previously pickled model from ``model_path`` or
    creates a fresh, untrained ``XGBClassifier``.

    Typical usage:
        1. ``initialize_tokenizer(name)``
        2. ``preprocess_data()`` - also fits the bag-of-words vectorizer, so
           it must run before ``predict_category`` even when the model
           itself was loaded from pickle (the vectorizer is not pickled).
        3. ``train_model(X, y)`` and ``save_model()`` when no pickle exists.
        4. ``predict_category(text)``
    """

    def __init__(self, data_path, model_path):
        """Load the dataset and prepare the encoder, vectorizer and model.

        Args:
            data_path: Path of a CSV file that has ``clean_text`` and
                ``category`` columns.
            model_path: Path of the pickle file used to cache the model.
        """
        self.data = pd.read_csv(data_path, skip_blank_lines=True)
        # The dataset contains a misspelled label; fold it into 'anxiety'.
        self.data['category'] = [
            'anxiety' if x == 'axienty' else x for x in self.data['category']
        ]
        self.data.dropna(subset=['clean_text'], inplace=True)
        self.data_selected = self.data[['clean_text', 'category']]
        # Work on an explicit copy so that adding the encoded column below
        # neither aliases the selection nor triggers SettingWithCopyWarning.
        self.df = self.data_selected.copy()
        self.label_encoder = LabelEncoder()
        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])
        self.tokenizer = None
        self.vectorizer = CountVectorizer()
        self.text_cleaner = TextCleaner()
        self.model_path = model_path
        self.model = self.load_model() if os.path.exists(model_path) else XGBClassifier()

    def preprocess_data(self):
        """Tokenize every training text and fit the bag-of-words vectorizer.

        Requires ``initialize_tokenizer`` to have been called.

        Returns:
            A ``(X, y)`` pair: the dense count matrix produced by the
            vectorizer and the encoded category column.
        """
        tokenized_texts = [
            self.tokenizer.tokenize(text, padding=True, truncation=True)
            for text in self.df['clean_text']
        ]
        # CountVectorizer expects strings, so the token lists are re-joined.
        documents = [' '.join(tokens) for tokens in tokenized_texts]
        X = self.vectorizer.fit_transform(documents).toarray()
        return X, self.df['category_encoded']

    def train_model(self, X, y):
        """Fit the model on an 80/20 split of ``X`` and ``y``.

        Returns:
            ``(y_test, y_pred)``: the held-out labels and the model's
            predictions for the held-out features.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        return y_test, y_pred

    def predict_category(self, raw_input_text):
        """Predict the category label for one raw piece of text.

        Args:
            raw_input_text: Uncleaned user text.

        Returns:
            The decoded category label for the input.

        Raises:
            ValueError: If the tokenizer has not been initialized.
        """
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized. Call 'initialize_tokenizer' first.")
        input_text = self.text_cleaner.cleaning_text(raw_input_text)
        # Tokenize the *cleaned* text: training uses the 'clean_text' column,
        # so inference has to go through the same cleaning step.
        tokenized_input = self.tokenizer.tokenize(input_text, padding=True, truncation=True)
        input_feature_vector = self.vectorizer.transform([' '.join(tokenized_input)]).toarray()
        predicted_category_encoded = self.model.predict(input_feature_vector)
        predicted_category = self.label_encoder.inverse_transform(predicted_category_encoded)
        return predicted_category[0]

    def initialize_tokenizer(self, model_name):
        """Load the pretrained tokenizer identified by ``model_name``."""
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model(self):
        """Pickle the current model to ``self.model_path``."""
        print("saving model...to pickle...")
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        """Return the model unpickled from ``self.model_path``.

        NOTE: unpickling can execute arbitrary code; only load model files
        that come from a trusted source.
        """
        print("loading model...from pickle...")
        with open(self.model_path, 'rb') as f:
            return pickle.load(f)
if __name__ == "__main__":
    # Demo entry point: train and cache the model on the first run, reuse the
    # cached pickle afterwards, then classify one sample sentence.
    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    data_path = 'data/data.csv'
    model_path = 'mental_health_model.pkl'

    # Record whether a cached model exists before anything could create it.
    model_is_cached = os.path.exists(model_path)

    # The constructor already loads the pickled model when the file exists,
    # so no second load_model() call is needed here.
    mental_classifier = MentalHealthClassifier(data_path, model_path)

    # The tokenizer and the fitted vectorizer are needed on both paths:
    # neither is stored in the pickle, and predict_category relies on both.
    mental_classifier.initialize_tokenizer(tokenizer_model_name)
    X, y = mental_classifier.preprocess_data()

    if not model_is_cached:
        y_test, y_pred = mental_classifier.train_model(X, y)
        mental_classifier.save_model()

    input_text = "I feel bullied online."
    predicted_category = mental_classifier.predict_category(input_text)
    print("Predicted mental health condition:", predicted_category)
|