|
|
|
|
|
|
|
|
|
|
|
|
|
import os.path |
|
import pickle |
|
import pandas as pd |
|
from transformers import AutoTokenizer |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from xgboost import XGBClassifier |
|
from sklearn.preprocessing import LabelEncoder |
|
from clean_text_model import TextCleaner |
|
|
|
class MentalHealthClassifier:
    """Classify text into mental-health categories.

    The dataset is read from a CSV file with ``clean_text`` and ``category``
    columns. Texts are tokenized with a Hugging Face tokenizer, the tokens
    are turned into bag-of-words counts with ``CountVectorizer``, and an
    ``XGBClassifier`` (or a previously pickled model) predicts the
    label-encoded category.
    """

    def __init__(self, data_path, model_path):
        """Load the dataset and set up encoder, vectorizer and model.

        Args:
            data_path: Path of the CSV file; it must contain the
                ``clean_text`` and ``category`` columns.
            model_path: Path of the pickled model. If the file exists it is
                loaded, otherwise a fresh ``XGBClassifier`` is created.
        """
        self.data = pd.read_csv(data_path, skip_blank_lines=True)

        # Normalise a misspelled label that occurs in the data.
        self.data['category'] = self.data['category'].replace('axienty', 'anxiety')

        self.data.dropna(subset=['clean_text'], inplace=True)
        self.data_selected = self.data[['clean_text', 'category']]

        # Take an explicit copy: the encoded column added below must not be
        # written into a column-slice of ``self.data`` (chained assignment).
        self.df = self.data_selected.copy()

        self.label_encoder = LabelEncoder()
        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])

        self.tokenizer = None  # set later by initialize_tokenizer()
        self.vectorizer = CountVectorizer()
        self.text_cleaner = TextCleaner()
        self.model_path = model_path
        self.model = self.load_model() if os.path.exists(model_path) else XGBClassifier()

    def _require_tokenizer(self):
        """Raise ``ValueError`` unless a tokenizer has been initialized."""
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized. Call 'initialize_tokenizer' first.")

    def _tokens_as_document(self, text):
        """Tokenize *text* and join the tokens with spaces for the vectorizer."""
        tokens = self.tokenizer.tokenize(text, padding=True, truncation=True)
        return ' '.join(tokens)

    def preprocess_data(self):
        """Fit the vectorizer on the dataset and build the training matrix.

        Returns:
            A ``(X, y)`` tuple where ``X`` is the dense token-count matrix of
            every ``clean_text`` row and ``y`` is the ``category_encoded``
            column.

        Raises:
            ValueError: If ``initialize_tokenizer`` has not been called.
        """
        self._require_tokenizer()
        documents = [self._tokens_as_document(text) for text in self.df['clean_text']]
        X = self.vectorizer.fit_transform(documents).toarray()
        return X, self.df['category_encoded']

    def train_model(self, X, y):
        """Fit the model on an 80/20 split and predict the held-out part.

        Args:
            X: Feature matrix, as returned by ``preprocess_data``.
            y: Encoded labels aligned with ``X``.

        Returns:
            A ``(y_test, y_pred)`` tuple: the held-out labels and the model's
            predictions for the held-out rows.
        """
        # Fixed seed keeps the split reproducible between runs.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        return y_test, y_pred

    def predict_category(self, raw_input_text):
        """Predict the category label for one raw text.

        The vectorizer is only fitted inside ``preprocess_data``, so that
        method has to run before this one.

        Args:
            raw_input_text: Uncleaned input text.

        Returns:
            The predicted category, decoded back to its original label.

        Raises:
            ValueError: If ``initialize_tokenizer`` has not been called.
        """
        self._require_tokenizer()

        # Tokenize the *cleaned* text: the model is trained on the
        # ``clean_text`` column, so inference must see the same kind of input.
        # NOTE(review): assumes TextCleaner.cleaning_text returns a string.
        input_text = self.text_cleaner.cleaning_text(raw_input_text)
        document = self._tokens_as_document(input_text)

        input_feature_vector = self.vectorizer.transform([document]).toarray()
        predicted_category_encoded = self.model.predict(input_feature_vector)
        predicted_category = self.label_encoder.inverse_transform(predicted_category_encoded)
        return predicted_category[0]

    def initialize_tokenizer(self, model_name):
        """Load the pretrained tokenizer identified by *model_name*."""
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model(self):
        """Pickle the current model to ``self.model_path``."""
        print("saving model...to pickle...")
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        """Unpickle and return the model stored at ``self.model_path``.

        The loaded object is returned, not assigned to ``self.model``.
        """
        print("loading model...from pickle...")
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load model files that come from a trusted source.
        with open(self.model_path, 'rb') as f:
            return pickle.load(f)
|
|
|
if __name__ == "__main__":
    # Script entry point: train and save a model when none is stored yet,
    # then classify one sample sentence.
    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    data_path = 'data/processed/data.csv'
    model_path = 'app/mental_health_model.pkl'

    mental_classifier = MentalHealthClassifier(data_path, model_path)

    # The constructor already loads the pickled model when the file exists,
    # so no separate load_model() call is needed here.
    needs_training = not os.path.exists(model_path)

    # Both paths need the tokenizer and the vectorizer fitted by
    # preprocess_data(): only the model is pickled, not the vectorizer.
    mental_classifier.initialize_tokenizer(tokenizer_model_name)
    X, y = mental_classifier.preprocess_data()

    if needs_training:
        mental_classifier.train_model(X, y)
        mental_classifier.save_model()

    input_text = "I feel bullied online."
    predicted_category = mental_classifier.predict_category(input_text)
    print("Predicted mental health condition:", predicted_category)
|
|