Spaces:
Sleeping
Sleeping
import os.path | |
import pickle | |
import pandas as pd | |
from transformers import AutoTokenizer | |
from sklearn.model_selection import train_test_split | |
from sklearn.feature_extraction.text import CountVectorizer | |
from xgboost import XGBClassifier | |
from sklearn.preprocessing import LabelEncoder | |
class MentalHealthClassifier:
    """Bag-of-words XGBoost classifier for labelled mental-health text.

    The constructor reads a CSV that must provide ``text``, ``clean_text``
    and ``category`` columns, label-encodes ``category`` and either loads a
    pickled model from ``model_path`` (when that file exists) or creates an
    untrained ``XGBClassifier``.

    Only the model itself is pickled by ``save_model``. The
    ``CountVectorizer`` is created unfitted on every construction, so
    ``preprocess_data`` has to run (it fits the vectorizer) before
    ``predict_category`` can vectorize input, even when the model was
    loaded from disk.
    """

    _TOKENIZER_ERROR = "Tokenizer not initialized. Call 'initialize_tokenizer' first."

    def __init__(self, data_path: str, model_path: str):
        """Load the dataset, encode its labels and obtain a model.

        Args:
            data_path: Path of the CSV file holding the training data.
            model_path: Path of the pickled model; loaded if it exists,
                and the target of ``save_model``.
        """
        self.data = pd.read_csv(data_path, skip_blank_lines=True)
        # Normalise a known misspelling of the label before encoding.
        self.data['category'] = self.data['category'].replace('axienty', 'anxiety')
        # Rows missing either the raw or the cleaned text are unusable.
        self.data.dropna(subset=['text', 'clean_text'], inplace=True)
        self.data_selected = self.data[['clean_text', 'category']]
        # Work on an explicit copy so that adding the encoded column below
        # never writes through to the frame it was sliced from.
        self.df = self.data_selected.copy()
        self.label_encoder = LabelEncoder()
        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])
        self.tokenizer = None
        self.vectorizer = CountVectorizer()
        self.model_path = model_path
        self.model = self.load_model() if os.path.exists(model_path) else XGBClassifier()

    def _require_tokenizer(self) -> None:
        """Raise ``ValueError`` unless ``initialize_tokenizer`` has been called."""
        if self.tokenizer is None:
            raise ValueError(self._TOKENIZER_ERROR)

    def _tokens_as_text(self, text):
        """Tokenize ``text`` and re-join the tokens with single spaces."""
        tokens = self.tokenizer.tokenize(text, padding=True, truncation=True)
        return ' '.join(tokens)

    def preprocess_data(self):
        """Tokenize every ``clean_text`` row and fit the count vectorizer.

        Returns:
            A ``(X, y)`` pair: the dense bag-of-words matrix and the
            ``category_encoded`` column.

        Raises:
            ValueError: If the tokenizer has not been initialized.
        """
        self._require_tokenizer()
        documents = [self._tokens_as_text(text) for text in self.df['clean_text']]
        X = self.vectorizer.fit_transform(documents).toarray()
        return X, self.df['category_encoded']

    def train_model(self, X, y):
        """Fit the model on an 80/20 train/test split of ``X`` and ``y``.

        The split uses a fixed ``random_state`` so it is reproducible.

        Returns:
            A ``(y_test, y_pred)`` pair for the held-out 20%.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        return y_test, y_pred

    def predict_category(self, input_text):
        """Predict the category label for a single piece of text.

        The vectorizer must already be fitted (see ``preprocess_data``).

        Raises:
            ValueError: If the tokenizer has not been initialized.
        """
        self._require_tokenizer()
        input_feature_vector = self.vectorizer.transform([self._tokens_as_text(input_text)]).toarray()
        predicted_category_encoded = self.model.predict(input_feature_vector)
        # Map the numeric class back to the original label.
        predicted_category = self.label_encoder.inverse_transform(predicted_category_encoded)
        return predicted_category[0]

    def initialize_tokenizer(self, model_name: str) -> None:
        """Load the pretrained tokenizer identified by ``model_name``."""
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model(self) -> None:
        """Pickle the model (and only the model) to ``self.model_path``."""
        print("saving model...to pickle...")
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        """Return the object unpickled from ``self.model_path``.

        NOTE: ``pickle.load`` can execute arbitrary code; only point
        ``model_path`` at files from a trusted source.
        """
        print("loading model...from pickle...")
        with open(self.model_path, 'rb') as f:
            return pickle.load(f)
def main():
    """Train and pickle the classifier, or reuse a previously pickled model."""
    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    data_path = 'data/data.csv'
    model_path = 'mental_health_model.pkl'

    # The constructor loads the pickled model itself when model_path exists,
    # so no explicit load_model() call is needed below.
    mental_classifier = MentalHealthClassifier(data_path, model_path)

    if not os.path.exists(model_path):
        mental_classifier.initialize_tokenizer(tokenizer_model_name)
        X, y = mental_classifier.preprocess_data()
        mental_classifier.train_model(X, y)
        mental_classifier.save_model()
    else:
        # Ensure tokenizer is initialized if loading model from pickle
        mental_classifier.initialize_tokenizer(tokenizer_model_name)
        # Only the model is pickled; re-run preprocessing so the count
        # vectorizer is fitted again before any prediction.
        mental_classifier.preprocess_data()

    # Example usage:
    # input_text = "I feel anxiety whenever i am doing nothing."
    # predicted_category = mental_classifier.predict_category(input_text)
    # print("Predicted mental health condition:", predicted_category)


if __name__ == "__main__":
    main()