Spaces:

Omdena-MentalHealth-team3
/

FomoFix

Sleeping

FomoFix / xgb_mental_health.py

ASledziewska

Update xgb_mental_health.py

8c1f1b5 8 months ago

3.64 kB

	import os.path
	import pickle
	import pandas as pd
	from transformers import AutoTokenizer
	from sklearn.model_selection import train_test_split
	from sklearn.feature_extraction.text import CountVectorizer
	from xgboost import XGBClassifier
	from sklearn.preprocessing import LabelEncoder

	class MentalHealthClassifier:
	    """Bag-of-words XGBoost classifier for mental-health text categories.

	    Each text is split with a Hugging Face tokenizer, the tokens are counted
	    by a ``CountVectorizer`` and the counts are fed to an ``XGBClassifier``.

	    Only the model is written by ``save_model``. The vectorizer is fitted by
	    ``preprocess_data`` and is not persisted, so ``preprocess_data`` has to be
	    called before ``predict_category`` even when the model came from disk.
	    """

	    def __init__(self, data_path, model_path):
	        """Load the dataset, encode its labels and load or create the model.

	        Args:
	            data_path: CSV file with ``text``, ``clean_text`` and ``category``
	                columns.
	            model_path: Pickle file of a previously saved model. If it exists
	                the model is loaded from it, otherwise a fresh
	                ``XGBClassifier`` is created.
	        """
	        self.data = pd.read_csv(data_path, skip_blank_lines=True)
	        # The dataset contains a misspelled label; fold it into 'anxiety'.
	        self.data['category'] = self.data['category'].replace({'axienty': 'anxiety'})
	        self.data.dropna(subset=['text', 'clean_text'], inplace=True)
	        self.data_selected = self.data[['clean_text', 'category']]
	        # Work on an explicit copy so that adding the encoded column neither
	        # aliases the selection above nor raises chained-assignment warnings.
	        self.df = self.data_selected.copy()
	        self.label_encoder = LabelEncoder()
	        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])
	        self.tokenizer = None
	        self.vectorizer = CountVectorizer()
	        self.model_path = model_path
	        self.model = self.load_model() if os.path.exists(model_path) else XGBClassifier()

	    def _require_tokenizer(self):
	        """Raise ``ValueError`` unless ``initialize_tokenizer`` has been called."""
	        if self.tokenizer is None:
	            raise ValueError("Tokenizer not initialized. Call 'initialize_tokenizer' first.")

	    def _to_token_string(self, text):
	        """Tokenize ``text`` and join the tokens into one space-separated string."""
	        tokens = self.tokenizer.tokenize(text, padding=True, truncation=True)
	        return ' '.join(tokens)

	    def preprocess_data(self):
	        """Fit the vectorizer on the dataset and return features and labels.

	        Returns:
	            A ``(X, y)`` pair: ``X`` is the dense token-count matrix of every
	            ``clean_text`` row and ``y`` is the ``category_encoded`` column.

	        Raises:
	            ValueError: If the tokenizer has not been initialized.
	        """
	        # Same guard as predict_category, instead of failing on None.tokenize.
	        self._require_tokenizer()
	        documents = [self._to_token_string(text) for text in self.df['clean_text']]
	        X = self.vectorizer.fit_transform(documents).toarray()
	        return X, self.df['category_encoded']

	    def train_model(self, X, y):
	        """Fit the model on 80% of the data and predict the held-out 20%.

	        Args:
	            X: Feature matrix as returned by ``preprocess_data``.
	            y: Encoded labels as returned by ``preprocess_data``.

	        Returns:
	            A ``(y_test, y_pred)`` pair for the held-out split.
	        """
	        # Fixed random_state keeps the split reproducible between runs.
	        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
	        self.model.fit(X_train, y_train)
	        y_pred = self.model.predict(X_test)
	        return y_test, y_pred

	    def predict_category(self, input_text):
	        """Return the predicted category label for ``input_text``.

	        The vectorizer must already be fitted (see ``preprocess_data``).

	        Raises:
	            ValueError: If the tokenizer has not been initialized.
	        """
	        self._require_tokenizer()
	        input_feature_vector = self.vectorizer.transform(
	            [self._to_token_string(input_text)]
	        ).toarray()
	        predicted_category_encoded = self.model.predict(input_feature_vector)
	        predicted_category = self.label_encoder.inverse_transform(predicted_category_encoded)
	        return predicted_category[0]

	    def initialize_tokenizer(self, model_name):
	        """Load the pretrained tokenizer identified by ``model_name``."""
	        self.model_name = model_name
	        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

	    def save_model(self):
	        """Pickle the current model to ``self.model_path``."""
	        print("saving model...to pickle...")
	        with open(self.model_path, 'wb') as f:
	            pickle.dump(self.model, f)

	    def load_model(self):
	        """Return the model unpickled from ``self.model_path``."""
	        print("loading model...from pickle...")
	        # NOTE(security): unpickling can execute arbitrary code; only load
	        # model files that come from a trusted source.
	        with open(self.model_path, 'rb') as f:
	            return pickle.load(f)

	if __name__ == "__main__":
	    # Script entry point: train and save the model on the first run, reuse
	    # the pickled model on later runs.
	    tokenizer_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
	    data_path = 'data/data.csv'
	    model_path = 'mental_health_model.pkl'

	    # Record whether a saved model exists before constructing the classifier;
	    # the constructor loads it itself, so no second load_model() call is needed.
	    model_was_saved = os.path.exists(model_path)
	    mental_classifier = MentalHealthClassifier(data_path, model_path)

	    # Both paths need the tokenizer and a fitted vectorizer: only the model is
	    # pickled, so the vectorizer is refitted on the dataset every run.
	    mental_classifier.initialize_tokenizer(tokenizer_model_name)
	    X, y = mental_classifier.preprocess_data()

	    if not model_was_saved:
	        y_test, y_pred = mental_classifier.train_model(X, y)
	        mental_classifier.save_model()

	    # Example usage:
	    # input_text = "I feel anxiety whenever i am doing nothing."
	    # predicted_category = mental_classifier.predict_category(input_text)
	    # print("Predicted mental health condition:", predicted_category)