|
import pandas as pd |
|
import numpy as np |
|
from sklearn import metrics |
|
from sklearn.model_selection import train_test_split |
|
from sklearn import preprocessing |
|
from sklearn.pipeline import Pipeline |
|
from sklearn.impute import SimpleImputer |
|
|
|
from sklearn.pipeline import Pipeline |
|
from sklearn import compose |
|
|
|
from zindi_challenge.scripts.config import * |
|
|
|
from sklearn import linear_model |
|
from sklearn import svm |
|
from sklearn import ensemble |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def data_gathering():
    """Load all raw CSV files and aggregate them into a single DataFrame.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of every loaded CSV; an empty DataFrame
        when no file paths are configured.
    """
    list_of_dataframes = []
    # TODO(review): populate with the challenge's raw CSV paths
    # (presumably from zindi_challenge.scripts.config — confirm).
    list_of_csv_filepaths = []

    for fp in list_of_csv_filepaths:
        df = pd.read_csv(fp)
        list_of_dataframes.append(df)

    # pd.concat raises "ValueError: No objects to concatenate" on an empty
    # sequence; return an empty frame instead so callers degrade gracefully.
    if not list_of_dataframes:
        return pd.DataFrame()

    global_df = pd.concat(list_of_dataframes, axis=0)

    return global_df
|
|
|
|
|
|
|
|
|
def remove_rows_with_Nan(data):
    """Drop every row of *data* that contains at least one missing value.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; it is not modified in place.

    Returns
    -------
    pd.DataFrame
        A copy of *data* restricted to fully-populated rows (original
        index labels are preserved).
    """
    # Equivalent to the manual mask data[~data.isna().any(axis=1)],
    # but uses the idiomatic pandas API.
    return data.dropna(axis=0, how="any")
|
|
|
|
|
|
|
def processing(dataset: pd.DataFrame = None):
    """Transform *dataset* into a numeric, model-ready DataFrame.

    Numeric columns are median-imputed and standardized; non-numeric
    columns are one-hot encoded.

    Parameters
    ----------
    dataset : pd.DataFrame, optional
        Raw input data. Defaults to the output of ``data_gathering()``.

    Returns
    -------
    pd.DataFrame
        The transformed feature matrix, indexed like *dataset*.
    """
    # None sentinel: the previous default ``dataset = data_gathering()``
    # ran data_gathering() once at import time and froze that result for
    # every subsequent call.
    if dataset is None:
        dataset = data_gathering()

    numerical_cols = dataset.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = dataset.select_dtypes(exclude=np.number).columns.tolist()

    # Numeric branch: fill missing values with the column median, then
    # scale to zero mean / unit variance. (The undefined AttributeDeleter
    # step was removed — it raised NameError.)
    num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                             ('std_scaler', preprocessing.StandardScaler()),
                             ])
    full_pipeline = compose.ColumnTransformer([('num', num_pipeline, numerical_cols),
                                               ('cat', preprocessing.OneHotEncoder(), categorical_cols),
                                               ])

    transformed = full_pipeline.fit_transform(dataset)
    # OneHotEncoder-heavy inputs can yield a scipy sparse matrix; densify
    # so the result always fits in a plain DataFrame.
    if hasattr(transformed, "toarray"):
        transformed = transformed.toarray()

    df_dataset_processed = pd.DataFrame(transformed,
                                        columns=full_pipeline.get_feature_names_out(),
                                        index=dataset.index)
    return df_dataset_processed
|
|
|
|
|
|
|
|
|
|
|
|
|
def modelling(y_col):
    """Fit a logistic-regression classifier on the processed dataset.

    Parameters
    ----------
    y_col : str
        Name of the target column within the processed dataset.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted model.
    """
    processed = processing()

    # Split the target column away from the feature matrix.
    features = processed.drop(columns=y_col)
    target = processed[y_col]

    classifier = linear_model.LogisticRegression()
    classifier.fit(features, target)
    return classifier