# Movie_Analyzer / main.py
# efeperro's picture
# Upload 4 files
# 93eddbd verified
from functions_preprocess import LinguisticPreprocessor, fit_model, training_data
from datasets import load_dataset
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
def main(output_path="results.csv", random_state=None):
    """Train the sentiment pipeline and write test-set predictions to a CSV.

    Loads the "rotten_tomatoes", "sst2" and "imdb" datasets, passes them to
    ``training_data`` to obtain the train/test split, fits a
    preprocessor -> TF-IDF -> SGD classifier pipeline via ``fit_model`` and
    saves the predictions made on ``X_test``.

    Args:
        output_path: Destination of the predictions CSV. Defaults to
            "results.csv", the path that used to be hard-coded.
        random_state: Seed forwarded to ``SGDClassifier``. The default
            ``None`` is the classifier's own default, so runs stay unseeded
            unless a value is supplied; pass an int for reproducible runs.
    """
    # Load the three source datasets.
    dataset_1 = load_dataset("rotten_tomatoes")
    dataset_2 = load_dataset("sst2")
    # sst2 names its text column 'sentence'; rename it to 'text'
    # (presumably the column name training_data expects -- confirm there).
    dataset_2 = dataset_2.rename_column("sentence", "text")
    dataset_3 = load_dataset("imdb")

    X_train, y_train, X_test, y_test = training_data(dataset_1, dataset_2, dataset_3)

    pipeline = Pipeline(
        steps=[
            ("processor", LinguisticPreprocessor()),
            # Unigrams and bigrams.
            ("vectorizer", TfidfVectorizer(ngram_range=(1, 2))),
            (
                "model",
                SGDClassifier(
                    loss="log_loss",
                    n_jobs=-1,
                    alpha=0.000001,
                    penalty="elasticnet",
                    random_state=random_state,
                ),
            ),
        ]
    )

    # Fit the model. predict() is called on the same pipeline object right
    # after, so fit_model is expected to fit it in place.
    fit_model(pipeline, X_train, y_train, X_test, y_test)
    predictions = pipeline.predict(X_test)

    # One row per test sample: positional index plus predicted label.
    results_df = pd.DataFrame(
        {
            "index": range(len(predictions)),
            "pred": predictions,
        }
    )

    # The DataFrame's own index is omitted; the explicit "index" column
    # already carries the row number.
    results_df.to_csv(output_path, index=False)
# Script entry point: run the training/prediction workflow only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
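# Disabled: optionally persist the fitted pipeline to disk. To re-enable,
# add `import pickle` (it is not among the imports above) and run this where
# `pipeline` is in scope -- it is a local variable of main().
# model_pkl_file = "sentiment_model.pkl"
#
# with open(model_pkl_file, 'wb') as file:
#     pickle.dump(pipeline, file)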