Spaces:
Paused
Paused
from functions_preprocess import LinguisticPreprocessor, fit_model, training_data | |
from datasets import load_dataset | |
import pandas as pd | |
from sklearn.linear_model import SGDClassifier | |
from sklearn.pipeline import Pipeline | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
def main(): | |
#####load dataset | |
dataset_1 = load_dataset("rotten_tomatoes") | |
dataset_2 = load_dataset('sst2') | |
dataset_2 = dataset_2.rename_column('sentence', 'text') | |
dataset_3 = load_dataset('imdb') | |
X_train, y_train, X_test, y_test = training_data(dataset_1, dataset_2, dataset_3) | |
pipeline = Pipeline( | |
steps=[ | |
("processor", LinguisticPreprocessor()), | |
("vectorizer", TfidfVectorizer(ngram_range=(1, 2))), | |
("model", SGDClassifier(loss="log_loss", n_jobs = -1, alpha=0.000001, penalty= 'elasticnet'))]) | |
####### fit model and save the results | |
fit_model(pipeline, X_train, y_train, X_test, y_test) | |
predictions = pipeline.predict(X_test) | |
# Create a DataFrame with index and predictions | |
results_df = pd.DataFrame({ | |
"index": range(len(predictions)), | |
"pred": predictions}) | |
# Save the DataFrame to a CSV file | |
results_df.to_csv("results.csv", index=False) | |
if __name__ == "__main__": | |
main() | |
# model_pkl_file = "sentiment_model.pkl" | |
# | |
# with open(model_pkl_file, 'wb') as file: | |
# pickle.dump(pipeline, file) | |