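"""Train a sentiment classifier on the Sentiment140 tweet dataset.

Fits a TF-IDF + logistic regression scikit-learn pipeline, caches the fitted
pipeline under ``cache/``, and exports the final model to ``models/``.
"""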
from __future__ import annotations

import warnings
from pathlib import Path
from typing import TYPE_CHECKING

import click
import joblib
import pandas as pd
from numpy.random import RandomState
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

if TYPE_CHECKING:
    from sklearn.base import BaseEstimator
SEED = 42
DATASET_PATH = Path("data/training.1600000.processed.noemoticon.csv")
STOPWORDS_PATH = Path("data/stopwords-en.txt")
CHECKPOINT_PATH = Path("cache/pipeline.pkl")
MODELS_DIR = Path("models")
CACHE_DIR = Path("cache")
MAX_FEATURES = 10000  # 500000

# Make sure paths exist
MODELS_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Memory cache for sklearn pipelines
mem = joblib.Memory(CACHE_DIR, verbose=0)

# TODO: use xgboost


def get_random_state(seed: int = SEED) -> RandomState:
    return RandomState(seed)


def load_data() -> tuple[list[str], list[int]]:
    """Load the Sentiment140 CSV and return tweet texts with integer labels (1 = positive, 0 = negative)."""
    data = pd.read_csv(
        DATASET_PATH,
        encoding="ISO-8859-1",
        names=[
            "target",  # 0 = negative, 2 = neutral, 4 = positive
            "id",  # The id of the tweet
            "date",  # The date of the tweet
            "flag",  # The query, NO_QUERY if not present
            "user",  # The user that tweeted
            "text",  # The text of the tweet
        ],
    )
    # Ignore rows with neutral sentiment
    data = data[data["target"] != 2]
    # Create new column called "sentiment" with 1 for positive and 0 for negative
    data["sentiment"] = (data["target"] == 4).astype(int)
    # Drop the columns we don't need
    # data = data.drop(columns=["target", "id", "date", "flag", "user"])  # NOTE: Not needed, since we only return the columns we use
    # Return as lists
    return list(data["text"]), list(data["sentiment"])


def create_pipeline(clf: BaseEstimator) -> Pipeline:
    """Build a TF-IDF + classifier pipeline; `memory=mem` caches fitted transformers on disk."""
    return Pipeline(
        [
            # Preprocess
            # ("vectorize", CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_FEATURES)),
            # ("tfidf", TfidfTransformer()),
            ("vectorize", TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)),
            # Classifier
            ("clf", clf),
        ],
        memory=mem,
    )


def evaluate_pipeline(pipeline: Pipeline, x: list[str], y: list[int]) -> float:
    """Print a classification report for the fitted pipeline and return its accuracy."""
    y_pred = pipeline.predict(x)
    report = classification_report(y, y_pred)
    click.echo(report)
    # TODO: Confusion matrix
    return accuracy_score(y, y_pred)


def export_pipeline(pipeline: Pipeline, name: str) -> None:
    """Dump the fitted pipeline to models/<name>.pkl."""
    model_path = MODELS_DIR / f"{name}.pkl"
    joblib.dump(pipeline, model_path)
    click.echo(f"Model exported to {model_path!r}")


# NOTE: click options assumed here so the bare train() call in the __main__ block
# works; the flag names and defaults are a guess, not part of the original source.
@click.command()
@click.option("--retrain", is_flag=True, help="Retrain even if a cached checkpoint exists.")
@click.option("--evaluate", is_flag=True, help="Print a classification report on the test split.")
@click.option("--flush-cache", is_flag=True, help="Clear the joblib pipeline cache before training.")
@click.option("--seed", type=int, default=SEED, show_default=True, help="Random seed.")
def train(retrain: bool, evaluate: bool, flush_cache: bool, seed: int) -> None:
    """Train (or load) the sentiment pipeline, optionally evaluate it, and export it."""
    rng = get_random_state(seed)
    # Clear sklearn cache
    if flush_cache:
        click.echo("Clearing cache... ", nl=False)
        mem.clear(warn=False)
        click.echo("DONE")
    # Load and split data
    click.echo("Loading data... ", nl=False)
    x, y = load_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=rng)
    click.echo("DONE")
    # Train model
    if retrain or not CHECKPOINT_PATH.exists():
        click.echo("Training model... ", nl=False)
        clf = LogisticRegression(max_iter=1000, random_state=rng)
        model = create_pipeline(clf)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Ignore joblib warnings
            model.fit(x_train, y_train)
        joblib.dump(model, CHECKPOINT_PATH)
        click.echo("DONE")
    else:
        click.echo("Loading model... ", nl=False)
        model = joblib.load(CHECKPOINT_PATH)
        click.echo("DONE")
    # Evaluate model
    if evaluate:
        evaluate_pipeline(model, x_test, y_test)
    # Quick test
    test_text = ["I love this movie", "I hate this movie"]
    click.echo("Quick test:")
    for text in test_text:
        click.echo(f"\t{'positive' if model.predict([text])[0] else 'negative'}: {text}")
    # Export model
    click.echo("Exporting model... ", nl=False)
    export_pipeline(model, "logistic_regression")
    click.echo("DONE")


if __name__ == "__main__":
    train()
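# Example invocation (assumes this file is saved as train.py; flag names follow the
# click options defined above, which were added as an assumption):
#   python train.py --retrain --evaluate --flush-cache --seed 42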