# Tymec: "Restructure project into package structure" (commit 667fe9d, 4.9 kB)
from __future__ import annotations
import warnings
from pathlib import Path
from typing import TYPE_CHECKING
import click
import joblib
import pandas as pd
from numpy.random import RandomState
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
if TYPE_CHECKING:
from sklearn.base import BaseEstimator
# Default random seed
SEED = 42

# Working directories
MODELS_DIR = Path("models")
CACHE_DIR = Path("cache")

# Input files and the training checkpoint (kept inside the cache directory)
DATASET_PATH = Path("data/training.1600000.processed.noemoticon.csv")
STOPWORDS_PATH = Path("data/stopwords-en.txt")
CHECKPOINT_PATH = CACHE_DIR / "pipeline.pkl"

# Upper bound on the number of features kept by the vectorizer
MAX_FEATURES = 10_000  # 500000

# Make sure paths exist
MODELS_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Memory cache for sklearn pipelines
mem = joblib.Memory(CACHE_DIR, verbose=0)

# TODO: use xgboost
def get_random_state(seed: int = SEED) -> RandomState:
    """Build a NumPy ``RandomState`` seeded with ``seed``.

    Args:
        seed: Seed for the generator; defaults to the module-level ``SEED``.

    Returns:
        A newly constructed ``RandomState`` instance.
    """
    rng = RandomState(seed)
    return rng
def load_data() -> tuple[list[str], list[int]]:
"""The model takes in a list of strings and a list of integers where 1 is positive sentiment and 0 is negative sentiment."""
data = pd.read_csv(
DATASET_PATH,
encoding="ISO-8859-1",
names=[
"target", # 0 = negative, 2 = neutral, 4 = positive
"id", # The id of the tweet
"date", # The date of the tweet
"flag", # The query, NO_QUERY if not present
"user", # The user that tweeted
"text", # The text of the tweet
],
)
# Ignore rows with neutral sentiment
data = data[data["target"] != 2]
# Create new column called "sentiment" with 1 for positive and 0 for negative
data["sentiment"] = data["target"] == 4
# Drop the columns we don't need
# data = data.drop(columns=["target", "id", "date", "flag", "user"]) # NOTE: No need, since we return the columns we need
# Return as lists
return list(data["text"]), list(data["sentiment"])
def create_pipeline(clf: BaseEstimator) -> Pipeline:
    """Wrap ``clf`` in a two-step text-classification pipeline.

    The first step (``"vectorize"``) is a ``TfidfVectorizer`` configured with
    ``ngram_range=(1, 2)`` and ``max_features=MAX_FEATURES``; the second step
    (``"clf"``) is the estimator passed in. The module-level joblib ``mem``
    is handed to the pipeline as its ``memory`` argument.

    Args:
        clf: Estimator used as the final step.

    Returns:
        The unfitted pipeline.
    """
    # Preprocess. A single TfidfVectorizer is used here; the earlier variant
    # was CountVectorizer(stop_words="english", ngram_range=(1, 2),
    # max_features=MAX_FEATURES) followed by TfidfTransformer().
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
    steps = [
        ("vectorize", vectorizer),
        # Classifier
        ("clf", clf),
    ]
    return Pipeline(steps, memory=mem)
def evaluate_pipeline(pipeline: Pipeline, x: list[str], y: list[int]) -> float:
    """Print a classification report for ``pipeline`` and return its accuracy.

    Args:
        pipeline: Fitted pipeline used to predict labels for ``x``.
        x: Input texts.
        y: Expected labels for ``x``.

    Returns:
        The accuracy score of the predictions against ``y``.
    """
    predictions = pipeline.predict(x)
    click.echo(classification_report(y, predictions))
    # TODO: Confusion matrix
    accuracy = accuracy_score(y, predictions)
    return accuracy
def export_pipeline(pipeline: Pipeline, name: str) -> None:
    """Dump ``pipeline`` with joblib to ``MODELS_DIR/<name>.pkl`` and echo the path.

    Args:
        pipeline: Pipeline to serialize.
        name: File stem of the exported model (``.pkl`` is appended).
    """
    destination = MODELS_DIR.joinpath(f"{name}.pkl")
    joblib.dump(pipeline, destination)
    click.echo("Model exported to " + repr(destination))
# CLI entry point: load and split the data, train a logistic-regression
# pipeline (or reload the cached checkpoint), optionally evaluate it, run a
# two-sentence smoke test and export the model.
# NOTE: documented with comments on purpose; click would show a docstring as
# the command's --help text.
@click.command()
@click.option("--retrain", is_flag=True, help="Train the model even if a checkpoint exists.")
@click.option("--evaluate", is_flag=True, help="Evaluate the model.")
@click.option("--flush-cache", is_flag=True, help="Clear sklearn cache.")
@click.option("--seed", type=int, default=SEED, help="Random seed.")
def train(retrain: bool, evaluate: bool, flush_cache: bool, seed: int) -> None:
    # One generator is shared by the split and the classifier, in that order.
    random_state = get_random_state(seed)

    # Clear sklearn cache
    if flush_cache:
        click.echo("Clearing cache... ", nl=False)
        mem.clear(warn=False)
        click.echo("DONE")

    # Load and split data (80% train / 20% test)
    click.echo("Loading data... ", nl=False)
    texts, labels = load_data()
    x_train, x_test, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=0.2,
        random_state=random_state,
    )
    click.echo("DONE")

    # Reuse the checkpoint only when it exists and no retrain was requested
    reuse_checkpoint = not retrain and CHECKPOINT_PATH.exists()
    if reuse_checkpoint:
        click.echo("Loading model... ", nl=False)
        model = joblib.load(CHECKPOINT_PATH)
        click.echo("DONE")
    else:
        click.echo("Training model... ", nl=False)
        classifier = LogisticRegression(max_iter=1000, random_state=random_state)
        model = create_pipeline(classifier)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Ignore joblib warnings
            model.fit(x_train, y_train)
        joblib.dump(model, CHECKPOINT_PATH)
        click.echo("DONE")

    # Evaluate model
    if evaluate:
        evaluate_pipeline(model, x_test, y_test)

    # Quick test
    click.echo("Quick test:")
    for sample in ("I love this movie", "I hate this movie"):
        verdict = "positive" if model.predict([sample])[0] else "negative"
        click.echo(f"\t{verdict}: {sample}")

    # Export model
    click.echo("Exporting model... ", nl=False)
    export_pipeline(model, "logistic_regression")
    click.echo("DONE")
# Script entry point: invoke the click command when this file is run directly.
if __name__ == "__main__":
    train()