from __future__ import annotations

import warnings
from pathlib import Path
from typing import TYPE_CHECKING

import click
import joblib
import pandas as pd
from numpy.random import RandomState
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

if TYPE_CHECKING:
    from sklearn.base import BaseEstimator

SEED = 42
DATASET_PATH = Path("data/training.1600000.processed.noemoticon.csv")
STOPWORDS_PATH = Path("data/stopwords-en.txt")
CHECKPOINT_PATH = Path("cache/pipeline.pkl")
MODELS_DIR = Path("models")
CACHE_DIR = Path("cache")
MAX_FEATURES = 10000  # Try 500000 for a much larger vocabulary (slower, more memory)
# Make sure the output directories exist
MODELS_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)
# Memory cache for sklearn pipelines
mem = joblib.Memory(CACHE_DIR, verbose=0)
# TODO: use xgboost (see the sketch after create_pipeline below)


def get_random_state(seed: int = SEED) -> RandomState:
    return RandomState(seed)


def load_data() -> tuple[list[str], list[int]]:
    """Load the Sentiment140 CSV and return texts and labels, where 1 is positive sentiment and 0 is negative."""
    data = pd.read_csv(
        DATASET_PATH,
        encoding="ISO-8859-1",
        names=[
            "target",  # 0 = negative, 2 = neutral, 4 = positive
            "id",  # The id of the tweet
            "date",  # The date of the tweet
            "flag",  # The query, NO_QUERY if not present
            "user",  # The user that tweeted
            "text",  # The text of the tweet
        ],
    )
    # Ignore rows with neutral sentiment
    data = data[data["target"] != 2]
    # Create a new column called "sentiment" with 1 for positive and 0 for negative
    data["sentiment"] = (data["target"] == 4).astype(int)
    # data = data.drop(columns=["target", "id", "date", "flag", "user"])  # NOTE: Not needed, since we return only the columns we use
    # Return as lists
    return list(data["text"]), list(data["sentiment"])
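

# A quick sanity check for the loader (hedged sketch; Sentiment140 ships 1,600,000
# tweets split evenly between negative and positive, so the label mean should be
# roughly 0.5 on an intact download):
#
#     texts, labels = load_data()
#     assert len(texts) == len(labels)
#     print(sum(labels) / len(labels))  # expect ~0.5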


def create_pipeline(clf: BaseEstimator) -> Pipeline:
    return Pipeline(
        [
            # Preprocess (TfidfVectorizer is CountVectorizer + TfidfTransformer in one step)
            # ("vectorize", CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_FEATURES)),
            # ("tfidf", TfidfTransformer()),
            ("vectorize", TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)),
            # Classifier
            ("clf", clf),
        ],
        memory=mem,
    )
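

# The xgboost TODO near the top could be tackled by dropping an XGBClassifier into
# the same pipeline; a hedged sketch assuming the optional xgboost package
# (create_xgboost_pipeline and n_estimators=300 are illustrative, not the original):
def create_xgboost_pipeline(seed: int = SEED) -> Pipeline:
    # Imported lazily so the rest of the script runs without xgboost installed
    from xgboost import XGBClassifier

    return create_pipeline(XGBClassifier(n_estimators=300, random_state=seed))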


def evaluate_pipeline(pipeline: Pipeline, x: list[str], y: list[int]) -> float:
    y_pred = pipeline.predict(x)
    report = classification_report(y, y_pred)
    click.echo(report)
    # TODO: Confusion matrix (see the sketch below)
    return accuracy_score(y, y_pred)
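

# The confusion-matrix TODO above could be closed with sklearn's confusion_matrix;
# this helper is a hedged sketch (the name print_confusion_matrix is ours, not part
# of the original script) and is not wired into train() below.
def print_confusion_matrix(y_true: list[int], y_pred: list[int]) -> None:
    from sklearn.metrics import confusion_matrix

    # Rows are true labels, columns are predictions: [[tn, fp], [fn, tp]]
    click.echo(confusion_matrix(y_true, y_pred))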


def export_pipeline(pipeline: Pipeline, name: str) -> None:
    model_path = MODELS_DIR / f"{name}.pkl"
    joblib.dump(pipeline, model_path)
    click.echo(f"Model exported to {model_path}")


@click.command()
@click.option("--retrain", is_flag=True, help="Train the model even if a checkpoint exists.")
@click.option("--evaluate", is_flag=True, help="Evaluate the model.")
@click.option("--flush-cache", is_flag=True, help="Clear sklearn cache.")
@click.option("--seed", type=int, default=SEED, help="Random seed.")
def train(retrain: bool, evaluate: bool, flush_cache: bool, seed: int) -> None:
    rng = get_random_state(seed)

    # Clear sklearn cache
    if flush_cache:
        click.echo("Clearing cache... ", nl=False)
        mem.clear(warn=False)
        click.echo("DONE")

    # Load and split data
    click.echo("Loading data... ", nl=False)
    x, y = load_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=rng)
    click.echo("DONE")

    # Train model
    if retrain or not CHECKPOINT_PATH.exists():
        click.echo("Training model... ", nl=False)
        clf = LogisticRegression(max_iter=1000, random_state=rng)
        model = create_pipeline(clf)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Ignore joblib warnings
            model.fit(x_train, y_train)
        joblib.dump(model, CHECKPOINT_PATH)
        click.echo("DONE")
    else:
        click.echo("Loading model... ", nl=False)
        model = joblib.load(CHECKPOINT_PATH)
        click.echo("DONE")

    # Evaluate model
    if evaluate:
        evaluate_pipeline(model, x_test, y_test)

    # Quick test
    test_text = ["I love this movie", "I hate this movie"]
    click.echo("Quick test:")
    for text in test_text:
        click.echo(f"\t{'positive' if model.predict([text])[0] else 'negative'}: {text}")

    # Export model
    click.echo("Exporting model... ", nl=False)
    export_pipeline(model, "logistic_regression")
    click.echo("DONE")


if __name__ == "__main__":
    train()
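
# Example invocations (the train.py filename is hypothetical; adjust to the actual
# script name):
#
#     python train.py                       # reuse the checkpoint if one exists
#     python train.py --retrain --evaluate  # force retraining and print metrics
#     python train.py --flush-cache         # clear the joblib pipeline cache first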