Spaces:

Tymec
/

sentiment-analysis

Sleeping

App Files Files

Tymec committed on May 15, 2024

Commit

667fe9d

1 Parent(s): 7d4eb47

Restructure project into package structure

Browse files

Files changed (12) hide show

.vscode/settings.json +1 -1
README.md +15 -0
app/__init__.py +0 -0
app/constants.py +16 -0
app/gui.py +92 -0
app/model.py +144 -0
app/utils.py +164 -0
deprecated/__init__.py +0 -0
deprecated/main.py +44 -0
deprecated/train.py +152 -0
pyproject.toml +1 -0
style.css +3 -0

.vscode/settings.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "notebook.formatOnSave.enabled": true,
   "notebook.codeActionsOnSave": {
     "notebook.source.fixAll": "explicit",
-    "notebook.source.organizeImports": "explicit"
   },
   "[python]": {
     "editor.formatOnSave": true,

   "notebook.formatOnSave.enabled": true,
   "notebook.codeActionsOnSave": {
     "notebook.source.fixAll": "explicit",
+    "source.organizeImports": "explicit"
   },
   "[python]": {
     "editor.formatOnSave": true,

README.md CHANGED Viewed

@@ -6,3 +6,18 @@ Sentiment Analysis
 2. `cd` into the repository
 3. Run `just install` to install the dependencies
 4. Run `just run --help` to see the available commands

 2. `cd` into the repository
 3. Run `just install` to install the dependencies
 4. Run `just run --help` to see the available commands
+### TODO
+- [ ] CLI using `click` (commands: predict, train, evaluate) with settings set via flags or environment variables
+- [ ] GUI using `gradio` (tabs: predict, train, evaluate, compare, settings)
+- [ ] For the sklearn model, add more classifiers
+- [ ] Use random search for hyperparameter tuning and grid search for fine-tuning
+- [ ] Finish the text pre-processing transformer
+- [ ] For vectorization, use custom stopwords
+- [ ] Write our own tokenizer/vectorizer
+- [ ] Add more datasets
+- [ ] Add more models (e.g. BERT)
+- [ ] Write tests
+- [ ] Use xgboost?
+- [ ] Deploy to Hugging Face?

app/__init__.py ADDED Viewed

File without changes

app/constants.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from pathlib import Path
+DEFAULT_SEED: int = 42
+MAX_TOKENIZER_FEATURES: int = 500000
+CLF_MAX_ITER: int = 1000
+DATASET_PATH: Path = Path("data/training.1600000.processed.noemoticon.csv")
+STOPWORDS_PATH: Path = Path("data/stopwords-en.txt")
+MODELS_DIR: Path = Path("models")
+CACHE_DIR: Path = Path("cache")
+CHECKPOINT_PATH: Path = CACHE_DIR / "pipeline.pkl"
+# Create directories if they don't exist
+MODELS_DIR.mkdir(parents=True, exist_ok=True)
+CACHE_DIR.mkdir(parents=True, exist_ok=True)

app/gui.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from __future__ import annotations
+from pathlib import Path
+import gradio as gr
+from constants import MODELS_DIR
+from model import predict, tokenize
+# Optional stylesheet passed to gr.Blocks(css=...); resolved relative to the working directory.
+CSS_PATH = Path("style.css")
+# File-name suffixes used to find exported tokenizers and models inside MODELS_DIR.
+TOKENIZER_EXT = ".tokenizer.pkl"
+MODEL_EXT = ".model.pkl"
+# Labels returned by predict_wrapper for a truthy / falsy prediction.
+POSITIVE_LABEL = "Positive 😊"
+NEGATIVE_LABEL = "Negative 😤"
+# Not referenced by the code in this file yet (the layout still carries a "Refresh button" TODO).
+REFRESH_SYMBOL = "🔄"
+def load_style() -> str:
+    if not CSS_PATH.is_file():
+        return ""
+    with Path.open(CSS_PATH) as f:
+        return f.read()
+def predict_wrapper(text: str, tokenizer: str, model: str) -> str:
+    toks = tokenize(text, MODELS_DIR / f"{tokenizer}{TOKENIZER_EXT}")
+    pred = predict(toks, MODELS_DIR / f"{model}{MODEL_EXT}")
+    return POSITIVE_LABEL if pred else NEGATIVE_LABEL
+def train_wrapper() -> None:
+    msg = "Training is not supported in the GUI."
+    raise NotImplementedError(msg)
+def evaluate_wrapper() -> None:
+    msg = "Evaluation is not supported in the GUI."
+    raise NotImplementedError(msg)
+# Build the Gradio UI at import time: a text box on the left; result label, action buttons
+# and tokenizer/model selectors on the right.
+with gr.Blocks(css=load_style()) as demo:
+    gr.Markdown("## Sentiment Analysis")
+    with gr.Row(equal_height=True):
+        textbox = gr.Textbox(
+            lines=10,
+            label="Enter text to analyze",
+            placeholder="Enter text here",
+            key="input-textbox",
+        )
+        with gr.Column():
+            output = gr.Label()
+            # "justify-between" is the class defined in style.css
+            with gr.Row(elem_classes="justify-between"):
+                clear_btn = gr.ClearButton([textbox, output], value="Clear 🧹")
+                # Starts disabled; the textbox.input handler below enables it once text is entered
+                analyze_btn = gr.Button(
+                    "Analyze 🔍",
+                    variant="primary",
+                    interactive=False,
+                )
+            with gr.Row():
+                # Choices are the file names in MODELS_DIR with the full extension removed:
+                # Path.stem drops ".pkl", the slice then drops the remaining ".tokenizer" / ".model".
+                # The directory is scanned once, when this module is executed.
+                # NOTE(review): no initial value is set on either dropdown - confirm what
+                # predict_wrapper receives when the user has not selected anything.
+                tokenizer_selector = gr.Dropdown(
+                    choices=[tkn.stem[: -len(".tokenizer")] for tkn in MODELS_DIR.glob(f"*{TOKENIZER_EXT}")],
+                    label="Tokenizer",
+                    key="tokenizer-selector",
+                )
+                model_selector = gr.Dropdown(
+                    choices=[mdl.stem[: -len(".model")] for mdl in MODELS_DIR.glob(f"*{MODEL_EXT}")],
+                    label="Model",
+                    key="model-selector",
+                )
+                # TODO: Refresh button
+    # Event handlers
+    # Enable the analyze button only while the text box holds non-whitespace text
+    textbox.input(
+        fn=lambda text: gr.update(interactive=bool(text.strip())),
+        inputs=[textbox],
+        outputs=[analyze_btn],
+    )
+    # Run the prediction and show the returned label in the output component
+    analyze_btn.click(
+        fn=predict_wrapper,
+        inputs=[textbox, tokenizer_selector, model_selector],
+        outputs=[output],
+    )
+# Runs on import as well as on direct execution: there is no `if __name__ == "__main__"` guard.
+demo.queue()
+demo.launch()

app/model.py ADDED Viewed

	@@ -0,0 +1,144 @@

+from __future__ import annotations
+import warnings
+from functools import lru_cache
+from typing import TYPE_CHECKING, Sequence
+import joblib
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from constants import CLF_MAX_ITER, MAX_TOKENIZER_FEATURES
+from utils import get_cache_memory, get_random_state
+if TYPE_CHECKING:
+    from pathlib import Path
+    from numpy import ndarray
+    from numpy.random import RandomState
+__all__ = ["predict", "tokenize"]
+@lru_cache(maxsize=1)
+def get_model(model_path: Path) -> Pipeline:
+    return joblib.load(model_path)
+@lru_cache(maxsize=1)
+def get_tokenizer(tokenizer_path: Path) -> Pipeline:
+    return joblib.load(tokenizer_path)
+def export_to_file(pipeline: Pipeline, path: Path) -> None:
+    joblib.dump(pipeline, path)
+def tokenize(text: str, tokenizer_path: Path) -> ndarray:
+    tokenizer = get_tokenizer(tokenizer_path)
+    return tokenizer.transform([text])[0]
+def predict(tokens: ndarray, model_path: Path) -> bool:
+    model = get_model(model_path)
+    prediction = model.predict([tokens])
+    return prediction[0] == 1
+def train_and_export(
+    steps: Sequence[tuple],
+    x: list[str],
+    y: list[int],
+    export_path: Path,
+    cache: joblib.Memory,
+) -> Pipeline:
+    pipeline = Pipeline(steps, memory=cache)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        pipeline.fit(x, y)
+    export_to_file(pipeline, export_path)
+    return pipeline
+def train_tokenizer_and_export(x: list[str], y: list[int], export_path: Path, cache: joblib.Memory) -> Pipeline:
+    return train_and_export(
+        [
+            (
+                "vectorize",
+                CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES),
+            ),
+            ("tfidf", TfidfTransformer()),
+        ],
+        x,
+        y,
+        export_path,
+        cache,
+    )
+def train_model_and_export(
+    x: ndarray,
+    y: list[int],
+    export_path: Path,
+    cache: joblib.Memory,
+    rs: RandomState,
+) -> Pipeline:
+    return train_and_export(
+        [("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs))],
+        x,
+        y,
+        export_path,
+        cache,
+    )
+def train(x: list[str], y: list[int]) -> Pipeline:
+    cache = get_cache_memory()
+    rs = get_random_state()
+    tokenizer = train_tokenizer(x, y, cache)
+    x_tr = tokenizer.transform(x)
+    model = train_model(x_tr, y, cache, rs)
+    return Pipeline([("tokenizer", tokenizer), ("model", model)])
+def train_tokenizer(x: list[str], y: list[int], cache: joblib.Memory) -> Pipeline:
+    # TODO: In the future, allow for different tokenizers
+    pipeline = Pipeline(
+        [
+            (
+                "vectorize",
+                CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES),
+            ),
+            ("tfidf", TfidfTransformer()),
+        ],
+        memory=cache,
+    )
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")  # Ignore joblib warnings
+        pipeline.fit(x, y)
+    return pipeline
+def train_model(x: list[str], y: list[int], cache: joblib.Memory, rs: RandomState) -> Pipeline:
+    # TODO: In the future, allow for different classifiers
+    pipeline = Pipeline(
+        [
+            ("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs)),
+        ],
+        memory=cache,
+    )
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")  # Ignore joblib warnings
+        pipeline.fit(x, y)
+    return pipeline

app/utils.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""Utility functions"""
+from __future__ import annotations
+import itertools
+import re
+import warnings
+from collections import deque
+from enum import Enum
+from functools import lru_cache
+from threading import Event, Lock
+from typing import Any
+from joblib import Memory
+from numpy.random import RandomState
+from constants import CACHE_DIR, DEFAULT_SEED
+__all__ = ["colorize", "wrap_queued_call", "get_random_state", "get_cache_memory"]
+# Code passed to to_ansi() to build the sequence that closes tags and terminates colorize() output.
+ANSI_RESET = 0
+class Color(Enum):
+    """ANSI color codes."""
+    # Values are the base foreground codes; get_ansi_color() adds 60 for the bright
+    # variant and 10 for the background variant.
+    BLACK = 30
+    RED = 31
+    GREEN = 32
+    YELLOW = 33
+    BLUE = 34
+    MAGENTA = 35
+    CYAN = 36
+    WHITE = 37
+class Style(Enum):
+    """ANSI style codes."""
+    # Member names double as the tag names recognised by colorize(), e.g. [BOLD] ... [/BOLD].
+    BOLD = 1
+    DIM = 2
+    ITALIC = 3
+    UNDERLINE = 4
+    BLINK = 5
+    INVERTED = 7
+    HIDDEN = 8
+# https://gist.github.com/vitaliyp/6d54dd76ca2c3cdfc1149d33007dc34a
+class FIFOLock:
+    def __init__(self):
+        self._lock = Lock()
+        self._inner_lock = Lock()
+        self._pending_threads = deque()
+    def acquire(self, blocking: bool = True) -> bool:
+        with self._inner_lock:
+            lock_acquired = self._lock.acquire(False)
+            if lock_acquired:
+                return True
+            if not blocking:
+                return False
+            release_event = Event()
+            self._pending_threads.append(release_event)
+        release_event.wait()
+        return self._lock.acquire()
+    def release(self) -> None:
+        with self._inner_lock:
+            if self._pending_threads:
+                release_event = self._pending_threads.popleft()
+                release_event.set()
+            self._lock.release()
+    __enter__ = acquire
+    def __exit__(self, _t, _v, _tb):  # noqa: ANN001
+        self.release()
+@lru_cache(maxsize=1)
+def get_queue_lock() -> FIFOLock:
+    return FIFOLock()
+@lru_cache(maxsize=1)
+def get_random_state(seed: int = DEFAULT_SEED) -> RandomState:
+    return RandomState(seed)
+@lru_cache(maxsize=1)
+def get_cache_memory() -> Memory:
+    return Memory(CACHE_DIR, verbose=0)
+def to_ansi(code: int) -> str:
+    """Convert an integer to an ANSI escape code."""
+    return f"\033[{code}m"
+@lru_cache(maxsize=None)
+def get_ansi_color(color: Color, bright: bool = False, background: bool = False) -> str:
+    """Get ANSI color code for the specified color, brightness and background."""
+    code = color.value
+    if bright:
+        code += 60
+    if background:
+        code += 10
+    return to_ansi(code)
+def replace_color_tag(color: Color, text: str) -> None:
+    """Replace both dark and light color tags for background and foreground."""
+    for bright, bg in itertools.product([False, True], repeat=2):
+        tag = f"{'BG_' if bg else ''}{'BRIGHT_' if bright else ''}{color.name}"
+        text = text.replace(f"[{tag}]", get_ansi_color(color, bright=bright, background=bg))
+        text = text.replace(f"[/{tag}]", to_ansi(ANSI_RESET))
+    return text
+@lru_cache(maxsize=256)
+def colorize(text: str, strip: bool = True) -> str:
+    """Format text with ANSI color codes using tags [COLOR], [BG_COLOR] and [STYLE].
+    Reset color/style with [/TAG].
+    Escape with double brackets [[]]. Strip leading and trailing whitespace if strip=True.
+    """
+    # replace foreground and background color tags
+    for color in Color:
+        text = replace_color_tag(color, text)
+    # replace style tags
+    for style in Style:
+        text = text.replace(f"[{style.name}]", to_ansi(style.value)).replace(f"[/{style.name}]", to_ansi(ANSI_RESET))
+    # if there are any tags left, remove them and throw a warning
+    pat1 = re.compile(r"((?<!\[)\[)([^\[\]]*)(\](?!\]))")
+    for match in pat1.finditer(text):
+        color = match.group(1)
+        text = text.replace(match.group(0), "")
+        warnings.warn(f"Invalid color tag: {color!r}", UserWarning, stacklevel=2)
+    # escape double brackets
+    pat2 = re.compile(r"\[\[[^\[\]\v]+\]\]")
+    text = pat2.sub("", text)
+    # reset color/style at the end
+    text += to_ansi(ANSI_RESET)
+    return text.strip() if strip else text
+# https://github.com/AUTOMATIC1111/stable-diffusion-webui/modules/call_queue.py
+def wrap_queued_call(func: callable) -> callable:
+    def f(*args, **kwargs) -> Any:  # noqa: ANN003, ANN002
+        with get_queue_lock():
+            return func(*args, **kwargs)
+    return f

deprecated/__init__.py ADDED Viewed

File without changes

deprecated/main.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from __future__ import annotations
+from pathlib import Path
+import click
+import joblib
+from app.utils import colorize
+# Root command group; sub-commands register themselves through @cli.command.
+# Documented with a comment instead of a docstring because click shows a docstring as help text.
+@click.group()
+def cli() -> None: ...
+@cli.command("predict")
+@click.option(
+    "-m",
+    "--model",
+    "model_path",
+    default="models/model.pkl",
+    help="Path to the model file.",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True, path_type=Path),
+)
+@click.argument("text", nargs=-1)
+def predict(model_path: Path, text: list[str]) -> None:
+    input_text = " ".join(text).strip()
+    if not input_text:
+        click.echo("[RED]Error[/RED]: Input text is empty.")
+        return
+    # Load the model
+    click.echo("Loading model... ", nl=False)
+    model = joblib.load(model_path)
+    click.echo(colorize("[GREEN]DONE"))
+    # Run the model
+    click.echo("Performing sentiment analysis... ", nl=False)
+    prediction = model.predict([input_text])
+    sentiment = "[GREEN]POSITIVE" if prediction[0] == 1 else "[RED]NEGATIVE"
+    click.echo(colorize(sentiment))
+if __name__ == "__main__":
+    cli()

deprecated/train.py ADDED Viewed

	@@ -0,0 +1,152 @@

+from __future__ import annotations
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING
+import click
+import joblib
+import pandas as pd
+from numpy.random import RandomState
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+if TYPE_CHECKING:
+    from sklearn.base import BaseEstimator
+SEED = 42
+DATASET_PATH = Path("data/training.1600000.processed.noemoticon.csv")
+STOPWORDS_PATH = Path("data/stopwords-en.txt")
+CHECKPOINT_PATH = Path("cache/pipeline.pkl")
+MODELS_DIR = Path("models")
+CACHE_DIR = Path("cache")
+MAX_FEATURES = 10000  # 500000
+# Make sure paths exist
+MODELS_DIR.mkdir(parents=True, exist_ok=True)
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+# Memory cache for sklearn pipelines
+mem = joblib.Memory(CACHE_DIR, verbose=0)
+# TODO: use xgboost
+def get_random_state(seed: int = SEED) -> RandomState:
+    return RandomState(seed)
+def load_data() -> tuple[list[str], list[int]]:
+    """The model takes in a list of strings and a list of integers where 1 is positive sentiment and 0 is negative sentiment."""
+    data = pd.read_csv(
+        DATASET_PATH,
+        encoding="ISO-8859-1",
+        names=[
+            "target",  # 0 = negative, 2 = neutral, 4 = positive
+            "id",  # The id of the tweet
+            "date",  # The date of the tweet
+            "flag",  # The query, NO_QUERY if not present
+            "user",  # The user that tweeted
+            "text",  # The text of the tweet
+        ],
+    )
+    # Ignore rows with neutral sentiment
+    data = data[data["target"] != 2]
+    # Create new column called "sentiment" with 1 for positive and 0 for negative
+    data["sentiment"] = data["target"] == 4
+    # Drop the columns we don't need
+    # data = data.drop(columns=["target", "id", "date", "flag", "user"]) # NOTE: No need, since we return the columns we need
+    # Return as lists
+    return list(data["text"]), list(data["sentiment"])
+def create_pipeline(clf: BaseEstimator) -> Pipeline:
+    return Pipeline(
+        [
+            # Preprocess
+            # ("vectorize", CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_FEATURES)),
+            # ("tfidf", TfidfTransformer()),
+            ("vectorize", TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)),
+            # Classifier
+            ("clf", clf),
+        ],
+        memory=mem,
+    )
+def evaluate_pipeline(pipeline: Pipeline, x: list[str], y: list[int]) -> float:
+    y_pred = pipeline.predict(x)
+    report = classification_report(y, y_pred)
+    click.echo(report)
+    # TODO: Confusion matrix
+    return accuracy_score(y, y_pred)
+def export_pipeline(pipeline: Pipeline, name: str) -> None:
+    model_path = MODELS_DIR / f"{name}.pkl"
+    joblib.dump(pipeline, model_path)
+    click.echo(f"Model exported to {model_path!r}")
+# Training entry point: load data, train (or reuse the checkpoint), optionally evaluate,
+# run a two-sentence smoke test and export the model.
+# Documented with comments instead of a docstring because click shows a docstring as help text.
+@click.command()
+@click.option("--retrain", is_flag=True, help="Train the model even if a checkpoint exists.")
+@click.option("--evaluate", is_flag=True, help="Evaluate the model.")
+@click.option("--flush-cache", is_flag=True, help="Clear sklearn cache.")
+@click.option("--seed", type=int, default=SEED, help="Random seed.")
+def train(retrain: bool, evaluate: bool, flush_cache: bool, seed: int) -> None:
+    # One RandomState is shared by the train/test split and the classifier
+    rng = get_random_state(seed)
+    # Clear sklearn cache
+    if flush_cache:
+        click.echo("Clearing cache... ", nl=False)
+        mem.clear(warn=False)
+        click.echo("DONE")
+    # Load and split data
+    click.echo("Loading data... ", nl=False)
+    x, y = load_data()
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=rng)
+    click.echo("DONE")
+    # Train model
+    # An existing checkpoint is reused unless --retrain is given; --seed only affects the
+    # split in that case, not the loaded model.
+    if retrain or not CHECKPOINT_PATH.exists():
+        click.echo("Training model... ", nl=False)
+        clf = LogisticRegression(max_iter=1000, random_state=rng)
+        model = create_pipeline(clf)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")  # Ignore joblib warnings
+            model.fit(x_train, y_train)
+        joblib.dump(model, CHECKPOINT_PATH)
+        click.echo("DONE")
+    else:
+        click.echo("Loading model... ", nl=False)
+        model = joblib.load(CHECKPOINT_PATH)
+        click.echo("DONE")
+    # Evaluate model
+    # The accuracy returned by evaluate_pipeline is discarded; only the printed report is used
+    if evaluate:
+        evaluate_pipeline(model, x_test, y_test)
+    # Quick test
+    test_text = ["I love this movie", "I hate this movie"]
+    click.echo("Quick test:")
+    for text in test_text:
+        click.echo(f"\t{'positive' if model.predict([text])[0] else 'negative'}: {text}")
+    # Export model
+    # export_pipeline echoes its own message, which lands on the same line as the
+    # "Exporting model... " prefix because of nl=False
+    click.echo("Exporting model... ", nl=False)
+    export_pipeline(model, "logistic_regression")
+    click.echo("DONE")
+if __name__ == "__main__":
+    train()

pyproject.toml CHANGED Viewed

@@ -108,6 +108,7 @@ ignore = [
   "PERF203", # ignore for now; investigate
   "T201",    # print
   "ANN204",  # missing-return-type-special-method
 ]
 select = ["ALL"]
 # Allow unused variables when underscore-prefixed

   "PERF203", # ignore for now; investigate
   "T201",    # print
   "ANN204",  # missing-return-type-special-method
+  "ERA001",  # commented-out-code
 ]
 select = ["ALL"]
 # Allow unused variables when underscore-prefixed

style.css ADDED Viewed

	@@ -0,0 +1,3 @@

+/* Applied to the Clear/Analyze button row (elem_classes="justify-between" in app/gui.py).
+   Spreads the row's children apart; assumes the row is rendered as a flex container. */
+.justify-between {
+  justify-content: space-between;
+}