Spaces:
Running
Running
Restructure project into package structure
Browse files- .vscode/settings.json +1 -1
- README.md +15 -0
- app/__init__.py +0 -0
- app/constants.py +16 -0
- app/gui.py +92 -0
- app/model.py +144 -0
- app/utils.py +164 -0
- deprecated/__init__.py +0 -0
- deprecated/main.py +44 -0
- deprecated/train.py +152 -0
- pyproject.toml +1 -0
- style.css +3 -0
.vscode/settings.json
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"notebook.formatOnSave.enabled": true,
|
3 |
"notebook.codeActionsOnSave": {
|
4 |
"notebook.source.fixAll": "explicit",
|
5 |
-
"
|
6 |
},
|
7 |
"[python]": {
|
8 |
"editor.formatOnSave": true,
|
|
|
2 |
"notebook.formatOnSave.enabled": true,
|
3 |
"notebook.codeActionsOnSave": {
|
4 |
"notebook.source.fixAll": "explicit",
|
5 |
+
"source.organizeImports": "explicit"
|
6 |
},
|
7 |
"[python]": {
|
8 |
"editor.formatOnSave": true,
|
README.md
CHANGED
@@ -6,3 +6,18 @@ Sentiment Analysis
|
|
6 |
2. `cd` into the repository
|
7 |
3. Run `just install` to install the dependencies
|
8 |
4. Run `just run --help` to see the available commands
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
2. `cd` into the repository
|
7 |
3. Run `just install` to install the dependencies
|
8 |
4. Run `just run --help` to see the available commands
|
9 |
+
|
10 |
+
|
11 |
+
### TODO
|
12 |
+
- [ ] CLI using `click` (commands: predict, train, evaluate) with settings set via flags or environment variables
|
13 |
+
- [ ] GUI using `gradio` (tabs: predict, train, evaluate, compare, settings)
|
14 |
+
- [ ] For the sklearn model, add more classifiers
|
15 |
+
- [ ] Use random search for hyperparameter tuning and grid search for fine-tuning
|
16 |
+
- [ ] Finish the text pre-processing transformer
|
17 |
+
- [ ] For vectorization, use custom stopwords
|
18 |
+
- [ ] Write own tokenizer/vectorizer
|
19 |
+
- [ ] Add more datasets
|
20 |
+
- [ ] Add more models (e.g. BERT)
|
21 |
+
- [ ] Write tests
|
22 |
+
- [ ] Use xgboost?
|
23 |
+
- [ ] Deploy to huggingface?
|
app/__init__.py
ADDED
File without changes
|
app/constants.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
# Reproducibility and model hyper-parameters.
DEFAULT_SEED: int = 42
MAX_TOKENIZER_FEATURES: int = 500000
CLF_MAX_ITER: int = 1000

# Input data locations (relative to the working directory).
DATASET_PATH: Path = Path("data/training.1600000.processed.noemoticon.csv")
STOPWORDS_PATH: Path = Path("data/stopwords-en.txt")

# Output locations for exported models and the joblib cache.
MODELS_DIR: Path = Path("models")
CACHE_DIR: Path = Path("cache")
CHECKPOINT_PATH: Path = CACHE_DIR / "pipeline.pkl"


# Importing this module creates the output directories when they are missing.
for _output_dir in (MODELS_DIR, CACHE_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)
|
app/gui.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
+
from constants import MODELS_DIR
|
8 |
+
from model import predict, tokenize
|
9 |
+
|
10 |
+
# Optional stylesheet handed to gr.Blocks.
CSS_PATH = Path("style.css")

# File-name suffixes used to discover exported tokenizers and models.
TOKENIZER_EXT = ".tokenizer.pkl"
MODEL_EXT = ".model.pkl"

# User-facing labels and symbols.
POSITIVE_LABEL = "Positive 😊"
NEGATIVE_LABEL = "Negative 😤"
REFRESH_SYMBOL = "🔄"
|
16 |
+
|
17 |
+
|
18 |
+
def load_style() -> str:
    """Return the contents of the custom stylesheet, or "" when it is absent.

    The file at ``CSS_PATH`` is decoded explicitly as UTF-8 so the result does
    not depend on the platform's default text encoding.
    """
    if not CSS_PATH.is_file():
        return ""

    return CSS_PATH.read_text(encoding="utf-8")
|
24 |
+
|
25 |
+
|
26 |
+
def predict_wrapper(text: str, tokenizer: str, model: str) -> str:
    """Classify ``text`` with the selected tokenizer and model.

    Args:
        text: The text to analyze.
        tokenizer: Name of the tokenizer file in ``MODELS_DIR`` without ``TOKENIZER_EXT``.
        model: Name of the model file in ``MODELS_DIR`` without ``MODEL_EXT``.

    Returns:
        ``POSITIVE_LABEL`` when the prediction is truthy, otherwise ``NEGATIVE_LABEL``.

    Raises:
        gr.Error: If no tokenizer or no model is selected.
    """
    # An empty selection would otherwise be formatted into a path such as
    # "None.tokenizer.pkl" and fail later with an unhelpful load error.
    if not tokenizer or not model:
        msg = "Select both a tokenizer and a model before analyzing."
        raise gr.Error(msg)

    toks = tokenize(text, MODELS_DIR / f"{tokenizer}{TOKENIZER_EXT}")
    pred = predict(toks, MODELS_DIR / f"{model}{MODEL_EXT}")
    return POSITIVE_LABEL if pred else NEGATIVE_LABEL
|
30 |
+
|
31 |
+
|
32 |
+
def train_wrapper() -> None:
    """Placeholder for GUI training; always raises ``NotImplementedError``."""
    raise NotImplementedError("Training is not supported in the GUI.")
|
35 |
+
|
36 |
+
|
37 |
+
def evaluate_wrapper() -> None:
    """Placeholder for GUI evaluation; always raises ``NotImplementedError``."""
    raise NotImplementedError("Evaluation is not supported in the GUI.")
|
40 |
+
|
41 |
+
|
42 |
+
# NOTE(review): the layout nesting below was reconstructed from a flattened
# diff view - confirm it against the repository before relying on it.
with gr.Blocks(css=load_style()) as demo:
    gr.Markdown("## Sentiment Analysis")

    with gr.Row(equal_height=True):
        textbox = gr.Textbox(
            lines=10,
            label="Enter text to analyze",
            placeholder="Enter text here",
            key="input-textbox",
        )

        with gr.Column():
            output = gr.Label()

            with gr.Row(elem_classes="justify-between"):
                clear_btn = gr.ClearButton([textbox, output], value="Clear 🧹")
                # Starts disabled; the textbox handler below enables it.
                analyze_btn = gr.Button("Analyze 🔍", variant="primary", interactive=False)

    with gr.Row():
        # Offer every "<name>.tokenizer.pkl" in MODELS_DIR as "<name>".
        tokenizer_selector = gr.Dropdown(
            label="Tokenizer",
            key="tokenizer-selector",
            choices=[tkn.stem[: -len(".tokenizer")] for tkn in MODELS_DIR.glob(f"*{TOKENIZER_EXT}")],
        )

        # Offer every "<name>.model.pkl" in MODELS_DIR as "<name>".
        model_selector = gr.Dropdown(
            label="Model",
            key="model-selector",
            choices=[mdl.stem[: -len(".model")] for mdl in MODELS_DIR.glob(f"*{MODEL_EXT}")],
        )

        # TODO: Refresh button

    # Event handlers
    # Enable the analyze button only while the textbox holds non-blank text.
    textbox.input(
        fn=lambda text: gr.update(interactive=bool(text.strip())),
        inputs=[textbox],
        outputs=[analyze_btn],
    )
    analyze_btn.click(
        fn=predict_wrapper,
        inputs=[textbox, tokenizer_selector, model_selector],
        outputs=[output],
    )

demo.queue()
demo.launch()
|
app/model.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import warnings
|
4 |
+
from functools import lru_cache
|
5 |
+
from typing import TYPE_CHECKING, Sequence
|
6 |
+
|
7 |
+
import joblib
|
8 |
+
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
9 |
+
from sklearn.linear_model import LogisticRegression
|
10 |
+
from sklearn.pipeline import Pipeline
|
11 |
+
|
12 |
+
from constants import CLF_MAX_ITER, MAX_TOKENIZER_FEATURES
|
13 |
+
from utils import get_cache_memory, get_random_state
|
14 |
+
|
15 |
+
if TYPE_CHECKING:
|
16 |
+
from pathlib import Path
|
17 |
+
|
18 |
+
from numpy import ndarray
|
19 |
+
from numpy.random import RandomState
|
20 |
+
|
21 |
+
|
22 |
+
__all__ = ["predict", "tokenize"]
|
23 |
+
|
24 |
+
|
25 |
+
@lru_cache(maxsize=1)
def get_model(model_path: Path) -> Pipeline:
    """Load the model stored at ``model_path`` with joblib.

    Only the most recently requested path is kept in the cache (``maxsize=1``).
    joblib unpickles the file, so only load files from a trusted source.
    """
    loaded = joblib.load(model_path)
    return loaded
|
28 |
+
|
29 |
+
|
30 |
+
@lru_cache(maxsize=1)
def get_tokenizer(tokenizer_path: Path) -> Pipeline:
    """Load the tokenizer stored at ``tokenizer_path`` with joblib.

    Only the most recently requested path is kept in the cache (``maxsize=1``).
    joblib unpickles the file, so only load files from a trusted source.
    """
    loaded = joblib.load(tokenizer_path)
    return loaded
|
33 |
+
|
34 |
+
|
35 |
+
def export_to_file(pipeline: Pipeline, path: Path) -> None:
    """Serialize ``pipeline`` to ``path`` with ``joblib.dump``."""
    joblib.dump(value=pipeline, filename=path)
|
37 |
+
|
38 |
+
|
39 |
+
def tokenize(text: str, tokenizer_path: Path) -> ndarray:
    """Transform ``text`` with the tokenizer stored at ``tokenizer_path``.

    The text is transformed as a batch of one and the first row is returned.
    """
    rows = get_tokenizer(tokenizer_path).transform([text])
    return rows[0]
|
42 |
+
|
43 |
+
|
44 |
+
def predict(tokens: ndarray, model_path: Path) -> bool:
    """Return True when the model at ``model_path`` predicts class 1 for ``tokens``.

    Args:
        tokens: Features for a single sample, either a flat 1-D vector or an
            already 2-D single-row batch.
        model_path: Path of the exported model to load via ``get_model``.

    Returns:
        A plain ``bool``, never a NumPy boolean scalar.
    """
    model = get_model(model_path)
    # A 2-D input is already a batch of one; wrapping it in a list would hand
    # the estimator a list of matrices. Presumably ``tokenize`` returns such a
    # 1xN sparse row - confirm. Only a flat vector needs the extra dimension.
    batch = tokens if getattr(tokens, "ndim", 1) == 2 else [tokens]
    prediction = model.predict(batch)
    return bool(prediction[0] == 1)
|
48 |
+
|
49 |
+
|
50 |
+
def train_and_export(
    steps: Sequence[tuple],
    x: list[str],
    y: list[int],
    export_path: Path,
    cache: joblib.Memory,
) -> Pipeline:
    """Fit a pipeline built from ``steps`` on ``x``/``y`` and dump it to ``export_path``.

    All warnings raised while fitting are suppressed. The fitted pipeline is
    returned after it has been written with joblib.
    """
    fitted = Pipeline(steps, memory=cache)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fitted.fit(x, y)

    # Same call export_to_file() makes.
    joblib.dump(fitted, export_path)
    return fitted
|
65 |
+
|
66 |
+
|
67 |
+
def train_tokenizer_and_export(x: list[str], y: list[int], export_path: Path, cache: joblib.Memory) -> Pipeline:
    """Fit the count-vectorizer + TF-IDF tokenizer and export it to ``export_path``."""
    vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES)
    tokenizer_steps = [
        ("vectorize", vectorizer),
        ("tfidf", TfidfTransformer()),
    ]
    return train_and_export(tokenizer_steps, x, y, export_path, cache)
|
81 |
+
|
82 |
+
|
83 |
+
def train_model_and_export(
    x: ndarray,
    y: list[int],
    export_path: Path,
    cache: joblib.Memory,
    rs: RandomState,
) -> Pipeline:
    """Fit the logistic-regression classifier on ``x``/``y`` and export it to ``export_path``."""
    classifier = LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs)
    classifier_steps = [("clf", classifier)]
    return train_and_export(classifier_steps, x, y, export_path, cache)
|
97 |
+
|
98 |
+
|
99 |
+
def train(x: list[str], y: list[int]) -> Pipeline:
    """Train a tokenizer and a classifier and return them combined in one pipeline.

    The tokenizer is fitted on ``x`` first; the classifier is then fitted on the
    tokenizer's transformation of ``x``.
    """
    memory = get_cache_memory()
    random_state = get_random_state()

    tokenizer = train_tokenizer(x, y, memory)
    model = train_model(tokenizer.transform(x), y, memory, random_state)

    combined_steps = [("tokenizer", tokenizer), ("model", model)]
    return Pipeline(combined_steps)
|
109 |
+
|
110 |
+
|
111 |
+
def train_tokenizer(x: list[str], y: list[int], cache: joblib.Memory) -> Pipeline:
    """Fit and return the count-vectorizer + TF-IDF tokenizer pipeline."""
    # TODO: In the future, allow for different tokenizers
    vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES)
    tokenizer = Pipeline(
        [("vectorize", vectorizer), ("tfidf", TfidfTransformer())],
        memory=cache,
    )

    with warnings.catch_warnings():
        # Ignore joblib warnings
        warnings.simplefilter("ignore")
        tokenizer.fit(x, y)

    return tokenizer
|
129 |
+
|
130 |
+
|
131 |
+
def train_model(x: ndarray, y: list[int], cache: joblib.Memory, rs: RandomState) -> Pipeline:
    """Fit and return a single-step logistic-regression pipeline.

    Args:
        x: Feature matrix; ``train`` passes the tokenizer's transformed output,
            so it is annotated like ``train_model_and_export`` rather than as
            raw strings.
        y: Target labels, one per row of ``x``.
        cache: joblib memory handed to the pipeline.
        rs: Random state for the classifier.

    Returns:
        The fitted pipeline.
    """
    # TODO: In the future, allow for different classifiers
    pipeline = Pipeline(
        [
            ("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs)),
        ],
        memory=cache,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Ignore joblib warnings
        pipeline.fit(x, y)

    return pipeline
|
app/utils.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Utility functions"""
|
2 |
+
|
3 |
+
from __future__ import annotations
|
4 |
+
|
5 |
+
import itertools
|
6 |
+
import re
|
7 |
+
import warnings
|
8 |
+
from collections import deque
|
9 |
+
from enum import Enum
|
10 |
+
from functools import lru_cache
|
11 |
+
from threading import Event, Lock
|
12 |
+
from typing import Any
|
13 |
+
|
14 |
+
from joblib import Memory
|
15 |
+
from numpy.random import RandomState
|
16 |
+
|
17 |
+
from constants import CACHE_DIR, DEFAULT_SEED
|
18 |
+
|
19 |
+
__all__ = ["colorize", "wrap_queued_call", "get_random_state", "get_cache_memory"]
|
20 |
+
|
21 |
+
|
22 |
+
ANSI_RESET = 0
|
23 |
+
|
24 |
+
|
25 |
+
class Color(Enum):
    """Base ANSI codes for the eight standard foreground colors."""

    # get_ansi_color() adds 60 for the bright variant and 10 for a background.
    BLACK = 30
    RED = 31
    GREEN = 32
    YELLOW = 33
    BLUE = 34
    MAGENTA = 35
    CYAN = 36
    WHITE = 37
|
36 |
+
|
37 |
+
|
38 |
+
class Style(Enum):
    """ANSI codes for text styles."""

    # Code 6 is not defined here.
    BOLD = 1
    DIM = 2
    ITALIC = 3
    UNDERLINE = 4
    BLINK = 5
    INVERTED = 7
    HIDDEN = 8
|
48 |
+
|
49 |
+
|
50 |
+
# https://gist.github.com/vitaliyp/6d54dd76ca2c3cdfc1149d33007dc34a
|
51 |
+
class FIFOLock:
    """Mutual-exclusion lock that serves blocked callers in arrival order.

    Each blocked caller waits on its own ``Event`` in a queue. ``release()``
    hands ownership straight to the oldest waiter without unlocking, so a
    newly arriving caller cannot take the lock ahead of the queue.
    """

    def __init__(self):
        self._lock = Lock()  # the lock callers contend for
        self._inner_lock = Lock()  # guards the queue and the hand-off decision
        self._pending_threads = deque()  # one Event per blocked caller, oldest first

    def acquire(self, blocking: bool = True) -> bool:
        """Acquire the lock.

        Args:
            blocking: When False, return immediately instead of queueing.

        Returns:
            True once the lock is held; False only when ``blocking`` is False
            and the lock is currently held.
        """
        with self._inner_lock:
            if self._lock.acquire(False):
                return True
            if not blocking:
                return False

            release_event = Event()
            self._pending_threads.append(release_event)

        # Wait outside _inner_lock so release() can make progress.
        release_event.wait()
        # release() left _lock locked on this caller's behalf, so ownership has
        # already been transferred; re-acquiring here would allow barging.
        return True

    def release(self) -> None:
        """Release the lock, passing it to the oldest waiter when there is one.

        Raises:
            RuntimeError: If the lock is not held and nobody is waiting.
        """
        with self._inner_lock:
            if self._pending_threads:
                # Direct hand-off: keep _lock locked and wake the next owner.
                self._pending_threads.popleft().set()
            else:
                self._lock.release()

    # Used as a context manager, entering blocks until the lock is held.
    __enter__ = acquire

    def __exit__(self, _t, _v, _tb):  # noqa: ANN001
        self.release()
|
83 |
+
|
84 |
+
|
85 |
+
@lru_cache(maxsize=1)
def get_queue_lock() -> FIFOLock:
    """Return the cached ``FIFOLock``; it is created on the first call."""
    queue_lock = FIFOLock()
    return queue_lock
|
88 |
+
|
89 |
+
|
90 |
+
@lru_cache(maxsize=1)
def get_random_state(seed: int = DEFAULT_SEED) -> RandomState:
    """Return a ``RandomState`` seeded with ``seed``.

    The result is cached for the most recent seed only, so repeated calls with
    that seed return the same, already advanced, generator object.
    """
    generator = RandomState(seed)
    return generator
|
93 |
+
|
94 |
+
|
95 |
+
@lru_cache(maxsize=1)
def get_cache_memory() -> Memory:
    """Return the cached, non-verbose joblib ``Memory`` rooted at ``CACHE_DIR``."""
    memory = Memory(CACHE_DIR, verbose=0)
    return memory
|
98 |
+
|
99 |
+
|
100 |
+
def to_ansi(code: int) -> str:
    """Return the ANSI escape sequence ``ESC[<code>m`` for ``code``."""
    return "\033[" + str(code) + "m"
|
103 |
+
|
104 |
+
|
105 |
+
@lru_cache(maxsize=None)
def get_ansi_color(color: Color, bright: bool = False, background: bool = False) -> str:
    """Return the ANSI escape sequence for ``color``.

    The color's base code is raised by 60 when ``bright`` is set and by 10 when
    ``background`` is set; both offsets may apply together.
    """
    bright_offset = 60 if bright else 0
    background_offset = 10 if background else 0
    return to_ansi(color.value + bright_offset + background_offset)
|
114 |
+
|
115 |
+
|
116 |
+
def replace_color_tag(color: Color, text: str) -> str:
    """Replace every tag variant of ``color`` in ``text`` with ANSI escape codes.

    Four tag names are handled per color: ``NAME``, ``BRIGHT_NAME``,
    ``BG_NAME`` and ``BG_BRIGHT_NAME``. An opening ``[TAG]`` becomes the
    matching color code and a closing ``[/TAG]`` becomes the reset code.

    Args:
        color: The color whose tags are replaced.
        text: The text to process.

    Returns:
        The text with the tags of ``color`` replaced.
    """
    for bright, bg in itertools.product([False, True], repeat=2):
        tag = f"{'BG_' if bg else ''}{'BRIGHT_' if bright else ''}{color.name}"
        text = text.replace(f"[{tag}]", get_ansi_color(color, bright=bright, background=bg))
        text = text.replace(f"[/{tag}]", to_ansi(ANSI_RESET))

    return text
|
124 |
+
|
125 |
+
|
126 |
+
@lru_cache(maxsize=256)
def colorize(text: str, strip: bool = True) -> str:
    """Format text with ANSI color codes using tags [COLOR], [BG_COLOR] and [STYLE].

    Reset color/style with [/TAG].
    Escape with double brackets: ``[[TAG]]`` is emitted as the literal ``[TAG]``.
    Strip leading and trailing whitespace if strip=True.

    Args:
        text: The text containing tags.
        strip: Whether to strip surrounding whitespace before formatting.

    Returns:
        The formatted text, always terminated by the ANSI reset code.

    Warns:
        UserWarning: Once for every unknown tag; the tag is removed from the output.
    """
    if strip:
        # Strip before any escape code is added. Stripping afterwards can never
        # remove trailing whitespace because the reset code ends the string.
        text = text.strip()

    # Every recognised tag name mapped to its opening escape sequence.
    codes = {style.name: to_ansi(style.value) for style in Style}
    for color in Color:
        for bright, bg in itertools.product([False, True], repeat=2):
            tag = f"{'BG_' if bg else ''}{'BRIGHT_' if bright else ''}{color.name}"
            codes[tag] = get_ansi_color(color, bright=bright, background=bg)

    invalid_tags: list[str] = []

    def _substitute(match: re.Match[str]) -> str:
        """Return the replacement for one escaped sequence or one tag."""
        escaped = match.group("escaped")
        if escaped is not None:
            return f"[{escaped}]"

        tag = match.group("tag")
        closing = tag.startswith("/")
        name = tag[1:] if closing else tag
        if name in codes:
            return to_ansi(ANSI_RESET) if closing else codes[name]

        # An unknown tag that touches an extra bracket ("[[x]" or "[x]]") is
        # left untouched, as before; only a cleanly delimited one is invalid.
        source = match.string
        start, end = match.span()
        if (start > 0 and source[start - 1] == "[") or (end < len(source) and source[end] == "]"):
            return match.group(0)

        invalid_tags.append(tag)
        return ""

    # One left-to-right pass with the escaped form tried first, so a tag inside
    # double brackets is never expanded.
    pattern = re.compile(r"\[\[(?P<escaped>[^\[\]\v]+)\]\]|\[(?P<tag>[^\[\]]*)\]")
    text = pattern.sub(_substitute, text)

    for tag in invalid_tags:
        warnings.warn(f"Invalid color tag: {tag!r}", UserWarning, stacklevel=2)

    # reset color/style at the end
    return text + to_ansi(ANSI_RESET)
|
156 |
+
|
157 |
+
|
158 |
+
# https://github.com/AUTOMATIC1111/stable-diffusion-webui/modules/call_queue.py
|
159 |
+
def wrap_queued_call(func: callable) -> callable:
    """Return a wrapper that runs ``func`` while holding the shared queue lock.

    The wrapper keeps the name, docstring and other metadata of ``func``.

    Args:
        func: The callable to serialize.

    Returns:
        A callable with the same arguments and return value as ``func``.
    """
    # Imported here because only lru_cache is imported at module level.
    from functools import wraps  # noqa: PLC0415

    @wraps(func)
    def f(*args, **kwargs) -> Any:  # noqa: ANN003, ANN002
        with get_queue_lock():
            return func(*args, **kwargs)

    return f
|
deprecated/__init__.py
ADDED
File without changes
|
deprecated/main.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
import click
|
6 |
+
import joblib
|
7 |
+
|
8 |
+
from app.utils import colorize
|
9 |
+
|
10 |
+
|
11 |
+
@click.group()
def cli() -> None:
    # Command group only: sub-commands attach themselves via @cli.command.
    # Kept without a docstring because click would show it as help text.
    pass
|
13 |
+
|
14 |
+
|
15 |
+
# Predict the sentiment of the words given on the command line with a model
# loaded from --model. Documented with comments instead of a docstring so the
# generated --help text stays unchanged.
@cli.command("predict")
@click.option(
    "-m",
    "--model",
    "model_path",
    default="models/model.pkl",
    help="Path to the model file.",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True, path_type=Path),
)
@click.argument("text", nargs=-1)
def predict(model_path: Path, text: tuple[str, ...]) -> None:
    # With nargs=-1 click passes the words as a tuple; join them into one input.
    input_text = " ".join(text).strip()
    if not input_text:
        # Run the tags through colorize() like every other message; echoing the
        # raw string printed "[RED]Error[/RED]" literally.
        click.echo(colorize("[RED]Error[/RED]: Input text is empty."))
        return

    # Load the model
    click.echo("Loading model... ", nl=False)
    model = joblib.load(model_path)
    click.echo(colorize("[GREEN]DONE"))

    # Run the model
    click.echo("Performing sentiment analysis... ", nl=False)
    prediction = model.predict([input_text])
    sentiment = "[GREEN]POSITIVE" if prediction[0] == 1 else "[RED]NEGATIVE"
    click.echo(colorize(sentiment))
|
41 |
+
|
42 |
+
|
43 |
+
if __name__ == "__main__":
|
44 |
+
cli()
|
deprecated/train.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import warnings
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import TYPE_CHECKING
|
6 |
+
|
7 |
+
import click
|
8 |
+
import joblib
|
9 |
+
import pandas as pd
|
10 |
+
from numpy.random import RandomState
|
11 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
12 |
+
from sklearn.linear_model import LogisticRegression
|
13 |
+
from sklearn.metrics import accuracy_score, classification_report
|
14 |
+
from sklearn.model_selection import train_test_split
|
15 |
+
from sklearn.pipeline import Pipeline
|
16 |
+
|
17 |
+
if TYPE_CHECKING:
|
18 |
+
from sklearn.base import BaseEstimator
|
19 |
+
|
20 |
+
SEED = 42
MAX_FEATURES = 10000  # 500000

# Input data
DATASET_PATH = Path("data/training.1600000.processed.noemoticon.csv")
STOPWORDS_PATH = Path("data/stopwords-en.txt")

# Output locations
MODELS_DIR = Path("models")
CACHE_DIR = Path("cache")
CHECKPOINT_PATH = CACHE_DIR / "pipeline.pkl"

# Make sure paths exist
for _required_dir in (MODELS_DIR, CACHE_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

# Memory cache for sklearn pipelines
mem = joblib.Memory(CACHE_DIR, verbose=0)
|
34 |
+
|
35 |
+
# TODO: use xgboost
|
36 |
+
|
37 |
+
|
38 |
+
def get_random_state(seed: int = SEED) -> RandomState:
    """Return a new ``RandomState`` seeded with ``seed``."""
    generator = RandomState(seed)
    return generator
|
40 |
+
|
41 |
+
|
42 |
+
def load_data() -> tuple[list[str], list[int]]:
    """Load the dataset at ``DATASET_PATH`` as parallel lists of texts and labels.

    Rows with a neutral target (2) are dropped. A label is 1 when the target is
    4 (positive sentiment) and 0 otherwise.

    Returns:
        ``(texts, labels)`` with plain ``str`` texts and ``int`` labels.
    """
    data = pd.read_csv(
        DATASET_PATH,
        encoding="ISO-8859-1",
        names=[
            "target",  # 0 = negative, 2 = neutral, 4 = positive
            "id",  # The id of the tweet
            "date",  # The date of the tweet
            "flag",  # The query, NO_QUERY if not present
            "user",  # The user that tweeted
            "text",  # The text of the tweet
        ],
    )

    # Ignore rows with neutral sentiment
    data = data[data["target"] != 2]

    # Built as a standalone Series instead of a new column on the filtered
    # frame, which triggered pandas' SettingWithCopyWarning. Cast to int so the
    # labels match the declared ``list[int]`` rather than being booleans.
    sentiment = (data["target"] == 4).astype(int)

    # Return as lists
    return data["text"].tolist(), sentiment.tolist()
|
68 |
+
|
69 |
+
|
70 |
+
def create_pipeline(clf: BaseEstimator) -> Pipeline:
    """Return an unfitted pipeline: TF-IDF vectorizer followed by ``clf``.

    The pipeline uses the module-level joblib memory ``mem``.
    """
    # Preprocess
    # ("vectorize", CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_FEATURES)),
    # ("tfidf", TfidfTransformer()),
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

    stages = [
        ("vectorize", vectorizer),
        # Classifier
        ("clf", clf),
    ]
    return Pipeline(stages, memory=mem)
|
82 |
+
|
83 |
+
|
84 |
+
def evaluate_pipeline(pipeline: Pipeline, x: list[str], y: list[int]) -> float:
    """Echo a classification report for ``pipeline`` on ``x``/``y`` and return its accuracy."""
    predictions = pipeline.predict(x)
    click.echo(classification_report(y, predictions))

    # TODO: Confusion matrix

    return accuracy_score(y, predictions)
|
92 |
+
|
93 |
+
|
94 |
+
def export_pipeline(pipeline: Pipeline, name: str) -> None:
    """Dump ``pipeline`` to ``MODELS_DIR/<name>.pkl`` and echo the destination."""
    destination = MODELS_DIR / f"{name}.pkl"
    joblib.dump(pipeline, destination)
    click.echo(f"Model exported to {destination!r}")
|
98 |
+
|
99 |
+
|
100 |
+
# Train (or load) the logistic-regression pipeline, optionally evaluate it, run
# a two-sentence smoke test and export it. Documented with comments instead of
# a docstring so the generated --help text stays unchanged.
@click.command()
@click.option("--retrain", is_flag=True, help="Train the model even if a checkpoint exists.")
@click.option("--evaluate", is_flag=True, help="Evaluate the model.")
@click.option("--flush-cache", is_flag=True, help="Clear sklearn cache.")
@click.option("--seed", type=int, default=SEED, help="Random seed.")
def train(retrain: bool, evaluate: bool, flush_cache: bool, seed: int) -> None:
    random_state = get_random_state(seed)

    # Clear sklearn cache
    if flush_cache:
        click.echo("Clearing cache... ", nl=False)
        mem.clear(warn=False)
        click.echo("DONE")

    # Load and split data
    click.echo("Loading data... ", nl=False)
    texts, labels = load_data()
    x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=random_state)
    click.echo("DONE")

    # Reuse the checkpoint unless retraining was requested or none exists yet.
    checkpoint_usable = CHECKPOINT_PATH.exists() and not retrain
    if checkpoint_usable:
        click.echo("Loading model... ", nl=False)
        model = joblib.load(CHECKPOINT_PATH)
        click.echo("DONE")
    else:
        click.echo("Training model... ", nl=False)
        model = create_pipeline(LogisticRegression(max_iter=1000, random_state=random_state))
        with warnings.catch_warnings():
            # Ignore joblib warnings
            warnings.simplefilter("ignore")
            model.fit(x_train, y_train)
        joblib.dump(model, CHECKPOINT_PATH)
        click.echo("DONE")

    # Evaluate model
    if evaluate:
        evaluate_pipeline(model, x_test, y_test)

    # Quick test
    click.echo("Quick test:")
    for sample in ("I love this movie", "I hate this movie"):
        click.echo(f"\t{'positive' if model.predict([sample])[0] else 'negative'}: {sample}")

    # Export model
    click.echo("Exporting model... ", nl=False)
    export_pipeline(model, "logistic_regression")
    click.echo("DONE")
|
149 |
+
|
150 |
+
|
151 |
+
if __name__ == "__main__":
|
152 |
+
train()
|
pyproject.toml
CHANGED
@@ -108,6 +108,7 @@ ignore = [
|
|
108 |
"PERF203", # ignore for now; investigate
|
109 |
"T201", # print
|
110 |
"ANN204", # missing-return-type-special-method
|
|
|
111 |
]
|
112 |
select = ["ALL"]
|
113 |
# Allow unused variables when underscore-prefixed
|
|
|
108 |
"PERF203", # ignore for now; investigate
|
109 |
"T201", # print
|
110 |
"ANN204", # missing-return-type-special-method
|
111 |
+
"ERA001", # commented-out-code
|
112 |
]
|
113 |
select = ["ALL"]
|
114 |
# Allow unused variables when underscore-prefixed
|
style.css
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
.justify-between {
|
2 |
+
justify-content: space-between;
|
3 |
+
}
|