Spaces:
Running
Running
from __future__ import annotations | |
import warnings | |
from functools import lru_cache | |
from typing import TYPE_CHECKING, Sequence | |
import joblib | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.pipeline import Pipeline | |
from constants import CLF_MAX_ITER, MAX_TOKENIZER_FEATURES | |
from utils import get_cache_memory, get_random_state | |
if TYPE_CHECKING: | |
from pathlib import Path | |
from numpy import ndarray | |
from numpy.random import RandomState | |
__all__ = ["predict", "tokenize"] | |
def get_model(model_path: Path) -> Pipeline: | |
return joblib.load(model_path) | |
def get_tokenizer(tokenizer_path: Path) -> Pipeline: | |
return joblib.load(tokenizer_path) | |
def export_to_file(pipeline: Pipeline, path: Path) -> None: | |
joblib.dump(pipeline, path) | |
def tokenize(text: str, tokenizer_path: Path) -> ndarray: | |
tokenizer = get_tokenizer(tokenizer_path) | |
return tokenizer.transform([text])[0] | |
def predict(tokens: ndarray, model_path: Path) -> bool: | |
model = get_model(model_path) | |
prediction = model.predict([tokens]) | |
return prediction[0] == 1 | |
def train_and_export( | |
steps: Sequence[tuple], | |
x: list[str], | |
y: list[int], | |
export_path: Path, | |
cache: joblib.Memory, | |
) -> Pipeline: | |
pipeline = Pipeline(steps, memory=cache) | |
with warnings.catch_warnings(): | |
warnings.simplefilter("ignore") | |
pipeline.fit(x, y) | |
export_to_file(pipeline, export_path) | |
return pipeline | |
def train_tokenizer_and_export(x: list[str], y: list[int], export_path: Path, cache: joblib.Memory) -> Pipeline: | |
return train_and_export( | |
[ | |
( | |
"vectorize", | |
CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES), | |
), | |
("tfidf", TfidfTransformer()), | |
], | |
x, | |
y, | |
export_path, | |
cache, | |
) | |
def train_model_and_export( | |
x: ndarray, | |
y: list[int], | |
export_path: Path, | |
cache: joblib.Memory, | |
rs: RandomState, | |
) -> Pipeline: | |
return train_and_export( | |
[("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs))], | |
x, | |
y, | |
export_path, | |
cache, | |
) | |
def train(x: list[str], y: list[int]) -> Pipeline: | |
cache = get_cache_memory() | |
rs = get_random_state() | |
tokenizer = train_tokenizer(x, y, cache) | |
x_tr = tokenizer.transform(x) | |
model = train_model(x_tr, y, cache, rs) | |
return Pipeline([("tokenizer", tokenizer), ("model", model)]) | |
def train_tokenizer(x: list[str], y: list[int], cache: joblib.Memory) -> Pipeline: | |
# TODO: In the future, allow for different tokenizers | |
pipeline = Pipeline( | |
[ | |
( | |
"vectorize", | |
CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES), | |
), | |
("tfidf", TfidfTransformer()), | |
], | |
memory=cache, | |
) | |
with warnings.catch_warnings(): | |
warnings.simplefilter("ignore") # Ignore joblib warnings | |
pipeline.fit(x, y) | |
return pipeline | |
def train_model(x: list[str], y: list[int], cache: joblib.Memory, rs: RandomState) -> Pipeline: | |
# TODO: In the future, allow for different classifiers | |
pipeline = Pipeline( | |
[ | |
("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs)), | |
], | |
memory=cache, | |
) | |
with warnings.catch_warnings(): | |
warnings.simplefilter("ignore") # Ignore joblib warnings | |
pipeline.fit(x, y) | |
return pipeline | |