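"""Train, export, and serve a text-classification pipeline.

The tokenizer (CountVectorizer + TfidfTransformer) and the classifier
(LogisticRegression) are trained separately, exported with joblib, and
lazily re-loaded from disk at prediction time.
"""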
from __future__ import annotations

import warnings
from functools import lru_cache
from typing import TYPE_CHECKING, Sequence

import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from constants import CLF_MAX_ITER, MAX_TOKENIZER_FEATURES
from utils import get_cache_memory, get_random_state

if TYPE_CHECKING:
    from pathlib import Path

    from numpy import ndarray
    from numpy.random import RandomState

__all__ = ["predict", "tokenize"]

@lru_cache(maxsize=1)
def get_model(model_path: Path) -> Pipeline:
    # Cache the last-loaded model so repeated predictions skip disk I/O.
    return joblib.load(model_path)


@lru_cache(maxsize=1)
def get_tokenizer(tokenizer_path: Path) -> Pipeline:
    # Cache the last-loaded tokenizer so repeated calls skip disk I/O.
    return joblib.load(tokenizer_path)


def export_to_file(pipeline: Pipeline, path: Path) -> None:
    joblib.dump(pipeline, path)


def tokenize(text: str, tokenizer_path: Path) -> ndarray:
    tokenizer = get_tokenizer(tokenizer_path)
    # transform() returns a sparse matrix; densify so the result matches the
    # ndarray annotation and can be wrapped in a list by predict().
    return tokenizer.transform([text]).toarray()[0]


def predict(tokens: ndarray, model_path: Path) -> bool:
    model = get_model(model_path)
    prediction = model.predict([tokens])
    return bool(prediction[0] == 1)

def train_and_export(
    steps: Sequence[tuple],
    x: list[str],
    y: list[int],
    export_path: Path,
    cache: joblib.Memory,
) -> Pipeline:
    pipeline = Pipeline(steps, memory=cache)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Ignore joblib warnings
        pipeline.fit(x, y)

    export_to_file(pipeline, export_path)
    return pipeline


def train_tokenizer_and_export(x: list[str], y: list[int], export_path: Path, cache: joblib.Memory) -> Pipeline:
    return train_and_export(
        [
            (
                "vectorize",
                CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES),
            ),
            ("tfidf", TfidfTransformer()),
        ],
        x,
        y,
        export_path,
        cache,
    )

def train_model_and_export(
    x: ndarray,
    y: list[int],
    export_path: Path,
    cache: joblib.Memory,
    rs: RandomState,
) -> Pipeline:
    return train_and_export(
        [("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs))],
        x,
        y,
        export_path,
        cache,
    )


def train(x: list[str], y: list[int]) -> Pipeline:
    cache = get_cache_memory()
    rs = get_random_state()

    # Fit the tokenizer first, then train the classifier on the transformed text.
    tokenizer = train_tokenizer(x, y, cache)
    x_tr = tokenizer.transform(x)
    model = train_model(x_tr, y, cache, rs)

    # Chain both fitted pipelines so callers can go straight from raw text to a label.
    return Pipeline([("tokenizer", tokenizer), ("model", model)])

def train_tokenizer(x: list[str], y: list[int], cache: joblib.Memory) -> Pipeline:
    # TODO: In the future, allow for different tokenizers
    pipeline = Pipeline(
        [
            (
                "vectorize",
                CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES),
            ),
            ("tfidf", TfidfTransformer()),
        ],
        memory=cache,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Ignore joblib warnings
        pipeline.fit(x, y)

    return pipeline


def train_model(x: ndarray, y: list[int], cache: joblib.Memory, rs: RandomState) -> Pipeline:
    # TODO: In the future, allow for different classifiers
    pipeline = Pipeline(
        [
            ("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs)),
        ],
        memory=cache,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Ignore joblib warnings
        pipeline.fit(x, y)

    return pipeline
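

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the corpus,
    # labels, and artifact paths below are made-up placeholders.
    from pathlib import Path

    corpus = ["free money now", "meeting notes attached", "win a prize", "quarterly report"]
    labels = [1, 0, 1, 0]  # 1 = positive class, 0 = negative class

    cache = get_cache_memory()
    tokenizer = train_tokenizer_and_export(corpus, labels, Path("tokenizer.joblib"), cache)
    train_model_and_export(
        tokenizer.transform(corpus), labels, Path("model.joblib"), cache, get_random_state()
    )

    # Reload the exported artifacts from disk and classify a new document.
    tokens = tokenize("claim your free prize", Path("tokenizer.joblib"))
    print(predict(tokens, Path("model.joblib")))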