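"""Binary text-classification helpers built on scikit-learn.

Provides cached joblib loading (`get_model`, `get_tokenizer`), single-text
inference (`tokenize`, `predict`), and training helpers that fit a
CountVectorizer + TfidfTransformer tokenizer and a LogisticRegression
classifier, optionally exporting the fitted pipelines via joblib.
"""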
from __future__ import annotations

import warnings
from functools import lru_cache
from typing import TYPE_CHECKING, Sequence

import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from constants import CLF_MAX_ITER, MAX_TOKENIZER_FEATURES
from utils import get_cache_memory, get_random_state

if TYPE_CHECKING:
    from pathlib import Path

    from numpy import ndarray
    from numpy.random import RandomState


__all__ = ["predict", "tokenize"]


@lru_cache(maxsize=1)
def get_model(model_path: Path) -> Pipeline:
    """Load the fitted classifier pipeline, caching the most recently used path."""
    return joblib.load(model_path)


@lru_cache(maxsize=1)
def get_tokenizer(tokenizer_path: Path) -> Pipeline:
    """Load the fitted tokenizer pipeline, caching the most recently used path."""
    return joblib.load(tokenizer_path)


def export_to_file(pipeline: Pipeline, path: Path) -> None:
    joblib.dump(pipeline, path)


def tokenize(text: str, tokenizer_path: Path) -> ndarray:
    """Vectorize a single text into a dense 1-D feature array."""
    tokenizer = get_tokenizer(tokenizer_path)
    # Pipeline.transform returns a sparse (1, n_features) matrix; densify it and
    # take the single row so the result matches the declared ndarray return type.
    return tokenizer.transform([text]).toarray()[0]


def predict(tokens: ndarray, model_path: Path) -> bool:
    """Classify one tokenized text; True means the positive class (label 1)."""
    model = get_model(model_path)
    prediction = model.predict([tokens])
    return bool(prediction[0] == 1)
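

# Example inference flow (a sketch; the artifact paths below are hypothetical):
#
#     features = tokenize("some input text", Path("artifacts/tokenizer.joblib"))
#     is_positive = predict(features, Path("artifacts/model.joblib"))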


def train_and_export(
    steps: Sequence[tuple],
    x: list[str],
    y: list[int],
    export_path: Path,
    cache: joblib.Memory,
) -> Pipeline:
    """Fit a pipeline built from `steps` on (x, y) and persist it to `export_path`."""
    pipeline = Pipeline(steps, memory=cache)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Ignore joblib warnings
        pipeline.fit(x, y)

    export_to_file(pipeline, export_path)
    return pipeline


def train_tokenizer_and_export(x: list[str], y: list[int], export_path: Path, cache: joblib.Memory) -> Pipeline:
    return train_and_export(
        [
            (
                "vectorize",
                CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES),
            ),
            ("tfidf", TfidfTransformer()),
        ],
        x,
        y,
        export_path,
        cache,
    )


def train_model_and_export(
    x: ndarray,
    y: list[int],
    export_path: Path,
    cache: joblib.Memory,
    rs: RandomState,
) -> Pipeline:
    return train_and_export(
        [("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs))],
        x,
        y,
        export_path,
        cache,
    )


def train(x: list[str], y: list[int]) -> Pipeline:
    """Train the tokenizer and classifier, then chain them into one pipeline."""
    cache = get_cache_memory()
    rs = get_random_state()

    tokenizer = train_tokenizer(x, y, cache)
    x_tr = tokenizer.transform(x)

    model = train_model(x_tr, y, cache, rs)

    return Pipeline([("tokenizer", tokenizer), ("model", model)])
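

# Example end-to-end training (toy data and export path for illustration only):
#
#     pipeline = train(["first sample text", "second sample text"], [1, 0])
#     export_to_file(pipeline, Path("artifacts/pipeline.joblib"))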


def train_tokenizer(x: list[str], y: list[int], cache: joblib.Memory) -> Pipeline:
    # TODO: In the future, allow for different tokenizers
    pipeline = Pipeline(
        [
            (
                "vectorize",
                CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES),
            ),
            ("tfidf", TfidfTransformer()),
        ],
        memory=cache,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Ignore joblib warnings
        pipeline.fit(x, y)

    return pipeline


def train_model(x: ndarray, y: list[int], cache: joblib.Memory, rs: RandomState) -> Pipeline:
    # TODO: In the future, allow for different classifiers
    pipeline = Pipeline(
        [
            ("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs)),
        ],
        memory=cache,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Ignore joblib warnings
        pipeline.fit(x, y)

    return pipeline
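

if __name__ == "__main__":
    # Minimal smoke test on toy data (illustrative only; label 1 = positive class).
    sample_x = [
        "limited time offer click now",
        "meeting moved to three pm",
        "win a prize today",
        "please review the attached report",
    ]
    sample_y = [1, 0, 1, 0]
    pipeline = train(sample_x, sample_y)
    print(pipeline.predict(["prize offer today"]))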