pasha committed
Commit b4d132c • 1 Parent(s): 236bb7f
Version updated

Files changed:
- tokenizer.json +0 -0
- tokenizer.py +40 -20
- tokenizer_config.json +5 -1
tokenizer.json CHANGED

The diff for this file is too large to render. See raw diff.
tokenizer.py CHANGED

@@ -1,6 +1,7 @@
 import os
 import json
 import re
+import string
 from typing import List
 
 from tokenizers import pre_tokenizers, decoders, NormalizedString, PreTokenizedString, AddedToken
@@ -12,21 +13,23 @@ DEFAULT_MODEL_NAME = "evilfreelancer/ruMorpheme-v0.2"
 
 END, BEGIN, PAD, UNKNOWN, CAP, ALL_CAPS = 0, 1, 2, 3, 4, 5
 SYSTEM, USER, ASSISTANT, FUNCTION_CALL, FUNCTION_RESPONSE = 6, 7, 8, 9, 10
-SPACE = 11
+SPACE, NEWLINE, TAB = 11, 12, 13
 
 AUXILIARY = [
     "</s>", "<s>", "<pad>", "<unk>", "<cap>", "<all_caps>",
     "system", "user", "assistant", "function_call", "function_response",
-    " ",
+    " ", "\n", "\t"
 ]
 
 NUMBERS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
+LETTERS_CYRILLIC = list(map(chr, range(ord('а'), ord('я') + 1)))
+LETTERS_LATIN = list(string.ascii_lowercase)
 
 
 class RuMorphemePreTokenizer:
     """
     Pre-tokenizer for RuMorpheme model.
-    Splits on spaces and …
+    Splits on spaces, newlines, and tabs, including these as tokens.
     Then, applies morpheme splitting to non-space tokens.
     """
 
@@ -35,35 +38,47 @@ class RuMorphemePreTokenizer:
         self.model.eval()
 
     def pre_tokenize(self, pretok: PreTokenizedString):
-        # First, split on spaces and …
+        # First, split on spaces (including newlines and tabs) and add them as tokens
         pretok.split(self.split_on_spaces)
-        …
-        …
+
+        # Apply morpheme or character-level splitting to non-space tokens
+        pretok.split(self.morpheme_or_char_split)
 
     def split_on_spaces(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
         """
-        Splits on spaces and …
-        TODO: Need to make performance tests on this function.
+        Splits on spaces, newlines, and tabs, including these as tokens.
        """
         text = str(normalized_string)
-        splits = […]
+        splits = [
+            NormalizedString(match.group())
+            for match in re.finditer(r'\s+|\S+', text)
+        ]
+
+        # Convert newlines and tabs to tokens
+        for idx, split in enumerate(splits):
+            if split == "\n":
+                splits[idx] = NormalizedString(AUXILIARY[NEWLINE])
+            elif split == "\t":
+                splits[idx] = NormalizedString(AUXILIARY[TAB])
+
         return splits
 
-    def …
+    def morpheme_or_char_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
         """
-        …
+        Attempts to split the token into morphemes. If the token starts with "UNKNOWN/",
+        splits it into individual characters.
         """
         word = str(normalized_string)
 
-        # If …
+        # If the token is whitespace or digits, return as is
         if word.isspace() or word.isdigit():
             return [normalized_string]
 
-        # Ignore …
+        # Ignore tokens that are only punctuation or non-alphabetical
         if not any(c.isalpha() for c in word):
             return [normalized_string]
 
-        # Detect capitalization
+        # Detect capitalization and add relevant token if necessary
         cap_token = None
         if word[0].isupper():
             cap_token = NormalizedString(AUXILIARY[CAP])
@@ -73,15 +88,20 @@ class RuMorphemePreTokenizer:
         # Convert word to lowercase for morpheme splitting
         word_lower = word.lower()
 
-        # Make predictions
+        # Make predictions to get morphemes
         all_predictions, all_log_probs = self.model.predict([word_lower])
         morphs, morph_types, _ = labels_to_morphemes(word_lower, all_predictions[0], all_log_probs[0])
 
-        # …
-        morpheme_tokens = [
-            NormalizedString(f"{morph_type}/{morph}")
-            for morph, morph_type in zip(morphs, morph_types)
-        ]
+        # Handle unknown tokens by splitting into characters
+        morpheme_tokens = []
+        for morph, morph_type in zip(morphs, morph_types):
+            if morph_type == "UNKNOWN":
+                # Split unknown morpheme into characters
+                char_tokens = [NormalizedString(char) for char in morph]
+                morpheme_tokens.extend(char_tokens)
+            else:
+                # Add as a single morpheme token
+                morpheme_tokens.append(NormalizedString(f"{morph_type}/{morph}"))
 
         # Insert capitalization token if needed
         if cap_token:
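For intuition, here is a minimal self-contained sketch of the two new splitting steps, using plain strings in place of NormalizedString and with the model-backed morpheme prediction stubbed out; the sample text and the PREF/UNKNOWN labels are illustrative inputs, not output of the real model.

import re

def split_on_spaces(text: str) -> list:
    # Alternating runs of whitespace (\s+) and non-whitespace (\S+) become
    # separate pieces, so spaces, newlines, and tabs survive as tokens.
    return [m.group() for m in re.finditer(r'\s+|\S+', text)]

def morpheme_or_char_split(morphs, morph_types) -> list:
    # Known morphemes become "TYPE/morph" tokens; an UNKNOWN morpheme
    # falls back to one token per character.
    tokens = []
    for morph, morph_type in zip(morphs, morph_types):
        if morph_type == "UNKNOWN":
            tokens.extend(morph)
        else:
            tokens.append(f"{morph_type}/{morph}")
    return tokens

print(split_on_spaces("Привет мир\t2024\n"))
# ['Привет', ' ', 'мир', '\t', '2024', '\n']

print(morpheme_or_char_split(["при", "вет"], ["PREF", "UNKNOWN"]))
# ['PREF/при', 'в', 'е', 'т']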
tokenizer_config.json CHANGED

@@ -37,11 +37,15 @@
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "model_max_length": 1000000000000000019884624838656,
+  "model_name": "./model",
   "pad_token": "<pad>",
   "tokenizer_class": "RuMorphemeTokenizerFast",
   "unk_token": "<unk>",
   "use_fast": true,
   "auto_map": {
-    "AutoTokenizer": […]
+    "AutoTokenizer": [
+      "",
+      "tokenizer.RuMorphemeTokenizerFast"
+    ]
   }
 }
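With "tokenizer_class" and "auto_map" now pointing at tokenizer.RuMorphemeTokenizerFast, the tokenizer should load through the standard AutoTokenizer path once remote code is allowed; a usage sketch, assuming the repo id from DEFAULT_MODEL_NAME above:

from transformers import AutoTokenizer

# trust_remote_code=True lets transformers import tokenizer.RuMorphemeTokenizerFast
# from this repo, as registered in "auto_map" above.
tokenizer = AutoTokenizer.from_pretrained(
    "evilfreelancer/ruMorpheme-v0.2",
    trust_remote_code=True,
)

print(tokenizer.tokenize("Привет, мир!\n"))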