pasha committed · Commit c7a5318 · 1 Parent(s): d8fe7bb
Digits wrapper added
- tokenizer.json +0 -0
- tokenizer.py +5 -3
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff.
tokenizer.py CHANGED

@@ -47,11 +47,11 @@ class RuMorphemePreTokenizer:
         word = str(normalized_string)

         # If word is just spaces, return as is
-        if word.isspace():
+        if word.isspace() or word.isdigit():
            return [normalized_string]

-        # Ignore special characters (non-alphabetical
-        if not any(c.isalpha()
+        # Ignore special characters (non-alphabetical)
+        if not any(c.isalpha() for c in word):
            return [normalized_string]

        # Make predictions and return morphemes
@@ -100,6 +100,7 @@ class RuMorphemeTokenizerFast(PreTrainedTokenizerFast):
        # Custom pre-tokenizer
        self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.Punctuation(),
+           pre_tokenizers.Digits(individual_digits=True),
            pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(self.model_name))
        ])
        # Custom decoder
@@ -150,6 +151,7 @@ class RuMorphemeTokenizerFast(PreTrainedTokenizerFast):
        # Custom pre-tokenizer
        tokenizer.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.Punctuation(),
+           pre_tokenizers.Digits(individual_digits=True),
            pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(model_name))
        ])

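For context, a minimal sketch of what the newly added Digits step does, using the Hugging Face tokenizers library. The sample string and the reduced Sequence (without the custom morpheme step, which requires the model) are illustrative assumptions, not part of the commit:

# Illustrative sketch, not part of the commit: effect of the added
# Digits pre-tokenizer, shown without the custom morpheme step.
from tokenizers import pre_tokenizers

pre_tok = pre_tokenizers.Sequence([
    pre_tokenizers.Punctuation(),
    pre_tokenizers.Digits(individual_digits=True),
])

# Punctuation splits off "-", then Digits splits the number into
# single digits, so numeric runs never reach the morpheme model.
print(pre_tok.pre_tokenize_str("дом-2024"))
# expected: [('дом', (0, 3)), ('-', (3, 4)), ('2', (4, 5)),
#            ('0', (5, 6)), ('2', (6, 7)), ('4', (7, 8))]

With individual_digits=True each digit becomes its own pre-token, and the new word.isdigit() guard in RuMorphemePreTokenizer returns such digit-only strings as-is instead of passing them to the morpheme prediction step.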