pasha committed on
Commit
c7a5318
·
1 Parent(s): d8fe7bb

Digits wrapper added

Browse files
Files changed (2) hide show
  1. tokenizer.json +0 -0
  2. tokenizer.py +5 -3
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer.py CHANGED
@@ -47,11 +47,11 @@ class RuMorphemePreTokenizer:
47
  word = str(normalized_string)
48
 
49
  # If word is just spaces, return as is
50
- if word.isspace():
51
  return [normalized_string]
52
 
53
- # Ignore special characters (non-alphabetical and non-numeric)
54
- if not any(c.isalpha() or c.isdigit() for c in word):
55
  return [normalized_string]
56
 
57
  # Make predictions and return morphemes
@@ -100,6 +100,7 @@ class RuMorphemeTokenizerFast(PreTrainedTokenizerFast):
100
  # Custom pre-tokenizer
101
  self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
102
  pre_tokenizers.Punctuation(),
 
103
  pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(self.model_name))
104
  ])
105
  # Custom decoder
@@ -150,6 +151,7 @@ class RuMorphemeTokenizerFast(PreTrainedTokenizerFast):
150
  # Custom pre-tokenizer
151
  tokenizer.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
152
  pre_tokenizers.Punctuation(),
 
153
  pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(model_name))
154
  ])
155
 
 
47
  word = str(normalized_string)
48
 
49
  # If word is just spaces, return as is
50
+ if word.isspace() or word.isdigit():
51
  return [normalized_string]
52
 
53
+ # Ignore special characters (non-alphabetical)
54
+ if not any(c.isalpha() for c in word):
55
  return [normalized_string]
56
 
57
  # Make predictions and return morphemes
 
100
  # Custom pre-tokenizer
101
  self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
102
  pre_tokenizers.Punctuation(),
103
+ pre_tokenizers.Digits(individual_digits=True),
104
  pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(self.model_name))
105
  ])
106
  # Custom decoder
 
151
  # Custom pre-tokenizer
152
  tokenizer.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
153
  pre_tokenizers.Punctuation(),
154
+ pre_tokenizers.Digits(individual_digits=True),
155
  pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(model_name))
156
  ])
157