pasha committed
Commit b4d132c
1 Parent(s): 236bb7f

Version updated

Files changed (3)
  1. tokenizer.json +0 -0
  2. tokenizer.py +40 -20
  3. tokenizer_config.json +5 -1
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import json
 import re
+import string
 from typing import List
 
 from tokenizers import pre_tokenizers, decoders, NormalizedString, PreTokenizedString, AddedToken
@@ -12,21 +13,23 @@ DEFAULT_MODEL_NAME = "evilfreelancer/ruMorpheme-v0.2"
 
 END, BEGIN, PAD, UNKNOWN, CAP, ALL_CAPS = 0, 1, 2, 3, 4, 5
 SYSTEM, USER, ASSISTANT, FUNCTION_CALL, FUNCTION_RESPONSE = 6, 7, 8, 9, 10
-SPACE = 11
+SPACE, NEWLINE, TAB = 11, 12, 13
 
 AUXILIARY = [
     "</s>", "<s>", "<pad>", "<unk>", "<cap>", "<all_caps>",
     "system", "user", "assistant", "function_call", "function_response",
-    " ",
+    " ", "\n", "\t"
 ]
 
 NUMBERS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
+LETTERS_CYRILLIC = list(map(chr, range(ord('а'), ord('я') + 1)))
+LETTERS_LATIN = list(string.ascii_lowercase)
 
 
 class RuMorphemePreTokenizer:
     """
     Pre-tokenizer for RuMorpheme model.
-    Splits on spaces and includes spaces as tokens.
+    Splits on spaces, newlines, and tabs, including these as tokens.
     Then, applies morpheme splitting to non-space tokens.
     """
@@ -35,35 +38,47 @@ class RuMorphemePreTokenizer:
         self.model.eval()
 
     def pre_tokenize(self, pretok: PreTokenizedString):
-        # First, split on spaces and include spaces as tokens
+        # First, split on spaces (including newlines and tabs) and add them as tokens
         pretok.split(self.split_on_spaces)
-        # Then, apply morpheme splitting to non-space tokens
-        pretok.split(self.morpheme_split)
+
+        # Apply morpheme or character-level splitting to non-space tokens
+        pretok.split(self.morpheme_or_char_split)
 
     def split_on_spaces(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
         """
-        Splits on spaces and includes spaces as tokens.
-        TODO: Need to make performance tests on this function.
+        Splits on spaces, newlines, and tabs, including these as tokens.
         """
         text = str(normalized_string)
-        splits = [NormalizedString(match.group()) for match in re.finditer(r'\s+|\S+', text)]
+        splits = [
+            NormalizedString(match.group())
+            for match in re.finditer(r'\s+|\S+', text)
+        ]
+
+        # Convert newlines and tabs to tokens
+        for idx, split in enumerate(splits):
+            if str(split) == "\n":
+                splits[idx] = NormalizedString(AUXILIARY[NEWLINE])
+            elif str(split) == "\t":
+                splits[idx] = NormalizedString(AUXILIARY[TAB])
+
         return splits
 
-    def morpheme_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
+    def morpheme_or_char_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
         """
-        Split word on morphemes, including numbers and punctuation.
+        Attempts to split the token into morphemes; any morpheme predicted
+        with the "UNKNOWN" type is split into individual characters.
         """
         word = str(normalized_string)
 
-        # If word is just spaces or digits, return as is
+        # If the token is whitespace or digits, return as is
         if word.isspace() or word.isdigit():
             return [normalized_string]
 
-        # Ignore special characters (non-alphabetical)
+        # Ignore tokens that are only punctuation or non-alphabetical
        if not any(c.isalpha() for c in word):
             return [normalized_string]
 
-        # Detect capitalization
+        # Detect capitalization and add relevant token if necessary
         cap_token = None
         if word[0].isupper():
             cap_token = NormalizedString(AUXILIARY[CAP])
@@ -73,15 +88,20 @@ class RuMorphemePreTokenizer:
         # Convert word to lowercase for morpheme splitting
         word_lower = word.lower()
 
-        # Make predictions and return morphemes
+        # Make predictions to get morphemes
         all_predictions, all_log_probs = self.model.predict([word_lower])
         morphs, morph_types, _ = labels_to_morphemes(word_lower, all_predictions[0], all_log_probs[0])
 
-        # Create list of morpheme tokens
-        morpheme_tokens = [
-            NormalizedString(f"{morph_type}/{morph}")
-            for morph, morph_type in zip(morphs, morph_types)
-        ]
+        # Handle unknown tokens by splitting into characters
+        morpheme_tokens = []
+        for morph, morph_type in zip(morphs, morph_types):
+            if morph_type == "UNKNOWN":
+                # Split unknown morpheme into characters
+                char_tokens = [NormalizedString(char) for char in morph]
+                morpheme_tokens.extend(char_tokens)
+            else:
+                # Add as a single morpheme token
+                morpheme_tokens.append(NormalizedString(f"{morph_type}/{morph}"))
 
         # Insert capitalization token if needed
         if cap_token:
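
For reference, the two behavioral changes in this file can be exercised without loading the morpheme model. The following is an illustrative sketch, not code from the repository: it mirrors the committed whitespace regex with the NEWLINE/TAB remapping, and the new UNKNOWN fallback as a standalone function; the morph-type labels in the demo call ("PREF", "UNKNOWN") are assumed example values.

import re
from tokenizers import NormalizedString

# Auxiliary vocabulary as committed; " ", "\n", "\t" sit at indices 11-13.
AUXILIARY = [
    "</s>", "<s>", "<pad>", "<unk>", "<cap>", "<all_caps>",
    "system", "user", "assistant", "function_call", "function_response",
    " ", "\n", "\t",
]
SPACE, NEWLINE, TAB = 11, 12, 13

def split_on_spaces(text: str) -> list:
    # Alternating runs of whitespace and non-whitespace, each kept as a token.
    splits = [NormalizedString(m.group()) for m in re.finditer(r'\s+|\S+', text)]
    # Remap bare newlines and tabs onto their auxiliary tokens, comparing via
    # str(...) rather than relying on NormalizedString equality semantics.
    for idx, split in enumerate(splits):
        if str(split) == "\n":
            splits[idx] = NormalizedString(AUXILIARY[NEWLINE])
        elif str(split) == "\t":
            splits[idx] = NormalizedString(AUXILIARY[TAB])
    return splits

def unknown_fallback(morphs, morph_types) -> list:
    # Mirrors the new loop: UNKNOWN morphemes degrade to character tokens,
    # everything else stays a single "TYPE/morph" token.
    tokens = []
    for morph, morph_type in zip(morphs, morph_types):
        if morph_type == "UNKNOWN":
            tokens.extend(NormalizedString(c) for c in morph)
        else:
            tokens.append(NormalizedString(f"{morph_type}/{morph}"))
    return tokens

print([str(s) for s in split_on_spaces("Привет\tмир\n")])
# ['Привет', '\t', 'мир', '\n']
print([str(s) for s in unknown_fallback(["при", "вет"], ["PREF", "UNKNOWN"])])
# ['PREF/при', 'в', 'е', 'т']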
tokenizer_config.json CHANGED
@@ -37,11 +37,15 @@
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "model_max_length": 1000000000000000019884624838656,
+  "model_name": "./model",
   "pad_token": "<pad>",
   "tokenizer_class": "RuMorphemeTokenizerFast",
   "unk_token": "<unk>",
   "use_fast": true,
   "auto_map": {
-    "AutoTokenizer": ["","tokenizer.RuMorphemeTokenizerFast"]
+    "AutoTokenizer": [
+      "",
+      "tokenizer.RuMorphemeTokenizerFast"
+    ]
   }
 }
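
With auto_map now a properly formatted two-element list, the custom class should be resolvable at load time. A minimal usage sketch, assuming the repository id matches the DEFAULT_MODEL_NAME constant from tokenizer.py and that transformers is installed:

from transformers import AutoTokenizer

# trust_remote_code is required: auto_map points AutoTokenizer at the
# custom fast tokenizer class shipped in tokenizer.py.
tokenizer = AutoTokenizer.from_pretrained(
    "evilfreelancer/ruMorpheme-v0.2",  # assumed repo id, per DEFAULT_MODEL_NAME
    trust_remote_code=True,
)
print(tokenizer.tokenize("Привет, мир!"))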