pasha committed on
Commit
15bb64e
1 Parent(s): c7a5318

Switched to v0.2

Files changed (3)
  1. tokenizer.json +0 -0
  2. tokenizer.py +60 -10
  3. tokenizer_config.json +3 -2
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
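Since the rendered diff for tokenizer.json is unavailable, the regenerated file can be inspected locally instead. A minimal sketch, assuming a checkout of this repo and the usual tokenizers JSON layout (the "added_tokens" list is an assumption about the file's structure, not confirmed by this commit):

import json

# Sketch: inspect the regenerated tokenizer.json from a local checkout.
with open("tokenizer.json", encoding="utf-8") as f:
    tok = json.load(f)

# Special tokens registered as AddedToken entries, in id order
# (should reflect the reordered AUXILIARY list in tokenizer.py below).
for entry in tok.get("added_tokens", []):
    print(entry["id"], repr(entry["content"]))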
 
tokenizer.py CHANGED
@@ -3,14 +3,23 @@ import json
 import re
 from typing import List
 
-from tokenizers import pre_tokenizers, decoders, NormalizedString, PreTokenizedString
+from tokenizers import pre_tokenizers, decoders, NormalizedString, PreTokenizedString, AddedToken
 from transformers import PreTrainedTokenizerFast
 
 from rumorpheme import RuMorphemeModel, labels_to_morphemes
 
-DEFAULT_MODEL_NAME = "evilfreelancer/ruMorpheme-v0.1"
-PAD, BEGIN, END, UNKNOWN, SPACE, SYSTEM, USER, ASSISTANT, FUNCTION_CALL, FUNCTION_RESPONSE = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
-AUXILIARY = ["<pad>", "<s>", "</s>", "<unk>", " ", "system", "user", "assistant", "function_call", "function_response"]
+DEFAULT_MODEL_NAME = "evilfreelancer/ruMorpheme-v0.2"
+
+END, BEGIN, PAD, UNKNOWN, CAP, ALL_CAPS = 0, 1, 2, 3, 4, 5
+SYSTEM, USER, ASSISTANT, FUNCTION_CALL, FUNCTION_RESPONSE = 6, 7, 8, 9, 10
+SPACE = 11
+
+AUXILIARY = [
+    "</s>", "<s>", "<pad>", "<unk>", "<cap>", "<all_caps>",
+    "system", "user", "assistant", "function_call", "function_response",
+    " ",
+]
+
 NUMBERS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
 
 
@@ -46,7 +55,7 @@ class RuMorphemePreTokenizer:
         """
         word = str(normalized_string)
 
-        # If word is just spaces, return as is
+        # If word is just spaces or digits, return as is
         if word.isspace() or word.isdigit():
             return [normalized_string]
 
@@ -54,15 +63,36 @@ class RuMorphemePreTokenizer:
         if not any(c.isalpha() for c in word):
             return [normalized_string]
 
+        # Detect capitalization
+        cap_token = None
+        if word[0].isupper():
+            cap_token = NormalizedString(AUXILIARY[CAP])
+        if len(word) > 1 and word.isupper():
+            cap_token = NormalizedString(AUXILIARY[ALL_CAPS])
+
+        # Convert word to lowercase for morpheme splitting
+        word_lower = word.lower()
+
         # Make predictions and return morphemes
-        all_predictions, all_log_probs = self.model.predict([word])
-        morphs, morph_types, _ = labels_to_morphemes(word.lower(), all_predictions[0], all_log_probs[0])
-        return [NormalizedString(f"{morph_type}/{morph}") for morph, morph_type in zip(morphs, morph_types)]
+        all_predictions, all_log_probs = self.model.predict([word_lower])
+        morphs, morph_types, _ = labels_to_morphemes(word_lower, all_predictions[0], all_log_probs[0])
+
+        # Create list of morpheme tokens
+        morpheme_tokens = [
+            NormalizedString(f"{morph_type}/{morph}")
+            for morph, morph_type in zip(morphs, morph_types)
+        ]
+
+        # Insert capitalization token if needed
+        if cap_token:
+            return [cap_token] + morpheme_tokens
+        else:
+            return morpheme_tokens
 
 
 class RuMorphemeDecoder:
     """
-    Custom decoder for RuMorpheme model, it removes morph_type prefix from tokens and keep spaces.
+    Custom decoder for RuMorpheme model, it removes morph_type prefix from tokens and keeps spaces.
     """
 
     def decode_chain(self, tokens: List[str]) -> List[str]:
@@ -70,7 +100,18 @@ class RuMorphemeDecoder:
         tokenizer.decode function calls this function
         """
         decoded_tokens = []
+        capitalize_next = False
+        uppercase_next = False
+
         for token in tokens:
+            # Handle capitalization tokens
+            if token == AUXILIARY[CAP]:
+                capitalize_next = True
+                continue
+            elif token == AUXILIARY[ALL_CAPS]:
+                uppercase_next = True
+                continue
+
             # If token is a space, keep it as is
             if token.isspace():
                 decoded_tokens.append(token)
@@ -80,6 +121,15 @@ class RuMorphemeDecoder:
                 _, morph = token.split('/', 1)
             else:
                 morph = token
+
+            # Apply capitalization if needed
+            if uppercase_next:
+                morph = morph.upper()
+                uppercase_next = False
+            elif capitalize_next:
+                morph = morph.capitalize()
+                capitalize_next = False
+
             decoded_tokens.append(morph)
         return decoded_tokens
 
@@ -131,7 +181,7 @@ class RuMorphemeTokenizerFast(PreTrainedTokenizerFast):
         # Correctly specify the tokenizer_class with module name
         tokenizer_config['tokenizer_class'] = "RuMorphemeTokenizerFast"
         tokenizer_config['use_fast'] = True
-        tokenizer_config['auto_map'] = {"AutoTokenizer": ["", "my_tokenizer.RuMorphemeTokenizerFast"]}
+        tokenizer_config['auto_map'] = {"AutoTokenizer": ["", "tokenizer.RuMorphemeTokenizerFast"]}
 
         with open(tokenizer_config_file, 'w', encoding='utf-8') as f:
             json.dump(tokenizer_config, f, ensure_ascii=False)
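Taken together, the v0.2 pre-tokenizer lowercases each word before morpheme prediction and emits a <cap> or <all_caps> marker, which the decoder consumes to restore the original casing on decode. A minimal usage sketch (the repo id follows DEFAULT_MODEL_NAME above; the printed values are illustrative, not captured output):

from transformers import AutoTokenizer

# Sketch: load the v0.2 tokenizer together with its custom code.
tokenizer = AutoTokenizer.from_pretrained(
    "evilfreelancer/ruMorpheme-v0.2",
    trust_remote_code=True,
)

ids = tokenizer.encode("Привет МИР")
# Expect a "<cap>" marker before the morphemes of "привет"
# and an "<all_caps>" marker before those of "мир".
print(tokenizer.convert_ids_to_tokens(ids))

# Decoding should restore the original casing via the markers.
print(tokenizer.decode(ids))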
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "<pad>",
+      "content": "</s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -17,7 +17,7 @@
       "special": true
     },
     "2": {
-      "content": "</s>",
+      "content": "<pad>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -37,6 +37,7 @@
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "model_max_length": 1000000000000000019884624838656,
+  "model_name": "./model",
   "pad_token": "<pad>",
   "tokenizer_class": "RuMorphemeTokenizerFast",
   "unk_token": "<unk>",