kkmkorea committed
Commit d0631e5 · verified · 1 Parent(s): e8206a1

Upload 7 files

korscideberta/__pycache__/tokenization_korscideberta_v2.cpython-312.pyc ADDED
Binary file (29.6 kB).
 
korscideberta/normalize.py ADDED
@@ -0,0 +1,199 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import re
5
+ import regex
6
+
7
+ from itertools import chain
8
+
9
+
10
+ class MosesPunctNormalizer:
11
+ """
12
+ This is a Python port of the Moses punctuation normalizer from
13
+ https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
14
+ """
15
+
16
+ EXTRA_WHITESPACE = [ # lines 21 - 30
17
+ (r"\r", r""),
18
+ (r"\(", r" ("),
19
+ (r"\)", r") "),
20
+ (r" +", r" "),
21
+ (r"\) ([.!:?;,])", r")\g<1>"),
22
+ (r"\( ", r"("),
23
+ (r" \)", r")"),
24
+ (r"(\d) %", r"\g<1>%"),
25
+ (r" :", r":"),
26
+ (r" ;", r";"),
27
+ ]
28
+
29
+ NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')] # lines 33 - 34
30
+
31
+ NORMALIZE_UNICODE = [ # lines 37 - 50
32
+ ("„", r'"'),
33
+ ("“", r'"'),
34
+ ("”", r'"'),
35
+ ("–", r"-"),
36
+ ("—", r" - "),
37
+ (r" +", r" "),
38
+ ("´", r"'"),
39
+ ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
40
+ ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
41
+ ("‘", r"'"),
42
+ ("‚", r"'"),
43
+ ("’", r"'"),
44
+ (r"''", r'"'),
45
+ ("´´", r'"'),
46
+ ("…", r"..."),
47
+ ]
48
+
49
+ FRENCH_QUOTES = [ # lines 52 - 57
50
+ ("\u00A0«\u00A0", r'"'),
51
+ ("«\u00A0", r'"'),
52
+ ("«", r'"'),
53
+ ("\u00A0»\u00A0", r'"'),
54
+ ("\u00A0»", r'"'),
55
+ ("»", r'"'),
56
+ ]
57
+
58
+ HANDLE_PSEUDO_SPACES = [ # lines 59 - 67
59
+ ("\u00A0%", r"%"),
60
+ ("nº\u00A0", "nº "),
61
+ ("\u00A0:", r":"),
62
+ ("\u00A0ºC", " ºC"),
63
+ ("\u00A0cm", r" cm"),
64
+ ("\u00A0\\?", "?"),
65
+ ("\u00A0\\!", "!"),
66
+ ("\u00A0;", r";"),
67
+ (",\u00A0", r", "),
68
+ (r" +", r" "),
69
+ ]
70
+
71
+ EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]
72
+
73
+ DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
74
+ (r',"', r'",'),
75
+ (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'), # don't fix period at end of sentence
76
+ ]
77
+
78
+ DE_ES_CZ_CS_FR = [
79
+ ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
80
+ ]
81
+
82
+ OTHER = [
83
+ ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
84
+ ]
85
+
86
+ # Regex substitutions from replace-unicode-punctuation.perl
87
+ # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
88
+ REPLACE_UNICODE_PUNCTUATION = [
89
+ ("，", ","),
90
+ (r"。\s*", ". "),
91
+ ("、", ","),
92
+ ("”", '"'),
93
+ ("“", '"'),
94
+ ("∶", ":"),
95
+ ("：", ":"),
96
+ ("？", "?"),
97
+ ("《", '"'),
98
+ ("》", '"'),
99
+ ("）", ")"),
100
+ ("！", "!"),
101
+ ("（", "("),
102
+ ("；", ";"),
103
+ ("」", '"'),
104
+ ("「", '"'),
105
+ ("０", "0"),
106
+ ("１", "1"),
107
+ ("２", "2"),
108
+ ("３", "3"),
109
+ ("４", "4"),
110
+ ("５", "5"),
111
+ ("６", "6"),
112
+ ("７", "7"),
113
+ ("８", "8"),
114
+ ("９", "9"),
115
+ (r"．\s*", ". "),
116
+ ("～", "~"),
117
+ ("’", "'"),
118
+ ("…", "..."),
119
+ ("━", "-"),
120
+ ("〈", "<"),
121
+ ("〉", ">"),
122
+ ("【", "["),
123
+ ("】", "]"),
124
+ ("％", "%"),
125
+ ]
126
+
127
+ def __init__(
128
+ self,
129
+ lang="en",
130
+ penn=True,
131
+ norm_quote_commas=True,
132
+ norm_numbers=True,
133
+ pre_replace_unicode_punct=False,
134
+ post_remove_control_chars=False,
135
+ ):
136
+ """
137
+ :param lang: The two-letter language code.
138
+ :type lang: str
139
+ :param penn: Normalize Penn Treebank style quotations.
140
+ :type penn: bool
141
+ :param norm_quote_commas: Normalize quotations and commas
142
+ :type norm_quote_commas: bool
143
+ :param norm_numbers: Normalize numbers
144
+ :type norm_numbers: bool
145
+ """
146
+ self.substitutions = [
147
+ self.EXTRA_WHITESPACE,
148
+ self.NORMALIZE_UNICODE,
149
+ self.FRENCH_QUOTES,
150
+ self.HANDLE_PSEUDO_SPACES,
151
+ ]
152
+
153
+ if penn: # Adds the penn substitutions after extra_whitespace regexes.
154
+ self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)
155
+
156
+ if norm_quote_commas:
157
+ if lang == "en":
158
+ self.substitutions.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
159
+ elif lang in ["de", "es", "fr"]:
160
+ self.substitutions.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)
161
+
162
+ if norm_numbers:
163
+ if lang in ["de", "es", "cz", "cs", "fr"]:
164
+ self.substitutions.append(self.DE_ES_CZ_CS_FR)
165
+ else:
166
+ self.substitutions.append(self.OTHER)
167
+
168
+ self.substitutions = list(chain(*self.substitutions))
169
+
170
+ self.pre_replace_unicode_punct = pre_replace_unicode_punct
171
+ self.post_remove_control_chars = post_remove_control_chars
172
+
173
+ def normalize(self, text):
174
+ """
175
+ Returns a string with normalized punctuation.
176
+ """
177
+ # Optionally, replace unicode puncts BEFORE normalization.
178
+ if self.pre_replace_unicode_punct:
179
+ text = self.replace_unicode_punct(text)
180
+
181
+ # Actual normalization.
182
+ for regexp, substitution in self.substitutions:
183
+ # print(regexp, substitution)
184
+ text = re.sub(regexp, substitution, str(text))
185
+ # print(text)
186
+
187
+ # Optionally, remove control characters AFTER normalization.
188
+ if self.post_remove_control_chars:
189
+ text = self.remove_control_chars(text)
190
+
191
+ return text.strip()
192
+
193
+ def replace_unicode_punct(self, text):
194
+ for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
195
+ text = re.sub(regexp, substitution, str(text))
196
+ return text
197
+
198
+ def remove_control_chars(self, text):
199
+ return regex.sub(r"\p{C}", "", text)
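For context, a minimal usage sketch of the MosesPunctNormalizer defined above; the sketch, its sample strings, and the expected outputs are illustrative and not part of the uploaded files:

from korscideberta.normalize import MosesPunctNormalizer

normalizer = MosesPunctNormalizer(lang="en")
# Straightens curly quotes, rewrites guillemets, and collapses repeated spaces.
print(normalizer.normalize("«Hello»   –  world…"))    # '"Hello" - world...'
# Applies only the unicode-punctuation table, as replace_unicode_punct() above does.
print(normalizer.replace_unicode_punct("１２３％？"))  # '123%?'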
korscideberta/pyproject.toml ADDED
@@ -0,0 +1,20 @@
1
+ # file: pyproject.toml
2
+
3
+ [build-system]
4
+ requires = [
5
+ "setuptools >= 65",
6
+ "wheel >= 0.38",
7
+ ]
8
+ build-backend = "setuptools.build_meta"
9
+
10
+ [project]
11
+ name = "korscideberta"
12
+ version = "0.1.0"
13
+ readme = "README.md"
14
+ requires-python = ">=3.8"
15
+ dependencies = [
16
+ "sentencepiece",
17
+ "transformers",
18
+ "mecab",
19
+ "konlpy",
20
+ ]
korscideberta/tokenization_korscideberta.py ADDED
@@ -0,0 +1,357 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team and Jangwon Park
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Tokenization classes for KoBERT model """
16
+
17
+
18
+ import logging
19
+ import os
20
+ import re
21
+ import unicodedata
22
+ from shutil import copyfile
23
+
24
+ from transformers import PreTrainedTokenizer
25
+
26
+ # 2023. 7. 28. Added morpheme segmentation (Mecab) and Unicode normalization
27
+ from konlpy.tag import Mecab
28
+ from korscideberta.unicode import join_jamos
29
+ from korscideberta.normalize import MosesPunctNormalizer
30
+ nor = MosesPunctNormalizer()
31
+
32
+ # Unicode Hangul syllables: start 44032, end 55199
33
+ BASE_CODE, CHOSUNG, JUNGSUNG = 44032, 588, 28
34
+ # choseong (initial consonant) list, indices 0 ~ 18
35
+ CHOSUNG_LIST = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
36
+ # jungseong (medial vowel) list, indices 0 ~ 20
37
+ JUNGSUNG_LIST = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']
38
+ # jongseong (final consonant) list, indices 0 ~ 27 (index 0 means no final consonant)
39
+ JONGSUNG_LIST = [' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
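# Added note (not in the original file): worked example of the decomposition
# arithmetic used by splitjamo() below, for the syllable '한' (U+D55C = 54620):
#   char_code = 54620 - BASE_CODE = 10588
#   choseong  index = 10588 // 588        = 18 -> 'ㅎ'
#   jungseong index = (10588 % 588) // 28 = 0  -> 'ㅏ'
#   jongseong index = 10588 % 28          = 4  -> 'ㄴ'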
40
+ def splitjamo(string):
41
+ sp_list = list(string)
42
+ result = []
43
+ for keyword in sp_list:
44
+ # check whether the character is Hangul before splitting
45
+ if re.match('.*[ㄱ-ㅎㅏ-ㅣ가-힣]+.*', keyword) is not None:
46
+ # choseong (initial consonant)
47
+ char_code = ord(keyword) - BASE_CODE
48
+ char1 = int(char_code / CHOSUNG)
49
+ try:
50
+ result.append(CHOSUNG_LIST[char1])
51
+ except:
52
+ return string
53
+ #print("Err: "+str(char1))
54
+ # jungseong (medial vowel)
55
+ char2 = int((char_code - (CHOSUNG * char1)) / JUNGSUNG)
56
+ result.append(JUNGSUNG_LIST[char2])
57
+ # jongseong (final consonant)
58
+ char3 = int((char_code - (CHOSUNG * char1) - (JUNGSUNG * char2)))
59
+ result.append(JONGSUNG_LIST[char3])
60
+ else:
61
+ result.append(keyword)
62
+ return result
63
+ def has_coda(word):
64
+ return (ord(word[-1]) -44032)%28==0
65
+ def _replace_unicode(line):
66
+ if(line==None):
67
+ return ""
68
+ line = line.replace("—",'-').replace("―","-").replace("–","-").replace("＂",'"').replace("＇","'").replace("‹","<").replace("›",">").replace("‚","'").replace("‛","'").replace("„",'"').replace("‟",'"').replace("«",'<').replace("»",'>').replace("˝",'"').replace("（",'(').replace("）",')').replace("『",'"').replace("』",'"').replace("“",'"').replace("”",'"').replace("‘","'").replace("’","'").replace("《","<").replace("》",">").replace("〈","<").replace("〉",">").replace("「","'").replace("」","'").replace("【","[").replace("】","]").replace("〔","[").replace("〕","]").replace("［","[").replace("］","]").replace("｛","{").replace("｝","}")
69
+ line=nor.replace_unicode_punct(line)
70
+ return line
71
+ def _mecab(line):
72
+ mecab = Mecab()
73
+ # Reference (Mecab POS tags): VV verb, VA adjective, VX auxiliary predicate, VCP positive copula, VCN negative copula, JKS subject particle, JKC complement particle, ... XSN noun-derivational suffix, XSV verb-derivational suffix, XSA adjective-derivational suffix, EP pre-final ending, EF sentence-final ending, EC connective ending, ETN nominalizing ending, ETM adnominalizing ending
74
+
75
+ pdoc = []
76
+ morphs = []
77
+
78
+ poss = mecab.pos(line)
79
+ for pos in poss:
80
+ morphs.append(pos[0])
81
+ '''
82
+ pdoc.append(" ".join(morphs))
83
+ return pdoc
84
+ '''
85
+ return " ".join(morphs)
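# Added note (not in the original file): konlpy's Mecab.pos() returns a list of
# (surface, POS-tag) pairs; _mecab() above keeps only the surface forms and
# re-joins them with single spaces, so the SentencePiece model downstream sees
# morpheme-segmented text rather than raw space-delimited word units.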
86
+
87
+ logger = logging.getLogger(__name__)
88
+
89
+ VOCAB_FILES_NAMES = {
90
+ "vocab_file": "spm.model",
91
+ "vocab_txt": "vocab.txt",
92
+ }
93
+
94
+ PRETRAINED_VOCAB_FILES_MAP = {
95
+ "vocab_file": {
96
+ "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/tokenizer_78b3253a26.model",
97
+ "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/tokenizer_78b3253a26.model",
98
+ "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/tokenizer_78b3253a26.model",
99
+ },
100
+ "vocab_txt": {
101
+ "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/vocab.txt",
102
+ "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/vocab.txt",
103
+ "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/vocab.txt",
104
+ },
105
+ }
106
+
107
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
108
+ "monologg/kobert": 512,
109
+ "monologg/kobert-lm": 512,
110
+ "monologg/distilkobert": 512,
111
+ }
112
+
113
+ PRETRAINED_INIT_CONFIGURATION = {
114
+ "monologg/kobert": {"do_lower_case": False},
115
+ "monologg/kobert-lm": {"do_lower_case": False},
116
+ "monologg/distilkobert": {"do_lower_case": False},
117
+ }
118
+
119
+ SPIECE_UNDERLINE = "▁"
120
+
121
+
122
+ class DebertaV2Tokenizer(PreTrainedTokenizer):
123
+ """
124
+ SentencePiece based tokenizer. Peculiarities:
125
+ - requires `SentencePiece <https://github.com/google/sentencepiece>`_
126
+ """
127
+
128
+ vocab_files_names = VOCAB_FILES_NAMES
129
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
130
+ pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
131
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
132
+
133
+ def __init__(
134
+ self,
135
+ vocab_file,
136
+ vocab_txt,
137
+ do_lower_case=False,
138
+ remove_space=True,
139
+ keep_accents=False,
140
+ unk_token="<unk>",
141
+ sep_token="<s>",
142
+ pad_token="<pad>",
143
+ cls_token="<cls>",
144
+ mask_token="<mask>",
145
+ **kwargs,
146
+ ):
147
+ super().__init__(
148
+ unk_token="<unk>",
149
+ sep_token=sep_token,
150
+ pad_token=pad_token,
151
+ cls_token=cls_token,
152
+ mask_token=mask_token,
153
+ **kwargs,
154
+ )
155
+
156
+ # Build vocab
157
+ self.token2idx = dict()
158
+ self.idx2token = []
159
+ with open(vocab_txt, "r", encoding="utf-8") as f:
160
+ for idx, token in enumerate(f):
161
+ token = token.strip()
162
+ self.token2idx[token] = idx
163
+ self.idx2token.append(token)
164
+
165
+ try:
166
+ import sentencepiece as spm
167
+ except ImportError:
168
+ logger.warning(
169
+ "You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
170
+ " pip install sentencepiece"
171
+ )
172
+
173
+ self.do_lower_case = do_lower_case
174
+ self.remove_space = remove_space
175
+ self.keep_accents = keep_accents
176
+ self.vocab_file = vocab_file
177
+ self.vocab_txt = vocab_txt
178
+
179
+ self.sp_model = spm.SentencePieceProcessor()
180
+ self.sp_model.Load(vocab_file)
181
+
182
+ @property
183
+ def vocab_size(self):
184
+ return len(self.idx2token)
185
+
186
+ def get_vocab(self):
187
+ return dict(self.token2idx, **self.added_tokens_encoder)
188
+
189
+ def __getstate__(self):
190
+ state = self.__dict__.copy()
191
+ state["sp_model"] = None
192
+ return state
193
+
194
+ def __setstate__(self, d):
195
+ self.__dict__ = d
196
+ try:
197
+ import sentencepiece as spm
198
+ except ImportError:
199
+ logger.warning(
200
+ "You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
201
+ " pip install sentencepiece"
202
+ )
203
+ self.sp_model = spm.SentencePieceProcessor()
204
+ self.sp_model.Load(self.vocab_file)
205
+
206
+ def preprocess_text(self, inputs):
207
+ if self.remove_space:
208
+ outputs = " ".join(inputs.strip().split())
209
+ else:
210
+ outputs = inputs
211
+ outputs = outputs.replace("``", '"').replace("''", '"')
212
+
213
+ if not self.keep_accents:
214
+ outputs = unicodedata.normalize("NFKD", outputs)
215
+ outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
216
+ if self.do_lower_case:
217
+ outputs = outputs.lower()
218
+
219
+ return outputs
220
+
221
+ def _tokenize(self, text):
222
+ """Tokenize a string."""
223
+ text = self.preprocess_text(text)
224
+ #print('text: '+text)
225
+ #logger.info("text ({}) ".format(text))
226
+ text = _replace_unicode(text) # Unicode normalization
227
+ text = _mecab(text) # morpheme segmentation
228
+ #print('text: '+str(text))
229
+ #logger.info("text ({}) ".format(text))
230
+ pieces = self.sp_model.encode(text, out_type=str)
231
+ new_pieces = []
232
+ for piece in pieces:
233
+ if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
234
+ cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
235
+ if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
236
+ if len(cur_pieces[0]) == 1:
237
+ cur_pieces = cur_pieces[1:]
238
+ else:
239
+ cur_pieces[0] = cur_pieces[0][1:]
240
+ cur_pieces.append(piece[-1])
241
+ new_pieces.extend(cur_pieces)
242
+ else:
243
+ new_pieces.append(piece)
244
+ '''
245
+ return_pieces = []
246
+ for n in new_pieces:
247
+ if(isinstance(n,list)):
248
+ for nn in n:
249
+ return_pieces.append(nn)
250
+ else:
251
+ return_pieces.append(n)
252
+ return return_pieces
253
+ '''
254
+ return new_pieces
255
+
256
+
257
+ def _convert_token_to_id(self, token):
258
+ """ Converts a token (str/unicode) in an id using the vocab. """
259
+ return self.token2idx.get(token, self.token2idx[self.unk_token])
260
+
261
+ def _convert_id_to_token(self, index):
262
+ """Converts an index (integer) in a token (string/unicode) using the vocab."""
263
+ return self.idx2token[index]
264
+
265
+ def convert_tokens_to_string(self, tokens):
266
+ """Converts a sequence of tokens (strings for sub-words) in a single string."""
267
+ out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
268
+ return out_string
269
+
270
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
271
+ """
272
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks
273
+ by concatenating and adding special tokens.
274
+ A KoBERT sequence has the following format:
275
+ single sequence: [CLS] X [SEP]
276
+ pair of sequences: [CLS] A [SEP] B [SEP]
277
+ """
278
+ if token_ids_1 is None:
279
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
280
+ cls = [self.cls_token_id]
281
+ sep = [self.sep_token_id]
282
+ return cls + token_ids_0 + sep + token_ids_1 + sep
283
+
284
+ def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
285
+ """
286
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
287
+ special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
288
+ Args:
289
+ token_ids_0: list of ids (must not contain special tokens)
290
+ token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
291
+ for sequence pairs
292
+ already_has_special_tokens: (default False) Set to True if the token list is already formated with
293
+ special tokens for the model
294
+ Returns:
295
+ A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
296
+ """
297
+
298
+ if already_has_special_tokens:
299
+ if token_ids_1 is not None:
300
+ raise ValueError(
301
+ "You should not supply a second sequence if the provided sequence of "
302
+ "ids is already formated with special tokens for the model."
303
+ )
304
+ return list(
305
+ map(
306
+ lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
307
+ token_ids_0,
308
+ )
309
+ )
310
+
311
+ if token_ids_1 is not None:
312
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
313
+ return [1] + ([0] * len(token_ids_0)) + [1]
314
+
315
+ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
316
+ """
317
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
318
+ A KoBERT sequence pair mask has the following format:
319
+ 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
320
+ | first sequence | second sequence
321
+ if token_ids_1 is None, only returns the first portion of the mask (0's).
322
+ """
323
+ sep = [self.sep_token_id]
324
+ cls = [self.cls_token_id]
325
+ if token_ids_1 is None:
326
+ return len(cls + token_ids_0 + sep) * [0]
327
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
328
+
329
+ def save_vocabulary(self, save_directory):
330
+ """Save the sentencepiece vocabulary (copy original file) and special tokens file
331
+ to a directory.
332
+ """
333
+ if not os.path.isdir(save_directory):
334
+ logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
335
+ return
336
+
337
+ # 1. Save sentencepiece model
338
+ out_vocab_model = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
339
+
340
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
341
+ copyfile(self.vocab_file, out_vocab_model)
342
+
343
+ # 2. Save vocab.txt
344
+ index = 0
345
+ out_vocab_txt = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_txt"])
346
+ with open(out_vocab_txt, "w", encoding="utf-8") as writer:
347
+ for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
348
+ if index != token_index:
349
+ logger.warning(
350
+ "Saving vocabulary to {}: vocabulary indices are not consecutive."
351
+ " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
352
+ )
353
+ index = token_index
354
+ writer.write(token + "\n")
355
+ index += 1
356
+
357
+ return out_vocab_model, out_vocab_txt
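A hedged usage sketch for the tokenizer defined above; the local paths spm.model and vocab.txt are assumptions, and the sample sentence is illustrative:

from korscideberta.tokenization_korscideberta import DebertaV2Tokenizer

tokenizer = DebertaV2Tokenizer(vocab_file="spm.model", vocab_txt="vocab.txt")
# SentencePiece pieces produced after Unicode normalization and Mecab segmentation.
print(tokenizer.tokenize("한국어 과학기술 논문을 토큰화한다."))
# Ids wrapped as [CLS] ... [SEP] via build_inputs_with_special_tokens above.
print(tokenizer.encode("한국어 과학기술 논문을 토큰화한다."))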
korscideberta/tokenization_korscideberta_v2.py ADDED
@@ -0,0 +1,580 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 Microsoft and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Tokenization class for model DeBERTa."""
16
+
17
+ import os
18
+ import unicodedata
19
+ from typing import Any, Dict, List, Optional, Tuple
20
+
21
+ import sentencepiece as sp
22
+
23
+ from transformers import AddedToken, PreTrainedTokenizer
24
+ from transformers import logging
25
+ # 2023. 7. 28. Added morpheme segmentation (Mecab) and Unicode normalization
26
+ from konlpy.tag import Mecab
27
+ from korscideberta.unicode import join_jamos
28
+ from korscideberta.normalize import MosesPunctNormalizer
29
+ nor = MosesPunctNormalizer()
30
+
31
+ def has_coda(word):
32
+ return (ord(word[-1]) -44032)%28==0
33
+ def _replace_unicode(line):
34
+ if(line==None):
35
+ return ""
36
+ line = line.replace("—",'-').replace("―","-").replace("–","-").replace("＂",'"').replace("＇","'").replace("‹","<").replace("›",">").replace("‚","'").replace("‛","'").replace("„",'"').replace("‟",'"').replace("«",'<').replace("»",'>').replace("˝",'"').replace("（",'(').replace("）",')').replace("『",'"').replace("』",'"').replace("“",'"').replace("”",'"').replace("‘","'").replace("’","'").replace("《","<").replace("》",">").replace("〈","<").replace("〉",">").replace("「","'").replace("」","'").replace("【","[").replace("】","]").replace("〔","[").replace("〕","]").replace("［","[").replace("］","]").replace("｛","{").replace("｝","}")
37
+ line=nor.replace_unicode_punct(line)
38
+ return line
39
+ def _mecab(line):
40
+ mecab = Mecab()
41
+ # Reference (Mecab POS tags): VV verb, VA adjective, VX auxiliary predicate, VCP positive copula, VCN negative copula, JKS subject particle, JKC complement particle, ... XSN noun-derivational suffix, XSV verb-derivational suffix, XSA adjective-derivational suffix, EP pre-final ending, EF sentence-final ending, EC connective ending, ETN nominalizing ending, ETM adnominalizing ending
42
+
43
+ pdoc = []
44
+ morphs = []
45
+
46
+ poss = mecab.pos(line)
47
+ for pos in poss:
48
+ morphs.append(pos[0])
49
+ '''
50
+ pdoc.append(" ".join(morphs))
51
+ return pdoc
52
+ '''
53
+ return " ".join(morphs)
54
+
55
+ logger = logging.get_logger(__name__)
56
+
57
+ PRETRAINED_VOCAB_FILES_MAP = {
58
+ "vocab_file": {
59
+ "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
60
+ "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model",
61
+ "microsoft/deberta-v2-xlarge-mnli": (
62
+ "https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model"
63
+ ),
64
+ "microsoft/deberta-v2-xxlarge-mnli": (
65
+ "https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model"
66
+ ),
67
+ }
68
+ }
69
+
70
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
71
+ "microsoft/deberta-v2-xlarge": 512,
72
+ "microsoft/deberta-v2-xxlarge": 512,
73
+ "microsoft/deberta-v2-xlarge-mnli": 512,
74
+ "microsoft/deberta-v2-xxlarge-mnli": 512,
75
+ }
76
+
77
+ PRETRAINED_INIT_CONFIGURATION = {
78
+ "microsoft/deberta-v2-xlarge": {"do_lower_case": False},
79
+ "microsoft/deberta-v2-xxlarge": {"do_lower_case": False},
80
+ "microsoft/deberta-v2-xlarge-mnli": {"do_lower_case": False},
81
+ "microsoft/deberta-v2-xxlarge-mnli": {"do_lower_case": False},
82
+ }
83
+
84
+ VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
85
+
86
+
87
+ class DebertaV2Tokenizer(PreTrainedTokenizer):
88
+ r"""
89
+ Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
90
+
91
+ Args:
92
+ vocab_file (`str`):
93
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
94
+ contains the vocabulary necessary to instantiate a tokenizer.
95
+ do_lower_case (`bool`, *optional*, defaults to `False`):
96
+ Whether or not to lowercase the input when tokenizing.
97
+ bos_token (`string`, *optional*, defaults to `"[CLS]"`):
98
+ The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
99
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
100
+ sequence. The token used is the `cls_token`.
101
+ eos_token (`string`, *optional*, defaults to `"[SEP]"`):
102
+ The end of sequence token. When building a sequence using special tokens, this is not the token that is
103
+ used for the end of sequence. The token used is the `sep_token`.
104
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
105
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
106
+ token instead.
107
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
108
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
109
+ sequence classification or for a text and a question for question answering. It is also used as the last
110
+ token of a sequence built with special tokens.
111
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
112
+ The token used for padding, for example when batching sequences of different lengths.
113
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
114
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
115
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
116
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
117
+ The token used for masking values. This is the token used when training this model with masked language
118
+ modeling. This is the token which the model will try to predict.
119
+ sp_model_kwargs (`dict`, *optional*):
120
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
121
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
122
+ to set:
123
+
124
+ - `enable_sampling`: Enable subword regularization.
125
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
126
+
127
+ - `nbest_size = {0,1}`: No sampling is performed.
128
+ - `nbest_size > 1`: samples from the nbest_size results.
129
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
130
+ using forward-filtering-and-backward-sampling algorithm.
131
+
132
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
133
+ BPE-dropout.
134
+ """
135
+
136
+ vocab_files_names = VOCAB_FILES_NAMES
137
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
138
+ pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
139
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
140
+
141
+ def __init__(
142
+ self,
143
+ vocab_file,
144
+ do_lower_case=False,
145
+ split_by_punct=False,
146
+ bos_token="[CLS]",
147
+ eos_token="[SEP]",
148
+ unk_token="[UNK]",
149
+ sep_token="[SEP]",
150
+ pad_token="[PAD]",
151
+ cls_token="[CLS]",
152
+ mask_token="[MASK]",
153
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
154
+ **kwargs,
155
+ ) -> None:
156
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
157
+
158
+ if not os.path.isfile(vocab_file):
159
+ raise ValueError(
160
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
161
+ " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
162
+ )
163
+ self.do_lower_case = do_lower_case
164
+ self.split_by_punct = split_by_punct
165
+ self.vocab_file = vocab_file
166
+ self._tokenizer = SPMTokenizer(
167
+ vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
168
+ )
169
+ unk_token = AddedToken(unk_token, normalized=True, special=True) if isinstance(unk_token, str) else unk_token
170
+ super().__init__(
171
+ do_lower_case=do_lower_case,
172
+ bos_token=bos_token,
173
+ eos_token=eos_token,
174
+ unk_token=unk_token,
175
+ sep_token=sep_token,
176
+ pad_token=pad_token,
177
+ cls_token=cls_token,
178
+ mask_token=mask_token,
179
+ split_by_punct=split_by_punct,
180
+ sp_model_kwargs=self.sp_model_kwargs,
181
+ **kwargs,
182
+ )
183
+ self._tokenizer.special_tokens = self.all_special_tokens
184
+
185
+ @property
186
+ def vocab_size(self):
187
+ return len(self.vocab)
188
+
189
+ @property
190
+ def vocab(self):
191
+ return self._tokenizer.vocab
192
+
193
+ def get_vocab(self):
194
+ vocab = self.vocab.copy()
195
+ vocab.update(self.get_added_vocab())
196
+ return vocab
197
+
198
+ def _tokenize(self, text: str) -> List[str]:
199
+ """Take as input a string and return a list of strings (tokens) for words/sub-words"""
200
+ if self.do_lower_case:
201
+ text = text.lower()
202
+ return self._tokenizer.tokenize(text)
203
+
204
+ def _convert_token_to_id(self, token):
205
+ """Converts a token (str) in an id using the vocab."""
206
+ return self._tokenizer.spm.PieceToId(token)
207
+
208
+ def _convert_id_to_token(self, index):
209
+ """Converts an index (integer) in a token (str) using the vocab."""
210
+ return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token
211
+
212
+ def convert_tokens_to_string(self, tokens):
213
+ """Converts a sequence of tokens (string) in a single string."""
214
+ return self._tokenizer.decode(tokens)
215
+
216
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
217
+ """
218
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
219
+ adding special tokens. A DeBERTa sequence has the following format:
220
+
221
+ - single sequence: [CLS] X [SEP]
222
+ - pair of sequences: [CLS] A [SEP] B [SEP]
223
+
224
+ Args:
225
+ token_ids_0 (`List[int]`):
226
+ List of IDs to which the special tokens will be added.
227
+ token_ids_1 (`List[int]`, *optional*):
228
+ Optional second list of IDs for sequence pairs.
229
+
230
+ Returns:
231
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
232
+ """
233
+
234
+ if token_ids_1 is None:
235
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
236
+ cls = [self.cls_token_id]
237
+ sep = [self.sep_token_id]
238
+ return cls + token_ids_0 + sep + token_ids_1 + sep
239
+
240
+ def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
241
+ """
242
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
243
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
244
+
245
+ Args:
246
+ token_ids_0 (`List[int]`):
247
+ List of IDs.
248
+ token_ids_1 (`List[int]`, *optional*):
249
+ Optional second list of IDs for sequence pairs.
250
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
251
+ Whether or not the token list is already formatted with special tokens for the model.
252
+
253
+ Returns:
254
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
255
+ """
256
+
257
+ if already_has_special_tokens:
258
+ return super().get_special_tokens_mask(
259
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
260
+ )
261
+
262
+ if token_ids_1 is not None:
263
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
264
+ return [1] + ([0] * len(token_ids_0)) + [1]
265
+
266
+ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
267
+ """
268
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
269
+ sequence pair mask has the following format:
270
+
271
+ ```
272
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
273
+ | first sequence | second sequence |
274
+ ```
275
+
276
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
277
+
278
+ Args:
279
+ token_ids_0 (`List[int]`):
280
+ List of IDs.
281
+ token_ids_1 (`List[int]`, *optional*):
282
+ Optional second list of IDs for sequence pairs.
283
+
284
+ Returns:
285
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
286
+ """
287
+ sep = [self.sep_token_id]
288
+ cls = [self.cls_token_id]
289
+ if token_ids_1 is None:
290
+ return len(cls + token_ids_0 + sep) * [0]
291
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
292
+
293
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
294
+ add_prefix_space = kwargs.pop("add_prefix_space", False)
295
+ if is_split_into_words or add_prefix_space:
296
+ text = " " + text
297
+ return (text, kwargs)
298
+
299
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
300
+ return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)
301
+
302
+
303
+ class SPMTokenizer:
304
+ r"""
305
+ Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).
306
+
307
+ Args:
308
+ vocab_file (`str`):
309
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
310
+ contains the vocabulary necessary to instantiate a tokenizer.
311
+ sp_model_kwargs (`dict`, *optional*):
312
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
313
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
314
+ to set:
315
+
316
+ - `enable_sampling`: Enable subword regularization.
317
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
318
+
319
+ - `nbest_size = {0,1}`: No sampling is performed.
320
+ - `nbest_size > 1`: samples from the nbest_size results.
321
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
322
+ using forward-filtering-and-backward-sampling algorithm.
323
+
324
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
325
+ BPE-dropout.
326
+ """
327
+
328
+ def __init__(
329
+ self, vocab_file, special_tokens, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]] = None
330
+ ):
331
+ self.split_by_punct = split_by_punct
332
+ self.vocab_file = vocab_file
333
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
334
+ spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
335
+ if not os.path.exists(vocab_file):
336
+ raise FileNotFoundError(f"{vocab_file} does not exist!")
337
+ spm.load(vocab_file)
338
+ bpe_vocab_size = spm.GetPieceSize()
339
+ # Token map
340
+ # <unk> 0+1
341
+ # <s> 1+1
342
+ # </s> 2+1
343
+ self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
344
+ self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
345
+ # self.vocab['[PAD]'] = 0
346
+ # self.vocab['[CLS]'] = 1
347
+ # self.vocab['[SEP]'] = 2
348
+ # self.vocab['[UNK]'] = 3
349
+
350
+ self.spm = spm
351
+ self.special_tokens = special_tokens
352
+
353
+ def __getstate__(self):
354
+ state = self.__dict__.copy()
355
+ state["spm"] = None
356
+ return state
357
+
358
+ def __setstate__(self, d):
359
+ self.__dict__ = d
360
+
361
+ # for backward compatibility
362
+ if not hasattr(self, "sp_model_kwargs"):
363
+ self.sp_model_kwargs = {}
364
+
365
+ self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
366
+ self.spm.Load(self.vocab_file)
367
+
368
+ def tokenize(self, text):
369
+ text = _replace_unicode(text) # Unicode normalization
370
+ text = _mecab(text) # morpheme segmentation
371
+ return self._encode_as_pieces(text)
372
+
373
+ def convert_ids_to_tokens(self, ids):
374
+ tokens = []
375
+ for i in ids:
376
+ tokens.append(self.ids_to_tokens[i])
377
+ return tokens
378
+
379
+ def decode(self, tokens, start=-1, end=-1, raw_text=None):
380
+ if raw_text is None:
381
+ current_sub_tokens = []
382
+ out_string = ""
383
+ prev_is_special = False
384
+ for token in tokens:
385
+ # make sure that special tokens are not decoded using sentencepiece model
386
+ if token in self.special_tokens:
387
+ if not prev_is_special:
388
+ out_string += " "
389
+ out_string += self.spm.decode_pieces(current_sub_tokens) + token
390
+ prev_is_special = True
391
+ current_sub_tokens = []
392
+ else:
393
+ current_sub_tokens.append(token)
394
+ prev_is_special = False
395
+ out_string += self.spm.decode_pieces(current_sub_tokens)
396
+ return out_string.strip()
397
+ else:
398
+ words = self.split_to_words(raw_text)
399
+ word_tokens = [self.tokenize(w) for w in words]
400
+ token2words = [0] * len(tokens)
401
+ tid = 0
402
+ for i, w in enumerate(word_tokens):
403
+ for k, t in enumerate(w):
404
+ token2words[tid] = i
405
+ tid += 1
406
+ word_start = token2words[start]
407
+ word_end = token2words[end] if end < len(tokens) else len(words)
408
+ text = "".join(words[word_start:word_end])
409
+ return text
410
+
411
+ # TODO add a deprecation cycle as this can have different behaviour from our API
412
+ def add_special_token(self, token):
413
+ if token not in self.special_tokens:
414
+ self.special_tokens.append(token)
415
+ if token not in self.vocab:
416
+ self.vocab[token] = len(self.vocab) - 1
417
+ self.ids_to_tokens.append(token)
418
+ return self.id(token)
419
+
420
+ def part_of_whole_word(self, token, is_bos=False):
421
+ logger.warning_once(
422
+ "The `DebertaTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
423
+ )
424
+ if is_bos:
425
+ return True
426
+ if (
427
+ len(token) == 1
428
+ and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))
429
+ ) or token in self.special_tokens:
430
+ return False
431
+
432
+ word_start = b"\xe2\x96\x81".decode("utf-8")
433
+ return not token.startswith(word_start)
434
+
435
+ def pad(self):
436
+ return "[PAD]"
437
+
438
+ def bos(self):
439
+ return "[CLS]"
440
+
441
+ def eos(self):
442
+ return "[SEP]"
443
+
444
+ def unk(self):
445
+ return "[UNK]"
446
+
447
+ def mask(self):
448
+ return "[MASK]"
449
+
450
+ def sym(self, id):
451
+ return self.ids_to_tokens[id]
452
+
453
+ def id(self, sym):
454
+ logger.warning_once(
455
+ "The `DebertaTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
456
+ )
457
+ return self.vocab[sym] if sym in self.vocab else 1
458
+
459
+ def _encode_as_pieces(self, text):
460
+ text = convert_to_unicode(text)
461
+ if self.split_by_punct:
462
+ words = self._run_split_on_punc(text)
463
+ pieces = [self.spm.encode(w, out_type=str) for w in words]
464
+ return [p for w in pieces for p in w]
465
+ else:
466
+ return self.spm.encode(text, out_type=str)
467
+
468
+ def split_to_words(self, text):
469
+ pieces = self._encode_as_pieces(text)
470
+ word_start = b"\xe2\x96\x81".decode("utf-8")
471
+ words = []
472
+ offset = 0
473
+ prev_end = 0
474
+ for i, p in enumerate(pieces):
475
+ if p.startswith(word_start):
476
+ if offset > prev_end:
477
+ words.append(text[prev_end:offset])
478
+ prev_end = offset
479
+ w = p.replace(word_start, "")
480
+ else:
481
+ w = p
482
+ try:
483
+ s = text.index(w, offset)
484
+ pn = ""
485
+ k = i + 1
486
+ while k < len(pieces):
487
+ pn = pieces[k].replace(word_start, "")
488
+ if len(pn) > 0:
489
+ break
490
+ k += 1
491
+
492
+ if len(pn) > 0 and pn in text[offset:s]:
493
+ offset = offset + 1
494
+ else:
495
+ offset = s + len(w)
496
+ except Exception:
497
+ offset = offset + 1
498
+
499
+ if prev_end < offset:
500
+ words.append(text[prev_end:offset])
501
+
502
+ return words
503
+
504
+ def _run_split_on_punc(self, text):
505
+ """Splits punctuation on a piece of text."""
506
+ chars = list(text)
507
+ i = 0
508
+ start_new_word = True
509
+ output = []
510
+ while i < len(chars):
511
+ char = chars[i]
512
+ if _is_punctuation(char):
513
+ output.append([char])
514
+ start_new_word = True
515
+ else:
516
+ if start_new_word:
517
+ output.append([])
518
+ start_new_word = False
519
+ output[-1].append(char)
520
+ i += 1
521
+
522
+ return ["".join(x) for x in output]
523
+
524
+ def save_pretrained(self, path: str, filename_prefix: str = None):
525
+ filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
526
+ if filename_prefix is not None:
527
+ filename = filename_prefix + "-" + filename
528
+ full_path = os.path.join(path, filename)
529
+ with open(full_path, "wb") as fs:
530
+ fs.write(self.spm.serialized_model_proto())
531
+ return (full_path,)
532
+
533
+
534
+ def _is_whitespace(char):
535
+ """Checks whether `chars` is a whitespace character."""
536
+ # \t, \n, and \r are technically control characters but we treat them
537
+ # as whitespace since they are generally considered as such.
538
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
539
+ return True
540
+ cat = unicodedata.category(char)
541
+ if cat == "Zs":
542
+ return True
543
+ return False
544
+
545
+
546
+ def _is_control(char):
547
+ """Checks whether `chars` is a control character."""
548
+ # These are technically control characters but we count them as whitespace
549
+ # characters.
550
+ if char == "\t" or char == "\n" or char == "\r":
551
+ return False
552
+ cat = unicodedata.category(char)
553
+ if cat.startswith("C"):
554
+ return True
555
+ return False
556
+
557
+
558
+ def _is_punctuation(char):
559
+ """Checks whether `chars` is a punctuation character."""
560
+ cp = ord(char)
561
+ # We treat all non-letter/number ASCII as punctuation.
562
+ # Characters such as "^", "$", and "`" are not in the Unicode
563
+ # Punctuation class but we treat them as punctuation anyways, for
564
+ # consistency.
565
+ if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
566
+ return True
567
+ cat = unicodedata.category(char)
568
+ if cat.startswith("P"):
569
+ return True
570
+ return False
571
+
572
+
573
+ def convert_to_unicode(text):
574
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
575
+ if isinstance(text, str):
576
+ return text
577
+ elif isinstance(text, bytes):
578
+ return text.decode("utf-8", "ignore")
579
+ else:
580
+ raise ValueError(f"Unsupported string type: {type(text)}")
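Similarly, a hedged sketch of instantiating the v2 tokenizer directly; the spm.model path is an assumption, and sp_model_kwargs simply forwards the SentencePiece sampling options listed in the class docstring:

from korscideberta.tokenization_korscideberta_v2 import DebertaV2Tokenizer

tokenizer = DebertaV2Tokenizer(
    vocab_file="spm.model",
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)
encoding = tokenizer("한국어 문장을 형태소 단위로 나눈 뒤 서브워드로 분할한다.")
print(encoding["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))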
korscideberta/unicode.py ADDED
@@ -0,0 +1,279 @@
1
+ __all__ = ["split_syllable_char", "split_syllables",
2
+ "join_jamos", "join_jamos_char",
3
+ "CHAR_INITIALS", "CHAR_MEDIALS", "CHAR_FINALS"]
4
+
5
+ import itertools
6
+
7
+ INITIAL = 0x001
8
+ MEDIAL = 0x010
9
+ FINAL = 0x100
10
+ CHAR_LISTS = {
11
+ INITIAL: list(map(chr, [
12
+ 0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139,
13
+ 0x3141, 0x3142, 0x3143, 0x3145, 0x3146, 0x3147,
14
+ 0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d,
15
+ 0x314e
16
+ ])),
17
+ MEDIAL: list(map(chr, [
18
+ 0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154,
19
+ 0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a,
20
+ 0x315b, 0x315c, 0x315d, 0x315e, 0x315f, 0x3160,
21
+ 0x3161, 0x3162, 0x3163
22
+ ])),
23
+ FINAL: list(map(chr, [
24
+ 0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136,
25
+ 0x3137, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d,
26
+ 0x313e, 0x313f, 0x3140, 0x3141, 0x3142, 0x3144,
27
+ 0x3145, 0x3146, 0x3147, 0x3148, 0x314a, 0x314b,
28
+ 0x314c, 0x314d, 0x314e
29
+ ]))
30
+ }
31
+ CHAR_INITIALS = CHAR_LISTS[INITIAL]
32
+ CHAR_MEDIALS = CHAR_LISTS[MEDIAL]
33
+ CHAR_FINALS = CHAR_LISTS[FINAL]
34
+ CHAR_SETS = {k: set(v) for k, v in CHAR_LISTS.items()}
35
+ CHARSET = set(itertools.chain(*CHAR_SETS.values()))
36
+ CHAR_INDICES = {k: {c: i for i, c in enumerate(v)}
37
+ for k, v in CHAR_LISTS.items()}
38
+
39
+
40
+ def is_hangul_syllable(c):
41
+ return 0xac00 <= ord(c) <= 0xd7a3 # Hangul Syllables
42
+
43
+
44
+ def is_hangul_jamo(c):
45
+ return 0x1100 <= ord(c) <= 0x11ff # Hangul Jamo
46
+
47
+
48
+ def is_hangul_compat_jamo(c):
49
+ return 0x3130 <= ord(c) <= 0x318f # Hangul Compatibility Jamo
50
+
51
+
52
+ def is_hangul_jamo_exta(c):
53
+ return 0xa960 <= ord(c) <= 0xa97f # Hangul Jamo Extended-A
54
+
55
+
56
+ def is_hangul_jamo_extb(c):
57
+ return 0xd7b0 <= ord(c) <= 0xd7ff # Hangul Jamo Extended-B
58
+
59
+
60
+ def is_hangul(c):
61
+ return (is_hangul_syllable(c) or
62
+ is_hangul_jamo(c) or
63
+ is_hangul_compat_jamo(c) or
64
+ is_hangul_jamo_exta(c) or
65
+ is_hangul_jamo_extb(c))
66
+
67
+
68
+ def is_supported_hangul(c):
69
+ return is_hangul_syllable(c) or is_hangul_compat_jamo(c)
70
+
71
+
72
+ def check_hangul(c, jamo_only=False):
73
+ if not ((jamo_only or is_hangul_compat_jamo(c)) or is_supported_hangul(c)):
74
+ raise ValueError(f"'{c}' is not a supported hangul character. "
75
+ f"'Hangul Syllables' (0xac00 ~ 0xd7a3) and "
76
+ f"'Hangul Compatibility Jamos' (0x3130 ~ 0x318f) are "
77
+ f"supported at the moment.")
78
+
79
+
80
+ def get_jamo_type(c):
81
+ check_hangul(c)
82
+ assert is_hangul_compat_jamo(c), f"not a jamo: {ord(c):x}"
83
+ return sum(t for t, s in CHAR_SETS.items() if c in s)
84
+
85
+
86
+ def split_syllable_char(c):
87
+ """
88
+ Splits a given korean syllable into its components. Each component is
89
+ represented by Unicode in 'Hangul Compatibility Jamo' range.
90
+
91
+ Arguments:
92
+ c: A Korean character.
93
+
94
+ Returns:
95
+ A triple (initial, medial, final) of Hangul Compatibility Jamos.
96
+ If no jamo corresponds to a position, `None` is returned there.
97
+
98
+ Example:
99
+ >>> split_syllable_char("안")
100
+ ("ㅇ", "ㅏ", "ㄴ")
101
+ >>> split_syllable_char("고")
102
+ ("ㄱ", "ㅗ", None)
103
+ >>> split_syllable_char("ㅗ")
104
+ (None, "ㅗ", None)
105
+ >>> split_syllable_char("ㅇ")
106
+ ("ㅇ", None, None)
107
+ """
108
+ check_hangul(c)
109
+ if len(c) != 1:
110
+ raise ValueError("Input string must have exactly one character.")
111
+
112
+ init, med, final = None, None, None
113
+ if is_hangul_syllable(c):
114
+ offset = ord(c) - 0xac00
115
+ x = (offset - offset % 28) // 28
116
+ init, med, final = x // 21, x % 21, offset % 28
117
+ if not final:
118
+ final = None
119
+ else:
120
+ final -= 1
121
+ else:
122
+ pos = get_jamo_type(c)
123
+ if pos & INITIAL == INITIAL:
124
+ pos = INITIAL
125
+ elif pos & MEDIAL == MEDIAL:
126
+ pos = MEDIAL
127
+ elif pos & FINAL == FINAL:
128
+ pos = FINAL
129
+ idx = CHAR_INDICES[pos][c]
130
+ if pos == INITIAL:
131
+ init = idx
132
+ elif pos == MEDIAL:
133
+ med = idx
134
+ else:
135
+ final = idx
136
+ return tuple(CHAR_LISTS[pos][idx] if idx is not None else None
137
+ for pos, idx in
138
+ zip([INITIAL, MEDIAL, FINAL], [init, med, final]))
139
+
140
+
141
+ def split_syllables(s, ignore_err=True, pad=None):
142
+ """
143
+ Performs syllable-split on a string.
144
+
145
+ Arguments:
146
+ s (str): A string (possibly mixed with non-Hangul characters).
147
+ ignore_err (bool): If set False, it ensures that all characters in
148
+ the string are Hangul-splittable and throws a ValueError otherwise.
149
+ (default: True)
150
+ pad (str): Pad empty jamo positions (initial, medial, or final) with
151
+ `pad` character. This is useful for cases where fixed-length
152
+ strings are needed. (default: None)
153
+
154
+ Returns:
155
+ Hangul-split string
156
+
157
+ Example:
158
+ >>> split_syllables("안녕하세요")
159
+ "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"
160
+ >>> split_syllables("안녕하세요~~", ignore_err=False)
161
+ ValueError: encountered an unsupported character: ~ (0x7e)
162
+ >>> split_syllables("안녕하세요ㅛ", pad="x")
163
+ 'ㅇㅏㄴㄴㅕㅇㅎㅏxㅅㅔxㅇㅛxxㅛx'
164
+ """
165
+
166
+ def try_split(c):
167
+ try:
168
+ return split_syllable_char(c)
169
+ except ValueError:
170
+ if ignore_err:
171
+ return (c,)
172
+ raise ValueError(f"encountered an unsupported character: "
173
+ f"{c} (0x{ord(c):x})")
174
+
175
+ s = map(try_split, s)
176
+ if pad is not None:
177
+ tuples = map(lambda x: tuple(pad if y is None else y for y in x), s)
178
+ else:
179
+ tuples = map(lambda x: filter(None, x), s)
180
+ return "".join(itertools.chain(*tuples))
181
+
182
+
183
+ def join_jamos_char(init, med, final=None):
184
+ """
185
+ Combines jamos into a single syllable.
186
+
187
+ Arguments:
188
+ init (str): Initial jao.
189
+ med (str): Medial jamo.
190
+ final (str): Final jamo. If not supplied, the final syllable is made
191
+ without the final. (default: None)
192
+
193
+ Returns:
194
+ A Korean syllable.
195
+ """
196
+ chars = (init, med, final)
197
+ for c in filter(None, chars):
198
+ check_hangul(c, jamo_only=True)
199
+
200
+ idx = tuple(CHAR_INDICES[pos][c] if c is not None else c
201
+ for pos, c in zip((INITIAL, MEDIAL, FINAL), chars))
202
+ init_idx, med_idx, final_idx = idx
203
+ # final index must be shifted once as
204
+ # final index with 0 points to syllables without final
205
+ final_idx = 0 if final_idx is None else final_idx + 1
206
+ return chr(0xac00 + 28 * 21 * init_idx + 28 * med_idx + final_idx)
207
+
208
+
209
+ def join_jamos(s, ignore_err=True):
210
+ """
211
+ Combines a sequence of jamos to produce a sequence of syllables.
212
+
213
+ Arguments:
214
+ s (str): A string (possible mixed with non-jamo characters).
215
+ ignore_err (bool): If set False, it will ensure that all characters
216
+ will be consumed for the making of syllables. It will throw a
217
+ ValueError when it fails to do so. (default: True)
218
+
219
+ Returns:
220
+ A string
221
+
222
+ Example:
223
+ >>> join_jamos("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
224
+ "안녕하세요"
225
+ >>> join_jamos("ㅇㅏㄴㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
226
+ "안ㄴ녕하세요"
227
+ >>> join_jamos()
228
+ """
229
+ last_t = 0
230
+ queue = []
231
+ new_string = ""
232
+
233
+ def flush(n=0):
234
+ new_queue = []
235
+ while len(queue) > n:
236
+ new_queue.append(queue.pop())
237
+ if len(new_queue) == 1:
238
+ if not ignore_err:
239
+ raise ValueError(f"invalid jamo character: {new_queue[0]}")
240
+ result = new_queue[0]
241
+ elif len(new_queue) >= 2:
242
+ try:
243
+ result = join_jamos_char(*new_queue)
244
+ except (ValueError, KeyError):
245
+ # Invalid jamo combination
246
+ if not ignore_err:
247
+ raise ValueError(f"invalid jamo characters: {new_queue}")
248
+ result = "".join(new_queue)
249
+ else:
250
+ result = None
251
+ return result
252
+
253
+ for c in s:
254
+ if c not in CHARSET:
255
+ if queue:
256
+ new_c = flush() + c
257
+ else:
258
+ new_c = c
259
+ last_t = 0
260
+ else:
261
+ t = get_jamo_type(c)
262
+ new_c = None
263
+ if t & FINAL == FINAL:
264
+ if not (last_t == MEDIAL):
265
+ new_c = flush()
266
+ elif t == INITIAL:
267
+ new_c = flush()
268
+ elif t == MEDIAL:
269
+ if last_t & INITIAL == INITIAL:
270
+ new_c = flush(1)
271
+ else:
272
+ new_c = flush()
273
+ last_t = t
274
+ queue.insert(0, c)
275
+ if new_c:
276
+ new_string += new_c
277
+ if queue:
278
+ new_string += flush()
279
+ return new_string
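A small round-trip sketch for the jamo utilities above, reusing the strings from the docstring examples:

from korscideberta.unicode import split_syllables, join_jamos

jamo = split_syllables("안녕하세요")
print(jamo)              # ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ
print(join_jamos(jamo))  # 안녕하세요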
pyproject.toml ADDED
@@ -0,0 +1,20 @@
1
+ # file: pyproject.toml
2
+
3
+ [build-system]
4
+ requires = [
5
+ "setuptools >= 65",
6
+ "wheel >= 0.38",
7
+ ]
8
+ build-backend = "setuptools.build_meta"
9
+
10
+ [project]
11
+ name = "korscideberta"
12
+ version = "0.1.0"
13
+ readme = "README.md"
14
+ requires-python = ">=3.8"
15
+ dependencies = [
16
+ "sentencepiece",
17
+ "transformers",
18
+ "mecab",
19
+ "konlpy",
20
+ ]