# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
# Copyright 2020 SacreBLEU Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from functools import lru_cache


class BaseTokenizer:
    """A base dummy tokenizer to derive from."""

    def signature(self):
        """
        Returns a signature for the tokenizer.

        :return: signature string
        """
        return "none"

    def __call__(self, line):
        """
        Tokenizes an input line with the tokenizer.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        return line
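
# Interface sketch (illustrative, not part of the upstream file): the base
# class is an identity tokenizer, e.g. BaseTokenizer()("some segment")
# returns "some segment" unchanged, with signature "none".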


class TokenizerRegexp(BaseTokenizer):
    def signature(self):
        return "re"

    def __init__(self):
        self._re = [
            # language-dependent part (assuming Western languages)
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
            # one space only between words
            # NOTE: Doing this in Python (below) is faster
            # (re.compile(r'\s+'), r' '),
        ]
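
        # Illustrative note (not part of the upstream file): applied in order,
        # these patterns pad punctuation with spaces, keep periods and commas
        # that sit between digits attached, and split off a dash that follows
        # a digit; after the final whitespace split:
        #   "Hello, world!"   -> ['Hello', ',', 'world', '!']
        #   "It costs 3.5 GB" -> ['It', 'costs', '3.5', 'GB']
        #   "a 100-yard dash" -> ['a', '100', '-', 'yard', 'dash']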

    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        for (_re, repl) in self._re:
            line = _re.sub(repl, line)

        # no leading or trailing spaces, single space within words
        # return ' '.join(line.split())
        # Changed from the original tokenizer (commented out above) to return
        # a list of individual tokens rather than a single joined string.
        return line.split()


class Tokenizer13a(BaseTokenizer):
    def signature(self):
        return "13a"

    def __init__(self):
        self._post_tokenizer = TokenizerRegexp()

    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is however equivalent to mteval-v13a, used by WMT.

        :param line: a segment to tokenize
        :return: the tokenized line
        """

        # language-independent part:
        line = line.replace("<skipped>", "")
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")
if "&" in line: | |
line = line.replace(""", '"') | |
line = line.replace("&", "&") | |
line = line.replace("<", "<") | |
line = line.replace(">", ">") | |
return self._post_tokenizer(f" {line} ") | |
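

# Minimal usage sketch (illustrative, not part of the upstream file):
if __name__ == "__main__":
    tok = Tokenizer13a()
    # HTML entities are unescaped first, then punctuation is split off
    # into separate tokens by the shared regexp post-tokenizer.
    print(tok("The &quot;quick&quot; fox: it jumped!"))
    # -> ['The', '"', 'quick', '"', 'fox', ':', 'it', 'jumped', '!']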