clip-japanese-base / tokenization_clyp.py
# coding=utf-8
# Copyright 2024 LY Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from typing import Optional
import torch
from transformers import BatchEncoding, PreTrainedTokenizer, T5Tokenizer
from transformers.tokenization_utils_base import (
PaddingStrategy,
PreTokenizedInput,
TextInput,
TruncationStrategy,
)


class CLYPTokenizer(PreTrainedTokenizer):
"""CLYPTokenizer based on rinna/japanese-roberta-base
This tokenizer is registered as a custom tokenizer to manually add CLS token to each text.
"""
def __init__(self, max_length: int, padding: str, truncation: bool, **kwargs):
        # Underlying SentencePiece tokenizer from rinna/japanese-roberta-base,
        # loaded through the T5Tokenizer class; lower-casing follows the
        # upstream model card.
        self.tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-roberta-base")
        self.tokenizer.do_lower_case = True
super().__init__(
max_length=max_length, padding=padding, truncation=truncation, **kwargs
)
        # Defaults applied in __call__ when the caller does not override them.
        self.max_length = max_length
        self.padding = padding
        self.truncation = truncation

    @property
    def vocab_size(self):
        return self.tokenizer.vocab_size

    def get_vocab(self) -> dict[str, int]:
        return self.tokenizer.get_vocab()

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> tuple[str]:
        return self.tokenizer.save_vocabulary(
            save_directory, filename_prefix=filename_prefix
        )

    def _tokenize(self, text, **kwargs):
        return self.tokenizer._tokenize(text, **kwargs)

    def _convert_token_to_id(self, token):
        return self.tokenizer._convert_token_to_id(token)

    def _convert_id_to_token(self, index: int) -> str:
        return self.tokenizer._convert_id_to_token(index)

    def __call__(
self,
text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
add_special_tokens: bool = True,
padding: bool | str | PaddingStrategy | None = None,
truncation: bool | str | TruncationStrategy | None = None,
max_length: Optional[int] = None,
**kwargs,
):
        # Fall back to the defaults supplied at construction time.
        if max_length is None:
            max_length = self.max_length
        if padding is None:
            padding = self.padding
        if truncation is None:
            truncation = self.truncation
        if add_special_tokens:
            # Reserve one position for the CLS token prepended below.
            max_length = max_length - 1
        if not isinstance(text, list):
            # TODO: Review. A single text is wrapped in a list so the batched
            # logic below can be applied unconditionally.
            text = [text]
        # Tokenize without special tokens; the CLS token is prepended manually below.
        out = self.tokenizer(
            text,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            add_special_tokens=False,
            **kwargs,
        )
        if add_special_tokens:
            # Prepend the CLS token to every sequence and extend the attention
            # mask to cover it.
            input_ids = [
                [self.tokenizer.cls_token_id] + ids for ids in out["input_ids"]
            ]
            attention_mask = [[1] + am for am in out["attention_mask"]]
        else:
            input_ids = out["input_ids"]
            attention_mask = out["attention_mask"]
        # Position ids are derived from the first sequence, which assumes every
        # sequence has been padded to the same length.
        position_ids = [list(range(len(input_ids[0])))] * len(input_ids)
        # Convert to tensors.
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)
        position_ids = torch.tensor(position_ids, dtype=torch.long)
        # Return the batch as a BatchEncoding.
        data = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
        }
        return BatchEncoding(data=data)
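

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The constructor
    # arguments below are illustrative placeholders; the values actually used
    # by clip-japanese-base come from its tokenizer configuration.
    tokenizer = CLYPTokenizer(max_length=77, padding="max_length", truncation=True)
    batch = tokenizer(["犬の写真", "富士山の風景"])  # "a photo of a dog", "a view of Mt. Fuji"
    # With padding="max_length", every tensor should have shape (batch, max_length),
    # i.e. (2, 77) for this example.
    print(batch["input_ids"].shape)
    print(batch["attention_mask"].shape)
    print(batch["position_ids"].shape)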