# Mists-7B-v01-not-trained / processing_mists.py
# What this processor does:
# - Tokenizes text with the tokenizer.
# - Converts time-series data from a DataFrame, numpy array, or torch tensor into a torch tensor.
# - Returns a batch of the form: input_ids, attention_mask, time_series_values.
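#
# Illustrative call pattern (hypothetical example, not from the original file; `df` stands
# in for any supported time-series input):
#   batch = processor(text="Describe this series.", time_series=df)
#   batch["input_ids"], batch["attention_mask"], batch["time_series_values"]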
from typing import List, Optional, Union

import numpy as np
import torch
from pandas import DataFrame

from transformers import BatchFeature, ProcessorMixin, TensorType
from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy


class MistsProcessor(ProcessorMixin):
    # Ideally, Moment's own tokenizer would also be wrapped here as a ts_tokenizer, but it is
    # built into the model itself.
    # refers: https://github.com/moment-timeseries-foundation-model/moment/blob/088b253a1138ac7e48a7efc9bf902336c9eec8d9/momentfm/models/moment.py#L105
    # These two components would arguably belong to such a ts_tokenizer:
    # (normalizer): RevIN()
    # (tokenizer): Patching()
    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor=None, tokenizer=None):
        super().__init__(feature_extractor, tokenizer)

    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        time_series: Optional[Union[DataFrame, np.ndarray, torch.Tensor, List[DataFrame], List[np.ndarray], List[torch.Tensor]]] = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        torch_dtype: Optional[Union[str, torch.dtype]] = torch.float,
        time_series_padding: Union[bool, str] = False,
        time_series_max_length: Optional[int] = None,
        text_tokenize: bool = True,
    ) -> BatchFeature:
        # Convert the time series (DataFrame / numpy array / torch tensor) into model-ready tensors.
        if time_series is not None:
            time_series_values = self.feature_extractor(
                time_series,
                return_tensors=return_tensors,
                torch_dtype=torch_dtype,
                padding=time_series_padding,
                max_length=time_series_max_length,
            )
        else:
            # Use an empty dict so the merge into BatchFeature below does not fail on **None.
            time_series_values = {}
        # Tokenize the text, or pass it through untouched when text_tokenize=False.
        if text is not None:
            if text_tokenize:
                text_inputs = self.tokenizer(
                    text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
                )
            else:
                text_inputs = {"text": text}
        else:
            text_inputs = {}
        return BatchFeature(data={**text_inputs, **time_series_values})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        # Merge the two name lists, preserving order and dropping duplicates.
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
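

# Minimal usage sketch (an assumption-heavy illustration, not part of the original file):
# the checkpoint paths below are placeholders, and the expected time-series shape depends
# entirely on the feature extractor that ships with the model.
if __name__ == "__main__":
    from transformers import AutoFeatureExtractor, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("path/to/text-tokenizer")  # placeholder path
    feature_extractor = AutoFeatureExtractor.from_pretrained("path/to/ts-extractor")  # placeholder path
    processor = MistsProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

    # Two univariate series of length 512, paired with two prompts (shapes are illustrative).
    series = torch.randn(2, 512)
    batch = processor(text=["Describe this series."] * 2, time_series=series, padding=True)
    print(batch.keys())  # expected: input_ids, attention_mask, time_series_values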