# What this Processor does:
# - Tokenizes text with the tokenizer.
# - Converts time-series data (DataFrame, NumPy array, or torch tensor) into a torch tensor.
# - Returns the result in the form {"input_ids": ..., "attention_mask": ..., "time_series_values": ...}.
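#
# Example of the returned BatchFeature (shapes are illustrative assumptions,
# not guarantees from the model):
#   {
#       "input_ids":          LongTensor of shape (batch, seq_len),
#       "attention_mask":     LongTensor of shape (batch, seq_len),
#       "time_series_values": FloatTensor of shape (batch, n_channels, context_len),
#   }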

from typing import List, Optional, Union

import numpy as np
import torch
from pandas import DataFrame

from transformers import BatchFeature, ProcessorMixin, TensorType
from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy


class MistsProcessor(ProcessorMixin):

    # Ideally, MOMENT's tokenizer would also be exposed here as a ts_tokenizer,
    # but it is built into the model itself.
    # refers: https://github.com/moment-timeseries-foundation-model/moment/blob/088b253a1138ac7e48a7efc9bf902336c9eec8d9/momentfm/models/moment.py#L105

    # These two components are what a ts_tokenizer would normally cover:
    # (normalizer): RevIN()
    # (tokenizer): Patching()
    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor=None, tokenizer=None):
        super().__init__(feature_extractor, tokenizer)
    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        time_series: Optional[Union[DataFrame, np.ndarray, torch.Tensor, List[DataFrame], List[np.ndarray], List[torch.Tensor]]] = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        torch_dtype: Optional[Union[str, torch.dtype]] = torch.float,
        time_series_padding: Union[bool, str] = False,
        time_series_max_length: Optional[int] = None,
        text_tokenize: bool = True,
    ) -> BatchFeature:
        # Convert the time series to tensors via the feature extractor.
        if time_series is not None:
            time_series_values = self.feature_extractor(
                time_series,
                return_tensors=return_tensors,
                torch_dtype=torch_dtype,
                padding=time_series_padding,
                max_length=time_series_max_length,
            )
        else:
            # Use an empty dict so the merge below does not fail when no
            # time series is given.
            time_series_values = {}

        # Tokenize the text, or pass it through untouched when text_tokenize is False.
        if text is not None:
            if text_tokenize:
                text_inputs = self.tokenizer(
                    text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
                )
            else:
                text_inputs = {"text": text}
        else:
            text_inputs = {}

        return BatchFeature(data={**text_inputs, **time_series_values})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)
    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
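

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the processor itself). The checkpoint
# path below is a placeholder; any tokenizer / feature-extractor pair saved for
# this model should work.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoFeatureExtractor, AutoTokenizer

    # Placeholder checkpoint name; replace with a real Mists checkpoint.
    checkpoint = "path/to/mists-checkpoint"
    processor = MistsProcessor(
        feature_extractor=AutoFeatureExtractor.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )

    # One univariate series of length 512 in (batch, channels, time) layout.
    series = np.random.randn(1, 1, 512).astype(np.float32)
    batch = processor(text="Describe this time series.", time_series=series)
    print(batch.keys())  # expected: input_ids, attention_mask, time_series_values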