# Mists-7B-v01-not-trained / processing_mists.py
# What this processor does:
# - Tokenizes text with the tokenizer.
# - Converts time-series data from a DataFrame, numpy array, or torch tensor into a torch tensor.
# - Returns a batch of the form: input_ids, attention_mask, time_series_values.
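#
# Illustrative call pattern (hypothetical example, not from the original file; `df` stands
# in for any supported time-series input):
#   batch = processor(text="Describe this series.", time_series=df)
#   batch["input_ids"], batch["attention_mask"], batch["time_series_values"]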
from typing import List, Optional, Union

import numpy as np
import torch
from pandas import DataFrame

from transformers import BatchFeature, ProcessorMixin, TensorType
from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy


class MistsProcessor(ProcessorMixin):
    # Ideally, Moment's own tokenizer would also be wrapped here as a ts_tokenizer, but it is
    # built into the model itself.
    # refers: https://github.com/moment-timeseries-foundation-model/moment/blob/088b253a1138ac7e48a7efc9bf902336c9eec8d9/momentfm/models/moment.py#L105
    # These two components would arguably belong to such a ts_tokenizer:
    # (normalizer): RevIN()
    # (tokenizer): Patching()
    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor=None, tokenizer=None):
        super().__init__(feature_extractor, tokenizer)

    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        time_series: Optional[Union[DataFrame, np.ndarray, torch.Tensor, List[DataFrame], List[np.ndarray], List[torch.Tensor]]] = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        torch_dtype: Optional[Union[str, torch.dtype]] = torch.float,
        time_series_padding: Union[bool, str] = False,
        time_series_max_length: Optional[int] = None,
        text_tokenize: bool = True,
    ) -> BatchFeature:
        # Convert the time series (DataFrame / numpy array / torch tensor) into model-ready tensors.
        if time_series is not None:
            time_series_values = self.feature_extractor(
                time_series,
                return_tensors=return_tensors,
                torch_dtype=torch_dtype,
                padding=time_series_padding,
                max_length=time_series_max_length,
            )
        else:
            # Use an empty dict so the merge into BatchFeature below does not fail on **None.
            time_series_values = {}
        # Tokenize the text, or pass it through untouched when text_tokenize=False.
        if text is not None:
            if text_tokenize:
                text_inputs = self.tokenizer(
                    text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
                )
            else:
                text_inputs = {"text": text}
        else:
            text_inputs = {}
        return BatchFeature(data={**text_inputs, **time_series_values})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        # Merge the two name lists, preserving order and dropping duplicates.
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
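

# Minimal usage sketch (an assumption-heavy illustration, not part of the original file):
# the checkpoint paths below are placeholders, and the expected time-series shape depends
# entirely on the feature extractor that ships with the model.
if __name__ == "__main__":
    from transformers import AutoFeatureExtractor, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("path/to/text-tokenizer")  # placeholder path
    feature_extractor = AutoFeatureExtractor.from_pretrained("path/to/ts-extractor")  # placeholder path
    processor = MistsProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

    # Two univariate series of length 512, paired with two prompts (shapes are illustrative).
    series = torch.randn(2, 512)
    batch = processor(text=["Describe this series."] * 2, time_series=series, padding=True)
    print(batch.keys())  # expected: input_ids, attention_mask, time_series_values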