HachiML committed on
Commit
d742904
1 Parent(s): 13df84c

Upload processor

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<pad>": 32769,
3
+ "<time_series>": 32768
4
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoFeatureExtractor": "HachiML/MOMENT-1-large-embedding-v0.1--feature_extraction_moment.MomentFeatureExtractor",
4
+ "AutoProcessor": "processing_mists.MistsProcessor"
5
+ },
6
+ "feature_extractor_type": "MomentFeatureExtractor",
7
+ "processor_class": "MistsProcessor"
8
+ }
processing_mists.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Processorでの実施事項
2
+ # - TokenizerでTokenize
3
+ # - 時系列データをdataframe, numpy array, torch tensorの状態からtorch tensor化
4
+ # input_ids: , attention_mask: , time_series_values: の形式で返す。
5
+
6
+ from typing import List, Optional, Union
7
+
8
+ from pandas import DataFrame
9
+ import numpy as np
10
+ import torch
11
+ import tensorflow as tf
12
+ import jax.numpy as jnp
13
+
14
+ from transformers import ProcessorMixin
15
+ from transformers import TensorType
16
+ from transformers import BatchFeature
17
+ from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
18
+
19
+
20
class MistsProcessor(ProcessorMixin):
    """Processor that bundles a text tokenizer with a time-series feature extractor.

    Calling the processor tokenizes ``text`` with ``self.tokenizer``, runs
    ``time_series`` through ``self.feature_extractor``, and merges both
    results into a single ``BatchFeature``.
    """

    # Ideally the MOMENT-side tokenizer would also be wired in here as a
    # ``ts_tokenizer``, but it is built into the model itself.
    # refers: https://github.com/moment-timeseries-foundation-model/moment/blob/088b253a1138ac7e48a7efc9bf902336c9eec8d9/momentfm/models/moment.py#L105

    # These two parts arguably belong to a ``ts_tokenizer``:
    #   (normalizer): RevIN()
    #   (tokenizer): Patching()
    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor=None, tokenizer=None):
        """Store the feature extractor and tokenizer via ``ProcessorMixin``."""
        super().__init__(feature_extractor, tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        time_series: Union[DataFrame, np.ndarray, torch.Tensor, List[DataFrame], List[np.ndarray], List[torch.Tensor]] = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Union[int, None] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        torch_dtype: Optional[Union[str, torch.dtype]] = torch.float,
        time_series_padding: Union[bool, str] = False,
        time_series_max_length: Union[int, None] = None,
    ) -> BatchFeature:
        """Prepare text and/or time-series inputs for the model.

        Args:
            text: Text input(s) forwarded to ``self.tokenizer``. Skipped when
                ``None``.
            time_series: Time-series input(s) forwarded to
                ``self.feature_extractor``. Skipped when ``None``.
            padding: Forwarded to the tokenizer as ``padding``.
            truncation: Forwarded to the tokenizer as ``truncation``.
            max_length: Forwarded to the tokenizer as ``max_length``.
            return_tensors: Forwarded to both the tokenizer and the feature
                extractor as ``return_tensors``.
            torch_dtype: Forwarded to the feature extractor as ``torch_dtype``.
            time_series_padding: Forwarded to the feature extractor as
                ``padding``.
            time_series_max_length: Forwarded to the feature extractor as
                ``time_series_max_length``.

        Returns:
            A ``BatchFeature`` holding the tokenizer outputs merged with the
            feature-extractor outputs; on a key collision the
            feature-extractor value wins.

        Raises:
            ValueError: If neither ``text`` nor ``time_series`` is provided.
        """
        if text is None and time_series is None:
            raise ValueError("You have to specify at least one of `text` or `time_series`.")

        # Default to empty mappings so a missing modality contributes nothing
        # to the merged output (unpacking ``None`` with ``**`` would raise
        # ``TypeError``).
        time_series_values = {}
        if time_series is not None:
            time_series_values = self.feature_extractor(
                time_series,
                return_tensors=return_tensors,
                torch_dtype=torch_dtype,
                padding=time_series_padding,
                time_series_max_length=time_series_max_length,
            )

        text_inputs = {}
        if text is not None:
            text_inputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
            )

        return BatchFeature(data={**text_inputs, **time_series_values})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by feature-extractor input names, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        # ``dict.fromkeys`` drops duplicates while keeping first-seen order.
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_mists.MistsProcessor"
4
+ },
5
+ "processor_class": "MistsProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff