megalaa committed
Commit 10597c2
Parent: d5b7867

Upload 11 files

args.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "hf/norm_group_greekified-finetuned_with_backtranslation_1", "src_language": "eng", "tgt_language": "cop", "max_input_length": 128, "max_target_length": 128}
config.json ADDED
@@ -0,0 +1,61 @@
+ {
+   "_name_or_path": "Helsinki-NLP/opus-mt-en-mul",
+   "activation_dropout": 0.0,
+   "activation_function": "swish",
+   "add_bias_logits": false,
+   "add_final_layer_norm": false,
+   "architectures": [
+     "MarianMTModel"
+   ],
+   "attention_dropout": 0.0,
+   "bad_words_ids": [
+     [
+       64109
+     ]
+   ],
+   "bos_token_id": 0,
+   "classif_dropout": 0.0,
+   "classifier_dropout": 0.0,
+   "d_model": 512,
+   "decoder_attention_heads": 8,
+   "decoder_ffn_dim": 2048,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 64109,
+   "decoder_vocab_size": 64110,
+   "dropout": 0.1,
+   "encoder_attention_heads": 8,
+   "encoder_ffn_dim": 2048,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "eos_token_id": 0,
+   "extra_pos_embeddings": 64110,
+   "forced_eos_token_id": 0,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "max_length": 512,
+   "max_position_embeddings": 512,
+   "model_type": "marian",
+   "normalize_before": false,
+   "normalize_embedding": false,
+   "num_beams": 4,
+   "num_hidden_layers": 6,
+   "pad_token_id": 64109,
+   "scale_embedding": true,
+   "share_encoder_decoder_embeddings": true,
+   "static_position_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.33.1",
+   "use_cache": true,
+   "vocab_size": 64110
+ }
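
The architecture is the stock Marian setup inherited from Helsinki-NLP/opus-mt-en-mul: 6 encoder and 6 decoder layers, d_model 512, 8 attention heads, and a shared 64110-entry vocabulary, with generation defaults of 4 beams and a 512-token cap. A quick sanity check after cloning the repository (a minimal sketch, assuming the files sit in the current directory):

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained(".")
    assert config.model_type == "marian"
    assert config.encoder_layers == config.decoder_layers == 6
    assert config.vocab_size == 64110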
config.properties ADDED
@@ -0,0 +1,15 @@
+ models={\
+   "en-cop-norm-group-greekified-bt": {\
+     "1.0": {\
+       "defaultVersion": true,\
+       "marName": "en-cop-norm-group-greekified-bt.mar",\
+       "minWorkers": 1,\
+       "maxWorkers": 4,\
+       "batchSize": 1,\
+       "maxBatchDelay": 100,\
+       "responseTimeout": 120\
+     }\
+   }\
+ }
+ inference_address=http://0.0.0.0:7080
+ service_envelope=json
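
This TorchServe config registers the model archive with 1-4 workers, per-request batching (batchSize 1, up to 100 ms batch delay), and a 120 s response timeout, and exposes the inference API on port 7080 with the JSON service envelope. After starting the server (e.g. torchserve --start --model-store <dir> --ts-config config.properties), a client call could look like the following (a sketch: with service_envelope=json, TorchServe expects KServe-style {"instances": [...]} payloads and wraps responses as {"predictions": [...]}):

    import requests

    resp = requests.post(
        "http://0.0.0.0:7080/predictions/en-cop-norm-group-greekified-bt",
        json={"instances": [{"data": "In the beginning was the Word."}]},
    )
    print(resp.json())  # e.g. {"predictions": [{"code": 200, "translation": "..."}]}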
handler.py ADDED
@@ -0,0 +1,201 @@
+ from dataclasses import dataclass
+ import logging
+ import os
+ from abc import ABC
+ from typing import Optional
+
+ import torch
+ import json
+
+ from transformers import (
+     AutoModelForSeq2SeqLM,
+     AutoTokenizer,
+ )
+ from ts.torch_handler.base_handler import BaseHandler
+
+ logger = logging.getLogger(__name__)
+
+ MAX_TOKEN_LENGTH_ERR = {
+     "code": 422,
+     "type": "MaxTokenLengthError",
+     "message": "Max token length exceeded",
+ }
+
+
+ class EngCopHandler(BaseHandler, ABC):
+     @dataclass
+     class GenerationConfig:
+         max_length: int = 20
+         max_new_tokens: Optional[int] = None
+         min_length: int = 0
+         min_new_tokens: Optional[int] = None
+         early_stopping: bool = True
+         do_sample: bool = False
+         num_beams: int = 1
+         num_beam_groups: int = 1
+         top_k: int = 50
+         top_p: float = 0.95
+         temperature: float = 1.0
+         diversity_penalty: float = 0.0
+
+     def __init__(self):
+         super().__init__()
+         self.initialized = False
+
+     def initialize(self, ctx):
+         """Load the Hugging Face seq2seq model and tokenizer from the
+         model directory.
+         Args:
+             ctx (context): A JSON object containing information
+             pertaining to the model artifact parameters.
+         """
+         logger.info("Start initialize")
+         self.manifest = ctx.manifest
+         properties = ctx.system_properties
+         model_dir = properties.get("model_dir")
+         serialized_file = self.manifest["model"]["serializedFile"]
+         model_pt_path = os.path.join(model_dir, serialized_file)
+
+         self.setup_config = {}
+         setup_config_path = os.path.join(model_dir, "setup_config.json")
+         if os.path.isfile(setup_config_path):
+             with open(setup_config_path) as setup_config_file:
+                 self.setup_config = json.load(setup_config_file)
+
+         seed = 42
+         torch.manual_seed(seed)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         logger.info("Device: %s", self.device)
+
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
+         self.model.to(self.device)
+         self.model.eval()
+         self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+         self.config = EngCopHandler.GenerationConfig(
+             max_new_tokens=128,
+             min_new_tokens=1,
+             num_beams=5,
+         )
+         self.initialized = True
+         logger.info("Init done")
+
+     def preprocess(self, requests):
+         preprocessed_data = []
+         for data in requests:
+             data_item = data.get("data")
+             if data_item is None:
+                 data_item = data.get("body")
+             if isinstance(data_item, (bytes, bytearray)):
+                 data_item = data_item.decode("utf-8")
+             preprocessed_data.append(data_item)
+         logger.info("preprocessed_data: %s", preprocessed_data)
+         return preprocessed_data
+
+     def inference(self, data):
+         # Map request indices to batch positions; over-long inputs are skipped.
+         indices = {}
+         batch = []
+         for i, item in enumerate(data):
+             tokens = self.tokenizer(item, return_tensors="pt", padding=True)
+             if len(tokens.input_ids.squeeze()) > self.tokenizer.model_max_length:
+                 logger.info("Skipping token %s for index %s", tokens, i)
+                 continue
+             indices[i] = len(batch)
+             batch.append(data[i])
+         logger.info("inference batch: %s", batch)
+         result = self.batch_translate(batch)
+         return [
+             degreekify(result[indices[i]]) if i in indices else None
+             for i in range(len(data))
+         ]
+
+     def postprocess(self, output):
+         return output
+
+     def handle(self, requests, context):
+         logger.info("requests: %s", requests)
+         preprocessed = self.preprocess(requests)
+         inference_data = self.inference(preprocessed)
+         postprocessed = self.postprocess(inference_data)
+         logger.info("inference result: %s", postprocessed)
+
+         responses = [
+             {"code": 200, "translation": translation}
+             if translation
+             else MAX_TOKEN_LENGTH_ERR
+             for translation in postprocessed
+         ]
+         return responses
+
+     def batch_translate(self, input_sentences, output_confidence=False):
+         if len(input_sentences) == 0:
+             return []
+         inputs = self.tokenizer(input_sentences, return_tensors="pt", padding=True).to(
+             self.device
+         )
+         outputs = self.model.generate(
+             **inputs,
+             max_length=self.config.max_length,
+             max_new_tokens=self.config.max_new_tokens,
+             min_length=self.config.min_length,
+             min_new_tokens=self.config.min_new_tokens,
+             early_stopping=self.config.early_stopping,
+             do_sample=self.config.do_sample,
+             num_beams=self.config.num_beams,
+             num_beam_groups=self.config.num_beam_groups,
+             top_k=self.config.top_k,
+             top_p=self.config.top_p,
+             temperature=self.config.temperature,
+             diversity_penalty=self.config.diversity_penalty,
+             output_scores=output_confidence,
+             # Always return a dict so outputs.sequences is available below.
+             return_dict_in_generate=True,
+         )
+         translated_text = self.tokenizer.batch_decode(
+             outputs.sequences, skip_special_tokens=True
+         )
+         return translated_text
+
+
+ GREEK_TO_COPTIC = {
+     "α": "ⲁ",
+     "β": "ⲃ",
+     "γ": "ⲅ",
+     "δ": "ⲇ",
+     "ε": "ⲉ",
+     "ϛ": "ⲋ",
+     "ζ": "ⲍ",
+     "η": "ⲏ",
+     "θ": "ⲑ",
+     "ι": "ⲓ",
+     "κ": "ⲕ",
+     "λ": "ⲗ",
+     "μ": "ⲙ",
+     "ν": "ⲛ",
+     "ξ": "ⲝ",
+     "ο": "ⲟ",
+     "π": "ⲡ",
+     "ρ": "ⲣ",
+     "σ": "ⲥ",
+     "τ": "ⲧ",
+     "υ": "ⲩ",
+     "φ": "ⲫ",
+     "χ": "ⲭ",
+     "ψ": "ⲯ",
+     "ω": "ⲱ",
+     "s": "ϣ",
+     "f": "ϥ",
+     "k": "ϧ",
+     "h": "ϩ",
+     "j": "ϫ",
+     "c": "ϭ",
+     "t": "ϯ",
+ }
+
+
+ def degreekify(greek_text):
+     chars = []
+     for c in greek_text:
+         l_c = c.lower()
+         chars.append(GREEK_TO_COPTIC.get(l_c, l_c))
+     return "".join(chars)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07ad933ead590231162cdc7cb1b63b572436f934865ae2cd9ab9a1d172e79da5
+ size 308194181
setup_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "model_name": "en-cop-norm-group-greekified-bt",
+   "save_mode": "pretrained",
+   "max_length": "512"
+ }
source.spm ADDED
Binary file (790 kB).
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
target.spm ADDED
Binary file (707 kB).
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "</s>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "separate_vocabs": false,
+   "source_lang": "eng",
+   "sp_model_kwargs": {},
+   "target_lang": "mul",
+   "tokenizer_class": "MarianTokenizer",
+   "unk_token": "<unk>"
+ }
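
The tokenizer is the unmodified MarianTokenizer of the base en-mul checkpoint (SentencePiece models in source.spm/target.spm, 512-token limit). Together with the weights above, this supports a plain transformers round trip outside TorchServe, mirroring what handler.py does (a minimal sketch, assuming the repository is cloned into the current directory):

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    model_dir = "."  # path to a local clone of this repository
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

    inputs = tokenizer(["In the beginning was the Word."], return_tensors="pt", padding=True)
    outputs = model.generate(**inputs, max_new_tokens=128, min_new_tokens=1, num_beams=5)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    # Output is greekified text; handler.py's degreekify converts it to Coptic script.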
vocab.json ADDED
The diff for this file is too large to render.