Trace2333 committed on
Commit
2e7a062
·
2 Parent(s): 5aebdd6 115460a

Merge branch 'hf' into local-main

Browse files
Files changed (5) hide show
  1. app.py +98 -0
  2. app_test.py +14 -0
  3. gpt2_generation.py +379 -0
  4. requirements.txt +15 -0
  5. utils.py +12 -0
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import spacy
3
+ from accelerate import PartialState
4
+ from accelerate.utils import set_seed
5
+ from flask import Flask, request, jsonify
6
+
7
+ from gpt2_generation import Translator
8
+ from gpt2_generation import generate_prompt, MODEL_CLASSES
9
+
10
# Default outbound HTTP(S) traffic to the local proxy, but only when the
# environment has not already configured one; an explicit http_proxy /
# https_proxy set by the operator is left untouched.
os.environ.setdefault("http_proxy", "http://127.0.0.1:7890")
os.environ.setdefault("https_proxy", "http://127.0.0.1:7890")

app = Flask(__name__)

# Checkpoint directory handed to both the tokenizer and the model loader.
path_for_model = "./output/gpt2_openprompt/checkpoint-4500"

# Generation settings shared by load_model_and_components() and generate_prompt().
args = {
    "model_type": "gpt2",
    "model_name_or_path": path_for_model,
    "length": 80,
    "stop_token": None,
    "temperature": 1.0,
    "length_penalty": 1.2,
    "repetition_penalty": 1.2,
    "k": 3,
    "p": 0.9,
    "prefix": "",
    "padding_text": "",
    "xlm_language": "",
    "seed": 42,
    "use_cpu": False,
    "num_return_sequences": 1,
    "fp16": False,
    "jit": False,
}

# Supplies the device the model and the encoded prompts are moved to.
distributed_state = PartialState(cpu=args["use_cpu"])

if args["seed"] is not None:
    set_seed(args["seed"])

# Populated by load_model_and_components(); None means "not loaded yet".
tokenizer = None
model = None
zh_en_translator = None
nlp = None
46
+
47
def load_model_and_components():
    """Load the model, tokenizer, translator and spaCy pipeline into module globals.

    The components are built in local variables and published to the globals
    only after the model has been moved to the target device (and optionally
    cast to half precision), so a failure part-way through leaves the globals
    untouched and a later call can retry.

    Raises:
        KeyError: If ``args["model_type"]`` is not a key of ``MODEL_CLASSES``.
    """
    global tokenizer, model, zh_en_translator, nlp

    # Initialize the model and tokenizer
    model_type = args["model_type"].lower()
    args["model_type"] = model_type
    try:
        model_class, tokenizer_class = MODEL_CLASSES[model_type]
    except KeyError:
        # The original message contained an unformatted "{}" placeholder.
        raise KeyError(
            f"the model {model_type} you specified is not supported. "
            "You are welcome to add it and open a PR :)"
        ) from None

    loaded_tokenizer = tokenizer_class.from_pretrained(args["model_name_or_path"], padding_side='left')
    # Reuse the EOS token for padding and masking.
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
    loaded_tokenizer.mask_token = loaded_tokenizer.eos_token
    loaded_model = model_class.from_pretrained(args["model_name_or_path"])
    print("Model loaded!")

    # translator
    loaded_translator = Translator("Helsinki-NLP/opus-mt-zh-en")
    print("Translator loaded!")

    # filter
    loaded_nlp = spacy.load('en_core_web_sm')
    print("Filter loaded!")

    # Set the model to the right device
    loaded_model.to(distributed_state.device)

    if args["fp16"]:
        loaded_model.half()

    # Publish everything at once, only after every step above succeeded.
    tokenizer = loaded_tokenizer
    model = loaded_model
    zh_en_translator = loaded_translator
    nlp = loaded_nlp
76
+
77
@app.route('/chat', methods=['POST'])
def chat():
    """Handle POST /chat: generate text for the ``phrase`` field of the JSON body.

    Returns:
        The JSON-encoded result of ``generate_prompt`` on success, or a JSON
        error object with HTTP status 400 when the body is not a JSON object
        containing a non-empty string ``phrase``.
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising.
    payload = request.get_json(silent=True)
    phrase = payload.get('phrase') if isinstance(payload, dict) else None
    if not isinstance(phrase, str) or not phrase.strip():
        return jsonify({"error": "JSON body must contain a non-empty string field 'phrase'."}), 400

    # Lazy load for the case where the app was started without running the
    # __main__ block, which is the only other place the components are loaded.
    if tokenizer is None or model is None or zh_en_translator is None or nlp is None:
        load_model_and_components()

    messages = generate_prompt(
        prompt_text=phrase,
        args=args,
        zh_en_translator=zh_en_translator,
        nlp=nlp,
        model=model,
        tokenizer=tokenizer,
        distributed_state=distributed_state,
    )

    return jsonify(messages)
95
+
96
if __name__ == '__main__':
    # Load every component up front so the first request does not pay for it.
    load_model_and_components()
    # Listen on all interfaces, port 10008, with the Flask debugger off.
    app.run(debug=False, host='0.0.0.0', port=10008)
app_test.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+
4
# Manual smoke test: POST one phrase to a locally running app.py and print the reply.
url = 'http://localhost:10008/chat'

data = {
    'phrase': 'a spiece 和一只狼'
}

# Generation can be slow, so allow a generous wait, but never block forever
# on a server that has hung or is not running.
response = requests.post(url, json=data, timeout=300)
# Surface HTTP errors directly instead of failing later while decoding an
# error page as JSON.
response.raise_for_status()

response_data = response.json()

print(json.dumps(response_data, indent=4))
gpt2_generation.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ import inspect
4
+ import logging
5
+ import nltk
6
+ from typing import Tuple
7
+
8
+ import torch
9
+
10
+ from transformers import (
11
+ AutoTokenizer,
12
+ BloomForCausalLM,
13
+ BloomTokenizerFast,
14
+ CTRLLMHeadModel,
15
+ CTRLTokenizer,
16
+ GenerationMixin,
17
+ GPT2LMHeadModel,
18
+ GPT2Tokenizer,
19
+ GPTJForCausalLM,
20
+ LlamaForCausalLM,
21
+ LlamaTokenizer,
22
+ OpenAIGPTLMHeadModel,
23
+ OpenAIGPTTokenizer,
24
+ OPTForCausalLM,
25
+ TransfoXLLMHeadModel,
26
+ TransfoXLTokenizer,
27
+ XLMTokenizer,
28
+ XLMWithLMHeadModel,
29
+ XLNetLMHeadModel,
30
+ XLNetTokenizer,
31
+ AutoModelForSeq2SeqLM,
32
+ )
33
+ from transformers.modeling_outputs import CausalLMOutputWithPast
34
+ from forbidden import FORBIDDEN_NOUN
35
+
36
# Configure logging at import time: INFO level, timestamped records.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Hardcoded max length to avoid infinite loop
MAX_LENGTH = 10000

# Maps the "model_type" setting to its (model class, tokenizer class) pair.
MODEL_CLASSES = {
    "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
    "ctrl": (CTRLLMHeadModel, CTRLTokenizer),
    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "xlnet": (XLNetLMHeadModel, XLNetTokenizer),
    "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer),
    "xlm": (XLMWithLMHeadModel, XLMTokenizer),
    "gptj": (GPTJForCausalLM, AutoTokenizer),
    "bloom": (BloomForCausalLM, BloomTokenizerFast),
    "llama": (LlamaForCausalLM, LlamaTokenizer),
    "opt": (OPTForCausalLM, GPT2Tokenizer),
}


# Rebind the imported collection as a set.
FORBIDDEN_NOUN = set(FORBIDDEN_NOUN)
58
+
59
class Translator:
    """Callable wrapper around a pretrained seq2seq model and its tokenizer."""

    def __init__(self, model_name):
        """Load the tokenizer and seq2seq model identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def translate(self, text):
        """Tokenize ``text``, run generation and return the decoded first output."""
        encoded = self.tokenizer(text, return_tensors="pt", padding=True)
        generated = self.model.generate(**encoded)
        # Only the first generated sequence is decoded.
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

    def __call__(self, text):
        """Alias for :meth:`translate` so an instance can be used as a function."""
        return self.translate(text)
72
+
73
+ #
74
+ # Functions to prepare models' input
75
+ #
76
def prepare_ctrl_input(args, _, tokenizer, prompt_text):
    """Sanity-check a CTRL prompt and return it unchanged.

    Both checks previously ended in ``pass``; they now log an advisory
    message instead. An empty prompt is treated as "does not start with a
    control code" rather than raising IndexError.

    Args:
        args: Settings dict; only ``args["temperature"]`` is read.
        _: Unused (the model slot of the shared preprocessing signature).
        tokenizer: Must provide ``encode`` and a ``control_codes`` mapping.
        prompt_text: The prompt to check.

    Returns:
        ``prompt_text``, unmodified.
    """
    log = logging.getLogger(__name__)

    if args["temperature"] > 0.7:
        log.info("CTRL typically works better with lower temperatures (and lower top_k).")

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False)
    control_codes = tokenizer.control_codes.values()
    # An empty encoding has no first token to compare against.
    starts_with_control_code = bool(encoded_prompt) and any(
        encoded_prompt[0] == code for code in control_codes
    )
    if not starts_with_control_code:
        log.info("WARNING! You are not starting your generation from a control code so you won't get good results")
    return prompt_text
84
+
85
+
86
def prepare_xlm_input(args, model, tokenizer, prompt_text):
    """Select the XLM language id on ``model.config`` and return the prompt unchanged.

    Nothing is set unless the config has a truthy ``use_lang_emb`` and a
    ``lang2id`` mapping. If ``args["xlm_language"]`` is not one of the
    mapping's keys, the user is asked on stdin until a valid key is entered.
    """
    config = model.config
    language_embeddings_enabled = hasattr(config, "use_lang_emb") and config.use_lang_emb
    if not (hasattr(config, "lang2id") and language_embeddings_enabled):
        return prompt_text

    known_languages = config.lang2id.keys()
    chosen = args["xlm_language"]
    if chosen not in known_languages:
        chosen = None
        # Interactive fallback: blocks until a known language is typed.
        while chosen not in known_languages:
            chosen = input("Using XLM. Select language in " + str(list(known_languages)) + " >>> ")

    config.lang_id = config.lang2id[chosen]
    return prompt_text
104
+
105
+
106
def prepare_xlnet_input(args, _, tokenizer, prompt_text):
    """Prepend ``args["prefix"]``, else ``args["padding_text"]``, else nothing, to the prompt."""
    lead_in = args["prefix"] or args["padding_text"] or ""
    return lead_in + prompt_text
110
+
111
+
112
def prepare_transfoxl_input(args, _, tokenizer, prompt_text):
    """Prepend ``args["prefix"]``, else ``args["padding_text"]``, else nothing, to the prompt."""
    lead_in = args["prefix"] or args["padding_text"] or ""
    return lead_in + prompt_text
116
+
117
+
118
# Model types that need their prompt prepared before encoding, keyed by the
# "model_type" setting. Each callable takes (args, model, tokenizer, prompt_text).
PREPROCESSING_FUNCTIONS = {
    "ctrl": prepare_ctrl_input,
    "xlm": prepare_xlm_input,
    "xlnet": prepare_xlnet_input,
    "transfo-xl": prepare_transfoxl_input,
}
124
+
125
+
126
def adjust_length_to_model(length, max_sequence_length):
    """Clamp the requested generation length to what the model supports.

    A negative ``length`` means "as long as possible": the model limit when it
    is positive, otherwise ``MAX_LENGTH``. A non-negative ``length`` is capped
    at the model limit when that limit is positive, and returned as-is otherwise.
    """
    model_has_limit = max_sequence_length > 0
    if length < 0:
        # avoid infinite loop when the model reports no limit
        return max_sequence_length if model_has_limit else MAX_LENGTH
    if model_has_limit and max_sequence_length < length:
        # No generation bigger than model size
        return max_sequence_length
    return length
134
+
135
+
136
def sparse_model_config(model_config):
    """Extract (num_layer, num_head, per-head embedding size) from a model config.

    Each quantity is read from the first attribute that exists, in order:
    embedding size from ``hidden_size``, ``n_embed``, ``n_embd``; head count
    from ``num_attention_heads``, ``n_head``; layer count from ``n_layer``,
    ``num_hidden_layers``.

    Raises:
        ValueError: If the embedding size or head count is missing or None,
            the head count is 0, or no layer-count attribute exists.
    """

    def _first_present(names):
        # Returns (found, value) for the first attribute name the config has.
        for name in names:
            if hasattr(model_config, name):
                return True, getattr(model_config, name)
        return False, None

    _, embedding_size = _first_present(("hidden_size", "n_embed", "n_embd"))
    _, num_head = _first_present(("num_attention_heads", "n_head"))

    if embedding_size is None or num_head is None or num_head == 0:
        raise ValueError("Check the model config")

    num_embedding_size_per_head = int(embedding_size / num_head)

    has_layers, num_layer = _first_present(("n_layer", "num_hidden_layers"))
    if not has_layers:
        raise ValueError("Number of hidden layers couldn't be determined from the model config")

    return num_layer, num_head, num_embedding_size_per_head
163
+
164
+
165
def generate_past_key_values(model, batch_size, seq_len):
    """Build an uninitialized per-layer (key, value) tensor tuple for ``model``.

    For ``model_type == "bloom"`` the batch and head dimensions are merged and
    the key is laid out as (heads * batch, head_dim, seq_len) with the value as
    (heads * batch, seq_len, head_dim). Every other model type uses
    (batch, heads, seq_len, head_dim) for both. Tensors are created with
    ``torch.empty`` and converted to the model's dtype and device.
    """
    layer_count, head_count, head_dim = sparse_model_config(model.config)

    if model.config.model_type == "bloom":
        merged = int(head_count * batch_size)
        key_shape = (merged, head_dim, seq_len)
        value_shape = (merged, seq_len, head_dim)
    else:
        key_shape = value_shape = (batch_size, head_count, seq_len, head_dim)

    def _blank(shape):
        return torch.empty(*shape).to(model.dtype).to(model.device)

    return tuple((_blank(key_shape), _blank(value_shape)) for _ in range(layer_count))
192
+
193
+
194
def prepare_jit_inputs(inputs, model, tokenizer):
    """Encode ``inputs`` into the example batch used for JIT tracing.

    The batch is moved to the model's device. When ``model.config.use_cache``
    is set, a ``past_key_values`` entry of sequence length 1 is added. The
    attention mask always gets one extra zero column prepended.
    """
    encoded = tokenizer.batch_encode_plus(inputs, return_tensors="pt").to(model.device)

    if model.config.use_cache:
        encoded["past_key_values"] = generate_past_key_values(model, len(inputs), 1)

    mask = encoded["attention_mask"]
    leading_zeros = torch.zeros(mask.shape[0], 1).to(mask.dtype).to(model.device)
    encoded["attention_mask"] = torch.cat([leading_zeros, mask], -1)
    return encoded
210
+
211
+
212
class _ModelFallbackWrapper(GenerationMixin):
    """Run forward passes through a traced module while delegating the rest to the original model.

    ``_optimized`` is called for the forward pass; any attribute not found on
    the wrapper is looked up on ``_default``.
    """

    __slots__ = ("_optimized", "_default")

    def __init__(self, optimized, default):
        self._optimized = optimized
        self._default = default

    def __call__(self, *args, **kwargs):
        """Call the optimized module with the keyword inputs and wrap its output.

        Positional arguments are accepted but not forwarded. A missing or None
        ``past_key_values`` is replaced by an empty (sequence length 0) cache
        when the default model's config enables ``use_cache``.
        """
        # .get() so a call without the key takes the same path as an explicit None.
        if kwargs.get("past_key_values") is None and self._default.config.use_cache:
            kwargs["past_key_values"] = generate_past_key_values(self._default, kwargs["input_ids"].shape[0], 0)
        kwargs.pop("position_ids", None)
        # None and bool values are dropped before calling the optimized module.
        forwarded = {
            key: value
            for key, value in kwargs.items()
            if value is not None and not isinstance(value, bool)
        }
        outputs = self._optimized(**forwarded)
        # The optimized module's output is indexed positionally: logits, then cache.
        return CausalLMOutputWithPast(
            loss=None,
            logits=outputs[0],
            past_key_values=outputs[1],
            hidden_states=None,
            attentions=None,
        )

    def __getattr__(self, item):
        # Only reached when normal lookup fails; defer to the wrapped model.
        return getattr(self._default, item)

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, inputs_embeds=None, use_cache=None, **kwargs
    ):
        """Delegate input preparation to the default model."""
        return self._default.prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, **kwargs
        )

    def _reorder_cache(
        self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return self._default._reorder_cache(past_key_values, beam_idx)
257
+
258
+
259
def remove_tokens_before_copula(text):
    """Strip everything up to the last "is"/"are"/"am" in each comma-separated clause.

    The text before the first comma is kept verbatim. Every later clause is
    tokenized with ``nltk.word_tokenize``; tokens up to and including the last
    copula (case-insensitive) are dropped, and the remaining tokens are
    re-joined with single spaces. Clauses are re-joined with ",".
    """
    copulas = ("is", "are", "am")
    head, *clauses = text.split(",")

    pieces = [head]
    for clause in clauses:
        words = nltk.word_tokenize(clause)
        keep_from = 0
        for position, word in enumerate(words):
            if word.lower() in copulas:
                # Later matches overwrite earlier ones, so the last copula wins.
                keep_from = position + 1
        pieces.append(" ".join(words[keep_from:]))

    return ",".join(pieces)
277
+
278
+
279
def generate_prompt(
    prompt_text,
    args,
    zh_en_translator,
    nlp,
    model,
    tokenizer,
    distributed_state,
):
    """Translate a prompt and extend it with sampled text from ``model``.

    Only a single input prompt is supported.

    Args:
        prompt_text: Raw prompt; it is passed through ``zh_en_translator`` first.
        args: Settings dict. Keys read here: "length", "model_type", "prefix",
            "padding_text", "jit", "num_return_sequences", "length_penalty",
            "temperature", "k", "p", "repetition_penalty", "stop_token".
            The dict is not modified by this function itself.
        zh_en_translator: Callable that maps the prompt to its translation.
        nlp: Accepted for interface compatibility; not used in this function.
        model: Model exposing ``config`` and ``generate``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``pad_token_id``.
        distributed_state: Object whose ``device`` receives the encoded prompt.

    Returns:
        A list with ``args["num_return_sequences"]`` strings, each being the
        translated prompt plus its continuation, post-processed by
        ``remove_tokens_before_copula``.
    """
    max_seq_length = getattr(model.config, "max_position_embeddings", 0)
    # Kept local so the caller's shared settings dict is not rewritten per call.
    length = adjust_length_to_model(args["length"], max_sequence_length=max_seq_length)

    prompt_text = zh_en_translator(prompt_text)

    # Different models need different input formatting and/or extra arguments
    prepare_input = PREPROCESSING_FUNCTIONS.get(args["model_type"])
    if prepare_input is not None:
        preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text)

        if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
            tokenizer_kwargs = {"add_space_before_punct_symbol": True}
        else:
            tokenizer_kwargs = {}

        encoded_prompt = tokenizer.encode(
            preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs
        )
    else:
        prefix = args["prefix"] if args["prefix"] else args["padding_text"]
        encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(distributed_state.device)

    # An empty prompt is passed to generate() as "no input ids".
    input_ids = None if encoded_prompt.size()[-1] == 0 else encoded_prompt

    if args["jit"]:
        jit_input_texts = ["enable jit"]
        jit_inputs = prepare_jit_inputs(jit_input_texts, model, tokenizer)
        torch._C._jit_set_texpr_fuser_enabled(False)
        model.config.return_dict = False
        if hasattr(model, "forward"):
            sig = inspect.signature(model.forward)
        else:
            sig = inspect.signature(model.__call__)
        # Order the example inputs by the forward signature, skipping absent ones.
        jit_inputs = tuple(jit_inputs[key] for key in sig.parameters if jit_inputs.get(key, None) is not None)
        traced_model = torch.jit.trace(model, jit_inputs, strict=False)
        traced_model = torch.jit.freeze(traced_model.eval())
        # The traced module is invoked twice before being used for generation.
        traced_model(*jit_inputs)
        traced_model(*jit_inputs)

        model = _ModelFallbackWrapper(traced_model, model)

    # Loop-invariant values, computed once instead of once per returned sequence.
    prompt_token_count = len(encoded_prompt[0])
    decoded_prompt_length = len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True))
    stop_token = args["stop_token"]

    generated_sequences = []

    for _ in range(args["num_return_sequences"]):
        generated_sequence = model.generate(
            input_ids=input_ids,
            length_penalty=args["length_penalty"],
            max_length=length + prompt_token_count,
            temperature=args["temperature"],
            top_k=args["k"],
            top_p=args["p"],
            repetition_penalty=args["repetition_penalty"],
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
        )
        # Remove the n_sequence dimension when returning single sequence.
        # Indexing (rather than squeeze_) keeps a length-1 sequence as a list.
        if generated_sequence.dim() > 1:
            generated_sequence = generated_sequence[0]

        # Decode text
        text = tokenizer.decode(generated_sequence.tolist(), clean_up_tokenization_spaces=True)

        # Remove all text after the stop token. Cut only when it actually
        # occurs: find() returns -1 otherwise, which would drop the last character.
        if stop_token:
            stop_at = text.find(stop_token)
            if stop_at != -1:
                text = text[:stop_at]

        # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
        total_sequence = prompt_text + text[decoded_prompt_length:]

        generated_sequences.append(remove_tokens_before_copula(total_sequence))

    return generated_sequences
376
+
377
+
378
if __name__ == "__main__":
    # generate_prompt() has only required parameters, so the previous bare
    # call always failed with a TypeError. Exit with an explanatory message.
    raise SystemExit(
        "gpt2_generation.py is a library module: import generate_prompt and call it with "
        "prompt_text, args, zh_en_translator, nlp, model, tokenizer and distributed_state."
    )
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl_py==2.0.0
2
+ accelerate==0.24.1
3
+ datasets==2.12.0
4
+ evaluate==0.4.1
5
+ Flask==3.0.0
6
+ nltk==3.8.1
7
+ numpy==1.24.4
8
+ pandas==1.5.3
9
+ Requests==2.31.0
10
+ rouge_score==0.1.2
11
+ six==1.16.0
12
+ spacy==3.7.2
13
+ torch==2.1.0
14
+ tqdm==4.65.0
15
+ transformers==4.36.1
utils.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+
5
def get_tok_and_model(path_for_model):
    """Load a left-padding tokenizer and a causal LM from a local path.

    Args:
        path_for_model: Filesystem path of the saved model; it must exist.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        RuntimeError: If ``path_for_model`` does not exist.
    """
    if not os.path.exists(path_for_model):
        raise RuntimeError("no cached model.")
    tok = AutoTokenizer.from_pretrained(path_for_model, padding_side='left')
    # default for open-ended generation: pad with the tokenizer's own EOS id,
    # keeping the previously hard-coded 50256 only when no EOS id is defined.
    eos_id = tok.eos_token_id
    tok.pad_token_id = eos_id if eos_id is not None else 50256
    model = AutoModelForCausalLM.from_pretrained(path_for_model)
    return tok, model