pseudotensor committed
Commit 1c674f6 · 1 Parent(s): 8c85f5b

Update with h2oGPT hash 7c5db3692798dba31c4c415429e1ca06e12dd480

Files changed (4)
  1. app.py +615 -307
  2. client_test.py +22 -50
  3. finetune.py +2 -2
  4. utils.py +11 -8
app.py CHANGED
@@ -31,7 +31,10 @@ is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
31
  is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
32
  is_low_mem = is_hf # assumes run on 24GB consumer GPU
33
  admin_pass = os.getenv("ADMIN_PASS")
 
 
34
 
 
35
 
36
  def main(
37
  load_8bit: bool = False,
@@ -40,7 +43,7 @@ def main(
40
  base_model: str = '',
41
  tokenizer_base_model: str = '',
42
  lora_weights: str = "",
43
- force_1_gpu: bool = True,
44
 
45
  prompt_type: Union[int, str] = None,
46
  # input to generation
@@ -142,11 +145,12 @@ def main(
142
  if not gradio:
143
  if eval_sharegpt_prompts_only > 0:
144
  # override default examples with shareGPT ones for human-level eval purposes only
145
- filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
146
- if not os.path.isfile(filename):
147
- os.system('wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename)
 
148
  import json
149
- data = json.load(open(filename, 'rt'))
150
  # focus on data that starts with human, else likely chopped from other data
151
  turn_start = 0 # odd in general
152
  data = [x for x in data if len(x['conversations']) > turn_start + 1 and
@@ -162,12 +166,29 @@ def main(
162
  assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
163
  output = data[i]['conversations'][turn_start + 1]['value']
164
  examplenew = example1.copy()
165
- examplenew[0] = instruction
166
- examplenew[1] = '' # no input
167
- examplenew[2] = '' # no context
 
168
  examples.append(examplenew)
169
  responses.append(output)
170
171
  with torch.device("cuda"):
172
  # ensure was set right above before examples generated
173
  assert not stream_output, "stream_output=True does not make sense with example loop"
@@ -180,7 +201,7 @@ def main(
180
  if not eval_sharegpt_as_output:
181
  model, tokenizer, device = get_model(**locals())
182
  model_state = [model, tokenizer, device, base_model]
183
- fun = partial(evaluate, model_state, debug=debug, chat=chat, save_dir=save_dir)
184
  else:
185
  assert eval_sharegpt_prompts_only > 0
186
 
@@ -191,15 +212,17 @@ def main(
191
  fun = get_response
192
  t0 = time.time()
193
  score_dump = []
194
- num_examples = len(examples)
195
 
196
  import matplotlib.pyplot as plt
197
 
198
  for exi, ex in enumerate(examples):
199
  clear_torch_cache()
200
  print("")
201
  print("START" + "=" * 100)
202
- print("Question: %s %s" % (ex[0], ('input=%s' % ex[1] if ex[1] else '')))
203
  print("-" * 105)
204
  # fun yields as generator, so have to iterate over it
205
  # Also means likely do NOT want --stream_output=True, else would show all generations
@@ -208,14 +231,14 @@ def main(
208
  if smodel:
209
  score_with_prompt = False
210
  if score_with_prompt:
211
- data_point = dict(instruction=ex[0], input=ex[1])
212
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
213
  prompt = prompter.generate_prompt(data_point)
214
  else:
215
  # just raw input and output
216
- assert ex[1] in [None, ''] # should be no iinput
217
- assert ex[2] in [None, ''] # should be no context
218
- prompt = ex[0]
219
  cutoff_len = 768 if is_low_mem else 2048
220
  inputs = stokenizer(prompt, res,
221
  return_tensors="pt",
@@ -228,10 +251,11 @@ def main(
228
  traceback.print_exc()
229
  score = 0.0
230
  clear_torch_cache()
231
- except RuntimeError as e:
232
  if 'Expected all tensors to be on the same device' in str(e) or \
233
  'expected scalar type Half but found Float' in str(e) or \
234
- 'probability tensor contains either' in str(e):
 
235
  print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
236
  flush=True)
237
  traceback.print_exc()
@@ -242,29 +266,16 @@ def main(
242
  print("SCORE %s: %s" % (exi, score), flush=True)
243
  score_dump.append(ex + [prompt, res, score])
244
  # dump every score in case abort
245
- scoring_path = 'scoring'
246
- os.makedirs(scoring_path, exist_ok=True)
247
- if eval_sharegpt_as_output:
248
- used_base_model = 'gpt35'
249
- used_lora_weights = ''
250
- else:
251
- used_base_model = str(base_model.split('/')[-1])
252
- used_lora_weights = str(lora_weights.split('/')[-1])
253
- df_scores = pd.DataFrame(score_dump, columns=eval_func_param_names + ['prompt', 'response', 'score'])
254
- filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
255
- eval_sharegpt_prompts_only_seed,
256
- eval_sharegpt_as_output,
257
- used_base_model,
258
- used_lora_weights)
259
- filename = os.path.join(scoring_path, filename)
260
- df_scores.to_parquet(filename, index=False)
261
  # plot histogram so far
262
  plt.figure(figsize=(10, 10))
263
  plt.hist(df_scores['score'], bins=20)
264
  score_avg = np.mean(df_scores['score'])
265
  score_median = np.median(df_scores['score'])
266
  plt.title("Score avg: %s median: %s" % (score_avg, score_median))
267
- plt.savefig(filename.replace('.parquet', '.png'))
268
  plt.close()
269
 
270
  print("END" + "=" * 102)
@@ -273,7 +284,8 @@ def main(
273
  print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
274
  t1 = time.time()
275
  print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
276
- return
 
277
  if gradio:
278
  go_gradio(**locals())
279
 
@@ -287,7 +299,9 @@ def get_device():
287
  return device
288
 
289
 
290
- def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type, force_1_gpu=True, use_auth_token=False):
 
 
291
  """
292
  Ensure model gets on correct device
293
  :param base_model:
@@ -295,6 +309,8 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
295
  :param load_half:
296
  :param model_kwargs:
297
  :param reward_type:
 
 
298
  :return:
299
  """
300
  with init_empty_weights():
@@ -319,14 +335,14 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
319
  device_map.update(device_map_model)
320
  print('device_map: %s' % device_map, flush=True)
321
 
322
- if force_1_gpu:
323
  # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
324
  # So avoid for now, just put on first GPU, unless score_model, put on last
325
  n_gpus = torch.cuda.device_count()
326
  if reward_type:
327
  device_map = {'': n_gpus - 1}
328
  else:
329
- device_map = {'': 0}
330
 
331
  load_in_8bit = model_kwargs.get('load_in_8bit', False)
332
  model_kwargs['device_map'] = device_map
@@ -351,7 +367,7 @@ def get_model(
351
  base_model: str = '',
352
  tokenizer_base_model: str = '',
353
  lora_weights: str = "",
354
- force_1_gpu: bool = False,
355
 
356
  llama_type: bool = None,
357
  reward_type: bool = None,
@@ -371,7 +387,7 @@ def get_model(
371
  :param base_model: name/path of base model
372
  :param tokenizer_base_model: name/path of tokenizer
373
  :param lora_weights: name/path
374
- :param force_1_gpu:
375
  :param llama_type: whether LLaMa type model
376
  :param reward_type: reward type model for sequence classification
377
  :param local_files_only: use local files instead of from HF
@@ -432,7 +448,7 @@ def get_model(
432
  with torch.device("cuda"):
433
  if infer_devices:
434
  model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
435
- force_1_gpu=force_1_gpu, use_auth_token=use_auth_token)
436
  else:
437
  if load_half and not load_8bit:
438
  model = model_loader.from_pretrained(
@@ -511,7 +527,6 @@ def get_score_model(**kwargs):
511
 
512
 
513
  def go_gradio(**kwargs):
514
-
515
  # get default model
516
  all_kwargs = kwargs.copy()
517
  all_kwargs.update(locals())
@@ -526,11 +541,10 @@ def go_gradio(**kwargs):
526
  smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
527
 
528
  if 'mbart-' in kwargs['model_lower']:
529
- instruction_label = "Text to translate"
530
  else:
531
- instruction_label = "Instruction"
532
- if kwargs['chat']:
533
- instruction_label = "You (Shift-Enter or push Submit to send message)"
534
 
535
  title = 'h2oGPT'
536
  if kwargs['verbose']:
@@ -542,9 +556,9 @@ def go_gradio(**kwargs):
542
  else:
543
  description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
544
  if is_public:
545
- description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
546
  if kwargs['load_8bit']:
547
- description += """<i><li> Model is loaded in 8-bit, model loading-unloading is disabled, and other limitations exist in order to fit on GPUs with lower amounts of VRAM, so UX can be worse than non-hosted version.</i></li>"""
548
  description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
549
  description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
550
 
@@ -630,6 +644,7 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
630
  return chat_message
631
  else:
632
  raise ValueError(f"Invalid message for Chatbot component: {chat_message}")
 
633
  Chatbot._postprocess_chat_messages = _postprocess_chat_messages
634
 
635
  demo = gr.Blocks(theme=gr.themes.Soft(**colors_dict), css=css_code, title="h2oGPT", analytics_enabled=False)
@@ -645,14 +660,32 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
645
  lora_options = [kwargs['lora_weights'].strip()] + lora_options
646
  # always add in no lora case
647
  # add fake space so doesn't go away in gradio dropdown
648
- lora_options = [' '] + kwargs['extra_lora_options']
649
 
650
- output_label0 = f'h2oGPT [Model: {kwargs.get("base_model")}]' if kwargs.get('base_model') else 'h2oGPT [ !!! Please Load Model in Models Tab !!! ]'
651
 
652
  with demo:
653
  # avoid actual model/tokenizer here or anything that would be bad to deepcopy
654
  # https://github.com/gradio-app/gradio/issues/3558
655
  model_state = gr.State(['model', 'tokenizer', device, kwargs['base_model']])
 
656
  model_options_state = gr.State([model_options])
657
  lora_options_state = gr.State([lora_options])
658
  gr.Markdown(
@@ -663,57 +696,69 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
663
  {task_info_md}
664
  """)
665
  if is_hf:
666
- gr.HTML('''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
 
667
 
668
  # go button visible if
669
- base_wanted = bool(kwargs['base_model']) and kwargs['login_mode_if_model0']
670
  go_btn = gr.Button(value="ENTER", visible=base_wanted, variant="primary")
671
  normal_block = gr.Row(visible=not base_wanted)
672
  with normal_block:
673
  with gr.Tabs():
674
  with gr.Row():
675
- if not kwargs['chat']:
676
- with gr.Column():
677
- instruction = gr.Textbox(
678
- lines=4, label=instruction_label,
679
- placeholder=kwargs['placeholder_instruction'],
680
- )
681
- iinput = gr.Textbox(lines=4, label="Input",
682
- placeholder=kwargs['placeholder_input'])
683
  flag_btn = gr.Button("Flag")
684
  if kwargs['score_model']:
685
- if not kwargs['auto_score']:
686
  with gr.Column():
687
- score_btn = gr.Button("Score last prompt & response")
688
- score_text = gr.Textbox("Response Score: NA", show_label=False)
689
  else:
690
  score_text = gr.Textbox("Response Score: NA", show_label=False)
691
- with gr.Column():
692
- if kwargs['chat']:
693
- text_output = gr.Chatbot(label=output_label0).style(height=kwargs['height'] or 400)
694
- with gr.Row():
695
- with gr.Column(scale=50):
696
- instruction = gr.Textbox(
697
- lines=4, label=instruction_label,
698
- placeholder=kwargs['placeholder_instruction'],
699
- )
700
- with gr.Row(): # .style(equal_height=False, equal_width=False):
701
- submit = gr.Button(value='Submit').style(full_width=False, size='sm')
702
- stop_btn = gr.Button(value="Stop").style(full_width=False, size='sm')
703
- with gr.Row():
704
- clear = gr.Button("New Conversation")
705
- flag_btn = gr.Button("Flag")
706
- if kwargs['score_model']:
707
- if not kwargs['auto_score']:
708
- with gr.Column():
709
- score_btn = gr.Button("Score last prompt & response").style(full_width=False, size='sm')
710
- score_text = gr.Textbox("Response Score: NA", show_label=False)
711
- else:
712
- score_text = gr.Textbox("Response Score: NA", show_label=False)
713
- retry = gr.Button("Regenerate")
714
- undo = gr.Button("Undo")
715
- else:
716
- text_output = gr.Textbox(lines=5, label=output_label0)
717
  with gr.TabItem("Input/Output"):
718
  with gr.Row():
719
  if 'mbart-' in kwargs['model_lower']:
@@ -731,7 +776,12 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
731
  prompt_type = gr.Dropdown(prompt_types_strings,
732
  value=kwargs['prompt_type'], label="Prompt Type",
733
  visible=not is_public)
734
- temperature = gr.Slider(minimum=0, maximum=3,
735
  value=kwargs['temperature'],
736
  label="Temperature",
737
  info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
@@ -770,30 +820,45 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
770
  value=kwargs['num_return_sequences'],
771
  label="Number Returns", info="Must be <= num_beams",
772
  visible=not is_public)
773
- do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
774
- value=kwargs['do_sample'])
775
- if kwargs['chat']:
776
- iinput = gr.Textbox(lines=4, label="Input",
777
- placeholder=kwargs['placeholder_input'],
778
- visible=not is_public)
779
- # nominally empty for chat mode
780
- context = gr.Textbox(lines=1, label="Context",
781
- info="Ignored in chat mode.",
782
- visible=not is_public)
783
 
784
  with gr.TabItem("Models"):
 
785
  with gr.Row():
 
 
786
  with gr.Column():
787
  with gr.Row(scale=1):
788
  with gr.Column(scale=50):
789
- model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model", value=kwargs['base_model'])
790
- lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
 
 
791
  with gr.Column(scale=1):
792
- load_msg = "Load Model/LORA" if not is_public \
793
- else "LOAD DISABLED FOR HOSTED DEMO"
794
  load_model_button = gr.Button(load_msg)
795
  model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
796
- lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
 
797
  with gr.Row(scale=1):
798
  with gr.Column(scale=50):
799
  new_model = gr.Textbox(label="New Model HF name/path")
@@ -801,6 +866,30 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
801
  with gr.Column(scale=1):
802
  add_model_button = gr.Button("Add new model name")
803
  add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
804
  with gr.TabItem("System"):
805
  system_row = gr.Row(visible=not is_public)
806
  admin_pass_textbox = gr.Textbox(label="Admin Password", type='password', visible=is_public)
@@ -830,6 +919,9 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
830
  kwargs_evaluate = {k: v for k, v in all_kwargs.items() if k in inputs_kwargs_list}
831
  fun = partial(evaluate,
832
  **kwargs_evaluate)
833
 
834
  dark_mode_btn = gr.Button("Dark Mode", variant="primary").style(
835
  size="sm",
@@ -847,193 +939,320 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
847
  }""",
848
  api_name="dark",
849
  )
850
- if not kwargs['chat']:
851
- submit = gr.Button("Submit")
852
- submit_event = submit.click(fun, inputs=[model_state] + inputs_list, outputs=text_output, api_name='submit')
853
 
854
  # examples after submit or any other buttons for chat or no chat
855
  if kwargs['examples'] is not None and kwargs['show_examples']:
856
  gr.Examples(examples=kwargs['examples'], inputs=inputs_list)
857
 
858
  # Score
859
- def score_last_response(*args):
860
  """ Similar to user() """
861
  args_list = list(args)
862
- history = args_list[-1]
863
- if history is None:
864
- print("Bad history in scoring last response, fix for now", flush=True)
865
- history = []
866
- if smodel is not None and \
867
- stokenizer is not None and \
868
- sdevice is not None and \
869
- history is not None and len(history) > 0 and \
870
- history[-1] is not None and \
871
- len(history[-1]) >= 2:
872
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
873
-
874
- max_length_tokenize = 512 if is_low_mem else 2048
875
- cutoff_len = max_length_tokenize*4 # restrict deberta related to max for LLM
876
-
877
- question = history[-1][0]
878
- question = question[-cutoff_len:]
879
-
880
- answer = history[-1][1]
881
- answer = answer[-cutoff_len:]
882
-
883
- inputs = stokenizer(question, answer,
884
- return_tensors="pt",
885
- truncation=True,
886
- max_length=max_length_tokenize).to(smodel.device)
887
- try:
888
- score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
889
- except torch.cuda.OutOfMemoryError as e:
890
- print("GPU OOM: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
891
- del inputs
892
  traceback.print_exc()
893
  clear_torch_cache()
894
- return 'Response Score: GPU OOM'
895
- except RuntimeError as e:
896
- if 'Expected all tensors to be on the same device' in str(e) or \
897
- 'expected scalar type Half but found Float' in str(e) or \
898
- 'probability tensor contains either' in str(e):
899
- print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
900
- traceback.print_exc()
901
- clear_torch_cache()
902
- return 'Response Score: GPU Error'
903
- else:
904
- raise
905
- os.environ['TOKENIZERS_PARALLELISM'] = 'true'
906
- return 'Response Score: {:.1%}'.format(score)
907
- else:
908
- return 'Response Score: NA'
909
 
910
  if kwargs['score_model']:
911
  score_args = dict(fn=score_last_response,
912
  inputs=inputs_list + [text_output],
913
  outputs=[score_text],
914
  )
915
  if not kwargs['auto_score']:
916
- score_event = score_btn.click(**score_args, queue=stream_output, api_name='score')
917
-
918
- if kwargs['chat']:
919
- def user(*args, undo=False, sanitize_user_prompt=True):
920
- args_list = list(args)
921
- user_message = args_list[0]
922
- input1 = args_list[1]
923
- context1 = args_list[2]
924
- if input1 and not user_message.endswith(':'):
925
- user_message1 = user_message + ":" + input1
926
- elif input1:
927
- user_message1 = user_message + input1
928
- else:
929
- user_message1 = user_message
930
- if sanitize_user_prompt:
931
- from better_profanity import profanity
932
- user_message1 = profanity.censor(user_message1)
933
 
934
- history = args_list[-1]
935
- if undo and history:
936
- history.pop()
937
- args_list = args_list[:-1]
938
- if history is None:
 
 
939
  print("Bad history, fix for now", flush=True)
940
- history = []
941
- if undo:
942
- return "", history
943
- else:
944
- return "", history + [[user_message1, None]]
945
-
946
- def bot(*args, retry=False):
947
- args_list = list(args)
948
- history = args_list[-1]
949
- if retry and history:
950
- history.pop()
951
- if not history:
952
- print("No history", flush=True)
953
- return
954
- instruction1 = history[-1][0]
955
  context1 = ''
956
- if kwargs['chat_history'] > 0:
957
- prompt_type1 = args_list[prompt_type_arg_id]
958
- context1 = ''
959
- for histi in range(len(history) - 1):
960
- data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
961
- context1 += generate_prompt(data_point, prompt_type1, kwargs['chat'], reduced=True)[0].replace(
962
- '<br>', '\n')
963
- if not context1.endswith('\n'):
964
- context1 += '\n'
965
- if context1 and not context1.endswith('\n'):
966
- context1 += '\n' # ensure if terminates abruptly, then human continues on next line
967
- args_list[0] = instruction1
968
- # only include desired chat history
969
- args_list[2] = context1[-kwargs['chat_history']:]
970
- model_state1 = args_list[-2]
971
- args_list = args_list[:-2]
972
- fun1 = partial(evaluate,
973
- model_state1,
974
- **kwargs_evaluate)
975
- try:
976
- for output in fun1(*tuple(args_list)):
977
- bot_message = output
978
- history[-1][1] = bot_message
979
- yield history
980
- except StopIteration:
981
  yield history
982
- except RuntimeError as e:
983
- if "generator raised StopIteration" in str(e):
984
- # assume last entry was bad, undo
985
- history.pop()
986
- yield history
987
- raise
988
- except Exception as e:
989
- # put error into user input
990
- history[-1][0] = "Exception: %s" % str(e)
991
  yield history
992
- raise
993
- return
994
 
995
- user_args = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt']),
996
- inputs=inputs_list + [text_output],
997
- outputs=[instruction, text_output],
998
- )
999
- bot_args = dict(fn=bot,
1000
- inputs=inputs_list + [model_state] + [text_output],
1001
- outputs=[text_output],
1002
- )
1003
- retry_bot_args = dict(fn=functools.partial(bot, retry=True),
1004
- inputs=inputs_list + [model_state] + [text_output],
1005
- outputs=[text_output],
1006
- )
1007
- undo_user_args = dict(fn=functools.partial(user, undo=True),
1008
- inputs=inputs_list + [text_output],
1009
- outputs=[instruction, text_output],
1010
- )
1011
-
1012
- if kwargs['auto_score']:
1013
- submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction').then(
1014
- **bot_args, api_name='instruction_bot',
1015
- ).then(**score_args, api_name='instruction_bot_score').then(clear_torch_cache)
1016
- submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit').then(
1017
- **bot_args, api_name='submit_bot',
1018
- ).then(**score_args, api_name='submit_bot_score').then(clear_torch_cache)
1019
- submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry').then(
1020
- **retry_bot_args, api_name='retry_bot',
1021
- ).then(**score_args, api_name='retry_bot_score').then(clear_torch_cache)
1022
- submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo').then(**score_args, api_name='undo_score')
1023
- else:
1024
- submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction').then(
1025
- **bot_args, api_name='instruction_bot',
1026
- ).then(clear_torch_cache)
1027
- submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit').then(
1028
- **bot_args, api_name='submit_bot',
1029
- ).then(clear_torch_cache)
1030
- submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry').then(
1031
- **retry_bot_args, api_name='retry_bot',
1032
- ).then(clear_torch_cache)
1033
- submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo')
1034
- clear.click(lambda: None, None, text_output, queue=False, api_name='clear')
1035
-
1036
- def load_model(model_name, lora_weights, model_state_old, prompt_type_old):
1037
  # ensure old model removed from GPU memory
1038
  if kwargs['debug']:
1039
  print("Pre-switch pre-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
@@ -1058,23 +1277,35 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
1058
  clear_torch_cache()
1059
  if kwargs['debug']:
1060
  print("Pre-switch post-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1061
- all_kwargs['base_model'] = model_name.strip()
1062
  model_lower = model_name.strip().lower()
1063
  if model_lower in inv_prompt_type_to_model_lower:
1064
  prompt_type1 = inv_prompt_type_to_model_lower[model_lower]
1065
  else:
1066
  prompt_type1 = prompt_type_old
1067
 
1068
- all_kwargs['lora_weights'] = lora_weights.strip()
1069
- model1, tokenizer1, device1 = get_model(**all_kwargs)
1070
  clear_torch_cache()
1071
 
1072
  if kwargs['debug']:
1073
  print("Post-switch GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1074
- return {model_state: [model1, tokenizer1, device1, model_name],
1075
- model_used: model_name,
1076
- lora_used: lora_weights,
1077
- prompt_type: prompt_type1}
1078
 
1079
  def dropdown_prompt_type_list(x):
1080
  return gr.Dropdown.update(value=x)
@@ -1083,54 +1314,92 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
1083
  return gr.Textbox.update(label=f'h2oGPT [Model: {model_used_in}]')
1084
 
1085
  load_model_args = dict(fn=load_model,
1086
- inputs=[model_choice, lora_choice, model_state, prompt_type],
 
1087
  outputs=[model_state, model_used, lora_used, prompt_type])
1088
  prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
1089
  chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
 
1090
  if not is_public:
1091
  load_model_event = load_model_button.click(**load_model_args) \
1092
- .then(**prompt_update_args) \
1093
- .then(**chatbot_update_args) \
1094
- .then(clear_torch_cache)
1095
 
1096
  def dropdown_model_list(list0, x):
1097
  new_state = [list0[0] + [x]]
1098
  new_options = [*new_state[0]]
1099
- return gr.Dropdown.update(value=x, choices=new_options), '', new_state
 
 
1100
 
1101
  add_model_event = add_model_button.click(fn=dropdown_model_list,
1102
  inputs=[model_options_state, new_model],
1103
- outputs=[model_choice, new_model, model_options_state])
1104
 
1105
- def dropdown_lora_list(list0, x):
1106
  new_state = [list0[0] + [x]]
1107
  new_options = [*new_state[0]]
1108
- return gr.Dropdown.update(value=x, choices=new_options), '', new_state
1109
 
1110
  add_lora_event = add_lora_button.click(fn=dropdown_lora_list,
1111
- inputs=[lora_options_state, new_lora],
1112
- outputs=[lora_choice, new_lora, lora_options_state])
1113
 
1114
  go_btn.click(lambda: gr.update(visible=False), None, go_btn, api_name="go") \
1115
  .then(lambda: gr.update(visible=True), None, normal_block) \
1116
  .then(**load_model_args).then(**prompt_update_args)
1117
1118
  # callback for logging flagged input/output
1119
  callback.setup(inputs_list + [text_output], "flagged_data_points")
1120
  flag_btn.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
1121
  api_name='flag')
 
 
1122
 
1123
  def get_system_info():
1124
  return gr.Textbox.update(value=system_info_print())
1125
 
1126
  system_event = system_btn.click(get_system_info, outputs=system_text, api_name='system_info')
1127
 
1128
- if kwargs['chat']:
1129
-
1130
- # don't pass text_output, don't want to clear output, just stop it
1131
- # FIXME: have to click once to stop output and second time to stop GPUs going
1132
- stop_btn.click(lambda: None, None, None, cancels=[submit_event, submit_event2, submit_event3],
1133
- queue=False, api_name='stop').then(clear_torch_cache)
1134
 
1135
  demo.queue(concurrency_count=1)
1136
  favicon_path = "h2o-logo.svg"
@@ -1141,10 +1410,16 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
1141
 
1142
 
1143
  input_args_list = ['model_state']
1144
- inputs_kwargs_list = ['debug', 'chat', 'save_dir', 'hard_stop_list', 'sanitize_bot_response', 'model_state0']
1145
 
1146
 
1147
  def get_inputs_list(inputs_dict, model_lower):
1148
  inputs_list_names = list(inspect.signature(evaluate).parameters)
1149
  inputs_list = []
1150
  for k in inputs_list_names:
@@ -1159,9 +1434,6 @@ def get_inputs_list(inputs_dict, model_lower):
1159
  return inputs_list
1160
 
1161
 
1162
- # index of prompt_type in evaluate function, after model_state
1163
- prompt_type_arg_id = 4
1164
-
1165
  eval_func_param_names = ['instruction',
1166
  'iinput',
1167
  'context',
@@ -1178,6 +1450,9 @@ eval_func_param_names = ['instruction',
1178
  'repetition_penalty',
1179
  'num_return_sequences',
1180
  'do_sample',
1181
  ]
1182
 
1183
 
@@ -1200,12 +1475,14 @@ def evaluate(
1200
  repetition_penalty,
1201
  num_return_sequences,
1202
  do_sample,
1203
  # END NOTE: Examples must have same order of parameters
1204
  src_lang=None,
1205
  tgt_lang=None,
1206
  debug=False,
1207
  save_dir=None,
1208
- chat=False,
1209
  hard_stop_list=None,
1210
  sanitize_bot_response=True,
1211
  model_state0=None,
@@ -1214,10 +1491,15 @@ def evaluate(
1214
  if debug:
1215
  locals_dict = locals().copy()
1216
  locals_dict.pop('model_state', None)
 
1217
  print(locals_dict)
1218
 
1219
  no_model_msg = "Please choose a base model with --base_model (CLI) or in Models Tab (gradio).\nThen start New Conversation"
1220
1221
  if model_state is not None and len(model_state) == 4 and not isinstance(model_state[0], str):
1222
  # try to free-up original model (i.e. list was passed as reference)
1223
  if model_state0 is not None and model_state0[0] is not None:
@@ -1234,10 +1516,18 @@ def evaluate(
1234
  else:
1235
  raise AssertionError(no_model_msg)
1236
1237
  assert base_model.strip(), no_model_msg
1238
  assert model, "Model is missing"
1239
  assert tokenizer, "Tokenizer is missing"
1240
1241
  data_point = dict(context=context, instruction=instruction, input=iinput)
1242
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
1243
  prompt = prompter.generate_prompt(data_point)
@@ -1272,16 +1562,16 @@ def evaluate(
1272
  elif prompt_type == 'instruct_vicuna':
1273
  # even below is not enough, generic strings and many ways to encode
1274
  stop_words = [
1275
- '### Human:',
1276
- """
1277
  ### Human:""",
1278
- """
1279
  ### Human:
1280
  """,
1281
- '### Assistant:',
1282
- """
1283
  ### Assistant:""",
1284
- """
1285
  ### Assistant:
1286
  """,
1287
  ]
@@ -1299,7 +1589,7 @@ def evaluate(
1299
  if tokenizer.pad_token:
1300
  stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
1301
  # handle fake \n added
1302
- stop_words_ids = [x[1:] if y[0] == '\n' else x for x,y in zip(stop_words_ids, stop_words)]
1303
  # build stopper
1304
  stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
1305
  else:
@@ -1397,15 +1687,18 @@ def evaluate(
1397
  traceback.print_exc()
1398
  clear_torch_cache()
1399
  return
1400
- except RuntimeError as e:
1401
  if 'Expected all tensors to be on the same device' in str(e) or \
1402
  'expected scalar type Half but found Float' in str(e) or \
1403
- 'probability tensor contains either' in str(e):
 
1404
  print(
1405
  "GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
1406
  flush=True)
1407
  traceback.print_exc()
1408
  clear_torch_cache()
 
 
1409
  return
1410
  else:
1411
  raise
@@ -1456,6 +1749,7 @@ def get_generate_params(model_lower, chat,
1456
  if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
1457
  prompt_type = inv_prompt_type_to_model_lower[model_lower]
1458
 
 
1459
  if show_examples is None:
1460
  if chat:
1461
  show_examples = False
@@ -1516,7 +1810,8 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
1516
  else:
1517
  prompt_type = ''
1518
  examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",
1519
- stream_output, prompt_type or 'plain', 0.1, 0.75, 40, 4, 256, 0, False, max_time_defaults, 1.0, 1, False]]
 
1520
  task_info = "No task"
1521
  if prompt_type == 'instruct':
1522
  task_info = "Answer question or follow imperative as instruction with optionally input."
@@ -1551,6 +1846,7 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
1551
  repetition_penalty = repetition_penalty or 1.07
1552
  num_return_sequences = min(num_beams, num_return_sequences or 1)
1553
  do_sample = False if do_sample is None else do_sample
 
1554
  params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
1555
  early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
1556
 
@@ -1594,6 +1890,18 @@ y = np.random.randint(0, 1, 100)
1594
  src_lang = "English"
1595
  tgt_lang = "Russian"
1596
1597
  return placeholder_instruction, placeholder_input, \
1598
  stream_output, show_examples, \
1599
  prompt_type, temperature, top_p, top_k, num_beams, \
 
31
  is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
32
  is_low_mem = is_hf # assumes run on 24GB consumer GPU
33
  admin_pass = os.getenv("ADMIN_PASS")
34
+ # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
35
+ raise_generate_gpu_exceptions = True
36
 
37
+ eval_extra_columns = ['prompt', 'response', 'score']
38
 
39
  def main(
40
  load_8bit: bool = False,
 
43
  base_model: str = '',
44
  tokenizer_base_model: str = '',
45
  lora_weights: str = "",
46
+ gpu_id: int = 0, # if infer_devices = True and gpu_id != -1
47
 
48
  prompt_type: Union[int, str] = None,
49
  # input to generation
 
145
  if not gradio:
146
  if eval_sharegpt_prompts_only > 0:
147
  # override default examples with shareGPT ones for human-level eval purposes only
148
+ eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
149
+ if not os.path.isfile(eval_filename):
150
+ os.system(
151
+ 'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
152
  import json
153
+ data = json.load(open(eval_filename, 'rt'))
154
  # focus on data that starts with human, else likely chopped from other data
155
  turn_start = 0 # odd in general
156
  data = [x for x in data if len(x['conversations']) > turn_start + 1 and
 
166
  assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
167
  output = data[i]['conversations'][turn_start + 1]['value']
168
  examplenew = example1.copy()
169
+ assert not chat, "No gradio must use chat=False, uses nochat isntruct"
170
+ examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
171
+ examplenew[eval_func_param_names.index('iinput_nochat')] = '' # no input
172
+ examplenew[eval_func_param_names.index('context')] = '' # no context
173
  examples.append(examplenew)
174
  responses.append(output)
175
 
176
+ num_examples = len(examples)
177
+ scoring_path = 'scoring'
178
+ os.makedirs(scoring_path, exist_ok=True)
179
+ if eval_sharegpt_as_output:
180
+ used_base_model = 'gpt35'
181
+ used_lora_weights = ''
182
+ else:
183
+ used_base_model = str(base_model.split('/')[-1])
184
+ used_lora_weights = str(lora_weights.split('/')[-1])
185
+ eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
186
+ eval_sharegpt_prompts_only_seed,
187
+ eval_sharegpt_as_output,
188
+ used_base_model,
189
+ used_lora_weights)
190
+ eval_filename = os.path.join(scoring_path, eval_filename)
191
+
192
  with torch.device("cuda"):
193
  # ensure was set right above before examples generated
194
  assert not stream_output, "stream_output=True does not make sense with example loop"
 
201
  if not eval_sharegpt_as_output:
202
  model, tokenizer, device = get_model(**locals())
203
  model_state = [model, tokenizer, device, base_model]
204
+ fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir)
205
  else:
206
  assert eval_sharegpt_prompts_only > 0
207
 
 
212
  fun = get_response
213
  t0 = time.time()
214
  score_dump = []
 
215
 
216
  import matplotlib.pyplot as plt
217
 
218
  for exi, ex in enumerate(examples):
219
+ instruction = ex[eval_func_param_names.index('instruction_nochat')]
220
+ iinput = ex[eval_func_param_names.index('iinput_nochat')]
221
+ context = ex[eval_func_param_names.index('context')]
222
  clear_torch_cache()
223
  print("")
224
  print("START" + "=" * 100)
225
+ print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
226
  print("-" * 105)
227
  # fun yields as generator, so have to iterate over it
228
  # Also means likely do NOT want --stream_output=True, else would show all generations
 
231
  if smodel:
232
  score_with_prompt = False
233
  if score_with_prompt:
234
+ data_point = dict(instruction=instruction, input=iinput, context=context)
235
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
236
  prompt = prompter.generate_prompt(data_point)
237
  else:
238
  # just raw input and output
239
+ assert iinput in [None, ''] # should be no iinput
240
+ assert context in [None, ''] # should be no context
241
+ prompt = instruction
242
  cutoff_len = 768 if is_low_mem else 2048
243
  inputs = stokenizer(prompt, res,
244
  return_tensors="pt",
 
251
  traceback.print_exc()
252
  score = 0.0
253
  clear_torch_cache()
254
+ except (Exception, RuntimeError) as e:
255
  if 'Expected all tensors to be on the same device' in str(e) or \
256
  'expected scalar type Half but found Float' in str(e) or \
257
+ 'probability tensor contains either' in str(e) or \
258
+ 'cublasLt ran into an error!' in str(e):
259
  print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
260
  flush=True)
261
  traceback.print_exc()
 
266
  print("SCORE %s: %s" % (exi, score), flush=True)
267
  score_dump.append(ex + [prompt, res, score])
268
  # dump every score in case abort
269
+ df_scores = pd.DataFrame(score_dump,
270
+ columns=eval_func_param_names + eval_extra_columns)
271
+ df_scores.to_parquet(eval_filename, index=False)
272
  # plot histogram so far
273
  plt.figure(figsize=(10, 10))
274
  plt.hist(df_scores['score'], bins=20)
275
  score_avg = np.mean(df_scores['score'])
276
  score_median = np.median(df_scores['score'])
277
  plt.title("Score avg: %s median: %s" % (score_avg, score_median))
278
+ plt.savefig(eval_filename.replace('.parquet', '.png'))
279
  plt.close()
280
 
281
  print("END" + "=" * 102)
 
284
  print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
285
  t1 = time.time()
286
  print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
287
+ return eval_filename
288
+
289
  if gradio:
290
  go_gradio(**locals())
291
 
 
299
  return device
300
 
301
 
302
+ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
303
+ gpu_id=0,
304
+ use_auth_token=False):
305
  """
306
  Ensure model gets on correct device
307
  :param base_model:
 
309
  :param load_half:
310
  :param model_kwargs:
311
  :param reward_type:
312
+ :param gpu_id:
313
+ :param use_auth_token:
314
  :return:
315
  """
316
  with init_empty_weights():
 
335
  device_map.update(device_map_model)
336
  print('device_map: %s' % device_map, flush=True)
337
 
338
+ if gpu_id >= 0:
339
  # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
340
  # So avoid for now, just put on first GPU, unless score_model, put on last
341
  n_gpus = torch.cuda.device_count()
342
  if reward_type:
343
  device_map = {'': n_gpus - 1}
344
  else:
345
+ device_map = {'': min(n_gpus - 1, gpu_id)}
346
 
347
  load_in_8bit = model_kwargs.get('load_in_8bit', False)
348
  model_kwargs['device_map'] = device_map
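Note on the gpu_id change in the hunk above: the commit replaces the old boolean force_1_gpu with an integer gpu_id (-1 lets the model spread across all GPUs; 0..n_gpus-1 pins it to one GPU, with any reward/score model kept on the last GPU). A minimal sketch of that placement logic, assuming torch is available; pick_device_map is an illustrative helper, not a function in app.py:

    import torch

    def pick_device_map(gpu_id: int, reward_type: bool, inferred_device_map: dict) -> dict:
        # gpu_id < 0: keep the device_map inferred by accelerate (model spread over GPUs)
        if gpu_id < 0:
            return inferred_device_map
        n_gpus = torch.cuda.device_count()
        # put the reward/score model on the last GPU so it does not compete with the main model
        if reward_type:
            return {'': n_gpus - 1}
        # otherwise pin the whole model to the requested GPU (clamped to available GPUs)
        return {'': min(n_gpus - 1, gpu_id)}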
 
367
  base_model: str = '',
368
  tokenizer_base_model: str = '',
369
  lora_weights: str = "",
370
+ gpu_id: int = 0,
371
 
372
  llama_type: bool = None,
373
  reward_type: bool = None,
 
387
  :param base_model: name/path of base model
388
  :param tokenizer_base_model: name/path of tokenizer
389
  :param lora_weights: name/path
390
+ :param gpu_id: which GPU (0..n_gpus-1) or allow all GPUs if relevant (-1)
391
  :param llama_type: whether LLaMa type model
392
  :param reward_type: reward type model for sequence classification
393
  :param local_files_only: use local files instead of from HF
 
448
  with torch.device("cuda"):
449
  if infer_devices:
450
  model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
451
+ gpu_id=gpu_id, use_auth_token=use_auth_token)
452
  else:
453
  if load_half and not load_8bit:
454
  model = model_loader.from_pretrained(
 
527
 
528
 
529
  def go_gradio(**kwargs):
 
530
  # get default model
531
  all_kwargs = kwargs.copy()
532
  all_kwargs.update(locals())
 
541
  smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
542
 
543
  if 'mbart-' in kwargs['model_lower']:
544
+ instruction_label_nochat = "Text to translate"
545
  else:
546
+ instruction_label_nochat = "Instruction"
547
+ instruction_label = "You (Shift-Enter or push Submit to send message)"
 
548
 
549
  title = 'h2oGPT'
550
  if kwargs['verbose']:
 
556
  else:
557
  description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
558
  if is_public:
559
+ description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
560
  if kwargs['load_8bit']:
561
+ description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
562
  description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
563
  description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
564
 
 
644
  return chat_message
645
  else:
646
  raise ValueError(f"Invalid message for Chatbot component: {chat_message}")
647
+
648
  Chatbot._postprocess_chat_messages = _postprocess_chat_messages
649
 
650
  demo = gr.Blocks(theme=gr.themes.Soft(**colors_dict), css=css_code, title="h2oGPT", analytics_enabled=False)
 
660
  lora_options = [kwargs['lora_weights'].strip()] + lora_options
661
  # always add in no lora case
662
  # add fake space so doesn't go away in gradio dropdown
663
+ no_lora_str = no_model_str = '[None/Remove]'
664
+ lora_options = [no_lora_str] + kwargs['extra_lora_options'] # FIXME: why double?
665
+ # always add in no model case so can free memory
666
+ # add fake space so doesn't go away in gradio dropdown
667
+ model_options = [no_model_str] + model_options
668
 
669
+ # transcribe, will be detranscribed before use by evaluate()
670
+ if not kwargs['lora_weights'].strip():
671
+ kwargs['lora_weights'] = no_lora_str
672
+
673
+ if not kwargs['base_model'].strip():
674
+ kwargs['base_model'] = no_model_str
675
+
676
+ # transcribe for gradio
677
+ kwargs['gpu_id'] = str(kwargs['gpu_id'])
678
+
679
+ no_model_msg = 'h2oGPT [ !!! Please Load Model in Models Tab !!! ]'
680
+ output_label0 = f'h2oGPT [Model: {kwargs.get("base_model")}]' if kwargs.get(
681
+ 'base_model') else no_model_msg
682
+ output_label0_model2 = no_model_msg
683
 
684
  with demo:
685
  # avoid actual model/tokenizer here or anything that would be bad to deepcopy
686
  # https://github.com/gradio-app/gradio/issues/3558
687
  model_state = gr.State(['model', 'tokenizer', device, kwargs['base_model']])
688
+ model_state2 = gr.State([None, None, None, None])
689
  model_options_state = gr.State([model_options])
690
  lora_options_state = gr.State([lora_options])
691
  gr.Markdown(
 
696
  {task_info_md}
697
  """)
698
  if is_hf:
699
+ gr.HTML(
700
+ '''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
701
 
702
  # go button visible if
703
+ base_wanted = kwargs['base_model'] != no_model_str and kwargs['login_mode_if_model0']
704
  go_btn = gr.Button(value="ENTER", visible=base_wanted, variant="primary")
705
  normal_block = gr.Row(visible=not base_wanted)
706
  with normal_block:
707
  with gr.Tabs():
708
  with gr.Row():
709
+ col_nochat = gr.Column(visible=not kwargs['chat'])
710
+ with col_nochat: # FIXME: for model comparison, and check rest
711
+ text_output_nochat = gr.Textbox(lines=5, label=output_label0)
712
+ instruction_nochat = gr.Textbox(
713
+ lines=4, label=instruction_label_nochat,
714
+ placeholder=kwargs['placeholder_instruction'],
715
+ )
716
+ iinput_nochat = gr.Textbox(lines=4, label="Input context for Instruction",
717
+ placeholder=kwargs['placeholder_input'])
718
+ submit_nochat = gr.Button("Submit")
719
+ flag_btn_nochat = gr.Button("Flag")
720
+ if kwargs['score_model']:
721
+ if not kwargs['auto_score']:
722
+ with gr.Column():
723
+ score_btn_nochat = gr.Button("Score last prompt & response")
724
+ score_text_nochat = gr.Textbox("Response Score: NA", show_label=False)
725
+ else:
726
+ score_text_nochat = gr.Textbox("Response Score: NA", show_label=False)
727
+ col_chat = gr.Column(visible=kwargs['chat'])
728
+ with col_chat:
729
+ with gr.Row():
730
+ text_output = gr.Chatbot(label=output_label0).style(height=kwargs['height'] or 400)
731
+ text_output2 = gr.Chatbot(label=output_label0_model2, visible=False).style(
732
+ height=kwargs['height'] or 400)
733
+ with gr.Row():
734
+ with gr.Column(scale=50):
735
+ instruction = gr.Textbox(
736
+ lines=4, label=instruction_label,
737
+ placeholder=kwargs['placeholder_instruction'],
738
+ )
739
+ with gr.Row(): # .style(equal_height=False, equal_width=False):
740
+ submit = gr.Button(value='Submit').style(full_width=False, size='sm')
741
+ stop_btn = gr.Button(value="Stop").style(full_width=False, size='sm')
742
+ with gr.Row():
743
+ clear = gr.Button("New Conversation")
744
  flag_btn = gr.Button("Flag")
745
  if kwargs['score_model']:
746
+ if not kwargs['auto_score']: # FIXME: For checkbox model2
747
  with gr.Column():
748
+ with gr.Row():
749
+ score_btn = gr.Button("Score last prompt & response").style(
750
+ full_width=False, size='sm')
751
+ score_text = gr.Textbox("Response Score: NA", show_label=False)
752
+ score_res2 = gr.Row(visible=False)
753
+ with score_res2:
754
+ score_btn2 = gr.Button("Score last prompt & response 2").style(
755
+ full_width=False, size='sm')
756
+ score_text2 = gr.Textbox("Response Score2: NA", show_label=False)
757
  else:
758
  score_text = gr.Textbox("Response Score: NA", show_label=False)
759
+ score_text2 = gr.Textbox("Response Score2: NA", show_label=False, visible=False)
760
+ retry = gr.Button("Regenerate")
761
+ undo = gr.Button("Undo")
762
  with gr.TabItem("Input/Output"):
763
  with gr.Row():
764
  if 'mbart-' in kwargs['model_lower']:
 
776
  prompt_type = gr.Dropdown(prompt_types_strings,
777
  value=kwargs['prompt_type'], label="Prompt Type",
778
  visible=not is_public)
779
+ prompt_type2 = gr.Dropdown(prompt_types_strings,
780
+ value=kwargs['prompt_type'], label="Prompt Type Model 2",
781
+ visible=not is_public and False)
782
+ do_sample = gr.Checkbox(label="Sample", info="Enable sampler, required for use of temperature, top_p, top_k",
783
+ value=kwargs['do_sample'])
784
+ temperature = gr.Slider(minimum=0.01, maximum=3,
785
  value=kwargs['temperature'],
786
  label="Temperature",
787
  info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
 
820
  value=kwargs['num_return_sequences'],
821
  label="Number Returns", info="Must be <= num_beams",
822
  visible=not is_public)
823
+ iinput = gr.Textbox(lines=4, label="Input",
824
+ placeholder=kwargs['placeholder_input'],
825
+ visible=not is_public)
826
+ context = gr.Textbox(lines=3, label="System Pre-Context",
827
+ info="Directly pre-appended without prompt processing",
828
+ visible=not is_public and not kwargs['chat'])
829
+ chat = gr.components.Checkbox(label="Chat mode", value=kwargs['chat'],
830
+ visible=not is_public)
 
 
831
 
832
  with gr.TabItem("Models"):
833
+ load_msg = "Load-Unload Model/LORA" if not is_public \
834
+ else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO"
835
+ load_msg2 = "Load-Unload Model/LORA 2" if not is_public \
836
+ else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO 2"
837
+ compare_checkbox = gr.components.Checkbox(label="Compare Mode",
838
+ value=False, visible=not is_public)
839
  with gr.Row():
840
+ n_gpus = torch.cuda.device_count()
841
+ n_gpus_list = [str(x) for x in list(range(-1, n_gpus))]
842
  with gr.Column():
843
  with gr.Row(scale=1):
844
  with gr.Column(scale=50):
845
+ model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model",
846
+ value=kwargs['base_model'])
847
+ lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA",
848
+ value=kwargs['lora_weights'], visible=kwargs['show_lora'])
849
  with gr.Column(scale=1):
 
 
850
  load_model_button = gr.Button(load_msg)
851
+ model_load8bit_checkbox = gr.components.Checkbox(
852
+ label="Load 8-bit [Not all models support]",
853
+ value=kwargs['load_8bit'])
854
+ model_infer_devices_checkbox = gr.components.Checkbox(
855
+ label="Infer Devices [If GPU ID=-1 or not Checked, then will spread model over GPUs]",
856
+ value=kwargs['infer_devices'])
857
+ model_gpu = gr.Dropdown(n_gpus_list, label="GPU ID [-1 = all GPUs]",
858
+ value=kwargs['gpu_id'])
859
  model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
860
+ lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'],
861
+ visible=kwargs['show_lora'])
862
  with gr.Row(scale=1):
863
  with gr.Column(scale=50):
864
  new_model = gr.Textbox(label="New Model HF name/path")
 
866
  with gr.Column(scale=1):
867
  add_model_button = gr.Button("Add new model name")
868
  add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
869
+ col_model2 = gr.Column(visible=False)
870
+ with col_model2:
871
+ with gr.Row(scale=1):
872
+ with gr.Column(scale=50):
873
+ model_choice2 = gr.Dropdown(model_options_state.value[0], label="Choose Model 2",
874
+ value=no_model_str)
875
+ lora_choice2 = gr.Dropdown(lora_options_state.value[0], label="Choose LORA 2",
876
+ value=no_lora_str,
877
+ visible=kwargs['show_lora'])
878
+ with gr.Column(scale=1):
879
+ load_model_button2 = gr.Button(load_msg2)
880
+ model_load8bit_checkbox2 = gr.components.Checkbox(
881
+ label="Load 8-bit 2 [Not all models support]",
882
+ value=kwargs['load_8bit'])
883
+ model_infer_devices_checkbox2 = gr.components.Checkbox(
884
+ label="Infer Devices 2 [If GPU ID=-1 or not Checked, then will spread model over GPUs]",
885
+ value=kwargs[
886
+ 'infer_devices'])
887
+ model_gpu2 = gr.Dropdown(n_gpus_list, label="GPU ID [-1 = all GPUs]",
888
+ value=kwargs['gpu_id'])
889
+ # no model/lora loaded ever in model2 by default
890
+ model_used2 = gr.Textbox(label="Current Model 2", value=no_model_str)
891
+ lora_used2 = gr.Textbox(label="Current LORA 2", value=no_lora_str,
892
+ visible=kwargs['show_lora'])
893
  with gr.TabItem("System"):
894
  system_row = gr.Row(visible=not is_public)
895
  admin_pass_textbox = gr.Textbox(label="Admin Password", type='password', visible=is_public)
 
919
  kwargs_evaluate = {k: v for k, v in all_kwargs.items() if k in inputs_kwargs_list}
920
  fun = partial(evaluate,
921
  **kwargs_evaluate)
922
+ fun2 = partial(evaluate,
923
+ model_state2,
924
+ **kwargs_evaluate)
925
 
926
  dark_mode_btn = gr.Button("Dark Mode", variant="primary").style(
927
  size="sm",
 
939
  }""",
940
  api_name="dark",
941
  )
942
+
943
+ # Control chat and non-chat blocks, which can be independently used by chat checkbox swap
944
+ def col_nochat_fun(x):
945
+ return gr.Column.update(visible=not x)
946
+
947
+ def col_chat_fun(x):
948
+ return gr.Column.update(visible=x)
949
+
950
+ def context_fun(x):
951
+ return gr.Textbox.update(visible=not x)
952
+
953
+ chat.select(col_nochat_fun, chat, col_nochat, api_name="chat_checkbox") \
954
+ .then(col_chat_fun, chat, col_chat) \
955
+ .then(context_fun, chat, context)
956
 
957
  # examples after submit or any other buttons for chat or no chat
958
  if kwargs['examples'] is not None and kwargs['show_examples']:
959
  gr.Examples(examples=kwargs['examples'], inputs=inputs_list)
960
 
961
  # Score
962
+ def score_last_response(*args, nochat=False, model2=False):
963
  """ Similar to user() """
964
  args_list = list(args)
965
+
966
+ max_length_tokenize = 512 if is_low_mem else 2048
967
+ cutoff_len = max_length_tokenize * 4 # restrict deberta related to max for LLM
968
+
969
+ if not nochat:
970
+ history = args_list[-1]
971
+ if history is None:
972
+ if not model2:
973
+ # maybe only doing first model, no need to complain
974
+ print("Bad history in scoring last response, fix for now", flush=True)
975
+ history = []
976
+ if smodel is not None and \
977
+ stokenizer is not None and \
978
+ sdevice is not None and \
979
+ history is not None and len(history) > 0 and \
980
+ history[-1] is not None and \
981
+ len(history[-1]) >= 2:
982
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
983
+
984
+ question = history[-1][0]
985
+
986
+ answer = history[-1][1]
987
+ else:
988
+ return 'Response Score: NA'
989
+ else:
990
+ answer = args_list[-1]
991
+ instruction_nochat_arg_id = eval_func_param_names.index('instruction_nochat')
992
+ question = args_list[instruction_nochat_arg_id]
993
+
994
+ if question is None:
995
+ return 'Response Score: Bad Question'
996
+ if answer is None:
997
+ return 'Response Score: Bad Answer'
998
+
999
+ question = question[-cutoff_len:]
1000
+ answer = answer[-cutoff_len:]
1001
+
1002
+ inputs = stokenizer(question, answer,
1003
+ return_tensors="pt",
1004
+ truncation=True,
1005
+ max_length=max_length_tokenize).to(smodel.device)
1006
+ try:
1007
+ score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
1008
+ except torch.cuda.OutOfMemoryError as e:
1009
+ print("GPU OOM: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
1010
+ del inputs
1011
+ traceback.print_exc()
1012
+ clear_torch_cache()
1013
+ return 'Response Score: GPU OOM'
1014
+ except (Exception, RuntimeError) as e:
1015
+ if 'Expected all tensors to be on the same device' in str(e) or \
1016
+ 'expected scalar type Half but found Float' in str(e) or \
1017
+ 'probability tensor contains either' in str(e) or \
1018
+ 'cublasLt ran into an error!' in str(e):
1019
+ print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)),
1020
+ flush=True)
1021
  traceback.print_exc()
1022
  clear_torch_cache()
1023
+ return 'Response Score: GPU Error'
1024
+ else:
1025
+ raise
1026
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
1027
+ return 'Response Score: {:.1%}'.format(score)
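For orientation, the scoring path added above boils down to running the sequence-classification score model on a (question, answer) pair and reporting the sigmoid of its first logit. A rough sketch under that reading; score_pair is illustrative (not a function in app.py), and smodel/stokenizer stand for the already-loaded score model and its tokenizer:

    import torch

    def score_pair(question, answer, smodel, stokenizer,
                   max_length_tokenize=512, cutoff_len=2048):
        # Trim raw text first, then tokenize the pair jointly for the score model.
        inputs = stokenizer(question[-cutoff_len:], answer[-cutoff_len:],
                            return_tensors="pt",
                            truncation=True,
                            max_length=max_length_tokenize).to(smodel.device)
        with torch.no_grad():
            logits = smodel(**inputs).logits[0]
        # Sigmoid of the first logit becomes the "Response Score: xx.x%" string.
        return float(torch.sigmoid(logits)[0])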
1028
 
1029
  if kwargs['score_model']:
1030
  score_args = dict(fn=score_last_response,
1031
  inputs=inputs_list + [text_output],
1032
  outputs=[score_text],
1033
  )
1034
+ score_args2 = dict(fn=partial(score_last_response, model2=True),
1035
+ inputs=inputs_list + [text_output2],
1036
+ outputs=[score_text2],
1037
+ )
1038
+
1039
+ score_args_nochat = dict(fn=partial(score_last_response, nochat=True),
1040
+ inputs=inputs_list + [text_output_nochat],
1041
+ outputs=[score_text_nochat],
1042
+ )
1043
  if not kwargs['auto_score']:
1044
+ score_event = score_btn.click(**score_args, queue=stream_output, api_name='score') \
1045
+ .then(**score_args2, queue=stream_output, api_name='score2')
1046
+ score_event_nochat = score_btn_nochat.click(**score_args_nochat, queue=stream_output,
1047
+ api_name='score_nochat')
1048
+
1049
+ def user(*args, undo=False, sanitize_user_prompt=True, model2=False):
1050
+ """
1051
+ User that fills history for bot
1052
+ :param args:
1053
+ :param undo:
1054
+ :param sanitize_user_prompt:
1055
+ :param model2:
1056
+ :return:
1057
+ """
1058
+ args_list = list(args)
1059
+ user_message = args_list[0]
1060
+ input1 = args_list[1]
1061
+ context1 = args_list[2]
1062
+ if input1 and not user_message.endswith(':'):
1063
+ user_message1 = user_message + ":" + input1
1064
+ elif input1:
1065
+ user_message1 = user_message + input1
1066
+ else:
1067
+ user_message1 = user_message
1068
+ if sanitize_user_prompt:
1069
+ from better_profanity import profanity
1070
+ user_message1 = profanity.censor(user_message1)
1071
 
1072
+ history = args_list[-1]
1073
+ if undo and history:
1074
+ history.pop()
1075
+ args_list = args_list[:-1] # FYI, even if unused currently
1076
+ if history is None:
1077
+ if not model2:
1078
+ # only complain for model1, to avoid duplicate warnings
1079
  print("Bad history, fix for now", flush=True)
1080
+ history = []
1081
+ # ensure elements not mixed across models as output,
1082
+ # even if input is currently same source
1083
+ history = history.copy()
1084
+ if undo:
1085
+ return history
1086
+ else:
1087
+ # FIXME: compare, same history for now
1088
+ return history + [[user_message1, None]]
1089
+
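For orientation, a minimal sketch of the history handshake between user() and bot(): the gr.Chatbot value is a list of [user_message, bot_message] pairs, user() appends a pair with the bot slot left as None, and bot() streams into that slot (the strings below are illustrative):

history = []                                  # gr.Chatbot value: list of [user, bot] pairs
history = history + [["Who are you?", None]]  # what user() returns for a new turn
for partial in ["I am", "I am h2oGPT."]:      # stand-in for the evaluate() generator inside bot()
    history[-1][1] = partial                  # bot() fills the last slot as output streams in
print(history)                                # [['Who are you?', 'I am h2oGPT.']]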
1090
+ def bot(*args, retry=False):
1091
+ """
1092
+ Bot that consumes the history (which carries the user input) and generates the response.
1093
+ The instruction (from inputs_list) itself is not consumed by bot.
1094
+ :param args:
1095
+ :param retry:
1096
+ :return:
1097
+ """
1098
+ args_list = list(args).copy()
1099
+ history = args_list[-1] # model_state is -2
1100
+ if retry and history:
1101
+ history.pop()
1102
+ if not history:
1103
+ print("No history", flush=True)
1104
+ return
1105
+ # ensure output will be unique to models
1106
+ history = history.copy()
1107
+ instruction1 = history[-1][0]
1108
+ context1 = ''
1109
+ if kwargs['chat_history'] > 0:
1110
+ prompt_type_arg_id = eval_func_param_names.index('prompt_type')
1111
+ prompt_type1 = args_list[prompt_type_arg_id]
1112
+ chat_arg_id = eval_func_param_names.index('chat')
1113
+ chat1 = args_list[chat_arg_id]
1114
  context1 = ''
1115
+ for histi in range(len(history) - 1):
1116
+ data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
1117
+ context1 += generate_prompt(data_point, prompt_type1, chat1, reduced=True)[0].replace(
1118
+ '<br>', '\n')
1119
+ if not context1.endswith('\n'):
1120
+ context1 += '\n'
1121
+ if context1 and not context1.endswith('\n'):
1122
+ context1 += '\n' # ensure that if the context ends abruptly, the human turn continues on a new line
1123
+ args_list[0] = instruction1 # override original instruction with history from user
1124
+ # only include desired chat history
1125
+ args_list[2] = context1[-kwargs['chat_history']:]
1126
+ model_state1 = args_list[-2]
1127
+ if model_state1[0] is None or model_state1[0] == no_model_str:
1128
+ return
1129
+ args_list = args_list[:-2]
1130
+ fun1 = partial(evaluate,
1131
+ model_state1,
1132
+ **kwargs_evaluate)
1133
+ try:
1134
+ for output in fun1(*tuple(args_list)):
1135
+ bot_message = output
1136
+ history[-1][1] = bot_message
 
 
 
1137
  yield history
1138
+ except StopIteration:
1139
+ yield history
1140
+ except RuntimeError as e:
1141
+ if "generator raised StopIteration" in str(e):
1142
+ # assume last entry was bad, undo
1143
+ history.pop()
 
 
 
1144
  yield history
1145
+ raise
1146
+ except Exception as e:
1147
+ # put error into user input
1148
+ history[-1][0] = "Exception: %s" % str(e)
1149
+ yield history
1150
+ raise
1151
+ return
1152
+
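A sketch of how the loop in bot() folds earlier turns into the context string; render_turn below is a hypothetical stand-in for generate_prompt(..., reduced=True)[0] from finetune.py, whose real text depends on the selected prompt_type:

def render_turn(instruction, output):
    # assumed 'human_bot'-style template, only for illustration
    return "<human>: %s\n<bot>: %s\n" % (instruction, output)

history = [["What is 2+2?", "4"], ["And times 3?", None]]  # last turn not yet answered
context1 = ''
for histi in range(len(history) - 1):
    context1 += render_turn(history[histi][0], history[histi][1]).replace('<br>', '\n')
    if not context1.endswith('\n'):
        context1 += '\n'
chat_history_chars = 1024                  # stands in for kwargs['chat_history']
context1 = context1[-chat_history_chars:]  # keep only the most recent characters
print(context1)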
1153
+ # NORMAL MODEL
1154
+ user_args = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt']),
1155
+ inputs=inputs_list + [text_output],
1156
+ outputs=text_output,
1157
+ )
1158
+ bot_args = dict(fn=bot,
1159
+ inputs=inputs_list + [model_state] + [text_output],
1160
+ outputs=text_output,
1161
+ )
1162
+ retry_bot_args = dict(fn=functools.partial(bot, retry=True),
1163
+ inputs=inputs_list + [model_state] + [text_output],
1164
+ outputs=text_output,
1165
+ )
1166
+ undo_user_args = dict(fn=functools.partial(user, undo=True),
1167
+ inputs=inputs_list + [text_output],
1168
+ outputs=text_output,
1169
+ )
1170
 
1171
+ # MODEL2
1172
+ user_args2 = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt'], model2=True),
1173
+ inputs=inputs_list + [text_output2],
1174
+ outputs=text_output2,
1175
+ )
1176
+ bot_args2 = dict(fn=bot,
1177
+ inputs=inputs_list + [model_state2] + [text_output2],
1178
+ outputs=text_output2,
1179
+ )
1180
+ retry_bot_args2 = dict(fn=functools.partial(bot, retry=True),
1181
+ inputs=inputs_list + [model_state2] + [text_output2],
1182
+ outputs=text_output2,
1183
+ )
1184
+ undo_user_args2 = dict(fn=functools.partial(user, undo=True),
1185
+ inputs=inputs_list + [text_output2],
1186
+ outputs=text_output2,
1187
+ )
1188
+
1189
+ def clear_instruct():
1190
+ return gr.Textbox.update(value='')
1191
+
1192
+ if kwargs['auto_score']:
1193
+ # in case 2nd model, consume instruction first, so can clear quickly
1194
+ # bot doesn't consume the instruction itself, just the history from user, which is why this works
1195
+ submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction') \
1196
+ .then(**user_args2, queue=stream_output, api_name='instruction2') \
1197
+ .then(clear_instruct, None, instruction) \
1198
+ .then(**bot_args, api_name='instruction_bot') \
1199
+ .then(**score_args, api_name='instruction_bot_score') \
1200
+ .then(**bot_args2, api_name='instruction_bot2') \
1201
+ .then(**score_args2, api_name='instruction_bot_score2') \
1202
+ .then(clear_torch_cache)
1203
+ submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit') \
1204
+ .then(**user_args2, queue=stream_output, api_name='submit2') \
1205
+ .then(**bot_args, api_name='submit_bot') \
1206
+ .then(clear_instruct, None, instruction) \
1207
+ .then(**score_args, api_name='submit_bot_score') \
1208
+ .then(**bot_args2, api_name='submit_bot2') \
1209
+ .then(**score_args2, api_name='submit_bot_score2') \
1210
+ .then(clear_torch_cache)
1211
+ submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry') \
1212
+ .then(**user_args2, queue=stream_output, api_name='retry2') \
1213
+ .then(clear_instruct, None, instruction) \
1214
+ .then(**retry_bot_args, api_name='retry_bot') \
1215
+ .then(**score_args, api_name='retry_bot_score') \
1216
+ .then(**retry_bot_args2, api_name='retry_bot2') \
1217
+ .then(**score_args2, api_name='retry_bot_score2') \
1218
+ .then(clear_torch_cache)
1219
+ submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo') \
1220
+ .then(**score_args, api_name='undo_score') \
1221
+ .then(**undo_user_args2, queue=stream_output, api_name='undo2') \
1222
+ .then(**score_args2, api_name='undo_score2') \
1223
+ .then(clear_instruct, None, instruction)
1224
+ else:
1225
+ submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction') \
1226
+ .then(**user_args2, queue=stream_output, api_name='instruction2') \
1227
+ .then(clear_instruct, None, instruction) \
1228
+ .then(**bot_args, api_name='instruction_bot') \
1229
+ .then(**bot_args2, api_name='instruction_bot2') \
1230
+ .then(clear_torch_cache)
1231
+ submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit') \
1232
+ .then(**user_args2, queue=stream_output, api_name='submit2') \
1233
+ .then(clear_instruct, None, instruction) \
1234
+ .then(**bot_args, api_name='submit_bot') \
1235
+ .then(**bot_args2, api_name='submit_bot2') \
1236
+ .then(clear_torch_cache)
1237
+ submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry') \
1238
+ .then(**user_args2, queue=stream_output, api_name='retry2') \
1239
+ .then(clear_instruct, None, instruction) \
1240
+ .then(**retry_bot_args, api_name='retry_bot') \
1241
+ .then(**retry_bot_args2, api_name='retry_bot2') \
1242
+ .then(clear_torch_cache)
1243
+ submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo') \
1244
+ .then(**undo_user_args2, queue=stream_output, api_name='undo2')
1245
+
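The wiring above leans on Gradio event chaining: .submit()/.click() return an event and each .then() runs only after the previous step finishes, which is why the user message can be committed to both chatbots (and the textbox cleared) before either bot starts generating. A stripped-down sketch of the same pattern with toy callbacks, not the app's real ones:

import gradio as gr

def user_fn(msg, history):
    return "", history + [[msg, None]]

def bot_fn(history):
    history[-1][1] = "echo: " + history[-1][0]
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    msg.submit(user_fn, [msg, chatbot], [msg, chatbot], queue=False) \
        .then(bot_fn, chatbot, chatbot)
# demo.launch()  # uncomment to serve the toy app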
1246
+ # does both models
1247
+ clear.click(lambda: None, None, text_output, queue=False, api_name='clear') \
1248
+ .then(lambda: None, None, text_output2, queue=False, api_name='clear2')
1249
+ # FIXME: compare
1250
+ submit_event_nochat = submit_nochat.click(fun, inputs=[model_state] + inputs_list,
1251
+ outputs=text_output_nochat, api_name='submit_nochat') \
1252
+ .then(**score_args_nochat, api_name='instruction_bot_score_nochat') \
1253
+ .then(clear_torch_cache)
1254
+
1255
+ def load_model(model_name, lora_weights, model_state_old, prompt_type_old, load_8bit, infer_devices, gpu_id):
1256
  # ensure old model removed from GPU memory
1257
  if kwargs['debug']:
1258
  print("Pre-switch pre-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
 
1277
  clear_torch_cache()
1278
  if kwargs['debug']:
1279
  print("Pre-switch post-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1280
+
1281
+ if model_name is None or model_name == no_model_str:
1282
+ # no-op if no model, just free memory
1283
+ # no detranscribe needed for the model, since evaluate is never entered
1284
+ lora_weights = no_lora_str
1285
+ return [None, None, None, model_name], model_name, lora_weights, prompt_type_old
1286
+
1287
+ all_kwargs1 = all_kwargs.copy()
1288
+ all_kwargs1['base_model'] = model_name.strip()
1289
+ all_kwargs1['load_8bit'] = load_8bit
1290
+ all_kwargs1['infer_devices'] = infer_devices
1291
+ all_kwargs1['gpu_id'] = int(gpu_id) # detranscribe
1292
  model_lower = model_name.strip().lower()
1293
  if model_lower in inv_prompt_type_to_model_lower:
1294
  prompt_type1 = inv_prompt_type_to_model_lower[model_lower]
1295
  else:
1296
  prompt_type1 = prompt_type_old
1297
 
1298
+ # detranscribe
1299
+ if lora_weights == no_lora_str:
1300
+ lora_weights = ''
1301
+
1302
+ all_kwargs1['lora_weights'] = lora_weights.strip()
1303
+ model1, tokenizer1, device1 = get_model(**all_kwargs1)
1304
  clear_torch_cache()
1305
 
1306
  if kwargs['debug']:
1307
  print("Post-switch GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1308
+ return [model1, tokenizer1, device1, model_name], model_name, lora_weights, prompt_type1
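A condensed sketch of the hot-swap pattern load_model implements: free the old weights, clear the CUDA cache, then rebuild the shared model_state list from get_model (get_model and clear_torch_cache are the app's own helpers; swap_model here is only an illustrative wrapper):

import gc
import torch

def swap_model(model_state, new_base_model, get_model_fn, **get_model_kwargs):
    model_state[0] = None            # drop the reference so the old model can be freed
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()     # essentially what clear_torch_cache() does
    model, tokenizer, device = get_model_fn(base_model=new_base_model, **get_model_kwargs)
    model_state[:] = [model, tokenizer, device, new_base_model]
    return model_state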
 
 
 
1309
 
1310
  def dropdown_prompt_type_list(x):
1311
  return gr.Dropdown.update(value=x)
 
1314
  return gr.Textbox.update(label=f'h2oGPT [Model: {model_used_in}]')
1315
 
1316
  load_model_args = dict(fn=load_model,
1317
+ inputs=[model_choice, lora_choice, model_state, prompt_type,
1318
+ model_load8bit_checkbox, model_infer_devices_checkbox, model_gpu],
1319
  outputs=[model_state, model_used, lora_used, prompt_type])
1320
  prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
1321
  chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
1322
+ nochat_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output_nochat)
1323
  if not is_public:
1324
  load_model_event = load_model_button.click(**load_model_args) \
1325
+ .then(**prompt_update_args) \
1326
+ .then(**chatbot_update_args) \
1327
+ .then(**nochat_update_args) \
1328
+ .then(clear_torch_cache)
1329
+
1330
+ load_model_args2 = dict(fn=load_model,
1331
+ inputs=[model_choice2, lora_choice2, model_state2, prompt_type2,
1332
+ model_load8bit_checkbox2, model_infer_devices_checkbox2, model_gpu2],
1333
+ outputs=[model_state2, model_used2, lora_used2, prompt_type2])
1334
+ prompt_update_args2 = dict(fn=dropdown_prompt_type_list, inputs=prompt_type2, outputs=prompt_type2)
1335
+ chatbot_update_args2 = dict(fn=chatbot_list, inputs=[text_output2, model_used2], outputs=text_output2)
1336
+ if not is_public:
1337
+ load_model_event2 = load_model_button2.click(**load_model_args2) \
1338
+ .then(**prompt_update_args2) \
1339
+ .then(**chatbot_update_args2) \
1340
+ .then(clear_torch_cache)
1341
 
1342
  def dropdown_model_list(list0, x):
1343
  new_state = [list0[0] + [x]]
1344
  new_options = [*new_state[0]]
1345
+ return gr.Dropdown.update(value=x, choices=new_options), \
1346
+ gr.Dropdown.update(value=x, choices=new_options), \
1347
+ '', new_state
1348
 
1349
  add_model_event = add_model_button.click(fn=dropdown_model_list,
1350
  inputs=[model_options_state, new_model],
1351
+ outputs=[model_choice, model_choice2, new_model, model_options_state])
1352
 
1353
+ def dropdown_lora_list(list0, x, model_used1, lora_used1, model_used2, lora_used2):
1354
  new_state = [list0[0] + [x]]
1355
  new_options = [*new_state[0]]
1356
+ # don't switch drop-down to added lora if already have model loaded
1357
+ x1 = x if model_used1 == no_model_str else lora_used1
1358
+ x2 = x if model_used2 == no_model_str else lora_used2
1359
+ return gr.Dropdown.update(value=x1, choices=new_options), \
1360
+ gr.Dropdown.update(value=x2, choices=new_options), \
1361
+ '', new_state
1362
 
1363
  add_lora_event = add_lora_button.click(fn=dropdown_lora_list,
1364
+ inputs=[lora_options_state, new_lora, model_used, lora_used, model_used2, lora_used2],
1365
+ outputs=[lora_choice, lora_choice2, new_lora, lora_options_state])
1366
 
1367
  go_btn.click(lambda: gr.update(visible=False), None, go_btn, api_name="go") \
1368
  .then(lambda: gr.update(visible=True), None, normal_block) \
1369
  .then(**load_model_args).then(**prompt_update_args)
1370
 
1371
+ def compare_textbox_fun(x):
1372
+ return gr.Textbox.update(visible=x)
1373
+
1374
+ def compare_column_fun(x):
1375
+ return gr.Column.update(visible=x)
1376
+
1377
+ def compare_prompt_fun(x):
1378
+ return gr.Dropdown.update(visible=x)
1379
+
1380
+ compare_checkbox.select(compare_textbox_fun, compare_checkbox, text_output2, api_name="compare_checkbox") \
1381
+ .then(compare_column_fun, compare_checkbox, col_model2) \
1382
+ .then(compare_prompt_fun, compare_checkbox, prompt_type2) \
1383
+ .then(compare_textbox_fun, compare_checkbox, score_text2)
1384
+ # FIXME: add score_res2 in condition, but do better
1385
+
1386
  # callback for logging flagged input/output
1387
  callback.setup(inputs_list + [text_output], "flagged_data_points")
1388
  flag_btn.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
1389
  api_name='flag')
1390
+ flag_btn_nochat.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
1391
+ api_name='flag_nochat')
1392
 
1393
  def get_system_info():
1394
  return gr.Textbox.update(value=system_info_print())
1395
 
1396
  system_event = system_btn.click(get_system_info, outputs=system_text, api_name='system_info')
1397
 
1398
+ # don't pass text_output, don't want to clear output, just stop it
1399
+ # FIXME: have to click once to stop the output and a second time to stop the GPUs from continuing
1400
+ stop_btn.click(lambda: None, None, None,
1401
+ cancels=[submit_event_nochat, submit_event, submit_event2, submit_event3],
1402
+ queue=False, api_name='stop').then(clear_torch_cache)
 
1403
 
1404
  demo.queue(concurrency_count=1)
1405
  favicon_path = "h2o-logo.svg"
 
1410
 
1411
 
1412
  input_args_list = ['model_state']
1413
+ inputs_kwargs_list = ['debug', 'save_dir', 'hard_stop_list', 'sanitize_bot_response', 'model_state0']
1414
 
1415
 
1416
  def get_inputs_list(inputs_dict, model_lower):
1417
+ """
1418
+ map gradio objects in locals() to inputs for evaluate().
1419
+ :param inputs_dict:
1420
+ :param model_lower:
1421
+ :return:
1422
+ """
1423
  inputs_list_names = list(inspect.signature(evaluate).parameters)
1424
  inputs_list = []
1425
  for k in inputs_list_names:
 
1434
  return inputs_list
1435
 
1436
 
 
 
 
1437
  eval_func_param_names = ['instruction',
1438
  'iinput',
1439
  'context',
 
1450
  'repetition_penalty',
1451
  'num_return_sequences',
1452
  'do_sample',
1453
+ 'chat',
1454
+ 'instruction_nochat',
1455
+ 'iinput_nochat',
1456
  ]
1457
 
1458
 
 
1475
  repetition_penalty,
1476
  num_return_sequences,
1477
  do_sample,
1478
+ chat,
1479
+ instruction_nochat,
1480
+ iinput_nochat,
1481
  # END NOTE: Examples must have same order of parameters
1482
  src_lang=None,
1483
  tgt_lang=None,
1484
  debug=False,
1485
  save_dir=None,
 
1486
  hard_stop_list=None,
1487
  sanitize_bot_response=True,
1488
  model_state0=None,
 
1491
  if debug:
1492
  locals_dict = locals().copy()
1493
  locals_dict.pop('model_state', None)
1494
+ locals_dict.pop('model_state0', None)
1495
  print(locals_dict)
1496
 
1497
  no_model_msg = "Please choose a base model with --base_model (CLI) or in Models Tab (gradio).\nThen start New Conversation"
1498
 
1499
+ if model_state0 is None:
1500
+ # e.g. for no gradio case, set dummy value, else should be set
1501
+ model_state0 = [None, None, None, None]
1502
+
1503
  if model_state is not None and len(model_state) == 4 and not isinstance(model_state[0], str):
1504
  # try to free-up original model (i.e. list was passed as reference)
1505
  if model_state0 is not None and model_state0[0] is not None:
 
1516
  else:
1517
  raise AssertionError(no_model_msg)
1518
 
1519
+ if base_model is None:
1520
+ raise AssertionError(no_model_msg)
1521
+
1522
  assert base_model.strip(), no_model_msg
1523
  assert model, "Model is missing"
1524
  assert tokenizer, "Tokenizer is missing"
1525
 
1526
+ # choose chat or non-chat mode
1527
+ if not chat:
1528
+ instruction = instruction_nochat
1529
+ iinput = iinput_nochat
1530
+
1531
  data_point = dict(context=context, instruction=instruction, input=iinput)
1532
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
1533
  prompt = prompter.generate_prompt(data_point)
 
1562
  elif prompt_type == 'instruct_vicuna':
1563
  # even below is not enough, generic strings and many ways to encode
1564
  stop_words = [
1565
+ '### Human:',
1566
+ """
1567
  ### Human:""",
1568
+ """
1569
  ### Human:
1570
  """,
1571
+ '### Assistant:',
1572
+ """
1573
  ### Assistant:""",
1574
+ """
1575
  ### Assistant:
1576
  """,
1577
  ]
 
1589
  if tokenizer.pad_token:
1590
  stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
1591
  # handle fake \n added
1592
+ stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
1593
  # build stopper
1594
  stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
1595
  else:
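For reference, StoppingCriteriaSub follows the standard transformers StoppingCriteria interface; below is a minimal approximation that stops once any stop-word token sequence appears at the end of the generated ids (the real class's details may differ):

import torch
from transformers import StoppingCriteria

class StopOnTokens(StoppingCriteria):
    def __init__(self, stops=None, encounters=1):
        super().__init__()
        self.stops = stops or []      # list of 1-D token-id tensors, one per stop word
        self.encounters = encounters
        self.num_found = 0

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop in self.stops:
            stop = stop.to(input_ids.device)
            if input_ids.shape[1] >= len(stop) and torch.equal(input_ids[0, -len(stop):], stop):
                self.num_found += 1
                if self.num_found >= self.encounters:
                    return True
        return False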
 
1687
  traceback.print_exc()
1688
  clear_torch_cache()
1689
  return
1690
+ except (Exception, RuntimeError) as e:
1691
  if 'Expected all tensors to be on the same device' in str(e) or \
1692
  'expected scalar type Half but found Float' in str(e) or \
1693
+ 'probability tensor contains either' in str(e) or \
1694
+ 'cublasLt ran into an error!' in str(e):
1695
  print(
1696
  "GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
1697
  flush=True)
1698
  traceback.print_exc()
1699
  clear_torch_cache()
1700
+ if raise_generate_gpu_exceptions:
1701
+ raise
1702
  return
1703
  else:
1704
  raise
 
1749
  if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
1750
  prompt_type = inv_prompt_type_to_model_lower[model_lower]
1751
 
1752
+ # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
1753
  if show_examples is None:
1754
  if chat:
1755
  show_examples = False
 
1810
  else:
1811
  prompt_type = ''
1812
  examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",
1813
+ stream_output, prompt_type or 'plain', 0.1, 0.75, 40, 4, 256, 0, False, max_time_defaults, 1.0, 1,
1814
+ False]]
1815
  task_info = "No task"
1816
  if prompt_type == 'instruct':
1817
  task_info = "Answer question or follow imperative as instruction with optionally input."
 
1846
  repetition_penalty = repetition_penalty or 1.07
1847
  num_return_sequences = min(num_beams, num_return_sequences or 1)
1848
  do_sample = False if do_sample is None else do_sample
1849
+ # doesn't include chat, instruction_nochat, iinput_nochat, added later
1850
  params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
1851
  early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
1852
 
 
1890
  src_lang = "English"
1891
  tgt_lang = "Russian"
1892
 
1893
+ # move to correct position
1894
+ for example in examples:
1895
+ example += [chat, '', '']
1896
+ # adjust examples if non-chat mode
1897
+ if not chat:
1898
+ example[eval_func_param_names.index('instruction_nochat')] = example[
1899
+ eval_func_param_names.index('instruction')]
1900
+ example[eval_func_param_names.index('instruction')] = ''
1901
+
1902
+ example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')]
1903
+ example[eval_func_param_names.index('iinput')] = ''
1904
+
1905
  return placeholder_instruction, placeholder_input, \
1906
  stream_output, show_examples, \
1907
  prompt_type, temperature, top_p, top_k, num_beams, \
client_test.py CHANGED
@@ -1,9 +1,9 @@
1
  """
2
- Client test. Simplest case is chat=False and stream_output=False
3
 
4
- Run server with same choices:
5
 
6
- python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-256-6.9b --chat=False --stream_output=False
7
 
8
  NOTE: For private models, add --use-auth_token=True
9
 
@@ -17,7 +17,6 @@ python client_test.py
17
 
18
  debug = False
19
 
20
- import time
21
  import os
22
  os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
23
  from gradio_client import Client
@@ -26,8 +25,8 @@ client = Client("http://localhost:7860")
26
  if debug:
27
  print(client.view_api(all_endpoints=True))
28
 
29
- instruction = "Who are you?"
30
- iinput = ''
31
  context = ''
32
  # streaming output is supported, loops over and outputs each generation in streaming mode
33
  # but leave stream_output=False for simple input/output mode
@@ -37,19 +36,17 @@ temperature = 0.1
37
  top_p = 0.75
38
  top_k = 40
39
  num_beams = 1
40
- max_new_tokens = 500
41
  min_new_tokens = 0
42
  early_stopping = False
43
- max_time = 180
44
  repetition_penalty = 1.0
45
  num_return_sequences = 1
46
  do_sample = True
47
-
48
- # CHOOSE: must match server
49
- # NOTE chat mode works through files on gradio
50
- # and client currently would have to work through those files
51
- # in tmp, so not best for client. So default to False
52
  chat = False
 
 
53
 
54
 
55
  def test_client_basic():
@@ -68,43 +65,18 @@ def test_client_basic():
68
  max_time,
69
  repetition_penalty,
70
  num_return_sequences,
71
- do_sample]
72
-
73
- if not chat:
74
- # requires generate.py to run with --chat=False
75
- api_name = '/submit'
76
- res = client.predict(
77
- *tuple(args),
78
- api_name=api_name,
79
- )
80
- print(md_to_text(res))
81
- else:
82
- api_name = '/instruction'
83
- import json
84
- foofile = '/tmp/foo.json'
85
- with open(foofile, 'wt') as f:
86
- json.dump([['', None]], f)
87
- args += [foofile]
88
- if not stream_output:
89
- for res in client.predict(
90
- *tuple(args),
91
- api_name=api_name,
92
- ):
93
- print(res)
94
- res_file = client.predict(*tuple(args), api_name='/instruction_bot')
95
- res = json.load(open(res_file, "rt"))[-1][-1]
96
- print(md_to_text(res))
97
- else:
98
- print("streaming instruction_bot", flush=True)
99
- job = client.submit(*tuple(args), api_name='/instruction_bot')
100
- while not job.done():
101
- outputs_list = job.communicator.job.outputs
102
- if outputs_list:
103
- res_file = job.communicator.job.outputs[-1]
104
- res = json.load(open(res_file, "rt"))[-1][-1]
105
- print(md_to_text(res))
106
- time.sleep(0.1)
107
- print(job.outputs())
108
 
109
 
110
  import markdown # pip install markdown
 
1
  """
2
+ Client test.
3
 
4
+ Run server:
5
 
6
+ python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-256-6.9b
7
 
8
  NOTE: For private models, add --use-auth_token=True
9
 
 
17
 
18
  debug = False
19
 
 
20
  import os
21
  os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
22
  from gradio_client import Client
 
25
  if debug:
26
  print(client.view_api(all_endpoints=True))
27
 
28
+ instruction = '' # only for chat=True
29
+ iinput = '' # only for chat=True
30
  context = ''
31
  # streaming output is supported, loops over and outputs each generation in streaming mode
32
  # but leave stream_output=False for simple input/output mode
 
36
  top_p = 0.75
37
  top_k = 40
38
  num_beams = 1
39
+ max_new_tokens = 50
40
  min_new_tokens = 0
41
  early_stopping = False
42
+ max_time = 20
43
  repetition_penalty = 1.0
44
  num_return_sequences = 1
45
  do_sample = True
46
+ # only the 2 values below are used if chat=False is passed
 
 
 
 
47
  chat = False
48
+ instruction_nochat = "Who are you?"
49
+ iinput_nochat = ''
50
 
51
 
52
  def test_client_basic():
 
65
  max_time,
66
  repetition_penalty,
67
  num_return_sequences,
68
+ do_sample,
69
+ chat,
70
+ instruction_nochat,
71
+ iinput_nochat,
72
+ ]
73
+ api_name = '/submit_nochat'
74
+ res = client.predict(
75
+ *tuple(args),
76
+ api_name=api_name,
77
+ )
78
+ res_dict = dict(instruction_nochat=instruction_nochat, iinput_nochat=iinput_nochat, response=md_to_text(res))
79
+ print(res_dict)
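With the default 6.9b model loaded on the server, the printed dict has the shape {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': '...'}; the response text itself depends on the loaded model and the sampling settings above.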
80
 
81
 
82
  import markdown # pip install markdown
finetune.py CHANGED
@@ -121,7 +121,7 @@ def train(
121
  save_code: bool = False,
122
  run_id: int = None,
123
 
124
- base_model: str = 'h2oai/h2ogpt-oig-oasst1-256-6.9b',
125
  # base_model: str = 'h2oai/h2ogpt-oasst1-512-12b',
126
  # base_model: str = 'h2oai/h2ogpt-oasst1-512-20b',
127
  # base_model: str = 'EleutherAI/gpt-neox-20b',
@@ -810,7 +810,7 @@ Current Time: {}
810
 
811
 
812
  def generate_prompt(data_point, prompt_type, chat, reduced):
813
- context = data_point.get('context') if chat else ''
814
  if context is None:
815
  context = ''
816
  instruction = data_point.get('instruction')
 
121
  save_code: bool = False,
122
  run_id: int = None,
123
 
124
+ base_model: str = 'h2oai/h2ogpt-oig-oasst1-512-6.9b',
125
  # base_model: str = 'h2oai/h2ogpt-oasst1-512-12b',
126
  # base_model: str = 'h2oai/h2ogpt-oasst1-512-20b',
127
  # base_model: str = 'EleutherAI/gpt-neox-20b',
 
810
 
811
 
812
  def generate_prompt(data_point, prompt_type, chat, reduced):
813
+ context = data_point.get('context')
814
  if context is None:
815
  context = ''
816
  instruction = data_point.get('instruction')
utils.py CHANGED
@@ -1,12 +1,10 @@
1
- import contextlib
2
  import os
3
  import gc
4
  import random
5
- import shutil
6
  import time
7
  import traceback
8
  import zipfile
9
-
10
  import filelock
11
  import numpy as np
12
  import pandas as pd
@@ -95,17 +93,22 @@ def system_info_print():
95
  return "Error: %s" % str(e)
96
 
97
 
98
- def zip_data(root_dirs=None, zip_path='data.zip', base_dir='./'):
99
  try:
100
- return _zip_data(zip_path=zip_path, base_dir=base_dir, root_dirs=root_dirs)
101
  except Exception as e:
102
  traceback.print_exc()
103
  print('Exception in zipping: %s' % str(e))
104
 
105
 
106
- def _zip_data(root_dirs=None, zip_path='data.zip', base_dir='./'):
 
 
 
 
107
  assert root_dirs is not None
108
- with zipfile.ZipFile(zip_path, "w") as expt_zip:
 
109
  for root_dir in root_dirs:
110
  if root_dir is None:
111
  continue
@@ -115,7 +118,7 @@ def _zip_data(root_dirs=None, zip_path='data.zip', base_dir='./'):
115
  assert os.path.exists(file_to_archive)
116
  path_to_archive = os.path.relpath(file_to_archive, base_dir)
117
  expt_zip.write(filename=file_to_archive, arcname=path_to_archive)
118
- return "data.zip"
119
 
120
 
121
  def save_generate_output(output=None, base_model=None, save_dir=None):
 
 
1
  import os
2
  import gc
3
  import random
 
4
  import time
5
  import traceback
6
  import zipfile
7
+ from datetime import datetime
8
  import filelock
9
  import numpy as np
10
  import pandas as pd
 
93
  return "Error: %s" % str(e)
94
 
95
 
96
+ def zip_data(root_dirs=None, zip_file=None, base_dir='./'):
97
  try:
98
+ return _zip_data(zip_file=zip_file, base_dir=base_dir, root_dirs=root_dirs)
99
  except Exception as e:
100
  traceback.print_exc()
101
  print('Exception in zipping: %s' % str(e))
102
 
103
 
104
+ def _zip_data(root_dirs=None, zip_file=None, base_dir='./'):
105
+ if zip_file is None:
106
+ datetime_str = str(datetime.now()).replace(" ", "_").replace(":", "_")
107
+ host_name = os.getenv('HF_HOSTNAME', 'emptyhost')
108
+ zip_file = "data_%s_%s.zip" % (datetime_str, host_name)
109
  assert root_dirs is not None
110
+
111
+ with zipfile.ZipFile(zip_file, "w") as expt_zip:
112
  for root_dir in root_dirs:
113
  if root_dir is None:
114
  continue
 
118
  assert os.path.exists(file_to_archive)
119
  path_to_archive = os.path.relpath(file_to_archive, base_dir)
120
  expt_zip.write(filename=file_to_archive, arcname=path_to_archive)
121
+ return zip_file
122
 
123
 
124
  def save_generate_output(output=None, base_model=None, save_dir=None):
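A hedged usage sketch of the reworked zip_data: with zip_file=None it now derives a timestamped archive name, suffixed by the HF_HOSTNAME environment variable when set (directory names below are illustrative only):

from utils import zip_data

zip_file = zip_data(root_dirs=['flagged_data_points', 'save_dir'], zip_file=None, base_dir='./')
print(zip_file)  # e.g. data_2023-04-25_12_34_56.789012_emptyhost.zip when HF_HOSTNAME is unset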