pseudotensor committed on
Commit 0a5ce48
1 Parent(s): 190bc9c

Update with h2ogpt hash 24c76a5944a7bc0ee6249ecab5ff915592771e88

Files changed (1)
  1. app.py +37 -25
app.py CHANGED
@@ -27,6 +27,11 @@ from finetune import get_loaders, example_data_points, generate_prompt, get_gith
     human, bot, prompt_type_to_model_name, inv_prompt_type_to_model_lower
 from stopping import CallbackToGenerator, Stream, StoppingCriteriaSub
 
+is_hf = os.getenv("HUGGINGFACE_SPACES")
+is_gpth2oai = os.getenv("GPT_H2O_AI")
+is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
+is_low_mem = is_hf  # assumes run on 24GB consumer GPU
+
 
 def main(
         load_8bit: bool = False,
@@ -90,15 +95,22 @@ def main(
 ):
     # allow set token directly
     use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
-    # override share if in spaces
-    if os.environ.get("HUGGINGFACE_SPACES"):
-        share = False
-        base_model = 'h2oai/h2ogpt-oasst1-512-12b'
-        load_8bit = True
-        temperature = 0.7
-        top_p = 1
-        top_k = 100
+
+    if is_public:
+        temperature = 0.4
+        top_p = 0.85
+        top_k = 70
         do_sample = True
+        if is_low_mem:
+            base_model = 'h2oai/h2ogpt-oasst1-512-12b'
+            load_8bit = True
+        else:
+            base_model = 'h2oai/h2ogpt-oasst1-512-20b'
+    if is_low_mem:
+        load_8bit = True
+    if is_hf:
+        # must override share if in spaces
+        share = False
 
     # get defaults
     model_lower = base_model.lower()
@@ -202,7 +214,7 @@ def main(
             assert ex[1] in [None, '']  # should be no iinput
             assert ex[2] in [None, '']  # should be no context
             prompt = ex[0]
-            cutoff_len = 768 if os.environ.get("HUGGINGFACE_SPACES") else 2048
+            cutoff_len = 768 if is_low_mem else 2048
             inputs = stokenizer(prompt, res,
                                 return_tensors="pt",
                                 truncation=True,
@@ -526,11 +538,11 @@ def go_gradio(**kwargs):
         """
     else:
         description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
-    if os.environ.get("HUGGINGFACE_SPACES"):
+    if is_public:
        description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
        if kwargs['load_8bit']:
-            description += """<i><li> Model is loaded in 8-bit and HF spaces version has other limitations in order to fit on HF GPUs, so UX can be worse than native app.</i></li>"""
-        description += """<i><li>Model loading and unloading disabled on HF SPACES to avoid GPU OOM for multi-user environment.</i></li></ul></p>"""
+            description += """<i><li> Model is loaded in 8-bit and with other limitations in order to fit on GPUs with lower amounts of VRAM, so UX can be worse than non-hosted version.</i></li>"""
+        description += """<i><li>Model loading and unloading disabled to avoid GPU OOM for multi-user environment.</i></li></ul></p>"""
 
    if kwargs['verbose']:
        task_info_md = f"""
@@ -617,7 +629,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
            {description}
            {task_info_md}
            """)
-        if os.environ.get("HUGGINGFACE_SPACES"):
+        if is_hf:
            gr.HTML('''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
 
        # go button visible if
@@ -685,7 +697,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                                       value=kwargs['stream_output'])
            prompt_type = gr.Dropdown(prompt_types_strings,
                                      value=kwargs['prompt_type'], label="Prompt Type",
-                                      visible=not os.environ.get("HUGGINGFACE_SPACES"))
+                                      visible=not is_public)
            temperature = gr.Slider(minimum=0, maximum=3,
                                    value=kwargs['temperature'],
                                    label="Temperature",
@@ -698,12 +710,12 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                              value=kwargs['top_k'], label="Top k",
                              info='Num. tokens to sample from'
                              )
-            max_beams = 8 if not os.environ.get("HUGGINGFACE_SPACES") else 2
+            max_beams = 8 if not is_low_mem else 2
            num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
                                  value=min(max_beams, kwargs['num_beams']), label="Beams",
                                  info="Number of searches for optimal overall probability. "
                                       "Uses more GPU memory/compute")
-            max_max_new_tokens = 2048 if not os.environ.get("HUGGINGFACE_SPACES") else kwargs['max_new_tokens']
+            max_max_new_tokens = 2048 if not is_low_mem else kwargs['max_new_tokens']
            max_new_tokens = gr.Slider(
                minimum=1, maximum=max_max_new_tokens, step=1,
                value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
@@ -714,7 +726,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
            )
            early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
                                         value=kwargs['early_stopping'])
-            max_max_time = 60 * 5 if not os.environ.get("HUGGINGFACE_SPACES") else 60
+            max_max_time = 60 * 5 if not is_low_mem else 60
            max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
                                 value=min(max_max_time, kwargs['max_time']), label="Max. time",
                                 info="Max. time to search optimal output.")
@@ -724,17 +736,17 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
            num_return_sequences = gr.Slider(minimum=1, maximum=10, step=1,
                                             value=kwargs['num_return_sequences'],
                                             label="Number Returns", info="Must be <= num_beams",
-                                             visible=not os.environ.get("HUGGINGFACE_SPACES"))
+                                             visible=not is_public)
            do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
                                    value=kwargs['do_sample'])
            if kwargs['chat']:
                iinput = gr.Textbox(lines=4, label="Input",
                                    placeholder=kwargs['placeholder_input'],
-                                    visible=not os.environ.get("HUGGINGFACE_SPACES"))
+                                    visible=not is_public)
                # nominally empty for chat mode
                context = gr.Textbox(lines=1, label="Context",
                                     info="Ignored in chat mode.",
-                                     visible=not os.environ.get("HUGGINGFACE_SPACES"))
+                                     visible=not is_public)
 
            with gr.TabItem("Models"):
                with gr.Row():
@@ -744,8 +756,8 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                        model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model", value=kwargs['base_model'])
                        lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
                    with gr.Column(scale=1):
-                        load_msg = "Load Model/LORA" if not os.environ.get("HUGGINGFACE_SPACES") \
-                            else "LOAD DISABLED ON HF SPACES"
+                        load_msg = "Load Model/LORA" if not is_public \
+                            else "LOAD DISABLED FOR HOSTED DEMO"
                        load_model_button = gr.Button(load_msg)
                        model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
                        lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
@@ -811,7 +823,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                    len(history[-1]) >= 2:
                os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
-                max_length_tokenize = 512 if os.environ.get("HUGGINGFACE_SPACES") else 2048
+                max_length_tokenize = 512 if is_low_mem else 2048
                cutoff_len = max_length_tokenize*4  # restrict deberta related to max for LLM
 
                question = history[-1][0]
@@ -1025,7 +1037,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                               outputs=[model_state, model_used, lora_used, prompt_type])
        prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
        chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
-        if not os.environ.get("HUGGINGFACE_SPACES"):
+        if not is_public:
            load_model_event = load_model_button.click(**load_model_args) \
                .then(**prompt_update_args) \
                .then(**chatbot_update_args) \
@@ -1243,7 +1255,7 @@ def evaluate(
        # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
        # RuntimeError: expected scalar type Half but found Float
        # with - 256
-        max_length_tokenize = 768 - 256 if os.environ.get("HUGGINGFACE_SPACES") else 2048 - 256
+        max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
        cutoff_len = max_length_tokenize * 4  # if reaches limit, then can't generate new tokens
        output_smallest = 30 * 4
        prompt = prompt[-cutoff_len - output_smallest:]
 
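As a rough illustration (not part of the commit), the sketch below shows how the module-level flags introduced at the top of app.py respond to the two environment variables. The variable names HUGGINGFACE_SPACES and GPT_H2O_AI come from the diff; the concrete values set here are only assumptions for demonstration, since os.getenv() returns the raw string and any non-empty value is truthy.

# Hedged sketch, not part of the commit: behavior of the new flags in app.py.
import os

os.environ["GPT_H2O_AI"] = "1"             # assumed: hosted multi-user demo that is not HF Spaces
os.environ.pop("HUGGINGFACE_SPACES", None)  # assumed: not running on HF Spaces

is_hf = os.getenv("HUGGINGFACE_SPACES")    # None -> falsy
is_gpth2oai = os.getenv("GPT_H2O_AI")      # "1" -> truthy
is_public = is_hf or is_gpth2oai           # public settings and disclaimer apply
is_low_mem = is_hf                         # low-memory limits only on HF Spaces

print(bool(is_public), bool(is_low_mem))   # -> True False

Under these assumptions the app would pick the public defaults (temperature 0.4, the 20B base model) while keeping the larger token and beam limits, since is_low_mem stays falsy off HF Spaces.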