pseudotensor committed
Commit 1c674f6 · 1 Parent(s): 8c85f5b

Update with h2oGPT hash 7c5db3692798dba31c4c415429e1ca06e12dd480

Files changed (4)
  1. app.py +615 -307
  2. client_test.py +22 -50
  3. finetune.py +2 -2
  4. utils.py +11 -8
app.py CHANGED
@@ -31,7 +31,10 @@ is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
31
  is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
32
  is_low_mem = is_hf # assumes run on 24GB consumer GPU
33
  admin_pass = os.getenv("ADMIN_PASS")
 
 
34
 
 
35
 
36
  def main(
37
  load_8bit: bool = False,
@@ -40,7 +43,7 @@ def main(
40
  base_model: str = '',
41
  tokenizer_base_model: str = '',
42
  lora_weights: str = "",
43
- force_1_gpu: bool = True,
44
 
45
  prompt_type: Union[int, str] = None,
46
  # input to generation
@@ -142,11 +145,12 @@ def main(
142
  if not gradio:
143
  if eval_sharegpt_prompts_only > 0:
144
  # override default examples with shareGPT ones for human-level eval purposes only
145
- filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
146
- if not os.path.isfile(filename):
147
- os.system('wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename)
 
148
  import json
149
- data = json.load(open(filename, 'rt'))
150
  # focus on data that starts with human, else likely chopped from other data
151
  turn_start = 0 # odd in general
152
  data = [x for x in data if len(x['conversations']) > turn_start + 1 and
@@ -162,12 +166,29 @@ def main(
162
  assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
163
  output = data[i]['conversations'][turn_start + 1]['value']
164
  examplenew = example1.copy()
165
- examplenew[0] = instruction
166
- examplenew[1] = '' # no input
167
- examplenew[2] = '' # no context
 
168
  examples.append(examplenew)
169
  responses.append(output)
170
171
  with torch.device("cuda"):
172
  # ensure was set right above before examples generated
173
  assert not stream_output, "stream_output=True does not make sense with example loop"
@@ -180,7 +201,7 @@ def main(
180
  if not eval_sharegpt_as_output:
181
  model, tokenizer, device = get_model(**locals())
182
  model_state = [model, tokenizer, device, base_model]
183
- fun = partial(evaluate, model_state, debug=debug, chat=chat, save_dir=save_dir)
184
  else:
185
  assert eval_sharegpt_prompts_only > 0
186
 
@@ -191,15 +212,17 @@ def main(
191
  fun = get_response
192
  t0 = time.time()
193
  score_dump = []
194
- num_examples = len(examples)
195
 
196
  import matplotlib.pyplot as plt
197
 
198
  for exi, ex in enumerate(examples):
199
  clear_torch_cache()
200
  print("")
201
  print("START" + "=" * 100)
202
- print("Question: %s %s" % (ex[0], ('input=%s' % ex[1] if ex[1] else '')))
203
  print("-" * 105)
204
  # fun yields as generator, so have to iterate over it
205
  # Also means likely do NOT want --stream_output=True, else would show all generations
@@ -208,14 +231,14 @@ def main(
208
  if smodel:
209
  score_with_prompt = False
210
  if score_with_prompt:
211
- data_point = dict(instruction=ex[0], input=ex[1])
212
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
213
  prompt = prompter.generate_prompt(data_point)
214
  else:
215
  # just raw input and output
216
- assert ex[1] in [None, ''] # should be no iinput
217
- assert ex[2] in [None, ''] # should be no context
218
- prompt = ex[0]
219
  cutoff_len = 768 if is_low_mem else 2048
220
  inputs = stokenizer(prompt, res,
221
  return_tensors="pt",
@@ -228,10 +251,11 @@ def main(
228
  traceback.print_exc()
229
  score = 0.0
230
  clear_torch_cache()
231
- except RuntimeError as e:
232
  if 'Expected all tensors to be on the same device' in str(e) or \
233
  'expected scalar type Half but found Float' in str(e) or \
234
- 'probability tensor contains either' in str(e):
 
235
  print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
236
  flush=True)
237
  traceback.print_exc()
@@ -242,29 +266,16 @@ def main(
242
  print("SCORE %s: %s" % (exi, score), flush=True)
243
  score_dump.append(ex + [prompt, res, score])
244
  # dump every score in case abort
245
- scoring_path = 'scoring'
246
- os.makedirs(scoring_path, exist_ok=True)
247
- if eval_sharegpt_as_output:
248
- used_base_model = 'gpt35'
249
- used_lora_weights = ''
250
- else:
251
- used_base_model = str(base_model.split('/')[-1])
252
- used_lora_weights = str(lora_weights.split('/')[-1])
253
- df_scores = pd.DataFrame(score_dump, columns=eval_func_param_names + ['prompt', 'response', 'score'])
254
- filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
255
- eval_sharegpt_prompts_only_seed,
256
- eval_sharegpt_as_output,
257
- used_base_model,
258
- used_lora_weights)
259
- filename = os.path.join(scoring_path, filename)
260
- df_scores.to_parquet(filename, index=False)
261
  # plot histogram so far
262
  plt.figure(figsize=(10, 10))
263
  plt.hist(df_scores['score'], bins=20)
264
  score_avg = np.mean(df_scores['score'])
265
  score_median = np.median(df_scores['score'])
266
  plt.title("Score avg: %s median: %s" % (score_avg, score_median))
267
- plt.savefig(filename.replace('.parquet', '.png'))
268
  plt.close()
269
 
270
  print("END" + "=" * 102)
@@ -273,7 +284,8 @@ def main(
273
  print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
274
  t1 = time.time()
275
  print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
276
- return
 
277
  if gradio:
278
  go_gradio(**locals())
279
 
@@ -287,7 +299,9 @@ def get_device():
287
  return device
288
 
289
 
290
- def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type, force_1_gpu=True, use_auth_token=False):
 
 
291
  """
292
  Ensure model gets on correct device
293
  :param base_model:
@@ -295,6 +309,8 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
295
  :param load_half:
296
  :param model_kwargs:
297
  :param reward_type:
 
 
298
  :return:
299
  """
300
  with init_empty_weights():
@@ -319,14 +335,14 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
319
  device_map.update(device_map_model)
320
  print('device_map: %s' % device_map, flush=True)
321
 
322
- if force_1_gpu:
323
  # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
324
  # So avoid for now, just put on first GPU, unless score_model, put on last
325
  n_gpus = torch.cuda.device_count()
326
  if reward_type:
327
  device_map = {'': n_gpus - 1}
328
  else:
329
- device_map = {'': 0}
330
 
331
  load_in_8bit = model_kwargs.get('load_in_8bit', False)
332
  model_kwargs['device_map'] = device_map
@@ -351,7 +367,7 @@ def get_model(
351
  base_model: str = '',
352
  tokenizer_base_model: str = '',
353
  lora_weights: str = "",
354
- force_1_gpu: bool = False,
355
 
356
  llama_type: bool = None,
357
  reward_type: bool = None,
@@ -371,7 +387,7 @@ def get_model(
371
  :param base_model: name/path of base model
372
  :param tokenizer_base_model: name/path of tokenizer
373
  :param lora_weights: name/path
374
- :param force_1_gpu:
375
  :param llama_type: whether LLaMa type model
376
  :param reward_type: reward type model for sequence classification
377
  :param local_files_only: use local files instead of from HF
@@ -432,7 +448,7 @@ def get_model(
432
  with torch.device("cuda"):
433
  if infer_devices:
434
  model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
435
- force_1_gpu=force_1_gpu, use_auth_token=use_auth_token)
436
  else:
437
  if load_half and not load_8bit:
438
  model = model_loader.from_pretrained(
@@ -511,7 +527,6 @@ def get_score_model(**kwargs):
511
 
512
 
513
  def go_gradio(**kwargs):
514
-
515
  # get default model
516
  all_kwargs = kwargs.copy()
517
  all_kwargs.update(locals())
@@ -526,11 +541,10 @@ def go_gradio(**kwargs):
526
  smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
527
 
528
  if 'mbart-' in kwargs['model_lower']:
529
- instruction_label = "Text to translate"
530
  else:
531
- instruction_label = "Instruction"
532
- if kwargs['chat']:
533
- instruction_label = "You (Shift-Enter or push Submit to send message)"
534
 
535
  title = 'h2oGPT'
536
  if kwargs['verbose']:
@@ -542,9 +556,9 @@ def go_gradio(**kwargs):
542
  else:
543
  description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
544
  if is_public:
545
- description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
546
  if kwargs['load_8bit']:
547
- description += """<i><li> Model is loaded in 8-bit, model loading-unloading is disabled, and other limitations exist in order to fit on GPUs with lower amounts of VRAM, so UX can be worse than non-hosted version.</i></li>"""
548
  description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
549
  description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
550
 
@@ -630,6 +644,7 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
630
  return chat_message
631
  else:
632
  raise ValueError(f"Invalid message for Chatbot component: {chat_message}")
 
633
  Chatbot._postprocess_chat_messages = _postprocess_chat_messages
634
 
635
  demo = gr.Blocks(theme=gr.themes.Soft(**colors_dict), css=css_code, title="h2oGPT", analytics_enabled=False)
@@ -645,14 +660,32 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
645
  lora_options = [kwargs['lora_weights'].strip()] + lora_options
646
  # always add in no lora case
647
  # add fake space so doesn't go away in gradio dropdown
648
- lora_options = [' '] + kwargs['extra_lora_options']
649
 
650
- output_label0 = f'h2oGPT [Model: {kwargs.get("base_model")}]' if kwargs.get('base_model') else 'h2oGPT [ !!! Please Load Model in Models Tab !!! ]'
651
 
652
  with demo:
653
  # avoid actual model/tokenizer here or anything that would be bad to deepcopy
654
  # https://github.com/gradio-app/gradio/issues/3558
655
  model_state = gr.State(['model', 'tokenizer', device, kwargs['base_model']])
 
656
  model_options_state = gr.State([model_options])
657
  lora_options_state = gr.State([lora_options])
658
  gr.Markdown(
@@ -663,57 +696,69 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
663
  {task_info_md}
664
  """)
665
  if is_hf:
666
- gr.HTML('''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
 
667
 
668
  # go button visible if
669
- base_wanted = bool(kwargs['base_model']) and kwargs['login_mode_if_model0']
670
  go_btn = gr.Button(value="ENTER", visible=base_wanted, variant="primary")
671
  normal_block = gr.Row(visible=not base_wanted)
672
  with normal_block:
673
  with gr.Tabs():
674
  with gr.Row():
675
- if not kwargs['chat']:
676
- with gr.Column():
677
- instruction = gr.Textbox(
678
- lines=4, label=instruction_label,
679
- placeholder=kwargs['placeholder_instruction'],
680
- )
681
- iinput = gr.Textbox(lines=4, label="Input",
682
- placeholder=kwargs['placeholder_input'])
683
  flag_btn = gr.Button("Flag")
684
  if kwargs['score_model']:
685
- if not kwargs['auto_score']:
686
  with gr.Column():
687
- score_btn = gr.Button("Score last prompt & response")
688
- score_text = gr.Textbox("Response Score: NA", show_label=False)
689
  else:
690
  score_text = gr.Textbox("Response Score: NA", show_label=False)
691
- with gr.Column():
692
- if kwargs['chat']:
693
- text_output = gr.Chatbot(label=output_label0).style(height=kwargs['height'] or 400)
694
- with gr.Row():
695
- with gr.Column(scale=50):
696
- instruction = gr.Textbox(
697
- lines=4, label=instruction_label,
698
- placeholder=kwargs['placeholder_instruction'],
699
- )
700
- with gr.Row(): # .style(equal_height=False, equal_width=False):
701
- submit = gr.Button(value='Submit').style(full_width=False, size='sm')
702
- stop_btn = gr.Button(value="Stop").style(full_width=False, size='sm')
703
- with gr.Row():
704
- clear = gr.Button("New Conversation")
705
- flag_btn = gr.Button("Flag")
706
- if kwargs['score_model']:
707
- if not kwargs['auto_score']:
708
- with gr.Column():
709
- score_btn = gr.Button("Score last prompt & response").style(full_width=False, size='sm')
710
- score_text = gr.Textbox("Response Score: NA", show_label=False)
711
- else:
712
- score_text = gr.Textbox("Response Score: NA", show_label=False)
713
- retry = gr.Button("Regenerate")
714
- undo = gr.Button("Undo")
715
- else:
716
- text_output = gr.Textbox(lines=5, label=output_label0)
717
  with gr.TabItem("Input/Output"):
718
  with gr.Row():
719
  if 'mbart-' in kwargs['model_lower']:
@@ -731,7 +776,12 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
731
  prompt_type = gr.Dropdown(prompt_types_strings,
732
  value=kwargs['prompt_type'], label="Prompt Type",
733
  visible=not is_public)
734
- temperature = gr.Slider(minimum=0, maximum=3,
735
  value=kwargs['temperature'],
736
  label="Temperature",
737
  info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
@@ -770,30 +820,45 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
770
  value=kwargs['num_return_sequences'],
771
  label="Number Returns", info="Must be <= num_beams",
772
  visible=not is_public)
773
- do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
774
- value=kwargs['do_sample'])
775
- if kwargs['chat']:
776
- iinput = gr.Textbox(lines=4, label="Input",
777
- placeholder=kwargs['placeholder_input'],
778
- visible=not is_public)
779
- # nominally empty for chat mode
780
- context = gr.Textbox(lines=1, label="Context",
781
- info="Ignored in chat mode.",
782
- visible=not is_public)
783
 
784
  with gr.TabItem("Models"):
 
785
  with gr.Row():
 
 
786
  with gr.Column():
787
  with gr.Row(scale=1):
788
  with gr.Column(scale=50):
789
- model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model", value=kwargs['base_model'])
790
- lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
 
 
791
  with gr.Column(scale=1):
792
- load_msg = "Load Model/LORA" if not is_public \
793
- else "LOAD DISABLED FOR HOSTED DEMO"
794
  load_model_button = gr.Button(load_msg)
795
  model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
796
- lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
 
797
  with gr.Row(scale=1):
798
  with gr.Column(scale=50):
799
  new_model = gr.Textbox(label="New Model HF name/path")
@@ -801,6 +866,30 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
801
  with gr.Column(scale=1):
802
  add_model_button = gr.Button("Add new model name")
803
  add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
804
  with gr.TabItem("System"):
805
  system_row = gr.Row(visible=not is_public)
806
  admin_pass_textbox = gr.Textbox(label="Admin Password", type='password', visible=is_public)
@@ -830,6 +919,9 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
830
  kwargs_evaluate = {k: v for k, v in all_kwargs.items() if k in inputs_kwargs_list}
831
  fun = partial(evaluate,
832
  **kwargs_evaluate)
833
 
834
  dark_mode_btn = gr.Button("Dark Mode", variant="primary").style(
835
  size="sm",
@@ -847,193 +939,320 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
847
  }""",
848
  api_name="dark",
849
  )
850
- if not kwargs['chat']:
851
- submit = gr.Button("Submit")
852
- submit_event = submit.click(fun, inputs=[model_state] + inputs_list, outputs=text_output, api_name='submit')
853
 
854
  # examples after submit or any other buttons for chat or no chat
855
  if kwargs['examples'] is not None and kwargs['show_examples']:
856
  gr.Examples(examples=kwargs['examples'], inputs=inputs_list)
857
 
858
  # Score
859
- def score_last_response(*args):
860
  """ Similar to user() """
861
  args_list = list(args)
862
- history = args_list[-1]
863
- if history is None:
864
- print("Bad history in scoring last response, fix for now", flush=True)
865
- history = []
866
- if smodel is not None and \
867
- stokenizer is not None and \
868
- sdevice is not None and \
869
- history is not None and len(history) > 0 and \
870
- history[-1] is not None and \
871
- len(history[-1]) >= 2:
872
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
873
-
874
- max_length_tokenize = 512 if is_low_mem else 2048
875
- cutoff_len = max_length_tokenize*4 # restrict deberta related to max for LLM
876
-
877
- question = history[-1][0]
878
- question = question[-cutoff_len:]
879
-
880
- answer = history[-1][1]
881
- answer = answer[-cutoff_len:]
882
-
883
- inputs = stokenizer(question, answer,
884
- return_tensors="pt",
885
- truncation=True,
886
- max_length=max_length_tokenize).to(smodel.device)
887
- try:
888
- score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
889
- except torch.cuda.OutOfMemoryError as e:
890
- print("GPU OOM: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
891
- del inputs
892
  traceback.print_exc()
893
  clear_torch_cache()
894
- return 'Response Score: GPU OOM'
895
- except RuntimeError as e:
896
- if 'Expected all tensors to be on the same device' in str(e) or \
897
- 'expected scalar type Half but found Float' in str(e) or \
898
- 'probability tensor contains either' in str(e):
899
- print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
900
- traceback.print_exc()
901
- clear_torch_cache()
902
- return 'Response Score: GPU Error'
903
- else:
904
- raise
905
- os.environ['TOKENIZERS_PARALLELISM'] = 'true'
906
- return 'Response Score: {:.1%}'.format(score)
907
- else:
908
- return 'Response Score: NA'
909
 
910
  if kwargs['score_model']:
911
  score_args = dict(fn=score_last_response,
912
  inputs=inputs_list + [text_output],
913
  outputs=[score_text],
914
  )
915
  if not kwargs['auto_score']:
916
- score_event = score_btn.click(**score_args, queue=stream_output, api_name='score')
917
-
918
- if kwargs['chat']:
919
- def user(*args, undo=False, sanitize_user_prompt=True):
920
- args_list = list(args)
921
- user_message = args_list[0]
922
- input1 = args_list[1]
923
- context1 = args_list[2]
924
- if input1 and not user_message.endswith(':'):
925
- user_message1 = user_message + ":" + input1
926
- elif input1:
927
- user_message1 = user_message + input1
928
- else:
929
- user_message1 = user_message
930
- if sanitize_user_prompt:
931
- from better_profanity import profanity
932
- user_message1 = profanity.censor(user_message1)
933
 
934
- history = args_list[-1]
935
- if undo and history:
936
- history.pop()
937
- args_list = args_list[:-1]
938
- if history is None:
 
 
939
  print("Bad history, fix for now", flush=True)
940
- history = []
941
- if undo:
942
- return "", history
943
- else:
944
- return "", history + [[user_message1, None]]
945
-
946
- def bot(*args, retry=False):
947
- args_list = list(args)
948
- history = args_list[-1]
949
- if retry and history:
950
- history.pop()
951
- if not history:
952
- print("No history", flush=True)
953
- return
954
- instruction1 = history[-1][0]
955
  context1 = ''
956
- if kwargs['chat_history'] > 0:
957
- prompt_type1 = args_list[prompt_type_arg_id]
958
- context1 = ''
959
- for histi in range(len(history) - 1):
960
- data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
961
- context1 += generate_prompt(data_point, prompt_type1, kwargs['chat'], reduced=True)[0].replace(
962
- '<br>', '\n')
963
- if not context1.endswith('\n'):
964
- context1 += '\n'
965
- if context1 and not context1.endswith('\n'):
966
- context1 += '\n' # ensure if terminates abruptly, then human continues on next line
967
- args_list[0] = instruction1
968
- # only include desired chat history
969
- args_list[2] = context1[-kwargs['chat_history']:]
970
- model_state1 = args_list[-2]
971
- args_list = args_list[:-2]
972
- fun1 = partial(evaluate,
973
- model_state1,
974
- **kwargs_evaluate)
975
- try:
976
- for output in fun1(*tuple(args_list)):
977
- bot_message = output
978
- history[-1][1] = bot_message
979
- yield history
980
- except StopIteration:
981
  yield history
982
- except RuntimeError as e:
983
- if "generator raised StopIteration" in str(e):
984
- # assume last entry was bad, undo
985
- history.pop()
986
- yield history
987
- raise
988
- except Exception as e:
989
- # put error into user input
990
- history[-1][0] = "Exception: %s" % str(e)
991
  yield history
992
- raise
993
- return
994
 
995
- user_args = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt']),
996
- inputs=inputs_list + [text_output],
997
- outputs=[instruction, text_output],
998
- )
999
- bot_args = dict(fn=bot,
1000
- inputs=inputs_list + [model_state] + [text_output],
1001
- outputs=[text_output],
1002
- )
1003
- retry_bot_args = dict(fn=functools.partial(bot, retry=True),
1004
- inputs=inputs_list + [model_state] + [text_output],
1005
- outputs=[text_output],
1006
- )
1007
- undo_user_args = dict(fn=functools.partial(user, undo=True),
1008
- inputs=inputs_list + [text_output],
1009
- outputs=[instruction, text_output],
1010
- )
1011
-
1012
- if kwargs['auto_score']:
1013
- submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction').then(
1014
- **bot_args, api_name='instruction_bot',
1015
- ).then(**score_args, api_name='instruction_bot_score').then(clear_torch_cache)
1016
- submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit').then(
1017
- **bot_args, api_name='submit_bot',
1018
- ).then(**score_args, api_name='submit_bot_score').then(clear_torch_cache)
1019
- submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry').then(
1020
- **retry_bot_args, api_name='retry_bot',
1021
- ).then(**score_args, api_name='retry_bot_score').then(clear_torch_cache)
1022
- submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo').then(**score_args, api_name='undo_score')
1023
- else:
1024
- submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction').then(
1025
- **bot_args, api_name='instruction_bot',
1026
- ).then(clear_torch_cache)
1027
- submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit').then(
1028
- **bot_args, api_name='submit_bot',
1029
- ).then(clear_torch_cache)
1030
- submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry').then(
1031
- **retry_bot_args, api_name='retry_bot',
1032
- ).then(clear_torch_cache)
1033
- submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo')
1034
- clear.click(lambda: None, None, text_output, queue=False, api_name='clear')
1035
-
1036
- def load_model(model_name, lora_weights, model_state_old, prompt_type_old):
1037
  # ensure old model removed from GPU memory
1038
  if kwargs['debug']:
1039
  print("Pre-switch pre-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
@@ -1058,23 +1277,35 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
1058
  clear_torch_cache()
1059
  if kwargs['debug']:
1060
  print("Pre-switch post-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1061
- all_kwargs['base_model'] = model_name.strip()
1062
  model_lower = model_name.strip().lower()
1063
  if model_lower in inv_prompt_type_to_model_lower:
1064
  prompt_type1 = inv_prompt_type_to_model_lower[model_lower]
1065
  else:
1066
  prompt_type1 = prompt_type_old
1067
 
1068
- all_kwargs['lora_weights'] = lora_weights.strip()
1069
- model1, tokenizer1, device1 = get_model(**all_kwargs)
1070
  clear_torch_cache()
1071
 
1072
  if kwargs['debug']:
1073
  print("Post-switch GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1074
- return {model_state: [model1, tokenizer1, device1, model_name],
1075
- model_used: model_name,
1076
- lora_used: lora_weights,
1077
- prompt_type: prompt_type1}
1078
 
1079
  def dropdown_prompt_type_list(x):
1080
  return gr.Dropdown.update(value=x)
@@ -1083,54 +1314,92 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
1083
  return gr.Textbox.update(label=f'h2oGPT [Model: {model_used_in}]')
1084
 
1085
  load_model_args = dict(fn=load_model,
1086
- inputs=[model_choice, lora_choice, model_state, prompt_type],
 
1087
  outputs=[model_state, model_used, lora_used, prompt_type])
1088
  prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
1089
  chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
 
1090
  if not is_public:
1091
  load_model_event = load_model_button.click(**load_model_args) \
1092
- .then(**prompt_update_args) \
1093
- .then(**chatbot_update_args) \
1094
- .then(clear_torch_cache)
1095
 
1096
  def dropdown_model_list(list0, x):
1097
  new_state = [list0[0] + [x]]
1098
  new_options = [*new_state[0]]
1099
- return gr.Dropdown.update(value=x, choices=new_options), '', new_state
 
 
1100
 
1101
  add_model_event = add_model_button.click(fn=dropdown_model_list,
1102
  inputs=[model_options_state, new_model],
1103
- outputs=[model_choice, new_model, model_options_state])
1104
 
1105
- def dropdown_lora_list(list0, x):
1106
  new_state = [list0[0] + [x]]
1107
  new_options = [*new_state[0]]
1108
- return gr.Dropdown.update(value=x, choices=new_options), '', new_state
1109
 
1110
  add_lora_event = add_lora_button.click(fn=dropdown_lora_list,
1111
- inputs=[lora_options_state, new_lora],
1112
- outputs=[lora_choice, new_lora, lora_options_state])
1113
 
1114
  go_btn.click(lambda: gr.update(visible=False), None, go_btn, api_name="go") \
1115
  .then(lambda: gr.update(visible=True), None, normal_block) \
1116
  .then(**load_model_args).then(**prompt_update_args)
1117
1118
  # callback for logging flagged input/output
1119
  callback.setup(inputs_list + [text_output], "flagged_data_points")
1120
  flag_btn.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
1121
  api_name='flag')
 
 
1122
 
1123
  def get_system_info():
1124
  return gr.Textbox.update(value=system_info_print())
1125
 
1126
  system_event = system_btn.click(get_system_info, outputs=system_text, api_name='system_info')
1127
 
1128
- if kwargs['chat']:
1129
-
1130
- # don't pass text_output, don't want to clear output, just stop it
1131
- # FIXME: have to click once to stop output and second time to stop GPUs going
1132
- stop_btn.click(lambda: None, None, None, cancels=[submit_event, submit_event2, submit_event3],
1133
- queue=False, api_name='stop').then(clear_torch_cache)
1134
 
1135
  demo.queue(concurrency_count=1)
1136
  favicon_path = "h2o-logo.svg"
@@ -1141,10 +1410,16 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
1141
 
1142
 
1143
  input_args_list = ['model_state']
1144
- inputs_kwargs_list = ['debug', 'chat', 'save_dir', 'hard_stop_list', 'sanitize_bot_response', 'model_state0']
1145
 
1146
 
1147
  def get_inputs_list(inputs_dict, model_lower):
1148
  inputs_list_names = list(inspect.signature(evaluate).parameters)
1149
  inputs_list = []
1150
  for k in inputs_list_names:
@@ -1159,9 +1434,6 @@ def get_inputs_list(inputs_dict, model_lower):
1159
  return inputs_list
1160
 
1161
 
1162
- # index of prompt_type in evaluate function, after model_state
1163
- prompt_type_arg_id = 4
1164
-
1165
  eval_func_param_names = ['instruction',
1166
  'iinput',
1167
  'context',
@@ -1178,6 +1450,9 @@ eval_func_param_names = ['instruction',
1178
  'repetition_penalty',
1179
  'num_return_sequences',
1180
  'do_sample',
1181
  ]
1182
 
1183
 
@@ -1200,12 +1475,14 @@ def evaluate(
1200
  repetition_penalty,
1201
  num_return_sequences,
1202
  do_sample,
1203
  # END NOTE: Examples must have same order of parameters
1204
  src_lang=None,
1205
  tgt_lang=None,
1206
  debug=False,
1207
  save_dir=None,
1208
- chat=False,
1209
  hard_stop_list=None,
1210
  sanitize_bot_response=True,
1211
  model_state0=None,
@@ -1214,10 +1491,15 @@ def evaluate(
1214
  if debug:
1215
  locals_dict = locals().copy()
1216
  locals_dict.pop('model_state', None)
 
1217
  print(locals_dict)
1218
 
1219
  no_model_msg = "Please choose a base model with --base_model (CLI) or in Models Tab (gradio).\nThen start New Conversation"
1220
1221
  if model_state is not None and len(model_state) == 4 and not isinstance(model_state[0], str):
1222
  # try to free-up original model (i.e. list was passed as reference)
1223
  if model_state0 is not None and model_state0[0] is not None:
@@ -1234,10 +1516,18 @@ def evaluate(
1234
  else:
1235
  raise AssertionError(no_model_msg)
1236
1237
  assert base_model.strip(), no_model_msg
1238
  assert model, "Model is missing"
1239
  assert tokenizer, "Tokenizer is missing"
1240
1241
  data_point = dict(context=context, instruction=instruction, input=iinput)
1242
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
1243
  prompt = prompter.generate_prompt(data_point)
@@ -1272,16 +1562,16 @@ def evaluate(
1272
  elif prompt_type == 'instruct_vicuna':
1273
  # even below is not enough, generic strings and many ways to encode
1274
  stop_words = [
1275
- '### Human:',
1276
- """
1277
  ### Human:""",
1278
- """
1279
  ### Human:
1280
  """,
1281
- '### Assistant:',
1282
- """
1283
  ### Assistant:""",
1284
- """
1285
  ### Assistant:
1286
  """,
1287
  ]
@@ -1299,7 +1589,7 @@ def evaluate(
1299
  if tokenizer.pad_token:
1300
  stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
1301
  # handle fake \n added
1302
- stop_words_ids = [x[1:] if y[0] == '\n' else x for x,y in zip(stop_words_ids, stop_words)]
1303
  # build stopper
1304
  stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
1305
  else:
@@ -1397,15 +1687,18 @@ def evaluate(
1397
  traceback.print_exc()
1398
  clear_torch_cache()
1399
  return
1400
- except RuntimeError as e:
1401
  if 'Expected all tensors to be on the same device' in str(e) or \
1402
  'expected scalar type Half but found Float' in str(e) or \
1403
- 'probability tensor contains either' in str(e):
 
1404
  print(
1405
  "GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
1406
  flush=True)
1407
  traceback.print_exc()
1408
  clear_torch_cache()
 
 
1409
  return
1410
  else:
1411
  raise
@@ -1456,6 +1749,7 @@ def get_generate_params(model_lower, chat,
1456
  if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
1457
  prompt_type = inv_prompt_type_to_model_lower[model_lower]
1458
 
 
1459
  if show_examples is None:
1460
  if chat:
1461
  show_examples = False
@@ -1516,7 +1810,8 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
1516
  else:
1517
  prompt_type = ''
1518
  examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",
1519
- stream_output, prompt_type or 'plain', 0.1, 0.75, 40, 4, 256, 0, False, max_time_defaults, 1.0, 1, False]]
 
1520
  task_info = "No task"
1521
  if prompt_type == 'instruct':
1522
  task_info = "Answer question or follow imperative as instruction with optionally input."
@@ -1551,6 +1846,7 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
1551
  repetition_penalty = repetition_penalty or 1.07
1552
  num_return_sequences = min(num_beams, num_return_sequences or 1)
1553
  do_sample = False if do_sample is None else do_sample
 
1554
  params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
1555
  early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
1556
 
@@ -1594,6 +1890,18 @@ y = np.random.randint(0, 1, 100)
1594
  src_lang = "English"
1595
  tgt_lang = "Russian"
1596
1597
  return placeholder_instruction, placeholder_input, \
1598
  stream_output, show_examples, \
1599
  prompt_type, temperature, top_p, top_k, num_beams, \
 
31
  is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
32
  is_low_mem = is_hf # assumes run on 24GB consumer GPU
33
  admin_pass = os.getenv("ADMIN_PASS")
34
+ # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
35
+ raise_generate_gpu_exceptions = True
36
 
37
+ eval_extra_columns = ['prompt', 'response', 'score']
38
 
39
  def main(
40
  load_8bit: bool = False,
 
43
  base_model: str = '',
44
  tokenizer_base_model: str = '',
45
  lora_weights: str = "",
46
+ gpu_id: int = 0, # if infer_devices = True and gpu_id != -1
47
 
48
  prompt_type: Union[int, str] = None,
49
  # input to generation
 
145
  if not gradio:
146
  if eval_sharegpt_prompts_only > 0:
147
  # override default examples with shareGPT ones for human-level eval purposes only
148
+ eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
149
+ if not os.path.isfile(eval_filename):
150
+ os.system(
151
+ 'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
152
  import json
153
+ data = json.load(open(eval_filename, 'rt'))
154
  # focus on data that starts with human, else likely chopped from other data
155
  turn_start = 0 # odd in general
156
  data = [x for x in data if len(x['conversations']) > turn_start + 1 and
 
166
  assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
167
  output = data[i]['conversations'][turn_start + 1]['value']
168
  examplenew = example1.copy()
169
+ assert not chat, "No gradio must use chat=False, uses nochat isntruct"
170
+ examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
171
+ examplenew[eval_func_param_names.index('iinput_nochat')] = '' # no input
172
+ examplenew[eval_func_param_names.index('context')] = '' # no context
173
  examples.append(examplenew)
174
  responses.append(output)
175
 
176
+ num_examples = len(examples)
177
+ scoring_path = 'scoring'
178
+ os.makedirs(scoring_path, exist_ok=True)
179
+ if eval_sharegpt_as_output:
180
+ used_base_model = 'gpt35'
181
+ used_lora_weights = ''
182
+ else:
183
+ used_base_model = str(base_model.split('/')[-1])
184
+ used_lora_weights = str(lora_weights.split('/')[-1])
185
+ eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
186
+ eval_sharegpt_prompts_only_seed,
187
+ eval_sharegpt_as_output,
188
+ used_base_model,
189
+ used_lora_weights)
190
+ eval_filename = os.path.join(scoring_path, eval_filename)
191
+
192
  with torch.device("cuda"):
193
  # ensure was set right above before examples generated
194
  assert not stream_output, "stream_output=True does not make sense with example loop"
 
201
  if not eval_sharegpt_as_output:
202
  model, tokenizer, device = get_model(**locals())
203
  model_state = [model, tokenizer, device, base_model]
204
+ fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir)
205
  else:
206
  assert eval_sharegpt_prompts_only > 0
207
 
 
212
  fun = get_response
213
  t0 = time.time()
214
  score_dump = []
 
215
 
216
  import matplotlib.pyplot as plt
217
 
218
  for exi, ex in enumerate(examples):
219
+ instruction = ex[eval_func_param_names.index('instruction_nochat')]
220
+ iinput = ex[eval_func_param_names.index('iinput_nochat')]
221
+ context = ex[eval_func_param_names.index('context')]
222
  clear_torch_cache()
223
  print("")
224
  print("START" + "=" * 100)
225
+ print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
226
  print("-" * 105)
227
  # fun yields as generator, so have to iterate over it
228
  # Also means likely do NOT want --stream_output=True, else would show all generations
 
231
  if smodel:
232
  score_with_prompt = False
233
  if score_with_prompt:
234
+ data_point = dict(instruction=instruction, input=iinput, context=context)
235
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
236
  prompt = prompter.generate_prompt(data_point)
237
  else:
238
  # just raw input and output
239
+ assert iinput in [None, ''] # should be no iinput
240
+ assert context in [None, ''] # should be no context
241
+ prompt = instruction
242
  cutoff_len = 768 if is_low_mem else 2048
243
  inputs = stokenizer(prompt, res,
244
  return_tensors="pt",
 
251
  traceback.print_exc()
252
  score = 0.0
253
  clear_torch_cache()
254
+ except (Exception, RuntimeError) as e:
255
  if 'Expected all tensors to be on the same device' in str(e) or \
256
  'expected scalar type Half but found Float' in str(e) or \
257
+ 'probability tensor contains either' in str(e) or \
258
+ 'cublasLt ran into an error!' in str(e):
259
  print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
260
  flush=True)
261
  traceback.print_exc()
 
266
  print("SCORE %s: %s" % (exi, score), flush=True)
267
  score_dump.append(ex + [prompt, res, score])
268
  # dump every score in case abort
269
+ df_scores = pd.DataFrame(score_dump,
270
+ columns=eval_func_param_names + eval_extra_columns)
271
+ df_scores.to_parquet(eval_filename, index=False)
272
  # plot histogram so far
273
  plt.figure(figsize=(10, 10))
274
  plt.hist(df_scores['score'], bins=20)
275
  score_avg = np.mean(df_scores['score'])
276
  score_median = np.median(df_scores['score'])
277
  plt.title("Score avg: %s median: %s" % (score_avg, score_median))
278
+ plt.savefig(eval_filename.replace('.parquet', '.png'))
279
  plt.close()
280
 
281
  print("END" + "=" * 102)
 
284
  print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
285
  t1 = time.time()
286
  print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
287
+ return eval_filename
288
+
289
  if gradio:
290
  go_gradio(**locals())
291
 
 
299
  return device
300
 
301
 
302
+ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
303
+ gpu_id=0,
304
+ use_auth_token=False):
305
  """
306
  Ensure model gets on correct device
307
  :param base_model:
 
309
  :param load_half:
310
  :param model_kwargs:
311
  :param reward_type:
312
+ :param gpu_id:
313
+ :param use_auth_token:
314
  :return:
315
  """
316
  with init_empty_weights():
 
335
  device_map.update(device_map_model)
336
  print('device_map: %s' % device_map, flush=True)
337
 
338
+ if gpu_id >= 0:
339
  # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
340
  # So avoid for now, just put on first GPU, unless score_model, put on last
341
  n_gpus = torch.cuda.device_count()
342
  if reward_type:
343
  device_map = {'': n_gpus - 1}
344
  else:
345
+ device_map = {'': min(n_gpus - 1, gpu_id)}
346
 
347
  load_in_8bit = model_kwargs.get('load_in_8bit', False)
348
  model_kwargs['device_map'] = device_map
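Note on the gpu_id change in the hunk above: the commit replaces the old boolean force_1_gpu with an integer gpu_id (-1 lets the model spread across all GPUs; 0..n_gpus-1 pins it to one GPU, with any reward/score model kept on the last GPU). A minimal sketch of that placement logic, assuming torch is available; pick_device_map is an illustrative helper, not a function in app.py:

    import torch

    def pick_device_map(gpu_id: int, reward_type: bool, inferred_device_map: dict) -> dict:
        # gpu_id < 0: keep the device_map inferred by accelerate (model spread over GPUs)
        if gpu_id < 0:
            return inferred_device_map
        n_gpus = torch.cuda.device_count()
        # put the reward/score model on the last GPU so it does not compete with the main model
        if reward_type:
            return {'': n_gpus - 1}
        # otherwise pin the whole model to the requested GPU (clamped to available GPUs)
        return {'': min(n_gpus - 1, gpu_id)}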
 
367
  base_model: str = '',
368
  tokenizer_base_model: str = '',
369
  lora_weights: str = "",
370
+ gpu_id: int = 0,
371
 
372
  llama_type: bool = None,
373
  reward_type: bool = None,
 
387
  :param base_model: name/path of base model
388
  :param tokenizer_base_model: name/path of tokenizer
389
  :param lora_weights: name/path
390
+ :param gpu_id: which GPU (0..n_gpus-1) or allow all GPUs if relevant (-1)
391
  :param llama_type: whether LLaMa type model
392
  :param reward_type: reward type model for sequence classification
393
  :param local_files_only: use local files instead of from HF
 
448
  with torch.device("cuda"):
449
  if infer_devices:
450
  model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
451
+ gpu_id=gpu_id, use_auth_token=use_auth_token)
452
  else:
453
  if load_half and not load_8bit:
454
  model = model_loader.from_pretrained(
 
527
 
528
 
529
  def go_gradio(**kwargs):
 
530
  # get default model
531
  all_kwargs = kwargs.copy()
532
  all_kwargs.update(locals())
 
541
  smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
542
 
543
  if 'mbart-' in kwargs['model_lower']:
544
+ instruction_label_nochat = "Text to translate"
545
  else:
546
+ instruction_label_nochat = "Instruction"
547
+ instruction_label = "You (Shift-Enter or push Submit to send message)"
 
548
 
549
  title = 'h2oGPT'
550
  if kwargs['verbose']:
 
556
  else:
557
  description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
558
  if is_public:
559
+ description += """<p><b> DISCLAIMERS: </b><ul><i><li>The model was trained on The Pile and other data, which may contain objectionable content. Use at own risk.</i></li>"""
560
  if kwargs['load_8bit']:
561
+ description += """<i><li> Model is loaded in 8-bit and has other restrictions on this host. UX can be worse than non-hosted version.</i></li>"""
562
  description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
563
  description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
564
 
 
644
  return chat_message
645
  else:
646
  raise ValueError(f"Invalid message for Chatbot component: {chat_message}")
647
+
648
  Chatbot._postprocess_chat_messages = _postprocess_chat_messages
649
 
650
  demo = gr.Blocks(theme=gr.themes.Soft(**colors_dict), css=css_code, title="h2oGPT", analytics_enabled=False)
 
660
  lora_options = [kwargs['lora_weights'].strip()] + lora_options
661
  # always add in no lora case
662
  # add fake space so doesn't go away in gradio dropdown
663
+ no_lora_str = no_model_str = '[None/Remove]'
664
+ lora_options = [no_lora_str] + kwargs['extra_lora_options'] # FIXME: why double?
665
+ # always add in no model case so can free memory
666
+ # add fake space so doesn't go away in gradio dropdown
667
+ model_options = [no_model_str] + model_options
668
 
669
+ # transcribe, will be detranscribed before use by evaluate()
670
+ if not kwargs['lora_weights'].strip():
671
+ kwargs['lora_weights'] = no_lora_str
672
+
673
+ if not kwargs['base_model'].strip():
674
+ kwargs['base_model'] = no_model_str
675
+
676
+ # transcribe for gradio
677
+ kwargs['gpu_id'] = str(kwargs['gpu_id'])
678
+
679
+ no_model_msg = 'h2oGPT [ !!! Please Load Model in Models Tab !!! ]'
680
+ output_label0 = f'h2oGPT [Model: {kwargs.get("base_model")}]' if kwargs.get(
681
+ 'base_model') else no_model_msg
682
+ output_label0_model2 = no_model_msg
683
 
684
  with demo:
685
  # avoid actual model/tokenizer here or anything that would be bad to deepcopy
686
  # https://github.com/gradio-app/gradio/issues/3558
687
  model_state = gr.State(['model', 'tokenizer', device, kwargs['base_model']])
688
+ model_state2 = gr.State([None, None, None, None])
689
  model_options_state = gr.State([model_options])
690
  lora_options_state = gr.State([lora_options])
691
  gr.Markdown(
 
696
  {task_info_md}
697
  """)
698
  if is_hf:
699
+ gr.HTML(
700
+ '''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
701
 
702
  # go button visible if
703
+ base_wanted = kwargs['base_model'] != no_model_str and kwargs['login_mode_if_model0']
704
  go_btn = gr.Button(value="ENTER", visible=base_wanted, variant="primary")
705
  normal_block = gr.Row(visible=not base_wanted)
706
  with normal_block:
707
  with gr.Tabs():
708
  with gr.Row():
709
+ col_nochat = gr.Column(visible=not kwargs['chat'])
710
+ with col_nochat: # FIXME: for model comparison, and check rest
711
+ text_output_nochat = gr.Textbox(lines=5, label=output_label0)
712
+ instruction_nochat = gr.Textbox(
713
+ lines=4, label=instruction_label_nochat,
714
+ placeholder=kwargs['placeholder_instruction'],
715
+ )
716
+ iinput_nochat = gr.Textbox(lines=4, label="Input context for Instruction",
717
+ placeholder=kwargs['placeholder_input'])
718
+ submit_nochat = gr.Button("Submit")
719
+ flag_btn_nochat = gr.Button("Flag")
720
+ if kwargs['score_model']:
721
+ if not kwargs['auto_score']:
722
+ with gr.Column():
723
+ score_btn_nochat = gr.Button("Score last prompt & response")
724
+ score_text_nochat = gr.Textbox("Response Score: NA", show_label=False)
725
+ else:
726
+ score_text_nochat = gr.Textbox("Response Score: NA", show_label=False)
727
+ col_chat = gr.Column(visible=kwargs['chat'])
728
+ with col_chat:
729
+ with gr.Row():
730
+ text_output = gr.Chatbot(label=output_label0).style(height=kwargs['height'] or 400)
731
+ text_output2 = gr.Chatbot(label=output_label0_model2, visible=False).style(
732
+ height=kwargs['height'] or 400)
733
+ with gr.Row():
734
+ with gr.Column(scale=50):
735
+ instruction = gr.Textbox(
736
+ lines=4, label=instruction_label,
737
+ placeholder=kwargs['placeholder_instruction'],
738
+ )
739
+ with gr.Row(): # .style(equal_height=False, equal_width=False):
740
+ submit = gr.Button(value='Submit').style(full_width=False, size='sm')
741
+ stop_btn = gr.Button(value="Stop").style(full_width=False, size='sm')
742
+ with gr.Row():
743
+ clear = gr.Button("New Conversation")
744
  flag_btn = gr.Button("Flag")
745
  if kwargs['score_model']:
746
+ if not kwargs['auto_score']: # FIXME: For checkbox model2
747
  with gr.Column():
748
+ with gr.Row():
749
+ score_btn = gr.Button("Score last prompt & response").style(
750
+ full_width=False, size='sm')
751
+ score_text = gr.Textbox("Response Score: NA", show_label=False)
752
+ score_res2 = gr.Row(visible=False)
753
+ with score_res2:
754
+ score_btn2 = gr.Button("Score last prompt & response 2").style(
755
+ full_width=False, size='sm')
756
+ score_text2 = gr.Textbox("Response Score2: NA", show_label=False)
757
  else:
758
  score_text = gr.Textbox("Response Score: NA", show_label=False)
759
+ score_text2 = gr.Textbox("Response Score2: NA", show_label=False, visible=False)
760
+ retry = gr.Button("Regenerate")
761
+ undo = gr.Button("Undo")
762
  with gr.TabItem("Input/Output"):
763
  with gr.Row():
764
  if 'mbart-' in kwargs['model_lower']:
 
776
  prompt_type = gr.Dropdown(prompt_types_strings,
777
  value=kwargs['prompt_type'], label="Prompt Type",
778
  visible=not is_public)
779
+ prompt_type2 = gr.Dropdown(prompt_types_strings,
780
+ value=kwargs['prompt_type'], label="Prompt Type Model 2",
781
+ visible=not is_public and False)
782
+ do_sample = gr.Checkbox(label="Sample", info="Enable sampler, required for use of temperature, top_p, top_k",
783
+ value=kwargs['do_sample'])
784
+ temperature = gr.Slider(minimum=0.01, maximum=3,
785
  value=kwargs['temperature'],
786
  label="Temperature",
787
  info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
 
820
  value=kwargs['num_return_sequences'],
821
  label="Number Returns", info="Must be <= num_beams",
822
  visible=not is_public)
823
+ iinput = gr.Textbox(lines=4, label="Input",
824
+ placeholder=kwargs['placeholder_input'],
825
+ visible=not is_public)
826
+ context = gr.Textbox(lines=3, label="System Pre-Context",
827
+ info="Directly pre-appended without prompt processing",
828
+ visible=not is_public and not kwargs['chat'])
829
+ chat = gr.components.Checkbox(label="Chat mode", value=kwargs['chat'],
830
+ visible=not is_public)
 
 
831
 
832
  with gr.TabItem("Models"):
833
+ load_msg = "Load-Unload Model/LORA" if not is_public \
834
+ else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO"
835
+ load_msg2 = "Load-Unload Model/LORA 2" if not is_public \
836
+ else "LOAD-UNLOAD DISABLED FOR HOSTED DEMO 2"
837
+ compare_checkbox = gr.components.Checkbox(label="Compare Mode",
838
+ value=False, visible=not is_public)
839
  with gr.Row():
840
+ n_gpus = torch.cuda.device_count()
841
+ n_gpus_list = [str(x) for x in list(range(-1, n_gpus))]
842
  with gr.Column():
843
  with gr.Row(scale=1):
844
  with gr.Column(scale=50):
845
+ model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model",
846
+ value=kwargs['base_model'])
847
+ lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA",
848
+ value=kwargs['lora_weights'], visible=kwargs['show_lora'])
849
  with gr.Column(scale=1):
 
 
850
  load_model_button = gr.Button(load_msg)
851
+ model_load8bit_checkbox = gr.components.Checkbox(
852
+ label="Load 8-bit [Not all models support]",
853
+ value=kwargs['load_8bit'])
854
+ model_infer_devices_checkbox = gr.components.Checkbox(
855
+ label="Infer Devices [If GPU ID=-1 or not Checked, then will spread model over GPUs]",
856
+ value=kwargs['infer_devices'])
857
+ model_gpu = gr.Dropdown(n_gpus_list, label="GPU ID [-1 = all GPUs]",
858
+ value=kwargs['gpu_id'])
859
  model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
860
+ lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'],
861
+ visible=kwargs['show_lora'])
862
  with gr.Row(scale=1):
863
  with gr.Column(scale=50):
864
  new_model = gr.Textbox(label="New Model HF name/path")
 
866
  with gr.Column(scale=1):
867
  add_model_button = gr.Button("Add new model name")
868
  add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
869
+ col_model2 = gr.Column(visible=False)
870
+ with col_model2:
871
+ with gr.Row(scale=1):
872
+ with gr.Column(scale=50):
873
+ model_choice2 = gr.Dropdown(model_options_state.value[0], label="Choose Model 2",
874
+ value=no_model_str)
875
+ lora_choice2 = gr.Dropdown(lora_options_state.value[0], label="Choose LORA 2",
876
+ value=no_lora_str,
877
+ visible=kwargs['show_lora'])
878
+ with gr.Column(scale=1):
879
+ load_model_button2 = gr.Button(load_msg2)
880
+ model_load8bit_checkbox2 = gr.components.Checkbox(
881
+ label="Load 8-bit 2 [Not all models support]",
882
+ value=kwargs['load_8bit'])
883
+ model_infer_devices_checkbox2 = gr.components.Checkbox(
884
+ label="Infer Devices 2 [If GPU ID=-1 or not Checked, then will spread model over GPUs]",
885
+ value=kwargs[
886
+ 'infer_devices'])
887
+ model_gpu2 = gr.Dropdown(n_gpus_list, label="GPU ID [-1 = all GPUs]",
888
+ value=kwargs['gpu_id'])
889
+ # no model/lora loaded ever in model2 by default
890
+ model_used2 = gr.Textbox(label="Current Model 2", value=no_model_str)
891
+ lora_used2 = gr.Textbox(label="Current LORA 2", value=no_lora_str,
892
+ visible=kwargs['show_lora'])
893
  with gr.TabItem("System"):
894
  system_row = gr.Row(visible=not is_public)
895
  admin_pass_textbox = gr.Textbox(label="Admin Password", type='password', visible=is_public)
 
919
  kwargs_evaluate = {k: v for k, v in all_kwargs.items() if k in inputs_kwargs_list}
920
  fun = partial(evaluate,
921
  **kwargs_evaluate)
922
+ fun2 = partial(evaluate,
923
+ model_state2,
924
+ **kwargs_evaluate)
925
 
926
  dark_mode_btn = gr.Button("Dark Mode", variant="primary").style(
927
  size="sm",
 
939
  }""",
940
  api_name="dark",
941
  )
942
+
943
+ # Control chat and non-chat blocks, which can be independently used by chat checkbox swap
944
+ def col_nochat_fun(x):
945
+ return gr.Column.update(visible=not x)
946
+
947
+ def col_chat_fun(x):
948
+ return gr.Column.update(visible=x)
949
+
950
+ def context_fun(x):
951
+ return gr.Textbox.update(visible=not x)
952
+
953
+ chat.select(col_nochat_fun, chat, col_nochat, api_name="chat_checkbox") \
954
+ .then(col_chat_fun, chat, col_chat) \
955
+ .then(context_fun, chat, context)
956
 
957
  # examples after submit or any other buttons for chat or no chat
958
  if kwargs['examples'] is not None and kwargs['show_examples']:
959
  gr.Examples(examples=kwargs['examples'], inputs=inputs_list)
960
 
961
  # Score
962
+ def score_last_response(*args, nochat=False, model2=False):
963
  """ Similar to user() """
964
  args_list = list(args)
965
+
966
+ max_length_tokenize = 512 if is_low_mem else 2048
967
+ cutoff_len = max_length_tokenize * 4 # restrict deberta related to max for LLM
968
+
969
+ if not nochat:
970
+ history = args_list[-1]
971
+ if history is None:
972
+ if not model2:
973
+ # maybe only doing first model, no need to complain
974
+ print("Bad history in scoring last response, fix for now", flush=True)
975
+ history = []
976
+ if smodel is not None and \
977
+ stokenizer is not None and \
978
+ sdevice is not None and \
979
+ history is not None and len(history) > 0 and \
980
+ history[-1] is not None and \
981
+ len(history[-1]) >= 2:
982
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
983
+
984
+ question = history[-1][0]
985
+
986
+ answer = history[-1][1]
987
+ else:
988
+ return 'Response Score: NA'
989
+ else:
990
+ answer = args_list[-1]
991
+ instruction_nochat_arg_id = eval_func_param_names.index('instruction_nochat')
992
+ question = args_list[instruction_nochat_arg_id]
993
+
994
+ if question is None:
995
+ return 'Response Score: Bad Question'
996
+ if answer is None:
997
+ return 'Response Score: Bad Answer'
998
+
999
+ question = question[-cutoff_len:]
1000
+ answer = answer[-cutoff_len:]
1001
+
1002
+ inputs = stokenizer(question, answer,
1003
+ return_tensors="pt",
1004
+ truncation=True,
1005
+ max_length=max_length_tokenize).to(smodel.device)
1006
+ try:
1007
+ score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
1008
+ except torch.cuda.OutOfMemoryError as e:
1009
+ print("GPU OOM: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
1010
+ del inputs
1011
+ traceback.print_exc()
1012
+ clear_torch_cache()
1013
+ return 'Response Score: GPU OOM'
1014
+ except (Exception, RuntimeError) as e:
1015
+ if 'Expected all tensors to be on the same device' in str(e) or \
1016
+ 'expected scalar type Half but found Float' in str(e) or \
1017
+ 'probability tensor contains either' in str(e) or \
1018
+ 'cublasLt ran into an error!' in str(e):
1019
+ print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)),
1020
+ flush=True)
1021
  traceback.print_exc()
1022
  clear_torch_cache()
1023
+ return 'Response Score: GPU Error'
1024
+ else:
1025
+ raise
1026
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
1027
+ return 'Response Score: {:.1%}'.format(score)
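For orientation, the scoring path added above boils down to running the sequence-classification score model on a (question, answer) pair and reporting the sigmoid of its first logit. A rough sketch under that reading; score_pair is illustrative (not a function in app.py), and smodel/stokenizer stand for the already-loaded score model and its tokenizer:

    import torch

    def score_pair(question, answer, smodel, stokenizer,
                   max_length_tokenize=512, cutoff_len=2048):
        # Trim raw text first, then tokenize the pair jointly for the score model.
        inputs = stokenizer(question[-cutoff_len:], answer[-cutoff_len:],
                            return_tensors="pt",
                            truncation=True,
                            max_length=max_length_tokenize).to(smodel.device)
        with torch.no_grad():
            logits = smodel(**inputs).logits[0]
        # Sigmoid of the first logit becomes the "Response Score: xx.x%" string.
        return float(torch.sigmoid(logits)[0])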
1028
 
1029
  if kwargs['score_model']:
1030
  score_args = dict(fn=score_last_response,
1031
  inputs=inputs_list + [text_output],
1032
  outputs=[score_text],
1033
  )
1034
+ score_args2 = dict(fn=partial(score_last_response, model2=True),
1035
+ inputs=inputs_list + [text_output2],
1036
+ outputs=[score_text2],
1037
+ )
1038
+
1039
+ score_args_nochat = dict(fn=partial(score_last_response, nochat=True),
1040
+ inputs=inputs_list + [text_output_nochat],
1041
+ outputs=[score_text_nochat],
1042
+ )
1043
  if not kwargs['auto_score']:
1044
+ score_event = score_btn.click(**score_args, queue=stream_output, api_name='score') \
1045
+ .then(**score_args2, queue=stream_output, api_name='score2')
1046
+ score_event_nochat = score_btn_nochat.click(**score_args_nochat, queue=stream_output,
1047
+ api_name='score_nochat')
1048
+
1049
+ def user(*args, undo=False, sanitize_user_prompt=True, model2=False):
1050
+ """
1051
+ User that fills history for bot
1052
+ :param args:
1053
+ :param undo:
1054
+ :param sanitize_user_prompt:
1055
+ :param model2:
1056
+ :return:
1057
+ """
1058
+ args_list = list(args)
1059
+ user_message = args_list[0]
1060
+ input1 = args_list[1]
1061
+ context1 = args_list[2]
1062
+ if input1 and not user_message.endswith(':'):
1063
+ user_message1 = user_message + ":" + input1
1064
+ elif input1:
1065
+ user_message1 = user_message + input1
1066
+ else:
1067
+ user_message1 = user_message
1068
+ if sanitize_user_prompt:
1069
+ from better_profanity import profanity
1070
+ user_message1 = profanity.censor(user_message1)
1071
 
1072
+ history = args_list[-1]
1073
+ if undo and history:
1074
+ history.pop()
1075
+ args_list = args_list[:-1] # FYI, even if unused currently
1076
+ if history is None:
1077
+ if not model2:
1078
+ # only complain for model1, to avoid duplicate warnings
1079
  print("Bad history, fix for now", flush=True)
1080
+ history = []
1081
+ # ensure elements not mixed across models as output,
1082
+ # even if input is currently same source
1083
+ history = history.copy()
1084
+ if undo:
1085
+ return history
1086
+ else:
1087
+ # FIXME: compare, same history for now
1088
+ return history + [[user_message1, None]]
1089
+
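For orientation, a minimal sketch of the history handshake between user() and bot(): the gr.Chatbot value is a list of [user_message, bot_message] pairs, user() appends a pair with the bot slot left as None, and bot() streams into that slot (the strings below are illustrative):

history = []                                  # gr.Chatbot value: list of [user, bot] pairs
history = history + [["Who are you?", None]]  # what user() returns for a new turn
for partial in ["I am", "I am h2oGPT."]:      # stand-in for the evaluate() generator inside bot()
    history[-1][1] = partial                  # bot() fills the last slot as output streams in
print(history)                                # [['Who are you?', 'I am h2oGPT.']]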
1090
+ def bot(*args, retry=False):
1091
+ """
1092
+ Bot that consumes the history (which carries the user input) and generates the response.
1093
+ The instruction (from inputs_list) itself is not consumed by bot.
1094
+ :param args:
1095
+ :param retry:
1096
+ :return:
1097
+ """
1098
+ args_list = list(args).copy()
1099
+ history = args_list[-1] # model_state is -2
1100
+ if retry and history:
1101
+ history.pop()
1102
+ if not history:
1103
+ print("No history", flush=True)
1104
+ return
1105
+ # ensure output will be unique to models
1106
+ history = history.copy()
1107
+ instruction1 = history[-1][0]
1108
+ context1 = ''
1109
+ if kwargs['chat_history'] > 0:
1110
+ prompt_type_arg_id = eval_func_param_names.index('prompt_type')
1111
+ prompt_type1 = args_list[prompt_type_arg_id]
1112
+ chat_arg_id = eval_func_param_names.index('chat')
1113
+ chat1 = args_list[chat_arg_id]
1114
  context1 = ''
1115
+ for histi in range(len(history) - 1):
1116
+ data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
1117
+ context1 += generate_prompt(data_point, prompt_type1, chat1, reduced=True)[0].replace(
1118
+ '<br>', '\n')
1119
+ if not context1.endswith('\n'):
1120
+ context1 += '\n'
1121
+ if context1 and not context1.endswith('\n'):
1122
+ context1 += '\n' # ensure that if the context ends abruptly, the human turn continues on a new line
1123
+ args_list[0] = instruction1 # override original instruction with history from user
1124
+ # only include desired chat history
1125
+ args_list[2] = context1[-kwargs['chat_history']:]
1126
+ model_state1 = args_list[-2]
1127
+ if model_state1[0] is None or model_state1[0] == no_model_str:
1128
+ return
1129
+ args_list = args_list[:-2]
1130
+ fun1 = partial(evaluate,
1131
+ model_state1,
1132
+ **kwargs_evaluate)
1133
+ try:
1134
+ for output in fun1(*tuple(args_list)):
1135
+ bot_message = output
1136
+ history[-1][1] = bot_message
 
 
 
1137
  yield history
1138
+ except StopIteration:
1139
+ yield history
1140
+ except RuntimeError as e:
1141
+ if "generator raised StopIteration" in str(e):
1142
+ # assume last entry was bad, undo
1143
+ history.pop()
 
 
 
1144
  yield history
1145
+ raise
1146
+ except Exception as e:
1147
+ # put error into user input
1148
+ history[-1][0] = "Exception: %s" % str(e)
1149
+ yield history
1150
+ raise
1151
+ return
1152
+
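A sketch of how the loop in bot() folds earlier turns into the context string; render_turn below is a hypothetical stand-in for generate_prompt(..., reduced=True)[0] from finetune.py, whose real text depends on the selected prompt_type:

def render_turn(instruction, output):
    # assumed 'human_bot'-style template, only for illustration
    return "<human>: %s\n<bot>: %s\n" % (instruction, output)

history = [["What is 2+2?", "4"], ["And times 3?", None]]  # last turn not yet answered
context1 = ''
for histi in range(len(history) - 1):
    context1 += render_turn(history[histi][0], history[histi][1]).replace('<br>', '\n')
    if not context1.endswith('\n'):
        context1 += '\n'
chat_history_chars = 1024                  # stands in for kwargs['chat_history']
context1 = context1[-chat_history_chars:]  # keep only the most recent characters
print(context1)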
1153
+ # NORMAL MODEL
1154
+ user_args = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt']),
1155
+ inputs=inputs_list + [text_output],
1156
+ outputs=text_output,
1157
+ )
1158
+ bot_args = dict(fn=bot,
1159
+ inputs=inputs_list + [model_state] + [text_output],
1160
+ outputs=text_output,
1161
+ )
1162
+ retry_bot_args = dict(fn=functools.partial(bot, retry=True),
1163
+ inputs=inputs_list + [model_state] + [text_output],
1164
+ outputs=text_output,
1165
+ )
1166
+ undo_user_args = dict(fn=functools.partial(user, undo=True),
1167
+ inputs=inputs_list + [text_output],
1168
+ outputs=text_output,
1169
+ )
1170
 
1171
+ # MODEL2
1172
+ user_args2 = dict(fn=functools.partial(user, sanitize_user_prompt=kwargs['sanitize_user_prompt'], model2=True),
1173
+ inputs=inputs_list + [text_output2],
1174
+ outputs=text_output2,
1175
+ )
1176
+ bot_args2 = dict(fn=bot,
1177
+ inputs=inputs_list + [model_state2] + [text_output2],
1178
+ outputs=text_output2,
1179
+ )
1180
+ retry_bot_args2 = dict(fn=functools.partial(bot, retry=True),
1181
+ inputs=inputs_list + [model_state2] + [text_output2],
1182
+ outputs=text_output2,
1183
+ )
1184
+ undo_user_args2 = dict(fn=functools.partial(user, undo=True),
1185
+ inputs=inputs_list + [text_output2],
1186
+ outputs=text_output2,
1187
+ )
1188
+
1189
+ def clear_instruct():
1190
+ return gr.Textbox.update(value='')
1191
+
1192
+ if kwargs['auto_score']:
1193
+ # in case 2nd model, consume instruction first, so can clear quickly
1194
+ # bot doesn't consume the instruction itself, just the history from user, which is why this works
1195
+ submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction') \
1196
+ .then(**user_args2, queue=stream_output, api_name='instruction2') \
1197
+ .then(clear_instruct, None, instruction) \
1198
+ .then(**bot_args, api_name='instruction_bot') \
1199
+ .then(**score_args, api_name='instruction_bot_score') \
1200
+ .then(**bot_args2, api_name='instruction_bot2') \
1201
+ .then(**score_args2, api_name='instruction_bot_score2') \
1202
+ .then(clear_torch_cache)
1203
+ submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit') \
1204
+ .then(**user_args2, queue=stream_output, api_name='submit2') \
1205
+ .then(**bot_args, api_name='submit_bot') \
1206
+ .then(clear_instruct, None, instruction) \
1207
+ .then(**score_args, api_name='submit_bot_score') \
1208
+ .then(**bot_args2, api_name='submit_bot2') \
1209
+ .then(**score_args2, api_name='submit_bot_score2') \
1210
+ .then(clear_torch_cache)
1211
+ submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry') \
1212
+ .then(**user_args2, queue=stream_output, api_name='retry2') \
1213
+ .then(clear_instruct, None, instruction) \
1214
+ .then(**retry_bot_args, api_name='retry_bot') \
1215
+ .then(**score_args, api_name='retry_bot_score') \
1216
+ .then(**retry_bot_args2, api_name='retry_bot2') \
1217
+ .then(**score_args2, api_name='retry_bot_score2') \
1218
+ .then(clear_torch_cache)
1219
+ submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo') \
1220
+ .then(**score_args, api_name='undo_score') \
1221
+ .then(**undo_user_args2, queue=stream_output, api_name='undo2') \
1222
+ .then(**score_args2, api_name='undo_score2') \
1223
+ .then(clear_instruct, None, instruction)
1224
+ else:
1225
+ submit_event = instruction.submit(**user_args, queue=stream_output, api_name='instruction') \
1226
+ .then(**user_args2, queue=stream_output, api_name='instruction2') \
1227
+ .then(clear_instruct, None, instruction) \
1228
+ .then(**bot_args, api_name='instruction_bot') \
1229
+ .then(**bot_args2, api_name='instruction_bot2') \
1230
+ .then(clear_torch_cache)
1231
+ submit_event2 = submit.click(**user_args, queue=stream_output, api_name='submit') \
1232
+ .then(**user_args2, queue=stream_output, api_name='submit2') \
1233
+ .then(clear_instruct, None, instruction) \
1234
+ .then(**bot_args, api_name='submit_bot') \
1235
+ .then(**bot_args2, api_name='submit_bot2') \
1236
+ .then(clear_torch_cache)
1237
+ submit_event3 = retry.click(**user_args, queue=stream_output, api_name='retry') \
1238
+ .then(**user_args2, queue=stream_output, api_name='retry2') \
1239
+ .then(clear_instruct, None, instruction) \
1240
+ .then(**retry_bot_args, api_name='retry_bot') \
1241
+ .then(**retry_bot_args2, api_name='retry_bot2') \
1242
+ .then(clear_torch_cache)
1243
+ submit_event4 = undo.click(**undo_user_args, queue=stream_output, api_name='undo') \
1244
+ .then(**undo_user_args2, queue=stream_output, api_name='undo2')
1245
+
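The wiring above leans on Gradio event chaining: .submit()/.click() return an event and each .then() runs only after the previous step finishes, which is why the user message can be committed to both chatbots (and the textbox cleared) before either bot starts generating. A stripped-down sketch of the same pattern with toy callbacks, not the app's real ones:

import gradio as gr

def user_fn(msg, history):
    return "", history + [[msg, None]]

def bot_fn(history):
    history[-1][1] = "echo: " + history[-1][0]
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    msg.submit(user_fn, [msg, chatbot], [msg, chatbot], queue=False) \
        .then(bot_fn, chatbot, chatbot)
# demo.launch()  # uncomment to serve the toy app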
1246
+ # does both models
1247
+ clear.click(lambda: None, None, text_output, queue=False, api_name='clear') \
1248
+ .then(lambda: None, None, text_output2, queue=False, api_name='clear2')
1249
+ # FIXME: compare
1250
+ submit_event_nochat = submit_nochat.click(fun, inputs=[model_state] + inputs_list,
1251
+ outputs=text_output_nochat, api_name='submit_nochat') \
1252
+ .then(**score_args_nochat, api_name='instruction_bot_score_nochat') \
1253
+ .then(clear_torch_cache)
1254
+
1255
+ def load_model(model_name, lora_weights, model_state_old, prompt_type_old, load_8bit, infer_devices, gpu_id):
1256
  # ensure old model removed from GPU memory
1257
  if kwargs['debug']:
1258
  print("Pre-switch pre-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
 
1277
  clear_torch_cache()
1278
  if kwargs['debug']:
1279
  print("Pre-switch post-del GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1280
+
1281
+ if model_name is None or model_name == no_model_str:
1282
+ # no-op if no model, just free memory
1283
+ # no detranscribe needed for the model, since evaluate is never entered
1284
+ lora_weights = no_lora_str
1285
+ return [None, None, None, model_name], model_name, lora_weights, prompt_type_old
1286
+
1287
+ all_kwargs1 = all_kwargs.copy()
1288
+ all_kwargs1['base_model'] = model_name.strip()
1289
+ all_kwargs1['load_8bit'] = load_8bit
1290
+ all_kwargs1['infer_devices'] = infer_devices
1291
+ all_kwargs1['gpu_id'] = int(gpu_id) # detranscribe
1292
  model_lower = model_name.strip().lower()
1293
  if model_lower in inv_prompt_type_to_model_lower:
1294
  prompt_type1 = inv_prompt_type_to_model_lower[model_lower]
1295
  else:
1296
  prompt_type1 = prompt_type_old
1297
 
1298
+ # detranscribe
1299
+ if lora_weights == no_lora_str:
1300
+ lora_weights = ''
1301
+
1302
+ all_kwargs1['lora_weights'] = lora_weights.strip()
1303
+ model1, tokenizer1, device1 = get_model(**all_kwargs1)
1304
  clear_torch_cache()
1305
 
1306
  if kwargs['debug']:
1307
  print("Post-switch GPU memory: %s" % torch.cuda.memory_allocated(), flush=True)
1308
+ return [model1, tokenizer1, device1, model_name], model_name, lora_weights, prompt_type1
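A condensed sketch of the hot-swap pattern load_model implements: free the old weights, clear the CUDA cache, then rebuild the shared model_state list from get_model (get_model and clear_torch_cache are the app's own helpers; swap_model here is only an illustrative wrapper):

import gc
import torch

def swap_model(model_state, new_base_model, get_model_fn, **get_model_kwargs):
    model_state[0] = None            # drop the reference so the old model can be freed
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()     # essentially what clear_torch_cache() does
    model, tokenizer, device = get_model_fn(base_model=new_base_model, **get_model_kwargs)
    model_state[:] = [model, tokenizer, device, new_base_model]
    return model_state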
 
 
 
1309
 
1310
  def dropdown_prompt_type_list(x):
1311
  return gr.Dropdown.update(value=x)
 
1314
  return gr.Textbox.update(label=f'h2oGPT [Model: {model_used_in}]')
1315
 
1316
  load_model_args = dict(fn=load_model,
1317
+ inputs=[model_choice, lora_choice, model_state, prompt_type,
1318
+ model_load8bit_checkbox, model_infer_devices_checkbox, model_gpu],
1319
  outputs=[model_state, model_used, lora_used, prompt_type])
1320
  prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
1321
  chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
1322
+ nochat_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output_nochat)
1323
  if not is_public:
1324
  load_model_event = load_model_button.click(**load_model_args) \
1325
+ .then(**prompt_update_args) \
1326
+ .then(**chatbot_update_args) \
1327
+ .then(**nochat_update_args) \
1328
+ .then(clear_torch_cache)
1329
+
1330
+ load_model_args2 = dict(fn=load_model,
1331
+ inputs=[model_choice2, lora_choice2, model_state2, prompt_type2,
1332
+ model_load8bit_checkbox2, model_infer_devices_checkbox2, model_gpu2],
1333
+ outputs=[model_state2, model_used2, lora_used2, prompt_type2])
1334
+ prompt_update_args2 = dict(fn=dropdown_prompt_type_list, inputs=prompt_type2, outputs=prompt_type2)
1335
+ chatbot_update_args2 = dict(fn=chatbot_list, inputs=[text_output2, model_used2], outputs=text_output2)
1336
+ if not is_public:
1337
+ load_model_event2 = load_model_button2.click(**load_model_args2) \
1338
+ .then(**prompt_update_args2) \
1339
+ .then(**chatbot_update_args2) \
1340
+ .then(clear_torch_cache)
1341
 
1342
  def dropdown_model_list(list0, x):
1343
  new_state = [list0[0] + [x]]
1344
  new_options = [*new_state[0]]
1345
+ return gr.Dropdown.update(value=x, choices=new_options), \
1346
+ gr.Dropdown.update(value=x, choices=new_options), \
1347
+ '', new_state
1348
 
1349
  add_model_event = add_model_button.click(fn=dropdown_model_list,
1350
  inputs=[model_options_state, new_model],
1351
+ outputs=[model_choice, model_choice2, new_model, model_options_state])
1352
 
1353
+ def dropdown_lora_list(list0, x, model_used1, lora_used1, model_used2, lora_used2):
1354
  new_state = [list0[0] + [x]]
1355
  new_options = [*new_state[0]]
1356
+ # don't switch drop-down to added lora if already have model loaded
1357
+ x1 = x if model_used1 == no_model_str else lora_used1
1358
+ x2 = x if model_used2 == no_model_str else lora_used2
1359
+ return gr.Dropdown.update(value=x1, choices=new_options), \
1360
+ gr.Dropdown.update(value=x2, choices=new_options), \
1361
+ '', new_state
1362
 
1363
  add_lora_event = add_lora_button.click(fn=dropdown_lora_list,
1364
+ inputs=[lora_options_state, new_lora, model_used, lora_used, model_used2, lora_used2],
1365
+ outputs=[lora_choice, lora_choice2, new_lora, lora_options_state])
1366
 
1367
  go_btn.click(lambda: gr.update(visible=False), None, go_btn, api_name="go") \
1368
  .then(lambda: gr.update(visible=True), None, normal_block) \
1369
  .then(**load_model_args).then(**prompt_update_args)
1370
 
1371
+ def compare_textbox_fun(x):
1372
+ return gr.Textbox.update(visible=x)
1373
+
1374
+ def compare_column_fun(x):
1375
+ return gr.Column.update(visible=x)
1376
+
1377
+ def compare_prompt_fun(x):
1378
+ return gr.Dropdown.update(visible=x)
1379
+
1380
+ compare_checkbox.select(compare_textbox_fun, compare_checkbox, text_output2, api_name="compare_checkbox") \
1381
+ .then(compare_column_fun, compare_checkbox, col_model2) \
1382
+ .then(compare_prompt_fun, compare_checkbox, prompt_type2) \
1383
+ .then(compare_textbox_fun, compare_checkbox, score_text2)
1384
+ # FIXME: add score_res2 in condition, but do better
1385
+
1386
  # callback for logging flagged input/output
1387
  callback.setup(inputs_list + [text_output], "flagged_data_points")
1388
  flag_btn.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
1389
  api_name='flag')
1390
+ flag_btn_nochat.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
1391
+ api_name='flag_nochat')
1392
 
1393
  def get_system_info():
1394
  return gr.Textbox.update(value=system_info_print())
1395
 
1396
  system_event = system_btn.click(get_system_info, outputs=system_text, api_name='system_info')
1397
 
1398
+ # don't pass text_output, don't want to clear output, just stop it
1399
+ # FIXME: have to click once to stop the output and a second time to stop the GPUs from continuing
1400
+ stop_btn.click(lambda: None, None, None,
1401
+ cancels=[submit_event_nochat, submit_event, submit_event2, submit_event3],
1402
+ queue=False, api_name='stop').then(clear_torch_cache)
 
1403
 
1404
  demo.queue(concurrency_count=1)
1405
  favicon_path = "h2o-logo.svg"
 
1410
 
1411
 
1412
  input_args_list = ['model_state']
1413
+ inputs_kwargs_list = ['debug', 'save_dir', 'hard_stop_list', 'sanitize_bot_response', 'model_state0']
1414
 
1415
 
1416
  def get_inputs_list(inputs_dict, model_lower):
1417
+ """
1418
+ map gradio objects in locals() to inputs for evaluate().
1419
+ :param inputs_dict:
1420
+ :param model_lower:
1421
+ :return:
1422
+ """
1423
  inputs_list_names = list(inspect.signature(evaluate).parameters)
1424
  inputs_list = []
1425
  for k in inputs_list_names:
 
1434
  return inputs_list
1435
 
1436
 
 
 
 
1437
  eval_func_param_names = ['instruction',
1438
  'iinput',
1439
  'context',
 
1450
  'repetition_penalty',
1451
  'num_return_sequences',
1452
  'do_sample',
1453
+ 'chat',
1454
+ 'instruction_nochat',
1455
+ 'iinput_nochat',
1456
  ]
1457
 
1458
 
 
1475
  repetition_penalty,
1476
  num_return_sequences,
1477
  do_sample,
1478
+ chat,
1479
+ instruction_nochat,
1480
+ iinput_nochat,
1481
  # END NOTE: Examples must have same order of parameters
1482
  src_lang=None,
1483
  tgt_lang=None,
1484
  debug=False,
1485
  save_dir=None,
 
1486
  hard_stop_list=None,
1487
  sanitize_bot_response=True,
1488
  model_state0=None,
 
1491
  if debug:
1492
  locals_dict = locals().copy()
1493
  locals_dict.pop('model_state', None)
1494
+ locals_dict.pop('model_state0', None)
1495
  print(locals_dict)
1496
 
1497
  no_model_msg = "Please choose a base model with --base_model (CLI) or in Models Tab (gradio).\nThen start New Conversation"
1498
 
1499
+ if model_state0 is None:
1500
+ # e.g. for no gradio case, set dummy value, else should be set
1501
+ model_state0 = [None, None, None, None]
1502
+
1503
  if model_state is not None and len(model_state) == 4 and not isinstance(model_state[0], str):
1504
  # try to free-up original model (i.e. list was passed as reference)
1505
  if model_state0 is not None and model_state0[0] is not None:
 
1516
  else:
1517
  raise AssertionError(no_model_msg)
1518
 
1519
+ if base_model is None:
1520
+ raise AssertionError(no_model_msg)
1521
+
1522
  assert base_model.strip(), no_model_msg
1523
  assert model, "Model is missing"
1524
  assert tokenizer, "Tokenizer is missing"
1525
 
1526
+ # choose chat or non-chat mode
1527
+ if not chat:
1528
+ instruction = instruction_nochat
1529
+ iinput = iinput_nochat
1530
+
1531
  data_point = dict(context=context, instruction=instruction, input=iinput)
1532
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
1533
  prompt = prompter.generate_prompt(data_point)
 
1562
  elif prompt_type == 'instruct_vicuna':
1563
  # even below is not enough, generic strings and many ways to encode
1564
  stop_words = [
1565
+ '### Human:',
1566
+ """
1567
  ### Human:""",
1568
+ """
1569
  ### Human:
1570
  """,
1571
+ '### Assistant:',
1572
+ """
1573
  ### Assistant:""",
1574
+ """
1575
  ### Assistant:
1576
  """,
1577
  ]
 
1589
  if tokenizer.pad_token:
1590
  stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
1591
  # handle fake \n added
1592
+ stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
1593
  # build stopper
1594
  stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
1595
  else:
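For reference, StoppingCriteriaSub follows the standard transformers StoppingCriteria interface; below is a minimal approximation that stops once any stop-word token sequence appears at the end of the generated ids (the real class's details may differ):

import torch
from transformers import StoppingCriteria

class StopOnTokens(StoppingCriteria):
    def __init__(self, stops=None, encounters=1):
        super().__init__()
        self.stops = stops or []      # list of 1-D token-id tensors, one per stop word
        self.encounters = encounters
        self.num_found = 0

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop in self.stops:
            stop = stop.to(input_ids.device)
            if input_ids.shape[1] >= len(stop) and torch.equal(input_ids[0, -len(stop):], stop):
                self.num_found += 1
                if self.num_found >= self.encounters:
                    return True
        return False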
 
1687
  traceback.print_exc()
1688
  clear_torch_cache()
1689
  return
1690
+ except (Exception, RuntimeError) as e:
1691
  if 'Expected all tensors to be on the same device' in str(e) or \
1692
  'expected scalar type Half but found Float' in str(e) or \
1693
+ 'probability tensor contains either' in str(e) or \
1694
+ 'cublasLt ran into an error!' in str(e):
1695
  print(
1696
  "GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
1697
  flush=True)
1698
  traceback.print_exc()
1699
  clear_torch_cache()
1700
+ if raise_generate_gpu_exceptions:
1701
+ raise
1702
  return
1703
  else:
1704
  raise
 
1749
  if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
1750
  prompt_type = inv_prompt_type_to_model_lower[model_lower]
1751
 
1752
+ # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
1753
  if show_examples is None:
1754
  if chat:
1755
  show_examples = False
 
1810
  else:
1811
  prompt_type = ''
1812
  examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",
1813
+ stream_output, prompt_type or 'plain', 0.1, 0.75, 40, 4, 256, 0, False, max_time_defaults, 1.0, 1,
1814
+ False]]
1815
  task_info = "No task"
1816
  if prompt_type == 'instruct':
1817
  task_info = "Answer question or follow imperative as instruction with optionally input."
 
1846
  repetition_penalty = repetition_penalty or 1.07
1847
  num_return_sequences = min(num_beams, num_return_sequences or 1)
1848
  do_sample = False if do_sample is None else do_sample
1849
+ # doesn't include chat, instruction_nochat, iinput_nochat, added later
1850
  params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
1851
  early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
1852
 
 
1890
  src_lang = "English"
1891
  tgt_lang = "Russian"
1892
 
1893
+ # move to correct position
1894
+ for example in examples:
1895
+ example += [chat, '', '']
1896
+ # adjust examples if non-chat mode
1897
+ if not chat:
1898
+ example[eval_func_param_names.index('instruction_nochat')] = example[
1899
+ eval_func_param_names.index('instruction')]
1900
+ example[eval_func_param_names.index('instruction')] = ''
1901
+
1902
+ example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')]
1903
+ example[eval_func_param_names.index('iinput')] = ''
1904
+
1905
  return placeholder_instruction, placeholder_input, \
1906
  stream_output, show_examples, \
1907
  prompt_type, temperature, top_p, top_k, num_beams, \
client_test.py CHANGED
@@ -1,9 +1,9 @@
1
  """
2
- Client test. Simplest case is chat=False and stream_output=False
3
 
4
- Run server with same choices:
5
 
6
- python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-256-6.9b --chat=False --stream_output=False
7
 
8
  NOTE: For private models, add --use-auth_token=True
9
 
@@ -17,7 +17,6 @@ python client_test.py
17
 
18
  debug = False
19
 
20
- import time
21
  import os
22
  os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
23
  from gradio_client import Client
@@ -26,8 +25,8 @@ client = Client("http://localhost:7860")
26
  if debug:
27
  print(client.view_api(all_endpoints=True))
28
 
29
- instruction = "Who are you?"
30
- iinput = ''
31
  context = ''
32
  # streaming output is supported, loops over and outputs each generation in streaming mode
33
  # but leave stream_output=False for simple input/output mode
@@ -37,19 +36,17 @@ temperature = 0.1
37
  top_p = 0.75
38
  top_k = 40
39
  num_beams = 1
40
- max_new_tokens = 500
41
  min_new_tokens = 0
42
  early_stopping = False
43
- max_time = 180
44
  repetition_penalty = 1.0
45
  num_return_sequences = 1
46
  do_sample = True
47
-
48
- # CHOOSE: must match server
49
- # NOTE chat mode works through files on gradio
50
- # and client currently would have to work through those files
51
- # in tmp, so not best for client. So default to False
52
  chat = False
 
 
53
 
54
 
55
  def test_client_basic():
@@ -68,43 +65,18 @@ def test_client_basic():
68
  max_time,
69
  repetition_penalty,
70
  num_return_sequences,
71
- do_sample]
72
-
73
- if not chat:
74
- # requires generate.py to run with --chat=False
75
- api_name = '/submit'
76
- res = client.predict(
77
- *tuple(args),
78
- api_name=api_name,
79
- )
80
- print(md_to_text(res))
81
- else:
82
- api_name = '/instruction'
83
- import json
84
- foofile = '/tmp/foo.json'
85
- with open(foofile, 'wt') as f:
86
- json.dump([['', None]], f)
87
- args += [foofile]
88
- if not stream_output:
89
- for res in client.predict(
90
- *tuple(args),
91
- api_name=api_name,
92
- ):
93
- print(res)
94
- res_file = client.predict(*tuple(args), api_name='/instruction_bot')
95
- res = json.load(open(res_file, "rt"))[-1][-1]
96
- print(md_to_text(res))
97
- else:
98
- print("streaming instruction_bot", flush=True)
99
- job = client.submit(*tuple(args), api_name='/instruction_bot')
100
- while not job.done():
101
- outputs_list = job.communicator.job.outputs
102
- if outputs_list:
103
- res_file = job.communicator.job.outputs[-1]
104
- res = json.load(open(res_file, "rt"))[-1][-1]
105
- print(md_to_text(res))
106
- time.sleep(0.1)
107
- print(job.outputs())
108
 
109
 
110
  import markdown # pip install markdown
 
1
  """
2
+ Client test.
3
 
4
+ Run server:
5
 
6
+ python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-256-6.9b
7
 
8
  NOTE: For private models, add --use-auth_token=True
9
 
 
17
 
18
  debug = False
19
 
 
20
  import os
21
  os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
22
  from gradio_client import Client
 
25
  if debug:
26
  print(client.view_api(all_endpoints=True))
27
 
28
+ instruction = '' # only for chat=True
29
+ iinput = '' # only for chat=True
30
  context = ''
31
  # streaming output is supported, loops over and outputs each generation in streaming mode
32
  # but leave stream_output=False for simple input/output mode
 
36
  top_p = 0.75
37
  top_k = 40
38
  num_beams = 1
39
+ max_new_tokens = 50
40
  min_new_tokens = 0
41
  early_stopping = False
42
+ max_time = 20
43
  repetition_penalty = 1.0
44
  num_return_sequences = 1
45
  do_sample = True
46
+ # only the 2 values below are used if chat=False is passed
 
 
 
 
47
  chat = False
48
+ instruction_nochat = "Who are you?"
49
+ iinput_nochat = ''
50
 
51
 
52
  def test_client_basic():
 
65
  max_time,
66
  repetition_penalty,
67
  num_return_sequences,
68
+ do_sample,
69
+ chat,
70
+ instruction_nochat,
71
+ iinput_nochat,
72
+ ]
73
+ api_name = '/submit_nochat'
74
+ res = client.predict(
75
+ *tuple(args),
76
+ api_name=api_name,
77
+ )
78
+ res_dict = dict(instruction_nochat=instruction_nochat, iinput_nochat=iinput_nochat, response=md_to_text(res))
79
+ print(res_dict)
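With the default 6.9b model loaded on the server, the printed dict has the shape {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': '...'}; the response text itself depends on the loaded model and the sampling settings above.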
80
 
81
 
82
  import markdown # pip install markdown
finetune.py CHANGED
@@ -121,7 +121,7 @@ def train(
121
  save_code: bool = False,
122
  run_id: int = None,
123
 
124
- base_model: str = 'h2oai/h2ogpt-oig-oasst1-256-6.9b',
125
  # base_model: str = 'h2oai/h2ogpt-oasst1-512-12b',
126
  # base_model: str = 'h2oai/h2ogpt-oasst1-512-20b',
127
  # base_model: str = 'EleutherAI/gpt-neox-20b',
@@ -810,7 +810,7 @@ Current Time: {}
810
 
811
 
812
  def generate_prompt(data_point, prompt_type, chat, reduced):
813
- context = data_point.get('context') if chat else ''
814
  if context is None:
815
  context = ''
816
  instruction = data_point.get('instruction')
 
121
  save_code: bool = False,
122
  run_id: int = None,
123
 
124
+ base_model: str = 'h2oai/h2ogpt-oig-oasst1-512-6.9b',
125
  # base_model: str = 'h2oai/h2ogpt-oasst1-512-12b',
126
  # base_model: str = 'h2oai/h2ogpt-oasst1-512-20b',
127
  # base_model: str = 'EleutherAI/gpt-neox-20b',
 
810
 
811
 
812
  def generate_prompt(data_point, prompt_type, chat, reduced):
813
+ context = data_point.get('context')
814
  if context is None:
815
  context = ''
816
  instruction = data_point.get('instruction')
utils.py CHANGED
@@ -1,12 +1,10 @@
1
- import contextlib
2
  import os
3
  import gc
4
  import random
5
- import shutil
6
  import time
7
  import traceback
8
  import zipfile
9
-
10
  import filelock
11
  import numpy as np
12
  import pandas as pd
@@ -95,17 +93,22 @@ def system_info_print():
95
  return "Error: %s" % str(e)
96
 
97
 
98
- def zip_data(root_dirs=None, zip_path='data.zip', base_dir='./'):
99
  try:
100
- return _zip_data(zip_path=zip_path, base_dir=base_dir, root_dirs=root_dirs)
101
  except Exception as e:
102
  traceback.print_exc()
103
  print('Exception in zipping: %s' % str(e))
104
 
105
 
106
- def _zip_data(root_dirs=None, zip_path='data.zip', base_dir='./'):
 
 
 
 
107
  assert root_dirs is not None
108
- with zipfile.ZipFile(zip_path, "w") as expt_zip:
 
109
  for root_dir in root_dirs:
110
  if root_dir is None:
111
  continue
@@ -115,7 +118,7 @@ def _zip_data(root_dirs=None, zip_path='data.zip', base_dir='./'):
115
  assert os.path.exists(file_to_archive)
116
  path_to_archive = os.path.relpath(file_to_archive, base_dir)
117
  expt_zip.write(filename=file_to_archive, arcname=path_to_archive)
118
- return "data.zip"
119
 
120
 
121
  def save_generate_output(output=None, base_model=None, save_dir=None):
 
 
1
  import os
2
  import gc
3
  import random
 
4
  import time
5
  import traceback
6
  import zipfile
7
+ from datetime import datetime
8
  import filelock
9
  import numpy as np
10
  import pandas as pd
 
93
  return "Error: %s" % str(e)
94
 
95
 
96
+ def zip_data(root_dirs=None, zip_file=None, base_dir='./'):
97
  try:
98
+ return _zip_data(zip_file=zip_file, base_dir=base_dir, root_dirs=root_dirs)
99
  except Exception as e:
100
  traceback.print_exc()
101
  print('Exception in zipping: %s' % str(e))
102
 
103
 
104
+ def _zip_data(root_dirs=None, zip_file=None, base_dir='./'):
105
+ if zip_file is None:
106
+ datetime_str = str(datetime.now()).replace(" ", "_").replace(":", "_")
107
+ host_name = os.getenv('HF_HOSTNAME', 'emptyhost')
108
+ zip_file = "data_%s_%s.zip" % (datetime_str, host_name)
109
  assert root_dirs is not None
110
+
111
+ with zipfile.ZipFile(zip_file, "w") as expt_zip:
112
  for root_dir in root_dirs:
113
  if root_dir is None:
114
  continue
 
118
  assert os.path.exists(file_to_archive)
119
  path_to_archive = os.path.relpath(file_to_archive, base_dir)
120
  expt_zip.write(filename=file_to_archive, arcname=path_to_archive)
121
+ return zip_file
122
 
123
 
124
  def save_generate_output(output=None, base_model=None, save_dir=None):
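A hedged usage sketch of the reworked zip_data: with zip_file=None it now derives a timestamped archive name, suffixed by the HF_HOSTNAME environment variable when set (directory names below are illustrative only):

from utils import zip_data

zip_file = zip_data(root_dirs=['flagged_data_points', 'save_dir'], zip_file=None, base_dir='./')
print(zip_file)  # e.g. data_2023-04-25_12_34_56.789012_emptyhost.zip when HF_HOSTNAME is unset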