"""
Client test.  Simplest case is chat=False and stream_output=False

Run server with same choices:

python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-256-6.9b --chat=False --stream_output=False

NOTE: For private models, add --use_auth_token=True

NOTE: --infer_devices=True (the default) must be used on multi-GPU systems if you see cuda:x/cuda:y device-mismatch failures.
Currently this forces the model onto a single GPU.

Then run this client as:

python client_test.py
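
Or collect and run the test function below with pytest:

pytest -s client_test.py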
"""

import json
import os
import time

os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'

import markdown  # pip install markdown
from bs4 import BeautifulSoup  # pip install beautifulsoup4
from gradio_client import Client

debug = False

client = Client("http://localhost:7860")
if debug:
    print(client.view_api(all_endpoints=True))
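
# view_api lists the server's named endpoints (e.g. '/submit', '/instruction',
# '/instruction_bot') and their expected parameters, which is useful when
# matching the args list below to the server's signature.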

instruction = "Who are you?"
iinput = ''
context = ''
# Streaming output is supported: the client loops over and prints each partial
# generation.  Leave stream_output=False for simple one-shot input/output.
stream_output = False
prompt_type = 'human_bot'
temperature = 0.1  # sampling temperature (lower = more deterministic)
top_p = 0.75  # nucleus sampling cutoff
top_k = 40  # top-k sampling cutoff
num_beams = 1  # no beam search
max_new_tokens = 500
min_new_tokens = 0
early_stopping = False
max_time = 180  # seconds before generation is cut off
repetition_penalty = 1.0
num_return_sequences = 1
do_sample = True  # sample instead of greedy decoding

# CHOOSE: must match the server's --chat setting.
# NOTE: chat mode works through files on the gradio server, and the client
# would currently have to read those files back out of /tmp, so it is not
# ideal for a client.  Default to False.
chat = False


def test_client_basic():
    # Ordered inputs for the gradio endpoint; the order must match the
    # server's function signature exactly.
    args = [instruction,
            iinput,
            context,
            stream_output,
            prompt_type,
            temperature,
            top_p,
            top_k,
            num_beams,
            max_new_tokens,
            min_new_tokens,
            early_stopping,
            max_time,
            repetition_penalty,
            num_return_sequences,
            do_sample]

    if not chat:
        # requires generate.py to run with --chat=False
        api_name = '/submit'
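        if stream_output:
            # A minimal sketch, assuming the server was started with
            # --stream_output=True and that partial results from '/submit'
            # are exposed through gradio_client's job API.
            job = client.submit(*tuple(args), api_name=api_name)
            while not job.done():
                partial = job.outputs()  # outputs produced so far
                if partial:
                    print(md_to_text(partial[-1]), flush=True)
                time.sleep(0.1)
            return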
        res = client.predict(
            *tuple(args),
            api_name=api_name,
        )
        print(md_to_text(res))
    else:
        api_name = '/instruction'
        # chat mode passes state through a JSON file of [user, bot] message
        # pairs; seed it with a single empty exchange
        foofile = '/tmp/foo.json'
        with open(foofile, 'wt') as f:
            json.dump([['', None]], f)
        args += [foofile]
        if not stream_output:
            # submit the instruction first, then ask the bot endpoint to reply
            for res in client.predict(
                    *tuple(args),
                    api_name=api_name,
            ):
                print(res)
            res_file = client.predict(*tuple(args), api_name='/instruction_bot')
            # the returned file holds the chat history; the reply is the last
            # element of the last [user, bot] pair
            res = json.load(open(res_file, "rt"))[-1][-1]
            print(md_to_text(res))
        else:
            print("streaming instruction_bot", flush=True)
            job = client.submit(*tuple(args), api_name='/instruction_bot')
            while not job.done():
                # poll for partial outputs while the job runs
                outputs_list = job.communicator.job.outputs
                if outputs_list:
                    res_file = outputs_list[-1]
                    res = json.load(open(res_file, "rt"))[-1][-1]
                    print(md_to_text(res))
                time.sleep(0.1)
            print(job.outputs())


def md_to_text(md):
    """Convert a markdown string to plain text by rendering it to HTML and
    stripping the tags."""
    html = markdown.markdown(md)
    soup = BeautifulSoup(html, features='html.parser')
    return soup.get_text()
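
# Quick sanity check of the markdown + BeautifulSoup pipeline:
# md_to_text("**Hello**, *world*!") -> "Hello, world!"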


if __name__ == '__main__':
    test_client_basic()