SebastianSchramm committed on
Commit
2f8f51f
1 Parent(s): e7b3eec
Files changed (2) hide show
  1. app.py +156 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, TextIteratorStreamer
4
+ import torch
5
+ from threading import Thread
6
+ from huggingface_hub import Repository
7
+ import json
8
+
9
# UI theme: monochrome base with indigo/blue accents and Open Sans.
theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
)

# Silence the tokenizers fork/parallelism warning (we run generation in a thread).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Choose the execution device and load model + tokenizer once at import time.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "SebastianSchramm/Cerebras-GPT-111M-instruction"
if device == "cpu":
    model = AutoModelForCausalLM.from_pretrained(model_id)
else:
    # device_map="auto" lets accelerate place the weights across available GPUs.
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Alpaca-style prompt wrapper; `generate` fills in {instruction} and {input}.
prompt_template = (
    "Below is an instruction that describes a task, paired with an input that provides further context.\n"
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
)
30
+
31
+
32
def generate(instruction, input, temperature=1.0, max_new_tokens=256, top_p=0.9, length_penalty=1.0):
    """Stream a model completion for the given instruction/input pair.

    Yields the accumulated response text each time new tokens arrive, so the
    gradio Textbox updates incrementally; the last yielded value is the full
    response (prompt echo and special tokens already stripped).

    Args:
        instruction: task description inserted into the prompt template.
        input: additional context inserted into the prompt template.
        temperature / top_p / length_penalty: sampling controls; may arrive
            as strings from the UI sliders and are coerced to float.
        max_new_tokens: cap on generated tokens.
    """
    formatted_instruction = prompt_template.format(instruction=instruction, input=input)

    # Slider values may be delivered as strings; normalize before passing on.
    temperature = float(temperature)
    top_p = float(top_p)
    length_penalty = float(length_penalty)

    # skip_prompt/skip_special_tokens make the streamer drop the echoed prompt
    # and the EOS token itself. The previous character-count heuristic
    # (len(hidden_output) <= len(formatted_instruction)) compared decoded-text
    # lengths and could swallow the first generated chunk.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model_inputs = tokenizer(formatted_instruction, return_tensors="pt", truncation=True, max_length=2048)
    # Move input tensors onto the same device as the model.
    model_inputs = {k: v.to(device) for k, v in model_inputs.items()}

    generate_kwargs = dict(
        top_p=top_p,
        top_k=0,
        temperature=temperature,
        do_sample=True,
        max_new_tokens=max_new_tokens,
        # NOTE: early_stopping was dropped -- it only applies to beam search
        # and merely triggers a warning when do_sample=True with no beams.
        length_penalty=length_penalty,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    # model.generate blocks until done, so run it in a worker thread and
    # consume the streamer concurrently on this one.
    worker = Thread(target=model.generate, kwargs={**model_inputs, "streamer": streamer, **generate_kwargs})
    worker.start()

    output = ""
    for new_text in streamer:
        output += new_text
        yield output
    worker.join()  # make sure generation has fully finished before returning
    return output
75
+
76
# Example (instruction, input) pairs shown under the form; empty for now.
examples = []


def process_example(*args):
    """Drive `generate` to completion for example caching and return the final text.

    gradio's example caching calls this with the example's input values as
    separate positional arguments. The previous version took a single `args`
    parameter and called `generate(args)`, which passed the whole sequence as
    `instruction` and omitted the required `input` argument (TypeError).
    """
    result = ""  # guard against an UnboundLocal if the generator yields nothing
    for result in generate(*args):
        pass
    return result
83
+
84
# Build the gradio UI: instruction/input form on the left, sampling controls
# on the right, streaming output via the `generate` generator.
with gr.Blocks(theme=theme) as demo:
    with gr.Column():
        gr.Markdown(
            """<h1><center>Instruction-tuned Cerebras GPT 111M Language Model for Text</center></h1>
            <p>
            [Cerebras-GPT-111M-instruction](https://huggingface.co/SebastianSchramm/Cerebras-GPT-111M-instruction)
            </p>
            """
        )
        with gr.Row():
            with gr.Column(scale=3):
                instruction = gr.Textbox(placeholder="Instruction...", label="instruction")
                input = gr.Textbox(placeholder="Input...", label="input")
                output = gr.Textbox(
                    interactive=False,
                    lines=8,
                    label="Response",
                    placeholder="Response will be shown here...",
                )
                submit = gr.Button("Generate", variant="primary")
                gr.Examples(
                    examples=examples,
                    inputs=[instruction, input],
                    cache_examples=True,
                    fn=process_example,
                    outputs=[output],
                )

            with gr.Column(scale=1):
                temperature = gr.Slider(
                    label="Temperature",
                    value=1.0,
                    minimum=0.01,
                    maximum=1.0,
                    step=0.1,
                    interactive=True,
                    info="The higher more random",
                )
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    value=256,
                    minimum=0,
                    maximum=2048,
                    step=5,
                    interactive=True,
                    info="The maximum numbers of new tokens",
                )
                top_p = gr.Slider(
                    label="Top p",
                    value=0.9,
                    minimum=0.01,
                    maximum=1,
                    step=0.05,
                    interactive=True,
                    info="probabilities that add up are kept",
                )
                length_penalty = gr.Slider(
                    label="Length penalty",
                    value=1.0,
                    minimum=-10.0,
                    maximum=10.0,
                    step=0.1,
                    interactive=True,
                    info="> 0.0 longer, < 0.0 shorter",
                )

    # Both the button and pressing Enter in the instruction box trigger generation.
    submit.click(generate, inputs=[instruction, input, temperature, max_new_tokens, top_p, length_penalty], outputs=[output])
    instruction.submit(
        generate, inputs=[instruction, input, temperature, max_new_tokens, top_p, length_penalty], outputs=[output]
    )

# Queueing is required for streaming (generator) outputs; one request at a time.
demo.queue(concurrency_count=1)
# NOTE: launch(enable_queue=True) was dropped -- the parameter is deprecated
# (removed in gradio 3.x); demo.queue() above already enables the queue.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/peft.git
2
+ git+https://github.com/huggingface/transformers.git
3
+ huggingface_hub
4
+ accelerate==0.17.1
5
+ bitsandbytes