dwb2023 committed
Commit b65471a · verified · 1 Parent(s): 3ffd3b8

Update app.py

Files changed (1)
  1. app.py +68 -110
app.py CHANGED
@@ -1,16 +1,14 @@
-import os
-from threading import Thread
-from typing import Iterator
-
 import gradio as gr
-import spaces
 import torch
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     TextIteratorStreamer,
-    LlamaTokenizer,
 )
+import os
+from threading import Thread
+import spaces
+import time
 import subprocess
 
 subprocess.run(
@@ -19,129 +17,89 @@ subprocess.run(
     shell=True,
 )
 
+token = os.environ["HF_TOKEN"]
 
-MAX_MAX_NEW_TOKENS = 1024
-DEFAULT_MAX_NEW_TOKENS = 50
-MAX_INPUT_TOKEN_LENGTH = 512
-
-DESCRIPTION = """\
-# Phi-3-mini-4k-instruct
-
-This Space demonstrates [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) by Microsoft. Please, check the original model card for details.
-
-For additional detail on the model, including a link to the arXiv paper, refer to the [Hugging Face Paper page for Phi 3](https://huggingface.co/papers/2404.14219).
-"""
 
 model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Phi-3-mini-4k-instruct",
-    device_map="cuda",
-    torch_dtype="auto",
-    trust_remote_code=True
+    "microsoft/Phi-3-mini-128k-instruct",
+    token=token,
+    trust_remote_code=True,
 )
-tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
-
-if tokenizer.pad_token == None:
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.pad_token_id = tokenizer.eos_token_id
-    model.config.pad_token_id = tokenizer.eos_token_id
-
-def generate(
-    message: str,
-    chat_history: list[tuple[str, str]],
-    max_new_tokens: int = 1024,
-    temperature: float = 0.1,
-    top_p: float = 0.4,
-    top_k: int = 10,
-    repetition_penalty: float = 1.4,
-) -> Iterator[str]:
-
-    historical_text = ""
-    # Prepend the entire chat history to the message with new lines between each message
-    for user, assistant in chat_history:
-        historical_text += f"\n{user}\n{assistant}"
-
-    if len(historical_text) > 0:
-        message = historical_text + f"\n{message}"
-    input_ids = tokenizer([message], return_tensors="pt").input_ids
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
-
-    streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
+tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
+terminators = [
+    tok.eos_token_id,
+]
+
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+else:
+    device = torch.device("cpu")
+    print("Using CPU")
+
+model = model.to(device)
+# Dispatch Errors
+
+
+@spaces.GPU(duration=60)
+def chat(message, history, temperature, do_sample, max_tokens):
+    chat = []
+    for item in history:
+        chat.append({"role": "user", "content": item[0]})
+        if item[1] is not None:
+            chat.append({"role": "assistant", "content": item[1]})
+    chat.append({"role": "user", "content": message})
+    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+    model_inputs = tok([messages], return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(
+        tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+    )
     generate_kwargs = dict(
-        {"input_ids": input_ids},
+        model_inputs,
         streamer=streamer,
-        max_new_tokens=max_new_tokens,
+        max_new_tokens=max_tokens,
         do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
         temperature=temperature,
-        num_beams=1,
-        pad_token_id=tokenizer.eos_token_id,
-        repetition_penalty=repetition_penalty,
-        no_repeat_ngram_size=5,
-        early_stopping=False,
+        eos_token_id=terminators,
     )
+
+    if temperature == 0:
+        generate_kwargs["do_sample"] = False
+
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs)
+    partial_text = ""
+    for new_text in streamer:
+        partial_text += new_text
+        yield partial_text
+
+    yield partial_text
 
 
-chat_interface = gr.ChatInterface(
-    fn=generate,
+demo = gr.ChatInterface(
+    fn=chat,
+    examples=[["Write me a poem about Machine Learning."]],
+    # multimodal=False,
+    additional_inputs_accordion=gr.Accordion(
+        label="⚙️ Parameters", open=False, render=False
+    ),
     additional_inputs=[
         gr.Slider(
-            label="Max new tokens",
-            minimum=1,
-            maximum=MAX_MAX_NEW_TOKENS,
-            step=1,
-            value=DEFAULT_MAX_NEW_TOKENS,
-        ),
-        gr.Slider(
-            label="Temperature",
-            minimum=0.1,
-            maximum=4.0,
-            step=0.1,
-            value=0.1,
-        ),
-        gr.Slider(
-            label="Top-p (nucleus sampling)",
-            minimum=0.05,
-            maximum=1.0,
-            step=0.05,
-            value=0.5,
+            minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
         ),
+        gr.Checkbox(label="Sampling", value=True),
         gr.Slider(
-            label="Top-k",
-            minimum=1,
-            maximum=1000,
+            minimum=128,
+            maximum=4096,
             step=1,
-            value=3,
-        ),
-        gr.Slider(
-            label="Repetition penalty",
-            minimum=1.0,
-            maximum=2.0,
-            step=0.05,
-            value=1.4,
+            value=512,
+            label="Max new tokens",
+            render=False,
         ),
     ],
-    stop_btn=None,
-    cache_examples=False,
-    examples=[
-        ["Explain quantum physics in 5 words or less:"],
-        ["Question: What do you call a bear with no teeth?\nAnswer:"],
-    ],
+    stop_btn="Stop Generation",
+    title="Chat With LLMs",
+    description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)",
 )
-
-with gr.Blocks(css="style.css") as demo:
-    gr.Markdown(DESCRIPTION)
-    chat_interface.render()
-
-if __name__ == "__main__":
-    demo.queue(max_size=20).launch()
+demo.launch()
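
For reference, the generation pattern the new app.py adopts (format the conversation with apply_chat_template, then stream tokens from model.generate() running in a background thread) can be exercised outside Gradio with a minimal sketch like the one below. This is not part of the commit; the model id, token handling, and generation arguments simply mirror the diff, and running it assumes a machine with enough memory for Phi-3-mini and a recent transformers release.

# Minimal sketch (not part of the commit) of the streaming pattern used above:
# apply_chat_template + TextIteratorStreamer + model.generate() in a background thread.
import os
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "microsoft/Phi-3-mini-128k-instruct"
token = os.environ.get("HF_TOKEN")  # assumed to be set, as in the Space

tok = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=token, trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Build the prompt the same way chat() does, from role/content messages.
messages = [{"role": "user", "content": "Write me a poem about Machine Learning."}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tok([prompt], return_tensors="pt").to(device)

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    model_inputs,
    streamer=streamer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.9,
    eos_token_id=[tok.eos_token_id],
)

# generate() blocks, so run it in a thread and consume tokens as they arrive.
Thread(target=model.generate, kwargs=generate_kwargs).start()
for new_text in streamer:
    print(new_text, end="", flush=True)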