taufiqdp commited on
Commit
f816b98
•
1 Parent(s): 3d25b03

Upload folder using huggingface_hub

Files changed (2)
  1. app.py +124 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,124 @@
+ import os
+ import subprocess
+ from threading import Thread
+
+ import gradio as gr
+ import spaces
+ import torch
+ from huggingface_hub import login
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+
+ login(os.environ.get("HF_TOKEN"))
+ # Install flash-attn at runtime; skip the CUDA build step and keep the existing
+ # environment (replacing env entirely would drop PATH and break the pip call).
+ subprocess.run(
+     "pip install flash-attn --no-build-isolation",
+     env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+     shell=True,
+ )
+
+ model_id = "microsoft/Phi-3-mini-128k-instruct"
+
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     attn_implementation="flash_attention_2",
+ )
+
+
+ @spaces.GPU()
+ def generate(
+     message: str,
+     chat_history: list[tuple[str, str]],
+     system_prompt: str,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float,
+     top_k: int,
+     repetition_penalty: float,
+ ):
+     # Rebuild the conversation in the message format expected by the chat template.
+     conversation = []
+     if system_prompt:
+         conversation.append({"role": "system", "content": system_prompt})
+     for user, assistant in chat_history:
+         conversation.append({"role": "user", "content": user})
+         conversation.append({"role": "assistant", "content": assistant})
+     conversation.append({"role": "user", "content": message})
+
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+     inputs = tokenizer.apply_chat_template(
+         conversation,
+         add_generation_prompt=True,
+         return_tensors="pt",
+         return_dict=True,
+     ).to(model.device)
+
+     generate_kwargs = dict(
+         input_ids=inputs["input_ids"],
+         attention_mask=inputs["attention_mask"],
+         streamer=streamer,
+         do_sample=True,
+         temperature=temperature,
+         max_new_tokens=max_new_tokens,
+         top_k=top_k,
+         repetition_penalty=repetition_penalty,
+         top_p=top_p,
+     )
+
+     # Run generation on a background thread so tokens can be yielded as they stream in.
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+     outputs = []
+     for new_token in streamer:
+         outputs.append(new_token)
+         yield "".join(outputs)
+
+
+ gr.ChatInterface(
+     fn=generate,
+     title="🚀 Phi-3 mini 128k instruct",
+     description="",
+     additional_inputs=[
+         gr.Textbox(
+             label="System prompt",
+             lines=5,
+             value="You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.",
+         ),
+         gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=2048,
+             step=1,
+             value=1024,
+         ),
+         gr.Slider(
+             label="Temperature",
+             minimum=0.1,
+             maximum=4.0,
+             step=0.1,
+             value=0.6,
+         ),
+         gr.Slider(
+             label="Top-p (nucleus sampling)",
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=0.9,
+         ),
+         gr.Slider(
+             label="Top-k",
+             minimum=1,
+             maximum=1000,
+             step=1,
+             value=50,
+         ),
+         gr.Slider(
+             label="Repetition penalty",
+             minimum=1.0,
+             maximum=2.0,
+             step=0.05,
+             value=1.2,
+         ),
+     ],
+     stop_btn=None,
+     examples=[
+         ["Can you provide ways to eat combinations of bananas and dragonfruits?"],
+         ["Write a story about a dragon fruit that flies into outer space!"],
+         ["I am going to Bali, what should I see?"],
+     ],
+ ).queue().launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers==4.40.0
+ accelerate==0.29.3
+ gradio==4.27.0