Files changed (2)
  1. app.py +168 -8
  2. requirements.txt +13 -2
app.py CHANGED
@@ -1,35 +1,195 @@
 from transformers import AutoModel, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM
 import gradio as gr
 import torch
+import os
+import io
+import sys
+import platform
+import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch._C as ipex_core
+from cpuinfo import get_cpu_info
+from contextlib import redirect_stdout
+
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.3", trust_remote_code=True)
-model = LlamaForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.3", trust_remote_code=True).to(DEVICE)
+ROOT = '/'
+SELF_ROOT = '/proc/self/root'
+
+tokenizer = LlamaTokenizer.from_pretrained(
+    "lmsys/vicuna-7b-v1.3", trust_remote_code=True
+)
+model = LlamaForCausalLM.from_pretrained(
+    "lmsys/vicuna-7b-v1.3", trust_remote_code=True
+).to(DEVICE)
 model = model.eval()
 
+
+def in_chroot():
+    '''
+    Return true if running in a chroot environment.
+    '''
+    try:
+        root_stat = os.stat(ROOT)
+        self_stat = os.stat(SELF_ROOT)
+    except FileNotFoundError as e:
+        sys.exit(f"ERROR: Failed to stat: {e}")
+
+    root_inode = root_stat.st_ino
+    self_inode = self_stat.st_ino
+
+    # Inode 2 is the root inode for most filesystems.
+    # However, XFS uses 128 for root.
+    if root_inode not in [2, 128]:
+        return True
+
+    return not (root_inode == self_inode)
+
+
+def get_features():
+    '''
+    Return a dictionary of all features:
+
+    key: feature name.
+    value: Boolean showing if the feature is available.
+    '''
+
+    cpu_info = get_cpu_info()
+    flags = cpu_info["flags"]
+
+    detect_ipex_amx_enabled = lambda: ipex_core._get_current_isa_level() == 'AMX'
+    detect_ipex_amx_available = (
+        lambda: ipex_core._get_highest_cpu_support_isa_level() == 'AMX'
+    )
+
+    features = {
+        'VM': 'hypervisor' in flags,
+        'TDX TD': 'tdx_guest' in flags,
+        'AMX available': 'amx_tile' in flags,
+        'AMX-BF16 available': 'amx_bf16' in flags,
+        'AMX-INT8 available': 'amx_int8' in flags,
+        'AVX-VNNI available': 'avx_vnni' in flags,
+        'AVX512-VNNI available': 'avx512_vnni' in flags,
+        'AVX512-FP16 available': 'avx512_fp16' in flags,
+        'AVX512-BF16 available': 'avx512_bf16' in flags,
+        'AMX IPEX available': detect_ipex_amx_available(),
+        'AMX IPEX enabled': detect_ipex_amx_enabled(),
+    }
+
+    return features
+
+
+def get_debug_details():
+    '''
+    Return a block of markdown text that shows useful debug
+    information.
+    '''
+
+    # ipex.version() prints to stdout, so redirect stdout to
+    # capture the output.
+    buffer = io.StringIO()
+
+    with redirect_stdout(buffer):
+        ipex.version()
+
+    ipex_version_details = buffer.getvalue().replace("\n", ", ")
+
+    ipex_current_isa_level = ipex_core._get_current_isa_level()
+    ipex_max_isa_level = ipex_core._get_highest_cpu_support_isa_level()
+
+    ipex_env_var = os.getenv('ATEN_CPU_CAPABILITY')
+    onednn_env_var = os.getenv('ONEDNN_MAX_CPU_ISA')
+
+    in_chroot_result = in_chroot()
+
+    cpu_info = get_cpu_info()
+    flags = cpu_info["flags"]
+
+    # Note that rather than using `<details>`, we could use gradio.Accordion(),
+    # but the markdown version is more visually compact.
+    md = f"""
+<details>
+<summary>Click to show debug details</summary>
+
+| Feature | Value |
+|-|-|
+| Arch | `{cpu_info['arch']}` |
+| CPU | `{cpu_info['brand_raw']}` |
+| CPU flags | `{flags}` |
+| Python version | `{sys.version}` (implementation: `{platform.python_implementation()}`) |
+| Python version details | `{sys.version_info}` |
+| PyTorch version | `{torch.__version__}` |
+| IPEX version | `{ipex.ipex_version}` |
+| IPEX CPU detected | `{ipex_core._has_cpu()}` |
+| IPEX XPU detected | `{ipex_core._has_xpu()}` |
+| IPEX version details | `{ipex_version_details}` |
+| IPEX env var `ATEN_CPU_CAPABILITY` | `{ipex_env_var}` |
+| IPEX current ISA level | `{ipex_current_isa_level}` |
+| IPEX max ISA level | `{ipex_max_isa_level}` |
+| oneDNN env var `ONEDNN_MAX_CPU_ISA` | `{onednn_env_var}` |
+| in chroot | `{in_chroot_result}` |
+
+</details>
+"""
+
+    return md
+
+
 def predict(input, history=None):
     if history is None:
         history = []
-    new_user_input_ids = tokenizer.encode(input + tokenizer.eos_token, return_tensors='pt')
+    new_user_input_ids = tokenizer.encode(
+        input + tokenizer.eos_token, return_tensors='pt'
+    )
     bot_input_ids = torch.cat([torch.LongTensor(history), new_user_input_ids], dim=-1)
-    history = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id).tolist()
+    history = model.generate(
+        bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id
+    ).tolist()
     # convert the tokens to text, and then split the responses into the right format
     response = tokenizer.decode(history[0]).split("<|endoftext|>")
-    response = [(response[i], response[i+1]) for i in range(0, len(response)-1, 2)] # convert to tuples of list
+    response = [
+        (response[i], response[i + 1]) for i in range(0, len(response) - 1, 2)
+    ]  # convert to a list of tuples
     return response, history
 
 
 with gr.Blocks() as demo:
-    gr.Markdown('''## Confidential HuggingFace Runner
-    ''')
+    gr.Markdown(
+        '''## Confidential HuggingFace Runner
+        '''
+    )
     state = gr.State([])
     chatbot = gr.Chatbot([], elem_id="chatbot").style(height=400)
     with gr.Row():
         with gr.Column(scale=4):
-            txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
+            txt = gr.Textbox(
+                show_label=False, placeholder="Enter text and press enter"
+            ).style(container=False)
         with gr.Column(scale=1):
             button = gr.Button("Generate")
     txt.submit(predict, [txt, state], [chatbot, state])
     button.click(predict, [txt, state], [chatbot, state])
+
+    with gr.Row():
+        features_dict = get_features()
+
+        all_features = features_dict.keys()
+
+        # Get a list of feature names that are actually set/available
+        set_features = [key for key in features_dict if features_dict[key]]
+
+        gr.CheckboxGroup(
+            all_features,
+            label="Features",
+            # Make the boxes read-only
+            interactive=False,
+            # Specify which features were detected
+            value=set_features,
+            info="Features detected from environment",
+        )
+
+    with gr.Row():
+        debug_details = get_debug_details()
+        gr.Markdown(debug_details)
+
 demo.queue().launch(share=True, server_name="0.0.0.0")
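The new `get_features()` and `in_chroot()` helpers drive the read-only checkbox row and the debug table added above. Below is a minimal standalone sketch of the same two checks, runnable outside Gradio; it is not part of this diff, assumes a Linux host with `py-cpuinfo` installed, and reuses the flag names and inode heuristic from app.py.

```python
# Sketch: reproduce the CPU-flag and chroot checks from app.py without Gradio.
import os

from cpuinfo import get_cpu_info  # py-cpuinfo, as pinned in requirements.txt

flags = set(get_cpu_info()["flags"])

# A few of the flag names app.py maps to UI checkboxes.
for label, flag in [
    ("VM", "hypervisor"),
    ("TDX TD", "tdx_guest"),
    ("AMX available", "amx_tile"),
    ("AMX-BF16 available", "amx_bf16"),
    ("AMX-INT8 available", "amx_int8"),
]:
    print(f"{label}: {flag in flags}")

# Root inode is 2 on most filesystems (128 on XFS); if / and /proc/self/root
# disagree, the process is likely running inside a chroot (Linux-only paths).
root_ino = os.stat("/").st_ino
self_ino = os.stat("/proc/self/root").st_ino
print("in chroot:", root_ino not in (2, 128) or root_ino != self_ino)
```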
requirements.txt CHANGED
@@ -1,6 +1,17 @@
-torch
+# For pytorch
+--find-links https://download.pytorch.org/whl/torch_stable.html
+
+# For ipex
+--trusted-host pytorch-extension.intel.com
+--extra-index-url http://pytorch-extension.intel.com/release-whl/stable/cpu/us/intel-extension-for-pytorchtorch
+
 cpm_kernels
 icetk
 gradio==3.50.2
 accelerate
-git+https://github.com/huggingface/transformers
+git+https://github.com/huggingface/transformers
+py-cpuinfo
+
+# Versions must match
+torch==2.3.0+cpu
+intel-extension-for-pytorch==2.3.0
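The `# Versions must match` comment means torch and intel-extension-for-pytorch must come from the same 2.3.0 release line. A small hedged sketch of a startup check one could add, not part of this diff; it assumes the conventional `torch.__version__` and `ipex.__version__` attributes used in IPEX's own installation sanity check.

```python
# Sketch: fail fast at startup if the pinned torch / IPEX releases drift apart.
import torch
import intel_extension_for_pytorch as ipex

torch_release = torch.__version__.split("+")[0]  # e.g. "2.3.0+cpu" -> "2.3.0"
ipex_release = ipex.__version__.split("+")[0]    # e.g. "2.3.0+cpu" -> "2.3.0"

# Compare major.minor; the local build suffixes (+cpu) are allowed to differ.
if torch_release.split(".")[:2] != ipex_release.split(".")[:2]:
    raise RuntimeError(
        f"torch {torch.__version__} and IPEX {ipex.__version__} must match"
    )
```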