s3nh committed
Commit b5edb1f
1 Parent(s): 1c65322

Update app.py

Files changed (1)
  1. app.py +10 -44
app.py CHANGED
@@ -1,6 +1,4 @@
-"""Run codes."""
-# pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
-# ruff: noqa: E501
+
 import os
 import platform
 import random
@@ -8,7 +6,6 @@ import time
 from dataclasses import asdict, dataclass
 from pathlib import Path
 
-# from types import SimpleNamespace
 import gradio as gr
 import psutil
 from about_time import about_time
@@ -16,22 +13,6 @@ from ctransformers import AutoModelForCausalLM
 from dl_hf_model import dl_hf_model
 from loguru import logger
 
-# filename_list = [
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q2_K.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_L.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_M.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_S.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_0.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_1.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_K_M.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_K_S.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_0.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_1.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_K_M.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_K_S.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q6_K.bin",
-# "Wizard-Vicuna-7B-Uncensored.ggmlv3.q8_0.bin",
-# ]
 
 URL = "https://huggingface.co/s3nh/mamba-gpt-3b-GGML/resolve/main/mamba-gpt-3b.ggmlv3.q8_0.bin"  # 4.05G
 
@@ -44,10 +25,7 @@ _ = (
 )
 
 if _:
-    # url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q2_K.bin"
     url = "https://huggingface.co/s3nh/mamba-gpt-3b-GGML/resolve/main/mamba-gpt-3b.ggmlv3.q8_0.bin"  # 2.87G
-    # url = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/blob/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin"  # 2.87G
-    # url = "https://huggingface.co/TheBloke/llama2_7b_chat_uncensored-GGML/blob/main/llama2_7b_chat_uncensored.ggmlv3.q4_K_M.bin"  # 4.08G
 
 
 prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
@@ -123,16 +101,14 @@ except Exception as exc_:
 LLM = AutoModelForCausalLM.from_pretrained(
     model_loc,
     model_type="llama",
-    # threads=cpu_count,
 )
 
 logger.info(f"done load llm {model_loc=} {file_size=}G")
 
 os.environ["TZ"] = "Asia/Shanghai"
 try:
-    time.tzset()  # type: ignore # pylint: disable=no-member
-except Exception:
-    # Windows
+    time.tzset()
+
     logger.warning("Windows, cant run time.tzset()")
 
 _ = """
@@ -162,8 +138,7 @@ def generate(
     config: GenerationConfig = GenerationConfig(),
 ):
     """Run model inference, will return a Generator if streaming is true."""
-    # _ = prompt_template.format(question=question)
-    # print(_)
+
 
     prompt = prompt_template.format(question=question)
 
@@ -177,16 +152,13 @@ logger.debug(f"{asdict(GenerationConfig())=}")
 
 
 def user(user_message, history):
-    # return user_message, history + [[user_message, None]]
     history.append([user_message, None])
-    return user_message, history  # keep user_message
+    return user_message, history
 
 
 def user1(user_message, history):
-    # return user_message, history + [[user_message, None]]
     history.append([user_message, None])
-    return "", history  # clear user_message
-
+    return "", history
 
 
 def bot_(history):
     user_message = history[-1][0]
@@ -208,7 +180,7 @@ def bot(history):
 
     logger.debug(f"{user_message=}")
 
-    with about_time() as atime:  # type: ignore
+    with about_time() as atime:
         flag = 1
         prefix = ""
         then = time.time()
@@ -224,15 +196,14 @@ def bot(history):
                 print(prefix, end="", flush=True)
                 logger.debug(f"{prefix=}")
             print(elm, end="", flush=True)
-            # logger.debug(f"{elm}")
 
             response.append(elm)
             history[-1][1] = prefix + "".join(response)
             yield history
 
         _ = (
-            f"(time elapsed: {atime.duration_human}, "  # type: ignore
-            f"{atime.duration/len(''.join(response)):.2f}s/char)"  # type: ignore
+            f"(time elapsed: {atime.duration_human}, "
+            f"{atime.duration/len(''.join(response)):.2f}s/char)"
        )
 
         history[-1][1] = "".join(response) + f"\n{_}"
@@ -250,10 +221,8 @@ def predict_api(prompt):
             repetition_penalty=1.0,
             max_new_tokens=512,  # adjust as needed
             seed=42,
-            reset=True,  # reset history (cache)
+            reset=True,
             stream=False,
-            # threads=cpu_count,
-            # stop=prompt_prefix[1:2],
         )
 
         response = generate(
@@ -265,9 +234,6 @@ def predict_api(prompt):
     except Exception as exc:
         logger.error(exc)
         response = f"{exc=}"
-    # bot = {"inputs": [response]}
-    # bot = [(prompt, response)]
-
     return response
 
 
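For anyone exercising the model-loading path this commit touches outside the Space, here is a minimal, hedged sketch built only from the calls visible in the diff (dl_hf_model, ctransformers' AutoModelForCausalLM.from_pretrained, and the GGML URL). The tuple unpacking of dl_hf_model's return value and the sample prompt are illustrative assumptions, not part of the commit:

# Minimal sketch, not the Space's app.py: fetch the GGML file referenced in the
# diff and load it with ctransformers the same way app.py does.
from ctransformers import AutoModelForCausalLM
from dl_hf_model import dl_hf_model

URL = "https://huggingface.co/s3nh/mamba-gpt-3b-GGML/resolve/main/mamba-gpt-3b.ggmlv3.q8_0.bin"

# Assumption: dl_hf_model returns (local_path, size_in_GB), matching how app.py
# uses model_loc and file_size after downloading.
model_loc, file_size = dl_hf_model(URL)

llm = AutoModelForCausalLM.from_pretrained(model_loc, model_type="llama")

# Illustrative prompt in the Alpaca style of the app's prompt_template.
prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction: What is a GGML model file?\n\n### Response:"
)
print(llm(prompt, max_new_tokens=64))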