muryshev committed
Commit a3a2261
1 Parent(s): f5355b8

Update app.py

Files changed (1):
  1. app.py (+14, -26)
app.py CHANGED
@@ -37,7 +37,17 @@ model_name = "ggml-model-q4_1.gguf"
 
 snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
 
-
+model = Llama(
+    model_path=model_name,
+    n_ctx=2000,
+    n_parts=1,
+    #n_batch=100,
+    logits_all=True,
+    #n_threads=12,
+    verbose=True,
+    n_gpu_layers=35,
+    n_gqa=8 #must be set for 70b models
+)
 
 
 def get_message_tokens(model, role, content):
@@ -107,7 +117,7 @@ def generate_unknown_response():
 def generate_search_request():
     global stop_generation
     stop_generation = False
-
+    model.reset()
 
 
     data = request.get_json()
@@ -125,17 +135,7 @@ def generate_search_request():
     top_k = 20
     return_full_text = parameters.get("return_full_text", False)
 
-    model = Llama(
-        model_path=model_name,
-        n_ctx=2000,
-        n_parts=1,
-        #n_batch=100,
-        logits_all=True,
-        #n_threads=12,
-        verbose=True,
-        n_gpu_layers=30,
-        n_gqa=8 #must be set for 70b models
-    )
+
 
     tokens = get_system_tokens_for_preprompt(model, preprompt)
     tokens.append(LINEBREAK_TOKEN)
@@ -157,7 +157,7 @@ def generate_search_request():
 def generate_response():
     global stop_generation
     stop_generation = False
-
+    model.reset()
 
     data = request.get_json()
     app.logger.info(data)
@@ -175,18 +175,6 @@ def generate_response():
     return_full_text = parameters.get("return_full_text", False)
 
 
-    model = Llama(
-        model_path=model_name,
-        n_ctx=2000,
-        n_parts=1,
-        #n_batch=100,
-        logits_all=True,
-        #n_threads=12,
-        verbose=True,
-        n_gpu_layers=30,
-        n_gqa=8 #must be set for 70b models
-    )
-
     # Generate the response
     #system_tokens = get_system_tokens(model)
     #tokens = system_tokens
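
Net effect of the commit: the Llama instance is built once at module load (with n_gpu_layers raised from 30 to 35) instead of once per request inside generate_search_request() and generate_response(), and each handler now calls model.reset() to clear leftover state before generating. Below is a minimal sketch of the resulting structure, assuming Flask and llama-cpp-python (which the diff's request.get_json() and Llama(...) calls imply); the repo id, route name, and request-payload keys are placeholders, not taken from the diff.

from flask import Flask, request, jsonify
from huggingface_hub import snapshot_download
from llama_cpp import Llama

app = Flask(__name__)

repo_name = "some-user/some-gguf-repo"  # placeholder: repo_name is defined earlier in app.py
model_name = "ggml-model-q4_1.gguf"

snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)

# Built once per process. Loading a quantized 70B model takes far longer than
# any single generation, so the per-request construction this commit removes
# dominated request latency. (The diff also passes n_parts, logits_all, and
# n_gqa=8, the latter required for 70B models in llama-cpp-python builds of
# that era; omitted here for brevity.)
model = Llama(
    model_path=model_name,
    n_ctx=2000,
    n_gpu_layers=35,
    verbose=True,
)

@app.route("/generate", methods=["POST"])  # placeholder route
def generate_response():
    model.reset()  # drop KV-cache state left over from the previous request
    data = request.get_json()
    output = model(
        data["inputs"],
        max_tokens=data.get("parameters", {}).get("max_new_tokens", 256),
    )
    return jsonify(output["choices"][0]["text"])

Note that a single Llama instance shared across requests is not thread-safe; this layout assumes the server handles generations one at a time, consistent with the app's global stop_generation flag.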