muryshev committed on
Commit cb77897
1 Parent(s): 67ad4fc

Update app.py

Files changed (1)
  1. app.py +58 -14
app.py CHANGED
@@ -3,6 +3,8 @@ import logging
 from llama_cpp import Llama
 import threading
 from huggingface_hub import snapshot_download
+import gc
+import os.path
 
 SYSTEM_PROMPT = "Ты — русскоязычный автоматический ассистент. Ты максимально точно и отвечаешь на запросы пользователя, используя русский язык."
 SYSTEM_TOKEN = 1788
@@ -17,6 +19,7 @@ ROLE_TOKENS = {
 }
 
 CONTEXT_SIZE = 4000
+ENABLE_GPU = False
 
 # Create a lock object
 lock = threading.Lock()
@@ -35,22 +38,52 @@ app.logger.setLevel(logging.DEBUG) # Set the desired logging level
 repo_name = "IlyaGusev/saiga2_70b_gguf"
 model_name = "ggml-model-q4_1.gguf"
 
+#repo_name = "IlyaGusev/saiga2_7b_gguf"
+#model_name = "model-q4_K.gguf"
+local_dir = '.'
+
+if os.path.isdir('/data'):
+    app.logger.info('Persistent storage enabled')
 
+model = None
 
-snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
+model_path = snapshot_download(repo_id=repo_name, allow_patterns=model_name) + '/' + model_name
+app.logger.info('Model path: ' + model_path)
 
-model = Llama(
-    model_path=model_name,
-    n_ctx=CONTEXT_SIZE,
-    n_parts=1,
-    #n_batch=100,
-    logits_all=True,
-    n_threads=12,
-    verbose=True,
-    n_gpu_layers=35,
-    n_gqa=8 #must be set for 70b models
-)
+def init_model(context_size, enable_gpu=False, gpu_layer_number=35):
+    global model
+
+    if model is not None:
+        del model
+        gc.collect()
+
+    if enable_gpu:
+        model = Llama(
+            model_path=model_path,
+            n_ctx=context_size,
+            n_parts=1,
+            #n_batch=100,
+            logits_all=True,
+            #n_threads=12,
+            verbose=True,
+            n_gpu_layers=gpu_layer_number,
+            n_gqa=8 #must be set for 70b models
+        )
+        return model
+    else:
+        model = Llama(
+            model_path=model_path,
+            n_ctx=context_size,
+            n_parts=1,
+            #n_batch=100,
+            logits_all=True,
+            #n_threads=12,
+            verbose=True,
+            n_gqa=8 #must be set for 70b models
+        )
+        return model
 
+init_model(CONTEXT_SIZE, ENABLE_GPU, 35)
 
 def get_message_tokens(model, role, content):
     message_tokens = model.tokenize(content.encode("utf-8"))
@@ -73,10 +106,10 @@ def get_system_tokens_for_preprompt(model, preprompt):
     }
     return get_message_tokens(model, **system_message)
 
-app.logger.info('Evaluating system tokens start')
+#app.logger.info('Evaluating system tokens start')
 #system_tokens = get_system_tokens(model)
 #model.eval(system_tokens)
-app.logger.info('Evaluating system tokens end')
+#app.logger.info('Evaluating system tokens end')
 
 stop_generation = False
 
@@ -96,8 +129,19 @@ def generate_tokens(model, generator):
             yield token_str
     except Exception as e:
        app.logger.info('generator exception')
+       app.logger.info(e)
        yield b'' # End of chunk
 
+@app.route('/change_context_size', methods=['GET'])
+def handler_change_context_size():
+    global stop_generation, model
+    stop_generation = True
+
+    new_size = int(request.args.get('size', CONTEXT_SIZE))
+    init_model(new_size, enable_gpu=ENABLE_GPU)
+
+    return Response('Size changed', content_type='text/plain')
+
 @app.route('/stop_generation', methods=['GET'])
 def handler_stop_generation():
     global stop_generation
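
For reference, a minimal sketch of how the new /change_context_size route could be exercised once the app is running. The base URL (http://localhost:7860) and the use of the requests library are illustrative assumptions, not part of this commit; the route name, the "size" query parameter, and the "Size changed" response come from the diff above.

import requests

BASE_URL = "http://localhost:7860"  # assumed host/port, not defined in this commit

# Ask the server to rebuild the model with a smaller context window.
# Per the diff, the handler sets stop_generation = True and then calls init_model().
resp = requests.get(f"{BASE_URL}/change_context_size", params={"size": 2000})
print(resp.status_code, resp.text)  # expected: 200 and 'Size changed'

# The pre-existing route for interrupting a running generation.
requests.get(f"{BASE_URL}/stop_generation")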