Upload 3 files
- chat_template.py +36 -0
- gradio_streamingllm.py +4 -12
- llama_cpp_python_streamingllm.py +6 -26
chat_template.py
CHANGED
@@ -3,6 +3,7 @@ import copy
 
 class ChatTemplate:
     cache = {}
+    roles = set()
 
     def __init__(self, model, im_start=r'<|im_start|>', im_end=r'<|im_end|>', nl='\n'):
         self.model = model
@@ -31,7 +32,42 @@ class ChatTemplate:
         self.cache[key] = copy.deepcopy(value)  # make a deep copy
         return value
 
+    def _add_role(self, _role):
+        if _role:
+            self.roles.add('\n' + _role)
+
+    def eos_in_role(self, history: str, t_bot):
+        if not (history.endswith('\n') or history.endswith('\r')):
+            return 0
+        tmp = history.rstrip()
+        for _role in self.roles:
+            if tmp.endswith(_role):
+                n = len(t_bot)
+                for i in range(1, n):  # find how many trailing tokens to discard
+                    tmp = self.model.str_detokenize(t_bot[n - i:])
+                    if tmp.rstrip().endswith(_role):
+                        print('eos_in_role', t_bot[n - i:], repr(tmp))
+                        return i
+                print('eos_in_role missing')
+                break
+        return 0
+
+    def eos_in_nlnl(self, history: str, t_bot):
+        if not (history.endswith('\n\n') or history.endswith('\n\r\n')):
+            return 0
+        n = len(t_bot)
+        for i in range(1, n):  # find how many trailing tokens to discard
+            tmp = self.model.str_detokenize(t_bot[n - i:])
+            if tmp.endswith('\n\n') or tmp.endswith('\n\r\n'):
+                if tmp.startswith(']'):  # avoid a false positive
+                    return 0
+                print('eos_in_nlnl', t_bot[n - i:], repr(tmp))
+                return i
+        print('eos_in_nlnl missing')
+        return 0
+
     def __call__(self, _role, prompt=None):
+        self._add_role(_role)
         if prompt is None:
             return self._get(_role)
         # print(_role, prompt, self.cache)
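Together, roles, _add_role and the two eos_in_* helpers let the caller detect when a streamed reply has run past the end of its turn, either by re-emitting a role header or by producing a blank line, and learn how many trailing tokens to throw away. A minimal sketch of how they might be wired into a generation loop follows; the loop, generate_stream, model and chat_template names are stand-ins for illustration, while str_detokenize, eos_in_role, eos_in_nlnl and venv_pop_token come from the files changed in this commit.

# Hypothetical wiring (not part of the commit): generate_stream(), model and
# chat_template are assumed names; only str_detokenize, eos_in_role,
# eos_in_nlnl and venv_pop_token are taken from the changed files.
t_bot = []      # tokens of the bot's current turn
history = ''    # decoded conversation text so far

for token in generate_stream():              # assumed token generator
    t_bot.append(token)
    history += model.str_detokenize([token])
    # How many trailing tokens form a stray role header or an end-of-turn
    # blank line? 0 means keep generating.
    n_drop = (chat_template.eos_in_role(history, t_bot)
              or chat_template.eos_in_nlnl(history, t_bot))
    if n_drop:
        model.venv_pop_token(n_drop)         # roll the KV cache back n_drop tokens
        t_bot = t_bot[:-n_drop]
        break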
gradio_streamingllm.py
CHANGED
@@ -28,6 +28,9 @@ from mods.btn_reset import init as btn_reset_init
 # ========== Chat template, defaults to chatml ==========
 from chat_template import ChatTemplate
 
+# ========== Load character card cache ==========
+from mods.load_cache import init as load_cache_init
+
 # ========== Global lock: only one session at a time ==========
 cfg['session_lock'] = threading.Lock()
 cfg['session_active'] = False
@@ -84,8 +87,6 @@ with gr.Blocks() as role:
     cfg['role_chat_style'] = gr.Textbox(lines=10, label="回复示例", **cfg['role_chat_style'])
 
     # ========== Load character card cache ==========
-    from mods.load_cache import init as load_cache_init
-
     text_display_init(cfg)
     load_cache_init(cfg)
 
@@ -99,15 +100,6 @@
     cfg['vo'] = gr.Textbox(label='VO', show_copy_button=True, elem_id="VO-area")
     cfg['s_info'] = gr.Textbox(value=cfg['model'].venv_info, max_lines=1, label='info', interactive=False)
     cfg['msg'] = gr.Textbox(label='Prompt', lines=2, max_lines=2, elem_id='prompt', autofocus=True, **cfg['msg'])
-    with gr.Row():
-        cfg['btn_vo'] = gr.Button("旁白")
-        cfg['btn_rag'] = gr.Button("RAG")
-        cfg['btn_retry'] = gr.Button("Retry")
-        cfg['btn_com1'] = gr.Button("自定义1")
-        cfg['btn_reset'] = gr.Button("Reset")
-        cfg['btn_debug'] = gr.Button("Debug")
-        cfg['btn_submit'] = gr.Button("Submit")
-        cfg['btn_suggest'] = gr.Button("建议")
 
     cfg['gr'] = gr
     btn_com_init(cfg)
@@ -164,4 +156,4 @@ demo = gr.TabbedInterface([chatting, setting, role],
                           ["聊天", "设置", '角色'],
                           css=custom_css)
 gr.close_all()
-demo.queue(api_open=False, max_size=1).launch(share=False)
+demo.queue(api_open=False, max_size=1).launch(share=False, show_error=True, show_api=False)
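The launch change is configuration only: show_error=True makes exceptions raised inside event handlers visible in the browser instead of only in the server log, and show_api=False hides the "Use via API" footer link, complementing the existing api_open=False on the queue. A standalone sketch (not from this repo) showing the same flags on a trivial app:

import gradio as gr

def boom(text):
    # With show_error=True this exception is surfaced to the user in the UI.
    raise ValueError("handler failed: " + text)

with gr.Blocks() as demo:
    box = gr.Textbox(label="input")
    btn = gr.Button("go")
    btn.click(boom, box, box)

demo.queue(api_open=False, max_size=1)
demo.launch(share=False, show_error=True, show_api=False)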
llama_cpp_python_streamingllm.py
CHANGED
@@ -6,35 +6,13 @@ from ctypes import POINTER
 from KMP_list import kmp_search, compute_lps_array
 
 
-def is_UTF8_incomplete(all_text):
-    multibyte_fix = 0
-    if len(all_text) < 3:
-        all_text = b'000' + all_text
-    for k, char in enumerate(all_text[-3:]):
-        k = 3 - k
-        for num, pattern in [(2, 192), (3, 224), (4, 240)]:
-            # Bitwise AND check
-            if num > k and pattern & char == pattern:
-                multibyte_fix = num - k
-    return multibyte_fix
-
-
-def get_complete_UTF8(all_text):
-    multibyte_fix = is_UTF8_incomplete(all_text)
-    if multibyte_fix > 0:
-        multibyte_fix = multibyte_fix - 3
-        return all_text[:multibyte_fix].decode("utf-8")
-    else:
-        return all_text.decode("utf-8")
-
-
 class StreamingLLM(Llama):
     def __init__(self, model_path: str, **kwargs):
         super().__init__(model_path, **kwargs)
         self._venv_init()
 
     def str_detokenize(self, tokens) -> str:
-        return
+        return self.detokenize(tokens).decode('utf-8', errors='ignore')
 
     def kv_cache_seq_trim(self):
         self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
@@ -103,9 +81,9 @@ class StreamingLLM(Llama):
            break
        return True

-    def venv_pop_token(self):
-        self.n_tokens -= 1
-        self.venv[-1] -= 1
+    def venv_pop_token(self, n=1):
+        self.n_tokens -= n
+        self.venv[-1] -= n
         self.kv_cache_seq_trim()
 
     @property
@@ -113,6 +91,8 @@ class StreamingLLM(Llama):
         return str((self.n_tokens, self.venv, self.venv_idx_map))
 
     def kv_cache_seq_ltrim(self, n_keep, n_discard=256, n_past=-1, im_start=None):
+        if n_keep < 0:
+            return
         if n_past < 0:
             n_past = self.n_tokens
         if im_start is not None:  # [<|im_start|>, name, nl]
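The deleted is_UTF8_incomplete / get_complete_UTF8 helpers guarded against decoding a byte string that ends in the middle of a multibyte UTF-8 character. The new one-line str_detokenize relies on Python's built-in error handling instead: decoding with errors='ignore' drops a trailing partial character rather than raising. A minimal standalone illustration (plain Python, not from the repo):

# '你好' is two 3-byte UTF-8 characters; cutting after 4 bytes splits the second one.
data = '你好'.encode('utf-8')                     # b'\xe4\xbd\xa0\xe5\xa5\xbd'
partial = data[:4]                               # ends mid-character
print(partial.decode('utf-8', errors='ignore'))  # -> '你'  (partial bytes dropped)
# without errors='ignore' this decode would raise UnicodeDecodeError

The trade-off is that the ignored bytes are not carried over to a later call, so a character split across two str_detokenize calls is silently lost; the removed helpers existed to detect exactly that situation.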