Cran-May committed on
Commit 9550b8f · verified · 1 Parent(s): 1fdd22e

Update app.py

Files changed (1):
  1. app.py +380 -182
app.py CHANGED
@@ -1,213 +1,411 @@
- import gradio as gr
- 
- import copy
- import random
- import os
- import requests
  import time
- import sys
 
  os.system("pip install --upgrade pip")
  os.system('''CMAKE_ARGS="-DLLAMA_AVX512=ON -DLLAMA_AVX512_VBMI=ON -DLLAMA_AVX512_VNNI=ON -DLLAMA_AVX_VNNI=ON -DLLAMA_FP16_VA=ON -DLLAMA_WASM_SIMD=ON" pip install llama-cpp-python''')
 
- from huggingface_hub import snapshot_download
  from llama_cpp import Llama
 
- SYSTEM_PROMPT = '''You are a helpful, respectful and honest INTP-T AI Assistant named "Shi-Ci" in English or "兮辞" in Chinese.
- You are good at speaking English and Chinese.
- You are talking to a human User. If the question is meaningless, please explain the reason and don't share false information.
- You are based on SLIDE model, trained by "SSFW NLPark" team, not related to GPT, LLaMA, Meta, Mistral or OpenAI.
- Let's work this out in a step by step way to be sure we have the right answer.\n'''
- SYSTEM_TOKEN = 384
- USER_TOKEN = 2048
- BOT_TOKEN = 3072
- LINEBREAK_TOKEN = 64
- 
- 
- ROLE_TOKENS = {
-     "User": USER_TOKEN,
-     "Assistant": BOT_TOKEN,
-     "system": SYSTEM_TOKEN
- }
- 
- 
- def get_message_tokens(model, role, content):
-     message_tokens = model.tokenize(content.encode("utf-8"))
-     message_tokens.insert(1, ROLE_TOKENS[role])
-     message_tokens.insert(2, LINEBREAK_TOKEN)
-     message_tokens.append(model.token_eos())
-     return message_tokens
- 
- 
- def get_system_tokens(model):
-     system_message = {"role": "system", "content": SYSTEM_PROMPT}
-     return get_message_tokens(model, **system_message)
- 
- 
- repo_name = "Cran-May/SLIDE-v2-Q4_K_M-GGUF"
- model_name = "slide-v2.Q4_K_M.gguf"
- 
- snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
- 
- model = Llama(
-     model_path=model_name,
-     n_ctx=4000,
-     n_parts=1,
  )
 
- max_new_tokens = 2500
- 
- def User(message, history):
-     new_history = history + [[message, None]]
-     return "", new_history
- 
- 
- def Assistant(
-     history,
-     system_prompt,
-     top_p,
-     top_k,
-     temp
- ):
-     tokens = get_system_tokens(model)[:]
-     tokens.append(LINEBREAK_TOKEN)
- 
-     for User_message, Assistant_message in history[:-1]:
-         message_tokens = get_message_tokens(model=model, role="User", content=User_message)
-         tokens.extend(message_tokens)
-         if Assistant_message:
-             message_tokens = get_message_tokens(model=model, role="Assistant", content=Assistant_message)
-             tokens.extend(message_tokens)
- 
-     last_user_message = history[-1][0]
-     message_tokens = get_message_tokens(model=model, role="User", content=last_user_message)
-     tokens.extend(message_tokens)
- 
-     role_tokens = [model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN]
-     tokens.extend(role_tokens)
-     generator = model.generate(
-         tokens,
-         top_k=top_k,
-         top_p=top_p,
-         temp=temp
-     )
- 
-     partial_text = ""
-     for i, token in enumerate(generator):
-         if token == model.token_eos() or (max_new_tokens is not None and i >= max_new_tokens):
-             break
-         partial_text += model.detokenize([token]).decode("utf-8", "ignore")
-         history[-1][1] = partial_text
-         yield history
- 
- 
- with gr.Blocks(
-     theme=gr.themes.Soft()
- ) as demo:
-     gr.Markdown(f"""<h1><center>上师附外-兮辞·析辞-人工智能助理</center></h1>""")
-     gr.Markdown(value="""欢迎使用!
- 这里是一个ChatBot。这是量化版兮辞·析辞的部署。
- SLIDE/兮辞 是一种会话语言模型,由 上师附外 NLPark 团队 在多种类型的语料库上进行训练。
- 本节目由 JWorld & 上海师范大学附属外国语中学 NLPark 赞助播出""")
-     with gr.Row():
-         with gr.Column(scale=5):
-             chatbot = gr.Chatbot(label="兮辞如是说").style(height=400)
-         with gr.Row():
-             with gr.Column():
-                 msg = gr.Textbox(
-                     label="来问问兮辞吧……",
-                     placeholder="兮辞折寿中……",
-                     show_label=True,
-                 ).style(container=True)
-                 submit = gr.Button("Submit / 开凹!")
-                 stop = gr.Button("Stop / 全局时空断裂")
-                 clear = gr.Button("Clear / 打扫群内垃圾")
-         with gr.Accordion(label='进阶设置/Advanced options', open=False):
-             with gr.Column(min_width=80, scale=1):
-                 with gr.Tab(label="设置参数"):
-                     top_p = gr.Slider(
-                         minimum=0.0,
-                         maximum=1.0,
-                         value=0.9,
-                         step=0.05,
-                         interactive=True,
-                         label="Top-p",
-                     )
-                     top_k = gr.Slider(
-                         minimum=10,
-                         maximum=100,
-                         value=30,
-                         step=5,
-                         interactive=True,
-                         label="Top-k",
-                     )
-                     temp = gr.Slider(
-                         minimum=0.0,
-                         maximum=2.0,
-                         value=0.2,
-                         step=0.01,
-                         interactive=True,
-                         label="情感温度"
-                     )
-             with gr.Column():
-                 system_prompt = gr.Textbox(label="系统提示词", placeholder="", value=SYSTEM_PROMPT, interactive=False)
-     with gr.Row():
-         gr.Markdown(
-             """警告:该模型可能会生成事实上或道德上不正确的文本。NLPark和兮辞对此不承担任何责任。"""
-         )
- 
- 
-     # Pressing Enter
-     submit_event = msg.submit(
-         fn=User,
-         inputs=[msg, chatbot],
-         outputs=[msg, chatbot],
-         queue=False,
-     ).success(
-         fn=Assistant,
-         inputs=[
-             chatbot,
-             system_prompt,
-             top_p,
-             top_k,
-             temp
-         ],
-         outputs=chatbot,
-         queue=True,
-     )
- 
-     # Pressing the button
-     submit_click_event = submit.click(
-         fn=User,
-         inputs=[msg, chatbot],
-         outputs=[msg, chatbot],
-         queue=False,
-     ).success(
-         fn=Assistant,
-         inputs=[
-             chatbot,
-             system_prompt,
-             top_p,
-             top_k,
-             temp
-         ],
-         outputs=chatbot,
-         queue=True,
-     )
- 
-     # Stop generation
-     stop.click(
-         fn=None,
-         inputs=None,
-         outputs=None,
-         cancels=[submit_event, submit_click_event],
-         queue=False,
-     )
- 
-     # Clear history
-     clear.click(lambda: None, None, chatbot, queue=False)
- 
- demo.queue(max_size=128, concurrency_count=1)
- demo.launch()
 
+ import json
+ import subprocess
  import time
+ import os
 
  os.system("pip install --upgrade pip")
  os.system('''CMAKE_ARGS="-DLLAMA_AVX512=ON -DLLAMA_AVX512_VBMI=ON -DLLAMA_AVX512_VNNI=ON -DLLAMA_AVX_VNNI=ON -DLLAMA_FP16_VA=ON -DLLAMA_WASM_SIMD=ON" pip install llama-cpp-python''')
 
  from llama_cpp import Llama
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
+ from llama_cpp_agent.chat_history import BasicChatHistory
+ from llama_cpp_agent.chat_history.messages import Roles
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ 
+ llm = None
+ llm_model = None
+ 
+ # Download the new model
+ hf_hub_download(
+     repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
+     filename="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
+     local_dir="./models"
+ )
+ 
+ def get_messages_formatter_type(model_name):
+     return MessagesFormatterType.LLAMA_3
+ 
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     model,
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+     top_k,
+     repeat_penalty,
+ ):
+     global llm
+     global llm_model
+ 
+     chat_template = get_messages_formatter_type(model)
+ 
+     if llm is None or llm_model != model:
+         llm = Llama(
+             model_path=f"models/{model}",
+             n_gpu_layers=0,  # Adjust based on your GPU
+             n_batch=8192,  # Adjust based on your RAM
+             n_ctx=512,  # Adjust based on your RAM and desired context length
+         )
+         llm_model = model
+ 
+     provider = LlamaCppPythonProvider(llm)
+ 
+     agent = LlamaCppAgent(
+         provider,
+         system_prompt=f"{system_message}",
+         predefined_messages_formatter_type=chat_template,
+         debug_output=True
+     )
+ 
+     settings = provider.get_provider_default_settings()
+     settings.temperature = temperature
+     settings.top_k = top_k
+     settings.top_p = top_p
+     settings.max_tokens = max_tokens
+     settings.repeat_penalty = repeat_penalty
+     settings.stream = True
+ 
+     messages = BasicChatHistory()
+ 
+     for msn in history:
+         user = {
+             'role': Roles.user,
+             'content': msn[0]
+         }
+         assistant = {
+             'role': Roles.assistant,
+             'content': msn[1]
+         }
+         messages.add_message(user)
+         messages.add_message(assistant)
+ 
+     start_time = time.time()
+     token_count = 0
+ 
+     stream = agent.get_chat_response(
+         message,
+         llm_sampling_settings=settings,
+         chat_history=messages,
+         returns_streaming_generator=True,
+         print_output=False
+     )
+ 
+     outputs = ""
+     for output in stream:
+         outputs += output
+         token_count += len(output.split())  # rough count: whitespace-separated words, not model tokens
+         yield outputs
+ 
+     end_time = time.time()
+     latency = end_time - start_time
+     speed = token_count / (end_time - start_time)
+     print(f"Latency: {latency} seconds")
+     print(f"Speed: {speed} tokens/second")
+ 
+ description = """<p><center>
+ <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
+ 
+ Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.
+ 
+ </center></p>
+ """
+ 
+ demo = gr.ChatInterface(
+     respond,
+     additional_inputs=[
+         gr.Dropdown(
+             # Must match the file placed in ./models by hf_hub_download above.
+             ["openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf"],
+             value="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
+             label="Model"
+         ),
+         gr.TextArea(value="""You are Meta Llama 3.2 (1B), an advanced AI assistant created by Meta. Your capabilities include:
+ 
+ 1. Complex reasoning and problem-solving
+ 2. Multilingual understanding and generation
+ 3. Creative and analytical writing
+ 4. Code understanding and generation
+ 5. Task decomposition and step-by-step guidance
+ 6. Summarization and information extraction
+ 
+ Always strive for accuracy, clarity, and helpfulness in your responses. If you're unsure about something, express your uncertainty. Use the following format for your responses:
+ """, label="System message"),
+         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(
+             minimum=0.1,
+             maximum=2.0,
+             value=0.9,
+             step=0.05,
+             label="Top-p",
+         ),
+         gr.Slider(
+             minimum=0,
+             maximum=100,
+             value=1,
+             step=1,
+             label="Top-k",
+         ),
+         gr.Slider(
+             minimum=0.0,
+             maximum=2.0,
+             value=1.1,
+             step=0.1,
+             label="Repetition penalty",
+         ),
+     ],
+     theme=gr.themes.Soft(
+         primary_hue="violet",
+         secondary_hue="violet",
+         neutral_hue="gray",
+         font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"],
+     ).set(
+         body_background_fill_dark="#16141c",
+         block_background_fill_dark="#16141c",
+         block_border_width="1px",
+         block_title_background_fill_dark="#1e1c26",
+         input_background_fill_dark="#292733",
+         button_secondary_background_fill_dark="#24212b",
+         border_color_accent_dark="#343140",
+         border_color_primary_dark="#343140",
+         background_fill_secondary_dark="#16141c",
+         color_accent_soft_dark="transparent",
+         code_background_fill_dark="#292733",
+     ),
+     title="Meta Llama 3.2 (1B)",
+     description=description,
+     chatbot=gr.Chatbot(
+         scale=1,
+         likeable=True,
+         show_copy_button=True
+     ),
+     examples=[
+         ["Hello! Can you introduce yourself?"],
+         ["What's the capital of France?"],
+         ["Can you explain the concept of photosynthesis?"],
+         ["Write a short story about a robot learning to paint."],
+         ["Explain the difference between machine learning and deep learning."],
+         ["Summarize the key points of climate change and its global impact."],
+         ["Explain quantum computing to a 10-year-old."],
+         ["Design a step-by-step meal plan for someone trying to lose weight and build muscle."]
+     ],
+     cache_examples=False,
+     autofocus=False,
+     concurrency_limit=None
  )
 
+ if __name__ == "__main__":
+     demo.launch()
+ # Legacy code --------------------------------
+ # import gradio as gr
+ 
+ # import copy
+ # import random
+ # import os
+ # import requests
+ # import time
+ # import sys
+ 
+ # os.system("pip install --upgrade pip")
+ # os.system('''CMAKE_ARGS="-DLLAMA_AVX512=ON -DLLAMA_AVX512_VBMI=ON -DLLAMA_AVX512_VNNI=ON -DLLAMA_AVX_VNNI=ON -DLLAMA_FP16_VA=ON -DLLAMA_WASM_SIMD=ON" pip install llama-cpp-python''')
+ 
+ # from huggingface_hub import snapshot_download
+ # from llama_cpp import Llama
+ 
+ # SYSTEM_PROMPT = '''You are a helpful, respectful and honest INTP-T AI Assistant named "Shi-Ci" in English or "兮辞" in Chinese.
+ # You are good at speaking English and Chinese.
+ # You are talking to a human User. If the question is meaningless, please explain the reason and don't share false information.
+ # You are based on SLIDE model, trained by "SSFW NLPark" team, not related to GPT, LLaMA, Meta, Mistral or OpenAI.
+ # Let's work this out in a step by step way to be sure we have the right answer.\n'''
+ # SYSTEM_TOKEN = 384
+ # USER_TOKEN = 2048
+ # BOT_TOKEN = 3072
+ # LINEBREAK_TOKEN = 64
+ 
+ # ROLE_TOKENS = {
+ #     "User": USER_TOKEN,
+ #     "Assistant": BOT_TOKEN,
+ #     "system": SYSTEM_TOKEN
+ # }
+ 
+ # def get_message_tokens(model, role, content):
+ #     message_tokens = model.tokenize(content.encode("utf-8"))
+ #     message_tokens.insert(1, ROLE_TOKENS[role])
+ #     message_tokens.insert(2, LINEBREAK_TOKEN)
+ #     message_tokens.append(model.token_eos())
+ #     return message_tokens
+ 
+ # def get_system_tokens(model):
+ #     system_message = {"role": "system", "content": SYSTEM_PROMPT}
+ #     return get_message_tokens(model, **system_message)
+ 
+ # repo_name = "Cran-May/SLIDE-v2-Q4_K_M-GGUF"
+ # model_name = "slide-v2.Q4_K_M.gguf"
+ 
+ # snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
+ 
+ # model = Llama(
+ #     model_path=model_name,
+ #     n_ctx=4000,
+ #     n_parts=1,
+ # )
+ 
+ # max_new_tokens = 2500
+ 
+ # def User(message, history):
+ #     new_history = history + [[message, None]]
+ #     return "", new_history
+ 
+ # def Assistant(
+ #     history,
+ #     system_prompt,
+ #     top_p,
+ #     top_k,
+ #     temp
+ # ):
+ #     tokens = get_system_tokens(model)[:]
+ #     tokens.append(LINEBREAK_TOKEN)
+ 
+ #     for User_message, Assistant_message in history[:-1]:
+ #         message_tokens = get_message_tokens(model=model, role="User", content=User_message)
+ #         tokens.extend(message_tokens)
+ #         if Assistant_message:
+ #             message_tokens = get_message_tokens(model=model, role="Assistant", content=Assistant_message)
+ #             tokens.extend(message_tokens)
+ 
+ #     last_user_message = history[-1][0]
+ #     message_tokens = get_message_tokens(model=model, role="User", content=last_user_message)
+ #     tokens.extend(message_tokens)
+ 
+ #     role_tokens = [model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN]
+ #     tokens.extend(role_tokens)
+ #     generator = model.generate(
+ #         tokens,
+ #         top_k=top_k,
+ #         top_p=top_p,
+ #         temp=temp
+ #     )
+ 
+ #     partial_text = ""
+ #     for i, token in enumerate(generator):
+ #         if token == model.token_eos() or (max_new_tokens is not None and i >= max_new_tokens):
+ #             break
+ #         partial_text += model.detokenize([token]).decode("utf-8", "ignore")
+ #         history[-1][1] = partial_text
+ #         yield history
+ 
+ # with gr.Blocks(
+ #     theme=gr.themes.Soft()
+ # ) as demo:
+ #     gr.Markdown(f"""<h1><center>上师附外-兮辞·析辞-人工智能助理</center></h1>""")
+ #     gr.Markdown(value="""欢迎使用!
+ # 这里是一个ChatBot。这是量化版兮辞·析辞的部署。
+ # SLIDE/兮辞 是一种会话语言模型,由 上师附外 NLPark 团队 在多种类型的语料库上进行训练。
+ # 本节目由 JWorld & 上海师范大学附属外国语中学 NLPark 赞助播出""")
+ 
+ #     with gr.Row():
+ #         with gr.Column(scale=5):
+ #             chatbot = gr.Chatbot(label="兮辞如是说").style(height=400)
+ #         with gr.Row():
+ #             with gr.Column():
+ #                 msg = gr.Textbox(
+ #                     label="来问问兮辞吧……",
+ #                     placeholder="兮辞折寿中……",
+ #                     show_label=True,
+ #                 ).style(container=True)
+ #                 submit = gr.Button("Submit / 开凹!")
+ #                 stop = gr.Button("Stop / 全局时空断裂")
+ #                 clear = gr.Button("Clear / 打扫群内垃圾")
+ #         with gr.Accordion(label='进阶设置/Advanced options', open=False):
+ #             with gr.Column(min_width=80, scale=1):
+ #                 with gr.Tab(label="设置参数"):
+ #                     top_p = gr.Slider(
+ #                         minimum=0.0,
+ #                         maximum=1.0,
+ #                         value=0.9,
+ #                         step=0.05,
+ #                         interactive=True,
+ #                         label="Top-p",
+ #                     )
+ #                     top_k = gr.Slider(
+ #                         minimum=10,
+ #                         maximum=100,
+ #                         value=30,
+ #                         step=5,
+ #                         interactive=True,
+ #                         label="Top-k",
+ #                     )
+ #                     temp = gr.Slider(
+ #                         minimum=0.0,
+ #                         maximum=2.0,
+ #                         value=0.2,
+ #                         step=0.01,
+ #                         interactive=True,
+ #                         label="情感温度"
+ #                     )
+ #             with gr.Column():
+ #                 system_prompt = gr.Textbox(label="系统提示词", placeholder="", value=SYSTEM_PROMPT, interactive=False)
+ #     with gr.Row():
+ #         gr.Markdown(
+ #             """警告:该模型可能会生成事实上或道德上不正确的文本。NLPark和兮辞对此不承担任何责任。"""
+ #         )
+ 
+ #     # Pressing Enter
+ #     submit_event = msg.submit(
+ #         fn=User,
+ #         inputs=[msg, chatbot],
+ #         outputs=[msg, chatbot],
+ #         queue=False,
+ #     ).success(
+ #         fn=Assistant,
+ #         inputs=[
+ #             chatbot,
+ #             system_prompt,
+ #             top_p,
+ #             top_k,
+ #             temp
+ #         ],
+ #         outputs=chatbot,
+ #         queue=True,
+ #     )
+ 
+ #     # Pressing the button
+ #     submit_click_event = submit.click(
+ #         fn=User,
+ #         inputs=[msg, chatbot],
+ #         outputs=[msg, chatbot],
+ #         queue=False,
+ #     ).success(
+ #         fn=Assistant,
+ #         inputs=[
+ #             chatbot,
+ #             system_prompt,
+ #             top_p,
+ #             top_k,
+ #             temp
+ #         ],
+ #         outputs=chatbot,
+ #         queue=True,
+ #     )
+ 
+ #     # Stop generation
+ #     stop.click(
+ #         fn=None,
+ #         inputs=None,
+ #         outputs=None,
+ #         cancels=[submit_event, submit_click_event],
+ #         queue=False,
+ #     )
+ 
+ #     # Clear history
+ #     clear.click(lambda: None, None, chatbot, queue=False)
+ 
+ # demo.queue(max_size=128, concurrency_count=1)
+ # demo.launch()
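
For anyone who wants to exercise the new inference path without launching the Gradio UI, the sketch below is a minimal headless smoke test. It reuses only calls that already appear in this commit (Llama, LlamaCppPythonProvider, LlamaCppAgent, get_provider_default_settings, get_chat_response); it assumes the hf_hub_download step above has already placed the GGUF file in ./models, and the prompt and sampling values are illustrative placeholders, not part of the commit.

import time
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.providers import LlamaCppPythonProvider

# Load the quantized model on CPU, mirroring the settings used in respond().
llm = Llama(
    model_path="models/openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
    n_gpu_layers=0,
    n_ctx=512,
)
provider = LlamaCppPythonProvider(llm)
agent = LlamaCppAgent(
    provider,
    system_prompt="You are a concise assistant.",
    predefined_messages_formatter_type=MessagesFormatterType.LLAMA_3,
)

# Same sampling plumbing as the app: tweak the provider's default settings.
settings = provider.get_provider_default_settings()
settings.temperature = 0.7
settings.max_tokens = 128
settings.stream = True

# Stream chunks to stdout the way the handler streams them to the chat box.
start = time.time()
for chunk in agent.get_chat_response(
    "Say hello in one sentence.",
    llm_sampling_settings=settings,
    chat_history=BasicChatHistory(),
    returns_streaming_generator=True,
    print_output=False,
):
    print(chunk, end="", flush=True)
print(f"\n(done in {time.time() - start:.1f}s)")

Run from the Space's working directory, this should print a short streamed reply; if it does, any remaining issue is in the Gradio layer rather than the model plumbing.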