Shi-Ci-app

Runtime error

File size: 13,725 Bytes

import json
import subprocess
import time
import os

# Start-up bootstrap: upgrade pip, then (re)install llama-cpp-python with the
# CMake flags below. This must run before the `llama_cpp` import that follows.
# NOTE(review): the return codes of os.system are ignored, so a failed install
# is not reported here and would presumably only surface at import time.
os.system("pip install --upgrade pip")
# NOTE(review): the AVX512 flags assume the host CPU supports those
# instructions, and -DLLAMA_WASM_SIMD presumably targets WebAssembly builds --
# confirm that each flag is intended for this deployment.
os.system('''CMAKE_ARGS="-DLLAMA_AVX512=ON -DLLAMA_AVX512_VBMI=ON -DLLAMA_AVX512_VNNI=ON -DLLAMA_AVX_VNNI=ON -DLLAMA_FP16_VA=ON -DLLAMA_WASM_SIMD=ON" pip install llama-cpp-python''')

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download

# Module-level cache for the loaded model. respond() fills these in on first
# use: `llm` holds the Llama instance, `llm_model` the file name it came from.
llm_model = None
llm = None

# Fetch the GGUF weights into ./models when the module is loaded.
hf_hub_download(
    local_dir="./models",
    filename="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
    repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
)

def get_messages_formatter_type(model_name):
    """Return the chat-prompt formatter to use for *model_name*.

    The argument is currently ignored: the Llama 3 formatter is returned for
    every model name.
    """
    formatter = MessagesFormatterType.LLAMA_3
    return formatter

def chat_fn(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
    """Stream a reply to *message* in the shape the Gradio UI expects.

    Args:
        message: The text the user just submitted.
        history: Previous turns as a list of ``(user, assistant)`` pairs, or
            ``None``/empty for a fresh conversation.
        model: File name of the GGUF model to use (forwarded to ``respond``).
        system_message: System prompt forwarded to ``respond``.
        max_tokens, temperature, top_p, top_k, repeat_penalty: Sampling
            settings forwarded to ``respond`` unchanged.

    Yields:
        ``(chatbot_messages, turns)`` for every streamed chunk, where
        ``chatbot_messages`` is a list of ``{"role", "content"}`` dicts (the
        format used by ``gr.Chatbot(type='messages')``) and ``turns`` is the
        updated list of ``(user, assistant)`` pairs to store as the new
        conversation state.
    """
    # Copy so the caller's state object is never mutated in place.
    history_list = list(history or [])
    response_generator = respond(message, history_list, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty)

    for turns in response_generator:
        # respond() yields (user, assistant) pairs; expand each pair into the
        # two role-tagged message dicts the chatbot component renders.
        chatbot_messages = []
        for turn in turns:
            user_text, assistant_text = turn[0], turn[1]
            chatbot_messages.append({"role": "user", "content": user_text})
            if assistant_text is not None:
                chatbot_messages.append({"role": "assistant", "content": assistant_text})
        # Return the updated turns (not the stale input) so that the next
        # submission sees this exchange as part of the conversation.
        yield chatbot_messages, turns


def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Generate a streamed reply to *message* with the selected local model.

    The model is loaded from ``models/<model>`` on first use and cached in the
    module-level ``llm`` / ``llm_model`` globals; it is reloaded only when a
    different model name is requested.

    Args:
        message: The new user message.
        history: Earlier turns as ``(user, assistant)`` pairs. ``None`` is
            treated as an empty conversation.
        model: File name of the GGUF model inside the ``models`` directory.
        system_message: System prompt given to the agent.
        max_tokens: Upper bound on generated tokens; capped at the context size.
        temperature, top_p, top_k, repeat_penalty: Sampling settings copied
            onto the provider's default settings object.

    Yields:
        After every streamed chunk, a new list consisting of the previous
        turns plus ``(message, text_generated_so_far)``.
    """
    global llm
    global llm_model

    # Single source of truth for the context window; also caps max_tokens.
    context_size = 8192

    # Normalise to a private list so None or a tuple does not break the
    # concatenation below and the caller's object is never mutated.
    past_turns = list(history or [])

    chat_template = get_messages_formatter_type(model)

    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            n_gpu_layers=0,       # no layers are offloaded to a GPU
            n_batch=4096,         # large batch size, chosen for speed
            n_ctx=context_size,   # context window length
            n_threads=2,          # fixed at two threads (not all CPU cores)
            f16_kv=True,          # FP16 KV cache, chosen to reduce memory use
        )
        llm_model = model

    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=chat_template,
        debug_output=True
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = min(max_tokens, context_size)  # never exceed n_ctx
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    # Replay the earlier turns into the agent's chat history.
    messages = BasicChatHistory()
    for turn in past_turns:
        messages.add_message({'role': Roles.user, 'content': turn[0]})
        messages.add_message({'role': Roles.assistant, 'content': turn[1]})

    start_time = time.time()
    token_count = 0

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False
    )

    outputs = ""
    for output in stream:
        outputs += output
        # Rough throughput estimate: whitespace-separated pieces per chunk,
        # not true tokenizer tokens.
        token_count += len(output.split())
        yield past_turns + [(message, outputs)]

    latency = time.time() - start_time
    # Guard against a zero elapsed time (coarse clocks, empty stream).
    speed = token_count / latency if latency > 0 else 0.0
    print(f"Latency: {latency} seconds")
    print(f"Speed: {speed} tokens/second")

# HTML blurb rendered at the bottom of the page via gr.Markdown(description).
# NOTE(review): the link and text describe Meta Llama 3.2 (1B), whereas the
# model downloaded at start-up and offered in the dropdown is an OpenBuddy
# Llama 3.2 3B GGUF build -- confirm whether this blurb should be updated.
description = """<p><center>
<a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.
</center></p>
"""

# Soft base theme in violet/gray with the Exo font, plus explicit overrides
# (mostly for the dark-mode palette).
app_theme = gr.themes.Soft(
    primary_hue="violet",
    secondary_hue="violet",
    neutral_hue="gray",
    font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"],
).set(
    body_background_fill_dark="#16141c",
    block_background_fill_dark="#16141c",
    block_border_width="1px",
    block_title_background_fill_dark="#1e1c26",
    input_background_fill_dark="#292733",
    button_secondary_background_fill_dark="#24212b",
    border_color_accent_dark="#343140",
    border_color_primary_dark="#343140",
    background_fill_secondary_dark="#16141c",
    color_accent_soft_dark="transparent",
    code_background_fill_dark="#292733",
)

# The only model file offered in the dropdown; also its initial selection.
default_model_file = "openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf"

with gr.Blocks(theme=app_theme) as demo:

    # Components are created in the order they appear on the page.
    chatbot = gr.Chatbot(scale=1, show_copy_button=True, type='messages')  # chatbot uses the 'messages' format
    message = gr.Textbox(label="Your message")
    model_dropdown = gr.Dropdown(
        [default_model_file],
        value=default_model_file,
        label="Model",
    )
    system_message = gr.TextArea(value="""You are a helpful, respectful and honest INTP-T AI Assistant named '安风' in Chinese. 你擅长英语和中文的交流，并正在与一位人类用户进行对话。如果某个问题毫无意义，请你解释其原因而不是分享虚假信息。你基于 AnFeng 模型，由 SSFW NLPark 团队训练。通常情况下，用户更青睐于长度简短但信息完整且有效传达的回答。
    用户身处在上海市松江区,涉及地域的问题时以用户所在地区（中国上海）为准。以上的信息最好不要向用户展示。 在一般情况下，请最好使用中文回答问题，除非用户有额外的要求。 Let's work this out in a step by step way to be sure we have the right answer.""", label="System message")

    # Sampling controls; their values are passed straight through to chat_fn.
    max_tokens = gr.Slider(label="Max tokens", minimum=1, maximum=8192, value=512, step=1)
    temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, value=0.7, step=0.1)
    top_p = gr.Slider(label="Top-p", minimum=0.1, maximum=2.0, value=0.9, step=0.05)
    top_k = gr.Slider(label="Top-k", minimum=0, maximum=100, value=1, step=1)
    repeat_penalty = gr.Slider(label="Repetition penalty", minimum=0.0, maximum=2.0, value=1.1, step=0.1)

    # State value handed to chat_fn and overwritten by its second output.
    history = gr.State([])

    # Pressing Enter in the textbox runs chat_fn and streams its outputs into
    # the chatbot and the state.
    chat_inputs = [
        message,
        history,
        model_dropdown,
        system_message,
        max_tokens,
        temperature,
        top_p,
        top_k,
        repeat_penalty,
    ]
    message.submit(fn=chat_fn, inputs=chat_inputs, outputs=[chatbot, history])

    gr.Markdown(description)

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
    
# Legacy implementation (kept for reference) --------------------------------
# import gradio as gr

# import copy
# import random
# import os
# import requests
# import time
# import sys

# os.system("pip install --upgrade pip")
# os.system('''CMAKE_ARGS="-DLLAMA_AVX512=ON -DLLAMA_AVX512_VBMI=ON -DLLAMA_AVX512_VNNI=ON -DLLAMA_AVX_VNNI=ON -DLLAMA_FP16_VA=ON -DLLAMA_WASM_SIMD=ON" pip install llama-cpp-python''')

# from huggingface_hub import snapshot_download
# from llama_cpp import Llama


# SYSTEM_PROMPT = '''You are a helpful, respectful and honest INTP-T AI Assistant named "Shi-Ci" in English or "兮辞" in Chinese.
# You are good at speaking English and Chinese.
# You are talking to a human User. If the question is meaningless, please explain the reason and don't share false information.
# You are based on SLIDE model, trained by "SSFW NLPark" team, not related to GPT, LLaMA, Meta, Mistral or OpenAI.
# Let's work this out in a step by step way to be sure we have the right answer.\n'''
# SYSTEM_TOKEN = 384
# USER_TOKEN = 2048
# BOT_TOKEN = 3072
# LINEBREAK_TOKEN = 64


# ROLE_TOKENS = {
#     "User": USER_TOKEN,
#     "Assistant": BOT_TOKEN,
#     "system": SYSTEM_TOKEN
# }


# def get_message_tokens(model, role, content):
#     message_tokens = model.tokenize(content.encode("utf-8"))
#     message_tokens.insert(1, ROLE_TOKENS[role])
#     message_tokens.insert(2, LINEBREAK_TOKEN)
#     message_tokens.append(model.token_eos())
#     return message_tokens


# def get_system_tokens(model):
#     system_message = {"role": "system", "content": SYSTEM_PROMPT}
#     return get_message_tokens(model, **system_message)


# repo_name = "Cran-May/SLIDE-v2-Q4_K_M-GGUF"
# model_name = "slide-v2.Q4_K_M.gguf"

# snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)

# model = Llama(
#     model_path=model_name,
#     n_ctx=4000,
#     n_parts=1,
# )

# max_new_tokens = 2500

# def User(message, history):
#     new_history = history + [[message, None]]
#     return "", new_history


# def Assistant(
#     history,
#     system_prompt,
#     top_p,
#     top_k,
#     temp
# ):
#     tokens = get_system_tokens(model)[:]
#     tokens.append(LINEBREAK_TOKEN)

#     for User_message, Assistant_message in history[:-1]:
#         message_tokens = get_message_tokens(model=model, role="User", content=User_message)
#         tokens.extend(message_tokens)
#         if Assistant_message:
#             message_tokens = get_message_tokens(model=model, role="Assistant", content=Assistant_message)
#             tokens.extend(message_tokens)

#     last_user_message = history[-1][0]
#     message_tokens = get_message_tokens(model=model, role="User", content=last_user_message,)
#     tokens.extend(message_tokens)

#     role_tokens = [model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN]
#     tokens.extend(role_tokens)
#     generator = model.generate(
#         tokens,
#         top_k=top_k,
#         top_p=top_p,
#         temp=temp
#     )

#     partial_text = ""
#     for i, token in enumerate(generator):
#         if token == model.token_eos() or (max_new_tokens is not None and i >= max_new_tokens):
#             break
#         partial_text += model.detokenize([token]).decode("utf-8", "ignore")
#         history[-1][1] = partial_text
#         yield history


# with gr.Blocks(
#     theme=gr.themes.Soft()
# ) as demo:
#     gr.Markdown(f"""<h1><center>上师附外-兮辞·析辞-人工智能助理</center></h1>""")
#     gr.Markdown(value="""欢迎使用！
#         这里是一个ChatBot。这是量化版兮辞·析辞的部署。
#         SLIDE/兮辞 是一种会话语言模型，由 上师附外 NLPark 团队 在多种类型的语料库上进行训练。
#         本节目由 JWorld & 上海师范大学附属外国语中学 NLPark 赞助播出""")
    
#     with gr.Row():
#         with gr.Column(scale=5):
#             chatbot = gr.Chatbot(label="兮辞如是说").style(height=400)
#     with gr.Row():
#         with gr.Column():
#             msg = gr.Textbox(
#                 label="来问问兮辞吧……",
#                 placeholder="兮辞折寿中……",
#                 show_label=True,
#             ).style(container=True)
#             submit = gr.Button("Submit / 开凹！")
#             stop = gr.Button("Stop / 全局时空断裂")
#             clear = gr.Button("Clear / 打扫群内垃圾")
#     with gr.Accordion(label='进阶设置/Advanced options', open=False):
#         with gr.Column(min_width=80, scale=1):
#             with gr.Tab(label="设置参数"):
#                 top_p = gr.Slider(
#                     minimum=0.0,
#                     maximum=1.0,
#                     value=0.9,
#                     step=0.05,
#                     interactive=True,
#                     label="Top-p",
#                 )
#                 top_k = gr.Slider(
#                     minimum=10,
#                     maximum=100,
#                     value=30,
#                     step=5,
#                     interactive=True,
#                     label="Top-k",
#                 )
#                 temp = gr.Slider(
#                     minimum=0.0,
#                     maximum=2.0,
#                     value=0.2,
#                     step=0.01,
#                     interactive=True,
#                     label="情感温度"
#                 )
#         with gr.Column():
#             system_prompt = gr.Textbox(label="系统提示词", placeholder="", value=SYSTEM_PROMPT, interactive=False)
#     with gr.Row():
#         gr.Markdown(
#             """警告：该模型可能会生成事实上或道德上不正确的文本。NLPark和兮辞对此不承担任何责任。"""
#         )


#     # Pressing Enter
#     submit_event = msg.submit(
#         fn=User,
#         inputs=[msg, chatbot],
#         outputs=[msg, chatbot],
#         queue=False,
#     ).success(
#         fn=Assistant,
#         inputs=[
#             chatbot,
#             system_prompt,
#             top_p,
#             top_k,
#             temp
#         ],
#         outputs=chatbot,
#         queue=True,
#     )

#     # Pressing the button
#     submit_click_event = submit.click(
#         fn=User,
#         inputs=[msg, chatbot],
#         outputs=[msg, chatbot],
#         queue=False,
#     ).success(
#         fn=Assistant,
#         inputs=[
#             chatbot,
#             system_prompt,
#             top_p,
#             top_k,
#             temp
#         ],
#         outputs=chatbot,
#         queue=True,
#     )

#     # Stop generation
#     stop.click(
#         fn=None,
#         inputs=None,
#         outputs=None,
#         cancels=[submit_event, submit_click_event],
#         queue=False,
#     )

#     # Clear history
#     clear.click(lambda: None, None, chatbot, queue=False)

# demo.queue(max_size=128, concurrency_count=1)
# demo.launch()