import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import spaces

huggingface_token = os.getenv("HF_TOKEN")

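# Arabic instruction template (Alpaca-style); English gloss:
# "Below is an instruction that describes a task. Write a response that appropriately completes the request."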
infer_prompt = "فيما يلي تعليمات تصف مهمة. اكتب استجابة تكمل الطلب بشكل مناسب.\n\n### تعليمات:\n{}\n\n### إجابة:\n"

model_id = "nazimali/mistral-7b-v0.3-instruct-arabic"
file_name = "Q8_0.gguf"
llm = None

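# Download the GGUF weights into ./models at startup; the Llama instance is created lazily on the first request.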
hf_hub_download(
    repo_id=model_id,
    filename=file_name,
    local_dir="./models",
    token=huggingface_token,
)


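# Run inference on a GPU allocated per call by Hugging Face Spaces (ZeroGPU).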
@spaces.GPU
def respond(
    message,
    history,
):
    global llm
    # Load the model on the first request and reuse it for subsequent ones.
    if llm is None:
        llm = Llama(
            model_path=f"./models/{file_name}",
            flash_attn=True,
            n_gpu_layers=-1,  # offload all layers to the GPU
            n_ctx=1024,
            verbose=True,
        )

    stream = llm.create_chat_completion(
        messages=[{"role": "user", "content": infer_prompt.format(message)}],
        max_tokens=512,
        repeat_penalty=1.2,
        stream=True,
        temperature=0.7,
        top_k=40,
        top_p=0.95,
    )

    # Accumulate streamed deltas and yield the partial reply so the chat UI updates incrementally;
    # .get() guards against chunks that carry no "content" (e.g. the initial role-only delta).
    outputs = ""
    for output in stream:
        print(output)
        outputs += output["choices"][0]["delta"].get("content", "")
        yield outputs

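# Chat UI; the first example is the Arabic greeting "Peace be upon you, how are you?"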
demo = gr.ChatInterface(
    respond,
    type="messages",
    examples=["السلام عليكم كيف حالك؟", "hello"],
    title="Mistral 7B Arabic Fine-tuned",
)


if __name__ == "__main__":
    demo.launch()