|
import os
from typing import List, Tuple

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the GGUF weights from the Hugging Face Hub (cached locally after the
# first run) and load them with llama.cpp. REPO_ID and MODEL_FILE can be set in
# the environment to override the defaults.
llm = Llama(
    model_path=hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "mradermacher/Atlas-Chat-2B-GGUF"),
        filename=os.environ.get("MODEL_FILE", "Atlas-Chat-2B.Q8_0.gguf"),
    ),
    n_ctx=2048,
)
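
# For example, to load a different quantization of the same model (illustrative
# filename; check the repo for the files it actually ships):
#   export MODEL_FILE=Atlas-Chat-2B.Q4_K_M.gguf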
|
|
|
|
|
|
|
# Gemma-style chat template used by Atlas-Chat: every turn is wrapped in
# <start_of_turn>/<end_of_turn> markers, matching the model's training format.
chat_template = """<start_of_turn>user
{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn>
"""
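
# For illustration, a single history pair ("Hi", "Hello!") renders as:
#   <start_of_turn>user
#   Hi<end_of_turn>
#   <start_of_turn>model
#   Hello!<end_of_turn>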
|
|
|
|
|
|
|
def response(
    user_message: str,
    chat_history: List[Tuple[str, str]],
    max_response_length: int,
    temperature: float,
    top_p: float,
):
    # Ask the user (in Darija) to resend when the message is empty.
    if not user_message.strip():
        yield "تقدروا تكتبوا الرسالة مرة اخرى؟"  # "Could you write the message again?"
        return

    # Replay the conversation history through the chat template, then open a
    # fresh model turn for the current message and let the model complete it.
    formatted_prompt = ""
    for user_input, model_response in chat_history:
        formatted_prompt += chat_template.format(user_input, model_response)
    formatted_prompt += "<start_of_turn>user\n{}<end_of_turn>\n<start_of_turn>model\n".format(user_message)
|
|
|
    try:
        output = llm(
            formatted_prompt,
            max_tokens=max_response_length,
            temperature=temperature,
            top_p=top_p,
            top_k=40,
            repeat_penalty=1.1,
            stop=["<end_of_turn>", "<|endoftext|>"],
            stream=True,
        )

        # Accumulate the streamed chunks and yield the growing text so the
        # chat UI updates as tokens arrive.
        response_text = ""
        for chunk in output:
            response_text += chunk["choices"][0]["text"]
            yield response_text
|
|
|
    except Exception as e:
        yield f"شي خطأ وقع: {str(e)}"  # "Something went wrong: ..."
|
|
|
|
|
# Build the Gradio chat interface around the streaming `response` generator.
demo = gr.ChatInterface(
    response,
    title="AtlasChat-mini",
    description="""\
# AtlasChat-mini 2B

This is a demo of [`MBZUAI-Paris/Atlas-Chat-2B`](https://huggingface.co/mbzuai-paris/atlas-chat-2b). For more details, please check [the paper](https://arxiv.org/pdf/2409.17912).

Looking for a larger, more powerful version? Try the 9B model on [Hugging Face](https://huggingface.co/mbzuai-paris/atlas-chat-9b).

This demo uses the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library for efficient inference and runs the [`mradermacher/Atlas-Chat-2B-GGUF`](https://huggingface.co/mradermacher/Atlas-Chat-2B-GGUF) build with 8-bit (Q8_0) quantization.
""",
    examples=[
        ["What is the capital of Morocco?"],
        ["كيفاش نوجد شي طاجين ؟"],  # "How do I make a tagine?"
        ["واش تقدر تعوض Google؟"],  # "Can you replace Google?"
        ["عاود لي شي نكتة"],  # "Tell me a joke"
    ],
    cache_examples=False,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=1024, value=128, step=1, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=3.0, value=0.5, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.90, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
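
# Gradio passes the current slider values to `response` as extra positional
# arguments, in the order they appear in `additional_inputs`.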
|
|
|
|
|
|
|
if __name__ == "__main__":
    demo.launch()
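
# To run locally (assuming this script is saved as app.py):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py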