import os import copy import gradio as gr from typing import List, Tuple from llama_cpp import Llama from huggingface_hub import hf_hub_download # Load the LLaMA model llm = Llama( model_path=hf_hub_download( repo_id=os.environ.get("REPO_ID", "mradermacher/Atlas-Chat-2B-GGUF"), filename=os.environ.get("MODEL_FILE", "Atlas-Chat-2B.Q8_0.gguf"), ), n_ctx=2048, # context window size ) # Training prompt template training_prompt = """user {} model {}""" # Generate response function def response( user_message: str, chat_history: List[Tuple[str, str]], max_response_length: int, temperature: float, top_p: float, ): if not user_message.strip(): return "تقدروا تكتبوا الرسالة مرة اخرى؟" # Format chat history into the prompt formatted_prompt = "" for user_input, model_response in chat_history: formatted_prompt += training_prompt.format(user_input, model_response) # Add the current user message to the formatted prompt formatted_prompt += training_prompt.format(user_message, "") try: output = llm( formatted_prompt, max_tokens=max_response_length, temperature=temperature, top_p=top_p, top_k=40, repeat_penalty=1.1, stop=["", "<|endoftext|>"], stream=True, ) response_text = "" for out in output: stream = copy.deepcopy(out) response_text += stream["choices"][0]["text"] return response_text except Exception as e: return f"شي خطأ وقع: {str(e)}" # Create the Gradio chat interface demo = gr.ChatInterface( response, title="AtlasChat-mini", description="""\ # AtlasChat-mini 2B This is a demo of [`MBZUAI-Paris/Atlas-Chat-2B`](https://huggingface.co/mbzuai-paris/atlas-chat-2b). For more details, please check [our paper](https://arxiv.org/pdf/2409.17912). Looking for a larger and more powerful version? Try the 9B version in [Hugging Face](https://huggingface.co/mbzuai-paris/atlas-chat-9b). This demo was done using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library for efficient inference and is running the [`mradermacher/Atlas-Chat-2B-GGUF`](https://huggingface.co/mradermacher/Atlas-Chat-2B-GGUF) version with 8-bit Q8_0 quantization. """, examples=[ ['What is the capital of Morocco?'], ['كيفاش نوجد شي طاجين ؟'], ['واش تقدر تعوض Google ؟'], ['عاود لي شي نكتة'] ], cache_examples=False, additional_inputs=[ gr.Slider(minimum=1, maximum=1024, value=128, step=1, label="Max New Tokens"), gr.Slider(minimum=0.1, maximum=3.0, value=0.5, step=0.1, label="Temperature"), gr.Slider(minimum=0.1, maximum=1.0, value=0.90, step=0.05, label="Top-p (nucleus sampling)"), ], ) # Launch the demo if __name__ == "__main__": demo.launch()