"""Gradio front end that serves completions from a local llama.cpp model.

At import time this script loads the model from ``./model.bin``, reads the
prompt template (``system.prompt``) and the default system message
(``system.message``), builds a Gradio interface around
:func:`generate_answer`, and launches it on all network interfaces.
"""

# Importing libraries
from time import time
from typing import Optional

import gradio as gr
import psutil
from llama_cpp import Llama

# Initing things
print("! INITING LLAMA MODEL !")
llm = Llama(model_path="./model.bin")  # LLaMa model
llama_model_name = "TheBloke/openchat_3.5-GGUF"
print("! INITING DONE !")

# Generation limits
MIN_TOKENS = 16  # smallest accepted max_tokens value
MAX_TOKENS = 256  # largest accepted max_tokens value
FALLBACK_TOKENS = 64  # used when the requested value is out of range
MAX_ATTEMPTS = 3  # how many times generation is retried on empty output
STOP_SEQUENCES = ["<|im_end|>", "<|end_of_turn|>"]

# Preparing things to work
title = "llama.cpp API"
# NOTE: the memory figure is sampled once, when the module is loaded; it is
# not refreshed per request.
desc = '''

Hello, world!

This is showcase how to make own server with Llama2 model.
I'm using here 7b model just for example. Also here's only CPU power.
But you can use GPU power as well!

How to GPU?

Change `CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS` in Dockerfile on `CMAKE_ARGS="-DLLAMA_CUBLAS=on"`. Also you can try `DLLAMA_CLBLAST` or `DLLAMA_METAL`.

How to test it on own machine?

You can install Docker, build image and run it. I made `run-docker.sh` for ya. To stop container run `docker ps`, find name of container and run `docker stop _dockerContainerName_`
Or you can once follow steps in Dockerfile and try it on your machine, not in Docker.

''' + f"Memory used: {psutil.virtual_memory().percent}\n" + ''' Powered by llama-cpp-python and Gradio.

'''

# Loading prompt
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()
with open('system.message', 'r', encoding='utf-8') as f:
    system_message = f.read()


def _has_content(text) -> bool:
    """Return True when *text* is a string with more than one non-blank character."""
    return isinstance(text, str) and len(text.strip()) > 1


def generate_answer(request: str, max_tokens: int = 256, custom_prompt: Optional[str] = None):
    """Generate a completion for *request* and return it with a diagnostic log.

    Args:
        request: User text substituted for ``{prompt}`` in the prompt template.
        max_tokens: Requested completion length. Values outside
            ``MIN_TOKENS..MAX_TOKENS`` are replaced by ``FALLBACK_TOKENS``;
            in-range values are truncated to ``int`` (the Gradio number
            field may deliver a float).
        custom_prompt: Optional replacement for the default system message.
            Ignored unless it has more than one non-blank character.

    Returns:
        A ``(text, logs)`` tuple. ``text`` is the model output or a
        human-readable error message; ``logs`` is a plain-text trace of the
        request, the final prompt, each attempt and the time spent.
    """
    t0 = time()
    logs = f"Request: {request}\nMax tokens: {max_tokens}\nCustom prompt: {custom_prompt}\n"

    # Validate the inputs and assemble the final prompt. A missing or
    # wrongly-typed value surfaces as TypeError/ValueError here.
    try:
        if MIN_TOKENS <= max_tokens <= MAX_TOKENS:
            token_limit = int(max_tokens)
        else:
            token_limit = FALLBACK_TOKENS
        system_text = custom_prompt if _has_content(custom_prompt) else system_message
        user_prompt = prompt.replace("{prompt}", request)
        user_prompt = user_prompt.replace("{system_message}", system_text)
        logs += f"\nFinal prompt: {user_prompt}\n"
    except (TypeError, ValueError):
        return "Not enough data! Check that you passed all needed data.", logs

    try:
        # Workaround: the model occasionally returns an empty completion for
        # reasons not yet understood, so retry a few times before giving up.
        attempt = 1
        while attempt <= MAX_ATTEMPTS:
            logs += f"Attempt {attempt} to generate answer...\n"
            output = llm(user_prompt, max_tokens=token_limit, stop=STOP_SEQUENCES, echo=False)
            text = output["choices"][0]["text"]
            if _has_content(text):
                break
            attempt += 1
        # When every attempt came back empty this reports MAX_ATTEMPTS + 1.
        logs += f"Final attempt: {attempt}\n"

        if not _has_content(text):
            logs += f"Generated and aborted: {text}"
            text = "Sorry, but something went wrong while generating answer. Try again or fix code. If you are maintainer of this space, look into logs."

        logs += f"\nFinal: '''{text}'''"
        logs += f"\n\nTime spent: {time()-t0}"
        return text, logs
    except Exception as e:
        # Request boundary: report the failure in the logs instead of
        # letting it propagate into the web UI.
        logs += str(e)
        logs += f"\n\nTime spent: {time()-t0}"
        return "Oops! Internal server error. Check the logs of space/instance.", logs


print("! LOAD GRADIO INTERFACE !")
demo = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.components.Textbox(label="Input"),
        gr.components.Number(value=256),
        gr.components.Textbox(label="Custom system prompt"),
    ],
    outputs=[
        gr.components.Textbox(label="Output"),
        gr.components.Textbox(label="Logs"),
    ],
    title=title,
    description=desc,
    allow_flagging='never',
)
demo.queue()
print("! LAUNCHING GRADIO !")
demo.launch(server_name="0.0.0.0")