# Importing libraries
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
from llama_cpp import Llama
import gradio as gr
import psutil

# Initing things
# NOTE: the M2M100 translator is currently disabled; the setup below is kept
# commented out so it can be re-enabled together with the translation step in
# generate_answer().
#print("! DOWNLOADING TOKENIZER AND SETTING ALL UP !")
#translator_tokenizer = M2M100Tokenizer.from_pretrained(  # tokenizer for translator
#    "facebook/m2m100_418M", cache_dir="translator/"
#)
#print("! DOWNLOADING MODEL AND SETTING ALL UP !")
#translator_model = M2M100ForConditionalGeneration.from_pretrained(  # translator model
#    "facebook/m2m100_418M", cache_dir="translator/"
#)
#print("! SETTING MODEL IN EVALUATION MODE !")
#translator_model.eval()
print("! INITING LLAMA MODEL !")
llm = Llama(model_path="./model.bin")  # LLaMa model
# Informational only: not referenced anywhere else in the visible code.
llama_model_name = "TheBloke/Llama-2-13B-chat-GGUF"
print("! INITING DONE !")

# Preparing things to work
#translator_tokenizer.src_lang = "en"
title = "llama.cpp API"
# Description shown in the Gradio UI. The memory figure is evaluated once, at
# import time, so it reflects usage at startup rather than per request.
desc = '''

Hello, world!

This is showcase how to make own server with Llama2 model.
I'm using here 7b model just for example. Also here's only CPU power.
But you can use GPU power as well!

How to GPU?

Change `CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS` in Dockerfile on `CMAKE_ARGS="-DLLAMA_CUBLAS=on"`. Also you can try `DLLAMA_CLBLAST`, `DLLAMA_METAL` or `DLLAMA_METAL`.
Powered by llama-cpp-python, Quart and Uvicorn.

How to test it on own machine?

You can install Docker, build image and run it. I made `run-docker.sh` for ya. To stop container run `docker ps`, find name of container and run `docker stop _dockerContainerName_`
Or you can once follow steps in Dockerfile and try it on your machine, not in Docker.

''' + f"Memory used: {psutil.virtual_memory().percent}\n" + ''' '''

# Loading prompt
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()


def generate_answer(request: str, max_tokens: int = 256, language: str = "en", custom_prompt: str = None):
    """Generate a single-line assistant reply for ``request``.

    Args:
        request: The user's message; appended to the system prompt as the
            ``User:`` turn.
        max_tokens: Requested completion length. Values outside 16..256 are
            replaced by 64. Accepts floats (Gradio's Number component
            delivers them) and converts to ``int`` before calling the model.
        language: Target language selected in the UI. Currently unused
            because the translation step is disabled.
        custom_prompt: Optional replacement for the system prompt loaded from
            ``system.prompt``. ``None`` or a blank string falls back to the
            file prompt (an untouched Gradio textbox submits ``""``).

    Returns:
        The generated text, or a human-readable error message when the
        inputs are unusable or the model call fails. Never raises.
    """
    try:
        # Range-check the raw value first so fractional inputs such as 256.5
        # are still rejected, then hand the model a real integer.
        token_limit = int(max_tokens) if 16 <= max_tokens <= 256 else 64

        # Only a non-blank custom prompt overrides the default one.
        use_custom = isinstance(custom_prompt, str) and custom_prompt.strip() != ""
        system_prompt = custom_prompt if use_custom else prompt
        user_prompt = system_prompt + "\n\nUser: " + request + "\nAssistant: "
    except (TypeError, ValueError):
        # Raised when max_tokens or request is missing / of the wrong type.
        return "Not enough data! Check that you passed all needed data."

    try:
        # Stop at the next "User:" turn or at the first newline so only one
        # assistant line is returned.
        output = llm(user_prompt, max_tokens=token_limit, stop=["User:", "\n"], echo=False)
        text = output["choices"][0]["text"]
        # Translation is restricted to a few languages chosen by the author:
        # russian (ru), ukranian (uk), chinese (zh)
        # NOTE(review): the disabled code below tokenizes `output` (the whole
        # completion result) rather than `text` - check before re-enabling.
        #if language in ["ru", "uk", "zh"]:
            #encoded_input = translator_tokenizer(output, return_tensors="pt")
            #generated_tokens = translator_model.generate(
            #    **encoded_input, forced_bos_token_id=translator_tokenizer.get_lang_id(language)
            #)
            #translated_text = translator_tokenizer.batch_decode(
            #    generated_tokens, skip_special_tokens=True
            #)[0]
            #return translated_text
        return text
    except Exception as e:
        # UI boundary: log the failure and return a message instead of
        # letting the exception reach Gradio.
        print(e)
        return "Oops! Internal server error. Check the logs of space/instance."


print("! LOAD GRADIO INTERFACE !")
# Input order must match generate_answer's positional parameters.
demo = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.components.Textbox(label="Input"),
        gr.components.Number(value=256),
        gr.components.Dropdown(label="Target Language", value="en", choices=["en", "ru", "uk", "zh"]),
        gr.components.Textbox(label="Custom system prompt"),
    ],
    outputs=["text"],
    title=title,
    description=desc
)
#demo.queue()
print("! LAUNCHING GRADIO !")
demo.launch()