Spaces:
Sleeping
Sleeping
File size: 5,019 Bytes
d1343e4 559ea97 021692e 1391fc1 2a813c3 968018c 5d264d1 968018c 394f37d 968018c 2a813c3 d1343e4 5d264d1 021692e 43a82b2 021692e 79c6bfe 021692e 79c6bfe 021692e e543d33 d1343e4 43a82b2 d1343e4 536efdb 2a813c3 a9310a4 67288d4 021692e d4735f7 021692e d4735f7 84b88a1 021692e 67288d4 021692e 84b88a1 0fef086 84b88a1 0fef086 84b88a1 0fef086 84b88a1 d4735f7 67288d4 5d264d1 67288d4 021692e 67288d4 3cd9e10 021692e 9cb7ee7 021692e a9310a4 021692e 43a82b2 021692e 67288d4 021692e 416e232 a526b93 5d264d1 a526b93 5d264d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# Importing libraries
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
from llama_cpp import Llama
import gradio as gr
import psutil
# --- One-time startup: download/load the translator and Llama models and set constants ---
print("! DOWNLOADING TOKENIZER AND SETTING ALL UP !")
translator_tokenizer = M2M100Tokenizer.from_pretrained(  # tokenizer for translator
    "facebook/m2m100_418M", cache_dir="translator/"
)
print("! DOWNLOADING MODEL AND SETTING ALL UP !")
translator_model = M2M100ForConditionalGeneration.from_pretrained(  # translator model
    "facebook/m2m100_418M", cache_dir="translator/"
)
print("! SETTING MODEL IN EVALUATION MODE !")
translator_model.eval()  # inference only — disables dropout etc.
print("! INITING LLAMA MODEL !")
llm = Llama(model_path="./model.bin")  # LLaMa model (GGUF file expected at ./model.bin)
llama_model_name = "TheBloke/WizardLM-1.0-Uncensored-Llama2-13B-GGUF"
print("! INITING DONE !")
# Preparing things to work
translator_tokenizer.src_lang = "en"  # model output is English; we always translate FROM en
title = "llama.cpp API"
# HTML description rendered under the Gradio title; the f-string splice shows
# psutil.virtual_memory()[2] — presumably the used-memory percentage (TODO confirm
# against the psutil named-tuple layout).
desc = '''<h1>Hello, world!</h1>
This is showcase how to make own server with Llama2 model.<br>
I'm using here 13b model just for example. Also here's only CPU power.<br>
But you can use GPU power as well!<br><br>
<h1>How to GPU?</h1>
Change <code>`CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS`</code> in Dockerfile on <code>`CMAKE_ARGS="-DLLAMA_CUBLAS=on"`</code>. Also you can try <code>`DLLAMA_CLBLAST`</code> or <code>`DLLAMA_METAL`</code>.<br><br>
<h1>How to test it on own machine?</h1>
You can install Docker, build image and run it. I made <code>`run-docker.sh`</code> for ya. To stop container run <code>`docker ps`</code>, find name of container and run <code>`docker stop _dockerContainerName_`</code><br>
Or you can once follow steps in Dockerfile and try it on your machine, not in Docker.<br>
<br>''' + f"Memory used: {psutil.virtual_memory()[2]}<br>" + '''
Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-python</a> and <a href="https://www.gradio.app/">Gradio</a>.<br><br>'''
# NOTE: the triple-quoted string below is a free-standing expression (a no-op at
# runtime) used as an in-file cheat sheet mapping the codes in `languages`.
'''
# Defining languages for translator (i just chose popular on my opinion languages!!!)
ru - Russian
uk - Ukranian
zh - Chinese
de - German
fr - French
hi - Hindi
it - Italian
ja - Japanese
es - Spanish
ar - Arabic
'''
# Language codes accepted as translation targets by generate_answer.
languages = ["ru", "uk", "zh", "de", "fr", "hi", "it", "ja", "es", "ar"]
# Loading prompt — system prompt template; expected to contain a "{prompt}" placeholder.
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()
def generate_answer(request: str, max_tokens: int = 256, language: str = "en", custom_prompt: str = None):
    """Generate an answer with the Llama model, optionally translating it.

    Args:
        request: User input text, substituted into the "{prompt}" placeholder.
        max_tokens: Generation budget; values outside 16..256 fall back to 64.
        language: Target language code. Translated from English only when it
            is in the module-level ``languages`` list and is not "en".
        custom_prompt: Optional system-prompt template with a "{prompt}"
            placeholder; when not a string, the module-level ``prompt``
            template (from ``system.prompt``) is used.

    Returns:
        A ``(answer, logs)`` tuple; ``answer`` is a user-facing error message
        when the inputs are bad or generation/translation fails.
    """
    logs = f"Request: {request}\nMax tokens: {max_tokens}\nLanguage: {language}\nCustom prompt: {custom_prompt}\n"
    try:
        # int() because gr.Number delivers floats; out-of-range budgets fall back to 64.
        maxTokens = int(max_tokens) if 16 <= max_tokens <= 256 else 64
        if isinstance(custom_prompt, str):
            userPrompt = custom_prompt.replace("{prompt}", request)
        else:
            userPrompt = prompt.replace("{prompt}", request)
        logs += f"\nFinal prompt: {userPrompt}\n"
    except (TypeError, AttributeError):
        # Bad/missing inputs (e.g. request is None, max_tokens not comparable).
        # Narrowed from a bare `except:` so real bugs are no longer swallowed.
        return "Not enough data! Check that you passed all needed data.", logs
    try:
        # llama.cpp occasionally returns an empty completion; retry a bounded
        # number of times (the original `while True` could spin forever).
        max_attempts = 5
        text = ""
        counter = 0
        for counter in range(1, max_attempts + 1):
            logs += f"Attempt {counter} to generate answer...\n"
            output = llm(userPrompt, max_tokens=maxTokens, stop=["User:"], echo=False)
            text = output["choices"][0]["text"]
            if len(text.strip()) > 1:  # non-trivial answer produced
                break
        logs += f"Final attempt: {counter}\n"
        if language in languages and language != "en":
            logs += f"\nTranslating from en to {language}"
            encoded_input = translator_tokenizer(text, return_tensors="pt")
            generated_tokens = translator_model.generate(
                **encoded_input, forced_bos_token_id=translator_tokenizer.get_lang_id(language)
            )
            translated_text = translator_tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )[0]
            logs += f"\nTranslated: {translated_text}\nOriginal: {text}"
            return translated_text, logs
        logs += f"\nOriginal: {text}"
        return text, logs
    except Exception as e:
        # Top-level boundary: report instead of crashing the Gradio worker.
        print(e)
        return "Oops! Internal server error. Check the logs of space/instance.", logs
print("\n\n\n")
print("! LOAD GRADIO INTERFACE !")
# Wire the UI: the four inputs map positionally onto generate_answer's
# (request, max_tokens, language, custom_prompt); the two outputs are its
# (answer, logs) return tuple.
demo = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.components.Textbox(label="Input"),
        # max_tokens — values outside 16..256 are reset to 64 inside generate_answer
        gr.components.Number(value=256),
        gr.components.Dropdown(label="Target Language", value="en", choices=["en"]+languages),
        gr.components.Textbox(label="Custom system prompt"),
    ],
    outputs=[
        gr.components.Textbox(label="Output"),
        gr.components.Textbox(label="Logs")
    ],
    title=title,
    description=desc,
    allow_flagging='never'
)
demo.queue()  # serialize requests — the single Llama instance is not concurrent-safe
print("! LAUNCHING GRADIO !")
demo.launch(server_name="0.0.0.0")  # bind all interfaces so the container port is reachable