# Importing libraries
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
from llama_cpp import Llama
import gradio as gr
import psutil

# Initializing models
print("! DOWNLOADING TOKENIZER AND SETTING ALL UP !")
translator_tokenizer = M2M100Tokenizer.from_pretrained(            # tokenizer for translator
    "facebook/m2m100_418M", cache_dir="translator/"
)
print("! DOWNLOADING MODEL AND SETTING ALL UP !")
translator_model = M2M100ForConditionalGeneration.from_pretrained( # translator model
    "facebook/m2m100_418M", cache_dir="translator/"
)
print("! SETTING MODEL IN EVALUATION MODE !")
translator_model.eval()
print("! INITING LLAMA MODEL !")
llm = Llama(model_path="./model.bin")                              # Llama model loaded through llama.cpp
llama_model_name = "TheBloke/WizardLM-1.0-Uncensored-Llama2-13B-GGUF"  # name of the model used, kept for reference
print("! INITING DONE !")

# Preparing the translator and the UI text
translator_tokenizer.src_lang = "en"  # answers are generated in English and then translated
title = "llama.cpp API"
desc = '''<h1>Hello, world!</h1>
This is a showcase of how to make your own server with a Llama 2 model.<br>
I'm using a 13B model here just as an example, and only CPU power.<br>
But you can use GPU power as well!<br><br>
<h1>How to use a GPU?</h1>
Change <code>`CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"`</code> in the Dockerfile to <code>`CMAKE_ARGS="-DLLAMA_CUBLAS=on"`</code>. You can also try <code>`-DLLAMA_CLBLAST`</code> or <code>`-DLLAMA_METAL`</code>.<br><br>
<h1>How to test it on your own machine?</h1>
You can install Docker, build the image and run it; <code>`run-docker.sh`</code> does that for you. To stop the container, run <code>`docker ps`</code>, find the container name and run <code>`docker stop _dockerContainerName_`</code>.<br>
Or you can follow the steps from the Dockerfile once and try it directly on your machine, without Docker.<br>
<br>''' + f"Memory used: {psutil.virtual_memory()[2]}%<br>" + '''
Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-python</a> and <a href="https://www.gradio.app/">Gradio</a>.<br><br>'''
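
# The GPU switch described above is applied when llama-cpp-python is built; a sketch of what that
# line in the Dockerfile could look like (the exact Dockerfile contents are an assumption):
#   CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python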

'''
    # Languages supported by the translator (a selection of popular languages)
    ru - Russian
    uk - Ukrainian
    zh - Chinese
    de - German
    fr - French
    hi - Hindi
    it - Italian
    ja - Japanese
    es - Spanish
    ar - Arabic
'''
languages = ["ru", "uk", "zh", "de", "fr", "hi", "it", "ja", "es", "ar"]

# Loading prompt
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()
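
# generate_answer() below substitutes the request into a "{prompt}" placeholder, so system.prompt is
# expected to contain one. A purely hypothetical example of its contents:
#   You are a helpful assistant.
#   User: {prompt}
#   Assistant: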

def generate_answer(request: str, max_tokens: int = 256, language: str = "en", custom_prompt: str = None):
    logs = f"Request: {request}\nMax tokens: {max_tokens}\nLanguage: {language}\nCustom prompt: {custom_prompt}\n"
    try:
        maxTokens = max_tokens if 16 <= max_tokens <= 256 else 64  # clamp the requested length; fall back to 64 if out of range
        if isinstance(custom_prompt, str):
            userPrompt = custom_prompt.replace("{prompt}", request)
        else:
            userPrompt = prompt.replace("{prompt}", request)
        logs += f"\nFinal prompt: {userPrompt}\n"
    except Exception:
        return "Not enough data! Check that you passed all the required data.", logs
    
    try:
        # Retry loop: the model occasionally returns an empty completion, so regenerate until we get text
        counter = 1
        while True:
            logs += f"Attempt {counter} to generate answer...\n"
            output = llm(userPrompt, max_tokens=maxTokens, stop=["User:"], echo=False)
            text = output["choices"][0]["text"]
            if len(text.strip()) > 1:
                break
            counter += 1
        logs += f"Final attempt: {counter}\n"

        if language in languages and language != "en":
            logs += f"\nTranslating from en to {language}"
            encoded_input = translator_tokenizer(text, return_tensors="pt")
            generated_tokens = translator_model.generate(
                **encoded_input, forced_bos_token_id=translator_tokenizer.get_lang_id(language)
            )
            translated_text = translator_tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )[0]
            logs += f"\nTranslated: {translated_text}\nOriginal: {text}"
            return translated_text, logs
        logs += f"\nOriginal: {text}"
        return text, logs
    except Exception as e:
        print(e)
        return "Oops! Internal server error. Check the logs of space/instance.", logs
    print("\n\n\n")

print("! LOAD GRADIO INTERFACE !")
demo = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.components.Textbox(label="Input"),
        gr.components.Number(label="Max tokens", value=256),
        gr.components.Dropdown(label="Target Language", value="en", choices=["en"]+languages),
        gr.components.Textbox(label="Custom system prompt"),
    ],
    outputs=[
        gr.components.Textbox(label="Output"),
        gr.components.Textbox(label="Logs")
    ],
    title=title,
    description=desc,
    allow_flagging='never'
)
demo.queue()
print("! LAUNCHING GRADIO !")
demo.launch(server_name="0.0.0.0")
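
# Once the server is running, the endpoint can also be queried programmatically, e.g. with gradio_client
# (a sketch; the URL and api_name="/predict" are assumptions for a default gr.Interface):
#   from gradio_client import Client
#   client = Client("http://localhost:7860")
#   answer, logs = client.predict("Hello!", 128, "en", "", api_name="/predict")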