from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import prompt_style
import os
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import time


# model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF"
# filename="Meta-Llama-3-8B-Instruct-abliterated-v3_q6.gguf"
# model_path = hf_hub_download(repo_id=model_id, filename="Meta-Llama-3-8B-Instruct-abliterated-v3_q6.gguf", token=os.environ['HF_TOKEN'])
# model = Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=4096, verbose=False)

# model = Llama.from_pretrained(repo_id=model_id, filename=filename, n_gpu_layers=-1, token=os.environ['HF_TOKEN'], 
#                               n_ctx=4096, verbose=False, attn_implementation="flash_attention_2")

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3"

# Load the model in 8-bit via bitsandbytes; flash_attention_2 requires a compatible GPU.
model_8bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True),
                                                  token=os.environ['HF_TOKEN'], attn_implementation="flash_attention_2")

# Tokenizer for building the chat template and decoding the generated tokens.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])


class Item(BaseModel):
    prompt: str
    history: list
    system_prompt: str
    temperature: float = 0.8
    max_new_tokens: int = 1024
    top_p: float = 0.95
    repetition_penalty: float = 1.0
    seed: int = 42
    
app = FastAPI()

def format_prompt(item: Item):
    # Build an OpenAI-style message list: the server-side system prompt from
    # prompt_style, followed by the chat history as alternating user/assistant
    # turns, and finally the current user prompt.
    messages = [
        {"role": "system", "content": prompt_style.data},
    ]
    for it in item.history:
        messages.append({"role": "user", "content": it[0]})
        messages.append({"role": "assistant", "content": it[1]})
    messages.append({"role": "user", "content": item.prompt})
    return messages
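
# Illustrative sketch of the structure format_prompt returns (values are made up):
#   [{"role": "system", "content": "<prompt_style.data>"},
#    {"role": "user", "content": "previous question"},
#    {"role": "assistant", "content": "previous answer"},
#    {"role": "user", "content": "current prompt"}]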

def generate(item: Item):
    # Render the chat as token ids with the model's chat template, generate on
    # the GPU, and decode only the newly generated tokens.
    formatted_prompt = format_prompt(item)
    # output = model.create_chat_completion(messages=formatted_prompt, seed=item.seed,  
    #                                       temperature=item.temperature, max_tokens=item.max_new_tokens)
    # out = output['choices'][0]['message']['content']
    # return out
    
    input_ids = tokenizer.apply_chat_template(
        formatted_prompt,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    # Stop on either the end-of-sequence token or Llama 3's end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    
    outputs = model_8bit.generate(
        input_ids,
        max_new_tokens=item.max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=item.temperature,
        top_p=item.top_p,
    )
    # Slice off the prompt tokens so only the model's reply is decoded.
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

    # inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # generated_ids = model.generate(**inputs)
    # outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

@app.post("/generate/")
async def generate_text(item: Item):
    # Run generation and log the answer plus wall-clock time before returning it.
    t1 = time.time()
    ans = generate(item)
    print(ans)
    print(f"time: {time.time() - t1}")
    return {"response": ans}
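
# Example client call (a sketch; assumes the server is reachable at
# http://localhost:8000 — adjust host/port to your deployment):
#
#   import requests
#   payload = {
#       "prompt": "Hello, who are you?",
#       "history": [],
#       "system_prompt": "",
#       "temperature": 0.8,
#       "max_new_tokens": 256,
#   }
#   r = requests.post("http://localhost:8000/generate/", json=payload)
#   print(r.json()["response"])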


@app.get("/")
def read_root():
    # Simple health-check endpoint.
    return {"Hello": "World"}
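

# Minimal entry point using the imported uvicorn; the host and port below are
# assumptions — adjust them, or launch with the uvicorn CLI, to match your setup.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)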