import gradio as gr
import json
import os
import spaces
import torch
from dotenv import load_dotenv
from huggingface_hub import login, snapshot_download
from superposed.llama.superposed_generation import SuperposedLlama
from superposed.llama.tokenizer import Tokenizer
from superposed.ngrams.ngram_models import make_models
# Configure torch.distributed environment for a single-process, single-GPU run
os.environ['RANK'] = "0"
os.environ['WORLD_SIZE'] = "1"
os.environ['MASTER_PORT'] = "12193"
os.environ['MASTER_ADDR'] = "127.0.0.1"
def load_models():
    # Build the superposed Llama-2 model from the downloaded weights.
    # `weight_path` is set at module level below, before this is called.
    model = SuperposedLlama.build(ckpt_dir=weight_path,
                                  tokenizer_path=f'{weight_path}/tokenizer.model',
                                  max_seq_len=100,
                                  max_batch_size=32,
                                  device="cuda",
                                  model_parallel_size=1)
    return model
# load_dotenv()  # Uncomment to read HF_ACCESS_TOKEN from a local .env file
login(os.getenv("HF_ACCESS_TOKEN"))
# Download the gated Llama-2-7B checkpoint on first launch
# (requires an authorized HF token)
weight_path = "./weights/"
if not os.path.exists(weight_path):
    os.mkdir(weight_path)
    snapshot_download(repo_id="meta-llama/Llama-2-7b", local_dir=weight_path)
# Load superposed decoding hyperparameters
param_file = "params/p15_d3_ngram4_mixed.json"
with open(param_file, "r") as f:
    params = json.load(f)
alpha = params["alpha"]                    # n-gram interpolation weight
temp = params["temp"]                      # sampling temperature
n_drafts = params["n_drafts"]              # number of drafts held in superposition
prompt_len = params["prompt_len"]
n_token_sample = params["n_token_sample"]
i_weights = params["i_weights"]            # per-order n-gram interpolation weights
i_length = params["i_length"]              # n-gram orders used for interpolation
# Load main model and tokenizer
model = load_models()
tokenizer = Tokenizer(f'{weight_path}/tokenizer.model')
# Build bigram, trigram, and fourgram models from the "ckpts-200k" checkpoints
ngrams = make_models("ckpts-200k", bigram=True, trigram=True, fourgram=True,
                     fivegram=False, sixgram=False, sevengram=False)
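# Rough sketch of the n-gram models' role (an assumption based on the
# Superposed Decoding setup, not verified against sup_generate's internals):
# at each step the model's next-token probabilities are interpolated with
# these n-gram probabilities, weighted by i_weights over the orders listed
# in i_length, to smooth and re-rank the superposed drafts.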
def decode(tokenizer, encoding):
    """
    Decode a token tensor into text, truncating at the first EOS token.

    Args:
        tokenizer (Tokenizer): Llama tokenizer
        encoding (torch.Tensor): 1-D tensor of token ids

    Returns:
        str: decoded text
    """
    eos_locs = (encoding == tokenizer.eos_id).nonzero()
    if len(eos_locs) > 0:
        encoding = encoding[:eos_locs[0]]
    return tokenizer.decode(encoding.to(torch.int32).tolist())
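# Hypothetical example (illustrative token ids, assuming Llama-2's tokenizer
# where eos_id == 2):
# decode(tokenizer, torch.tensor([306, 626, 263, 2, 0, 0])) returns only the
# text for the tokens preceding the EOS, dropping the trailing padding.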
@spaces.GPU
def update_options(input, num_tokens):
    # Generate three continuations of the current text via superposed decoding
    tokenized_prompts = tokenizer.encode([input], True, False)
    print("Processed prompt")
    # Move the model onto the GPU allocated for this @spaces.GPU call
    model.model.to("cuda")
    model.model.device = "cuda"
    alive_gens, _ = model.sup_generate(prompt_tokens=tokenized_prompts,
                                       smoothing="geom",
                                       max_gen_len=num_tokens,
                                       n_token_sample=n_token_sample,
                                       alpha=alpha,
                                       temp=temp,
                                       n_drafts=n_drafts,
                                       i_weights=i_weights,
                                       i_length=i_length,
                                       ngrams=ngrams,
                                       get_time=False,
                                       penalty=200)
    print("Generated")
    # Decode the first three drafts, one per option button (assumes n_drafts >= 3)
    gens = alive_gens[0].reshape(n_drafts, -1)
    return decode(tokenizer, gens[0]), decode(tokenizer, gens[1]), decode(tokenizer, gens[2])
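# A minimal local sanity check (hypothetical prompt; assumes the model and
# n-gram checkpoints above loaded successfully):
# opt1, opt2, opt3 = update_options("The weather today is", 10)
# print(opt1, opt2, opt3, sep="\n")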
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Superposed Decoding
        Start typing below to see suggestions.
        """)
    slider = gr.Slider(minimum=1, maximum=10, step=1, label="Generation length", value=10)
    inp = gr.Textbox(placeholder="Type anything!", lines=3)
    option1 = gr.Button(value="Option 1")
    option2 = gr.Button(value="Option 2")
    option3 = gr.Button(value="Option 3")
    # Regenerate the three suggestions whenever the text changes
    inp.change(update_options, inputs=[inp, slider], outputs=[option1, option2, option3])
    # Button updates: clicking a suggestion appends it to the prompt
    @option1.click(inputs=[inp, option1], outputs=inp)
    def option1_click(curr, txt):
        return curr + txt

    @option2.click(inputs=[inp, option2], outputs=inp)
    def option2_click(curr, txt):
        return curr + txt

    @option3.click(inputs=[inp, option3], outputs=inp)
    def option3_click(curr, txt):
        return curr + txt
if __name__ == "__main__":
    demo.launch(share=True)
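# Note: on a ZeroGPU Space, @spaces.GPU allocates a GPU for each call to
# update_options; running this file locally instead requires a CUDA device,
# since the model is built with device="cuda".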