Spaces:
Running
Running
File size: 5,598 Bytes
4a2c956 e63ee0a 7c790c0 61b9ff7 a00d592 4a2c956 7c790c0 a00d592 7c790c0 a00d592 7c790c0 bd8572f e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 7c790c0 e63ee0a 7c790c0 ea2eccb 7c790c0 0886a44 7c790c0 e63ee0a 29c4970 0083156 7c790c0 2d1a42e d2d01b7 2d1a42e 7c790c0 e63ee0a a00d592 e63ee0a bd8572f e63ee0a d2d01b7 a00d592 e63ee0a 2d1a42e e63ee0a 2d1a42e 7c790c0 e63ee0a 7c790c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import gradio as gr
import os, gc, copy, torch
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *
nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)
ctx_limit = 3000
title = "RWKV-5-World-1.5B-v2-OnlyForTest_56%_trained-20231013-ctx4096"
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
from rwkv.model import RWKV
model_path = hf_hub_download(repo_id="BlinkDL/temp", filename=f"{title}.pth")
model = RWKV(model=model_path, strategy='cuda fp16')
from rwkv.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
def generate_prompt(instruction, input=""):
instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
if input:
return f"""Instruction: {instruction}
Input: {input}
Response:"""
else:
return f"""User: hi
Assistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.
User: {instruction}
Assistant:"""
def evaluate(
ctx,
token_count=200,
temperature=1.0,
top_p=0.7,
presencePenalty = 0.1,
countPenalty = 0.1,
):
args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
alpha_frequency = countPenalty,
alpha_presence = presencePenalty,
token_ban = [], # ban the generation of some tokens
token_stop = [0]) # stop generation whenever you see any token here
all_tokens = []
out_last = 0
out_str = ''
occurrence = {}
state = None
for i in range(int(token_count)):
out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
for n in occurrence:
out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
if token in args.token_stop:
break
all_tokens += [token]
for xxx in occurrence:
occurrence[xxx] *= 0.996
if token not in occurrence:
occurrence[token] = 1
else:
occurrence[token] += 1
tmp = pipeline.decode(all_tokens[out_last:])
if '\ufffd' not in tmp:
out_str += tmp
yield out_str.strip()
out_last = i + 1
gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
del out
del state
gc.collect()
torch.cuda.empty_cache()
yield out_str.strip()
examples = [
[generate_prompt("Tell me about ravens."), 500, 1, 0.5, 0.4, 0.4],
[generate_prompt("Write a python function to mine 1 BTC."), 500, 1, 0.5, 0.4, 0.4],
[generate_prompt("Write a song about ravens."), 500, 1, 0.5, 0.4, 0.4],
[generate_prompt("Write a story using the following information", "A man named Alex chops a tree down"), 500, 1, 0.5, 0.4, 0.4],
[generate_prompt("Generate a list of adjectives that describe a person as brave."), 500, 1, 0.5, 0.4, 0.4],
[generate_prompt("You have $100, and your goal is to turn that into as much money as possible with AI and Machine Learning. Please respond with detailed plan."), 500, 1, 0.5, 0.4, 0.4],
]
##########################################################################
with gr.Blocks(title=title) as demo:
gr.HTML(f"<div style=\"text-align: center;\">\n<h1>RWKV-5 World v2 - {title}</h1>\n</div>")
with gr.Tab("Instruct mode"):
gr.Markdown(f"This is a ***1.5B*** (!!!) [RWKV-5 World v2](https://huggingface.co/BlinkDL/rwkv-5-world), which is 100% RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM) and attention-free. *** Please try examples first (bottom of page) *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}. For best results, *** keep you prompt short and clear ***.")
with gr.Row():
with gr.Column():
instruction = gr.Textbox(lines=2, label="Prompt", value=generate_prompt("Tell me about ravens."))
token_count = gr.Slider(10, 500, label="Max Tokens", step=10, value=500)
temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.5)
presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0.4)
count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=0.4)
with gr.Column():
with gr.Row():
submit = gr.Button("Submit", variant="primary")
clear = gr.Button("Clear", variant="secondary")
output = gr.Textbox(label="Output", lines=5)
data = gr.Dataset(components=[prompt, token_count, temperature, top_p, presence_penalty, count_penalty], samples=examples, label="Example Instructions", headers=["Prompt", "Max Tokens", "Temperature", "Top P", "Presence Penalty", "Count Penalty"])
submit.click(evaluate, [prompt, token_count, temperature, top_p, presence_penalty, count_penalty], [output])
clear.click(lambda: None, [], [output])
data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])
demo.queue(concurrency_count=1, max_size=10)
demo.launch(share=False)
|