BlinkDL committed on
Commit
61b9ff7
·
1 Parent(s): f0b8656

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -4
app.py CHANGED
@@ -2,8 +2,11 @@ import gradio as gr
2
  import os
3
  from datetime import datetime
4
  from huggingface_hub import hf_hub_download
 
 
 
5
 
6
- title = "RWKV-4 14B fp16 ctx4096"
7
  desc = '''Links:
8
  <a href='https://github.com/BlinkDL/ChatRWKV' target="_blank" style="margin:0 1em">ChatRWKV</a>
9
  <a href='https://github.com/BlinkDL/RWKV-LM' target="_blank" style="margin:0 1em">RWKV-LM</a>
@@ -14,8 +17,10 @@ os.environ["RWKV_JIT_ON"] = '1'
14
  os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
15
 
16
  from rwkv.model import RWKV
17
- model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename="RWKV-4-Pile-14B-20230227-ctx4096-test503.pth")
18
- model = RWKV(model=model_path, strategy='cuda fp16 *32 -> cpu fp32')
 
 
19
  from rwkv.utils import PIPELINE, PIPELINE_ARGS
20
  pipeline = PIPELINE(model, "20B_tokenizer.json")
21
 
@@ -39,13 +44,16 @@ def infer(
39
  else:
40
  ctx = f'\n{ctx.strip()}'
41
 
 
 
 
42
  all_tokens = []
43
  out_last = 0
44
  out_str = ''
45
  occurrence = {}
46
  state = None
47
  for i in range(int(token_count)):
48
- out, state = model.forward(pipeline.encode(ctx)[:4096] if i == 0 else [token], state)
49
  for n in args.token_ban:
50
  out[n] = -float('inf')
51
  for n in occurrence:
 
2
  import os
3
  from datetime import datetime
4
  from huggingface_hub import hf_hub_download
5
+ from pynvml import *
6
+ nvmlInit()
7
+ gpu_h = nvmlDeviceGetHandleByIndex(0)
8
 
9
+ title = "RWKV-4 14B fp16 ctx1024"
10
  desc = '''Links:
11
  <a href='https://github.com/BlinkDL/ChatRWKV' target="_blank" style="margin:0 1em">ChatRWKV</a>
12
  <a href='https://github.com/BlinkDL/RWKV-LM' target="_blank" style="margin:0 1em">RWKV-LM</a>
 
17
  os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
18
 
19
  from rwkv.model import RWKV
20
+ # model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename="RWKV-4-Pile-14B-20230213-8019.pth")
21
+ # model = RWKV(model=model_path, strategy='cuda fp16 *34 -> cpu fp32')
22
+ model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-169m", filename="RWKV-4-Pile-169M-20220807-8023.pth")
23
+ model = RWKV(model=model_path, strategy='cuda fp16')
24
  from rwkv.utils import PIPELINE, PIPELINE_ARGS
25
  pipeline = PIPELINE(model, "20B_tokenizer.json")
26
 
 
44
  else:
45
  ctx = f'\n{ctx.strip()}'
46
 
47
+ gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
48
+ print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
49
+
50
  all_tokens = []
51
  out_last = 0
52
  out_str = ''
53
  occurrence = {}
54
  state = None
55
  for i in range(int(token_count)):
56
+ out, state = model.forward(pipeline.encode(ctx)[:1024] if i == 0 else [token], state)
57
  for n in args.token_ban:
58
  out[n] = -float('inf')
59
  for n in occurrence: