austinsilveria committed
Commit
33ad5e9
1 Parent(s): 4296400

take out the trash

Files changed (1)
  1. app.py +49 -33
app.py CHANGED
@@ -1,4 +1,6 @@
 from threading import Thread
+import gc
+import time
 
 import streamlit as st
 
@@ -7,43 +9,57 @@ from transformers import AutoTokenizer, TextIteratorStreamer, set_seed
 from modeling_tricksy import TricksyOPTForCausalLM, OPTDiskWeights
 from configuration_tricksy import TricksyConfig
 
-def generate():
-    set_seed(42)
-
-    # 13.4 GB (16 bit)
-    model_name = 'facebook/opt-6.7b'
-    disk_weights = OPTDiskWeights(model_name)
-    tricksy_model = TricksyOPTForCausalLM(TricksyConfig(disk_weights.config, full_offload=(not use_tricksy)), disk_weights)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-
-    inputs = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
-
-    print()
-    generation_kwargs = dict(inputs=inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p)
-    thread = Thread(target=tricksy_model.generate, kwargs=generation_kwargs)
-    thread.start()
-    generated_text = ''
-    with st.chat_message("user"):
-        t = st.empty()
-        for new_text in streamer:
-            generated_text += new_text.replace('\n', ' \n')
-            t.write(generated_text)
-
-    stats_text = f'Decoding tok/s: {1 / (sum(tricksy_model.tricksy_context.forward_times[1:]) / (len(tricksy_model.tricksy_context.forward_times) - 1))}'
-    stats_text += f' \nCurrent GPU mem usage: {torch.cuda.memory_allocated("cuda") / 1024 ** 3} GB'
-    stats_text += f' \nMax GPU mem usage: {torch.cuda.max_memory_allocated("cuda") / 1024 ** 3} GB'
-    st.write(stats_text)
+if 'submit' in st.session_state and st.session_state.submit == True:
+    st.session_state.generating = True
+else:
+    st.session_state.generating = False
 
 prompt = st.text_area('Prompt', 'Making pesto from scratch can be done with these ingredients in 4 simple steps:\nStep 1')
 
 col1, col2 = st.columns(2)
-with col1:
-    submit = st.button('Submit', on_click=generate)
-with col2:
-    use_tricksy = st.toggle('Use Tricksy', True, help='If true, only send sparse MLP weight diffs to GPU. If false, send all weights to GPU.')
 
 with st.expander('Additional options'):
-    max_new_tokens = st.slider('Max new tokens', 1, 500, 100)
+    max_new_tokens = st.slider('Max new tokens', 1, 500, 50)
     top_k = st.slider('Top-k sampling', 1, 500, 50)
-    top_p = st.slider('Top-p (nucleus sampling)', 0.0, 1.0, .9)
+    top_p = st.slider('Top-p (nucleus sampling)', 0.0, 1.0, .9)
+
+out = st.chat_message('user')
+stats = st.empty()
+
+with col1:
+    use_tricksy = st.toggle('Use Tricksy', True, help='If true, only send sparse MLP weight diffs to GPU. If false, send all weights to GPU.')
+with col2:
+    if st.button('Submit', disabled=st.session_state.generating, key='submit'):
+        set_seed(42)
+        # 13.4 GB (16 bit)
+        model_name = 'facebook/opt-6.7b'
+        disk_weights = OPTDiskWeights(model_name)
+        tricksy_model = TricksyOPTForCausalLM(TricksyConfig(disk_weights.config, full_offload=(not use_tricksy)), disk_weights)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+        inputs = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
+
+        print()
+        generation_kwargs = dict(inputs=inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p)
+        thread = Thread(target=tricksy_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        generated_text = ''
+        with out:
+            t = st.empty()
+            for new_text in streamer:
+                generated_text += new_text.replace('\n', ' \n')
+                t.write(generated_text)
+
+        stats_text = f'Decoding tok/s: {1 / (sum(tricksy_model.tricksy_context.forward_times[1:]) / (len(tricksy_model.tricksy_context.forward_times) - 1))}'
+        stats_text += f' \nCurrent GPU mem usage: {torch.cuda.memory_allocated("cuda") / 1024 ** 3} GB'
+        stats_text += f' \nMax GPU mem usage: {torch.cuda.max_memory_allocated("cuda") / 1024 ** 3} GB'
+        stats.write(stats_text)
+
+        disk_weights = None
+        tricksy_model = None
+        time.sleep(.2)
+        # st.write(f'num open files: {len(psutil.Process().open_files())}')
+        torch.cuda.empty_cache()
+        gc.collect()
+        torch.cuda.reset_peak_memory_stats()
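The session-state check added at the top of the script is what lets the Submit button disable itself while a generation is running: a button created with key='submit' leaves its clicked value in st.session_state, so the rerun triggered by the click sees the flag and renders the button disabled. A minimal standalone sketch of that idiom, with an illustrative 'run' key and a time.sleep standing in for generation:

import time

import streamlit as st

# A button created with a key leaves its clicked value in st.session_state,
# so the rerun triggered by the click can see it and disable the button.
if 'run' in st.session_state and st.session_state.run:
    st.session_state.generating = True
else:
    st.session_state.generating = False

if st.button('Run', disabled=st.session_state.generating, key='run'):
    time.sleep(2)  # stand-in for the long-running generate() call
    st.write('Done')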
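For context, the generation block keeps the repo's existing streaming pattern: generate() runs on a worker thread while the main thread drains a TextIteratorStreamer and writes the partial text to the page. A minimal sketch of that pattern outside Streamlit, using the small stock facebook/opt-125m model as a stand-in for TricksyOPTForCausalLM:

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = 'facebook/opt-125m'  # small stand-in for the 6.7B Tricksy model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
inputs = tokenizer('Making pesto from scratch', return_tensors='pt').input_ids

# generate() blocks until decoding finishes, so it runs on a worker thread
# while the main thread iterates the streamer and prints tokens as they arrive.
thread = Thread(target=model.generate, kwargs=dict(
    inputs=inputs, streamer=streamer, max_new_tokens=30, do_sample=True, top_k=50, top_p=0.9))
thread.start()
for new_text in streamer:
    print(new_text, end='', flush=True)
thread.join()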
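The trailing cleanup is the "trash" the commit message refers to: the model references are dropped, the CUDA caching allocator is emptied, garbage is collected, and the peak-memory counter is reset so the stats reported on the next run start fresh. A minimal sketch of that cleanup as a helper (the function name, the CUDA-availability guard, and running gc.collect() before empty_cache() are illustrative choices, not part of the app):

import gc

import torch

def reclaim_gpu_memory():
    """Release cached CUDA memory and reset peak stats between runs."""
    gc.collect()  # collect Python objects that may still hold tensor references
    if torch.cuda.is_available():
        torch.cuda.empty_cache()              # return cached, unused blocks to the driver
        torch.cuda.reset_peak_memory_stats()  # restart peak-usage accounting for the next run

# Usage, following the diff: drop the large references first, then reclaim.
# disk_weights = None
# tricksy_model = None
# reclaim_gpu_memory()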