Asankhaya Sharma committed on
Commit
fc4600f
·
1 Parent(s): c4709fb

add the trained model

Browse files
Files changed (4) hide show
  1. app.py +13 -6
  2. config.json +39 -0
  3. pytorch_model.bin +3 -0
  4. tokenizer.json +0 -0
app.py CHANGED
@@ -3,7 +3,7 @@ import streamlit as st
3
 
4
  from transformers import AutoModelForCausalLM, AutoTokenizer
5
 
6
- checkpoint = "gpt2-large"
7
 
8
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
9
  @st.cache
@@ -13,10 +13,11 @@ def load_model(model_name):
13
 
14
  model = load_model(checkpoint)
15
 
16
- def infer(input_ids, max_tokens, temperature, top_k, top_p):
17
 
18
  output_sequences = model.generate(
19
  input_ids=input_ids,
 
20
  max_new_tokens=max_tokens,
21
  temperature=temperature,
22
  top_k=top_k,
@@ -39,18 +40,24 @@ st.write("This is a LLM that was fine-tuned on a dataset of investment memos to
39
 
40
  sent = st.text_area("Text", default_value, height = 400)
41
 
42
- max_tokens = st.sidebar.slider("Max Tokens", min_value = 32, max_value=512)
43
  temperature = st.sidebar.slider("Temperature", value = 0.8, min_value = 0.0, max_value=1.0, step=0.05)
44
  top_k = st.sidebar.slider("Top-k", min_value = 0, max_value=5, value = 4)
45
  top_p = st.sidebar.slider("Top-p", min_value = 0.0, max_value=1.0, step = 0.05, value = 0.9)
46
 
47
- encoded_prompt = tokenizer.encode(tokenizer.eos_token+sent, add_special_tokens=False, return_tensors="pt")
 
 
 
 
 
 
48
  if encoded_prompt.size()[-1] == 0:
49
  input_ids = None
50
  else:
51
  input_ids = encoded_prompt
52
 
53
- output_sequences = infer(input_ids, max_tokens, temperature, top_k, top_p)
54
 
55
  for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
56
  print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
@@ -70,4 +77,4 @@ for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
70
  generated_sequences.append(total_sequence)
71
  print(total_sequence)
72
 
73
- st.write(generated_sequences[-1])
 
3
 
4
  from transformers import AutoModelForCausalLM, AutoTokenizer
5
 
6
+ checkpoint = "."
7
 
8
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
9
  @st.cache
 
13
 
14
  model = load_model(checkpoint)
15
 
16
+ def infer(input_ids, bad_words_ids, max_tokens, temperature, top_k, top_p):
17
 
18
  output_sequences = model.generate(
19
  input_ids=input_ids,
20
+ bad_words_ids = bad_words_ids,
21
  max_new_tokens=max_tokens,
22
  temperature=temperature,
23
  top_k=top_k,
 
40
 
41
  sent = st.text_area("Text", default_value, height = 400)
42
 
43
+ max_tokens = st.sidebar.slider("Max Tokens", min_value = 16, max_value=64)
44
  temperature = st.sidebar.slider("Temperature", value = 0.8, min_value = 0.0, max_value=1.0, step=0.05)
45
  top_k = st.sidebar.slider("Top-k", min_value = 0, max_value=5, value = 4)
46
  top_p = st.sidebar.slider("Top-p", min_value = 0.0, max_value=1.0, step = 0.05, value = 0.9)
47
 
48
+ # print(model.config.max_position_embeddings)
49
+
50
+ encoded_prompt = tokenizer.encode(tokenizer.eos_token+sent, max_length=1024, return_tensors="pt", truncation=True)
51
+
52
+ # get tokens of words that should not be generated
53
+ bad_words_ids = tokenizer(["confidential", "angel.co", "angellist.com"], add_special_tokens=False).input_ids
54
+
55
  if encoded_prompt.size()[-1] == 0:
56
  input_ids = None
57
  else:
58
  input_ids = encoded_prompt
59
 
60
+ output_sequences = infer(input_ids, bad_words_ids, max_tokens, temperature, top_k, top_p)
61
 
62
  for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
63
  print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
 
77
  generated_sequences.append(total_sequence)
78
  print(total_sequence)
79
 
80
+ st.markdown(generated_sequences[-1])
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2-large",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 1280,
16
+ "n_head": 20,
17
+ "n_inner": null,
18
+ "n_layer": 36,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.27.0.dev0",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90483692644e017cc03a2b5470912ab80369b4a79deb4f031e3fced773988bbb
3
+ size 3134031497
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff