gpt-99 committed on
Commit
a090ab6
·
verified ·
1 Parent(s): 40cf5e7

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -9
app.py CHANGED
@@ -47,6 +47,7 @@ def do_steering(model, test_toks, steering_vec, scale=1, normalise=True, layer=N
47
  sv = steering_vec / steering_vec.norm()
48
  else:
49
  sv = steering_vec
 
50
  if proj:
51
  sv = einsum(input[0], sv.view(-1,1), 'b l h, h s -> b l s') * sv
52
  input[0][:,:,:] = input[0][:,:,:] - scale * sv
@@ -80,9 +81,8 @@ def create_steering_vector(towards, away):
80
  return steering_vecs
81
 
82
  def chat(message, history, steering_vec, layer):
83
- history_formatted = [{"role": "user" if i % 2 == 0 else "assistant", "content": msg} for i, msg in enumerate(history)]
84
- history_formatted.append({"role": "user", "content": message})
85
-
86
  input_ids = tokenize_instructions(tokenizer, [history_formatted])
87
 
88
  generations_baseline = do_steering(model, input_ids.to(device), None)
@@ -90,11 +90,14 @@ def chat(message, history, steering_vec, layer):
90
  response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True, layer=layer)}"
91
 
92
  if steering_vec is not None:
93
- generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=1)
94
  for j in range(generation_intervene.shape[0]):
95
  response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
96
 
97
- response = response_baseline + "\n\n" + response_intervention
 
 
 
98
 
99
  return [(message, response)]
100
 
@@ -106,6 +109,45 @@ def launch_app():
106
  away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
107
 
108
  towards_default = ['love','i love this', 'loving the', 'lover', 'loving', 'loved in']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  with gr.Row():
111
  towards = gr.Textbox(label="Towards (comma-separated)", value= ", ".join(sentence.replace(",", "") for sentence in towards_default))
@@ -132,7 +174,3 @@ def launch_app():
132
  if __name__ == "__main__":
133
  launch_app()
134
 
135
-
136
- # clean up
137
- # nicer baseline vs intervention
138
- # auto clear after messgae
 
47
  sv = steering_vec / steering_vec.norm()
48
  else:
49
  sv = steering_vec
50
+ sv = torch.clamp(sv, min=-1e3, max=1e3)
51
  if proj:
52
  sv = einsum(input[0], sv.view(-1,1), 'b l h, h s -> b l s') * sv
53
  input[0][:,:,:] = input[0][:,:,:] - scale * sv
 
81
  return steering_vecs
82
 
83
  def chat(message, history, steering_vec, layer):
84
+ history_formatted = [{"role": "user", "content": message}]
85
+
 
86
  input_ids = tokenize_instructions(tokenizer, [history_formatted])
87
 
88
  generations_baseline = do_steering(model, input_ids.to(device), None)
 
90
  response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True, layer=layer)}"
91
 
92
  if steering_vec is not None:
93
+ generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=0.5)
94
  for j in range(generation_intervene.shape[0]):
95
  response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
96
 
97
+ response = response_baseline
98
+
99
+ if steering_vec is not None:
100
+ response += "\n\n" + response_intervention
101
 
102
  return [(message, response)]
103
 
 
109
  away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
110
 
111
  towards_default = ['love','i love this', 'loving the', 'lover', 'loving', 'loved in']
112
+
113
+ instructions = """
114
+ ### Instructions for Using the Steering Chatbot
115
+
116
+ Welcome to the Steering Chatbot! This app allows you to explore how language models can be guided (or "steered")
117
+ to generate different types of responses. You will be able to create **steering vectors** that influence the chatbot to either generate responses
118
+ that favor one set of ideas (like "love") or avoid another set (like "hate").
119
+
120
+ #### How to Use the App:
121
+
122
+ 1. **Define Your "Towards" and "Away" Directions:**
123
+ - In the **"Towards"** text box, enter a list of concepts, words, or phrases (comma-separated) that you want the model to generate responses toward.
124
+ For example, you might use: `love, happiness, kindness`.
125
+ - In the **"Away"** text box, enter a list of concepts, words, or phrases that you want the model to steer away from.
126
+ For example: `hate, anger, sadness`.
127
+
128
+ 2. **Create a Steering Vector:**
129
+ - Click the **"Create Steering Vector"** button to generate a vector that will nudge the model’s responses.
130
+ This vector will attempt to shift the model’s behavior towards the concepts in the "Towards" box and away from the concepts in the "Away" box.
131
+ - You can also adjust the **layer slider** to choose which layer of the model the steering vector will affect.
132
+
133
+ 3. **Chat with the Model:**
134
+ - Type a message in the chatbox and press Enter. The model will generate two responses:
135
+ - **Baseline Response:** This is the model’s response without any steering vector applied.
136
+ - **Intervention Response:** This is the response after applying the steering vector.
137
+
138
+ 4. **Compare Results:**
139
+ - The chatbot will show both the baseline (non-steered) and the intervention (steered) responses.
140
+ You can compare them to see how much influence the steering vector had on the generated text.
141
+
142
+ **Tips:**
143
+ - Try experimenting with different word sets for "Towards" and "Away" to see how it affects the chatbot's behavior.
144
+ - Adjusting the **layer slider** allows you to control at which stage of the model's processing the steering vector is applied,
145
+ which can lead to different types of modifications in the output.
146
+
147
+ Happy chatting!
148
+ """
149
+
150
+ instruction_dropdown = gr.Markdown(instructions)
151
 
152
  with gr.Row():
153
  towards = gr.Textbox(label="Towards (comma-separated)", value= ", ".join(sentence.replace(",", "") for sentence in towards_default))
 
174
  if __name__ == "__main__":
175
  launch_app()
176