Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
@@ -47,6 +47,7 @@ def do_steering(model, test_toks, steering_vec, scale=1, normalise=True, layer=N
|
|
47 |
sv = steering_vec / steering_vec.norm()
|
48 |
else:
|
49 |
sv = steering_vec
|
|
|
50 |
if proj:
|
51 |
sv = einsum(input[0], sv.view(-1,1), 'b l h, h s -> b l s') * sv
|
52 |
input[0][:,:,:] = input[0][:,:,:] - scale * sv
|
@@ -80,9 +81,8 @@ def create_steering_vector(towards, away):
|
|
80 |
return steering_vecs
|
81 |
|
82 |
def chat(message, history, steering_vec, layer):
|
83 |
-
history_formatted = [{"role": "user"
|
84 |
-
|
85 |
-
|
86 |
input_ids = tokenize_instructions(tokenizer, [history_formatted])
|
87 |
|
88 |
generations_baseline = do_steering(model, input_ids.to(device), None)
|
@@ -90,11 +90,14 @@ def chat(message, history, steering_vec, layer):
|
|
90 |
response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True, layer=layer)}"
|
91 |
|
92 |
if steering_vec is not None:
|
93 |
-
generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=
|
94 |
for j in range(generation_intervene.shape[0]):
|
95 |
response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
|
96 |
|
97 |
-
response = response_baseline
|
|
|
|
|
|
|
98 |
|
99 |
return [(message, response)]
|
100 |
|
@@ -106,6 +109,45 @@ def launch_app():
|
|
106 |
away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
|
107 |
|
108 |
towards_default = ['love','i love this', 'loving the', 'lover', 'loving', 'loved in']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
with gr.Row():
|
111 |
towards = gr.Textbox(label="Towards (comma-separated)", value= ", ".join(sentence.replace(",", "") for sentence in towards_default))
|
@@ -132,7 +174,3 @@ def launch_app():
|
|
132 |
if __name__ == "__main__":
|
133 |
launch_app()
|
134 |
|
135 |
-
|
136 |
-
# clean up
|
137 |
-
# nicer baseline vs intervention
|
138 |
-
# auto clear after message
|
|
|
47 |
sv = steering_vec / steering_vec.norm()
|
48 |
else:
|
49 |
sv = steering_vec
|
50 |
+
sv = torch.clamp(sv, min=-1e3, max=1e3)
|
51 |
if proj:
|
52 |
sv = einsum(input[0], sv.view(-1,1), 'b l h, h s -> b l s') * sv
|
53 |
input[0][:,:,:] = input[0][:,:,:] - scale * sv
|
|
|
81 |
return steering_vecs
|
82 |
|
83 |
def chat(message, history, steering_vec, layer):
|
84 |
+
history_formatted = [{"role": "user", "content": message}]
|
85 |
+
|
|
|
86 |
input_ids = tokenize_instructions(tokenizer, [history_formatted])
|
87 |
|
88 |
generations_baseline = do_steering(model, input_ids.to(device), None)
|
|
|
90 |
response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True, layer=layer)}"
|
91 |
|
92 |
if steering_vec is not None:
|
93 |
+
generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=0.5)
|
94 |
for j in range(generation_intervene.shape[0]):
|
95 |
response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
|
96 |
|
97 |
+
response = response_baseline
|
98 |
+
|
99 |
+
if steering_vec is not None:
|
100 |
+
response += "\n\n" + response_intervention
|
101 |
|
102 |
return [(message, response)]
|
103 |
|
|
|
109 |
away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
|
110 |
|
111 |
towards_default = ['love','i love this', 'loving the', 'lover', 'loving', 'loved in']
|
112 |
+
|
113 |
+
instructions = """
|
114 |
+
### Instructions for Using the Steering Chatbot
|
115 |
+
|
116 |
+
Welcome to the Steering Chatbot! This app allows you to explore how language models can be guided (or "steered")
|
117 |
+
to generate different types of responses. You will be able to create **steering vectors** that influence the chatbot to either generate responses
|
118 |
+
that favor one set of ideas (like "love") or avoid another set (like "hate").
|
119 |
+
|
120 |
+
#### How to Use the App:
|
121 |
+
|
122 |
+
1. **Define Your "Towards" and "Away" Directions:**
|
123 |
+
- In the **"Towards"** text box, enter a list of concepts, words, or phrases (comma-separated) that you want the model to generate responses toward.
|
124 |
+
For example, you might use: `love, happiness, kindness`.
|
125 |
+
- In the **"Away"** text box, enter a list of concepts, words, or phrases that you want the model to steer away from.
|
126 |
+
For example: `hate, anger, sadness`.
|
127 |
+
|
128 |
+
2. **Create a Steering Vector:**
|
129 |
+
- Click the **"Create Steering Vector"** button to generate a vector that will nudge the model’s responses.
|
130 |
+
This vector will attempt to shift the model’s behavior towards the concepts in the "Towards" box and away from the concepts in the "Away" box.
|
131 |
+
- You can also adjust the **layer slider** to choose which layer of the model the steering vector will affect.
|
132 |
+
|
133 |
+
3. **Chat with the Model:**
|
134 |
+
- Type a message in the chatbox and press Enter. The model will generate two responses:
|
135 |
+
- **Baseline Response:** This is the model’s response without any steering vector applied.
|
136 |
+
- **Intervention Response:** This is the response after applying the steering vector.
|
137 |
+
|
138 |
+
4. **Compare Results:**
|
139 |
+
- The chatbot will show both the baseline (non-steered) and the intervention (steered) responses.
|
140 |
+
You can compare them to see how much influence the steering vector had on the generated text.
|
141 |
+
|
142 |
+
**Tips:**
|
143 |
+
- Try experimenting with different word sets for "Towards" and "Away" to see how it affects the chatbot's behavior.
|
144 |
+
- Adjusting the **layer slider** allows you to control at which stage of the model's processing the steering vector is applied,
|
145 |
+
which can lead to different types of modifications in the output.
|
146 |
+
|
147 |
+
Happy chatting!
|
148 |
+
"""
|
149 |
+
|
150 |
+
instruction_dropdown = gr.Markdown(instructions)
|
151 |
|
152 |
with gr.Row():
|
153 |
towards = gr.Textbox(label="Towards (comma-separated)", value= ", ".join(sentence.replace(",", "") for sentence in towards_default))
|
|
|
174 |
if __name__ == "__main__":
|
175 |
launch_app()
|
176 |
|
|
|
|
|
|
|
|