Spaces:

gpt-99
/

steering-vectors

Running

App Files Files Community

gpt-99 committed on Oct 17, 2024

Commit

a090ab6

verified ·

1 Parent(s): 40cf5e7

Upload app.py

Browse files

Files changed (1) hide show

app.py +47 -9

app.py CHANGED Viewed

@@ -47,6 +47,7 @@ def do_steering(model, test_toks, steering_vec, scale=1, normalise=True, layer=N
                 sv = steering_vec / steering_vec.norm()
             else:
                 sv = steering_vec
             if proj:
                 sv = einsum(input[0], sv.view(-1,1), 'b l h, h s -> b l s') * sv
             input[0][:,:,:] = input[0][:,:,:] - scale * sv
@@ -80,9 +81,8 @@ def create_steering_vector(towards, away):
     return steering_vecs
 def chat(message, history, steering_vec, layer):
-    history_formatted = [{"role": "user" if i % 2 == 0 else "assistant", "content": msg} for i, msg in enumerate(history)]
-    history_formatted.append({"role": "user", "content": message})
     input_ids = tokenize_instructions(tokenizer, [history_formatted])
     generations_baseline = do_steering(model, input_ids.to(device), None)
@@ -90,11 +90,14 @@ def chat(message, history, steering_vec, layer):
         response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True, layer=layer)}"
     if steering_vec is not None:
-        generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=1)
         for j in range(generation_intervene.shape[0]):
             response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
-    response = response_baseline + "\n\n" + response_intervention
     return [(message, response)]
@@ -106,6 +109,45 @@ def launch_app():
         away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
         towards_default = ['love','i love this', 'loving the', 'lover', 'loving', 'loved in']
         with gr.Row():
             towards = gr.Textbox(label="Towards (comma-separated)", value= ", ".join(sentence.replace(",", "") for sentence in towards_default))
@@ -132,7 +174,3 @@ def launch_app():
 if __name__ == "__main__":
     launch_app()
-    # clean up
-    # nicer baseline vs intervention
-    # auto clear after messgae

                 sv = steering_vec / steering_vec.norm()
             else:
                 sv = steering_vec
+            sv = torch.clamp(sv, min=-1e3, max=1e3)
             if proj:
                 sv = einsum(input[0], sv.view(-1,1), 'b l h, h s -> b l s') * sv
             input[0][:,:,:] = input[0][:,:,:] - scale * sv
     return steering_vecs
 def chat(message, history, steering_vec, layer):
+    history_formatted = [{"role": "user", "content": message}]
     input_ids = tokenize_instructions(tokenizer, [history_formatted])
     generations_baseline = do_steering(model, input_ids.to(device), None)
         response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True, layer=layer)}"
     if steering_vec is not None:
+        generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=0.5)
         for j in range(generation_intervene.shape[0]):
             response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
+    response = response_baseline
+    if steering_vec is not None:
+        response += "\n\n" + response_intervention
     return [(message, response)]
         away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
         towards_default = ['love','i love this', 'loving the', 'lover', 'loving', 'loved in']
+        instructions = """
+        ### Instructions for Using the Steering Chatbot
+        Welcome to the Steering Chatbot! This app allows you to explore how language models can be guided (or "steered")
+        to generate different types of responses. You will be able to create **steering vectors** that influence the chatbot to either generate responses
+        that favor one set of ideas (like "love") or avoid another set (like "hate").
+        #### How to Use the App:
+        1. **Define Your "Towards" and "Away" Directions:**
+           - In the **"Towards"** text box, enter a list of concepts, words, or phrases (comma-separated) that you want the model to generate responses toward.
+             For example, you might use: `love, happiness, kindness`.
+           - In the **"Away"** text box, enter a list of concepts, words, or phrases that you want the model to steer away from.
+             For example: `hate, anger, sadness`.
+        2. **Create a Steering Vector:**
+           - Click the **"Create Steering Vector"** button to generate a vector that will nudge the model’s responses.
+             This vector will attempt to shift the model’s behavior towards the concepts in the "Towards" box and away from the concepts in the "Away" box.
+           - You can also adjust the **layer slider** to choose which layer of the model the steering vector will affect.
+        3. **Chat with the Model:**
+           - Type a message in the chatbox and press Enter. The model will generate two responses:
+             - **Baseline Response:** This is the model’s response without any steering vector applied.
+             - **Intervention Response:** This is the response after applying the steering vector.
+        4. **Compare Results:**
+           - The chatbot will show both the baseline (non-steered) and the intervention (steered) responses.
+             You can compare them to see how much influence the steering vector had on the generated text.
+        **Tips:**
+        - Try experimenting with different word sets for "Towards" and "Away" to see how it affects the chatbot's behavior.
+        - Adjusting the **layer slider** allows you to control at which stage of the model's processing the steering vector is applied,
+          which can lead to different types of modifications in the output.
+        Happy chatting!
+        """
+        instruction_dropdown = gr.Markdown(instructions)
         with gr.Row():
             towards = gr.Textbox(label="Towards (comma-separated)", value= ", ".join(sentence.replace(",", "") for sentence in towards_default))
 if __name__ == "__main__":
     launch_app()