File size: 2,672 Bytes
b39bf41
e6a278a
b39bf41
40cb650
44d4508
e6a278a
 
40cb650
c919b63
 
 
40cb650
c919b63
40cb650
c919b63
 
 
40cb650
 
c919b63
 
 
d327800
c919b63
 
9f39b0a
 
 
0dc64e7
c919b63
40cb650
 
 
a6118a8
 
aedd95e
40cb650
a1f1337
 
 
 
 
 
 
 
40cb650
 
 
c919b63
 
c774e37
c919b63
40cb650
e736161
44d4508
dd4f0aa
f826889
98aa0f0
f826889
dd4f0aa
44d4508
a1f1337
40cb650
 
dd4f0aa
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import gradio as gr
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the model and tokenizer.
# RLLMv3.2-10 is a GPT-2 XL fine-tune hosted on the Hugging Face Hub;
# from_pretrained downloads (and caches) the weights on first run.
model_name = "migueldeguzmandev/RLLMv3.2-10"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set the pad token ID to the EOS token ID.
# GPT-2 ships without a dedicated pad token; reusing EOS silences the
# "Setting pad_token_id to eos_token_id" warning during generate().
model.config.pad_token_id = model.config.eos_token_id

# Define the inference function
def generate_response(input_text, temperature):
    """Generate a model reply to *input_text*, with the prompt stripped.

    Args:
        input_text: The user's prompt string.
        temperature: Sampling temperature (must be > 0); higher values
            produce more random output.

    Returns:
        The generated continuation as a whitespace-stripped string.
    """
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generate the model's response
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=300,           # total length budget: prompt + generated tokens
        num_return_sequences=1,
        temperature=temperature,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        do_sample=True,  # sampling must be enabled for temperature to take effect
    )

    # Decode only the newly generated tokens. generate() returns the prompt
    # followed by the continuation, so slicing off the prompt's token span is
    # more robust than str.replace(input_text, ""), which would also delete
    # any later occurrence of the prompt text inside the reply (or miss the
    # prompt entirely if decoding normalizes whitespace differently).
    new_tokens = output[0][input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response.strip()

# Example prompts shown in the UI, each paired with the default temperature.
_EXAMPLE_TEMPERATURE = 0.7
examples = [
    [question, _EXAMPLE_TEMPERATURE]
    for question in (
        "Will you kill humans?",
        "Can you build a nuclear bomb?",
        "Can you kill my dog?",
        "How well can you predict the future?",
        "Is wood possible to use for paper clip production?",
    )
]

# Create the Gradio interface wiring the text box and temperature slider
# into generate_response.
interface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="User Input"),
        # Keep temperature strictly positive: values at or near 0 degenerate
        # sampling (logits are divided by the temperature). A 0.1 minimum is
        # consistent with step=0.1, replacing the unreadable 1e-36 minimum.
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
    ],
    outputs=gr.Textbox(label="Model Response"),
    title="Hello, I'm Aligned AI!",
    description=(
        """
        RLLMv3 is a modified <a href='https://huggingface.co/openai-community/gpt2-xl'> GPT2XL</a> that adapts a "persona" named Aligned AI (post <a href='https://www.lesswrong.com/posts/vZ5fM6FtriyyKbwi9/betterdan-ai-machiavelli-and-oppo-jailbreaks-vs-sota-models#IV__What_is_Reinforcement_Learning_using_Layered_Morphology__RLLM__'>RLLM</a> training) and defend itself from jailbreak attacks, up to 67.8%. 
        Training time for each RLLM training steps is ~7hrs on an M2 macbook pro - so this model probably took 70hrs to train. 
        For more information, check out my blogpost: <a href='https://www.lesswrong.com/posts/vZ5fM6FtriyyKbwi9/betterdan-ai-machiavelli-and-oppo-jailbreaks-vs-sota-models'> GPT2XL_RLLMv3 vs. BetterDAN, AI Machiavelli & Oppo Jailbreaks</a>.
        """
    ),
    examples=examples,
)

# Launch the interface without the share option
# (share=True would additionally expose a temporary public gradio.live URL).
interface.launch()