File size: 21,608 Bytes
9fade2a
bfc5ccd
 
 
 
 
7261a26
 
 
bfc5ccd
8a3a312
 
bfc5ccd
8a3a312
bfc5ccd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81a881b
 
 
 
 
 
 
 
 
5625e29
 
57a2687
 
 
5625e29
 
9dc91af
 
57a2687
 
 
9dc91af
 
bfc5ccd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7261a26
8a3a312
d15a7b1
db9d916
0ab5de1
d15a7b1
0ab5de1
 
d15a7b1
0ab5de1
 
d15a7b1
0ab5de1
fbbf560
bfc5ccd
 
 
 
 
2654ca5
 
 
 
 
00b0a08
6926153
 
50167de
2654ca5
bfc5ccd
2654ca5
da4ab75
 
 
bfc5ccd
 
b929465
4c92274
bfc5ccd
 
 
2d059e4
8c92d64
 
eb6d336
 
8c92d64
21213ee
5625e29
79a6905
21213ee
 
e1b2914
 
14275e6
 
79a6905
 
 
 
 
28c42a4
 
5bdc9df
 
 
 
79a6905
 
 
 
 
28c42a4
 
efb83fc
 
 
 
79a6905
 
 
 
 
28c42a4
 
489c74d
 
 
79a6905
 
 
 
 
28c42a4
 
0758623
79a6905
 
 
 
 
28c42a4
 
de6feb6
21213ee
 
104e3d5
 
1399a10
 
5625e29
5017f6b
79a6905
 
2f76d7a
 
de6feb6
 
79a6905
 
5017f6b
1399a10
 
2f76d7a
 
de6feb6
 
 
 
1399a10
 
5017f6b
1399a10
 
2f76d7a
 
de6feb6
 
1399a10
 
5017f6b
9dc91af
 
2f76d7a
 
de6feb6
 
9dc91af
 
 
104e3d5
1399a10
8c92d64
104e3d5
6a8fea2
4ddde86
 
 
ea6ff45
 
582057e
ea6ff45
 
0ca6ac2
4ddde86
bfc5ccd
b929465
e12ac43
bfc5ccd
 
a25d95b
0ca6ac2
 
 
 
3e1cb92
 
c0b1c2b
 
a25d95b
 
cf0d3f3
 
7310da4
582057e
7310da4
bfc5ccd
 
cf0d3f3
7310da4
86e6270
 
 
c6b6f53
86e6270
f5cadb3
 
86e6270
 
 
 
7310da4
997c569
 
 
50f62e4
997c569
 
 
 
 
 
 
7310da4
bfc5ccd
 
 
f420005
b92dddc
 
 
bfc5ccd
8a3462c
bfc5ccd
b92dddc
 
2d4556c
 
7ee8287
 
 
 
b92dddc
 
101b0fa
 
 
b92dddc
bfc5ccd
f134f1b
b928cb0
f134f1b
 
 
8b5c98a
bfc5ccd
 
4b4042c
bfc5ccd
 
 
 
fd620cd
 
bfc5ccd
 
 
 
 
6799636
232a1d9
0730956
 
1166ace
1b4fa79
bfc5ccd
 
a908f78
7ef77c5
 
a908f78
bfc5ccd
 
 
 
c9fcacb
bfc5ccd
ab9235e
0730956
bfc5ccd
 
ab9235e
19975bf
bfc5ccd
 
 
 
 
 
 
 
a351266
c9c3573
a283ca1
31262cb
a351266
 
86a82c4
31262cb
 
86a82c4
bfc5ccd
c1e761a
bfc5ccd
dab96cb
e1b3574
 
 
 
 
bfc5ccd
e1b3574
bfc5ccd
 
 
 
 
 
8f500d3
bfc5ccd
 
 
 
9fade2a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
<!DOCTYPE html>
<html lang="en-US">
  <head>
    <meta charset="UTF-8">

<!-- Begin Jekyll SEO tag v2.8.0 -->
<title>Gradient Cuff | Gradient Cuff: Detecting Jailbreak Attacks on Large Language Models by
Exploring Refusal Loss Landscapes </title>
<meta property="og:title" content="Gradient Cuff" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="Detecting Jailbreak Attacks on Large Language Models by Exploring Refusal Loss Landscapes" />
<meta property="og:description" content="Detecting Jailbreak Attacks on Large Language Models by Exploring Refusal Loss Landscapes" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"WebSite","description":"Detecting Jailbreak Attacks on Large Language Models by Exploring Refusal Loss Landscapes","headline":"Gradient Cuff","name":"Gradient Cuff","url":"https://huggingface.co/spaces/gregH/Gradient Cuff"}</script>
<!-- End Jekyll SEO tag -->

    <link rel="preconnect" href="https://fonts.gstatic.com">
    <link rel="preload" href="https://fonts.googleapis.com/css?family=Open+Sans:400,700&display=swap" as="style" type="text/css" crossorigin>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="theme-color" content="#157878">
    <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">

    <link rel="stylesheet" href="assets/css/bootstrap/bootstrap.min.css?v=90447f115a006bc45b738d9592069468b20e2551">
    <link rel="stylesheet" href="assets/css/style.css?v=90447f115a006bc45b738d9592069468b20e2551">
    <!-- start custom head snippets, customize with your own _includes/head-custom.html file -->
    <link rel="stylesheet" href="assets/css/custom_style.css?v=90447f115a006bc45b738d9592069468b20e2551">
    <!--
      Third-party libraries. jQuery and jQuery UI are each loaded exactly once:
      including jQuery a second time replaces the global $ / jQuery objects, so
      only the pair that loaded last was effectively in use. The versions kept are
      that last pair (jQuery 3.6.0, jQuery UI 1.13.2 with its "base" theme), now
      placed ahead of calibration.js (presumably jQuery-dependent; confirm).
      The /resources/demos/style.css link, a leftover from the jQuery UI demo
      snippet, was dropped, and the theme URL now uses an explicit https scheme.
    -->
    <script src="https://code.jquery.com/jquery-3.6.0.js"></script>
    <link rel="stylesheet" href="https://code.jquery.com/ui/1.13.2/themes/base/jquery-ui.css">
    <script src="https://code.jquery.com/ui/1.13.2/jquery-ui.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.9.4/Chart.js"></script>
    <script src="assets/js/calibration.js?v=90447f115a006bc45b738d9592069468b20e2551"></script>
      <script>
        // Turn the attack/defense summary markup into jQuery UI widgets.
        // Each widget keeps its own DOM-ready callback, registered in the
        // order: tabs, defenses accordion, attacks accordion.
        $(function initTabs() {
          $("#tabs").tabs();
        });

        // Both accordions size every panel to its own content.
        ["#accordion-defenses", "#accordion-attacks"].forEach(function (selector) {
          $(function () {
            $(selector).accordion({ heightStyle: "content" });
          });
        });
      </script>




<!-- for mathjax support -->
    <!-- ES6 polyfill served from the cdnjs mirror rather than polyfill.io, whose
         domain was reported in 2024 to be serving malicious code. -->
    <script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>
    <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>


<!-- end custom head snippets -->

  </head>
  <body>
    <a id="skip-to-content" href="#content">Skip to the content.</a>

    <!-- Page banner: project name, tagline, paper link and author links. -->
    <header class="page-header" role="banner">
      <h1 class="project-name">Gradient Cuff</h1>
      <h2 class="project-tagline">Detecting Jailbreak Attacks on Large Language Models by Exploring Refusal Loss Landscapes</h2>
      <!-- NOTE(review): arXiv ID of the Gradient Cuff paper; the earlier link
           (2307.03838) appears to belong to a different paper. Confirm against
           the published arXiv entry. -->
      <h2 class="project-tagline"><a href="https://arxiv.org/abs/2403.00867" style="color: white;" target="_blank" rel="noopener noreferrer">https://arxiv.org/abs/2403.00867</a></h2>
      <div style="text-align: center">
        <div>
        <a href="https://gregxmhu.github.io/" style="color: white;" target="_blank" rel="noopener noreferrer">Xiaomeng Hu, CUHK CSE</a>
        </div>
        <div>
        <a href="https://sites.google.com/site/pinyuchenpage/home" style="color: white;" target="_blank" rel="noopener noreferrer">Pin-Yu Chen, IBM Research</a>
        </div>
        <div>
        <a href="https://www.cse.cuhk.edu.hk/people/faculty/tsung-yi-ho/" style="color: white;" target="_blank" rel="noopener noreferrer">Tsung-Yi Ho, CUHK CSE</a>
        </div>
      </div>
    </header>

    <main id="content" class="main-content" role="main">
      <h2 id="introduction">Introduction</h2>

<p>Large Language Models (LLMs) are becoming a prominent generative AI tool, where the user enters a 
  query and the LLM generates an answer. To reduce harm and misuse, efforts have been made to align 
  these LLMs to human values using advanced training techniques such as Reinforcement Learning from 
  Human Feedback (RLHF). However, recent studies have highlighted the vulnerability of LLMs to adversarial 
  jailbreak attempts aiming at subverting the embedded safety guardrails. To address this challenge,
 we define and investigate the <strong>Refusal Loss</strong> of LLMs and then propose a method called <strong>Gradient Cuff</strong> to 
  detect jailbreak attempts. In this demonstration, we first introduce the concept of "Jailbreak" and summarize people's efforts in Jailbreak 
  attack and Jailbreak defense. Then we present the 2-D Refusal Loss Landscape and propose Gradient Cuff based on the characteristics of this landscape. Lastly, we compare Gradient Cuff with other jailbreak defense 
  methods and show the defense performance against several Jailbreak attack methods.
</p>

<h2 id="what-is-jailbreak">What is Jailbreak?</h2>
<p>Jailbreak attacks involve maliciously inserting or replacing tokens in the user instruction or rewriting it to bypass and circumvent 
  the safety guardrails of aligned LLMs. A notable example is that a jailbroken LLM would be tricked into 
  generating hate speech targeting certain groups of people, as demonstrated below.</p>

<!-- Illustration for the "What is Jailbreak?" section. -->
<div class="container">
<div id="jailbreak-intro" class="row align-items-center jailbreak-intro-sec">
<img id="jailbreak-intro-img" src="./jailbreak.png" alt="Example of a jailbreak attack in which an aligned LLM is tricked into generating hate speech" />
</div>
</div>

<p>We summarize some recent advances in <strong>Jailbreak Attack</strong> and <strong>Jailbreak Defense</strong> in the tabs below:</p>
<div id="tabs">
  <ul>
    <li><a href="#jailbreak-attacks">Jailbreak Attack</a></li>
    <li><a href="#jailbreak-defenses">Jailbreak Defense</a></li>
  </ul>
  <div id="jailbreak-attacks">  
    <div id="accordion-attacks">
      <h3>GCG</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://arxiv.org/abs/2307.15043" target="_blank" rel="noopener noreferrer">
            Universal and Transferable Adversarial Attacks on Aligned Language Models</a></li>
          <li>Brief Introduction: Given a (potentially harmful) user query, GCG trains and appends an adversarial suffix to the query 
            that attempts to induce negative behavior from the target LLM. </li>
        </ul>
      </div>
      <h3>AutoDAN</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://arxiv.org/abs/2310.04451" target="_blank" rel="noopener noreferrer">
            AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models</a></li>
          <li>Brief Introduction: AutoDAN is an automatic stealthy jailbreak prompt generation framework based on a carefully designed 
            hierarchical genetic algorithm. AutoDAN preserves the meaningfulness and fluency (i.e., stealthiness) of jailbreak prompts, 
            akin to handcrafted ones, while also ensuring automated deployment as introduced in prior token-level research like GCG.
          </li>
        </ul>
      </div>
      <h3>PAIR</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://arxiv.org/abs/2310.08419" target="_blank" rel="noopener noreferrer">
            Jailbreaking Black Box Large Language Models in Twenty Queries</a></li>
          <li>Brief Introduction: PAIR uses an attacker LLM to automatically generate jailbreaks for a separate targeted LLM 
            without human intervention. The attacker LLM iteratively queries the target LLM to update and refine a candidate 
            jailbreak based on the comments and the rated score provided by another Judge model.
            Empirically, PAIR often requires fewer than twenty queries to produce a successful jailbreak.</li>
        </ul>
      </div>
      <h3>TAP</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://arxiv.org/abs/2312.02119" target="_blank" rel="noopener noreferrer">
            Tree of Attacks: Jailbreaking Black-Box LLMs Automatically</a></li>
          <li>Brief Introduction: TAP is similar to PAIR. The main difference is that 
            the attacker in TAP iteratively refines candidate (attack) prompts using tree-of-thought 
            reasoning.</li>
        </ul>
      </div>
      <h3>Base64</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://arxiv.org/abs/2307.02483" target="_blank" rel="noopener noreferrer">
            Jailbroken: How Does LLM Safety Training Fail?</a></li>
          <li>Brief Introduction: Encode the malicious user query into base64 format before using it to query the model.</li>
        </ul>
      </div>
      <h3>LRL</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://arxiv.org/abs/2310.02446" target="_blank" rel="noopener noreferrer">
            Low-Resource Languages Jailbreak GPT-4</a></li>
          <li>Brief Introduction: Translate the malicious user query into low-resource language before using it to query the model.</li>
        </ul>
      </div>
    </div>
  </div>

  <div id="jailbreak-defenses">  
    <div id="accordion-defenses">
      <h3>Perplexity Filter</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://arxiv.org/abs/2309.00614" target="_blank" rel="noopener noreferrer">
            Baseline Defenses for Adversarial Attacks Against Aligned Language Models</a></li>
          <li>Brief Introduction: Perplexity Filter uses an LLM to compute the perplexity of the input query and rejects those
            with high perplexity.</li>
        </ul>
      </div>
      <h3>SmoothLLM</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://arxiv.org/abs/2310.03684" target="_blank" rel="noopener noreferrer">
            SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks</a></li>
          <li>Brief Introduction: SmoothLLM perturbs the original input query to obtain several copies and aggregates 
            the intermediate responses of the target LLM to these perturbed queries to give the final response to the
            original query.
          </li>
        </ul>
      </div>
      <h3>Erase-Check</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://arxiv.org/abs/2309.02705" target="_blank" rel="noopener noreferrer">
            Certifying LLM Safety against Adversarial Prompting</a></li>
          <li>Brief Introduction: Erase-Check employs a model to check whether the original query or any of its erased sub-sentences
          is harmful. The query is rejected if the query or one of its sub-sentences is regarded as harmful by the safety checker.</li>
        </ul>
      </div>
      <h3>Self-Reminder</h3>
      <div>
        <ul>
          <li>Paper: <a href="https://assets.researchsquare.com/files/rs-2873090/v1_covered_eb589a01-bf05-4f32-b3eb-0d6864f64ad9.pdf?c=1702456350" target="_blank" rel="noopener noreferrer">
            Defending ChatGPT against Jailbreak Attack via Self-Reminder</a></li>
          <li>Brief Introduction: Self-Reminder modifies the system prompt of the target LLM so that the model reminds itself to process 
            and respond to the user in the context of being an aligned LLM.</li>
        </ul>
      </div>
    </div>
  </div>
  
</div>

<h2 id="refusal-loss">Refusal Loss Landscape Exploration</h2>
<p>Current transformer-based LLMs will return different responses to the same query due to the randomness of 
  autoregressive sampling-based generation. With this randomness, it is an 
  interesting phenomenon that a malicious user query will sometimes be rejected by the target LLM, but 
  sometimes be able to bypass the safety guardrail. Based on this observation, we propose a new concept called <strong>Refusal Loss</strong> to 
  represent the probability with which the LLM won't reject the input user query. By using 1 to denote a successful jailbreak and 0 to denote 
  the opposite, we compute the empirical Refusal Loss as the sample mean of the jailbreak results returned from the target LLM.
  <!--Since the refusal loss is not computable, we query the target LLM multiple times using the same query and using the sample 
  mean of the Jailbroken results (1 indicates successful jailbreak, 0 indicates the opposite) to approximate the function value. -->
  We visualize the 2-D landscape of the empirical Refusal Loss on Vicuna 7B and Llama-2 7B as below:
</p>

<!-- NOTE(review): the image id below is also used by the image in the
     "What is Jailbreak?" section; ids should be unique per document. It is kept
     as-is here because the stylesheet may target it; rename after checking. -->
<div class="container jailbreak-intro-sec">
<div><img id="jailbreak-intro-img" src="./loss_landscape.png" alt="2-D landscape of the empirical Refusal Loss on Vicuna 7B and Llama-2 7B for benign and malicious queries" /></div>
</div>

<p>
  We show the loss landscape for both Benign and Malicious queries in the above plot. The benign queries are non-harmful user instructions collected 
  from the LM-SYS Chatbot Arena leaderboard, which is a crowd-sourced open platform for LLM evaluation. The tested malicious queries are harmful 
  behavior user instructions with GCG jailbreak prompt. From this plot, we find that the loss landscape is more precipitous for malicious queries than for benign queries, 
  which implies that the Refusal Loss tends to have a large gradient norm if the input represents a malicious query. This observation motivates our proposal of using 
  the gradient norm of Refusal Loss to detect jailbreak attempts that pass the initial filtering of rejecting the input query when the function value 
  is under 0.5 (this is a naive detector because the Refusal Loss can be regarded as the probability that the LLM won't reject the user query).
  Below we present the definition of the Refusal Loss, the computation of its empirical values, and the approximation of its gradient. 
  More details about them and the landscape drawing techniques can be found in our paper. 
</p>

<!-- Formula switcher: a list of three links plus one formula panel per link. -->
<div id="refusal-loss-formula" class="container">
<!-- Each link's href fragment matches the id of one formula span below; the
     first link carries the "selected" class. -->
<div id="refusal-loss-formula-list" class="row align-items-center formula-list">
  <a href="#Refusal-Loss" class="selected">Refusal Loss Definition</a>
  <a href="#Refusal-Loss-Approximation">Refusal Loss Computation</a>
  <a href="#Gradient-Estimation">Gradient Estimation</a>
  <div style="clear: both"></div>
</div>
<!-- Formula panels, written as TeX between $$ delimiters. Only the first panel
     is visible initially; the other two start with display: none.
     NOTE(review): presumably a script toggles the panels when a link above is
     clicked; no such handler is visible in this file, so confirm. -->
<div id="refusal-loss-formula-content" class="row align-items-center">
  <span id="Refusal-Loss" class="formula" style="">
    $$
    \displaystyle 
    \begin{aligned} 
    \phi_\theta(x)&=1-\mathbb{E}_{y \sim T_\theta(x)} JB(y)\\ 
    JB (y) &=  \begin{cases}
         1 \text{, if $y$ contains any jailbreak keyword;} \\
         0 \text{, otherwise.}
     \end{cases} 
    \end{aligned}
    $$
  </span>
  <!-- Empirical version: the expectation is replaced by a mean over N sampled responses y_i. -->
  <span id="Refusal-Loss-Approximation" class="formula" style="display: none;">
    $$
    \displaystyle 
    \begin{aligned} 
    f_\theta(x) &=1-\frac{1}{N}\sum_{i=1}^N JB(y_i)\\ 
    JB (y_i) &=  \begin{cases}
         1 \text{, if $y_i$ contains any jailbreak keyword;} \\
         0 \text{, otherwise.}
     \end{cases} 
    \end{aligned}
    $$
  </span>
  <!-- Gradient estimate: a sum over P directions u_i of finite differences of f with step size mu. -->
  <span id="Gradient-Estimation" class="formula" style="display: none;">$$\displaystyle g_\theta(x)=\sum_{i=1}^P \frac{f_\theta(x\oplus \mu u_i)-f_\theta(x)}{\mu} u_i $$</span>
</div>
</div>

<h2 id="proposed-approach-gradient-cuff">Proposed Approach: Gradient Cuff</h2>
<p> With the exploration of the Refusal Loss landscape, we propose Gradient Cuff, 
  a two-step jailbreak detection method based on checking the refusal loss and its gradient norm. Our detection procedure is shown below:
</p>

<div class="container"><img id="gradient-cuff-header" src="./gradient_cuff.png" alt="Overview of the two-step Gradient Cuff jailbreak detection procedure" /></div>

<p>
  Gradient Cuff can be summarized into two phases:
</p>
<p>
    <strong>(Phase 1) Sampling-based Rejection:</strong> In the first step, we check whether the Refusal Loss value is below 0.5. If so, the user query is rejected; otherwise, it is passed on to Phase 2.
</p>
<p>
    <strong>(Phase 2) Gradient Norm Rejection:</strong> In the second step, we regard the user query as having jailbreak attempts if the norm of the estimated gradient is larger than a configurable threshold t.
</p>

<p>
We provide more details about the running flow of Gradient Cuff in the paper.
</p>
      
<h2 id="demonstration">Demonstration</h2>
<p>We evaluated Gradient Cuff as well as 4 baselines (Perplexity Filter, SmoothLLM, Erase-and-Check, and Self-Reminder) 
  against 6 different jailbreak attacks (GCG, AutoDAN, PAIR, TAP, Base64, and LRL) and benign user queries on 2 LLMs (LLaMA-2-7B-Chat and 
  Vicuna-7B-V1.5). Below we report the average refusal rate across these 6 malicious user query datasets as the Average Malicious Refusal 
  Rate, and the refusal rate on benign user queries as the Benign Refusal Rate. The defense performance against each jailbreak type is 
  shown in the accompanying bar chart. 
</p>


<!--
  Interactive demo: choose a model and a defense, then read the two refusal-rate
  metrics and the per-attack bar chart. The initial state is LLaMA-2-7B-Chat with
  Gradient Cuff selected.
  NOTE(review): the controls are presumably wired up by assets/js/calibration.js,
  which is not visible from this file; confirm before renaming any id, name or value.
-->
<div id="jailbreak-demo" class="container">
<!-- Model selector (radio group "models"). -->
<div class="row align-items-center">
  <div class="row" style="margin: 10px 0 0">
      <div class="models-list">
        <span style="margin-right: 1em;">Models</span>
        <span class="radio-group"><input type="radio" id="LLaMA2" class="options" name="models" value="llama2_7b_chat" checked="" /><label for="LLaMA2" class="option-label">LLaMA-2-7B-Chat</label></span>
        <span class="radio-group"><input type="radio" id="Vicuna" class="options" name="models" value="vicuna_7b_v1.5" /><label for="Vicuna" class="option-label">Vicuna-7B-V1.5</label></span>
      </div>
  </div>
</div>
<div class="row align-items-center">
  <div class="col-4">
    <!-- Defense selector (radio group "defense"). -->
    <div id="defense-methods">
      <div class="row align-items-center"><input type="radio" id="defense_ppl" class="options" name="defense" value="ppl" /><label for="defense_ppl" class="defense">Perplexity Filter</label></div>
      <div class="row align-items-center"><input type="radio" id="defense_smoothllm" class="options" name="defense" value="smoothllm" /><label for="defense_smoothllm" class="defense">SmoothLLM</label></div>
      <div class="row align-items-center"><input type="radio" id="defense_erase_check" class="options" name="defense" value="erase_check" /><label for="defense_erase_check" class="defense">Erase-Check</label></div>
      <div class="row align-items-center"><input type="radio" id="defense_self_reminder" class="options" name="defense" value="self_reminder" /><label for="defense_self_reminder" class="defense">Self-Reminder</label></div>
      <div class="row align-items-center"><input type="radio" id="defense_gradient_cuff" class="options" name="defense" value="gradient_cuff" checked=""  /><label for="defense_gradient_cuff" class="defense"><span style="font-weight: bold;">Gradient Cuff</span></label></div>
    </div>
    <!-- Metric read-outs; the numbers below are the initial values shown on load. -->
    <div class="row align-items-center">
      <div class="attack-success-rate"><span class="jailbreak-metric">Average Malicious Refusal Rate</span><span class="attack-success-rate-value" id="asr-value">0.959</span></div>
    </div>
    <div class="row align-items-center">
      <div class="benign-refusal-rate"><span class="jailbreak-metric">Benign Refusal Rate</span><span class="benign-refusal-rate-value" id="brr-value">0.050</span></div>
    </div>
  </div>
  <div class="col-8">
  <figure class="figure">
    <!-- The alt text describes the chart generically rather than one model/defense
         pair, because the image source is presumably swapped by script when the
         selection changes (confirm). -->
    <img id="reliability-diagram" src="demo_results/gradient_cuff_llama2_7b_chat_threshold_100.png" alt="Bar chart of refusal rates against each jailbreak attack for the selected model and defense" />
    <!-- Slider containers; the inner divs carry the ui-slider-handle class. -->
    <div class="slider-container">
      <div class="slider-label"><span>Perplexity Threshold</span></div>
      <div class="slider-content" id="ppl-slider"><div id="ppl-threshold" class="ui-slider-handle"></div></div>
    </div>
    <div class="slider-container">
      <div class="slider-label"><span>Gradient Threshold</span></div>
      <div class="slider-content" id="gradient-norm-slider"><div id="gradient-norm-threshold" class="slider-value ui-slider-handle"></div></div>
    </div>
    <figcaption class="figure-caption">
    </figcaption>
  </figure>
  </div>
</div>
</div>

<p>
Higher malicious refusal rate and lower benign refusal rate mean a better defense. 
Overall, Gradient Cuff is the most performant compared with those baselines. We also evaluated Gradient Cuff against adaptive attacks 
in the paper.
</p>

<h2 id="inquiries"> Inquiries on LLM with Gradient Cuff defense</h2>
<p> Please contact <a href="mailto:greghxm@foxmail.com">Xiaomeng Hu</a>
and <a href="mailto:pin-yu.chen@ibm.com">Pin-Yu Chen</a>.
</p>
<h2 id="citations">Citations</h2>
<p>If you find Gradient Cuff helpful and useful for your research, please cite our main paper as follows:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>@misc{xxx,
  title={{Gradient Cuff: Detecting Jailbreak Attacks on Large Language Models by
Exploring Refusal Loss Landscapes}}, 
  author={Xiaomeng Hu and Pin-Yu Chen and Tsung-Yi Ho},
  year={2024},
  eprint={},
  archivePrefix={arXiv},
  primaryClass={}
}
</code></pre></div></div>


      <!-- Site footer with the maintainer credit (a stray second closing </a> tag was removed). -->
      <footer class="site-footer">
        <span class="site-footer-owner">GradientCuff-Jailbreak-Defense is maintained by <a href="https://gregxmhu.github.io/">Xiaomeng Hu</a>.</span>
      </footer>
    </main>
  </body>
</html>