# ACMCMC
# Comments for the examples
# 0123a49
import gradio as gr
import torch
# import homoglyphs
import confusable_homoglyphs.confusables
from unidecode import unidecode
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load GPT-2 and its tokenizer once, at import time. In the visible code these
# two objects are only referenced from commented-out code; the active
# perplexity computation goes through `perplexity_metric` below.
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
from evaluate import load
# Perplexity metric from the `evaluate` library; it is later called with
# model_id="gpt2" in process_homoglyphed_text.
perplexity_metric = load("perplexity", module_type="metric")
# homoglyphs_processor = homoglyphs.Homoglyphs(
# ascii_strategy=homoglyphs.STRATEGY_LOAD, strategy=homoglyphs.STRATEGY_LOAD
# )
def calculate_perplexity(text_logits: torch.Tensor, token_ids: "torch.Tensor | None" = None):
    """Return a perplexity-style score computed from raw language-model logits.

    Args:
        text_logits: Unnormalized logits whose last dimension is the
            vocabulary and whose second-to-last dimension is the sequence
            position.
        token_ids: Optional integer tensor with the token ids that produced
            ``text_logits`` (same leading shape, without the vocabulary
            dimension). When given, the true perplexity of the sequence is
            returned: the exponential of the mean negative log-likelihood of
            each token given the tokens before it.

    Returns:
        The score as a Python float.

    Raises:
        ValueError: If ``token_ids`` is given but holds fewer than two tokens
            per sequence, so there is no next-token prediction to score.

    Note:
        Without ``token_ids`` the original behaviour is kept for backward
        compatibility: the negative log-probabilities of *every* vocabulary
        entry at every position are averaged. That value is not the
        perplexity of the text; pass ``token_ids`` to get the real one.
    """
    # The logits are not normalized, so turn them into log-probabilities first.
    log_probs = torch.nn.functional.log_softmax(text_logits, dim=-1)

    if token_ids is None:
        # Legacy path: average over the whole vocabulary distribution.
        negative_log_likelihoods = -log_probs
    else:
        # The logits at position t predict the token at position t + 1, so
        # drop the last prediction and the first token to align them.
        predicted_log_probs = log_probs[..., :-1, :]
        next_tokens = token_ids[..., 1:].unsqueeze(-1)
        if next_tokens.numel() == 0:
            raise ValueError("token_ids must contain at least two tokens")
        negative_log_likelihoods = -predicted_log_probs.gather(
            -1, next_tokens
        ).squeeze(-1)

    average_negative_log_likelihood = negative_log_likelihoods.mean()
    return torch.exp(average_negative_log_likelihood).item()
# Compare the GPT-2 perplexity of a text with that of its homoglyph-free version
def process_homoglyphed_text(homoglyphed_text, unhomoglyphed_text, alarm_threshold=1.5):
    """Decide whether the homoglyphs in a text look like an attack.

    Both versions of the text are scored with the module-level
    ``perplexity_metric`` (GPT-2). The alarm is raised when the original
    (homoglyphed) text is more than ``alarm_threshold`` times as perplexing as
    the version whose homoglyphs were replaced.

    Args:
        homoglyphed_text: The text as received, possibly containing homoglyphs.
        unhomoglyphed_text: The same text with homoglyphs replaced.
        alarm_threshold: Ratio above which the alarm is triggered.

    Returns:
        A tuple ``(alarm_triggered, difference_ratio)`` where
        ``difference_ratio`` is perplexity(homoglyphed) / perplexity(unhomoglyphed).
    """
    # The metric returns one perplexity per prediction, in the order the
    # predictions were passed, so the names must follow that same order.
    homoglyphed_text_perplexity, unhomoglyphed_text_perplexity = (
        perplexity_metric.compute(
            predictions=[homoglyphed_text, unhomoglyphed_text], model_id="gpt2"
        )["perplexities"]
    )
    print(
        f"Unhomoglyphed text perplexity: {unhomoglyphed_text_perplexity}, homoglyphed text perplexity: {homoglyphed_text_perplexity}"
    )
    # If the version with homoglyphs is more than `alarm_threshold` times as
    # perplexing as the version without them, trigger the alarm.
    difference_ratio = homoglyphed_text_perplexity / unhomoglyphed_text_perplexity
    print(f"Difference ratio: {difference_ratio}")
    alarm_triggered = difference_ratio > alarm_threshold
    return alarm_triggered, difference_ratio
def unhomoglyphize_text(homoglyphed_text):
    """Replace characters that are confusable with Latin ones and strip diacritics.

    Args:
        homoglyphed_text: Text that may contain homoglyphs.

    Returns:
        The text where every reported non-ASCII confusable character has been
        replaced by its first listed homoglyph, and the result has then been
        passed through ``unidecode``.
    """
    confusables = confusable_homoglyphs.confusables.is_confusable(
        homoglyphed_text, greedy=True, preferred_aliases=["latin"]
    )
    print(f"Confusables: {confusables}")
    unhomoglyphed_text = homoglyphed_text
    # `is_confusable` returns something like:
    #
    # [{'character': 'ρ',
    #   'alias': 'GREEK',
    #   'homoglyphs': [{'c': 'p', 'n': 'LATIN SMALL LETTER P'}]},
    #  {'character': 'τ',
    #   'alias': 'GREEK',
    #   'homoglyphs': [{'c': 'ᴛ', 'n': 'LATIN LETTER SMALL CAPITAL T'}]}]
    #
    # but it returns False (not an empty list) when nothing is confusable, so
    # fall back to an empty list instead of iterating over a bool.
    for confusable in confusables or []:
        # ASCII characters can be reported too; they are already what we want.
        if ord(confusable["character"]) < 128:
            continue
        homoglyph = confusable["homoglyphs"][0]
        unhomoglyphed_text = unhomoglyphed_text.replace(
            confusable["character"], homoglyph["c"]
        )
    # Finally, remove any diacritics (this is not done by the homoglyphs library)
    unhomoglyphed_text = unidecode(unhomoglyphed_text)
    return unhomoglyphed_text
def process_user_text(user_text, markdown_comment=None):
    """Run the homoglyph check on user input and build the three UI outputs.

    Args:
        user_text: Text entered by the user.
        markdown_comment: Ignored; accepted only so the signature matches the
            interface inputs.

    Returns:
        A tuple ``(contains_homoglyphs, difference_ratio, verdict_markdown)``.
        When no confusable character is found the ratio is ``0.0`` and the
        verdict is the "all good" message without any perplexity computation.
    """
    found_confusable = confusable_homoglyphs.confusables.is_confusable(
        user_text, preferred_aliases=["latin"]
    )
    if found_confusable:
        cleaned_text = unhomoglyphize_text(user_text)
        print(f"Unhomoglyphed text: {cleaned_text}")
        alarm_triggered, difference_ratio = process_homoglyphed_text(
            homoglyphed_text=user_text, unhomoglyphed_text=cleaned_text
        )
        if alarm_triggered:
            verdict = "# 🚨 Alarm triggered"
        else:
            verdict = "# ✅ All good"
        return True, difference_ratio, verdict

    # No homoglyphs at all: nothing to compare, so never trigger the alarm.
    return False, 0.0, "# ✅ All good"
# Visual theme for the app: Gradio's Soft theme with custom hues, square
# corners (radius_size="none") and IBM Plex fonts with generic fallbacks.
theme = gr.themes.Soft(
    primary_hue="fuchsia",
    secondary_hue="cyan",
    neutral_hue="gray",
    radius_size="none",
    # Proportional font stack, tried in order.
    font=[
        gr.themes.GoogleFont("IBM Plex Sans"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
    # Monospace font stack, tried in order.
    font_mono=[
        gr.themes.GoogleFont("IBM Plex Mono"),
        "ui-monospace",
        "Consolas",
        "monospace",
    ],
)
# Create a Gradio interface.
# Every example row below is a pair: [text to analyse, Markdown note about it].
demo = gr.Interface(
    theme=theme,
    fn=process_user_text,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text"),
        # Hidden component: it is the second input that receives the note of
        # each example row; process_user_text accepts it and ignores it.
        gr.Markdown(
            label="Why is this example interesting?",
            show_label=False,
            value="",
            visible=False,
        ),
    ],
    outputs=[
        # A checkbox: is dangerous or not
        gr.Checkbox(label="Is dangerous"),
        # The number of the difference ratio
        gr.Number(label="Difference ratio"),
        # Just an emoji: alarm triggered or not
        gr.Markdown(label="Alarm triggered", show_label=False),
    ],
    title="Homoglyphs Alarm 🚨",
    description="""Calculates the probability that a given text has been the target of a homoglyph-based attack.
It calculates the perplexity of the text according to GPT-2 and compares it to the perplexity of the text with homoglyphs replaced by their ASCII equivalents.
Example texts adapted from:
- https://arxiv.org/abs/2401.12070 (also in a version adapted using https://huggingface.co/spaces/acmc/SilverSpeak with 5% replacement)
- https://huggingface.co/google/gemma-2-2b (also in a version adapted using https://huggingface.co/spaces/acmc/SilverSpeak with 5% replacement)
- https://www.persee.fr/doc/rbph_0035-0818_2012_num_90_3_8269
- https://arxiv.org/abs/2411.14257
- https://www.busuu.com/en/spanish/conditional
Written by: [Aldan Creo](https://acmc-website.web.app/intro)
""",
    allow_flagging="never",
    examples=[
        [
            "Dr. Capy Cosmos, a capybara unlike any other, astounded the scientific community with his groundbreaking research in astrophysics. With his keen sense of observation and unparalleled ability to interpret cosmic data, he uncovered new insights into the mysteries of black holes and the origins of the universe. As he peered through telescopes with his large, round eyes, fellow researchers often remarked that it seemed as if the stars themselves whispered their secrets directly to him. Dr. Cosmos not only became a beacon of inspiration to aspiring scientists but also proved that intellect and innovation can be found in the most unexpected of creatures.",
            "This is an example of a **normal English text** that doesn't contain homoglyphs. We should see here that it is not classified as dangerous, so the ratio is not even calculated.",
        ],
        [
            "Dr. Capу Cosmos, a caрybаra unlіkе any other, astounded the scientific community with hіs groundbreakіng reѕearcһ in astrophysics. With hiѕ keen sense of observation and unparаlleled ability to interpret cosmic dаta, he uncovеred new іnsightѕ into tһe myѕteries of black holes аnd the origins of the universe. Aѕ he peered through telescopes with his large, round eyes, fellow reѕearchers often remarked that it seemed as if the stars themѕelves whiѕpered theіr secrets directlу to him. Dr. Cosmos not only became a beacon of inspіration to aspiring scientіsts but also proved thаt intellect and іnnovation can bе found in the most unexpecteԁ οf сreatures.",
            "This is the same text as above, but now it's been attacked with a **5% homoglyph replacement**. We should see here that it is classified as dangerous, so the ratio is calculated and the alarm is triggered – hence, the method works.",
        ],
        [
            "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights for both pre-trained variants and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as a laptop, desktop or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.",
            "Again, this is a **normal text**. We should see that it's not classified as dangerous and the ratio is not calculated.",
        ],
        [
            "Gemma iѕ a family of lightweіght, ѕtatе-οf-the-art open models from Google, built from the same research аnd technolοgy uѕed to сreate tһe Gemini models. Theу are text-to-text, decoder-only lаrge lаnguage models, available in English, with οpen weightѕ for both рre-trainеd vаrіants аnd instruction-tuned variantѕ. Gemma models are well-suited for a vаrietу of text generation tasks, including question answering, summarization, and rеaѕoning. Their relatively small size makes it possible to dеploy them іn environments witһ limited resourceѕ such as a laptop, desktop or your own cloud infraѕtructure, democratizing acceѕs to state of the art AΙ models and һelping foster іnnovation for everyone.",
            "Same as before – just to provide more evidence that the method works. We replace **5% of the characters with homoglyphs** and see that the alarm is triggered.",
        ],
        [
            "We run the model on the set of prompts containing known and unknown entities. Inspired by Meng et al. (2022a); Geva et al. (2023); Nanda et al. (2023) we use the residual stream of the final token of the entity, 𝒙 known and 𝒙 unknown. In each layer (l), we compute the activations of each latent in the SAE, i.e. al,j⁢(𝒙lknown) and al,j⁢(𝒙lunknown). For each latent, we obtain the fraction of the time that it is active (i.e. has a value greater than zero) on known and unknown entities respectively: fl,jknown=∑iNknown𝟙⁢[al,j⁢(𝒙l,iknown)>0]Nknown,fl,junknown=∑iNunknown𝟙⁢[al,j⁢(𝒙l,iunknown)>0]Nunknown,(6) where Nknown and Nunknown are the total number of prompts in each subset. Then, we take the difference, obtaining the latent separation scores sl,jknown=fl,jknown−fl,junknown and sl,junknown=fl,junknown−fl,jknown, for detecting known and unknown entities respectively.",
            "Now, what happens if the text naturally contains homoglyphs? We can see it's now classified as dangerous, and the ratio is calculated, but the alarm is not triggered, showing how the ratio is a good avenue to avoid **false positives**.",
        ],
        [
            "The national/ official name of the country, the people and the language are respectively Eλλάδα, Έλληνας, ελληνικά ([ eláδa, élinas, eliniká]), derived from Ancient Greek Ἑλλάς, Ἕλλην, ἑλληνικός ([ hellás, héllen, hellenikós]) ‘Greece, Greek (noun), Greek (adj.)’, which are also to be found in most European languages as Hellas, hellenic, hellénique etc.; Hellenic Republic is the official name of the country in the European Union. The etymology of these words is uncertain. They first occur in the Iliad of Homer (2.683-4) as a designation of a small area in Thessaly, the homeland of Achilles, and its people. (3) Also in Homer, it is possible to find the compound πανέλληνες ([ panhellenes]) denoting all Greeks (from adjective pan ‘all’ + noun hellen), and it is again uncertain under what historical circumstances this local appellation spread to the totality of the Greek nation, although various theories have been proposed (see Babiniotis 2002).",
            "Another example of a text that naturally contains homoglyphs. Again, we avoid flagging it as a **false positive**.",
        ],
        [
            "To form the conditional tense in Spanish, you need to use the infinitive form of the verb and add the corresponding endings for each subject pronoun. Regardless of the verb type (-ar, -er, or -ir), the endings remain the same. In singular-plural order, 1st-3rd, the terminations are: -ía, -ías, -ía, -íamos, -ían, -ían. For example: Hablaría (I would speak) Comerías (You would eat) Escribiría (He/She/You would write) Haríamos (We would do/make) Beberían (You all would drink) Leerían (They/You all would read).",
            "Third example of a text that naturally contains homoglyphs. Once more, we avoid flagging it as a **false positive**.",
        ],
        [
            "To form the cοnԁіtіonal tense in Spanish, yοu need tο uѕе the infinitive fοrm of the verb and add the corresponding endіngs for eaсһ subject рronοun. Regardless of the verb type (-ar, -er, or -ir), the endings remain the same. In singular-plural order, 1st-3rd, thе terminations are: -ía, -ías, -ía, -íamos, -ían, -ían. For example: Hablaríа (I woulԁ speak) Comerías (You would еаt) Eѕcribiría (He/Shе/You would write) Haríаmos (We would do/make) Bеberían (Yοu all would ԁrink) Leerían (They/You all would read).",
            "This is probably the **most important** example. It's the same as the previous one, but with **5% of the characters replaced** with homoglyphs. So we have natural homoglyphs and artificial homoglyphs. Still, the method works – the alarm is triggered, while it wasn't in the previous example. This shows the method is robust and can be used to detect homoglyph-based attacks even in contexts of varying content matter, complexity and language.",
        ],
    ],
)
# Launch the Gradio app. There is no `if __name__ == "__main__":` guard, so
# this runs whenever the module is executed or imported.
demo.launch()