import gradio as gr from huggingface_hub import hf_hub_download import json import tensorflow as tf import numpy as np # Load models MODELS = [ ("Bayes Enron1 spam", BAYES := "bayes-enron1-spam"), ("NN Enron1 spam", NN := "nn-enron1-spam"), ("GISTy Enron1 spam", LLM := "gisty-enron1-spam"), ] model_probs_path = hf_hub_download(repo_id="tbitai/bayes-enron1-spam", filename="probs.json") with open(model_probs_path) as f: model_probs = json.load(f) nn_model_path = hf_hub_download(repo_id="tbitai/nn-enron1-spam", filename="nn-enron1-spam.keras") nn_model = tf.keras.models.load_model(nn_model_path) llm_model_path = hf_hub_download(repo_id="tbitai/gisty-enron1-spam", filename="gisty-enron1-spam.keras") llm_model = tf.keras.models.load_model(llm_model_path) # Sentence Transformers should be imported after Keras models, in order to prevent it from setting Keras to legacy. from sentence_transformers import SentenceTransformer st_model = SentenceTransformer("avsolatorio/GIST-large-Embedding-v0") # Utils for Bayes UNK = '[UNK]' def tokenize(text): return tf.keras.preprocessing.text.text_to_word_sequence(text) def combine(probs): if any(p == 0 for p in probs): return 0 prod = np.prod(probs) neg_prod = np.prod([1 - p for p in probs]) if prod + neg_prod == 0: # Still possible due to floating point arithmetic return 0.5 # Assume that prod and neg_prod are equally small return prod / (prod + neg_prod) def get_interesting_probs(probs, intr_threshold): return sorted(probs, key=lambda p: abs(p - 0.5), reverse=True)[:intr_threshold] DEFAULT_INTR_THRESHOLD = 15 def unbias(p): return (2 * p) / (p + 1) # Predict functions def predict_bayes(text, intr_threshold, unbiased=False): words = tokenize(text) probs = [] for w in words: try: p = model_probs[w] if unbiased: p = unbias(p) except KeyError: p = model_probs[UNK] probs.append(p) interesting_probs = get_interesting_probs(probs, intr_threshold) return combine(interesting_probs) def predict_nn(text): return float(nn_model(np.array([text]))[0][0].numpy()) def predict_llm(text): embedding = st_model.encode(text) return float(llm_model(np.array([embedding]))[0][0].numpy()) def predict(model, input_txt, unbiased, intr_threshold): if model == BAYES: return predict_bayes(input_txt, unbiased=unbiased, intr_threshold=intr_threshold) elif model == NN: return predict_nn(input_txt) elif model == LLM: return predict_llm(input_txt) # UI demo = gr.Interface( theme=gr.themes.Origin( # Gradio 4-like primary_hue="yellow", ), fn=predict, inputs=[ gr.Dropdown(choices=MODELS, value=BAYES, label="Model", # FIXME: Font size should be smaller by default. Remove workaround when fixed in Gradio: https://github.com/gradio-app/gradio/issues/9642 info="Learn more about the models [here](https://huggingface.co/collections/tbitai/bayes-or-spam-6700033fa145e298ec849249)"), gr.TextArea(label="Email"), ], additional_inputs_accordion=gr.Accordion("Additional configuration for Bayes", open=False), additional_inputs=[ gr.Checkbox(label="Unbias", info="Correct Graham's bias?"), gr.Slider(minimum=1, maximum=DEFAULT_INTR_THRESHOLD + 5, step=1, value=DEFAULT_INTR_THRESHOLD, label="Interestingness threshold", info=f"How many of the most interesting words to select in the probability calculation? ({DEFAULT_INTR_THRESHOLD} for Graham)"), ], outputs=[gr.Number(label="Spam probability")], title="Bayes or Spam?", description="Choose your model, and predict if your email is a spam! 📨", examples=[ [NN, "Enron actuals for June 26, 2000", None, None], [BAYES, "Stop the aging clock\nNerissa", True, DEFAULT_INTR_THRESHOLD], ], article="This is a demo of the models in the [Bayes or Spam?](https://github.com/tbitai/bayes-or-spam) project.", ) if __name__ == "__main__": demo.launch()