Commit 40884a0
A Vo committed
Parent(s): a48cee6

Added eval metrics, comments
Browse files:
- .gitattributes +1 -0
- app.py +187 -30
- reddit_cleansed_data.csv +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,16 +1,23 @@
 # Imports
 # Core Imports
 import torch
+
 # Model-related Imports
 from transformers import BartTokenizer, BartForConditionalGeneration # fine-tuned BART model
 from transformers import AutoTokenizer, AutoModelForTokenClassification # restore punct
 from transformers import pipeline # restore punct
 import gradio as gr

+# Evaluation Imports
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import pandas as pd
+import string
+


 # Instantiate model to restore punctuation
-print("1/
+print("1/7 - Instantiating model to restore punctuation")

 punct_model_path = "felflare/bert-restore-punctuation"
 # Load punct tokenizer and model
@@ -21,7 +28,7 @@ punct_restorer = pipeline("token-classification", model=punct_model, tokenizer=p


 # Instantiate fine-tuned horror BART model
-print("2/
+print("2/7 - Instantiating two-sentence horror generation model")

 model_path = 'voacado/bart-two-sentence-horror'
 # Load tokenizer and model
@@ -30,8 +37,108 @@ model = BartForConditionalGeneration.from_pretrained(model_path)



+# Load data for evaluation metrics
+print("3/7 - Reading in data")
+data = pd.read_csv("./reddit_cleansed_data.csv")
+data['weighted_score'] = data['score'] + (10 * data['num_comments']) + (100 * data['gilded_count'])
+dataset_stories = (data['title'] + ' ' + data['selftext']).to_list()
+
+
+
+# Instantiate evaluation metrics - Cosine Similarity with TF-IDF
+print("4/7 - Instantiating evaluation metrics - Cosine Similarity with TF-IDF")
+# Pre-vectorize dataset
+vectorizer = TfidfVectorizer()
+dataset_matrix = vectorizer.fit_transform(dataset_stories)
+
+def eval_cosine_similarity(input_sentence: str) -> [str, str]:
+    """
+    Evaluate cosine similarity between input sentence and each story in the dataset.
+
+    Args:
+        input_sentence (str): user story (first sentence)
+
+    Returns:
+        [str, str]: most similar story, weighted score
+    """
+    # Vectorize input sentence using the existing vocab
+    input_vec = vectorizer.transform([input_sentence])
+    # Get cosine similarity
+    similarities = cosine_similarity(input_vec, dataset_matrix)
+    # Find most similar story
+    most_similar_story_idx = similarities.argmax()
+    most_similar_story = dataset_stories[most_similar_story_idx]
+    # Get weighted score of most similar story
+    weighted_score = data['weighted_score'][most_similar_story_idx]
+
+    return most_similar_story, weighted_score
+
+
+
+# Instantiate evaluation metrics - Jaccard Similarity
+print("5/7 - Instantiating evaluation metrics - Jaccard Similarity")
+def tokenize(text: str):
+    """
+    Convert text to lowercase and remove punctuation, then tokenize.
+
+    Args:
+        text (str): user story
+
+    Returns:
+        set: set of tokens
+    """
+    text = text.lower()
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    tokens = text.split()
+    return set(tokens)
+
+def jaccard_similarity(set1: set, set2: set):
+    """
+    Calculate Jaccard similarity between two sets.
+
+    Args:
+        set1 (set): user_tokens
+        set2 (set): story_tokens
+
+    Returns:
+        float: Jaccard similarity
+    """
+    intersection = set1.intersection(set2)
+    union = set1.union(set2)
+    return len(intersection) / len(union)
+
+def eval_jaccard_similarity(input_sentence: str) -> [str, str]:
+    """
+    Evaluate Jaccard similarity between input sentence and each story in the dataset.
+
+    Args:
+        input_sentence (str): user story (first sentence)
+
+    Returns:
+        [str, str]: most similar story, weighted score
+    """
+    # Tokenize the user story
+    user_tokens = tokenize(input_sentence)
+
+    # Initialize variables to find the most similar story
+    max_similarity = 0
+    max_score = 0  # default in case no story shares tokens with the input
+    most_similar_story = ''
+
+    # Compare with each story in the dataset
+    for story in dataset_stories:
+        story_tokens = tokenize(story)
+        similarity = jaccard_similarity(user_tokens, story_tokens)
+        if similarity > max_similarity:
+            max_similarity = similarity
+            most_similar_story = story
+            max_score = data['weighted_score'][dataset_stories.index(story)]
+
+    return most_similar_story, max_score
+
+
+
 # Set up inference
-print("
+print("6/7 - Setting parameters for inference")

 # Set the model to evaluation mode
 model.eval()
@@ -40,10 +147,20 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)

 # Restore punct
-def restore_punctuation(text, restorer):
+def restore_punctuation(text: str, restorer: pipeline) -> str:
+    """
+    Restore punctuation to text.
+
+    Args:
+        text (str): full story (first and second sentences)
+        restorer (pipeline): model that restores punctuation
+
+    Returns:
+        str: punctuated text (based on input)
+    """
     # Use the model to predict punctuation
     punctuated_output = restorer(text)
-
+    punct_text = []

     # Define punctuation marks (note: not including left-side because we want space still)
     punctuation_marks = ["!", "?", ".", "-", ":", ";", "'", "’", ",", ")", "]", "}", "…", "”", "’’", "''"]
@@ -53,65 +170,105 @@ def restore_punctuation(text, restorer):

         # If token is punctuation, append to previous token
         if cur_token in punctuation_marks:
-
+            punct_text[-1] += cur_token

         # If previous token is quotations, append to previous token
-        elif
-
+        elif punct_text and punct_text[-1] in ["'", "‘", "’", "“", "‘‘", "’’"]:
+            punct_text[-1] += cur_token

         # If token is a contraction or a quote, append to previous token (no space)
         elif cur_token.lower() in ["s", "t", "re", "ve", "ll", "d", "m"]:
             # Remove space for contractions
-
+            punct_text[-1] += cur_token

         # if prediction is LABEL_0, token should be capitalized
         elif elem.get('entity') == 'LABEL_0':
-
+            punct_text.append(cur_token.capitalize())

         # else if prediction is LABEL_1, token should be lowercase
         # elif elem.get('entity') == 'LABEL_1':
         else:
-
+            punct_text.append(cur_token)

     # If there's no period at the end of the story, add one
-    if
-
+    if punct_text[-1][-1] != '.':
+        punct_text[-1] = punct_text[-1] + '.'

-    return ' '.join(
+    return ' '.join(punct_text)

-def generate_text(input_text):
-
-
+def generate_text(input_text: str, full_sentence: str) -> [str, str, float, str, float]:
+    """
+    Generate the second sentence of the horror story given the first (input_text).

-
-
-
+    Args:
+        input_text (str): first sentence of the horror story
+        full_sentence (str): full story (first and second sentences)

-
-
-
-
-
+    Returns:
+        gen_text_punct (str): second sentence of the horror story
+        similar_story_cosine (str): most similar story (cosine similarity)
+        cosine_score (float): score of most similar story (cosine similarity)
+        similar_story_jaccard (str): most similar story (Jaccard similarity)
+        jaccard_score (float): score of most similar story (Jaccard similarity)
+    """
+    # If user only enters first sentence, generate second sentence
+    if not full_sentence:
+        # Encode the input text
+        input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
+
+        # Generate text
+        with torch.no_grad():
+            output_ids = model.generate(input_ids, max_length=50)
+
+        # Decode the generated text
+        gen_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+        # Restore punctuation
+        gen_text_punct = restore_punctuation(gen_text, punct_restorer)
+        full_sentence = input_text + ' ' + gen_text_punct
+    else:
+        gen_text_punct = "N/A"

-
+    # Calculate Cosine and Jaccard similarity
+    similar_story_cosine, cosine_score = eval_cosine_similarity(full_sentence)
+    similar_story_jaccard, jaccard_score = eval_jaccard_similarity(full_sentence)
+
+    return gen_text_punct, similar_story_cosine, cosine_score, similar_story_jaccard, jaccard_score



 # Create gradio demo
-print("
+print("7/7 - Launching demo")

 title = "👻 🫣 Generate a Two-Sentence Horror Story 😱 👻"
 description = """
 <center>The bot was trained to generate two-sentence horror stories based on r/TwoSentenceHorror. <i>Spooky!</i></center>
 """

-article = "
+article = """
+Check out [the subreddit](https://www.reddit.com/r/TwoSentenceHorror) that this demo is based on. Or, check out the dataset [here](https://www.kaggle.com/datasets/voanthony/two-sentence-horror-jan-2015-apr-2023).
+
+The language model is fine-tuned from ['facebook/bart-base'](https://huggingface.co/facebook/bart-base): we import the pre-trained weights, then update them so the model generates two-sentence horror stories. The model is fine-tuned over 3 epochs to avoid catastrophic forgetting. We also use a separate model (['felflare/bert-restore-punctuation'](https://huggingface.co/felflare/bert-restore-punctuation?text=My+name+is+wolfgang+and+I+live+in+berlin)) to restore punctuation.
+
+For evaluation, the generated story is compared to the most similar Reddit post (using either cosine or Jaccard similarity), and the weighted score of that post is also returned. The score is calculated as the sum of the post score, 10 * the number of comments, and 100 * the number of gilds, and serves as a proxy for the post's popularity.
+
+Users may also enter an entire story in the second input prompt rather than generating the remainder of the story. In that case the story is only run through the evaluation metrics and no text is generated.
+"""


 demo = gr.Interface(
     fn=generate_text,
-    inputs=
-
+    inputs=[
+        gr.Textbox(lines=4, placeholder="Enter the first sentence of your horror story here...", label="First Sentence"),
+        gr.Textbox(lines=4, placeholder="Or, enter full story for evaluation here...", label="Eval - Full Story")
+    ],
+    outputs=[
+        gr.Textbox(lines=4, label="Generated Second Sentence"),
+        gr.Textbox(lines=3, label="Cosine Similarity - Sentence"),
+        gr.Textbox(lines=1, label="Cosine Similarity - Post Score"),
+        gr.Textbox(lines=3, label="Jaccard Similarity - Sentence"),
+        gr.Textbox(lines=1, label="Jaccard Similarity - Post Score")
+    ],
     title=title,
     description=description,
     article=article,
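The two retrieval metrics added in this commit are easy to sanity-check in isolation. Below is a minimal sketch (an illustration, not part of the commit) that mirrors eval_cosine_similarity() and tokenize()/jaccard_similarity() on a two-story toy corpus; the story strings, the query, and the helper name tokens() are invented for the example.

# Illustration only: sanity-checking the two similarity metrics from app.py
# on a made-up two-story corpus.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

corpus = [
    "I tucked my son in and said goodnight. A voice from under the bed said it back.",
    "The last man on Earth sat alone in a room. There was a knock on the door.",
]

# Cosine similarity over TF-IDF vectors, as in eval_cosine_similarity()
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(corpus)
query = vectorizer.transform(["I said goodnight to my son."])
print(cosine_similarity(query, matrix)[0].argmax())  # 0: the first story matches best

# Jaccard similarity over lowercased, punctuation-free token sets,
# as in tokenize() and jaccard_similarity()
def tokens(text: str) -> set:
    return set(text.lower().translate(str.maketrans('', '', string.punctuation)).split())

a, b = tokens("I said goodnight to my son."), tokens(corpus[0])
print(len(a & b) / len(a | b))  # intersection over union, in [0, 1]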
reddit_cleansed_data.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4107fb5ebe3aa92bd7cc775fcdccab5d07f45bce613f184bb8dd0f4ed808e628
+size 20222577
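The CSV itself is stored as a Git LFS pointer, so its contents are not visible in the diff. For reference, the weighted_score formula that app.py applies to it can be sketched on invented rows; only the column names (score, num_comments, gilded_count) come from the code, the values below are hypothetical.

# Illustration only: the weighted_score formula from app.py on invented rows
# using the column names the code expects from reddit_cleansed_data.csv.
import pandas as pd

data = pd.DataFrame({
    "score": [520, 48],
    "num_comments": [31, 5],
    "gilded_count": [1, 0],
})
data["weighted_score"] = data["score"] + (10 * data["num_comments"]) + (100 * data["gilded_count"])
print(data["weighted_score"].tolist())  # [930, 98]: higher means a more popular post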