Spaces:

jgyasu
/

text-paraphraser

Running

App Files Files Community

jgyasu committed 24 days ago

Commit

b265c4f

•

1 Parent(s): 02d0f22

Create app.py

Browse files

Files changed (1) hide show

app.py +522 -0

app.py ADDED Viewed

	@@ -0,0 +1,522 @@

+# -*- coding: utf-8 -*-
+"""watermark_intern.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1SyerXj0c3UyLSYmdL4TBBzWhwvMJ3JwJ
+"""
+import gradio as gr
+# import streamlit as st
+from transformers import AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM
+import plotly.graph_objects as go
+from transformers import pipeline
+import re
+import time
+import requests
+from PIL import Image
+import itertools
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib
+from matplotlib.colors import ListedColormap, rgb2hex
+import ipywidgets as widgets
+from IPython.display import display, HTML
+import pandas as pd
+from pprint import pprint
+from tenacity import retry
+from tqdm import tqdm
+# import tiktoken
+import scipy.stats
+import torch
+from transformers import GPT2LMHeadModel
+import seaborn as sns
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+# from colorama import Fore, Style
+# import openai
+import random
+from nltk.corpus import stopwords
+from termcolor import colored
+import nltk
+from nltk.translate.bleu_score import sentence_bleu
+from transformers import BertTokenizer, BertModel
+import nltk
+nltk.download('stopwords')
+# Function to Initialize the Model
+def init_model():
+    para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+    para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+    return para_tokenizer, para_model
+# Function to Paraphrase the Text
+def paraphrase(question, para_tokenizer, para_model, num_beams=5, num_beam_groups=5, num_return_sequences=5, repetition_penalty=10.0, diversity_penalty=3.0, no_repeat_ngram_size=2, temperature=0.7, max_length=64):
+    input_ids = para_tokenizer(
+        f'paraphrase: {question}',
+        return_tensors="pt", padding="longest",
+        max_length=max_length,
+        truncation=True,
+    ).input_ids
+    outputs = para_model.generate(
+        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
+        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
+        num_beams=num_beams, num_beam_groups=num_beam_groups,
+        max_length=max_length, diversity_penalty=diversity_penalty
+    )
+    res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+    return res
+# Function to Find the Longest Common Substring Words Subsequence
+def longest_common_subss(original_sentence, paraphrased_sentences):
+    stop_words = set(stopwords.words('english'))
+    original_sentence_lower = original_sentence.lower()
+    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+    paraphrased_sentences_no_stopwords = []
+    for sentence in paraphrased_sentences_lower:
+        words = re.findall(r'\b\w+\b', sentence)
+        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+        paraphrased_sentences_no_stopwords.append(filtered_sentence)
+    results = []
+    for sentence in paraphrased_sentences_no_stopwords:
+        common_words = set(original_sentence_lower.split()) & set(sentence.split())
+        for word in common_words:
+            sentence = sentence.replace(word, colored(word, 'green'))
+        results.append({
+            "Original Sentence": original_sentence_lower,
+            "Paraphrased Sentence": sentence,
+            "Substrings Word Pair": common_words
+        })
+    return results
+# Function to Find Common Substring Word between each paraphrase sentences
+def common_substring_word(original_sentence, paraphrased_sentences):
+    stop_words = set(stopwords.words('english'))
+    original_sentence_lower = original_sentence.lower()
+    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+    paraphrased_sentences_no_stopwords = []
+    for sentence in paraphrased_sentences_lower:
+        words = re.findall(r'\b\w+\b', sentence)
+        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+        paraphrased_sentences_no_stopwords.append(filtered_sentence)
+    results = []
+    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+        common_words = set(original_sentence_lower.split()) & set(sentence.split())
+        common_substrings = ', '.join(sorted(common_words))
+        for word in common_words:
+            sentence = sentence.replace(word, colored(word, 'green'))
+        results.append({
+            f"Paraphrased Sentence {idx+1}": sentence,
+            "Common Substrings": common_substrings
+        })
+    return results
+# Function to Watermark a Word Take Randomly Between Each lcs Point (Random Sampling)
+def random_sampling(original_sentence, paraphrased_sentences):
+    stop_words = set(stopwords.words('english'))
+    original_sentence_lower = original_sentence.lower()
+    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+    paraphrased_sentences_no_stopwords = []
+    for sentence in paraphrased_sentences_lower:
+        words = re.findall(r'\b\w+\b', sentence)
+        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+        paraphrased_sentences_no_stopwords.append(filtered_sentence)
+    results = []
+    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+        common_words = set(original_sentence_lower.split()) & set(sentence.split())
+        common_substrings = ', '.join(sorted(common_words))
+        words_to_replace = [word for word in sentence.split() if word not in common_words]
+        if words_to_replace:
+            word_to_mark = random.choice(words_to_replace)
+            sentence = sentence.replace(word_to_mark, colored(word_to_mark, 'red'))
+        for word in common_words:
+            sentence = sentence.replace(word, colored(word, 'green'))
+        results.append({
+            f"Paraphrased Sentence {idx+1}": sentence,
+            "Common Substrings": common_substrings
+        })
+    return results
+# Function for Inverse Transform Sampling
+def inverse_transform_sampling(original_sentence, paraphrased_sentences):
+    stop_words = set(stopwords.words('english'))
+    original_sentence_lower = original_sentence.lower()
+    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+    paraphrased_sentences_no_stopwords = []
+    for sentence in paraphrased_sentences_lower:
+        words = re.findall(r'\b\w+\b', sentence)
+        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+        paraphrased_sentences_no_stopwords.append(filtered_sentence)
+    results = []
+    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+        common_words = set(original_sentence_lower.split()) & set(sentence.split())
+        common_substrings = ', '.join(sorted(common_words))
+        words_to_replace = [word for word in sentence.split() if word not in common_words]
+        if words_to_replace:
+            probabilities = [1 / len(words_to_replace)] * len(words_to_replace)
+            chosen_word = random.choices(words_to_replace, weights=probabilities)[0]
+            sentence = sentence.replace(chosen_word, colored(chosen_word, 'magenta'))
+        for word in common_words:
+            sentence = sentence.replace(word, colored(word, 'green'))
+        results.append({
+            f"Paraphrased Sentence {idx+1}": sentence,
+            "Common Substrings": common_substrings
+        })
+    return results
+# Function for Contextual Sampling
+def contextual_sampling(original_sentence, paraphrased_sentences):
+    stop_words = set(stopwords.words('english'))
+    original_sentence_lower = original_sentence.lower()
+    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+    paraphrased_sentences_no_stopwords = []
+    for sentence in paraphrased_sentences_lower:
+        words = re.findall(r'\b\w+\b', sentence)
+        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+        paraphrased_sentences_no_stopwords.append(filtered_sentence)
+    results = []
+    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+        common_words = set(original_sentence_lower.split()) & set(sentence.split())
+        common_substrings = ', '.join(sorted(common_words))
+        words_to_replace = [word for word in sentence.split() if word not in common_words]
+        if words_to_replace:
+            context = " ".join([word for word in sentence.split() if word not in common_words])
+            chosen_word = random.choice(words_to_replace)
+            sentence = sentence.replace(chosen_word, colored(chosen_word, 'red'))
+        for word in common_words:
+            sentence = sentence.replace(word, colored(word, 'green'))
+        results.append({
+            f"Paraphrased Sentence {idx+1}": sentence,
+            "Common Substrings": common_substrings
+        })
+    return results
+# Function for Exponential Minimum Sampling
+def exponential_minimum_sampling(original_sentence, paraphrased_sentences):
+    stop_words = set(stopwords.words('english'))
+    original_sentence_lower = original_sentence.lower()
+    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+    paraphrased_sentences_no_stopwords = []
+    for sentence in paraphrased_sentences_lower:
+        words = re.findall(r'\b\w+\b', sentence)
+        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+        paraphrased_sentences_no_stopwords.append(filtered_sentence)
+    results = []
+    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+        common_words = set(original_sentence_lower.split()) & set(sentence.split())
+        common_substrings = ', '.join(sorted(common_words))
+        words_to_replace = [word for word in sentence.split() if word not in common_words]
+        if words_to_replace:
+            num_words = len(words_to_replace)
+            probabilities = [2 ** (-i) for i in range(num_words)]
+            chosen_word = random.choices(words_to_replace, weights=probabilities)[0]
+            sentence = sentence.replace(chosen_word, colored(chosen_word, 'red'))
+        for word in common_words:
+            sentence = sentence.replace(word, colored(word, 'green'))
+        results.append({
+            f"Paraphrased Sentence {idx+1}": sentence,
+            "Common Substrings": common_substrings
+        })
+    return results
+# Function to Calculate the BLEU score
+def calculate_bleu(reference, candidate):
+    return sentence_bleu([reference], candidate)
+# Function to calculate BERT score
+def calculate_bert(reference, candidate):
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    model = BertModel.from_pretrained('bert-base-uncased')
+    reference_tokens = tokenizer.tokenize(reference)
+    candidate_tokens = tokenizer.tokenize(candidate)
+    reference_ids = tokenizer.encode(reference, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")
+    candidate_ids = tokenizer.encode(candidate, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")
+    with torch.no_grad():
+        reference_outputs = model(reference_ids)
+        candidate_outputs = model(candidate_ids)
+    reference_embeddings = reference_outputs[0][:, 0, :].numpy()
+    candidate_embeddings = candidate_outputs[0][:, 0, :].numpy()
+    cosine_similarity = np.dot(reference_embeddings, candidate_embeddings.T) / (np.linalg.norm(reference_embeddings) * np.linalg.norm(candidate_embeddings))
+    return np.mean(cosine_similarity)
+# Function to calculate minimum edit distance
+def min_edit_distance(reference, candidate):
+    m = len(reference)
+    n = len(candidate)
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+    for i in range(m + 1):
+        for j in range(n + 1):
+            if i == 0:
+                dp[i][j] = j
+            elif j == 0:
+                dp[i][j] = i
+            elif reference[i - 1] == candidate[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1]
+            else:
+                dp[i][j] = 1 + min(dp[i][j - 1],         # Insert
+                                   dp[i - 1][j],         # Remove
+                                   dp[i - 1][j - 1])    # Replace
+    return dp[m][n]
+def generate_paraphrase(question):
+    para_tokenizer, para_model = init_model()
+    res = paraphrase(question, para_tokenizer, para_model)
+    return res
+# Notebook-style smoke test, executed at import time: paraphrases a sample
+# sentence and computes its shared words.
+# question = "The official position of the United States on the Russia Ukraine war has been consistent in supporting Ukraine ’s sovereignty , territorial integrity, and the peaceful resolution of the conflict."
+question = "Following the declaration of the State of Israel in 1948, neighboring Arab states invaded. The war ended with Israel controlling a significant portion of the territory. Many Palestinians became refugees."
+res = generate_paraphrase(question)
+# Bare expression: displays the value in a notebook cell, no effect in a script.
+res
+# The return value is discarded here (it was only displayed in the notebook).
+longest_common_subss(question, res)
+import nltk
+nltk.download('punkt')
+import re
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+def non_melting_points(original_sentence, paraphrased_sentences):
+    stop_words = set(stopwords.words('english'))
+    def tokenize_and_filter(sentence):
+        words = word_tokenize(sentence.lower())
+        filtered_words = {word for word in words if word.isalpha() and word not in stop_words}
+        return filtered_words
+    original_words = tokenize_and_filter(original_sentence)
+    paraphrased_words_list = [tokenize_and_filter(sentence) for sentence in paraphrased_sentences]
+    common_words = original_words
+    for words in paraphrased_words_list:
+        common_words &= words
+    return common_words
+#Function to get the first sentence from a paragraph
+import re
+def get_first_sentence(paragraph):
+    match = re.search(r'([^.]*\.[\s]*[A-Z])', paragraph)
+    if match:
+        first_sentence = match.group(0)
+        first_sentence = first_sentence.strip()
+        if len(first_sentence.split('.')) > 1:
+            return first_sentence.split('.')[0] + '.'
+        return first_sentence
+    else:
+        return paragraph
+#Initializing llama3
+# import json
+# import torch
+# from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline)
+# config_data = json.load(open("config.json"))
+# HF_TOKEN = config_data["HF_TOKEN"]
+# model_name = "meta-llama/Meta-Llama-3-8B"
+# bnb_config = BitsAndBytesConfig(
+#     load_in_4bit=True,
+#     bnb_4bit_use_double_quant=True,
+#     bnb_4bit_quant_type="nf4",
+#     bnb_4bit_compute_dtype=torch.bfloat16
+# )
+# tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+# tokenizer.pad_token = tokenizer.eos_token
+# model = AutoModelForCausalLM.from_pretrained(
+#     model_name,
+#     device_map="auto",
+#     quantization_config=bnb_config,
+#     token=HF_TOKEN
+# )
+# text_generator = pipeline(
+#     "text-generation",
+#     model=model,
+#     tokenizer=tokenizer,
+#     max_new_tokens=512,
+# )
+# # llm_result = text_generator("write about nazism")
+# llm_result
+# llm_result[0]["generated_text"].split('.')
+#Finds LCS
+import re
+from nltk.corpus import stopwords
+def find_common_subsequences(sentence, str_list):
+    stop_words = set(stopwords.words('english'))
+    sentence = sentence.lower()
+    str_list = [s.lower() for s in str_list]
+    def is_present(lcs, str_list):
+        for string in str_list:
+            if lcs not in string:
+                return False
+        return True
+    def remove_stop_words_and_special_chars(sentence):
+        sentence = re.sub(r'[^\w\s]', '', sentence)
+        words = sentence.split()
+        filtered_words = [word for word in words if word.lower() not in stop_words]
+        return " ".join(filtered_words)
+    sentence = remove_stop_words_and_special_chars(sentence)
+    str_list = [remove_stop_words_and_special_chars(s) for s in str_list]
+    words = sentence.split(" ")
+    common_grams = []
+    added_phrases = set()
+    def is_covered(subseq, added_phrases):
+        for phrase in added_phrases:
+            if subseq in phrase:
+                return True
+        return False
+    for i in range(len(words) - 4):
+        penta = " ".join(words[i:i+5])
+        if is_present(penta, str_list):
+            common_grams.append(penta)
+            added_phrases.add(penta)
+    for i in range(len(words) - 3):
+        quad = " ".join(words[i:i+4])
+        if is_present(quad, str_list) and not is_covered(quad, added_phrases):
+            common_grams.append(quad)
+            added_phrases.add(quad)
+    for i in range(len(words) - 2):
+        tri = " ".join(words[i:i+3])
+        if is_present(tri, str_list) and not is_covered(tri, added_phrases):
+            common_grams.append(tri)
+            added_phrases.add(tri)
+    for i in range(len(words) - 1):
+        bi = " ".join(words[i:i+2])
+        if is_present(bi, str_list) and not is_covered(bi, added_phrases):
+            common_grams.append(bi)
+            added_phrases.add(bi)
+    for i in range(len(words)):
+        uni = words[i]
+        if is_present(uni, str_list) and not is_covered(uni, added_phrases):
+            common_grams.append(uni)
+            added_phrases.add(uni)
+    return common_grams
+# Notebook-style smoke test, executed at import time: paraphrases a second
+# sample sentence and extracts the n-grams shared with its first three
+# paraphrases. Rebinds the module-level names `question` and `res`.
+question = '''the colorado republican party sent a mass email last week with the subject line "god hates pride"'''
+res = generate_paraphrase(question)
+# Bare expression: displays the value in a notebook cell, no effect in a script.
+res
+common_grams = find_common_subsequences(question, res[0:3])
+# Bare expression (notebook display only).
+common_grams
+# Flatten the n-grams into their individual words.
+common_gram_words = [word for gram in common_grams for word in gram.split()]
+# Bare expression (notebook display only).
+common_gram_words
+import re
+def llm_output(prompt):
+    # sequences = text_generator(prompt)
+    # gen_text = sequences[0]["generated_text"]
+    # sentences = gen_text.split('.')
+    # # first_sentence = get_first_sentence(gen_text[len(prompt):])
+    # return gen_text,sentences[-3]
+    return prompt,prompt
+import re
+def generate_html_output(results,common_grams,common_gram_words):
+    html_output = "<table border='1'>"
+    html_output += "<tr><th>Original Sentence</th><th>Paraphrased Sentence</th><th>Common Substrings</th><th>Non Melting Points</th></tr>"
+    for result in results:
+        original_sentence = result[f"Original Sentence"]
+        paraphrased_sentence = result[f"Paraphrased Sentence"]
+        common_substrings = result[f"Substrings Word Pair"]
+        # Highlight common substrings in the paraphrased sentence
+        for word in common_gram_words:
+            paraphrased_sentence = re.sub(r'\b' + re.escape(word) + r'\b', f'<span style="color:green">{word}</span>', paraphrased_sentence, flags=re.IGNORECASE)
+        html_output += f"<tr><td>{original_sentence}</td><td>{paraphrased_sentence}</td><td>{common_substrings}</td><td>{common_grams}</td></tr>"
+    html_output += "</table>"
+    return html_output
+def model(prompt):
+    generated,sentence = llm_output(prompt)
+    res = generate_paraphrase(sentence)
+    common_subs = longest_common_subss(sentence,res)
+    non_melting  = non_melting_points(sentence, res)
+    common_grams = find_common_subsequences(sentence,res)
+    common_gram_words = [word for gram in common_grams for word in gram.split()]
+    for i in range(len(common_subs)):
+        common_subs[i]["Paraphrased Sentence"] = res[i]
+    result = generate_html_output(common_subs,common_grams,common_gram_words)
+    return generated, result
+# final = model(question)
+import gradio as gr
+# Gradio UI: one text input, wired to `model`, which returns the generated
+# text and the HTML comparison table (shown in the two outputs, in order).
+demo = gr.Interface(
+    fn=model,
+    inputs=gr.Textbox(label="User Prompt"),
+    outputs=[gr.Textbox(label="AI-generated Text (Llama3)"), gr.HTML()],
+    title="Paraphrases the Text and Highlights the Non-melting Points",
+    theme=gr.themes.Soft()
+)
+# Starts the app at import time.
+# NOTE(review): share=True asks for a public share link; this is presumably
+# unnecessary when the app is hosted on Spaces - confirm.
+demo.launch(share=True)