# aiisc-watermarking-model / non_melting_points.py
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available; download it on first use
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

def remove_stopwords(text):
    """
    Remove stopwords using NLTK's English stopword list

    Args:
        text (str): Input text

    Returns:
        str: Lowercased text with stopwords removed
    """
    stop_words = set(stopwords.words('english'))
    words = text.lower().split()
    return ' '.join([word for word in words if word not in stop_words])
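
# For example (assuming NLTK's English stopword list, where "the" and
# "over" are stopwords):
#   remove_stopwords("The quick brown fox jumps over the lazy dog")
#   -> "quick brown fox jumps lazy dog"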

def is_exact_match(ngram, sentences):
    """
    Check whether the given n-gram appears verbatim in every sentence

    Args:
        ngram (str): The n-gram to search for
        sentences (list): List of sentences to search in

    Returns:
        bool: True if the n-gram has an exact match in all sentences, False otherwise
    """
    n = len(ngram.split())
    sentence_ngrams = []
    for sentence in sentences:
        words = sentence.split()
        # Collect every n-gram of the same length from this sentence
        current_ngrams = {" ".join(words[i:i + n]) for i in range(len(words) - n + 1)}
        sentence_ngrams.append(current_ngrams)
    return all(ngram in sent_ngrams for sent_ngrams in sentence_ngrams)
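
# For example:
#   is_exact_match("brown fox", ["the brown fox runs", "a brown fox sleeps"])  -> True
#   is_exact_match("brown fox", ["the brown fox runs", "a red fox sleeps"])    -> False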

def is_substring_of_any(ngram, common_ngrams):
    """
    Check whether the given n-gram is a contiguous word-level sub-sequence
    of any previously found common n-gram

    Args:
        ngram (str): The n-gram to check
        common_ngrams (list): List of previously found common n-grams

    Returns:
        bool: True if ngram is contained in any of common_ngrams, False otherwise
    """
    ngram_words = ngram.split()
    for common_gram in common_ngrams:
        common_words = common_gram.split()
        # Slide a window of the same length over the longer n-gram
        for i in range(len(common_words) - len(ngram_words) + 1):
            if " ".join(common_words[i:i + len(ngram_words)]) == ngram:
                return True
    return False
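
# For example:
#   is_substring_of_any("brown fox", ["quick brown fox"])  -> True
#   is_substring_of_any("brown dog", ["quick brown fox"])  -> False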

def find_filtered_ngrams(sentences):
    """
    Find all n-grams that have exact matches across all sentences,
    excluding those that are contained in a longer common n-gram

    Args:
        sentences (list): List of sentences to analyze

    Returns:
        list: Common n-grams, longest first; candidates are drawn from the
              first sentence, so the sentences are assumed to be paraphrases of it
    """
    # First, remove stopwords (and lowercase) so only content words are compared
    cleaned_sentences = [remove_stopwords(sentence) for sentence in sentences]
    words = cleaned_sentences[0].split()
    max_n = len(words)
    all_common_ngrams = []
    # Work from the longest candidates down so that shorter n-grams already
    # covered by a longer match are filtered out by is_substring_of_any
    for n in range(max_n, 0, -1):
        for i in range(len(words) - n + 1):
            ngram = " ".join(words[i:i + n])
            if is_exact_match(ngram, cleaned_sentences) and not is_substring_of_any(ngram, all_common_ngrams):
                all_common_ngrams.append(ngram)
    return all_common_ngrams
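
# For example (after stopword removal both inputs reduce to
# "quick brown fox ...", so only the shared trigram survives):
#   find_filtered_ngrams(["The quick brown fox jumps", "A quick brown fox sleeps"])
#   -> ["quick brown fox"]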

def find_relative_order(sentence, common_ngrams):
    """
    Locate each common n-gram in the sentence and rank them by position

    Args:
        sentence (str): Sentence to search in
        common_ngrams (list): Common n-grams to locate

    Returns:
        list: (rank, ngram) tuples ordered by first occurrence in the sentence;
              n-grams that do not occur in the sentence are dropped
    """
    sentence = sentence.lower()
    ngram_positions = {}
    for ngram in common_ngrams:
        ngram_lower = ngram.lower()
        if ngram_lower in sentence:
            ngram_positions[ngram] = sentence.index(ngram_lower)
    # Sort by character position, then replace positions with 1-based ranks
    sorted_ngrams = sorted(ngram_positions.items(), key=lambda x: x[1])
    result = [(i + 1, ngram) for i, (ngram, _) in enumerate(sorted_ngrams)]
    return result
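
# For example:
#   find_relative_order("The quick brown fox jumps over the lazy dog",
#                       ["lazy dog", "quick brown fox"])
#   -> [(1, "quick brown fox"), (2, "lazy dog")]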

def find_non_melting_points(sent_list):
    """
    Find the "non-melting points" of sent_list: n-grams shared verbatim by
    every sentence, ranked by their position in the first sentence
    """
    # Find filtered n-grams common to all sentences
    common_ngrams = find_filtered_ngrams(sent_list)

    def remove_punctuation(ngrams):
        # Drop tokens that are purely punctuation; building a new list avoids
        # the skipped-element bug of calling list.remove() while iterating
        punctuation = ".?!.;,:'\"()[]{}-–—...+/\\*^|@#%&_~`"
        return [item for item in ngrams if item not in punctuation]

    final_list = remove_punctuation(common_ngrams)
    sentence = sent_list[0]
    non_melting_points = find_relative_order(sentence, final_list)
    return non_melting_points
# Example usage
# from paraphraser import generate_paraphrase
# from twokenize import tokenize_sentences
# sentences = tokenize_sentences(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
# non_melting_points = find_non_melting_points(sentences)
# print(non_melting_points)
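
# Minimal self-contained demo; the hand-written paraphrases below are
# stand-ins for the output of generate_paraphrase + tokenize_sentences above
if __name__ == "__main__":
    sample_sentences = [
        "The quick brown fox jumps over the lazy dog",
        "A quick brown fox leaps over the lazy dog",
        "Quick brown fox hops over the lazy dog",
    ]
    print(find_non_melting_points(sample_sentences))
    # -> [(1, 'quick brown fox'), (2, 'lazy dog')]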