Spaces:
Running
Running
import nltk | |
from nltk.corpus import stopwords | |
# Make sure the NLTK stopword corpus is present before anything below uses it:
# look it up in the local NLTK data and download it only if the lookup fails.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
def remove_stopwords(text):
    """Strip English stopwords from a piece of text.

    The text is lowercased and split on whitespace; every token that appears
    in NLTK's English stopword list is dropped and the survivors are joined
    back together with single spaces. Punctuation attached to a token is left
    untouched because tokens are compared as whole whitespace-separated words.

    Args:
        text (str): Input text.

    Returns:
        str: Lowercased text with stopword tokens removed.
    """
    english_stopwords = set(stopwords.words('english'))
    tokens = text.lower().split()
    kept_tokens = (token for token in tokens if token not in english_stopwords)
    return ' '.join(kept_tokens)
def is_exact_match(ngram, sentences):
    """Tell whether an n-gram occurs, word for word, in every sentence.

    Each sentence is split on whitespace and all of its word windows with the
    same word count as ``ngram`` are collected; the n-gram matches a sentence
    when it equals one of those windows (words joined by a single space).

    Args:
        ngram (str): The n-gram to search for.
        sentences (list): Sentences (strings) to search in.

    Returns:
        bool: True if every sentence contains the n-gram as a contiguous run
        of whole words (also True when ``sentences`` is empty), else False.
    """
    def word_windows(sentence):
        # All contiguous runs of `width` words in the sentence, as strings.
        tokens = sentence.split()
        width = len(ngram.split())
        last_start = len(tokens) - width
        return {
            " ".join(tokens[start:start + width])
            for start in range(last_start + 1)
        }

    windows_per_sentence = [word_windows(sentence) for sentence in sentences]
    for candidates in windows_per_sentence:
        if ngram not in candidates:
            return False
    return True
def is_substring_of_any(ngram, common_ngrams):
    """Tell whether an n-gram is contained, on word boundaries, in a known one.

    Containment is checked word-wise: the n-gram must equal some contiguous
    run of whole words of a previously found n-gram (an n-gram also counts as
    contained in an identical entry).

    Args:
        ngram (str): The n-gram to check.
        common_ngrams (list): Previously found common n-grams.

    Returns:
        bool: True if ``ngram`` is a word-level sub-sequence of any entry in
        ``common_ngrams``, False otherwise.
    """
    width = len(ngram.split())
    for candidate in common_ngrams:
        tokens = candidate.split()
        window_starts = range(len(tokens) - width + 1)
        if any(" ".join(tokens[start:start + width]) == ngram
               for start in window_starts):
            return True
    return False
def find_filtered_ngrams(sentences):
    """Find the maximal n-grams shared, word for word, by all sentences.

    Stopwords are removed from every sentence first. Candidate n-grams are
    then taken from the first cleaned sentence, longest first, and a candidate
    is kept when it occurs in every cleaned sentence and is not already
    contained in a longer n-gram that was kept earlier.

    Args:
        sentences (list): Sentences (strings) to analyze.

    Returns:
        list: The common n-grams, ordered from longest to shortest and, for
        equal lengths, by position in the first cleaned sentence. Empty when
        ``sentences`` is empty.
    """
    # Nothing to compare: avoid indexing into an empty list below.
    if not sentences:
        return []

    # First, remove stopwords from all sentences
    cleaned_sentences = [remove_stopwords(sentence) for sentence in sentences]
    words = cleaned_sentences[0].split()

    all_common_ngrams = []
    # Longest candidates first so shorter ones can be filtered against them.
    for n in range(len(words), 0, -1):
        for i in range(len(words) - n + 1):
            ngram = " ".join(words[i:i + n])
            # Cheap containment check first; the cross-sentence match below
            # re-tokenizes every sentence and is far more expensive.
            if is_substring_of_any(ngram, all_common_ngrams):
                continue
            if is_exact_match(ngram, cleaned_sentences):
                all_common_ngrams.append(ngram)
    return all_common_ngrams
def find_relative_order(sentence, common_ngrams):
    """Rank n-grams by where they first occur in a sentence.

    Matching is case-insensitive and character-based: each n-gram is searched
    for as a plain substring of the lowercased sentence. N-grams that do not
    occur are left out of the result.

    Args:
        sentence (str): Sentence to search in.
        common_ngrams (list): N-grams (strings) to locate.

    Returns:
        list: ``(rank, ngram)`` tuples with ranks starting at 1, ordered by
        the character offset of each n-gram's first occurrence.
    """
    haystack = sentence.lower()

    first_offsets = {}
    for phrase in common_ngrams:
        offset = haystack.find(phrase.lower())
        if offset != -1:
            first_offsets[phrase] = offset

    # Stable sort: n-grams sharing an offset keep their input order.
    ranked_phrases = sorted(first_offsets, key=first_offsets.get)
    return list(enumerate(ranked_phrases, start=1))
# Characters treated as punctuation when pruning punctuation-only n-grams.
# NOTE(review): the original literal contained "ββ" right after the hyphen,
# which looks like mis-decoded en/em dashes; they are written as escapes here.
# Confirm against the upstream file.
_PUNCTUATION_CHARS = frozenset(
    ".?!.;,:'\"()[]{}-\u2013\u2014...+/\\*^|@#%&_~`"
)


def _drop_punctuation_ngrams(ngrams):
    """Return a new list without n-grams made up solely of punctuation."""
    return [
        ngram for ngram in ngrams
        if not all(char in _PUNCTUATION_CHARS for char in ngram)
    ]


def find_non_melting_points(sent_list):
    """Rank the n-grams shared by all sentences by position in the first one.

    The common n-grams are computed with ``find_filtered_ngrams``, entries
    consisting only of punctuation characters are discarded, and the rest are
    ranked by where they occur in the first (unmodified) sentence.

    Args:
        sent_list (list): Sentences (strings) to compare.

    Returns:
        list: ``(rank, ngram)`` tuples as produced by ``find_relative_order``
        for the first sentence. Empty when ``sent_list`` is empty.
    """
    # No sentences means no common n-grams (and no first sentence to index).
    if not sent_list:
        return []

    # Find filtered n-grams
    common_ngrams = find_filtered_ngrams(sent_list)

    # Build a new list instead of removing items from the list being
    # iterated, which silently skipped the element after each removal.
    final_list = _drop_punctuation_ngrams(common_ngrams)

    return find_relative_order(sent_list[0], final_list)
# Example usage
# from paraphraser import generate_paraphrase
# from twokenize import tokenize_sentences
# sentences = tokenize_sentences(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
# non_melting_points = find_non_melting_points(sentences)
# print(non_melting_points)