Spaces:
Running
Running
File size: 4,355 Bytes
0840f0a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import nltk
from nltk.corpus import stopwords

# Ensure the NLTK stopword corpus is available; download it once on first use
# so later calls to stopwords.words('english') do not raise LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
def remove_stopwords(text):
    """
    Strip English stopwords from *text* using NLTK's stopword list.

    The text is lowercased before filtering, so the returned string is
    entirely lowercase.

    Args:
        text (str): Input text.

    Returns:
        str: Lowercased text with stopwords removed.
    """
    blocked = set(stopwords.words('english'))
    tokens = text.lower().split()
    return ' '.join(token for token in tokens if token not in blocked)
def is_exact_match(ngram, sentences):
    """
    Check whether *ngram* occurs as a contiguous, word-aligned sequence in
    every sentence.

    Args:
        ngram (str): The n-gram to search for (words separated by spaces).
        sentences (list): List of sentences (strings) to search in.

    Returns:
        bool: True if the n-gram appears in all sentences (vacuously True
        for an empty sentence list), False otherwise.
    """
    target_words = ngram.split()
    # Hoisted out of the loop: the window width is invariant per call
    # (the original recomputed len(ngram.split()) for every sentence).
    n = len(target_words)
    for sentence in sentences:
        words = sentence.split()
        # Slide a width-n window over the sentence; bail out on the first
        # sentence that lacks the n-gram instead of building every set first.
        if not any(words[i:i + n] == target_words
                   for i in range(len(words) - n + 1)):
            return False
    return True
def is_substring_of_any(ngram, common_ngrams):
    """
    Check whether *ngram* appears as a word-aligned contiguous span inside
    any previously collected common n-gram.

    Args:
        ngram (str): The n-gram to check.
        common_ngrams (list): Previously found common n-grams.

    Returns:
        bool: True if *ngram* matches a word-aligned window of some entry
        in *common_ngrams*, False otherwise.
    """
    width = len(ngram.split())
    for candidate in common_ngrams:
        tokens = candidate.split()
        # Compare against every word-aligned window of the same width, so
        # "cat" does not falsely match inside "catalog".
        windows = (" ".join(tokens[start:start + width])
                   for start in range(len(tokens) - width + 1))
        if any(window == ngram for window in windows):
            return True
    return False
def find_filtered_ngrams(sentences):
    """
    Find all n-grams that have exact matches across all sentences,
    excluding those that are fragments of longer common n-grams.

    Stopwords are removed first; candidate n-grams are generated from the
    first sentence, longest first, so maximal common spans are kept and
    their sub-spans are filtered out by is_substring_of_any.

    Args:
        sentences (list): List of sentences to analyze.

    Returns:
        list: Common n-grams in order of appearance in the first sentence
        (longest first); an empty list for empty input.
    """
    # Guard: the original indexed cleaned_sentences[0] and raised
    # IndexError when called with an empty sentence list.
    if not sentences:
        return []
    cleaned_sentences = [remove_stopwords(sentence) for sentence in sentences]
    words = cleaned_sentences[0].split()
    max_n = len(words)
    all_common_ngrams = []
    # Iterate widths from longest to shortest so shorter n-grams that are
    # already covered by an accepted longer span get rejected.
    for n in range(max_n, 0, -1):
        for i in range(len(words) - n + 1):
            ngram = " ".join(words[i:i + n])
            if (is_exact_match(ngram, cleaned_sentences)
                    and not is_substring_of_any(ngram, all_common_ngrams)):
                all_common_ngrams.append(ngram)
    return all_common_ngrams
def find_relative_order(sentence, common_ngrams):
    """
    Rank the given n-grams by where they first appear in *sentence*.

    Matching is case-insensitive and substring-based; n-grams that do not
    occur in the sentence are dropped.

    Args:
        sentence (str): Sentence to locate the n-grams in.
        common_ngrams (list): N-grams to rank.

    Returns:
        list: (rank, ngram) tuples, rank starting at 1, ordered by first
        occurrence in the sentence.
    """
    haystack = sentence.lower()
    positions = {}
    for gram in common_ngrams:
        idx = haystack.find(gram.lower())
        if idx != -1:
            positions[gram] = idx
    ordered = sorted(positions, key=positions.get)
    return list(enumerate(ordered, start=1))
def find_non_melting_points(sent_list):
    """
    Identify the "non-melting points": n-grams common to every sentence in
    *sent_list*, with pure-punctuation tokens removed, ranked by position
    in the first sentence.

    Args:
        sent_list (list): Sentences (strings) to compare.

    Returns:
        list: (rank, ngram) tuples ordered by appearance in sent_list[0];
        an empty list for empty input.
    """
    # Guard: sent_list[0] below would raise IndexError on empty input.
    if not sent_list:
        return []
    common_ngrams = find_filtered_ngrams(sent_list)

    # Any n-gram that is itself a substring of this punctuation string
    # (e.g. ".", "?!") is treated as noise and dropped.
    punctuation = ".?!.;,:'\"()[]{}-–—...+/\\*^|@#%&_~`"
    # Fix: the original removed items from the list while iterating over
    # it, which skips the element immediately after each removal. Build a
    # new filtered list instead.
    final_list = [gram for gram in common_ngrams if gram not in punctuation]

    sentence = sent_list[0]
    return find_relative_order(sentence, final_list)
# Example usage
# from paraphraser import generate_paraphrase
# from twokenize import tokenize_sentences
# sentences = tokenize_sentences(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
# non_melting_points = find_non_melting_points(sentences)
# print(non_melting_points)
|