import nltk
from nltk.corpus import stopwords

# Download the stopword list on first use only.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


def remove_stopwords(text):
    """
    Remove stopwords using NLTK's English stopword list.

    Args:
        text (str): Input text
    Returns:
        str: Lowercased text with stopwords removed
    """
    stop_words = set(stopwords.words('english'))
    words = text.lower().split()
    return ' '.join(word for word in words if word not in stop_words)


def is_exact_match(ngram, sentences):
    """
    Check whether the given n-gram appears verbatim in every sentence.

    Args:
        ngram (str): The n-gram to search for
        sentences (list): List of sentences to search in
    Returns:
        bool: True if the n-gram has an exact match in all sentences, False otherwise
    """
    n = len(ngram.split())
    sentence_ngrams = []
    for sentence in sentences:
        words = sentence.split()
        current_ngrams = []
        for i in range(len(words) - n + 1):
            current_ngrams.append(" ".join(words[i:i + n]))
        sentence_ngrams.append(set(current_ngrams))
    return all(ngram in sent_ngrams for sent_ngrams in sentence_ngrams)


def is_substring_of_any(ngram, common_ngrams):
    """
    Check whether the given n-gram is contained in any previously found common n-gram.

    Args:
        ngram (str): The n-gram to check
        common_ngrams (list): List of previously found common n-grams
    Returns:
        bool: True if ngram is a contiguous sub-sequence of any entry in common_ngrams,
        False otherwise
    """
    ngram_words = ngram.split()
    for common_gram in common_ngrams:
        common_words = common_gram.split()
        for i in range(len(common_words) - len(ngram_words) + 1):
            if " ".join(common_words[i:i + len(ngram_words)]) == ngram:
                return True
    return False


def find_filtered_ngrams(sentences):
    """
    Find all n-grams that appear in every sentence, excluding those already
    covered by a longer common n-gram.

    Args:
        sentences (list): List of sentences to analyze
    Returns:
        list: Common n-grams, longest first, built from the first sentence
    """
    # Remove stopwords from all sentences before comparing n-grams.
    cleaned_sentences = [remove_stopwords(sentence) for sentence in sentences]
    words = cleaned_sentences[0].split()
    max_n = len(words)
    all_common_ngrams = []
    # Scan from the longest possible n-gram down to unigrams so that shorter
    # n-grams contained in an already-found longer one are skipped.
    for n in range(max_n, 0, -1):
        for i in range(len(words) - n + 1):
            ngram = " ".join(words[i:i + n])
            if is_exact_match(ngram, cleaned_sentences) and not is_substring_of_any(ngram, all_common_ngrams):
                all_common_ngrams.append(ngram)
    return all_common_ngrams


def find_relative_order(sentence, common_ngrams):
    """
    Rank the common n-grams by their first occurrence in the given sentence.

    Args:
        sentence (str): Sentence used to order the n-grams (typically the first sentence)
        common_ngrams (list): Common n-grams to locate
    Returns:
        list: (rank, n-gram) tuples with rank starting at 1; n-grams not found
        in the sentence are dropped
    """
    sentence = sentence.lower()
    ngram_positions = {}
    for ngram in common_ngrams:
        ngram_lower = ngram.lower()
        if ngram_lower in sentence:
            ngram_positions[ngram] = sentence.index(ngram_lower)
    sorted_ngrams = sorted(ngram_positions.items(), key=lambda x: x[1])
    return [(i + 1, ngram) for i, (ngram, _) in enumerate(sorted_ngrams)]


def find_non_melting_points(sent_list):
    """
    Find the "non-melting points": n-grams shared verbatim by all sentences,
    ordered by where they appear in the first sentence.
    """
    # Find the n-grams common to all sentences.
    common_ngrams = find_filtered_ngrams(sent_list)

    def remove_punctuation(ngrams):
        # Drop n-grams that are bare punctuation marks. Build a new list instead
        # of mutating the list while iterating over it (which skips elements).
        punctuation = ".?!;,:'\"()[]{}-–—...+/\\*^|@#%&_~`"
        return [item for item in ngrams if item not in punctuation]

    final_list = remove_punctuation(common_ngrams)
    sentence = sent_list[0]
    non_melting_points = find_relative_order(sentence, final_list)
    return non_melting_points


# Example usage
# from paraphraser import generate_paraphrase
# from twokenize import tokenize_sentences
# sentences = tokenize_sentences(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
# non_melting_points = find_non_melting_points(sentences)
# print(non_melting_points)
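

# Minimal self-contained sketch of the pipeline above, for when the paraphrases are
# already available as plain strings. The hand-written sentences below are hypothetical
# stand-ins for the paraphraser/twokenize output referenced in the commented-out example.
if __name__ == "__main__":
    example_sentences = [
        "the quick brown fox jumps over the lazy dog",
        "a quick brown fox leaps over a lazy dog",
        "the quick brown fox hops over the lazy dog",
    ]
    # Prints (rank, n-gram) tuples ordered by position in the first sentence,
    # e.g. [(1, 'quick brown fox'), (2, 'lazy dog')].
    print(find_non_melting_points(example_sentences))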