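"""
Utilities for locating "non-melting points": n-grams that survive
paraphrasing, i.e. appear verbatim in every sentence of a paraphrase
set. Stopwords are stripped first, longer common n-grams suppress the
shorter ones they contain, and the survivors are ranked by where they
appear in the first (original) sentence.
"""
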
import nltk
from nltk.corpus import stopwords

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

def remove_stopwords(text):
    """
    Remove stopwords using NLTK's stopword list
    
    Args:
        text (str): Input text
        
    Returns:
        str: Lowercased text with stopwords removed
    """
    stop_words = set(stopwords.words('english'))
    words = text.lower().split()
    return ' '.join([word for word in words if word not in stop_words])
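
# Example (given NLTK's English stopword list):
#   remove_stopwords("This is a test of the system")  ->  'test system'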

def is_exact_match(ngram, sentences):
    """
    Check if the given n-gram has an exact match in all sentences
    
    Args:
        ngram (str): The n-gram to search for
        sentences (list): List of sentences to search in
        
    Returns:
        bool: True if n-gram has exact match in all sentences, False otherwise
    """
    n = len(ngram.split())
    for sentence in sentences:
        words = sentence.split()
        # Collect every n-gram of the same length from this sentence.
        sentence_ngrams = {" ".join(words[i:i+n]) for i in range(len(words) - n + 1)}
        if ngram not in sentence_ngrams:
            return False
    return True
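
# Example:
#   is_exact_match("brown fox", ["quick brown fox", "brown fox runs"])  ->  True
#   is_exact_match("brown fox", ["quick brown fox", "brown dog runs"])  ->  False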

def is_substring_of_any(ngram, common_ngrams):
    """
    Check if the given n-gram is an exact substring of any previously found common n-grams
    
    Args:
        ngram (str): The n-gram to check
        common_ngrams (list): List of previously found common n-grams
        
    Returns:
        bool: True if ngram is a substring of any common_ngrams, False otherwise
    """
    ngram_words = ngram.split()
    for common_gram in common_ngrams:
        common_words = common_gram.split()
        for i in range(len(common_words) - len(ngram_words) + 1):
            if " ".join(common_words[i:i+len(ngram_words)]) == ngram:
                return True
    return False
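
# Example (the words must be contiguous, not merely present):
#   is_substring_of_any("brown fox", ["quick brown fox"])  ->  True
#   is_substring_of_any("quick fox", ["quick brown fox"])  ->  False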

def find_filtered_ngrams(sentences):
    """
    Find all n-grams that have exact matches across all sentences,
    excluding those that are part of larger common n-grams
    
    Args:
        sentences (list): List of sentences to analyze
    
    Returns:
        list: Common n-grams shared by every sentence, longest first;
              any n-gram contained in a longer common n-gram is excluded
    """
    # First, remove stopwords from all sentences
    cleaned_sentences = [remove_stopwords(sentence) for sentence in sentences]
    
    words = cleaned_sentences[0].split()
    max_n = len(words)
    all_common_ngrams = []
    
    for n in range(max_n, 0, -1):
        for i in range(len(words) - n + 1):
            ngram = " ".join(words[i:i+n])
            
            if is_exact_match(ngram, cleaned_sentences) and not is_substring_of_any(ngram, all_common_ngrams):
                all_common_ngrams.append(ngram)
    
    return all_common_ngrams
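
# Example ("the"/"a" are stopwords; "quick brown fox" absorbs its sub-n-grams):
#   find_filtered_ngrams(["The quick brown fox jumps",
#                         "A quick brown fox leaps"])  ->  ['quick brown fox']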

def find_relative_order(sentence, common_ngrams):
    """
    Rank the common n-grams by where they first occur in a sentence
    
    Args:
        sentence (str): The reference sentence
        common_ngrams (list): Common n-grams to locate
        
    Returns:
        list: (rank, ngram) tuples, 1-indexed, sorted by first occurrence;
              n-grams not found in the sentence are dropped
    """
    sentence = sentence.lower()
    ngram_positions = {}
    
    for ngram in common_ngrams:
        ngram_lower = ngram.lower()
        if ngram_lower in sentence:
            ngram_positions[ngram] = sentence.index(ngram_lower)
    
    sorted_ngrams = sorted(ngram_positions.items(), key=lambda x: x[1])
    
    return [(i + 1, ngram) for i, (ngram, _) in enumerate(sorted_ngrams)]
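
# Example (ranked by first occurrence in the sentence):
#   find_relative_order("the quick brown fox", ["brown fox", "quick"])
#   ->  [(1, 'quick'), (2, 'brown fox')]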


def find_non_melting_points(sent_list):
    """
    Find the non-melting points of a set of paraphrases
    
    Args:
        sent_list (list): Sentences to compare (the original first,
                          then its paraphrases)
        
    Returns:
        list: (rank, ngram) tuples of the common n-grams, ordered by
              their position in the first sentence
    """
    # Find filtered n-grams
    common_ngrams = find_filtered_ngrams(sent_list)
    
    def remove_punctuation(ngrams):
        # Build a new list instead of calling list.remove() while
        # iterating, which silently skips elements.
        punctuation = set(".?!;,:'\"()[]{}-–—+/\\*^|@#%&_~`")
        return [item for item in ngrams
                if item and not all(ch in punctuation for ch in item)]
    
    final_list = remove_punctuation(common_ngrams)
    return find_relative_order(sent_list[0], final_list)


# Example usage
# from paraphraser import generate_paraphrase
# from twokenize import tokenize_sentences

# sentences = tokenize_sentences(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
# non_melting_points = find_non_melting_points(sentences)

# print(non_melting_points)
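
# Self-contained demo: the hardcoded sentences below stand in for the
# paraphraser/twokenize output referenced above, so the module can be
# run directly without those dependencies.
if __name__ == "__main__":
    paraphrases = [
        "The quick brown fox jumps over the lazy dog",
        "A quick brown fox leaps over a lazy dog",
        "Quick brown fox hops over the lazy dog",
    ]
    # Expected: [(1, 'quick brown fox'), (2, 'lazy dog')]
    print(find_non_melting_points(paraphrases))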