Spaces:
Running
Running
File size: 4,355 Bytes
0840f0a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import nltk
from nltk.corpus import stopwords

# Ensure the NLTK stopword corpus is available; download it once on first use
# so later calls to stopwords.words('english') do not raise LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
def remove_stopwords(text):
    """
    Strip English stopwords from *text* using NLTK's stopword list.

    The text is lowercased before filtering, so the returned string is
    entirely lowercase.

    Args:
        text (str): Input text.

    Returns:
        str: Lowercased text with stopwords removed.
    """
    blocked = set(stopwords.words('english'))
    tokens = text.lower().split()
    return ' '.join(token for token in tokens if token not in blocked)
def is_exact_match(ngram, sentences):
    """
    Check whether *ngram* occurs as a contiguous, word-aligned sequence in
    every sentence.

    Args:
        ngram (str): The n-gram to search for (words separated by spaces).
        sentences (list): List of sentences (strings) to search in.

    Returns:
        bool: True if the n-gram appears in all sentences (vacuously True
        for an empty sentence list), False otherwise.
    """
    target_words = ngram.split()
    # Hoisted out of the loop: the window width is invariant per call
    # (the original recomputed len(ngram.split()) for every sentence).
    n = len(target_words)
    for sentence in sentences:
        words = sentence.split()
        # Slide a width-n window over the sentence; bail out on the first
        # sentence that lacks the n-gram instead of building every set first.
        if not any(words[i:i + n] == target_words
                   for i in range(len(words) - n + 1)):
            return False
    return True
def is_substring_of_any(ngram, common_ngrams):
    """
    Check whether *ngram* appears as a word-aligned contiguous span inside
    any previously collected common n-gram.

    Args:
        ngram (str): The n-gram to check.
        common_ngrams (list): Previously found common n-grams.

    Returns:
        bool: True if *ngram* matches a word-aligned window of some entry
        in *common_ngrams*, False otherwise.
    """
    width = len(ngram.split())
    for candidate in common_ngrams:
        tokens = candidate.split()
        # Compare against every word-aligned window of the same width, so
        # "cat" does not falsely match inside "catalog".
        windows = (" ".join(tokens[start:start + width])
                   for start in range(len(tokens) - width + 1))
        if any(window == ngram for window in windows):
            return True
    return False
def find_filtered_ngrams(sentences):
    """
    Find all n-grams that have exact matches across all sentences,
    excluding those that are fragments of longer common n-grams.

    Stopwords are removed first; candidate n-grams are generated from the
    first sentence, longest first, so maximal common spans are kept and
    their sub-spans are filtered out by is_substring_of_any.

    Args:
        sentences (list): List of sentences to analyze.

    Returns:
        list: Common n-grams in order of appearance in the first sentence
        (longest first); an empty list for empty input.
    """
    # Guard: the original indexed cleaned_sentences[0] and raised
    # IndexError when called with an empty sentence list.
    if not sentences:
        return []
    cleaned_sentences = [remove_stopwords(sentence) for sentence in sentences]
    words = cleaned_sentences[0].split()
    max_n = len(words)
    all_common_ngrams = []
    # Iterate widths from longest to shortest so shorter n-grams that are
    # already covered by an accepted longer span get rejected.
    for n in range(max_n, 0, -1):
        for i in range(len(words) - n + 1):
            ngram = " ".join(words[i:i + n])
            if (is_exact_match(ngram, cleaned_sentences)
                    and not is_substring_of_any(ngram, all_common_ngrams)):
                all_common_ngrams.append(ngram)
    return all_common_ngrams
def find_relative_order(sentence, common_ngrams):
    """
    Rank the given n-grams by where they first appear in *sentence*.

    Matching is case-insensitive and substring-based; n-grams that do not
    occur in the sentence are dropped.

    Args:
        sentence (str): Sentence to locate the n-grams in.
        common_ngrams (list): N-grams to rank.

    Returns:
        list: (rank, ngram) tuples, rank starting at 1, ordered by first
        occurrence in the sentence.
    """
    haystack = sentence.lower()
    positions = {}
    for gram in common_ngrams:
        idx = haystack.find(gram.lower())
        if idx != -1:
            positions[gram] = idx
    ordered = sorted(positions, key=positions.get)
    return list(enumerate(ordered, start=1))
def find_non_melting_points(sent_list):
    """
    Identify the "non-melting points": n-grams common to every sentence in
    *sent_list*, with pure-punctuation tokens removed, ranked by position
    in the first sentence.

    Args:
        sent_list (list): Sentences (strings) to compare.

    Returns:
        list: (rank, ngram) tuples ordered by appearance in sent_list[0];
        an empty list for empty input.
    """
    # Guard: sent_list[0] below would raise IndexError on empty input.
    if not sent_list:
        return []
    common_ngrams = find_filtered_ngrams(sent_list)

    # Any n-gram that is itself a substring of this punctuation string
    # (e.g. ".", "?!") is treated as noise and dropped.
    punctuation = ".?!.;,:'\"()[]{}-–—...+/\\*^|@#%&_~`"
    # Fix: the original removed items from the list while iterating over
    # it, which skips the element immediately after each removal. Build a
    # new filtered list instead.
    final_list = [gram for gram in common_ngrams if gram not in punctuation]

    sentence = sent_list[0]
    return find_relative_order(sentence, final_list)
# Example usage
# from paraphraser import generate_paraphrase
# from twokenize import tokenize_sentences
# sentences = tokenize_sentences(generate_paraphrase("I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."))
# non_melting_points = find_non_melting_points(sentences)
# print(non_melting_points)
|