import re
from nltk.corpus import stopwords
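# Note (assumption about the runtime environment, not stated in the original):
# stopwords.words('english') requires the NLTK stopword corpus to be present
# locally. If it is missing, a one-time download fixes it; kept commented out
# here so importing this module has no side effects:
# import nltk
# nltk.download('stopwords')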
def find_common_subsequences(sentence, str_list):
    """Return the longest common n-grams (up to 5 words) shared by `sentence`
    and every string in `str_list`, ignoring stop words and punctuation.
    Each result is a (rank, phrase) tuple ordered by first appearance in the
    original sentence."""
    stop_words = set(stopwords.words('english'))
    sentence = sentence.lower()
    str_list = [s.lower() for s in str_list]

    def is_present(subseq, str_list):
        # Match the subsequence as whole words in every candidate string
        subseq_regex = re.compile(r'\b' + re.escape(subseq) + r'\b')
        return all(subseq_regex.search(s) for s in str_list)

    def remove_stop_words_and_special_chars(sentence):
        # Strip punctuation, then drop stop words
        sentence = re.sub(r'[^\w\s]', '', sentence)
        words = sentence.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return " ".join(filtered_words)

    cleaned_sentence = remove_stop_words_and_special_chars(sentence)
    cleaned_str_list = [remove_stop_words_and_special_chars(s) for s in str_list]

    words = cleaned_sentence.split()
    common_grams = []
    added_phrases = set()

    # Check n-grams from size 5 down to 1 so longer matches win; skip any
    # n-gram that is already contained in a longer accepted phrase
    for n in range(5, 0, -1):
        for i in range(len(words) - n + 1):
            subseq = " ".join(words[i:i + n])
            if is_present(subseq, cleaned_str_list) and not any(subseq in phrase for phrase in added_phrases):
                common_grams.append((i, subseq))
                added_phrases.add(subseq)

    # Sort by the first appearance in the original sentence
    common_grams.sort(key=lambda x: x[0])

    # Assign 1-based indices based on the sorted order
    indexed_common_grams = [(index + 1, subseq) for index, (_, subseq) in enumerate(common_grams)]

    return indexed_common_grams
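# Illustrative sketch (inputs made up for demonstration, not from the original):
# with the NLTK stopword data available, a toy call should yield
#
#   find_common_subsequences(
#       "The quick brown fox jumps",
#       ["A quick brown fox appears", "Quick brown fox runs away"],
#   )
#   # -> [(1, 'quick brown fox')]
#
# "the"/"a" are dropped as stop words, and shorter sub-grams of an accepted
# phrase ("quick brown", "fox", ...) are skipped.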
def find_common_gram_positions(str_list, common_grams):
    """For each sentence in `str_list`, return the 1-based word position at
    which each common gram starts, or -1 if the gram is not found."""
    # Initialize a list to hold positions for each sentence
    positions = []

    for sentence in str_list:
        # Number each word in the sentence (1-based), ignoring punctuation and case
        words = re.sub(r'[^\w\s]', '', sentence).lower().split()
        word_positions = {word: [] for word in words}
        for idx, word in enumerate(words):
            word_positions[word].append(idx + 1)  # Store 1-based index positions

        # Create a list to store positions of common grams for the current sentence
        sentence_positions = []

        for gram in common_grams:
            # Clean the gram for matching
            cleaned_gram = re.sub(r'[^\w\s]', '', gram).lower()
            gram_words = cleaned_gram.split()

            # Check for the position of the common gram in the current sentence
            if all(word in word_positions for word in gram_words):
                # Use the first occurrence of the gram's first word as the start position
                start_idx = word_positions[gram_words[0]][0]
                sentence_positions.append(start_idx)
            else:
                sentence_positions.append(-1)  # Common gram not found

        # Append the positions for the current sentence to the main positions list
        positions.append(sentence_positions)

    return positions
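# Illustrative sketch (again with made-up inputs, not from the original): the
# returned values are 1-based word indices in each raw sentence.
#
#   find_common_gram_positions(
#       ["A quick brown fox appears", "Quick brown fox runs away"],
#       ["quick brown fox"],
#   )
#   # -> [[2], [1]]  ("quick" is the 2nd word of the first sentence and the
#   #    1st word of the second)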
if __name__ == "__main__":
    # Example usage
    sentence = "I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."
    str_list = [
        'The possibility of a robot being the next "great writer" is underwhelming, as I doubt they will be able to convey the same level of intelligence and complexity as humans.',
        'I doubt that the next "great writer" will be a robot or that they\'ll be capable of conveying the same level of intelligence and complexity as humans.',
        'It\'s my hunches that the next "great writer" will be a robot, or that they\'ll be much more capable of conveying the subtle and profound aspects of human thinking than humans themselves.',
        'I have little faith that the next "great writer" will be a robot or that they can accurately convey the subtle and profound aspects of human thinking.',
        'My suspicion is that the next "great writer" will be a robot or that they\'ll possess greater intelligence and complexity than if not already present in human form.',
        'The idea that a robot will be the next "great writer" is beyond doubt, as I\'m not convinced they can convey the same level of complexity and sophistication as humans.',
        'But I\'m very suspicious of the future -- we might hope that someday a robot will be the next "great writer" or at least they\'ll be able to convey the depth and complexity of what humans think than any human.',
        'There is a growing doubt in my mind that the next "great writer" will be dominated by gizmos or even capable of outlining every detail and depth of human thought.',
        'It seems unlikely that a robot will be the next great writer or that they can convey the subtle and profound aspects of human thinking in evocative terms.',
        'Whether or not the next "great writer" is an unknown, and I\'m skeptical about whether they can ever truly embody human thought.',
    ]

    # Find common subsequences
    common_grams = find_common_subsequences(sentence, str_list)

    # Extract the subsequences from the common grams for position checking
    subsequences = [subseq for _, subseq in common_grams]

    # Find positions of the common grams
    common_gram_positions = find_common_gram_positions(str_list, subsequences)

    print(common_grams)