jgyasu's picture
Upload folder using huggingface_hub
0840f0a verified
raw
history blame
5.13 kB
import re
from nltk.corpus import stopwords
def find_common_subsequences(sentence, str_list):
    """Find word n-grams of ``sentence`` that occur in every string of ``str_list``.

    All text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, and filtered of NLTK English stop words
    before matching. N-grams of 5 down to 1 words are taken from the cleaned
    sentence, longest first; an n-gram is kept when it appears (on word
    boundaries) in every cleaned string and is not already contained, as a
    whole-word sequence, in a phrase that was kept earlier.

    Args:
        sentence: The reference sentence the n-grams are taken from.
        str_list: Strings that must all contain an n-gram for it to be kept.

    Returns:
        A list of ``(rank, phrase)`` tuples. Phrases are ordered by their
        start position in the cleaned sentence and ``rank`` is the 1-based
        position in that ordering.
    """
    stop_words = set(stopwords.words('english'))

    def remove_stop_words_and_special_chars(text):
        # Drop punctuation first so tokens like "writer," match "writer".
        text = re.sub(r'[^\w\s]', '', text)
        return " ".join(
            word for word in text.split() if word.lower() not in stop_words
        )

    def is_present(subseq, candidates):
        # \b anchors keep "art" from matching inside "party".
        subseq_regex = re.compile(r'\b' + re.escape(subseq) + r'\b')
        return all(subseq_regex.search(candidate) for candidate in candidates)

    def is_covered(subseq, phrases):
        # Whole-word containment: padding both sides with a space means a
        # phrase only covers ``subseq`` when it contains it as complete words,
        # not as an accidental character substring.
        padded = " " + subseq + " "
        return any(padded in " " + phrase + " " for phrase in phrases)

    cleaned_sentence = remove_stop_words_and_special_chars(sentence.lower())
    cleaned_str_list = [
        remove_stop_words_and_special_chars(s.lower()) for s in str_list
    ]

    words = cleaned_sentence.split()
    common_grams = []
    added_phrases = set()

    # Longest n-grams first so shorter ones nested inside them are skipped.
    for n in range(5, 0, -1):
        for i in range(len(words) - n + 1):
            subseq = " ".join(words[i:i + n])
            if is_covered(subseq, added_phrases):
                continue
            if is_present(subseq, cleaned_str_list):
                common_grams.append((i, subseq))
                added_phrases.add(subseq)

    # Order by start index in the cleaned sentence, then number from 1.
    common_grams.sort(key=lambda item: item[0])
    return [
        (rank, subseq)
        for rank, (_, subseq) in enumerate(common_grams, start=1)
    ]
def find_common_gram_positions(str_list, common_grams):
    """Locate each common gram in each sentence.

    Sentences and grams are stripped of every character that is neither a
    word character nor whitespace, lower-cased, and split on whitespace.
    A gram counts as found in a sentence when every one of its words occurs
    somewhere in that sentence; the words are not required to be adjacent
    or in order. The reported position is the 1-based index of the first
    occurrence of the gram's first word.

    Args:
        str_list: Sentences to search.
        common_grams: Gram strings to look for.

    Returns:
        A list with one entry per sentence; each entry is a list with one
        integer per gram: the 1-based word position, or -1 when the gram is
        not found or contains no words after cleaning.
    """
    # Cleaning a gram does not depend on the sentence, so do it once up front.
    gram_word_lists = [
        re.sub(r'[^\w\s]', '', gram).lower().split() for gram in common_grams
    ]

    positions = []
    for sentence in str_list:
        words = re.sub(r'[^\w\s]', '', sentence).lower().split()

        # Only the first occurrence of a word is ever reported, so keep just
        # that 1-based index per word.
        first_position = {}
        for idx, word in enumerate(words, start=1):
            first_position.setdefault(word, idx)

        sentence_positions = []
        for gram_words in gram_word_lists:
            # An empty gram has no first word to locate; report it as missing
            # instead of indexing into an empty list.
            if gram_words and all(word in first_position for word in gram_words):
                sentence_positions.append(first_position[gram_words[0]])
            else:
                sentence_positions.append(-1)

        positions.append(sentence_positions)

    return positions
# Example usage
# sentence = "I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."
# str_list = ['The possibility of a robot being the next "great writer" is underwhelming, as I doubt they will be able to convey the same level of intelligence and complexity as humans.', 'I doubt that the next "great writer" will be a robot or that they\'ll be capable of conveying the same level of intelligence and complexity as humans.', 'It\'s my hunches that the next "great writer" will be a robot, or that they\'ll be much more capable of conveying the subtle and profound aspects of human thinking than humans themselves.', 'I have little faith that the next "great writer" will be a robot or that they can accurately convey the subtle and profound aspects of human thinking.', 'My suspicion is that the next "great writer" will be a robot or that they\'ll possess greater intelligence and complexity than if not already present in human form.', 'The idea that a robot will be the next "great writer" is beyond doubt, as I\'m not convinced they can convey the same level of complexity and sophistication as humans.', 'But I\'m very suspicious of the future -- we might hope that someday a robot will be the next "great writer" or at least they\'ll be able to convey the depth and complexity of what humans think than any human.', 'There is a growing doubt in my mind that the next "great writer" will be dominated by gizmos or even capable of outlining every detail and depth of human thought.', 'It seems unlikely that a robot will be the next great writer or that they can convey the subtle and profound aspects of human thinking in evocative terms.', 'Whether or not the next "great writer" is an unknown, and I\'m skeptical about whether they can ever truly embody human thought.']
# # Find common subsequences
# common_grams = find_common_subsequences(sentence, str_list)
# # Extract the subsequences from the common grams for position checking
# subsequences = [subseq for _, subseq in common_grams]
# # Find positions of the common grams
# common_gram_positions = find_common_gram_positions(str_list, subsequences)
# print(common_grams)