# Spaces: Runtime error
import re | |
from nltk.corpus import stopwords | |
def find_common_subsequences(sentence, str_list):
    """Collect the word n-grams of ``sentence`` that occur in every string of ``str_list``.

    The sentence and every string are lower-cased, stripped of punctuation and
    filtered against NLTK's English stop-word list before comparison. N-grams
    of 1 to 5 words are taken from the cleaned sentence; one is kept when
    ``is_present`` finds it in every cleaned string. A shorter n-gram contained
    in a longer match is still reported as its own entry.

    Args:
        sentence: The reference sentence to take n-grams from.
        str_list: The strings that must all contain an n-gram for it to count.

    Returns:
        A list of ``(rank, phrase)`` tuples. ``rank`` starts at 1 and follows
        the position of each phrase's first occurrence in the cleaned sentence;
        for phrases starting at the same word, the longer one comes first.
    """
    english_stop_words = set(stopwords.words('english'))

    def normalize(text):
        """Drop punctuation and English stop words from ``text``."""
        without_punctuation = re.sub(r'[^\w\s]', '', text)
        kept = [
            token
            for token in without_punctuation.split()
            if token not in english_stop_words
        ]
        return " ".join(kept)

    tokens = normalize(sentence.lower()).split()
    candidates = [normalize(s.lower()) for s in str_list]

    seen_phrases = set()
    matches = []
    # Longest n-grams first, so that after the stable sort below a longer
    # phrase precedes a shorter one starting at the same word.
    for size in (5, 4, 3, 2, 1):
        last_start = len(tokens) - size
        for start in range(last_start + 1):
            phrase = " ".join(tokens[start:start + size])
            if not is_present(phrase, candidates) or phrase in seen_phrases:
                continue
            matches.append((start, phrase))
            seen_phrases.add(phrase)

    # Order by first appearance in the cleaned sentence, then number from 1.
    ordered = sorted(matches, key=lambda pair: pair[0])
    return [(rank, phrase) for rank, (_, phrase) in enumerate(ordered, start=1)]
def is_present(subseq, str_list):
    """Report whether ``subseq`` occurs, on word boundaries, in every string of ``str_list``.

    Args:
        subseq: The literal text to look for; regex metacharacters are escaped.
        str_list: The strings to search.

    Returns:
        ``True`` when every string contains a match (also for an empty
        ``str_list``), otherwise ``False``.
    """
    pattern = re.compile(r'\b' + re.escape(subseq) + r'\b')
    for candidate in str_list:
        if not pattern.search(candidate):
            return False
    return True
def find_common_gram_positions(str_list, common_grams):
    """Find the position of each common gram in every string of ``str_list``.

    Each string is stripped of punctuation, lower-cased and split on
    whitespace; each gram is normalised the same way. A gram counts as found
    when every one of its words occurs somewhere in the string. Its reported
    position is the 1-based index of the first occurrence of the gram's first
    word. Word order and contiguity are not checked.

    Args:
        str_list: The strings to search.
        common_grams: ``(index, gram)`` pairs; only the gram text is used.

    Returns:
        A list with one inner list per string. Each inner list holds one
        position per gram, in the order of ``common_grams``. The value is
        ``-1`` when a gram word is missing from the string, or when the gram
        contains no words (for example an empty or punctuation-only gram).
    """
    # Gram tokenisation does not depend on the sentence, so do it once. This
    # also lets a one-shot iterable of grams be reused for every sentence.
    gram_word_lists = [
        re.sub(r'[^\w\s]', '', gram).lower().split() for _, gram in common_grams
    ]

    positions = []
    for sentence in str_list:
        words = re.sub(r'[^\w\s]', '', sentence).lower().split()

        # Only the first occurrence of each word is ever reported.
        first_position = {}
        for idx, word in enumerate(words, start=1):  # 1-based index positions
            first_position.setdefault(word, idx)

        sentence_positions = []
        for gram_words in gram_word_lists:
            # The emptiness guard stops all() being vacuously true for a gram
            # with no words, which would otherwise fail on gram_words[0].
            if gram_words and all(word in first_position for word in gram_words):
                sentence_positions.append(first_position[gram_words[0]])
            else:
                sentence_positions.append(-1)  # Common gram not found
        positions.append(sentence_positions)
    return positions