Spaces:

BheemaShankerNeyigapula
/

aiisc-watermarking-model

Runtime error

App Files Files Community

aiisc-watermarking-model / lcs.py

BheemaShankerNeyigapula

Upload folder using huggingface_hub

ea6afa4 verified 22 days ago

raw

history blame contribute delete

2.43 kB

	import re
	from nltk.corpus import stopwords

	def find_common_subsequences(sentence, str_list):
	    """Collect word n-grams of `sentence` that occur in every string of `str_list`.

	    Both the sentence and the strings are lower-cased, stripped of
	    punctuation and of English stop words before comparison. N-grams of
	    5 down to 1 words are taken from the cleaned sentence; each distinct
	    n-gram accepted by `is_present` for the cleaned strings is kept once.

	    Returns a list of `(rank, ngram)` tuples. `rank` is 1-based and follows
	    the n-gram's start index in the *cleaned* sentence; n-grams sharing a
	    start index stay ordered longest first.
	    """
	    english_stop_words = set(stopwords.words('english'))

	    def clean_text(text):
	        """Drop punctuation, then stop words, and rejoin with single spaces."""
	        without_punctuation = re.sub(r'[^\w\s]', '', text)
	        kept = [tok for tok in without_punctuation.split() if tok not in english_stop_words]
	        return " ".join(kept)

	    tokens = clean_text(sentence.lower()).split()
	    cleaned_candidates = [clean_text(candidate.lower()) for candidate in str_list]

	    found = []
	    seen_phrases = set()

	    # Larger windows are scanned first, so for equal start indices the
	    # stable sort below keeps the longer phrase ahead of the shorter one.
	    for size in range(5, 0, -1):
	        window_count = len(tokens) - size + 1
	        for start in range(window_count):
	            phrase = " ".join(tokens[start:start + size])
	            if not is_present(phrase, cleaned_candidates):
	                continue
	            if phrase in seen_phrases:
	                continue
	            found.append((start, phrase))
	            seen_phrases.add(phrase)

	    # Order by start index, then replace that index with a 1-based rank.
	    ordered = sorted(found, key=lambda pair: pair[0])
	    return [(rank, phrase) for rank, (_, phrase) in enumerate(ordered, start=1)]

	def is_present(subseq, str_list):
	    """Tell whether `subseq` occurs in every string of `str_list`.

	    `subseq` is matched literally (regex metacharacters are escaped) and
	    must be delimited by word boundaries on both sides. An empty
	    `str_list` yields True.
	    """
	    pattern = re.compile(r'\b' + re.escape(subseq) + r'\b')
	    for candidate in str_list:
	        if not pattern.search(candidate):
	            # One miss is enough: the phrase is not common to all strings.
	            return False
	    return True

	def find_common_gram_positions(str_list, common_grams):
	    """Find the 1-based word position of each common gram in each string.

	    Args:
	        str_list: Sentences to search. Each is stripped of punctuation,
	            lower-cased and split on whitespace; stop words are kept.
	        common_grams: Iterable of `(index, gram)` pairs; only `gram` is used.

	    Returns:
	        One list per sentence, holding one integer per gram:
	        * the 1-based index of the first word of the earliest contiguous
	          occurrence of the gram, when there is one;
	        * otherwise, if every word of the gram appears somewhere in the
	          sentence, the first occurrence of the gram's first word (this
	          covers grams whose words are separated in the raw sentence);
	        * otherwise -1. A gram with no words after cleaning is also -1.
	    """
	    # Gram normalisation does not depend on the sentence, so do it once.
	    normalised_grams = [
	        re.sub(r'[^\w\s]', '', gram).lower().split()
	        for _, gram in common_grams
	    ]

	    positions = []
	    for sentence in str_list:
	        words = re.sub(r'[^\w\s]', '', sentence).lower().split()

	        # Only the first occurrence of each word is ever needed.
	        first_seen = {}
	        for idx, word in enumerate(words, start=1):
	            first_seen.setdefault(word, idx)

	        sentence_positions = []
	        for gram_words in normalised_grams:
	            if not gram_words:
	                # Nothing to locate; avoids indexing an empty word list.
	                sentence_positions.append(-1)
	                continue

	            start = _first_contiguous_start(words, gram_words)
	            if start is None and all(w in first_seen for w in gram_words):
	                # No contiguous match: fall back to the loose heuristic.
	                start = first_seen[gram_words[0]]
	            sentence_positions.append(-1 if start is None else start)

	        positions.append(sentence_positions)

	    return positions


	def _first_contiguous_start(words, gram_words):
	    """Return the 1-based index where `gram_words` first occurs contiguously in `words`, or None."""
	    span = len(gram_words)
	    for start in range(len(words) - span + 1):
	        if words[start:start + span] == gram_words:
	            return start + 1
	    return None