Spaces:

jgyasu
/

aiisc-watermarking-model

Running

App Files Files Community

aiisc-watermarking-model / lcs.py

jgyasu

Upload folder using huggingface_hub

0840f0a verified about 1 month ago

raw

history blame

5.13 kB

	import re
	from nltk.corpus import stopwords

	def find_common_subsequences(sentence, str_list, max_n=5):
	    """Find word n-grams of ``sentence`` that occur in every string of ``str_list``.

	    All text is lowercased, stripped of punctuation and of English stop
	    words (NLTK ``stopwords`` corpus) before matching, so the returned
	    phrases are in that normalised form. Candidates are examined from the
	    longest n-gram down to single words; a candidate is dropped when it
	    already appears, as a whole-word sequence, inside a phrase that has
	    been kept.

	    Args:
	        sentence: Reference sentence the n-grams are taken from.
	        str_list: Strings that must all contain a phrase for it to count
	            as common. With an empty list every n-gram qualifies.
	        max_n: Largest n-gram size to try. Defaults to 5.

	    Returns:
	        A list of ``(rank, phrase)`` tuples ordered by where the phrase
	        starts in the cleaned sentence; ``rank`` is 1-based.
	    """
	    stop_words = set(stopwords.words('english'))
	    punctuation = re.compile(r'[^\w\s]')

	    def clean(text):
	        # Lowercase, remove punctuation, then drop stop words.
	        tokens = punctuation.sub('', text.lower()).split()
	        return " ".join(token for token in tokens if token not in stop_words)

	    def is_present(subseq, candidates):
	        # Whole-word match of the phrase in every candidate string.
	        pattern = re.compile(r'\b' + re.escape(subseq) + r'\b')
	        return all(pattern.search(candidate) for candidate in candidates)

	    words = clean(sentence).split()
	    cleaned_str_list = [clean(s) for s in str_list]

	    common_grams = []
	    # Kept phrases, stored space-padded so containment can be tested on
	    # word boundaries: a plain substring test would wrongly treat "eat"
	    # as part of "great writer" and discard it.
	    added_phrases = set()

	    for n in range(max_n, 0, -1):  # longest n-grams first
	        for i in range(len(words) - n + 1):
	            subseq = " ".join(words[i:i + n])
	            padded = " " + subseq + " "
	            if any(padded in phrase for phrase in added_phrases):
	                continue  # already covered by a kept (longer or equal) phrase
	            if is_present(subseq, cleaned_str_list):
	                common_grams.append((i, subseq))
	                added_phrases.add(padded)

	    # Order by first appearance in the cleaned sentence (stable sort).
	    common_grams.sort(key=lambda item: item[0])

	    # Replace the start offset with a 1-based rank.
	    return [(rank, subseq) for rank, (_, subseq) in enumerate(common_grams, start=1)]

	def find_common_gram_positions(str_list, common_grams):
	    """Report where each common gram starts in every sentence of ``str_list``.

	    Sentences and grams are lowercased and stripped of punctuation before
	    comparison. A gram counts as found when every one of its words occurs
	    somewhere in the sentence (the words are not required to be adjacent
	    or in order); the reported position is the 1-based index of the first
	    occurrence of the gram's first word.

	    Args:
	        str_list: Sentences to search.
	        common_grams: Phrase strings to look for.

	    Returns:
	        One list per sentence, holding one integer per gram: the 1-based
	        word position, or -1 when the gram is not found. A gram with no
	        words (empty or punctuation only) is reported as -1.
	    """
	    punctuation = re.compile(r'[^\w\s]')

	    # Gram tokenisation does not depend on the sentence, so do it once.
	    tokenized_grams = [
	        punctuation.sub('', gram).lower().split() for gram in common_grams
	    ]

	    positions = []
	    for sentence in str_list:
	        # Map each word to the 1-based index of its first occurrence.
	        first_seen = {}
	        sentence_words = punctuation.sub('', sentence).lower().split()
	        for idx, word in enumerate(sentence_words, start=1):
	            first_seen.setdefault(word, idx)

	        sentence_positions = []
	        for gram_words in tokenized_grams:
	            # Guard against word-less grams: all() over nothing is True,
	            # which would otherwise lead to indexing an empty list.
	            if gram_words and all(word in first_seen for word in gram_words):
	                sentence_positions.append(first_seen[gram_words[0]])
	            else:
	                sentence_positions.append(-1)  # gram not found

	        positions.append(sentence_positions)

	    return positions


	# Example usage
	# sentence = "I'm very skeptical that the next \"great writer\" will be a robot or that they'll be much more effective at expressing the subtleties and depths of human thought than a human is."
	# str_list = ['The possibility of a robot being the next "great writer" is underwhelming, as I doubt they will be able to convey the same level of intelligence and complexity as humans.', 'I doubt that the next "great writer" will be a robot or that they\'ll be capable of conveying the same level of intelligence and complexity as humans.', 'It\'s my hunches that the next "great writer" will be a robot, or that they\'ll be much more capable of conveying the subtle and profound aspects of human thinking than humans themselves.', 'I have little faith that the next "great writer" will be a robot or that they can accurately convey the subtle and profound aspects of human thinking.', 'My suspicion is that the next "great writer" will be a robot or that they\'ll possess greater intelligence and complexity than if not already present in human form.', 'The idea that a robot will be the next "great writer" is beyond doubt, as I\'m not convinced they can convey the same level of complexity and sophistication as humans.', 'But I\'m very suspicious of the future -- we might hope that someday a robot will be the next "great writer" or at least they\'ll be able to convey the depth and complexity of what humans think than any human.', 'There is a growing doubt in my mind that the next "great writer" will be dominated by gizmos or even capable of outlining every detail and depth of human thought.', 'It seems unlikely that a robot will be the next great writer or that they can convey the subtle and profound aspects of human thinking in evocative terms.', 'Whether or not the next "great writer" is an unknown, and I\'m skeptical about whether they can ever truly embody human thought.']

	# # Find common subsequences
	# common_grams = find_common_subsequences(sentence, str_list)
	# # Extract the subsequences from the common grams for position checking
	# subsequences = [subseq for _, subseq in common_grams]

	# # Find positions of the common grams
	# common_gram_positions = find_common_gram_positions(str_list, subsequences)


	# print(common_grams)