import re

import nltk
from rake_nltk import Rake

# Download NLTK data.
# NOTE(review): extract_sentences_with_obligations() below no longer uses
# Rake or the NLTK corpora; the imports and downloads are kept only so that
# importing this module behaves as before for any existing caller.
nltk.download('punkt')
nltk.download('stopwords')

# Obligation vocabulary (single words and multi-word phrases). Entries are
# matched case-insensitively and as whole words/phrases.
obligation_words = [
    # English words
    "must", "will", "use", "may", "provides", 'is obliged to', 'has to',
    'needs to', 'is required to', "shall", "should", "ought to", "required",
    "obligated", "duty", "agrees to", "has a duty to", "is expected to",
    "commits to",
    # German words
    "muss", "wird", "nutzen", "darf", "stellt bereit", "ist verpflichtet",
    "ist erforderlich", "soll", "sollte", "erforderlich", "verpflichtet",
    "Pflicht", "stimmt zu", "hat die Pflicht", "wird erwartet",
    "verpflichtet sich",
]


def _build_obligation_pattern(phrases):
    """Compile a case-insensitive, whole-word regex matching any of *phrases*.

    Args:
        phrases: Iterable of words or space-separated phrases.

    Returns:
        A compiled pattern. If *phrases* contains no non-blank entry, the
        pattern never matches.
    """
    # Escape each token and allow any run of whitespace between the tokens of
    # a multi-word phrase, so line breaks or double spaces do not hide a match.
    alternatives = {
        r"\s+".join(re.escape(token) for token in phrase.split())
        for phrase in phrases
        if phrase.strip()
    }
    if not alternatives:
        # An empty alternation would match everywhere; use a pattern that
        # can never match instead.
        return re.compile(r"(?!)")
    # Sorted only to make the compiled pattern deterministic.
    joined = "|".join(sorted(alternatives))
    # (?<!\w) / (?!\w) act as word boundaries, so "use" does not match inside
    # "house" and "may" does not match inside "mayor".
    return re.compile(r"(?<!\w)(?:" + joined + r")(?!\w)", re.IGNORECASE)


def extract_sentences_with_obligations(text: str) -> str:
    """Return the sentences of *text* that contain an obligation word.

    The text is split into sentences after '.', '!' or '?' followed by
    whitespace. A sentence is kept when any entry of ``obligation_words``
    occurs in it as a whole word or phrase, ignoring case.

    Matching is done on the sentence itself rather than on RAKE keyphrases:
    RAKE removes stopwords (NLTK's English list includes e.g. "will",
    "should", "is", "has", "to") and splits phrases at them, so such words
    and multi-word phrases like "has to" could never be found in a keyphrase.

    Args:
        text: The input text. Empty or ``None`` yields an empty string.

    Returns:
        The matching sentences, in their original order, joined by a single
        space; ``''`` when nothing matches.
    """
    if not text:
        return ''

    # Built per call so that changes made to ``obligation_words`` after
    # import are still honoured.
    pattern = _build_obligation_pattern(obligation_words)

    # Split the text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)

    return ' '.join(
        sentence for sentence in sentences if pattern.search(sentence)
    )