import re
import warnings

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


class TextCleaner:
    """Cleans raw text: strips HTML, URLs, mentions, and hashtags, keeps
    only alphabetic tokens, removes stopwords, lemmatizes, and drops
    duplicate words."""

    def __init__(self):
        warnings.filterwarnings("ignore")
        # Fetch the NLTK resources the cleaner depends on (no-op if cached).
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def cleaning_text(self, text):
        if text and isinstance(text, str):
            # Strip HTML tags, keeping only the visible text.
            text = BeautifulSoup(text, "html.parser").get_text()
            # Lowercase, then replace URLs, @mentions, #hashtags, and any
            # non-alphabetic character with a space.
            text = re.sub(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]',
                          ' ', text.lower())
            # Drop one-letter tokens and stopwords; lemmatize the rest
            # (WordNetLemmatizer defaults to noun part of speech).
            text = ' '.join(
                self.lemmatizer.lemmatize(word)
                for word in text.split()
                if len(word) > 1 and word not in self.stop_words
            )
            # Remove duplicate words while preserving first-seen order.
            text = ' '.join(dict.fromkeys(text.split()))
        else:
            # Empty or non-string input cleans to the empty string.
            text = ''
        return text


if __name__ == "__main__":
    # Example usage:
    cleaner = TextCleaner()
    print(cleaner.cleaning_text("I feel bullied online."))
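    # A few more illustrative calls. Expected outputs are a sketch assuming
    # the default NLTK English stopword list and WordNet data; the inputs
    # are hypothetical and not part of the original script.
    print(cleaner.cleaning_text("<p>Check https://example.com now!!!</p>"))  # -> "check"
    print(cleaner.cleaning_text(None))  # -> "" (non-string input)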