import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def clean(df):
    """Clean and normalize headlines in df['title'], then drop duplicate rows."""
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    cleaned_headlines = []

    for headline in df['title']:
        # Strip any HTML markup from the headline text
        headline = BeautifulSoup(headline, 'html.parser').get_text()
        # Remove everything except letters, digits, and whitespace
        headline = re.sub(r'[^a-zA-Z0-9\s]', '', headline)
        # Collapse repeated whitespace and trim the ends
        headline = re.sub(r'\s+', ' ', headline).strip()
        headline = headline.lower()
        # Drop stopwords and lemmatize the remaining tokens
        words = headline.split()
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        cleaned_headlines.append(' '.join(words))

    df['title'] = cleaned_headlines
    # Remove rows whose cleaned titles are identical
    df.drop_duplicates(subset=['title'], inplace=True)
    return df
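
# Minimal usage sketch (an assumption, not part of the original code): `clean`
# expects a pandas DataFrame with a 'title' column, and NLTK's 'stopwords' and
# 'wordnet' corpora must be downloaded beforehand. The sample headlines below
# are hypothetical.
if __name__ == '__main__':
    import nltk
    import pandas as pd

    # One-time NLTK resource downloads (no-ops if already present)
    nltk.download('stopwords')
    nltk.download('wordnet')

    sample = pd.DataFrame({'title': [
        '<b>Stocks rally</b> as markets open higher!',
        'Stocks rally as markets open higher',
    ]})
    # Both rows normalize to the same cleaned string, so one is dropped
    print(clean(sample)['title'].tolist())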