|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
from bs4 import BeautifulSoup |
|
import re |
|
# Compiled once at import time instead of on every headline.
# Any character that is not an ASCII letter, digit or whitespace is removed
# outright (it is NOT replaced by a space, so "well-known" -> "wellknown").
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
# Runs of whitespace are collapsed to a single space.
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_headline(headline, stop_words, lemmatizer):
    """Return the normalised form of a single headline.

    Steps, in order: strip HTML markup, drop non-alphanumeric characters,
    collapse whitespace, lower-case, remove stop words, lemmatize the
    remaining words and join them with single spaces.

    Args:
        headline: Raw cell value from the title column.
        stop_words: Collection of lower-case words to discard.
        lemmatizer: Object exposing ``lemmatize(word)``.

    Returns:
        The cleaned headline as a string. Values that are not text
        (e.g. ``None`` or NaN from a missing cell) yield ``''``.
    """
    # BeautifulSoup raises on non-text markup such as None or float NaN;
    # treat those cells as an empty headline instead of aborting the run.
    if not isinstance(headline, (str, bytes)):
        return ''

    text = BeautifulSoup(headline, 'html.parser').get_text()
    text = _NON_ALNUM_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()
    text = text.lower()

    # Stop words are filtered on the surface form, before lemmatization.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )


def clean(df):
    """Normalise the ``'title'`` column of ``df`` and drop duplicate titles.

    Every title is cleaned with ``_clean_headline`` and written back to
    ``df['title']``; rows whose cleaned title duplicates another row's are
    then removed.

    Args:
        df: Table with a ``'title'`` column (presumably a pandas DataFrame;
            it must support column assignment and
            ``drop_duplicates(subset=..., inplace=True)``).

    Returns:
        The same ``df`` object. It is modified in place: the ``'title'``
        column is overwritten and duplicate rows are dropped.
    """
    # A set gives O(1) membership tests in the per-word filter.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    df['title'] = [
        _clean_headline(headline, stop_words, lemmatizer)
        for headline in df['title']
    ]
    df.drop_duplicates(subset=['title'], inplace=True)

    return df
|
|