File size: 822 Bytes
bfe44e3 cfabd2f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
def clean(df):
    """Normalize the headlines in ``df['title']`` and drop duplicate rows.

    Each title goes through these steps, in order:

    1. HTML markup is removed with BeautifulSoup's ``html.parser``.
    2. Every character other than ASCII letters, digits and whitespace is
       deleted (not replaced by a space, so ``"well-known"`` becomes
       ``"wellknown"``).
    3. Whitespace runs are collapsed to one space, the ends are trimmed and
       the text is lower-cased.
    4. Words found in NLTK's English stop-word list are dropped and the
       remaining words are lemmatized with ``WordNetLemmatizer`` (default
       settings).

    Missing titles (``None``/``NaN``) are mapped to an empty string rather
    than being handed to BeautifulSoup, which cannot parse them.

    Args:
        df: pandas DataFrame with a ``'title'`` column. It is modified in
            place: the column is overwritten and duplicate rows are removed.

    Returns:
        The same ``df`` object, with cleaned titles and with rows whose
        cleaned title duplicates an earlier row removed.

    Raises:
        KeyError: If ``df`` has no ``'title'`` column.

    Note:
        The NLTK ``stopwords`` and ``wordnet`` data must be available.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Compile once; both patterns are reused for every headline.
    disallowed_chars = re.compile(r'[^a-zA-Z0-9\s]')
    whitespace_run = re.compile(r'\s+')

    titles = df['title']
    cleaned_headlines = []
    for raw_title, is_missing in zip(titles, titles.isna()):
        if is_missing:
            # NaN/None has no text to parse; keep the row with an empty title
            # so the list stays aligned with the DataFrame's rows.
            cleaned_headlines.append('')
            continue

        text = BeautifulSoup(raw_title, 'html.parser').get_text()
        text = disallowed_chars.sub('', text)
        text = whitespace_run.sub(' ', text).strip().lower()
        # Stop words are matched after punctuation removal and lower-casing.
        kept_words = [
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in stop_words
        ]
        cleaned_headlines.append(' '.join(kept_words))

    df['title'] = cleaned_headlines
    # Different raw headlines can collapse to the same cleaned text; keep only
    # the first row for each cleaned title (pandas' default).
    df.drop_duplicates(subset=['title'], inplace=True)
    return df
|