File size: 822 Bytes
bfe44e3
 
 
 
cfabd2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
def clean(df):
    """Normalise the headlines in ``df['title']`` and drop duplicate rows.

    Each title is stripped of HTML markup, reduced to ASCII letters, digits
    and single spaces, lower-cased, filtered against NLTK's English
    stop-word list, and lemmatized with WordNet. Missing titles
    (``None``/``NaN``) become the empty string, because BeautifulSoup cannot
    parse a non-string value and would otherwise abort the whole pass.

    Args:
        df: pandas DataFrame with a ``title`` column holding headline text.

    Returns:
        The same ``df`` object, modified in place: ``title`` holds the
        cleaned text, and rows whose cleaned title duplicates another row's
        have been removed by ``drop_duplicates``. The index is not reset.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Compiled once up front: both patterns are applied to every headline.
    non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
    whitespace_run = re.compile(r'\s+')

    titles = df['title']
    cleaned_headlines = []

    for headline, is_missing in zip(titles, titles.isna()):
        if is_missing:
            # Nothing to clean; keep the row aligned with an empty title.
            cleaned_headlines.append('')
            continue

        text = BeautifulSoup(headline, 'html.parser').get_text()
        # Note: this also removes non-ASCII letters (e.g. accented characters).
        text = non_alnum.sub('', text)
        text = whitespace_run.sub(' ', text).strip().lower()

        words = [
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in stop_words
        ]
        cleaned_headlines.append(' '.join(words))

    df['title'] = cleaned_headlines
    # Different raw headlines can collapse to the same cleaned text, so the
    # de-duplication runs on the cleaned column.
    df.drop_duplicates(subset=['title'], inplace=True)

    return df