# svm/data_cleaning.py
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re

# Requires the NLTK 'stopwords' and 'wordnet' corpora, e.g.:
#   nltk.download('stopwords'); nltk.download('wordnet')


def clean(df):
    """Normalize the 'title' column of df and drop duplicate headlines."""
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_headlines = []
    for headline in df['title']:
        # Strip any HTML markup, keeping only the visible text.
        headline = BeautifulSoup(headline, 'html.parser').get_text()
        # Remove everything except letters, digits, and whitespace.
        headline = re.sub(r'[^a-zA-Z0-9\s]', '', headline)
        # Collapse runs of whitespace and trim the ends.
        headline = re.sub(r'\s+', ' ', headline).strip()
        headline = headline.lower()
        # Drop stopwords and lemmatize the remaining tokens.
        words = headline.split()
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        cleaned_headlines.append(' '.join(words))

    df['title'] = cleaned_headlines
    # Deduplicate in place now that titles are normalized.
    df.drop_duplicates(subset=['title'], inplace=True)
    return df
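

# A minimal usage sketch (not part of the original script): assumes a pandas
# DataFrame with a 'title' column and that the NLTK corpora noted above are
# installed. The sample rows here are hypothetical.
if __name__ == '__main__':
    import pandas as pd

    sample = pd.DataFrame({'title': [
        '<b>Stocks rally</b> as markets open!',
        'Stocks rally as markets open',
        'Fed raises rates &amp; markets react',
    ]})
    print(clean(sample)['title'].tolist())
    # The first two rows normalize to the same string, so one is dropped,
    # leaving something like ['stock rally market open', 'fed raise rate market react'].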