Diego-0121
committed on
Commit
•
5178166
1
Parent(s):
ed8b14c
Update tokenizer.py
Browse files- tokenizer.py +34 -0
tokenizer.py
CHANGED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from data_processing import load_data, spotify_data, path
|
2 |
+
import pandas
|
3 |
+
import nltk
|
4 |
+
from nltk.corpus import stopwords
|
5 |
+
from nltk.tokenize import word_tokenize
|
6 |
+
import string
|
7 |
+
|
8 |
+
#---------------------------Download the required NLTK resources--------------------------------
|
9 |
+
|
10 |
+
#nltk.download('punkt')
|
11 |
+
#nltk.download('stopwords')
|
12 |
+
|
13 |
+
def clean_lyrics(lyrics):
    """Tokenize a song lyric and return its cleaned, lower-cased tokens.

    The text is split with NLTK's ``word_tokenize``, lower-cased, stripped of
    every character in ``string.punctuation`` and filtered against NLTK's
    English stop-word list.

    Args:
        lyrics: Raw lyric text. Any non-string value (for example ``None`` or
            a float NaN standing in for a missing cell) is treated as an
            empty lyric.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    # A non-string lyric would make word_tokenize raise and abort an entire
    # column-wide ``apply`` run, so treat it as "no tokens" instead.
    if not isinstance(lyrics, str):
        return []

    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    cleaned_tokens = []
    for token in word_tokenize(lyrics):
        # Lower-case first, then delete punctuation characters.
        word = token.lower().translate(punctuation_table)
        # Tokens made only of punctuation (",", "!", "...") collapse to an
        # empty string once stripped; drop them along with the stop words.
        if word and word not in stop_words:
            cleaned_tokens.append(word)

    return cleaned_tokens
|
29 |
+
|
30 |
+
# Clean every lyric in the 'text' column, store the token lists in a new
# 'cleaned_text' column, and write the resulting table to disk.
spotify_data['cleaned_text'] = (
    spotify_data['text'].apply(clean_lyrics)
)
spotify_data.to_csv(
    'spotify_data_processed.csv',
    index=False,
)
|
33 |
+
|
34 |
+
#print(spotify_data['cleaned_text'].head())
|