yitingliii
committed on
Commit
•
26f1a8a
1
Parent(s):
16a82d8
Update tfidf.py
Browse files
tfidf.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import pandas as pd
|
|
|
2 |
df = pd.read_csv("hf://datasets/CIS5190abcd/headlines_train/train_cleaned_headlines.csv")
|
3 |
|
4 |
from sklearn.model_selection import train_test_split
|
@@ -9,3 +10,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|
9 |
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
|
10 |
X_train_tfidf = tfidf.fit_transform(X_train)
|
11 |
X_test_tfidf = tfidf.transform(X_test)
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
+
import joblib
|
3 |
df = pd.read_csv("hf://datasets/CIS5190abcd/headlines_train/train_cleaned_headlines.csv")
|
4 |
|
5 |
from sklearn.model_selection import train_test_split
|
|
|
10 |
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
|
11 |
X_train_tfidf = tfidf.fit_transform(X_train)
|
12 |
X_test_tfidf = tfidf.transform(X_test)
|
13 |
+
|
14 |
+
joblib.dump(X_train_tfidf, 'X_train_tfidf.pkl')
|
15 |
+
joblib.dump(X_test_tfidf, 'X_test_tfidf.pkl')
|
16 |
+
joblib.dump(y_train, 'y_train.pkl')
|
17 |
+
joblib.dump(y_test, 'y_test.pkl')
|