yitingliii committed on
Commit
1d6d48d
1 Parent(s): f642add

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +55 -1
README.md CHANGED
@@ -1,2 +1,56 @@
1
  # SVM Model with TF-IDF
2
- This model uses TF-IDF for feature extraction.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # SVM Model with TF-IDF
2
+ Step-by-step instructions:
3
+ 1. Install required packages:
4
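+ <br>Before running the code, install the required packages (`nltk`, `beautifulsoup4`, `pandas`, `scikit-learn`), then import them and download the NLTK stop-word and WordNet data: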
5
+
6
+ ```python
7
+ import nltk
8
+ nltk.download('stopwords')
9
+ nltk.download('wordnet')
10
+
11
+ from nltk.corpus import stopwords
12
+ from nltk.stem import WordNetLemmatizer
13
+ from bs4 import BeautifulSoup
14
+ import re
15
+ import pandas as pd
16
+ from sklearn.svm import SVC
17
+ ```
18
+
19
+ 2. Data Cleaning
20
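+ <br> Next, clean the headlines in the `title` column (strip HTML, remove punctuation, lowercase, drop stop words, lemmatize, and remove duplicates) so the input text has a consistent format.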
21
+
22
+
23
+ ```python
24
+ def clean(df):
25
+ stop_words = set(stopwords.words('english'))
26
+ lemmatizer = WordNetLemmatizer()
27
+ cleaned_headlines = []
28
+
29
+ for headline in df['title']:
30
+ headline = BeautifulSoup(headline, 'html.parser').get_text()
31
+ headline = re.sub(r'[^a-zA-Z0-9\s]', '', headline)
32
+ headline = re.sub(r'\s+', ' ', headline).strip()
33
+ headline = headline.lower()
34
+
35
+ words = headline.split()
36
+ words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
37
+
38
+ cleaned_headline = ' '.join(words)
39
+ cleaned_headlines.append(cleaned_headline)
40
+
41
+ df['title'] = cleaned_headlines
42
+ df.drop_duplicates(subset=['title'], inplace=True)
43
+
44
+ return df
45
+ ```
46
+
47
+ 3. Run the SVM model
48
+ ```python
49
+ svm_model = SVC(kernel='linear', random_state=42)
50
+ svm_model.fit(X_train_tfidf, y_train)
51
+ y_pred = svm_model.predict(X_test_tfidf)
52
+ accuracy = accuracy_score(y_test, y_pred)
53
+ print(f"SVM Accuracy: {accuracy:.4f}")
54
+ print(classification_report(y_test, y_pred))
55
+ ```
56
+