Update README.md
Browse files
@@ -28,6 +28,16 @@ from torchvision.io import read_image
28 |
from torch.utils.data import Dataset, DataLoader
29 |
from sklearn.metrics import accuracy_score
30 |
import numpy as np
31 |
from transformers import DistilBertTokenizer, DistilBertModel</pre>
32 |
33 |
@@ -46,6 +56,146 @@ test_df = pd.read_csv(file_path)
46 |
X_test = test_df['title']
47 |
y_test = test_df['labels'] </pre>
48 |
49 |
# Load the embedding model from Huggingface. Transformer: DistilBERT
50 |
51 |
28 |
from torch.utils.data import Dataset, DataLoader
29 |
from sklearn.metrics import accuracy_score
30 |
import numpy as np
31 |
import pandas as pd
32 |
import numpy as np
33 |
import matplotlib.pyplot as plt
34 |
import seaborn as sns
35 |
import nltk
36 |
from nltk.corpus import stopwords
37 |
38 |
39 |
40 |
import re
41 |
from transformers import DistilBertTokenizer, DistilBertModel</pre>
42 |
43 |
56 |
X_test = test_df['title']
57 |
y_test = test_df['labels'] </pre>
58 |
59 |
# Clean the data
60 |
61 |
62 |
def clean_headlines(df, column_name):
    """
    Clean the text in one column of a DataFrame.

    Steps, in order:
    - Remove <script>...</script> elements, including their contents
    - Remove any remaining HTML tags
    - Replace special characters (& * | ~ ` ^ = _ + { } [ ] < > and
      backslash) with a space
    - Collapse runs of repeated '?' or '!' into a single character
    - Collapse all whitespace (spaces, tabs, newlines) into single spaces
      and strip leading/trailing whitespace

    Parameters:
    df (pd.DataFrame): The DataFrame containing the column to clean
    column_name (str): The name of the column to clean

    Returns:
    pd.DataFrame: The same DataFrame object; the column is overwritten
    in place with the cleaned text. Missing values stay missing.
    """
    text = df[column_name]

    # Scripts must be removed BEFORE generic tag stripping: once the
    # <script> tags themselves are gone, the script body can no longer be
    # located. (?is) = case-insensitive, and '.' also matches newlines.
    text = text.str.replace(r'(?is)<script.*?</script>', '', regex=True)

    # Remove remaining HTML tags
    text = text.str.replace(r'<[^<]+?>', '', regex=True)

    # Replace special characters with a space so adjacent words do not merge
    text = text.str.replace(r'[&*|~`^=_+{}\[\]<>\\]', ' ', regex=True)

    # Collapse repeated '?' / '!' (e.g. '!!!' -> '!')
    text = text.str.replace(r'([?!])\1+', r'\1', regex=True)

    # Normalize whitespace last, because the steps above insert spaces.
    # \s+ also covers tabs and newline characters.
    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()

    df[column_name] = text
    return df
110 |
111 |
112 |
def normalize_headlines(df, column_name):
    """
    Normalize the headlines in one column of a DataFrame by:
    - converting them to lowercase
    - removing English stopwords
    - lemmatizing each remaining word to its base form

    Parameters:
    df (pd.DataFrame): The DataFrame containing the column to normalize
    column_name (str): The name of the column to normalize

    Returns:
    pd.DataFrame: The same DataFrame object; the column is overwritten
    in place with the normalized text. Missing values stay missing.

    NOTE: relies on the NLTK 'stopwords' corpus and the WordNet data used
    by WordNetLemmatizer being available locally.
    """
    # Convert headlines to lowercase (missing values are propagated as NaN)
    lowered = df[column_name].str.lower()

    # Build these once; set membership keeps the stopword test cheap
    stop_words = set(stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    def _normalize(headline):
        # Leave NaN/None untouched instead of crashing on .split()
        if not isinstance(headline, str):
            return headline
        # Drop stopwords and lemmatize the remaining words in a single pass
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in headline.split()
            if word not in stop_words
        )

    df[column_name] = lowered.apply(_normalize)
    return df
139 |
140 |
141 |
def handle_missing_data(df, column_name, min_word_count=3):
    """
    Drop rows whose headline is missing or too short to be useful:
    - Rows where the column is NULL/NaN are removed
    - Rows with fewer than `min_word_count` whitespace-separated words
      are removed

    Parameters:
    df (pd.DataFrame): The DataFrame containing the column to check
    column_name (str): The name of the column to check
    min_word_count (int): Minimum number of words a headline must have
        to be kept (default 3)

    Returns:
    pd.DataFrame: A new DataFrame with the offending rows removed and the
    index reset to 0..n-1. The input DataFrame is not modified.
    """
    # Remove NULL headlines
    df = df.dropna(subset=[column_name])

    # Count words per headline; .str.len() yields NaN for non-string cells,
    # and NaN >= min_word_count is False, so those rows are dropped too
    word_counts = df[column_name].str.split().str.len()

    # Keep only headlines with enough words
    return df[word_counts >= min_word_count].reset_index(drop=True)
167 |
168 |
169 |
def consistency_checks(df, column_name):
    """
    Remove rows whose headline duplicates an earlier row's headline.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the column to check
    column_name (str): The name of the column to de-duplicate on

    Returns:
    pd.DataFrame: A new DataFrame in which only the first occurrence of
    each headline is kept. Surviving rows keep their original index
    labels. The input DataFrame is not modified.
    """
    # Remove duplicate headlines (first occurrence wins)
    return df.drop_duplicates(subset=[column_name])
191 |
192 |
193 |
# Run the cleaning pipeline on the full DataFrame rather than on the 'title'
# Series: the helpers index with df[column_name], and filtering rows on the
# frame keeps titles and labels aligned.
# Missing titles are dropped first so the text steps only see strings.
cleaned_test_df = test_df.dropna(subset=['title'])
cleaned_test_df = clean_headlines(cleaned_test_df, 'title')
cleaned_test_df = normalize_headlines(cleaned_test_df, 'title')
cleaned_test_df = handle_missing_data(cleaned_test_df, 'title')
cleaned_test_df = consistency_checks(cleaned_test_df, 'title')

# Re-derive features and labels from the same filtered rows
X_test = cleaned_test_df['title']
y_test = cleaned_test_df['labels']
198 |
199 |
# Load the embedding model from Huggingface. Transformer: DistilBERT
200 |
201 |