sunga25 committed on
Commit
9465da4
1 Parent(s): 9a0bb2e

Create src/main.py

Files changed (1)
  1. src/main.py +311 -0
src/main.py ADDED
@@ -0,0 +1,311 @@
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.amp import autocast, GradScaler  # torch.amp (PyTorch >= 2.3) supports autocast(device_type=...)
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import optuna

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
RANDOM_SEED = 42
TEST_SIZE = 0.2
VALIDATION_SIZE = 200

# Seed NumPy and PyTorch so model initialisation and shuffling are reproducible
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
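
# Load per-year ATP match CSVs (atp_matches_<year>.csv in the working directory),
# skipping files that are missing, empty, or lack the required columns.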
def load_data(start_year=2000, end_year=2017):
    dfs = []
    for year in range(start_year, end_year + 1):
        file_path = f'atp_matches_{year}.csv'
        try:
            df = pd.read_csv(file_path, low_memory=False)
            required_columns = ['tourney_id', 'surface', 'winner_id', 'loser_id', 'winner_name', 'loser_name',
                                'winner_age', 'loser_age', 'winner_rank', 'loser_rank', 'tourney_date']
            if not all(col in df.columns for col in required_columns):
                logging.warning(f"File {file_path} is missing some required columns. Skipping this file.")
                continue
            dfs.append(df)
            logging.info(f"Data loaded successfully from {file_path}")
        except FileNotFoundError:
            logging.warning(f"File not found: {file_path}")
        except pd.errors.EmptyDataError:
            logging.warning(f"Empty file: {file_path}")
        except Exception as e:
            logging.error(f"Error loading data from {file_path}: {str(e)}")

    if not dfs:
        raise ValueError("No data files were successfully loaded.")

    combined_df = pd.concat(dfs, ignore_index=True)
    if combined_df.empty:
        raise ValueError("The combined DataFrame is empty after processing all files.")
    return combined_df
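
# Label-encode the categorical columns and parse tourney_date; rows whose date
# cannot be parsed are dropped. The fitted encoders are returned for later lookups.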
def preprocess_data(df):
    label_encoders = {}
    for col in ['tourney_id', 'surface', 'winner_id', 'loser_id']:
        df[col] = df[col].astype(str)
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d', errors='coerce')
    df = df.dropna(subset=['tourney_date'])

    return df, label_encoders
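
# Derive the age_difference feature and the rank_difference target, and drop rows
# with missing numeric values.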
def engineer_features(df):
    numeric_cols = ['winner_age', 'loser_age', 'winner_rank', 'loser_rank']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df['age_difference'] = df['winner_age'] - df['loser_age']
    # Positive when the winner is ranked better (a lower ATP rank number) than the loser
    df['rank_difference'] = df['loser_rank'] - df['winner_rank']

    numeric_columns = numeric_cols + ['age_difference', 'rank_difference']
    df = df.dropna(subset=numeric_columns)

    return df, numeric_columns
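
# Regression network: one embedding table per categorical column, concatenated with
# the numerical features and passed through a three-layer MLP with dropout.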
class JointEmbeddedModel(nn.Module):
    def __init__(self, categorical_dims, numerical_dim, embedding_dim, hidden_dim, dropout_rate=0.3):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(dim, embedding_dim) for dim in categorical_dims])
        self.fc1 = nn.Linear(len(categorical_dims) * embedding_dim + numerical_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x_cat, x_num):
        # Embed each categorical column, then concatenate with the numerical features
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embedded + [x_num], dim=1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        # squeeze(-1) rather than squeeze(): a bare squeeze would also drop a batch dimension of size 1
        return self.fc3(x).squeeze(-1)
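
# Wrap (categorical, numerical, target) arrays in a shuffled DataLoader.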
def create_dataloader(X, y, batch_size=64):
    x_cat, x_num = X
    # Ensure tensors are not empty
    if len(x_cat) == 0 or len(x_num) == 0:
        raise ValueError("Input data for dataloader is empty.")
    dataset = TensorDataset(torch.tensor(x_cat, dtype=torch.long),
                            torch.tensor(x_num, dtype=torch.float32),
                            torch.tensor(y, dtype=torch.float32))
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
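
# Train with Adam + MSE, using mixed precision when CUDA is available, LR reduction
# on validation plateau, and early stopping; the best weights (by validation MSE)
# are checkpointed to best_model.pt.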
def train_model(model, dataloader, val_data, epochs=20, learning_rate=0.001, weight_decay=0, patience=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # Halve the LR when the validation loss plateaus; use a shorter patience than
    # early stopping so the LR can actually drop before training halts
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=max(1, patience // 2))
    scaler = GradScaler() if device.type == 'cuda' else None

    best_val_loss = float('inf')
    early_stopping_counter = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_cat, x_num, y in dataloader:
            x_cat, x_num, y = x_cat.to(device), x_num.to(device), y.to(device)
            optimizer.zero_grad()
            if scaler:
                # Mixed-precision path: forward pass under autocast, scaled backward pass
                with autocast(device_type='cuda'):
                    outputs = model(x_cat, x_num)
                    loss = criterion(outputs, y)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(x_cat, x_num)
                loss = criterion(outputs, y)
                loss.backward()
                optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        val_predictions = evaluate_model(model, val_data[0])
        val_loss = np.mean((val_predictions - val_data[1]) ** 2)
        scheduler.step(val_loss)
        logging.info(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_loss:.4f}, Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                logging.info(f"Early stopping triggered after {epoch+1} epochs")
                break
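
# Run inference on (x_cat, x_num) arrays and return predictions as a NumPy array.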
def evaluate_model(model, X):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    x_cat, x_num = X

    # Promote a single sample to a batch of one
    if len(x_cat.shape) == 1:
        x_cat = x_cat.reshape(1, -1)
    if len(x_num.shape) == 1:
        x_num = x_num.reshape(1, -1)

    x_cat = torch.tensor(x_cat, dtype=torch.long).to(device)
    x_num = torch.tensor(x_num, dtype=torch.float32).to(device)

    with torch.no_grad():
        outputs = model(x_cat, x_num)
    return outputs.cpu().numpy()
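
# Optuna objective: samples hyperparameters and returns validation MSE. It reads
# X_train, y_train, X_val, y_val, categorical_dims and numerical_dim from module
# scope (they are set in the __main__ block below).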
def objective(trial):
    embedding_dim = trial.suggest_int('embedding_dim', 8, 64)
    hidden_dim = trial.suggest_int('hidden_dim', 32, 256)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    weight_decay = trial.suggest_float('weight_decay', 1e-8, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    model = JointEmbeddedModel(categorical_dims, numerical_dim, embedding_dim, hidden_dim, dropout_rate)
    dataloader = create_dataloader(X_train, y_train, batch_size=batch_size)
    train_model(model, dataloader, (X_val, y_val), epochs=10, learning_rate=learning_rate, weight_decay=weight_decay)

    val_predictions = evaluate_model(model, X_val)
    val_loss = np.mean((val_predictions - y_val) ** 2)
    return val_loss
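
# Anomaly detection on the learned embeddings: DBSCAN clusters the standardised,
# concatenated embeddings, and matches whose actual rank_difference deviates from
# the model's prediction by more than `threshold` (default: two standard deviations
# of the residual) are flagged as positive/negative anomalies. Per-player, per-year
# and per-tournament counts are written to CSV and a t-SNE plot of the clusters is
# saved. Relies on the module-level categorical_columns and numeric_columns lists.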
def enhanced_anomaly_detection(model, X, df_subset, eps=0.5, min_samples=5, threshold=None):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    df_subset = df_subset.copy()  # avoid mutating a view of the original DataFrame
    x_cat, x_num = X
    if len(x_cat.shape) == 1:
        x_cat = x_cat.reshape(-1, len(categorical_columns))
    if len(x_num.shape) == 1:
        x_num = x_num.reshape(-1, len(numeric_columns))

    x_cat = torch.tensor(x_cat, dtype=torch.long).to(device)
    x_num = torch.tensor(x_num, dtype=torch.float32).to(device)
    with torch.no_grad():
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(model.embeddings)]
        embeddings = torch.cat(embedded, dim=1).cpu().numpy()
        outputs = model(x_cat, x_num).cpu().numpy()

    scaler = StandardScaler()
    embeddings = scaler.fit_transform(embeddings)

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(embeddings)

    df_subset['anomaly'] = labels
    df_subset['expected_rank_difference'] = outputs

    if threshold is None:
        threshold = np.std(df_subset['rank_difference'] - df_subset['expected_rank_difference']) * 2

    df_subset['positive_anomaly'] = (df_subset['rank_difference'] - df_subset['expected_rank_difference']) > threshold
    df_subset['negative_anomaly'] = (df_subset['expected_rank_difference'] - df_subset['rank_difference']) > threshold

    anomalies = df_subset[(df_subset['positive_anomaly']) | (df_subset['negative_anomaly'])]

    positive_anomalies = anomalies[anomalies['positive_anomaly']]
    negative_anomalies = anomalies[anomalies['negative_anomaly']]

    logging.info(f"Positive Anomalies: {len(positive_anomalies)}")
    logging.info(f"Negative Anomalies: {len(negative_anomalies)}")

    # Count positive and negative anomalies per player, year, and tournament
    player_positive_anomalies = pd.concat([
        positive_anomalies['winner_name'],
        positive_anomalies['loser_name']
    ]).value_counts()

    player_negative_anomalies = pd.concat([
        negative_anomalies['winner_name'],
        negative_anomalies['loser_name']
    ]).value_counts()

    year_anomalies = anomalies['tourney_date'].dt.year.value_counts()
    tournament_anomalies = anomalies['tourney_id'].value_counts()

    # Save anomaly counts to CSV
    player_positive_anomalies.to_csv('players_with_most_positive_anomalies.csv', header=['positive_anomalies'])
    player_negative_anomalies.to_csv('players_with_most_negative_anomalies.csv', header=['negative_anomalies'])
    year_anomalies.to_csv('years_with_most_anomalies.csv', header=['anomalies'])
    tournament_anomalies.to_csv('tournaments_with_most_anomalies.csv', header=['anomalies'])

    # Plot the DBSCAN clusters in a 2-D t-SNE projection (noise points have label -1)
    plt.figure(figsize=(10, 6))
    reduced_embeddings = TSNE(n_components=2).fit_transform(embeddings)
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap='viridis', alpha=0.7)
    plt.colorbar(label='Cluster labels (noise/anomalies are -1)')
    plt.title('DBSCAN Clustering of Embeddings for Anomaly Detection')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.savefig('anomaly_detection_plot.png')
    plt.close()

    return anomalies
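
# End-to-end pipeline: load and prepare the data, tune hyperparameters with Optuna,
# retrain with the best configuration, evaluate on the held-out test set, then run
# anomaly detection and save all artifacts.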
if __name__ == "__main__":
    try:
        df = load_data()
        df, label_encoders = preprocess_data(df)
        df, numeric_columns = engineer_features(df)

        categorical_columns = ['tourney_id', 'surface', 'winner_id', 'loser_id']
        X_cat = df[categorical_columns].values
        X_num = df[numeric_columns].values
        y = df['rank_difference'].values

        X_cat_train, X_cat_test, X_num_train, X_num_test, y_train, y_test, train_indices, test_indices = train_test_split(
            X_cat, X_num, y, df.index, test_size=TEST_SIZE, random_state=RANDOM_SEED)

        categorical_dims = [len(label_encoders[col].classes_) for col in categorical_columns]
        numerical_dim = len(numeric_columns)

        X_train = (X_cat_train, X_num_train)
        # Note: the validation slice is carved out of the test split, so the final
        # test MSE below is not evaluated on fully held-out rows
        X_val = (X_cat_test[:VALIDATION_SIZE], X_num_test[:VALIDATION_SIZE])
        y_val = y_test[:VALIDATION_SIZE]

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=20)

        best_params = study.best_params
        logging.info(f"Best Hyperparameters: {best_params}")

        model = JointEmbeddedModel(categorical_dims, numerical_dim, best_params['embedding_dim'],
                                   best_params['hidden_dim'], best_params['dropout_rate'])
        dataloader = create_dataloader(X_train, y_train, batch_size=best_params['batch_size'])
        train_model(model, dataloader, (X_val, y_val), epochs=20, learning_rate=best_params['learning_rate'],
                    weight_decay=best_params['weight_decay'])

        model.load_state_dict(torch.load('best_model.pt'))
        test_predictions = evaluate_model(model, (X_cat_test, X_num_test))
        test_mse = np.mean((test_predictions - y_test) ** 2)
        logging.info(f"Final Test MSE: {test_mse}")

        anomalies = enhanced_anomaly_detection(model, (X_cat_test, X_num_test), df.loc[test_indices])

        # Save test predictions
        np.save('test_predictions.npy', test_predictions)

        # Save anomalies to CSV
        anomalies.to_csv('anomalies.csv', index=False)

        logging.info("Test predictions and anomalies saved successfully.")

        torch.save(model.state_dict(), 'final_model.pt')

        logging.info("Script execution completed successfully.")
    except Exception as e:
        logging.exception(f"An error occurred during script execution: {e}")