Jensen-holm committed on
Commit
fcc4124
·
1 Parent(s): 3cab2dd

leaving off here, working on kmeans algorithm

Browse files
Files changed (1) hide show
  1. cluster/kmeans.py +32 -22
cluster/kmeans.py CHANGED
@@ -12,29 +12,39 @@ class Kmeans(Clusterer):
12
 
13
  def build(
14
  self,
15
- X_train: np.array,
16
  ):
17
- # Randomly select centroid start points, uniformly distributed across the domain of the dataset
18
- minimum = np.min(X_train, axis=0)
19
- maximum = np.max(X_train, axis=0)
20
- centroids = [np.uniform(minimum, maximum) for _ in range(self.k)]
21
-
22
- # loop through and cluster data
23
- prev_centroids = 0
24
- iteration = 0
25
- while True:
26
- sorted_pts = [[] for _ in range(self.k)]
27
- for x in X_train:
28
- dists = euclidean(x, centroids)
29
-
30
- if not np.not_equal(
31
- centroids,
32
- prev_centroids,
33
- ).any():
34
- break
35
- if not iteration < self.k:
36
- break
37
- iteration += 1
 
 
 
 
 
 
 
 
 
 
38
 
39
  def label():
40
  ...
 
12
 
13
  def build(
14
  self,
15
+ X: np.array,
16
  ):
17
+ # randomly initialize centroids
18
+ centroids = X[np.random.choice(
19
+ X.shape[0],
20
+ self.k,
21
+ replace=False,
22
+ )]
23
+
24
+ # Calculate Euclidean distance between each data point and each centroid
25
+ # then assign each point to its closest cluster
26
+ clusters = self.assign_clusters(X, centroids)
27
+ centroids = self.update_centroids(self.k, X, clusters)
28
+
29
+ @staticmethod
30
+ def assign_clusters(
31
+ X: np.array,
32
+ centroids: np.array,
33
+ ) -> np.array:
34
+ distances = np.sqrt(((X - centroids[:, np.newaxis])**2).sum(axis=2))
35
+ clusts = np.argmin(distances, axis=0)
36
+ return clusts
37
+
38
+ @staticmethod
39
+ def update_centroids(
40
+ k: int,
41
+ X: np.array,
42
+ clusters: np.array,
43
+ ) -> np.array:
44
+ centroids = np.zeros((k, X.shape[1]))
45
+ for i in range(k):
46
+ centroids[i] = X[clusters == i].mean(axis=0)
47
+ return centroids
48
 
49
  def label():
50
  ...