ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
k_means.py
(3530B)
1 from __future__ import print_function, division
2 import numpy as np
3 from mlfromscratch.utils import normalize, euclidean_distance, Plot
4 from mlfromscratch.unsupervised_learning import *
5
class KMeans():
    """A simple clustering method that forms k clusters by iteratively
    reassigning samples to the closest centroids and then moving the
    centroids to the center of the newly formed clusters.


    Parameters:
    -----------
    k: int
        The number of clusters the algorithm will form.
    max_iterations: int
        The number of iterations the algorithm will run for if it does
        not converge before that.
    """
    def __init__(self, k=2, max_iterations=500):
        self.k = k
        self.max_iterations = max_iterations

    def _init_random_centroids(self, X):
        """ Initialize the centroids as k random samples of X.

        The samples are drawn without replacement whenever X has at least
        k rows, so the same row is never picked twice (a duplicated
        centroid would always leave one cluster empty). If k exceeds the
        number of rows, sampling falls back to drawing with replacement.

        Returns a float array of shape (k, n_features).
        """
        X = np.asarray(X)
        n_samples, n_features = X.shape
        allow_repeats = self.k > n_samples
        indices = np.random.choice(n_samples, size=self.k,
                                   replace=allow_repeats)
        # astype() copies, so the centroids never alias rows of X
        return X[indices].astype(float)

    def _closest_centroid(self, sample, centroids):
        """ Return the index of the closest centroid to the sample.

        Ties are resolved in favour of the lowest centroid index; 0 is
        returned if no centroid is strictly closer than infinity.
        """
        closest_i = 0
        closest_dist = float('inf')
        for i, centroid in enumerate(centroids):
            distance = euclidean_distance(sample, centroid)
            if distance < closest_dist:
                closest_i = i
                closest_dist = distance
        return closest_i

    def _create_clusters(self, centroids, X):
        """ Assign the samples to the closest centroids to create clusters.

        Returns a list of k lists; the i-th list holds the row indices of
        the samples whose closest centroid is centroid i.
        """
        clusters = [[] for _ in range(self.k)]
        for sample_i, sample in enumerate(X):
            centroid_i = self._closest_centroid(sample, centroids)
            clusters[centroid_i].append(sample_i)
        return clusters

    def _calculate_centroids(self, clusters, X, prev_centroids=None):
        """ Calculate new centroids as the means of the samples in each cluster.

        Parameters:
        -----------
        clusters: list of lists of int
            Row indices of X belonging to each cluster.
        X: array-like of shape (n_samples, n_features)
            The samples being clustered.
        prev_centroids: array-like of shape (k, n_features), optional
            Centroids from the previous iteration. A cluster with no
            members keeps its previous centroid when this is given;
            otherwise its centroid is set to NaN.

        Returns a float array of shape (k, n_features).
        """
        X = np.asarray(X)
        n_features = X.shape[1]
        centroids = np.zeros((self.k, n_features))
        for i, cluster in enumerate(clusters):
            if len(cluster) > 0:
                centroids[i] = np.mean(X[cluster], axis=0)
            elif prev_centroids is not None:
                # Empty cluster: the mean is undefined, so leave the
                # centroid where it was instead of poisoning it with NaN.
                centroids[i] = prev_centroids[i]
            else:
                centroids[i] = np.nan
        return centroids

    def _get_cluster_labels(self, clusters, X):
        """ Classify samples as the index of their clusters.

        Returns a float array with one entry per row of X.
        """
        # One prediction for each sample
        y_pred = np.zeros(np.shape(X)[0])
        for cluster_i, cluster in enumerate(clusters):
            for sample_i in cluster:
                y_pred[sample_i] = cluster_i
        return y_pred

    def predict(self, X):
        """ Do K-Means clustering and return cluster indices """
        X = np.asarray(X)

        # Initialize centroids as k random samples from X
        centroids = self._init_random_centroids(X)

        clusters = None
        # Iterate until convergence or for max iterations
        for _ in range(self.max_iterations):
            # Assign samples to closest centroids (create clusters)
            clusters = self._create_clusters(centroids, X)
            # Save current centroids for convergence check
            prev_centroids = centroids
            # Calculate new centroids from the clusters
            centroids = self._calculate_centroids(clusters, X, prev_centroids)
            # If no centroids have changed => convergence
            if np.array_equal(centroids, prev_centroids):
                break

        if clusters is None:
            # max_iterations < 1: the loop never ran, so label the samples
            # against the initial centroids instead of failing.
            clusters = self._create_clusters(centroids, X)

        return self._get_cluster_labels(clusters, X)
91