ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
partitioning_around_medoids.py
(4900B)
1 from __future__ import print_function, division
2 import numpy as np
3 from mlfromscratch.utils import normalize, euclidean_distance, Plot
4 from mlfromscratch.unsupervised_learning import PCA
5
6
7 class PAM():
8 """A simple clustering method that forms k clusters by first assigning
9 samples to the closest medoids, and then swapping medoids with non-medoid
10 samples if the total distance (cost) between the cluster members and their medoid
11 is smaller than prevoisly.
12
13
14 Parameters:
15 -----------
16 k: int
17 The number of clusters the algorithm will form.
18 """
19 def __init__(self, k=2):
20 self.k = k
21
22 def _init_random_medoids(self, X):
23 """ Initialize the medoids as random samples """
24 n_samples, n_features = np.shape(X)
25 medoids = np.zeros((self.k, n_features))
26 for i in range(self.k):
27 medoid = X[np.random.choice(range(n_samples))]
28 medoids[i] = medoid
29 return medoids
30
31 def _closest_medoid(self, sample, medoids):
32 """ Return the index of the closest medoid to the sample """
33 closest_i = None
34 closest_distance = float("inf")
35 for i, medoid in enumerate(medoids):
36 distance = euclidean_distance(sample, medoid)
37 if distance < closest_distance:
38 closest_i = i
39 closest_distance = distance
40 return closest_i
41
42 def _create_clusters(self, X, medoids):
43 """ Assign the samples to the closest medoids to create clusters """
44 clusters = [[] for _ in range(self.k)]
45 for sample_i, sample in enumerate(X):
46 medoid_i = self._closest_medoid(sample, medoids)
47 clusters[medoid_i].append(sample_i)
48 return clusters
49
50 def _calculate_cost(self, X, clusters, medoids):
51 """ Calculate the cost (total distance between samples and their medoids) """
52 cost = 0
53 # For each cluster
54 for i, cluster in enumerate(clusters):
55 medoid = medoids[i]
56 for sample_i in cluster:
57 # Add distance between sample and medoid as cost
58 cost += euclidean_distance(X[sample_i], medoid)
59 return cost
60
61 def _get_non_medoids(self, X, medoids):
62 """ Returns a list of all samples that are not currently medoids """
63 non_medoids = []
64 for sample in X:
65 if not sample in medoids:
66 non_medoids.append(sample)
67 return non_medoids
68
69 def _get_cluster_labels(self, clusters, X):
70 """ Classify samples as the index of their clusters """
71 # One prediction for each sample
72 y_pred = np.zeros(np.shape(X)[0])
73 for cluster_i in range(len(clusters)):
74 cluster = clusters[cluster_i]
75 for sample_i in cluster:
76 y_pred[sample_i] = cluster_i
77 return y_pred
78
79 def predict(self, X):
80 """ Do Partitioning Around Medoids and return the cluster labels """
81 # Initialize medoids randomly
82 medoids = self._init_random_medoids(X)
83 # Assign samples to closest medoids
84 clusters = self._create_clusters(X, medoids)
85
86 # Calculate the initial cost (total distance between samples and
87 # corresponding medoids)
88 cost = self._calculate_cost(X, clusters, medoids)
89
90 # Iterate until we no longer have a cheaper cost
91 while True:
92 best_medoids = medoids
93 lowest_cost = cost
94 for medoid in medoids:
95 # Get all non-medoid samples
96 non_medoids = self._get_non_medoids(X, medoids)
97 # Calculate the cost when swapping medoid and samples
98 for sample in non_medoids:
99 # Swap sample with the medoid
100 new_medoids = medoids.copy()
101 new_medoids[medoids == medoid] = sample
102 # Assign samples to new medoids
103 new_clusters = self._create_clusters(X, new_medoids)
104 # Calculate the cost with the new set of medoids
105 new_cost = self._calculate_cost(
106 X, new_clusters, new_medoids)
107 # If the swap gives us a lower cost we save the medoids and cost
108 if new_cost < lowest_cost:
109 lowest_cost = new_cost
110 best_medoids = new_medoids
111 # If there was a swap that resultet in a lower cost we save the
112 # resulting medoids from the best swap and the new cost
113 if lowest_cost < cost:
114 cost = lowest_cost
115 medoids = best_medoids
116 # Else finished
117 else:
118 break
119
120 final_clusters = self._create_clusters(X, medoids)
121 # Return the samples cluster indices as labels
122 return self._get_cluster_labels(final_clusters, X)
123