ml-finance-python

python scripts for finance machine learning

git clone https://9o.is/git/ml-finance-python.git

partitioning_around_medoids.py

(4900B)


      1 from __future__ import print_function, division
      2 import numpy as np
      3 from mlfromscratch.utils import normalize, euclidean_distance, Plot
      4 from mlfromscratch.unsupervised_learning import PCA
      5 
      6 
      7 class PAM():
      8     """A simple clustering method that forms k clusters by first assigning
      9     samples to the closest medoids, and then swapping medoids with non-medoid
     10     samples if the total distance (cost) between the cluster members and their medoid
     11     is smaller than prevoisly.
     12 
     13 
     14     Parameters:
     15     -----------
     16     k: int
     17         The number of clusters the algorithm will form.
     18     """
     19     def __init__(self, k=2):
     20         self.k = k
     21 
     22     def _init_random_medoids(self, X):
     23         """ Initialize the medoids as random samples """
     24         n_samples, n_features = np.shape(X)
     25         medoids = np.zeros((self.k, n_features))
     26         for i in range(self.k):
     27             medoid = X[np.random.choice(range(n_samples))]
     28             medoids[i] = medoid
     29         return medoids
     30 
     31     def _closest_medoid(self, sample, medoids):
     32         """ Return the index of the closest medoid to the sample """
     33         closest_i = None
     34         closest_distance = float("inf")
     35         for i, medoid in enumerate(medoids):
     36             distance = euclidean_distance(sample, medoid)
     37             if distance < closest_distance:
     38                 closest_i = i
     39                 closest_distance = distance
     40         return closest_i
     41 
     42     def _create_clusters(self, X, medoids):
     43         """ Assign the samples to the closest medoids to create clusters """
     44         clusters = [[] for _ in range(self.k)]
     45         for sample_i, sample in enumerate(X):
     46             medoid_i = self._closest_medoid(sample, medoids)
     47             clusters[medoid_i].append(sample_i)
     48         return clusters
     49 
     50     def _calculate_cost(self, X, clusters, medoids):
     51         """ Calculate the cost (total distance between samples and their medoids) """
     52         cost = 0
     53         # For each cluster
     54         for i, cluster in enumerate(clusters):
     55             medoid = medoids[i]
     56             for sample_i in cluster:
     57                 # Add distance between sample and medoid as cost
     58                 cost += euclidean_distance(X[sample_i], medoid)
     59         return cost
     60 
     61     def _get_non_medoids(self, X, medoids):
     62         """ Returns a list of all samples that are not currently medoids """
     63         non_medoids = []
     64         for sample in X:
     65             if not sample in medoids:
     66                 non_medoids.append(sample)
     67         return non_medoids
     68 
     69     def _get_cluster_labels(self, clusters, X):
     70         """ Classify samples as the index of their clusters """
     71         # One prediction for each sample
     72         y_pred = np.zeros(np.shape(X)[0])
     73         for cluster_i in range(len(clusters)):
     74             cluster = clusters[cluster_i]
     75             for sample_i in cluster:
     76                 y_pred[sample_i] = cluster_i
     77         return y_pred
     78 
     79     def predict(self, X):
     80         """ Do Partitioning Around Medoids and return the cluster labels """
     81         # Initialize medoids randomly
     82         medoids = self._init_random_medoids(X)
     83         # Assign samples to closest medoids
     84         clusters = self._create_clusters(X, medoids)
     85 
     86         # Calculate the initial cost (total distance between samples and
     87         # corresponding medoids)
     88         cost = self._calculate_cost(X, clusters, medoids)
     89 
     90         # Iterate until we no longer have a cheaper cost
     91         while True:
     92             best_medoids = medoids
     93             lowest_cost = cost
     94             for medoid in medoids:
     95                 # Get all non-medoid samples
     96                 non_medoids = self._get_non_medoids(X, medoids)
     97                 # Calculate the cost when swapping medoid and samples
     98                 for sample in non_medoids:
     99                     # Swap sample with the medoid
    100                     new_medoids = medoids.copy()
    101                     new_medoids[medoids == medoid] = sample
    102                     # Assign samples to new medoids
    103                     new_clusters = self._create_clusters(X, new_medoids)
    104                     # Calculate the cost with the new set of medoids
    105                     new_cost = self._calculate_cost(
    106                         X, new_clusters, new_medoids)
    107                     # If the swap gives us a lower cost we save the medoids and cost
    108                     if new_cost < lowest_cost:
    109                         lowest_cost = new_cost
    110                         best_medoids = new_medoids
    111             # If there was a swap that resultet in a lower cost we save the
    112             # resulting medoids from the best swap and the new cost 
    113             if lowest_cost < cost:
    114                 cost = lowest_cost
    115                 medoids = best_medoids 
    116             # Else finished
    117             else:
    118                 break
    119 
    120         final_clusters = self._create_clusters(X, medoids)
    121         # Return the samples cluster indices as labels
    122         return self._get_cluster_labels(final_clusters, X)
    123