Added hard and soft Kmeans clustering with tests #71

Open · wants to merge 2 commits into master
6 changes: 6 additions & 0 deletions numpy_ml/cluster/README.md
@@ -0,0 +1,6 @@
# Clustering Models
The `kmeans.py` module implements:

1. [Hard kmeans clustering](https://user-images.githubusercontent.com/1905599/119421132-de04f700-bcb2-11eb-98cd-4337d0b9496d.png), with hard assignment of each data point to exactly one cluster at a time.
2. [Soft kmeans clustering](https://user-images.githubusercontent.com/1905599/119421211-0bea3b80-bcb3-11eb-9e71-a337da8db24d.png), with probabilistic assignment of data points: each data point has a degree of membership in every cluster. The most probable cluster can then be taken as the cluster index of the data point. Alternatively, the full probability distribution can be used for any other purpose, as it captures the uncertainty of the clustering routine. A usage sketch follows below.
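
A minimal usage sketch (the toy data and variable names here are illustrative; the `KMeans` constructor and accessors are the ones added in this PR):

```python
import numpy as np
from numpy_ml.cluster.kmeans import KMeans

# Two well-separated blobs as toy data
X = np.vstack([
    np.random.randn(50, 2) + 5.0,
    np.random.randn(50, 2) - 5.0,
])

# Hard clustering: one label per data point
km_hard = KMeans(X, cluster_method="hard", n_clusters=2)
labels = km_hard.get_assignments()    # one label per point (length 100)
centroids = km_hard.get_centroids()   # shape (2, 2)

# Soft clustering: a membership distribution per data point
km_soft = KMeans(X, cluster_method="soft", n_clusters=2, beta=1.0)
proba = km_soft.get_proba()           # shape (100, 2); rows sum to ~1
```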

1 change: 1 addition & 0 deletions numpy_ml/cluster/__init__.py
@@ -0,0 +1 @@
from .kmeans import *
210 changes: 210 additions & 0 deletions numpy_ml/cluster/kmeans.py
@@ -0,0 +1,210 @@
"""An implementation of kmeans clustering (hard, soft)"""

import numpy as np

class KMeans:
    def __init__(self, X_train, cluster_method="hard", n_clusters=5, beta=1.0):
        r"""
        k-means implementation (hard and soft clustering).

        Parameters
        ----------
        X_train : array
            The input matrix with dimensions (number of data points x number of attributes).

        cluster_method : {'hard', 'soft'}
            Whether to use hard or soft clustering. Default is 'hard'.

        n_clusters : int
            Number of clusters. Default is 5.

        beta : float
            Bandwidth of the radial basis kernel used for soft assignments.
            Only used when `cluster_method` is 'soft'. Default is 1.0.
        """
        self.cluster_method = cluster_method
        self.X_train = X_train
        self.n_clusters = n_clusters

        self.centroids = None
        self.assignments = None

        if self.cluster_method == "soft":
            self.beta = beta  # beta is only used in soft clustering
            self.centroids, self.assignments = self._kmeans_soft()
        elif self.cluster_method == "hard":
            self.centroids, self.assignments = self._kmeans_hard()
        else:
            raise ValueError("cluster_method must be 'hard' or 'soft'")

    def _is_converged(self, prev, cur):
        r"""
        Check for convergence by testing whether the centroids (or assignments)
        have stopped changing between iterations.

        Parameters
        ----------
        prev : array
            Centroids (or assignments) from the previous iteration.
        cur : array
            Centroids (or assignments) from the current iteration.

        Returns
        -------
        converged : bool
            True if convergence is reached, False otherwise.
        """
        return np.allclose(prev, cur)


    def _kmeans_hard(self):
        r"""
        Hard clustering: the "vanilla" k-means algorithm, which assigns every
        data point to exactly one cluster.

        Returns
        -------
        centroids : array
            The centroid matrix with dimensions (number of centroids x number of attributes).
        assignments : array
            The assignments vector with dimensions (number of data points).
        """
        size_of_data = self.X_train.shape[0]
        # initialize centroids by sampling distinct data points
        centroid_indexes = np.random.choice(size_of_data, self.n_clusters, replace=False)
        centroids = np.take(self.X_train, centroid_indexes, axis=0)
        assignments = [-1] * size_of_data
        n_dims = self.X_train.shape[1]
        iteration, max_iteration = 0, 100
        prev_weight_vec = np.zeros(n_dims)
        weight_list = [prev_weight_vec, centroids]

        # iterate until the centroids stop changing or the iteration cap is hit
        while not self._is_converged(weight_list[-2], weight_list[-1]) and iteration < max_iteration:
            # copy so the convergence check compares distinct snapshots
            centroids = weight_list[-1].copy()

            # update cluster assignments: each point joins its nearest centroid
            for i, x_val in enumerate(self.X_train):
                min_distance = float("inf")
                for k, mu_val in enumerate(centroids):
                    dist = np.linalg.norm(x_val - mu_val)
                    if dist < min_distance:
                        min_distance = dist
                        assignments[i] = k

            # update centroids: mean of the points assigned to each cluster
            for label in range(self.n_clusters):
                filter_indices = np.where(np.array(assignments) == label)[0]
                if len(filter_indices) > 0:
                    filter_xdata = np.take(self.X_train, filter_indices, axis=0)
                    centroids[label, :] = np.mean(filter_xdata, axis=0)
            weight_list.append(centroids)
            weight_list.pop(0)
            iteration += 1
        return centroids, assignments


    def _kmeans_soft(self):
        r"""
        Soft clustering: a modification of hard k-means in which every data
        point is assigned a degree of membership in each cluster, giving a
        probability distribution over clusters. The width of the membership
        kernel is controlled by ``self.beta``.

        Returns
        -------
        centroids : array
            The centroid matrix with dimensions (number of centroids x number of attributes).
        assignments : array
            The assignments matrix with dimensions (number of data points x number of centroids).
            This is the probability distribution of the membership of each data point in each cluster.
        """
        size_of_data = self.X_train.shape[0]
        # initialize centroids by sampling distinct data points
        centroid_indexes = np.random.choice(size_of_data, self.n_clusters, replace=False)
        centroids = np.take(self.X_train, centroid_indexes, axis=0)
        assignments = -1 * np.ones((size_of_data, self.n_clusters))
        n_dims = self.X_train.shape[1]
        iteration, max_iteration = 0, 100
        tol = 0.00001  # prevent division by zero
        prev_weight_vec = np.zeros(n_dims)
        weight_list = [prev_weight_vec, centroids]

        # iterate until the centroids stop changing or the iteration cap is hit
        while not self._is_converged(weight_list[-2], weight_list[-1]) and iteration < max_iteration:
            # copy so the convergence check compares distinct snapshots
            centroids = weight_list[-1].copy()

            # update cluster assignments: radial-basis weights exp(-dist / beta)
            for i, x_val in enumerate(self.X_train):
                for k, mu_val in enumerate(centroids):
                    dist = np.linalg.norm(x_val - mu_val)
                    assignments[i][k] = np.exp(-dist / self.beta)
            # normalize each row of the assignment matrix into a distribution
            row_sums = assignments.sum(axis=1)
            assignments = assignments / (row_sums[:, np.newaxis] + tol)

            # update centroids: membership-weighted mean over all data points,
            # mu_k = sum_i w_ik * x_i / sum_i w_ik
            for label in range(self.n_clusters):
                weight_by_label = assignments[:, label].reshape((size_of_data, 1))
                numerator = np.sum(weight_by_label * self.X_train, axis=0)
                denominator = np.sum(weight_by_label)
                centroids[label, :] = numerator / (denominator + tol)
            weight_list.append(centroids)
            weight_list.pop(0)
            iteration += 1
        return centroids, assignments


    def get_centroids(self):
        r"""
        Get the centroids.

        Returns
        -------
        centroids : array
            The centroid matrix with dimensions (number of centroids x number of attributes).
        """
        return self.centroids


    def get_assignments(self):
        r"""
        Get the assignment of each data point to a cluster. For soft
        clustering, this is the index of the most probable cluster.

        Returns
        -------
        assignments : array
            The assignments vector with dimensions (number of data points).
        """
        assignments = self.assignments
        if self.cluster_method == "soft":
            assignments = np.argmax(self.assignments, axis=1)
        return assignments


    def get_proba(self):
        r"""
        Get the probability distribution over cluster assignments.
        Note: only available for soft clustering.

        Returns
        -------
        assignments : array or None
            None for hard clustering; otherwise the assignments matrix with
            dimensions (number of data points x number of centroids), i.e. the
            probability distribution of the membership of each data point in each cluster.
        """
        assignments = None
        if self.cluster_method == "soft":
            assignments = self.assignments
        return assignments
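
For reference, the per-point memberships computed in `_kmeans_soft` are normalized radial-basis weights. A standalone vectorized sketch of the same computation (the function name and `tol` default here are illustrative, not part of this PR):

```python
import numpy as np

def soft_assignments(X, centroids, beta=1.0, tol=1e-5):
    """Normalized RBF responsibilities: w_ik proportional to exp(-||x_i - mu_k|| / beta)."""
    # pairwise Euclidean distances, shape (n_samples, n_clusters)
    dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    weights = np.exp(-dists / beta)
    # normalize each row into a probability distribution
    return weights / (weights.sum(axis=1, keepdims=True) + tol)
```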
54 changes: 54 additions & 0 deletions numpy_ml/tests/test_cluster.py
@@ -0,0 +1,54 @@
# flake8: noqa
import numpy as np
from sklearn.cluster import KMeans as origKMeans
from sklearn.datasets import make_blobs
from sklearn.metrics.cluster import fowlkes_mallows_score
from numpy_ml.cluster.kmeans import KMeans

def test_kmeans():
    seed = 12345
    np.random.seed(seed)
    n_clusters = 4
    # generate the dataset
    orig_num_of_samples, orig_num_of_features = 3000, 300
    X, y_true = make_blobs(
        n_samples=orig_num_of_samples,
        centers=n_clusters,
        n_features=orig_num_of_features,
        cluster_std=0.50,
        random_state=seed,
    )

    # scikit-learn K-means (hard clustering) as the gold standard
    kmeans = origKMeans(n_clusters=n_clusters, random_state=seed).fit(X)
    gold_labels = kmeans.labels_

    # Test the dimensions of the parameters
    print("Hard Clustering")
    km_hard = KMeans(X, cluster_method="hard", n_clusters=n_clusters)

    assert km_hard.get_centroids().shape == (n_clusters, orig_num_of_features), "mismatch in centroid shape"
    assert len(km_hard.get_assignments()) == orig_num_of_samples, "mismatch in assignment size"

    # compare our clustering to the gold standard
    mallows_score = fowlkes_mallows_score(gold_labels, km_hard.get_assignments())
    print("Fowlkes-Mallows score: {} (values closer to 1 indicate better clustering)".format(mallows_score))

    print("Soft Clustering")
    # use the default value of beta
    km_soft = KMeans(X, cluster_method="soft", n_clusters=n_clusters)

    # get_proba is only meaningful for soft clustering
    assert km_soft.get_proba().shape == (orig_num_of_samples, n_clusters), "mismatch in assignment probability shape"
    assert km_soft.get_centroids().shape == (n_clusters, orig_num_of_features), "mismatch in centroid shape"
    assert len(km_soft.get_assignments()) == orig_num_of_samples, "mismatch in assignment size"

    # compare our clustering to the gold standard
    mallows_score = fowlkes_mallows_score(gold_labels, km_soft.get_assignments())
    print("Fowlkes-Mallows score: {} (values closer to 1 indicate better clustering)".format(mallows_score))