Skip to content

Commit b78dfe3

Browse files
authored
feat: Update for TensorFlow 2, add doctests
Updated K-means clustering implementation for TensorFlow 2.x compatibility. Doctests have been added for code clarity and testing. Core algorithm logic and structure remain unchanged.
1 parent 7a0fee4 commit b78dfe3

1 file changed

Lines changed: 72 additions & 138 deletions

File tree

Lines changed: 72 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -1,146 +1,80 @@
11
from random import shuffle
2-
32
import tensorflow as tf
4-
from numpy import array
3+
import numpy as np
54

65

7-
def tf_k_means_cluster_fixed(vectors, noofclusters, max_iterations=100, tolerance=1e-4):
    """
    Perform K-means clustering with a vectorized TensorFlow 2.x implementation.

    Parameters:
        vectors: array-like of shape (n, d) — the data points to cluster.
        noofclusters (int): the number of clusters (k).
        max_iterations (int): upper bound on expectation/maximization
            iterations, applied even if convergence is never reached.
        tolerance (float): convergence criterion — iteration stops once the
            total squared movement of the centroids between two consecutive
            iterations falls below this value.

    Returns:
        tuple: ``(centroids, assignments)`` — a ``(k, d)`` NumPy array of
        final centroid positions and a length-``n`` NumPy array of cluster
        indices.

    Raises:
        ValueError: if ``noofclusters`` exceeds the number of data points.

    (Set the same random seed in all examples for reproducibility.)
    >>> tf.random.set_seed(42)

    >>> data = np.array([[0.0, 0.0], [0.1, 0.1], [10.0, 10.0]], dtype=np.float32)
    >>> centroids, assignments = tf_k_means_cluster_fixed(data, 2)
    >>> print(centroids, assignments)
    [[ 0.05  0.05]
     [10.   10.  ]] [0 0 1]

    Identical data points collapse onto a single centroid:
    >>> data_identical = np.array([[1.0, 1.0]] * 4, dtype=np.float32)
    >>> centroids, assignments = tf_k_means_cluster_fixed(data_identical, 1)
    >>> print(centroids, assignments)
    [[1. 1.]] [0 0 0 0]

    Requesting more clusters than data points is rejected:
    >>> data = np.array([[0.0, 0.0], [0.9, 0.9], [13.0, 15.0]], dtype=np.float32)
    >>> tf_k_means_cluster_fixed(data, 5)
    Traceback (most recent call last):
        ...
    ValueError: Number of clusters (k) cannot be greater than the number of data points.
    """
    vectors = tf.constant(vectors, dtype=tf.float32)
    noofclusters = int(noofclusters)
    # Materialize the count as a Python int so the k-vs-n comparison below is
    # an ordinary integer comparison rather than tensor-vs-int.
    num_data_points = int(tf.shape(vectors)[0])

    if noofclusters > num_data_points:
        raise ValueError(
            "Number of clusters (k) cannot be greater than the number of data points."
        )

    # Initialize centroids as k randomly chosen (distinct) data points.
    initial_indices = tf.random.shuffle(tf.range(num_data_points))[:noofclusters]
    centroids = tf.Variable(tf.gather(vectors, initial_indices))

    @tf.function
    def train_step():
        # Expectation step: assign every vector to its nearest centroid.
        # Squared Euclidean distance suffices — argmin is unaffected by sqrt.
        distances_sq = tf.reduce_sum(
            tf.square(tf.expand_dims(vectors, 1) - tf.expand_dims(centroids, 0)), 2
        )
        assignments = tf.argmin(distances_sq, axis=1)

        # Maximization step: recompute each centroid as the mean of its
        # members via segment sums — one pass, no per-cluster Python loop.
        sums = tf.math.unsorted_segment_sum(
            vectors, assignments, num_segments=noofclusters
        )
        counts = tf.math.unsorted_segment_sum(
            tf.ones_like(vectors), assignments, num_segments=noofclusters
        )

        # Guard against division by zero for clusters that lost all members.
        new_centroids = sums / tf.maximum(counts, 1e-9)

        # Empty clusters keep their previous centroid instead of collapsing
        # toward the origin.
        is_empty = tf.equal(tf.reduce_sum(counts, axis=1), 0)
        new_centroids = tf.where(tf.expand_dims(is_empty, 1), centroids, new_centroids)

        return assignments, new_centroids

    # Pre-bind assignments so the return below is valid even when
    # max_iterations <= 0 (previously this raised NameError).
    assignments = tf.zeros([num_data_points], dtype=tf.int64)

    for _ in range(max_iterations):
        old_centroids = tf.identity(centroids)
        assignments, new_centroids_val = train_step()
        centroids.assign(new_centroids_val)

        # Converged once the total squared centroid movement is negligible.
        if tf.reduce_sum(tf.square(old_centroids - centroids)) < tolerance:
            break

    return centroids.numpy(), assignments.numpy()

0 commit comments

Comments
 (0)