|
1 | 1 | from random import shuffle |
2 | | - |
3 | 2 | import tensorflow as tf |
4 | | -from numpy import array |
| 3 | +import numpy as np |
5 | 4 |
|
6 | 5 |
|
7 | | -def tf_k_means_cluster(vectors, noofclusters): |
| 6 | +def tf_k_means_cluster_fixed(vectors, noofclusters, max_iterations=100, tolerance=1e-4):
8 | 7 | """ |
9 | | - K-Means Clustering using TensorFlow. |
10 | | - 'vectors' should be a n*k 2-D NumPy array, where n is the number |
11 | | - of vectors of dimensionality k. |
12 | | - 'noofclusters' should be an integer. |
| 8 | + Performs K-means clustering with an efficient, fully vectorized TensorFlow 2.x implementation.
| 9 | +
|
| 10 | + Parameters: |
| 11 | + vectors (array-like): An n x k array of n data vectors of dimensionality k.
| 12 | + noofclusters (int): The number of clusters (k).
| 13 | + max_iterations (int): Maximum number of refinement iterations; the algorithm keeps updating cluster assignments and centroid positions until this limit is reached or convergence is detected.
| 14 | + tolerance (float): Convergence criterion; iteration stops when the total squared movement of the centroids between consecutive iterations falls below this value.
| 15 | +
|
| 16 | + (Set the same random seed in all examples for reproducibility.)
| 17 | + >>> tf.random.set_seed(42)
| 18 | +
|
| 19 | + Example 1: |
| 20 | + >>> data2 = np.array([[0.0, 0.0], [0.1, 0.1], [10.0, 10.0]], dtype=np.float32)
| 21 | + >>> centroids2, assignments2 = tf_k_means_cluster_fixed(data2, 2)
| 22 | + >>> print(centroids2, assignments2)
| 23 | + [[ 0.05 0.05] |
| 24 | + [10. 10. ]] [0 0 1] |
| 25 | +
|
| 26 | + Example 2 (Idential data points): |
| 27 | + >>>data_identical = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32) |
| 28 | + >>>centroids, assignments = tf_k_means_cluster_fixed(data_identical, 1) |
| 29 | + >>>print(centroids,assignments) |
| 30 | +
|
| 31 | + Example 3 (k > N raises a ValueError):
| 32 | + >>> data = np.array([[0.0, 0.0], [0.9, 0.9], [13.0, 15.0]], dtype=np.float32)
| 33 | + >>> centroids, assignments = tf_k_means_cluster_fixed(data, 5)
| 34 | + ValueError: Number of clusters (k) cannot be greater than the number of data points.
13 | 35 | """ |
14 | 36 |
|
| 37 | + |
| 38 | + vectors = tf.constant(vectors, dtype=tf.float32) |
15 | 39 | noofclusters = int(noofclusters) |
16 | | - assert noofclusters < len(vectors) |
17 | | - |
18 | | - # Find out the dimensionality |
19 | | - dim = len(vectors[0]) |
20 | | - |
21 | | - # Will help select random centroids from among the available vectors |
22 | | - vector_indices = list(range(len(vectors))) |
23 | | - shuffle(vector_indices) |
24 | | - |
25 | | - # GRAPH OF COMPUTATION |
26 | | - # We initialize a new graph and set it as the default during each run |
27 | | - # of this algorithm. This ensures that as this function is called |
28 | | - # multiple times, the default graph doesn't keep getting crowded with |
29 | | - # unused ops and Variables from previous function calls. |
30 | | - |
31 | | - graph = tf.Graph() |
32 | | - |
33 | | - with graph.as_default(): |
34 | | - # SESSION OF COMPUTATION |
35 | | - |
36 | | - sess = tf.Session() |
37 | | - |
38 | | - ##CONSTRUCTING THE ELEMENTS OF COMPUTATION |
39 | | - |
40 | | - ##First lets ensure we have a Variable vector for each centroid, |
41 | | - ##initialized to one of the vectors from the available data points |
42 | | - centroids = [ |
43 | | - tf.Variable(vectors[vector_indices[i]]) for i in range(noofclusters) |
44 | | - ] |
45 | | - ##These nodes will assign the centroid Variables the appropriate |
46 | | - ##values |
47 | | - centroid_value = tf.placeholder("float64", [dim]) |
48 | | - cent_assigns = [] |
49 | | - for centroid in centroids: |
50 | | - cent_assigns.append(tf.assign(centroid, centroid_value)) |
51 | | - |
52 | | - ##Variables for cluster assignments of individual vectors(initialized |
53 | | - ##to 0 at first) |
54 | | - assignments = [tf.Variable(0) for i in range(len(vectors))] |
55 | | - ##These nodes will assign an assignment Variable the appropriate |
56 | | - ##value |
57 | | - assignment_value = tf.placeholder("int32") |
58 | | - cluster_assigns = [] |
59 | | - for assignment in assignments: |
60 | | - cluster_assigns.append(tf.assign(assignment, assignment_value)) |
61 | | - |
62 | | - ##Now lets construct the node that will compute the mean |
63 | | - # The placeholder for the input |
64 | | - mean_input = tf.placeholder("float", [None, dim]) |
65 | | - # The Node/op takes the input and computes a mean along the 0th |
66 | | - # dimension, i.e. the list of input vectors |
67 | | - mean_op = tf.reduce_mean(mean_input, 0) |
68 | | - |
69 | | - ##Node for computing Euclidean distances |
70 | | - # Placeholders for input |
71 | | - v1 = tf.placeholder("float", [dim]) |
72 | | - v2 = tf.placeholder("float", [dim]) |
73 | | - euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub(v1, v2), 2))) |
74 | | - |
75 | | - ##This node will figure out which cluster to assign a vector to, |
76 | | - ##based on Euclidean distances of the vector from the centroids. |
77 | | - # Placeholder for input |
78 | | - centroid_distances = tf.placeholder("float", [noofclusters]) |
79 | | - cluster_assignment = tf.argmin(centroid_distances, 0) |
80 | | - |
81 | | - ##INITIALIZING STATE VARIABLES |
82 | | - |
83 | | - ##This will help initialization of all Variables defined with respect |
84 | | - ##to the graph. The Variable-initializer should be defined after |
85 | | - ##all the Variables have been constructed, so that each of them |
86 | | - ##will be included in the initialization. |
87 | | - init_op = tf.initialize_all_variables() |
88 | | - |
89 | | - # Initialize all variables |
90 | | - sess.run(init_op) |
91 | | - |
92 | | - ##CLUSTERING ITERATIONS |
93 | | - |
94 | | - # Now perform the Expectation-Maximization steps of K-Means clustering |
95 | | - # iterations. To keep things simple, we will only do a set number of |
96 | | - # iterations, instead of using a Stopping Criterion. |
97 | | - noofiterations = 100 |
98 | | - for _ in range(noofiterations): |
99 | | - ##EXPECTATION STEP |
100 | | - ##Based on the centroid locations till last iteration, compute |
101 | | - ##the _expected_ centroid assignments. |
102 | | - # Iterate over each vector |
103 | | - for vector_n in range(len(vectors)): |
104 | | - vect = vectors[vector_n] |
105 | | - # Compute Euclidean distance between this vector and each |
106 | | - # centroid. Remember that this list cannot be named |
107 | | - #'centroid_distances', since that is the input to the |
108 | | - # cluster assignment node. |
109 | | - distances = [ |
110 | | - sess.run(euclid_dist, feed_dict={v1: vect, v2: sess.run(centroid)}) |
111 | | - for centroid in centroids |
112 | | - ] |
113 | | - # Now use the cluster assignment node, with the distances |
114 | | - # as the input |
115 | | - assignment = sess.run( |
116 | | - cluster_assignment, feed_dict={centroid_distances: distances} |
117 | | - ) |
118 | | - # Now assign the value to the appropriate state variable |
119 | | - sess.run( |
120 | | - cluster_assigns[vector_n], feed_dict={assignment_value: assignment} |
121 | | - ) |
122 | | - |
123 | | - ##MAXIMIZATION STEP |
124 | | - # Based on the expected state computed from the Expectation Step, |
125 | | - # compute the locations of the centroids so as to maximize the |
126 | | - # overall objective of minimizing within-cluster Sum-of-Squares |
127 | | - for cluster_n in range(noofclusters): |
128 | | - # Collect all the vectors assigned to this cluster |
129 | | - assigned_vects = [ |
130 | | - vectors[i] |
131 | | - for i in range(len(vectors)) |
132 | | - if sess.run(assignments[i]) == cluster_n |
133 | | - ] |
134 | | - # Compute new centroid location |
135 | | - new_location = sess.run( |
136 | | - mean_op, feed_dict={mean_input: array(assigned_vects)} |
137 | | - ) |
138 | | - # Assign value to appropriate variable |
139 | | - sess.run( |
140 | | - cent_assigns[cluster_n], feed_dict={centroid_value: new_location} |
141 | | - ) |
142 | | - |
143 | | - # Return centroids and assignments |
144 | | - centroids = sess.run(centroids) |
145 | | - assignments = sess.run(assignments) |
146 | | - return centroids, assignments |
| 40 | + num_data_points = tf.shape(vectors)[0] |
| 41 | + |
| 42 | + if noofclusters > num_data_points: |
| 43 | + raise ValueError("Number of clusters (k) cannot be greater than the number of data points.") |
| 44 | + |
| 45 | + # Initialize centroids by picking k random data points: shuffle the row indices and gather the first k
| 46 | + initial_indices = tf.random.shuffle(tf.range(num_data_points))[:noofclusters]
| 47 | + centroids = tf.Variable(tf.gather(vectors, initial_indices)) |
| 48 | + |
| 49 | + @tf.function |
| 50 | + def train_step(): |
| 51 | + # Find the closest centroid for each vector: broadcasting [N, 1, D] - [1, k, D] and summing squares over the feature axis gives an [N, k] matrix of squared distances
| 52 | + distances_sq = tf.reduce_sum( |
| 53 | + tf.square(tf.expand_dims(vectors, 1) - tf.expand_dims(centroids, 0)), 2 |
| 54 | + ) |
| 55 | + assignments = tf.argmin(distances_sq, axis=1) |
| 56 | + |
| 57 | + # Recalculate centroids efficiently: per-cluster sums and counts via segment sums, then divide
| 58 | + sums = tf.math.unsorted_segment_sum(vectors, assignments, num_segments=noofclusters) |
| 59 | + counts = tf.math.unsorted_segment_sum(tf.ones_like(vectors), assignments, num_segments=noofclusters) |
| 60 | + |
| 61 | + # Avoid division by zero for empty clusters |
| 62 | + new_centroids = sums / tf.maximum(counts, 1e-9) |
| 63 | + |
| 64 | + # For empty clusters, keep the old centroid to prevent them from moving to the origin |
| 65 | + is_empty = tf.equal(tf.reduce_sum(counts, axis=1), 0) |
| 66 | + new_centroids = tf.where(tf.expand_dims(is_empty, 1), centroids, new_centroids) |
| 67 | + |
| 68 | + return assignments, new_centroids |
| 69 | + |
| 70 | + # Main iteration loop |
| 71 | + for _ in range(max_iterations):
| 72 | + old_centroids = tf.identity(centroids) |
| 73 | + assignments, new_centroids_val = train_step() |
| 74 | + centroids.assign(new_centroids_val) |
| 75 | + |
| 76 | + # Check for convergence |
| 77 | + if tf.reduce_sum(tf.square(old_centroids - centroids)) < tolerance: |
| 78 | + break |
| 79 | + |
| 80 | + return centroids.numpy(), assignments.numpy() |
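
A minimal usage sketch (assuming TensorFlow 2.x is installed and the function above is importable; the exact centroid values and ordering depend on the random seed):

    import numpy as np
    import tensorflow as tf

    tf.random.set_seed(0)  # reproducible centroid initialization
    data = np.array([[0.0, 0.0], [0.2, 0.1], [9.8, 10.1], [10.0, 10.0]], dtype=np.float32)

    # Two clusters, with a tighter tolerance and a lower iteration cap than the defaults
    centroids, assignments = tf_k_means_cluster_fixed(data, 2, max_iterations=50, tolerance=1e-6)
    print(centroids)    # one 2-D centroid per cluster
    print(assignments)  # cluster index (0 or 1) for each of the 4 input vectors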