1# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Tests for KMeans.""" 16 17from __future__ import absolute_import 18from __future__ import division 19from __future__ import print_function 20 21import math 22import time 23 24import numpy as np 25from sklearn.cluster import KMeans as SklearnKMeans 26 27# pylint: disable=g-import-not-at-top 28from tensorflow.contrib.learn.python import learn 29from tensorflow.contrib.learn.python.learn.estimators import kmeans as kmeans_lib 30from tensorflow.contrib.learn.python.learn.estimators import run_config 31from tensorflow.python.framework import constant_op 32from tensorflow.python.framework import dtypes 33from tensorflow.python.framework import ops 34from tensorflow.python.ops import array_ops 35from tensorflow.python.ops import control_flow_ops 36from tensorflow.python.ops import data_flow_ops 37from tensorflow.python.ops import math_ops 38from tensorflow.python.ops import random_ops 39from tensorflow.python.platform import benchmark 40from tensorflow.python.platform import flags 41from tensorflow.python.platform import test 42from tensorflow.python.training import input as input_lib 43from tensorflow.python.training import queue_runner 44 45FLAGS = flags.FLAGS 46 47 48def normalize(x): 49 return x / np.sqrt(np.sum(x * x, axis=-1, keepdims=True)) 50 51 52def cosine_similarity(x, y): 53 return np.dot(normalize(x), np.transpose(normalize(y))) 54 55 56def make_random_centers(num_centers, num_dims, center_norm=500): 57 return np.round( 58 np.random.rand(num_centers, num_dims).astype(np.float32) * center_norm) 59 60 61def make_random_points(centers, num_points, max_offset=20): 62 num_centers, num_dims = centers.shape 63 assignments = np.random.choice(num_centers, num_points) 64 offsets = np.round( 65 np.random.randn(num_points, num_dims).astype(np.float32) * max_offset) 66 return (centers[assignments] + offsets, assignments, np.add.reduce( 67 offsets * offsets, 1)) 68 69 70class KMeansTestBase(test.TestCase): 71 72 def input_fn(self, 73 batch_size=None, 74 points=None, 75 randomize=None, 76 num_epochs=None): 77 """Returns an input_fn that randomly selects batches from given points.""" 78 batch_size = batch_size or self.batch_size 79 points = points if points is not None else self.points 80 num_points = points.shape[0] 81 if randomize is None: 82 randomize = (self.use_mini_batch and 83 self.mini_batch_steps_per_iteration <= 1) 84 85 def _fn(): 86 x = constant_op.constant(points) 87 if batch_size == num_points: 88 return input_lib.limit_epochs(x, num_epochs=num_epochs), None 89 if randomize: 90 indices = random_ops.random_uniform( 91 constant_op.constant([batch_size]), 92 minval=0, 93 maxval=num_points - 1, 94 dtype=dtypes.int32, 95 seed=10) 96 else: 97 # We need to cycle through the indices sequentially. We create a queue 98 # to maintain the list of indices. 99 q = data_flow_ops.FIFOQueue(num_points, dtypes.int32, ()) 100 101 # Conditionally initialize the Queue. 102 def _init_q(): 103 with ops.control_dependencies( 104 [q.enqueue_many(math_ops.range(num_points))]): 105 return control_flow_ops.no_op() 106 107 init_q = control_flow_ops.cond(q.size() <= 0, _init_q, 108 control_flow_ops.no_op) 109 with ops.control_dependencies([init_q]): 110 offsets = q.dequeue_many(batch_size) 111 with ops.control_dependencies([q.enqueue_many(offsets)]): 112 indices = array_ops.identity(offsets) 113 batch = array_ops.gather(x, indices) 114 return (input_lib.limit_epochs(batch, num_epochs=num_epochs), None) 115 116 return _fn 117 118 @staticmethod 119 def config(tf_random_seed): 120 return run_config.RunConfig(tf_random_seed=tf_random_seed) 121 122 @property 123 def initial_clusters(self): 124 return kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT 125 126 @property 127 def batch_size(self): 128 return self.num_points 129 130 @property 131 def use_mini_batch(self): 132 return False 133 134 @property 135 def mini_batch_steps_per_iteration(self): 136 return 1 137 138 139class KMeansTest(KMeansTestBase): 140 141 def setUp(self): 142 np.random.seed(3) 143 self.num_centers = 5 144 self.num_dims = 2 145 self.num_points = 1000 146 self.true_centers = make_random_centers(self.num_centers, self.num_dims) 147 self.points, _, self.scores = make_random_points(self.true_centers, 148 self.num_points) 149 self.true_score = np.add.reduce(self.scores) 150 151 def _kmeans(self, relative_tolerance=None): 152 return kmeans_lib.KMeansClustering( 153 self.num_centers, 154 initial_clusters=self.initial_clusters, 155 distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE, 156 use_mini_batch=self.use_mini_batch, 157 mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration, 158 random_seed=24, 159 relative_tolerance=relative_tolerance) 160 161 def test_clusters(self): 162 kmeans = self._kmeans() 163 kmeans.fit(input_fn=self.input_fn(), steps=1) 164 clusters = kmeans.clusters() 165 self.assertAllEqual(list(clusters.shape), [self.num_centers, self.num_dims]) 166 167 def test_fit(self): 168 kmeans = self._kmeans() 169 kmeans.fit(input_fn=self.input_fn(), steps=1) 170 score1 = kmeans.score( 171 input_fn=self.input_fn(batch_size=self.num_points), steps=1) 172 steps = 10 * self.num_points // self.batch_size 173 kmeans.fit(input_fn=self.input_fn(), steps=steps) 174 score2 = kmeans.score( 175 input_fn=self.input_fn(batch_size=self.num_points), steps=1) 176 self.assertTrue(score1 > score2) 177 self.assertNear(self.true_score, score2, self.true_score * 0.05) 178 179 def test_monitor(self): 180 if self.use_mini_batch: 181 # We don't test for use_mini_batch case since the loss value can be noisy. 182 return 183 kmeans = kmeans_lib.KMeansClustering( 184 self.num_centers, 185 initial_clusters=self.initial_clusters, 186 distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE, 187 use_mini_batch=self.use_mini_batch, 188 mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration, 189 config=learn.RunConfig(tf_random_seed=14), 190 random_seed=12, 191 relative_tolerance=1e-4) 192 193 kmeans.fit( 194 input_fn=self.input_fn(), 195 # Force it to train until the relative tolerance monitor stops it. 196 steps=None) 197 score = kmeans.score( 198 input_fn=self.input_fn(batch_size=self.num_points), steps=1) 199 self.assertNear(self.true_score, score, self.true_score * 0.01) 200 201 def _infer_helper(self, kmeans, clusters, num_points): 202 points, true_assignments, true_offsets = make_random_points( 203 clusters, num_points) 204 # Test predict 205 assignments = list( 206 kmeans.predict_cluster_idx(input_fn=self.input_fn( 207 batch_size=num_points, points=points, num_epochs=1))) 208 self.assertAllEqual(assignments, true_assignments) 209 210 # Test score 211 score = kmeans.score( 212 input_fn=lambda: (constant_op.constant(points), None), steps=1) 213 self.assertNear(score, np.sum(true_offsets), 0.01 * score) 214 215 # Test transform 216 transform = kmeans.transform( 217 input_fn=lambda: (constant_op.constant(points), None)) 218 true_transform = np.maximum( 219 0, 220 np.sum(np.square(points), axis=1, 221 keepdims=True) - 2 * np.dot(points, np.transpose(clusters)) + 222 np.transpose(np.sum(np.square(clusters), axis=1, keepdims=True))) 223 self.assertAllClose(transform, true_transform, rtol=0.05, atol=10) 224 225 def test_infer(self): 226 kmeans = self._kmeans() 227 # Make a call to fit to initialize the cluster centers. 228 max_steps = 1 229 kmeans.fit(input_fn=self.input_fn(), max_steps=max_steps) 230 clusters = kmeans.clusters() 231 232 # Run inference on small datasets. 233 self._infer_helper(kmeans, clusters, num_points=10) 234 self._infer_helper(kmeans, clusters, num_points=1) 235 236 237class KMeansTestMultiStageInit(KMeansTestBase): 238 239 def test_random(self): 240 points = np.array( 241 [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]], dtype=np.float32) 242 kmeans = kmeans_lib.KMeansClustering( 243 num_clusters=points.shape[0], 244 initial_clusters=kmeans_lib.KMeansClustering.RANDOM_INIT, 245 distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE, 246 use_mini_batch=True, 247 mini_batch_steps_per_iteration=100, 248 random_seed=24, 249 relative_tolerance=None) 250 kmeans.fit( 251 input_fn=self.input_fn(batch_size=1, points=points, randomize=False), 252 steps=1) 253 clusters = kmeans.clusters() 254 self.assertAllEqual(points, clusters) 255 256 def test_kmeans_plus_plus_batch_just_right(self): 257 points = np.array([[1, 2]], dtype=np.float32) 258 kmeans = kmeans_lib.KMeansClustering( 259 num_clusters=points.shape[0], 260 initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT, 261 distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE, 262 use_mini_batch=True, 263 mini_batch_steps_per_iteration=100, 264 random_seed=24, 265 relative_tolerance=None) 266 kmeans.fit( 267 input_fn=self.input_fn(batch_size=1, points=points, randomize=False), 268 steps=1) 269 clusters = kmeans.clusters() 270 self.assertAllEqual(points, clusters) 271 272 def test_kmeans_plus_plus_batch_too_small(self): 273 points = np.array( 274 [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]], dtype=np.float32) 275 kmeans = kmeans_lib.KMeansClustering( 276 num_clusters=points.shape[0], 277 initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT, 278 distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE, 279 use_mini_batch=True, 280 mini_batch_steps_per_iteration=100, 281 random_seed=24, 282 relative_tolerance=None) 283 with self.assertRaisesOpError(AssertionError): 284 kmeans.fit( 285 input_fn=self.input_fn(batch_size=4, points=points, randomize=False), 286 steps=1) 287 288 289class MiniBatchKMeansTest(KMeansTest): 290 291 @property 292 def batch_size(self): 293 return 50 294 295 @property 296 def use_mini_batch(self): 297 return True 298 299 300class FullBatchAsyncKMeansTest(KMeansTest): 301 302 @property 303 def batch_size(self): 304 return 50 305 306 @property 307 def use_mini_batch(self): 308 return True 309 310 @property 311 def mini_batch_steps_per_iteration(self): 312 return self.num_points // self.batch_size 313 314 315class KMeansCosineDistanceTest(KMeansTestBase): 316 317 def setUp(self): 318 self.points = np.array( 319 [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2], [0.1, 2.5], [0.2, 2], 320 [0.1, 3], [0.2, 4]], 321 dtype=np.float32) 322 self.num_points = self.points.shape[0] 323 self.true_centers = np.array( 324 [ 325 normalize( 326 np.mean(normalize(self.points)[0:4, :], axis=0, keepdims=True))[ 327 0], 328 normalize( 329 np.mean(normalize(self.points)[4:, :], axis=0, keepdims=True))[ 330 0] 331 ], 332 dtype=np.float32) 333 self.true_assignments = np.array([0] * 4 + [1] * 4) 334 self.true_score = len(self.points) - np.tensordot( 335 normalize(self.points), self.true_centers[self.true_assignments]) 336 337 self.num_centers = 2 338 self.kmeans = kmeans_lib.KMeansClustering( 339 self.num_centers, 340 initial_clusters=kmeans_lib.KMeansClustering.RANDOM_INIT, 341 distance_metric=kmeans_lib.KMeansClustering.COSINE_DISTANCE, 342 use_mini_batch=self.use_mini_batch, 343 mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration, 344 config=self.config(3)) 345 346 def test_fit(self): 347 max_steps = 10 * self.num_points // self.batch_size 348 self.kmeans.fit(input_fn=self.input_fn(), max_steps=max_steps) 349 centers = normalize(self.kmeans.clusters()) 350 centers = centers[centers[:, 0].argsort()] 351 true_centers = self.true_centers[self.true_centers[:, 0].argsort()] 352 self.assertAllClose(centers, true_centers, atol=0.04) 353 354 def test_transform(self): 355 self.kmeans.fit(input_fn=self.input_fn(), steps=10) 356 centers = normalize(self.kmeans.clusters()) 357 true_transform = 1 - cosine_similarity(self.points, centers) 358 transform = self.kmeans.transform(input_fn=self.input_fn( 359 batch_size=self.num_points)) 360 self.assertAllClose(transform, true_transform, atol=1e-3) 361 362 def test_predict(self): 363 max_steps = 10 * self.num_points // self.batch_size 364 self.kmeans.fit(input_fn=self.input_fn(), max_steps=max_steps) 365 centers = normalize(self.kmeans.clusters()) 366 367 assignments = list( 368 self.kmeans.predict_cluster_idx(input_fn=self.input_fn( 369 num_epochs=1, batch_size=self.num_points))) 370 self.assertAllClose( 371 centers[assignments], 372 self.true_centers[self.true_assignments], 373 atol=1e-2) 374 375 centers = centers[centers[:, 0].argsort()] 376 true_centers = self.true_centers[self.true_centers[:, 0].argsort()] 377 self.assertAllClose(centers, true_centers, atol=0.04) 378 score = self.kmeans.score( 379 input_fn=self.input_fn(batch_size=self.num_points), steps=1) 380 self.assertAllClose(score, self.true_score, atol=1e-2) 381 382 def test_predict_kmeans_plus_plus(self): 383 # Most points are concetrated near one center. KMeans++ is likely to find 384 # the less populated centers. 385 points = np.array( 386 [[2.5, 3.5], [2.5, 3.5], [-2, 3], [-2, 3], [-3, -3], [-3.1, -3.2], 387 [-2.8, -3.], [-2.9, -3.1], [-3., -3.1], [-3., -3.1], [-3.2, -3.], 388 [-3., -3.]], 389 dtype=np.float32) 390 true_centers = np.array( 391 [ 392 normalize( 393 np.mean(normalize(points)[0:2, :], axis=0, keepdims=True))[0], 394 normalize( 395 np.mean(normalize(points)[2:4, :], axis=0, keepdims=True))[0], 396 normalize( 397 np.mean(normalize(points)[4:, :], axis=0, keepdims=True))[0] 398 ], 399 dtype=np.float32) 400 true_assignments = [0] * 2 + [1] * 2 + [2] * 8 401 true_score = len(points) - np.tensordot( 402 normalize(points), true_centers[true_assignments]) 403 404 kmeans = kmeans_lib.KMeansClustering( 405 3, 406 initial_clusters=self.initial_clusters, 407 distance_metric=kmeans_lib.KMeansClustering.COSINE_DISTANCE, 408 use_mini_batch=self.use_mini_batch, 409 mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration, 410 config=self.config(3)) 411 kmeans.fit(input_fn=lambda: (constant_op.constant(points), None), steps=30) 412 413 centers = normalize(kmeans.clusters()) 414 self.assertAllClose( 415 sorted(centers.tolist()), sorted(true_centers.tolist()), atol=1e-2) 416 417 def _input_fn(): 418 return (input_lib.limit_epochs( 419 constant_op.constant(points), num_epochs=1), None) 420 421 assignments = list(kmeans.predict_cluster_idx(input_fn=_input_fn)) 422 self.assertAllClose( 423 centers[assignments], true_centers[true_assignments], atol=1e-2) 424 425 score = kmeans.score( 426 input_fn=lambda: (constant_op.constant(points), None), steps=1) 427 self.assertAllClose(score, true_score, atol=1e-2) 428 429 430class MiniBatchKMeansCosineTest(KMeansCosineDistanceTest): 431 432 @property 433 def batch_size(self): 434 return 2 435 436 @property 437 def use_mini_batch(self): 438 return True 439 440 441class FullBatchAsyncKMeansCosineTest(KMeansCosineDistanceTest): 442 443 @property 444 def batch_size(self): 445 return 2 446 447 @property 448 def use_mini_batch(self): 449 return True 450 451 @property 452 def mini_batch_steps_per_iteration(self): 453 return self.num_points // self.batch_size 454 455 456class KMeansBenchmark(benchmark.Benchmark): 457 """Base class for benchmarks.""" 458 459 def SetUp(self, 460 dimension=50, 461 num_clusters=50, 462 points_per_cluster=10000, 463 center_norm=500, 464 cluster_width=20): 465 np.random.seed(123456) 466 self.num_clusters = num_clusters 467 self.num_points = num_clusters * points_per_cluster 468 self.centers = make_random_centers( 469 self.num_clusters, dimension, center_norm=center_norm) 470 self.points, _, scores = make_random_points( 471 self.centers, self.num_points, max_offset=cluster_width) 472 self.score = float(np.sum(scores)) 473 474 def _report(self, num_iters, start, end, scores): 475 print(scores) 476 self.report_benchmark( 477 iters=num_iters, 478 wall_time=(end - start) / num_iters, 479 extras={'true_sum_squared_distances': self.score, 480 'fit_scores': scores}) 481 482 def _fit(self, num_iters=10): 483 pass 484 485 def benchmark_01_2dim_5center_500point(self): 486 self.SetUp(dimension=2, num_clusters=5, points_per_cluster=100) 487 self._fit() 488 489 def benchmark_02_20dim_20center_10kpoint(self): 490 self.SetUp(dimension=20, num_clusters=20, points_per_cluster=500) 491 self._fit() 492 493 def benchmark_03_100dim_50center_50kpoint(self): 494 self.SetUp(dimension=100, num_clusters=50, points_per_cluster=1000) 495 self._fit() 496 497 def benchmark_03_100dim_50center_50kpoint_unseparated(self): 498 self.SetUp( 499 dimension=100, 500 num_clusters=50, 501 points_per_cluster=1000, 502 cluster_width=250) 503 self._fit() 504 505 def benchmark_04_100dim_500center_500kpoint(self): 506 self.SetUp(dimension=100, num_clusters=500, points_per_cluster=1000) 507 self._fit(num_iters=4) 508 509 def benchmark_05_100dim_500center_500kpoint_unseparated(self): 510 self.SetUp( 511 dimension=100, 512 num_clusters=500, 513 points_per_cluster=1000, 514 cluster_width=250) 515 self._fit(num_iters=4) 516 517 518class TensorflowKMeansBenchmark(KMeansBenchmark): 519 520 def _fit(self, num_iters=10): 521 scores = [] 522 start = time.time() 523 for i in range(num_iters): 524 print('Starting tensorflow KMeans: %d' % i) 525 tf_kmeans = kmeans_lib.KMeansClustering( 526 self.num_clusters, 527 initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT, 528 kmeans_plus_plus_num_retries=int(math.log(self.num_clusters) + 2), 529 random_seed=i * 42, 530 relative_tolerance=1e-6, 531 config=run_config.RunConfig(tf_random_seed=3)) 532 tf_kmeans.fit( 533 input_fn=lambda: (constant_op.constant(self.points), None), steps=50) 534 _ = tf_kmeans.clusters() 535 scores.append( 536 tf_kmeans.score( 537 input_fn=lambda: (constant_op.constant(self.points), None), 538 steps=1)) 539 self._report(num_iters, start, time.time(), scores) 540 541 542class SklearnKMeansBenchmark(KMeansBenchmark): 543 544 def _fit(self, num_iters=10): 545 scores = [] 546 start = time.time() 547 for i in range(num_iters): 548 print('Starting sklearn KMeans: %d' % i) 549 sklearn_kmeans = SklearnKMeans( 550 n_clusters=self.num_clusters, 551 init='k-means++', 552 max_iter=50, 553 n_init=1, 554 tol=1e-4, 555 random_state=i * 42) 556 sklearn_kmeans.fit(self.points) 557 scores.append(sklearn_kmeans.inertia_) 558 self._report(num_iters, start, time.time(), scores) 559 560 561class KMeansTestQueues(test.TestCase): 562 563 def input_fn(self): 564 565 def _fn(): 566 queue = data_flow_ops.FIFOQueue( 567 capacity=10, dtypes=dtypes.float32, shapes=[10, 3]) 568 enqueue_op = queue.enqueue(array_ops.zeros([10, 3], dtype=dtypes.float32)) 569 queue_runner.add_queue_runner( 570 queue_runner.QueueRunner(queue, [enqueue_op])) 571 return queue.dequeue(), None 572 573 return _fn 574 575 # This test makes sure that there are no deadlocks when using a QueueRunner. 576 # Note that since cluster initialization is dependendent on inputs, if input 577 # is generated using a QueueRunner, one has to make sure that these runners 578 # are started before the initialization. 579 def test_queues(self): 580 kmeans = kmeans_lib.KMeansClustering(5) 581 kmeans.fit(input_fn=self.input_fn(), steps=1) 582 583 584if __name__ == '__main__': 585 test.main() 586