• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Tests for KMeans."""
16
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21import math
22import time
23
24import numpy as np
25from sklearn.cluster import KMeans as SklearnKMeans
26
27# pylint: disable=g-import-not-at-top
28from tensorflow.contrib.learn.python import learn
29from tensorflow.contrib.learn.python.learn.estimators import kmeans as kmeans_lib
30from tensorflow.contrib.learn.python.learn.estimators import run_config
31from tensorflow.python.framework import constant_op
32from tensorflow.python.framework import dtypes
33from tensorflow.python.framework import ops
34from tensorflow.python.ops import array_ops
35from tensorflow.python.ops import control_flow_ops
36from tensorflow.python.ops import data_flow_ops
37from tensorflow.python.ops import math_ops
38from tensorflow.python.ops import random_ops
39from tensorflow.python.platform import benchmark
40from tensorflow.python.platform import flags
41from tensorflow.python.platform import test
42from tensorflow.python.training import input as input_lib
43from tensorflow.python.training import queue_runner
44
45FLAGS = flags.FLAGS
46
47
48def normalize(x):
49  return x / np.sqrt(np.sum(x * x, axis=-1, keepdims=True))
50
51
52def cosine_similarity(x, y):
53  return np.dot(normalize(x), np.transpose(normalize(y)))
54
55
56def make_random_centers(num_centers, num_dims, center_norm=500):
57  return np.round(
58      np.random.rand(num_centers, num_dims).astype(np.float32) * center_norm)
59
60
61def make_random_points(centers, num_points, max_offset=20):
62  num_centers, num_dims = centers.shape
63  assignments = np.random.choice(num_centers, num_points)
64  offsets = np.round(
65      np.random.randn(num_points, num_dims).astype(np.float32) * max_offset)
66  return (centers[assignments] + offsets, assignments, np.add.reduce(
67      offsets * offsets, 1))
68
69
70class KMeansTestBase(test.TestCase):
71
72  def input_fn(self,
73               batch_size=None,
74               points=None,
75               randomize=None,
76               num_epochs=None):
77    """Returns an input_fn that randomly selects batches from given points."""
78    batch_size = batch_size or self.batch_size
79    points = points if points is not None else self.points
80    num_points = points.shape[0]
81    if randomize is None:
82      randomize = (self.use_mini_batch and
83                   self.mini_batch_steps_per_iteration <= 1)
84
85    def _fn():
86      x = constant_op.constant(points)
87      if batch_size == num_points:
88        return input_lib.limit_epochs(x, num_epochs=num_epochs), None
89      if randomize:
90        indices = random_ops.random_uniform(
91            constant_op.constant([batch_size]),
92            minval=0,
93            maxval=num_points - 1,
94            dtype=dtypes.int32,
95            seed=10)
96      else:
97        # We need to cycle through the indices sequentially. We create a queue
98        # to maintain the list of indices.
99        q = data_flow_ops.FIFOQueue(num_points, dtypes.int32, ())
100
101        # Conditionally initialize the Queue.
102        def _init_q():
103          with ops.control_dependencies(
104              [q.enqueue_many(math_ops.range(num_points))]):
105            return control_flow_ops.no_op()
106
107        init_q = control_flow_ops.cond(q.size() <= 0, _init_q,
108                                       control_flow_ops.no_op)
109        with ops.control_dependencies([init_q]):
110          offsets = q.dequeue_many(batch_size)
111          with ops.control_dependencies([q.enqueue_many(offsets)]):
112            indices = array_ops.identity(offsets)
113      batch = array_ops.gather(x, indices)
114      return (input_lib.limit_epochs(batch, num_epochs=num_epochs), None)
115
116    return _fn
117
118  @staticmethod
119  def config(tf_random_seed):
120    return run_config.RunConfig(tf_random_seed=tf_random_seed)
121
122  @property
123  def initial_clusters(self):
124    return kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT
125
126  @property
127  def batch_size(self):
128    return self.num_points
129
130  @property
131  def use_mini_batch(self):
132    return False
133
134  @property
135  def mini_batch_steps_per_iteration(self):
136    return 1
137
138
139class KMeansTest(KMeansTestBase):
140
141  def setUp(self):
142    np.random.seed(3)
143    self.num_centers = 5
144    self.num_dims = 2
145    self.num_points = 1000
146    self.true_centers = make_random_centers(self.num_centers, self.num_dims)
147    self.points, _, self.scores = make_random_points(self.true_centers,
148                                                     self.num_points)
149    self.true_score = np.add.reduce(self.scores)
150
151  def _kmeans(self, relative_tolerance=None):
152    return kmeans_lib.KMeansClustering(
153        self.num_centers,
154        initial_clusters=self.initial_clusters,
155        distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
156        use_mini_batch=self.use_mini_batch,
157        mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
158        random_seed=24,
159        relative_tolerance=relative_tolerance)
160
161  def test_clusters(self):
162    kmeans = self._kmeans()
163    kmeans.fit(input_fn=self.input_fn(), steps=1)
164    clusters = kmeans.clusters()
165    self.assertAllEqual(list(clusters.shape), [self.num_centers, self.num_dims])
166
167  def test_fit(self):
168    kmeans = self._kmeans()
169    kmeans.fit(input_fn=self.input_fn(), steps=1)
170    score1 = kmeans.score(
171        input_fn=self.input_fn(batch_size=self.num_points), steps=1)
172    steps = 10 * self.num_points // self.batch_size
173    kmeans.fit(input_fn=self.input_fn(), steps=steps)
174    score2 = kmeans.score(
175        input_fn=self.input_fn(batch_size=self.num_points), steps=1)
176    self.assertTrue(score1 > score2)
177    self.assertNear(self.true_score, score2, self.true_score * 0.05)
178
179  def test_monitor(self):
180    if self.use_mini_batch:
181      # We don't test for use_mini_batch case since the loss value can be noisy.
182      return
183    kmeans = kmeans_lib.KMeansClustering(
184        self.num_centers,
185        initial_clusters=self.initial_clusters,
186        distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
187        use_mini_batch=self.use_mini_batch,
188        mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
189        config=learn.RunConfig(tf_random_seed=14),
190        random_seed=12,
191        relative_tolerance=1e-4)
192
193    kmeans.fit(
194        input_fn=self.input_fn(),
195        # Force it to train until the relative tolerance monitor stops it.
196        steps=None)
197    score = kmeans.score(
198        input_fn=self.input_fn(batch_size=self.num_points), steps=1)
199    self.assertNear(self.true_score, score, self.true_score * 0.01)
200
201  def _infer_helper(self, kmeans, clusters, num_points):
202    points, true_assignments, true_offsets = make_random_points(
203        clusters, num_points)
204    # Test predict
205    assignments = list(
206        kmeans.predict_cluster_idx(input_fn=self.input_fn(
207            batch_size=num_points, points=points, num_epochs=1)))
208    self.assertAllEqual(assignments, true_assignments)
209
210    # Test score
211    score = kmeans.score(
212        input_fn=lambda: (constant_op.constant(points), None), steps=1)
213    self.assertNear(score, np.sum(true_offsets), 0.01 * score)
214
215    # Test transform
216    transform = kmeans.transform(
217        input_fn=lambda: (constant_op.constant(points), None))
218    true_transform = np.maximum(
219        0,
220        np.sum(np.square(points), axis=1,
221               keepdims=True) - 2 * np.dot(points, np.transpose(clusters)) +
222        np.transpose(np.sum(np.square(clusters), axis=1, keepdims=True)))
223    self.assertAllClose(transform, true_transform, rtol=0.05, atol=10)
224
225  def test_infer(self):
226    kmeans = self._kmeans()
227    # Make a call to fit to initialize the cluster centers.
228    max_steps = 1
229    kmeans.fit(input_fn=self.input_fn(), max_steps=max_steps)
230    clusters = kmeans.clusters()
231
232    # Run inference on small datasets.
233    self._infer_helper(kmeans, clusters, num_points=10)
234    self._infer_helper(kmeans, clusters, num_points=1)
235
236
237class KMeansTestMultiStageInit(KMeansTestBase):
238
239  def test_random(self):
240    points = np.array(
241        [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]], dtype=np.float32)
242    kmeans = kmeans_lib.KMeansClustering(
243        num_clusters=points.shape[0],
244        initial_clusters=kmeans_lib.KMeansClustering.RANDOM_INIT,
245        distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
246        use_mini_batch=True,
247        mini_batch_steps_per_iteration=100,
248        random_seed=24,
249        relative_tolerance=None)
250    kmeans.fit(
251        input_fn=self.input_fn(batch_size=1, points=points, randomize=False),
252        steps=1)
253    clusters = kmeans.clusters()
254    self.assertAllEqual(points, clusters)
255
256  def test_kmeans_plus_plus_batch_just_right(self):
257    points = np.array([[1, 2]], dtype=np.float32)
258    kmeans = kmeans_lib.KMeansClustering(
259        num_clusters=points.shape[0],
260        initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT,
261        distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
262        use_mini_batch=True,
263        mini_batch_steps_per_iteration=100,
264        random_seed=24,
265        relative_tolerance=None)
266    kmeans.fit(
267        input_fn=self.input_fn(batch_size=1, points=points, randomize=False),
268        steps=1)
269    clusters = kmeans.clusters()
270    self.assertAllEqual(points, clusters)
271
272  def test_kmeans_plus_plus_batch_too_small(self):
273    points = np.array(
274        [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]], dtype=np.float32)
275    kmeans = kmeans_lib.KMeansClustering(
276        num_clusters=points.shape[0],
277        initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT,
278        distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
279        use_mini_batch=True,
280        mini_batch_steps_per_iteration=100,
281        random_seed=24,
282        relative_tolerance=None)
283    with self.assertRaisesOpError(AssertionError):
284      kmeans.fit(
285          input_fn=self.input_fn(batch_size=4, points=points, randomize=False),
286          steps=1)
287
288
289class MiniBatchKMeansTest(KMeansTest):
290
291  @property
292  def batch_size(self):
293    return 50
294
295  @property
296  def use_mini_batch(self):
297    return True
298
299
300class FullBatchAsyncKMeansTest(KMeansTest):
301
302  @property
303  def batch_size(self):
304    return 50
305
306  @property
307  def use_mini_batch(self):
308    return True
309
310  @property
311  def mini_batch_steps_per_iteration(self):
312    return self.num_points // self.batch_size
313
314
315class KMeansCosineDistanceTest(KMeansTestBase):
316
317  def setUp(self):
318    self.points = np.array(
319        [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2], [0.1, 2.5], [0.2, 2],
320         [0.1, 3], [0.2, 4]],
321        dtype=np.float32)
322    self.num_points = self.points.shape[0]
323    self.true_centers = np.array(
324        [
325            normalize(
326                np.mean(normalize(self.points)[0:4, :], axis=0, keepdims=True))[
327                    0],
328            normalize(
329                np.mean(normalize(self.points)[4:, :], axis=0, keepdims=True))[
330                    0]
331        ],
332        dtype=np.float32)
333    self.true_assignments = np.array([0] * 4 + [1] * 4)
334    self.true_score = len(self.points) - np.tensordot(
335        normalize(self.points), self.true_centers[self.true_assignments])
336
337    self.num_centers = 2
338    self.kmeans = kmeans_lib.KMeansClustering(
339        self.num_centers,
340        initial_clusters=kmeans_lib.KMeansClustering.RANDOM_INIT,
341        distance_metric=kmeans_lib.KMeansClustering.COSINE_DISTANCE,
342        use_mini_batch=self.use_mini_batch,
343        mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
344        config=self.config(3))
345
346  def test_fit(self):
347    max_steps = 10 * self.num_points // self.batch_size
348    self.kmeans.fit(input_fn=self.input_fn(), max_steps=max_steps)
349    centers = normalize(self.kmeans.clusters())
350    centers = centers[centers[:, 0].argsort()]
351    true_centers = self.true_centers[self.true_centers[:, 0].argsort()]
352    self.assertAllClose(centers, true_centers, atol=0.04)
353
354  def test_transform(self):
355    self.kmeans.fit(input_fn=self.input_fn(), steps=10)
356    centers = normalize(self.kmeans.clusters())
357    true_transform = 1 - cosine_similarity(self.points, centers)
358    transform = self.kmeans.transform(input_fn=self.input_fn(
359        batch_size=self.num_points))
360    self.assertAllClose(transform, true_transform, atol=1e-3)
361
362  def test_predict(self):
363    max_steps = 10 * self.num_points // self.batch_size
364    self.kmeans.fit(input_fn=self.input_fn(), max_steps=max_steps)
365    centers = normalize(self.kmeans.clusters())
366
367    assignments = list(
368        self.kmeans.predict_cluster_idx(input_fn=self.input_fn(
369            num_epochs=1, batch_size=self.num_points)))
370    self.assertAllClose(
371        centers[assignments],
372        self.true_centers[self.true_assignments],
373        atol=1e-2)
374
375    centers = centers[centers[:, 0].argsort()]
376    true_centers = self.true_centers[self.true_centers[:, 0].argsort()]
377    self.assertAllClose(centers, true_centers, atol=0.04)
378    score = self.kmeans.score(
379        input_fn=self.input_fn(batch_size=self.num_points), steps=1)
380    self.assertAllClose(score, self.true_score, atol=1e-2)
381
382  def test_predict_kmeans_plus_plus(self):
383    # Most points are concetrated near one center. KMeans++ is likely to find
384    # the less populated centers.
385    points = np.array(
386        [[2.5, 3.5], [2.5, 3.5], [-2, 3], [-2, 3], [-3, -3], [-3.1, -3.2],
387         [-2.8, -3.], [-2.9, -3.1], [-3., -3.1], [-3., -3.1], [-3.2, -3.],
388         [-3., -3.]],
389        dtype=np.float32)
390    true_centers = np.array(
391        [
392            normalize(
393                np.mean(normalize(points)[0:2, :], axis=0, keepdims=True))[0],
394            normalize(
395                np.mean(normalize(points)[2:4, :], axis=0, keepdims=True))[0],
396            normalize(
397                np.mean(normalize(points)[4:, :], axis=0, keepdims=True))[0]
398        ],
399        dtype=np.float32)
400    true_assignments = [0] * 2 + [1] * 2 + [2] * 8
401    true_score = len(points) - np.tensordot(
402        normalize(points), true_centers[true_assignments])
403
404    kmeans = kmeans_lib.KMeansClustering(
405        3,
406        initial_clusters=self.initial_clusters,
407        distance_metric=kmeans_lib.KMeansClustering.COSINE_DISTANCE,
408        use_mini_batch=self.use_mini_batch,
409        mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
410        config=self.config(3))
411    kmeans.fit(input_fn=lambda: (constant_op.constant(points), None), steps=30)
412
413    centers = normalize(kmeans.clusters())
414    self.assertAllClose(
415        sorted(centers.tolist()), sorted(true_centers.tolist()), atol=1e-2)
416
417    def _input_fn():
418      return (input_lib.limit_epochs(
419          constant_op.constant(points), num_epochs=1), None)
420
421    assignments = list(kmeans.predict_cluster_idx(input_fn=_input_fn))
422    self.assertAllClose(
423        centers[assignments], true_centers[true_assignments], atol=1e-2)
424
425    score = kmeans.score(
426        input_fn=lambda: (constant_op.constant(points), None), steps=1)
427    self.assertAllClose(score, true_score, atol=1e-2)
428
429
430class MiniBatchKMeansCosineTest(KMeansCosineDistanceTest):
431
432  @property
433  def batch_size(self):
434    return 2
435
436  @property
437  def use_mini_batch(self):
438    return True
439
440
441class FullBatchAsyncKMeansCosineTest(KMeansCosineDistanceTest):
442
443  @property
444  def batch_size(self):
445    return 2
446
447  @property
448  def use_mini_batch(self):
449    return True
450
451  @property
452  def mini_batch_steps_per_iteration(self):
453    return self.num_points // self.batch_size
454
455
456class KMeansBenchmark(benchmark.Benchmark):
457  """Base class for benchmarks."""
458
459  def SetUp(self,
460            dimension=50,
461            num_clusters=50,
462            points_per_cluster=10000,
463            center_norm=500,
464            cluster_width=20):
465    np.random.seed(123456)
466    self.num_clusters = num_clusters
467    self.num_points = num_clusters * points_per_cluster
468    self.centers = make_random_centers(
469        self.num_clusters, dimension, center_norm=center_norm)
470    self.points, _, scores = make_random_points(
471        self.centers, self.num_points, max_offset=cluster_width)
472    self.score = float(np.sum(scores))
473
474  def _report(self, num_iters, start, end, scores):
475    print(scores)
476    self.report_benchmark(
477        iters=num_iters,
478        wall_time=(end - start) / num_iters,
479        extras={'true_sum_squared_distances': self.score,
480                'fit_scores': scores})
481
482  def _fit(self, num_iters=10):
483    pass
484
485  def benchmark_01_2dim_5center_500point(self):
486    self.SetUp(dimension=2, num_clusters=5, points_per_cluster=100)
487    self._fit()
488
489  def benchmark_02_20dim_20center_10kpoint(self):
490    self.SetUp(dimension=20, num_clusters=20, points_per_cluster=500)
491    self._fit()
492
493  def benchmark_03_100dim_50center_50kpoint(self):
494    self.SetUp(dimension=100, num_clusters=50, points_per_cluster=1000)
495    self._fit()
496
497  def benchmark_03_100dim_50center_50kpoint_unseparated(self):
498    self.SetUp(
499        dimension=100,
500        num_clusters=50,
501        points_per_cluster=1000,
502        cluster_width=250)
503    self._fit()
504
505  def benchmark_04_100dim_500center_500kpoint(self):
506    self.SetUp(dimension=100, num_clusters=500, points_per_cluster=1000)
507    self._fit(num_iters=4)
508
509  def benchmark_05_100dim_500center_500kpoint_unseparated(self):
510    self.SetUp(
511        dimension=100,
512        num_clusters=500,
513        points_per_cluster=1000,
514        cluster_width=250)
515    self._fit(num_iters=4)
516
517
518class TensorflowKMeansBenchmark(KMeansBenchmark):
519
520  def _fit(self, num_iters=10):
521    scores = []
522    start = time.time()
523    for i in range(num_iters):
524      print('Starting tensorflow KMeans: %d' % i)
525      tf_kmeans = kmeans_lib.KMeansClustering(
526          self.num_clusters,
527          initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT,
528          kmeans_plus_plus_num_retries=int(math.log(self.num_clusters) + 2),
529          random_seed=i * 42,
530          relative_tolerance=1e-6,
531          config=run_config.RunConfig(tf_random_seed=3))
532      tf_kmeans.fit(
533          input_fn=lambda: (constant_op.constant(self.points), None), steps=50)
534      _ = tf_kmeans.clusters()
535      scores.append(
536          tf_kmeans.score(
537              input_fn=lambda: (constant_op.constant(self.points), None),
538              steps=1))
539    self._report(num_iters, start, time.time(), scores)
540
541
542class SklearnKMeansBenchmark(KMeansBenchmark):
543
544  def _fit(self, num_iters=10):
545    scores = []
546    start = time.time()
547    for i in range(num_iters):
548      print('Starting sklearn KMeans: %d' % i)
549      sklearn_kmeans = SklearnKMeans(
550          n_clusters=self.num_clusters,
551          init='k-means++',
552          max_iter=50,
553          n_init=1,
554          tol=1e-4,
555          random_state=i * 42)
556      sklearn_kmeans.fit(self.points)
557      scores.append(sklearn_kmeans.inertia_)
558    self._report(num_iters, start, time.time(), scores)
559
560
561class KMeansTestQueues(test.TestCase):
562
563  def input_fn(self):
564
565    def _fn():
566      queue = data_flow_ops.FIFOQueue(
567          capacity=10, dtypes=dtypes.float32, shapes=[10, 3])
568      enqueue_op = queue.enqueue(array_ops.zeros([10, 3], dtype=dtypes.float32))
569      queue_runner.add_queue_runner(
570          queue_runner.QueueRunner(queue, [enqueue_op]))
571      return queue.dequeue(), None
572
573    return _fn
574
575  # This test makes sure that there are no deadlocks when using a QueueRunner.
576  # Note that since cluster initialization is dependendent on inputs, if input
577  # is generated using a QueueRunner, one has to make sure that these runners
578  # are started before the initialization.
579  def test_queues(self):
580    kmeans = kmeans_lib.KMeansClustering(5)
581    kmeans.fit(input_fn=self.input_fn(), steps=1)
582
583
584if __name__ == '__main__':
585  test.main()
586