1# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Tests for estimators.linear.""" 16 17from __future__ import absolute_import 18from __future__ import division 19from __future__ import print_function 20 21import functools 22import json 23import tempfile 24 25import numpy as np 26 27from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib 28from tensorflow.contrib.learn.python.learn import experiment 29from tensorflow.contrib.learn.python.learn.datasets import base 30from tensorflow.contrib.learn.python.learn.estimators import _sklearn 31from tensorflow.contrib.learn.python.learn.estimators import estimator 32from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils 33from tensorflow.contrib.learn.python.learn.estimators import head as head_lib 34from tensorflow.contrib.learn.python.learn.estimators import linear 35from tensorflow.contrib.learn.python.learn.estimators import run_config 36from tensorflow.contrib.learn.python.learn.estimators import test_data 37from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec 38from tensorflow.contrib.linear_optimizer.python import sdca_optimizer as sdca_optimizer_lib 39from tensorflow.contrib.metrics.python.ops import metric_ops 40from tensorflow.python.feature_column import feature_column_lib as fc_core 41from tensorflow.python.framework import constant_op 42from tensorflow.python.framework import dtypes 43from tensorflow.python.framework import sparse_tensor 44from tensorflow.python.ops import array_ops 45from tensorflow.python.ops import math_ops 46from tensorflow.python.ops import partitioned_variables 47from tensorflow.python.platform import test 48from tensorflow.python.training import ftrl 49from tensorflow.python.training import input as input_lib 50from tensorflow.python.training import server_lib 51 52 53def _prepare_iris_data_for_logistic_regression(): 54 # Converts iris data to a logistic regression problem. 55 iris = base.load_iris() 56 ids = np.where((iris.target == 0) | (iris.target == 1)) 57 iris = base.Dataset(data=iris.data[ids], target=iris.target[ids]) 58 return iris 59 60 61class LinearClassifierTest(test.TestCase): 62 63 def testExperimentIntegration(self): 64 cont_features = [ 65 feature_column_lib.real_valued_column( 66 'feature', dimension=4) 67 ] 68 69 exp = experiment.Experiment( 70 estimator=linear.LinearClassifier( 71 n_classes=3, feature_columns=cont_features), 72 train_input_fn=test_data.iris_input_multiclass_fn, 73 eval_input_fn=test_data.iris_input_multiclass_fn) 74 exp.test() 75 76 def testEstimatorContract(self): 77 estimator_test_utils.assert_estimator_contract(self, 78 linear.LinearClassifier) 79 80 def testTrain(self): 81 """Tests that loss goes down with training.""" 82 83 def input_fn(): 84 return { 85 'age': 86 constant_op.constant([1]), 87 'language': 88 sparse_tensor.SparseTensor( 89 values=['english'], indices=[[0, 0]], dense_shape=[1, 1]) 90 }, constant_op.constant([[1]]) 91 92 language = feature_column_lib.sparse_column_with_hash_bucket('language', 93 100) 94 age = feature_column_lib.real_valued_column('age') 95 96 classifier = linear.LinearClassifier(feature_columns=[age, language]) 97 classifier.fit(input_fn=input_fn, steps=100) 98 loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] 99 classifier.fit(input_fn=input_fn, steps=200) 100 loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] 101 self.assertLess(loss2, loss1) 102 self.assertLess(loss2, 0.01) 103 104 def testJointTrain(self): 105 """Tests that loss goes down with training with joint weights.""" 106 107 def input_fn(): 108 return { 109 'age': 110 sparse_tensor.SparseTensor( 111 values=['1'], indices=[[0, 0]], dense_shape=[1, 1]), 112 'language': 113 sparse_tensor.SparseTensor( 114 values=['english'], indices=[[0, 0]], dense_shape=[1, 1]) 115 }, constant_op.constant([[1]]) 116 117 language = feature_column_lib.sparse_column_with_hash_bucket('language', 118 100) 119 age = feature_column_lib.sparse_column_with_hash_bucket('age', 2) 120 121 classifier = linear.LinearClassifier( 122 _joint_weight=True, feature_columns=[age, language]) 123 classifier.fit(input_fn=input_fn, steps=100) 124 loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] 125 classifier.fit(input_fn=input_fn, steps=200) 126 loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] 127 self.assertLess(loss2, loss1) 128 self.assertLess(loss2, 0.01) 129 130 def testMultiClass_MatrixData(self): 131 """Tests multi-class classification using matrix data as input.""" 132 feature_column = feature_column_lib.real_valued_column( 133 'feature', dimension=4) 134 135 classifier = linear.LinearClassifier( 136 n_classes=3, feature_columns=[feature_column]) 137 138 classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) 139 scores = classifier.evaluate( 140 input_fn=test_data.iris_input_multiclass_fn, steps=100) 141 self.assertGreater(scores['accuracy'], 0.9) 142 143 def testMultiClass_MatrixData_Labels1D(self): 144 """Same as the last test, but labels shape is [150] instead of [150, 1].""" 145 146 def _input_fn(): 147 iris = base.load_iris() 148 return { 149 'feature': constant_op.constant( 150 iris.data, dtype=dtypes.float32) 151 }, constant_op.constant( 152 iris.target, shape=[150], dtype=dtypes.int32) 153 154 feature_column = feature_column_lib.real_valued_column( 155 'feature', dimension=4) 156 157 classifier = linear.LinearClassifier( 158 n_classes=3, feature_columns=[feature_column]) 159 160 classifier.fit(input_fn=_input_fn, steps=100) 161 scores = classifier.evaluate(input_fn=_input_fn, steps=1) 162 self.assertGreater(scores['accuracy'], 0.9) 163 164 def testMultiClass_NpMatrixData(self): 165 """Tests multi-class classification using numpy matrix data as input.""" 166 iris = base.load_iris() 167 train_x = iris.data 168 train_y = iris.target 169 feature_column = feature_column_lib.real_valued_column('', dimension=4) 170 classifier = linear.LinearClassifier( 171 n_classes=3, feature_columns=[feature_column]) 172 173 classifier.fit(x=train_x, y=train_y, steps=100) 174 scores = classifier.evaluate(x=train_x, y=train_y, steps=1) 175 self.assertGreater(scores['accuracy'], 0.9) 176 177 def testMultiClassLabelKeys(self): 178 """Tests n_classes > 2 with label_keys vocabulary for labels.""" 179 # Byte literals needed for python3 test to pass. 180 label_keys = [b'label0', b'label1', b'label2'] 181 182 def _input_fn(num_epochs=None): 183 features = { 184 'language': 185 sparse_tensor.SparseTensor( 186 values=input_lib.limit_epochs( 187 ['en', 'fr', 'zh'], num_epochs=num_epochs), 188 indices=[[0, 0], [0, 1], [2, 0]], 189 dense_shape=[3, 2]) 190 } 191 labels = constant_op.constant( 192 [[label_keys[1]], [label_keys[0]], [label_keys[0]]], 193 dtype=dtypes.string) 194 return features, labels 195 196 language_column = feature_column_lib.sparse_column_with_hash_bucket( 197 'language', hash_bucket_size=20) 198 199 classifier = linear.LinearClassifier( 200 n_classes=3, 201 feature_columns=[language_column], 202 label_keys=label_keys) 203 204 classifier.fit(input_fn=_input_fn, steps=50) 205 206 scores = classifier.evaluate(input_fn=_input_fn, steps=1) 207 self.assertGreater(scores['accuracy'], 0.9) 208 self.assertIn('loss', scores) 209 predict_input_fn = functools.partial(_input_fn, num_epochs=1) 210 predicted_classes = list( 211 classifier.predict_classes( 212 input_fn=predict_input_fn, as_iterable=True)) 213 self.assertEqual(3, len(predicted_classes)) 214 for pred in predicted_classes: 215 self.assertIn(pred, label_keys) 216 predictions = list( 217 classifier.predict(input_fn=predict_input_fn, as_iterable=True)) 218 self.assertAllEqual(predicted_classes, predictions) 219 220 def testLogisticRegression_MatrixData(self): 221 """Tests binary classification using matrix data as input.""" 222 223 def _input_fn(): 224 iris = _prepare_iris_data_for_logistic_regression() 225 return { 226 'feature': constant_op.constant( 227 iris.data, dtype=dtypes.float32) 228 }, constant_op.constant( 229 iris.target, shape=[100, 1], dtype=dtypes.int32) 230 231 feature_column = feature_column_lib.real_valued_column( 232 'feature', dimension=4) 233 234 classifier = linear.LinearClassifier(feature_columns=[feature_column]) 235 236 classifier.fit(input_fn=_input_fn, steps=100) 237 scores = classifier.evaluate(input_fn=_input_fn, steps=1) 238 self.assertGreater(scores['accuracy'], 0.9) 239 240 def testEstimatorWithCoreFeatureColumns(self): 241 242 def _input_fn(num_epochs=None): 243 features = { 244 'age': 245 input_lib.limit_epochs( 246 constant_op.constant([[.8], [0.2], [.1]]), 247 num_epochs=num_epochs), 248 'language': 249 sparse_tensor.SparseTensor( 250 values=input_lib.limit_epochs( 251 ['en', 'fr', 'zh'], num_epochs=num_epochs), 252 indices=[[0, 0], [0, 1], [2, 0]], 253 dense_shape=[3, 2]) 254 } 255 return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32) 256 257 language_column = fc_core.categorical_column_with_hash_bucket( 258 'language', hash_bucket_size=20) 259 feature_columns = [language_column, fc_core.numeric_column('age')] 260 261 classifier = linear.LinearClassifier(feature_columns=feature_columns) 262 classifier.fit(input_fn=_input_fn, steps=100) 263 scores = classifier.evaluate(input_fn=_input_fn, steps=1) 264 self.assertGreater(scores['accuracy'], 0.9) 265 266 def testLogisticRegression_MatrixData_Labels1D(self): 267 """Same as the last test, but labels shape is [100] instead of [100, 1].""" 268 269 def _input_fn(): 270 iris = _prepare_iris_data_for_logistic_regression() 271 return { 272 'feature': constant_op.constant( 273 iris.data, dtype=dtypes.float32) 274 }, constant_op.constant( 275 iris.target, shape=[100], dtype=dtypes.int32) 276 277 feature_column = feature_column_lib.real_valued_column( 278 'feature', dimension=4) 279 280 classifier = linear.LinearClassifier(feature_columns=[feature_column]) 281 282 classifier.fit(input_fn=_input_fn, steps=100) 283 scores = classifier.evaluate(input_fn=_input_fn, steps=1) 284 self.assertGreater(scores['accuracy'], 0.9) 285 286 def testLogisticRegression_NpMatrixData(self): 287 """Tests binary classification using numpy matrix data as input.""" 288 iris = _prepare_iris_data_for_logistic_regression() 289 train_x = iris.data 290 train_y = iris.target 291 feature_columns = [feature_column_lib.real_valued_column('', dimension=4)] 292 classifier = linear.LinearClassifier(feature_columns=feature_columns) 293 294 classifier.fit(x=train_x, y=train_y, steps=100) 295 scores = classifier.evaluate(x=train_x, y=train_y, steps=1) 296 self.assertGreater(scores['accuracy'], 0.9) 297 298 def testWeightAndBiasNames(self): 299 """Tests that weight and bias names haven't changed.""" 300 feature_column = feature_column_lib.real_valued_column( 301 'feature', dimension=4) 302 303 classifier = linear.LinearClassifier( 304 n_classes=3, feature_columns=[feature_column]) 305 306 classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) 307 308 variable_names = classifier.get_variable_names() 309 self.assertIn('linear/feature/weight', variable_names) 310 self.assertIn('linear/bias_weight', variable_names) 311 self.assertEqual( 312 4, len(classifier.get_variable_value('linear/feature/weight'))) 313 self.assertEqual( 314 3, len(classifier.get_variable_value('linear/bias_weight'))) 315 316 def testCustomOptimizerByObject(self): 317 """Tests multi-class classification using matrix data as input.""" 318 feature_column = feature_column_lib.real_valued_column( 319 'feature', dimension=4) 320 321 classifier = linear.LinearClassifier( 322 n_classes=3, 323 optimizer=ftrl.FtrlOptimizer(learning_rate=0.1), 324 feature_columns=[feature_column]) 325 326 classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) 327 scores = classifier.evaluate( 328 input_fn=test_data.iris_input_multiclass_fn, steps=100) 329 self.assertGreater(scores['accuracy'], 0.9) 330 331 def testCustomOptimizerByString(self): 332 """Tests multi-class classification using matrix data as input.""" 333 feature_column = feature_column_lib.real_valued_column( 334 'feature', dimension=4) 335 336 def _optimizer(): 337 return ftrl.FtrlOptimizer(learning_rate=0.1) 338 339 classifier = linear.LinearClassifier( 340 n_classes=3, optimizer=_optimizer, feature_columns=[feature_column]) 341 342 classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) 343 scores = classifier.evaluate( 344 input_fn=test_data.iris_input_multiclass_fn, steps=100) 345 self.assertGreater(scores['accuracy'], 0.9) 346 347 def testCustomOptimizerByFunction(self): 348 """Tests multi-class classification using matrix data as input.""" 349 feature_column = feature_column_lib.real_valued_column( 350 'feature', dimension=4) 351 352 classifier = linear.LinearClassifier( 353 n_classes=3, optimizer='Ftrl', feature_columns=[feature_column]) 354 355 classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) 356 scores = classifier.evaluate( 357 input_fn=test_data.iris_input_multiclass_fn, steps=100) 358 self.assertGreater(scores['accuracy'], 0.9) 359 360 def testCustomMetrics(self): 361 """Tests custom evaluation metrics.""" 362 363 def _input_fn(num_epochs=None): 364 # Create 4 rows, one of them (y = x), three of them (y=Not(x)) 365 labels = constant_op.constant([[1], [0], [0], [0]], dtype=dtypes.float32) 366 features = { 367 'x': 368 input_lib.limit_epochs( 369 array_ops.ones( 370 shape=[4, 1], dtype=dtypes.float32), 371 num_epochs=num_epochs) 372 } 373 return features, labels 374 375 def _my_metric_op(predictions, labels): 376 # For the case of binary classification, the 2nd column of "predictions" 377 # denotes the model predictions. 378 predictions = array_ops.strided_slice( 379 predictions, [0, 1], [-1, 2], end_mask=1) 380 return math_ops.reduce_sum(math_ops.multiply(predictions, labels)) 381 382 classifier = linear.LinearClassifier( 383 feature_columns=[feature_column_lib.real_valued_column('x')]) 384 385 classifier.fit(input_fn=_input_fn, steps=100) 386 scores = classifier.evaluate( 387 input_fn=_input_fn, 388 steps=100, 389 metrics={ 390 'my_accuracy': 391 MetricSpec( 392 metric_fn=metric_ops.streaming_accuracy, 393 prediction_key='classes'), 394 'my_precision': 395 MetricSpec( 396 metric_fn=metric_ops.streaming_precision, 397 prediction_key='classes'), 398 'my_metric': 399 MetricSpec( 400 metric_fn=_my_metric_op, prediction_key='probabilities') 401 }) 402 self.assertTrue( 403 set(['loss', 'my_accuracy', 'my_precision', 'my_metric']).issubset( 404 set(scores.keys()))) 405 predict_input_fn = functools.partial(_input_fn, num_epochs=1) 406 predictions = np.array(list(classifier.predict_classes( 407 input_fn=predict_input_fn))) 408 self.assertEqual( 409 _sklearn.accuracy_score([1, 0, 0, 0], predictions), 410 scores['my_accuracy']) 411 412 # Tests the case where the prediction_key is neither "classes" nor 413 # "probabilities". 414 with self.assertRaisesRegexp(KeyError, 'bad_type'): 415 classifier.evaluate( 416 input_fn=_input_fn, 417 steps=100, 418 metrics={ 419 'bad_name': 420 MetricSpec( 421 metric_fn=metric_ops.streaming_auc, 422 prediction_key='bad_type') 423 }) 424 425 # Tests the case where the 2nd element of the key is neither "classes" nor 426 # "probabilities". 427 with self.assertRaises(KeyError): 428 classifier.evaluate( 429 input_fn=_input_fn, 430 steps=100, 431 metrics={('bad_name', 'bad_type'): metric_ops.streaming_auc}) 432 433 # Tests the case where the tuple of the key doesn't have 2 elements. 434 with self.assertRaises(ValueError): 435 classifier.evaluate( 436 input_fn=_input_fn, 437 steps=100, 438 metrics={ 439 ('bad_length_name', 'classes', 'bad_length'): 440 metric_ops.streaming_accuracy 441 }) 442 443 def testLogisticFractionalLabels(self): 444 """Tests logistic training with fractional labels.""" 445 446 def input_fn(num_epochs=None): 447 return { 448 'age': 449 input_lib.limit_epochs( 450 constant_op.constant([[1], [2]]), num_epochs=num_epochs), 451 }, constant_op.constant( 452 [[.7], [0]], dtype=dtypes.float32) 453 454 age = feature_column_lib.real_valued_column('age') 455 456 classifier = linear.LinearClassifier( 457 feature_columns=[age], config=run_config.RunConfig(tf_random_seed=1)) 458 classifier.fit(input_fn=input_fn, steps=500) 459 460 predict_input_fn = functools.partial(input_fn, num_epochs=1) 461 predictions_proba = list( 462 classifier.predict_proba(input_fn=predict_input_fn)) 463 # Prediction probabilities mirror the labels column, which proves that the 464 # classifier learns from float input. 465 self.assertAllClose([[.3, .7], [1., 0.]], predictions_proba, atol=.1) 466 467 def testTrainWithPartitionedVariables(self): 468 """Tests training with partitioned variables.""" 469 470 def _input_fn(): 471 features = { 472 'language': 473 sparse_tensor.SparseTensor( 474 values=['en', 'fr', 'zh'], 475 indices=[[0, 0], [0, 1], [2, 0]], 476 dense_shape=[3, 2]) 477 } 478 labels = constant_op.constant([[1], [0], [0]]) 479 return features, labels 480 481 sparse_features = [ 482 # The given hash_bucket_size results in variables larger than the 483 # default min_slice_size attribute, so the variables are partitioned. 484 feature_column_lib.sparse_column_with_hash_bucket( 485 'language', hash_bucket_size=2e7) 486 ] 487 488 tf_config = { 489 'cluster': { 490 run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1'] 491 } 492 } 493 with test.mock.patch.dict('os.environ', 494 {'TF_CONFIG': json.dumps(tf_config)}): 495 config = run_config.RunConfig() 496 # Because we did not start a distributed cluster, we need to pass an 497 # empty ClusterSpec, otherwise the device_setter will look for 498 # distributed jobs, such as "/job:ps" which are not present. 499 config._cluster_spec = server_lib.ClusterSpec({}) 500 501 classifier = linear.LinearClassifier( 502 feature_columns=sparse_features, config=config) 503 classifier.fit(input_fn=_input_fn, steps=200) 504 loss = classifier.evaluate(input_fn=_input_fn, steps=1)['loss'] 505 self.assertLess(loss, 0.07) 506 507 def testTrainSaveLoad(self): 508 """Tests that insures you can save and reload a trained model.""" 509 510 def input_fn(num_epochs=None): 511 return { 512 'age': 513 input_lib.limit_epochs( 514 constant_op.constant([1]), num_epochs=num_epochs), 515 'language': 516 sparse_tensor.SparseTensor( 517 values=['english'], indices=[[0, 0]], dense_shape=[1, 1]), 518 }, constant_op.constant([[1]]) 519 520 language = feature_column_lib.sparse_column_with_hash_bucket('language', 521 100) 522 age = feature_column_lib.real_valued_column('age') 523 524 model_dir = tempfile.mkdtemp() 525 classifier = linear.LinearClassifier( 526 model_dir=model_dir, feature_columns=[age, language]) 527 classifier.fit(input_fn=input_fn, steps=30) 528 predict_input_fn = functools.partial(input_fn, num_epochs=1) 529 out1_class = list( 530 classifier.predict_classes( 531 input_fn=predict_input_fn, as_iterable=True)) 532 out1_proba = list( 533 classifier.predict_proba( 534 input_fn=predict_input_fn, as_iterable=True)) 535 del classifier 536 537 classifier2 = linear.LinearClassifier( 538 model_dir=model_dir, feature_columns=[age, language]) 539 out2_class = list( 540 classifier2.predict_classes( 541 input_fn=predict_input_fn, as_iterable=True)) 542 out2_proba = list( 543 classifier2.predict_proba( 544 input_fn=predict_input_fn, as_iterable=True)) 545 self.assertTrue(np.array_equal(out1_class, out2_class)) 546 self.assertTrue(np.array_equal(out1_proba, out2_proba)) 547 548 def testWeightColumn(self): 549 """Tests training with given weight column.""" 550 551 def _input_fn_train(): 552 # Create 4 rows, one of them (y = x), three of them (y=Not(x)) 553 # First row has more weight than others. Model should fit (y=x) better 554 # than (y=Not(x)) due to the relative higher weight of the first row. 555 labels = constant_op.constant([[1], [0], [0], [0]]) 556 features = { 557 'x': array_ops.ones( 558 shape=[4, 1], dtype=dtypes.float32), 559 'w': constant_op.constant([[100.], [3.], [2.], [2.]]) 560 } 561 return features, labels 562 563 def _input_fn_eval(): 564 # Create 4 rows (y = x) 565 labels = constant_op.constant([[1], [1], [1], [1]]) 566 features = { 567 'x': array_ops.ones( 568 shape=[4, 1], dtype=dtypes.float32), 569 'w': constant_op.constant([[1.], [1.], [1.], [1.]]) 570 } 571 return features, labels 572 573 classifier = linear.LinearClassifier( 574 weight_column_name='w', 575 feature_columns=[feature_column_lib.real_valued_column('x')], 576 config=run_config.RunConfig(tf_random_seed=3)) 577 578 classifier.fit(input_fn=_input_fn_train, steps=100) 579 scores = classifier.evaluate(input_fn=_input_fn_eval, steps=1) 580 # All examples in eval data set are y=x. 581 self.assertGreater(scores['labels/actual_label_mean'], 0.9) 582 # If there were no weight column, model would learn y=Not(x). Because of 583 # weights, it learns y=x. 584 self.assertGreater(scores['labels/prediction_mean'], 0.9) 585 # All examples in eval data set are y=x. So if weight column were ignored, 586 # then accuracy would be zero. Because of weights, accuracy should be close 587 # to 1.0. 588 self.assertGreater(scores['accuracy'], 0.9) 589 590 scores_train_set = classifier.evaluate(input_fn=_input_fn_train, steps=1) 591 # Considering weights, the mean label should be close to 1.0. 592 # If weights were ignored, it would be 0.25. 593 self.assertGreater(scores_train_set['labels/actual_label_mean'], 0.9) 594 # The classifier has learned y=x. If weight column were ignored in 595 # evaluation, then accuracy for the train set would be 0.25. 596 # Because weight is not ignored, accuracy is greater than 0.6. 597 self.assertGreater(scores_train_set['accuracy'], 0.6) 598 599 def testWeightColumnLoss(self): 600 """Test ensures that you can specify per-example weights for loss.""" 601 602 def _input_fn(): 603 features = { 604 'age': constant_op.constant([[20], [20], [20]]), 605 'weights': constant_op.constant([[100], [1], [1]]), 606 } 607 labels = constant_op.constant([[1], [0], [0]]) 608 return features, labels 609 610 age = feature_column_lib.real_valued_column('age') 611 612 classifier = linear.LinearClassifier(feature_columns=[age]) 613 classifier.fit(input_fn=_input_fn, steps=100) 614 loss_unweighted = classifier.evaluate(input_fn=_input_fn, steps=1)['loss'] 615 616 classifier = linear.LinearClassifier( 617 feature_columns=[age], weight_column_name='weights') 618 classifier.fit(input_fn=_input_fn, steps=100) 619 loss_weighted = classifier.evaluate(input_fn=_input_fn, steps=1)['loss'] 620 621 self.assertLess(loss_weighted, loss_unweighted) 622 623 def testExport(self): 624 """Tests that export model for servo works.""" 625 626 def input_fn(): 627 return { 628 'age': 629 constant_op.constant([1]), 630 'language': 631 sparse_tensor.SparseTensor( 632 values=['english'], indices=[[0, 0]], dense_shape=[1, 1]) 633 }, constant_op.constant([[1]]) 634 635 language = feature_column_lib.sparse_column_with_hash_bucket('language', 636 100) 637 age = feature_column_lib.real_valued_column('age') 638 639 classifier = linear.LinearClassifier(feature_columns=[age, language]) 640 classifier.fit(input_fn=input_fn, steps=100) 641 642 export_dir = tempfile.mkdtemp() 643 classifier.export(export_dir) 644 645 def testDisableCenteredBias(self): 646 """Tests that we can disable centered bias.""" 647 648 def input_fn(): 649 return { 650 'age': 651 constant_op.constant([1]), 652 'language': 653 sparse_tensor.SparseTensor( 654 values=['english'], indices=[[0, 0]], dense_shape=[1, 1]) 655 }, constant_op.constant([[1]]) 656 657 language = feature_column_lib.sparse_column_with_hash_bucket('language', 658 100) 659 age = feature_column_lib.real_valued_column('age') 660 661 classifier = linear.LinearClassifier( 662 feature_columns=[age, language], enable_centered_bias=False) 663 classifier.fit(input_fn=input_fn, steps=100) 664 self.assertNotIn('centered_bias_weight', classifier.get_variable_names()) 665 666 def testEnableCenteredBias(self): 667 """Tests that we can enable centered bias.""" 668 669 def input_fn(): 670 return { 671 'age': 672 constant_op.constant([1]), 673 'language': 674 sparse_tensor.SparseTensor( 675 values=['english'], indices=[[0, 0]], dense_shape=[1, 1]) 676 }, constant_op.constant([[1]]) 677 678 language = feature_column_lib.sparse_column_with_hash_bucket('language', 679 100) 680 age = feature_column_lib.real_valued_column('age') 681 682 classifier = linear.LinearClassifier( 683 feature_columns=[age, language], enable_centered_bias=True) 684 classifier.fit(input_fn=input_fn, steps=100) 685 self.assertIn('linear/binary_logistic_head/centered_bias_weight', 686 classifier.get_variable_names()) 687 688 def testTrainOptimizerWithL1Reg(self): 689 """Tests l1 regularized model has higher loss.""" 690 691 def input_fn(): 692 return { 693 'language': 694 sparse_tensor.SparseTensor( 695 values=['hindi'], indices=[[0, 0]], dense_shape=[1, 1]) 696 }, constant_op.constant([[1]]) 697 698 language = feature_column_lib.sparse_column_with_hash_bucket('language', 699 100) 700 classifier_no_reg = linear.LinearClassifier(feature_columns=[language]) 701 classifier_with_reg = linear.LinearClassifier( 702 feature_columns=[language], 703 optimizer=ftrl.FtrlOptimizer( 704 learning_rate=1.0, l1_regularization_strength=100.)) 705 loss_no_reg = classifier_no_reg.fit(input_fn=input_fn, steps=100).evaluate( 706 input_fn=input_fn, steps=1)['loss'] 707 loss_with_reg = classifier_with_reg.fit(input_fn=input_fn, 708 steps=100).evaluate( 709 input_fn=input_fn, 710 steps=1)['loss'] 711 self.assertLess(loss_no_reg, loss_with_reg) 712 713 def testTrainWithMissingFeature(self): 714 """Tests that training works with missing features.""" 715 716 def input_fn(): 717 return { 718 'language': 719 sparse_tensor.SparseTensor( 720 values=['Swahili', 'turkish'], 721 indices=[[0, 0], [2, 0]], 722 dense_shape=[3, 1]) 723 }, constant_op.constant([[1], [1], [1]]) 724 725 language = feature_column_lib.sparse_column_with_hash_bucket('language', 726 100) 727 classifier = linear.LinearClassifier(feature_columns=[language]) 728 classifier.fit(input_fn=input_fn, steps=100) 729 loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] 730 self.assertLess(loss, 0.07) 731 732 def testSdcaOptimizerRealValuedFeatures(self): 733 """Tests LinearClassifier with SDCAOptimizer and real valued features.""" 734 735 def input_fn(): 736 return { 737 'example_id': constant_op.constant(['1', '2']), 738 'maintenance_cost': constant_op.constant([[500.0], [200.0]]), 739 'sq_footage': constant_op.constant([[800.0], [600.0]]), 740 'weights': constant_op.constant([[1.0], [1.0]]) 741 }, constant_op.constant([[0], [1]]) 742 743 maintenance_cost = feature_column_lib.real_valued_column('maintenance_cost') 744 sq_footage = feature_column_lib.real_valued_column('sq_footage') 745 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 746 example_id_column='example_id') 747 classifier = linear.LinearClassifier( 748 feature_columns=[maintenance_cost, sq_footage], 749 weight_column_name='weights', 750 optimizer=sdca_optimizer) 751 classifier.fit(input_fn=input_fn, steps=100) 752 loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] 753 self.assertLess(loss, 0.05) 754 755 def testSdcaOptimizerRealValuedFeatureWithHigherDimension(self): 756 """Tests SDCAOptimizer with real valued features of higher dimension.""" 757 758 # input_fn is identical to the one in testSdcaOptimizerRealValuedFeatures 759 # where 2 1-dimensional dense features have been replaced by 1 2-dimensional 760 # feature. 761 def input_fn(): 762 return { 763 'example_id': 764 constant_op.constant(['1', '2']), 765 'dense_feature': 766 constant_op.constant([[500.0, 800.0], [200.0, 600.0]]) 767 }, constant_op.constant([[0], [1]]) 768 769 dense_feature = feature_column_lib.real_valued_column( 770 'dense_feature', dimension=2) 771 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 772 example_id_column='example_id') 773 classifier = linear.LinearClassifier( 774 feature_columns=[dense_feature], optimizer=sdca_optimizer) 775 classifier.fit(input_fn=input_fn, steps=100) 776 loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] 777 self.assertLess(loss, 0.05) 778 779 def testSdcaOptimizerBucketizedFeatures(self): 780 """Tests LinearClassifier with SDCAOptimizer and bucketized features.""" 781 782 def input_fn(): 783 return { 784 'example_id': constant_op.constant(['1', '2', '3']), 785 'price': constant_op.constant([[600.0], [1000.0], [400.0]]), 786 'sq_footage': constant_op.constant([[1000.0], [600.0], [700.0]]), 787 'weights': constant_op.constant([[1.0], [1.0], [1.0]]) 788 }, constant_op.constant([[1], [0], [1]]) 789 790 price_bucket = feature_column_lib.bucketized_column( 791 feature_column_lib.real_valued_column('price'), 792 boundaries=[500.0, 700.0]) 793 sq_footage_bucket = feature_column_lib.bucketized_column( 794 feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0]) 795 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 796 example_id_column='example_id', symmetric_l2_regularization=1.0) 797 classifier = linear.LinearClassifier( 798 feature_columns=[price_bucket, sq_footage_bucket], 799 weight_column_name='weights', 800 optimizer=sdca_optimizer) 801 classifier.fit(input_fn=input_fn, steps=50) 802 scores = classifier.evaluate(input_fn=input_fn, steps=1) 803 self.assertGreater(scores['accuracy'], 0.9) 804 805 def testSdcaOptimizerSparseFeatures(self): 806 """Tests LinearClassifier with SDCAOptimizer and sparse features.""" 807 808 def input_fn(): 809 return { 810 'example_id': 811 constant_op.constant(['1', '2', '3']), 812 'price': 813 constant_op.constant([0.4, 0.6, 0.3]), 814 'country': 815 sparse_tensor.SparseTensor( 816 values=['IT', 'US', 'GB'], 817 indices=[[0, 0], [1, 3], [2, 1]], 818 dense_shape=[3, 5]), 819 'weights': 820 constant_op.constant([[1.0], [1.0], [1.0]]) 821 }, constant_op.constant([[1], [0], [1]]) 822 823 price = feature_column_lib.real_valued_column('price') 824 country = feature_column_lib.sparse_column_with_hash_bucket( 825 'country', hash_bucket_size=5) 826 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 827 example_id_column='example_id') 828 classifier = linear.LinearClassifier( 829 feature_columns=[price, country], 830 weight_column_name='weights', 831 optimizer=sdca_optimizer) 832 classifier.fit(input_fn=input_fn, steps=50) 833 scores = classifier.evaluate(input_fn=input_fn, steps=1) 834 self.assertGreater(scores['accuracy'], 0.9) 835 836 def testSdcaOptimizerWeightedSparseFeatures(self): 837 """LinearClassifier with SDCAOptimizer and weighted sparse features.""" 838 839 def input_fn(): 840 return { 841 'example_id': 842 constant_op.constant(['1', '2', '3']), 843 'price': 844 sparse_tensor.SparseTensor( 845 values=[2., 3., 1.], 846 indices=[[0, 0], [1, 0], [2, 0]], 847 dense_shape=[3, 5]), 848 'country': 849 sparse_tensor.SparseTensor( 850 values=['IT', 'US', 'GB'], 851 indices=[[0, 0], [1, 0], [2, 0]], 852 dense_shape=[3, 5]) 853 }, constant_op.constant([[1], [0], [1]]) 854 855 country = feature_column_lib.sparse_column_with_hash_bucket( 856 'country', hash_bucket_size=5) 857 country_weighted_by_price = feature_column_lib.weighted_sparse_column( 858 country, 'price') 859 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 860 example_id_column='example_id') 861 classifier = linear.LinearClassifier( 862 feature_columns=[country_weighted_by_price], optimizer=sdca_optimizer) 863 classifier.fit(input_fn=input_fn, steps=50) 864 scores = classifier.evaluate(input_fn=input_fn, steps=1) 865 self.assertGreater(scores['accuracy'], 0.9) 866 867 def testSdcaOptimizerWeightedSparseFeaturesOOVWithNoOOVBuckets(self): 868 """LinearClassifier with SDCAOptimizer with OOV features (-1 IDs).""" 869 870 def input_fn(): 871 return { 872 'example_id': 873 constant_op.constant(['1', '2', '3']), 874 'price': 875 sparse_tensor.SparseTensor( 876 values=[2., 3., 1.], 877 indices=[[0, 0], [1, 0], [2, 0]], 878 dense_shape=[3, 5]), 879 'country': 880 sparse_tensor.SparseTensor( 881 # 'GB' is out of the vocabulary. 882 values=['IT', 'US', 'GB'], 883 indices=[[0, 0], [1, 0], [2, 0]], 884 dense_shape=[3, 5]) 885 }, constant_op.constant([[1], [0], [1]]) 886 887 country = feature_column_lib.sparse_column_with_keys( 888 'country', keys=['US', 'CA', 'MK', 'IT', 'CN']) 889 country_weighted_by_price = feature_column_lib.weighted_sparse_column( 890 country, 'price') 891 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 892 example_id_column='example_id') 893 classifier = linear.LinearClassifier( 894 feature_columns=[country_weighted_by_price], optimizer=sdca_optimizer) 895 classifier.fit(input_fn=input_fn, steps=50) 896 scores = classifier.evaluate(input_fn=input_fn, steps=1) 897 self.assertGreater(scores['accuracy'], 0.9) 898 899 def testSdcaOptimizerCrossedFeatures(self): 900 """Tests LinearClassifier with SDCAOptimizer and crossed features.""" 901 902 def input_fn(): 903 return { 904 'example_id': 905 constant_op.constant(['1', '2', '3']), 906 'language': 907 sparse_tensor.SparseTensor( 908 values=['english', 'italian', 'spanish'], 909 indices=[[0, 0], [1, 0], [2, 0]], 910 dense_shape=[3, 1]), 911 'country': 912 sparse_tensor.SparseTensor( 913 values=['US', 'IT', 'MX'], 914 indices=[[0, 0], [1, 0], [2, 0]], 915 dense_shape=[3, 1]) 916 }, constant_op.constant([[0], [0], [1]]) 917 918 language = feature_column_lib.sparse_column_with_hash_bucket( 919 'language', hash_bucket_size=5) 920 country = feature_column_lib.sparse_column_with_hash_bucket( 921 'country', hash_bucket_size=5) 922 country_language = feature_column_lib.crossed_column( 923 [language, country], hash_bucket_size=10) 924 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 925 example_id_column='example_id') 926 classifier = linear.LinearClassifier( 927 feature_columns=[country_language], optimizer=sdca_optimizer) 928 classifier.fit(input_fn=input_fn, steps=10) 929 scores = classifier.evaluate(input_fn=input_fn, steps=1) 930 self.assertGreater(scores['accuracy'], 0.9) 931 932 def testSdcaOptimizerMixedFeatures(self): 933 """Tests LinearClassifier with SDCAOptimizer and a mix of features.""" 934 935 def input_fn(): 936 return { 937 'example_id': 938 constant_op.constant(['1', '2', '3']), 939 'price': 940 constant_op.constant([[0.6], [0.8], [0.3]]), 941 'sq_footage': 942 constant_op.constant([[900.0], [700.0], [600.0]]), 943 'country': 944 sparse_tensor.SparseTensor( 945 values=['IT', 'US', 'GB'], 946 indices=[[0, 0], [1, 3], [2, 1]], 947 dense_shape=[3, 5]), 948 'weights': 949 constant_op.constant([[3.0], [1.0], [1.0]]) 950 }, constant_op.constant([[1], [0], [1]]) 951 952 price = feature_column_lib.real_valued_column('price') 953 sq_footage_bucket = feature_column_lib.bucketized_column( 954 feature_column_lib.real_valued_column('sq_footage'), 955 boundaries=[650.0, 800.0]) 956 country = feature_column_lib.sparse_column_with_hash_bucket( 957 'country', hash_bucket_size=5) 958 sq_footage_country = feature_column_lib.crossed_column( 959 [sq_footage_bucket, country], hash_bucket_size=10) 960 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 961 example_id_column='example_id') 962 classifier = linear.LinearClassifier( 963 feature_columns=[price, sq_footage_bucket, country, sq_footage_country], 964 weight_column_name='weights', 965 optimizer=sdca_optimizer) 966 classifier.fit(input_fn=input_fn, steps=50) 967 scores = classifier.evaluate(input_fn=input_fn, steps=1) 968 self.assertGreater(scores['accuracy'], 0.9) 969 970 def testSdcaOptimizerPartitionedVariables(self): 971 """Tests LinearClassifier with SDCAOptimizer with partitioned variables.""" 972 973 def input_fn(): 974 return { 975 'example_id': 976 constant_op.constant(['1', '2', '3']), 977 'price': 978 constant_op.constant([[0.6], [0.8], [0.3]]), 979 'sq_footage': 980 constant_op.constant([[900.0], [700.0], [600.0]]), 981 'country': 982 sparse_tensor.SparseTensor( 983 values=['IT', 'US', 'GB'], 984 indices=[[0, 0], [1, 3], [2, 1]], 985 dense_shape=[3, 5]), 986 'weights': 987 constant_op.constant([[3.0], [1.0], [1.0]]) 988 }, constant_op.constant([[1], [0], [1]]) 989 990 price = feature_column_lib.real_valued_column('price') 991 sq_footage_bucket = feature_column_lib.bucketized_column( 992 feature_column_lib.real_valued_column('sq_footage'), 993 boundaries=[650.0, 800.0]) 994 country = feature_column_lib.sparse_column_with_hash_bucket( 995 'country', hash_bucket_size=5) 996 sq_footage_country = feature_column_lib.crossed_column( 997 [sq_footage_bucket, country], hash_bucket_size=10) 998 999 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1000 example_id_column='example_id', 1001 partitioner=partitioned_variables.fixed_size_partitioner( 1002 num_shards=2, axis=0)) 1003 1004 tf_config = { 1005 'cluster': { 1006 run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1'] 1007 } 1008 } 1009 with test.mock.patch.dict('os.environ', 1010 {'TF_CONFIG': json.dumps(tf_config)}): 1011 config = run_config.RunConfig() 1012 # Because we did not start a distributed cluster, we need to pass an 1013 # empty ClusterSpec, otherwise the device_setter will look for 1014 # distributed jobs, such as "/job:ps" which are not present. 1015 config._cluster_spec = server_lib.ClusterSpec({}) 1016 1017 classifier = linear.LinearClassifier( 1018 feature_columns=[price, sq_footage_bucket, country, sq_footage_country], 1019 weight_column_name='weights', 1020 optimizer=sdca_optimizer, 1021 config=config) 1022 classifier.fit(input_fn=input_fn, steps=50) 1023 scores = classifier.evaluate(input_fn=input_fn, steps=1) 1024 print('all scores = {}'.format(scores)) 1025 self.assertGreater(scores['accuracy'], 0.9) 1026 1027 def testEval(self): 1028 """Tests that eval produces correct metrics. 1029 """ 1030 1031 def input_fn(): 1032 return { 1033 'age': 1034 constant_op.constant([[1], [2]]), 1035 'language': 1036 sparse_tensor.SparseTensor( 1037 values=['greek', 'chinese'], 1038 indices=[[0, 0], [1, 0]], 1039 dense_shape=[2, 1]), 1040 }, constant_op.constant([[1], [0]]) 1041 1042 language = feature_column_lib.sparse_column_with_hash_bucket('language', 1043 100) 1044 age = feature_column_lib.real_valued_column('age') 1045 classifier = linear.LinearClassifier(feature_columns=[age, language]) 1046 1047 # Evaluate on trained model 1048 classifier.fit(input_fn=input_fn, steps=100) 1049 classifier.evaluate(input_fn=input_fn, steps=1) 1050 1051 # TODO(ispir): Enable accuracy check after resolving the randomness issue. 1052 # self.assertLess(evaluated_values['loss/mean'], 0.3) 1053 # self.assertGreater(evaluated_values['accuracy/mean'], .95) 1054 1055 1056class LinearRegressorTest(test.TestCase): 1057 1058 def testExperimentIntegration(self): 1059 cont_features = [ 1060 feature_column_lib.real_valued_column( 1061 'feature', dimension=4) 1062 ] 1063 1064 exp = experiment.Experiment( 1065 estimator=linear.LinearRegressor(feature_columns=cont_features), 1066 train_input_fn=test_data.iris_input_logistic_fn, 1067 eval_input_fn=test_data.iris_input_logistic_fn) 1068 exp.test() 1069 1070 def testEstimatorContract(self): 1071 estimator_test_utils.assert_estimator_contract(self, linear.LinearRegressor) 1072 1073 def testRegression(self): 1074 """Tests that loss goes down with training.""" 1075 1076 def input_fn(): 1077 return { 1078 'age': 1079 constant_op.constant([1]), 1080 'language': 1081 sparse_tensor.SparseTensor( 1082 values=['english'], indices=[[0, 0]], dense_shape=[1, 1]) 1083 }, constant_op.constant([[10.]]) 1084 1085 language = feature_column_lib.sparse_column_with_hash_bucket('language', 1086 100) 1087 age = feature_column_lib.real_valued_column('age') 1088 1089 classifier = linear.LinearRegressor(feature_columns=[age, language]) 1090 classifier.fit(input_fn=input_fn, steps=100) 1091 loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] 1092 classifier.fit(input_fn=input_fn, steps=200) 1093 loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] 1094 1095 self.assertLess(loss2, loss1) 1096 self.assertLess(loss2, 0.5) 1097 1098 def testRegression_MatrixData(self): 1099 """Tests regression using matrix data as input.""" 1100 cont_features = [ 1101 feature_column_lib.real_valued_column( 1102 'feature', dimension=4) 1103 ] 1104 1105 regressor = linear.LinearRegressor( 1106 feature_columns=cont_features, 1107 config=run_config.RunConfig(tf_random_seed=1)) 1108 1109 regressor.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100) 1110 scores = regressor.evaluate( 1111 input_fn=test_data.iris_input_multiclass_fn, steps=1) 1112 self.assertLess(scores['loss'], 0.2) 1113 1114 def testRegression_TensorData(self): 1115 """Tests regression using tensor data as input.""" 1116 1117 def _input_fn(num_epochs=None): 1118 features = { 1119 'age': 1120 input_lib.limit_epochs( 1121 constant_op.constant([[0.8], [0.15], [0.]]), 1122 num_epochs=num_epochs), 1123 'language': 1124 sparse_tensor.SparseTensor( 1125 values=['en', 'fr', 'zh'], 1126 indices=[[0, 0], [0, 1], [2, 0]], 1127 dense_shape=[3, 2]) 1128 } 1129 return features, constant_op.constant( 1130 [1.0, 0., 0.2], dtype=dtypes.float32) 1131 1132 feature_columns = [ 1133 feature_column_lib.sparse_column_with_hash_bucket( 1134 'language', hash_bucket_size=20), 1135 feature_column_lib.real_valued_column('age') 1136 ] 1137 1138 regressor = linear.LinearRegressor( 1139 feature_columns=feature_columns, 1140 config=run_config.RunConfig(tf_random_seed=1)) 1141 1142 regressor.fit(input_fn=_input_fn, steps=100) 1143 1144 scores = regressor.evaluate(input_fn=_input_fn, steps=1) 1145 self.assertLess(scores['loss'], 0.2) 1146 1147 def testLoss(self): 1148 """Tests loss calculation.""" 1149 1150 def _input_fn_train(): 1151 # Create 4 rows, one of them (y = x), three of them (y=Not(x)) 1152 # The algorithm should learn (y = 0.25). 1153 labels = constant_op.constant([[1.], [0.], [0.], [0.]]) 1154 features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),} 1155 return features, labels 1156 1157 regressor = linear.LinearRegressor( 1158 feature_columns=[feature_column_lib.real_valued_column('x')], 1159 config=run_config.RunConfig(tf_random_seed=1)) 1160 1161 regressor.fit(input_fn=_input_fn_train, steps=100) 1162 scores = regressor.evaluate(input_fn=_input_fn_train, steps=1) 1163 # Average square loss = (0.75^2 + 3*0.25^2) / 4 = 0.1875 1164 self.assertAlmostEqual(0.1875, scores['loss'], delta=0.1) 1165 1166 def testLossWithWeights(self): 1167 """Tests loss calculation with weights.""" 1168 1169 def _input_fn_train(): 1170 # 4 rows with equal weight, one of them (y = x), three of them (y=Not(x)) 1171 # The algorithm should learn (y = 0.25). 1172 labels = constant_op.constant([[1.], [0.], [0.], [0.]]) 1173 features = { 1174 'x': array_ops.ones( 1175 shape=[4, 1], dtype=dtypes.float32), 1176 'w': constant_op.constant([[1.], [1.], [1.], [1.]]) 1177 } 1178 return features, labels 1179 1180 def _input_fn_eval(): 1181 # 4 rows, with different weights. 1182 labels = constant_op.constant([[1.], [0.], [0.], [0.]]) 1183 features = { 1184 'x': array_ops.ones( 1185 shape=[4, 1], dtype=dtypes.float32), 1186 'w': constant_op.constant([[7.], [1.], [1.], [1.]]) 1187 } 1188 return features, labels 1189 1190 regressor = linear.LinearRegressor( 1191 weight_column_name='w', 1192 feature_columns=[feature_column_lib.real_valued_column('x')], 1193 config=run_config.RunConfig(tf_random_seed=1)) 1194 1195 regressor.fit(input_fn=_input_fn_train, steps=100) 1196 scores = regressor.evaluate(input_fn=_input_fn_eval, steps=1) 1197 # Weighted average square loss = (7*0.75^2 + 3*0.25^2) / 10 = 0.4125 1198 self.assertAlmostEqual(0.4125, scores['loss'], delta=0.1) 1199 1200 def testTrainWithWeights(self): 1201 """Tests training with given weight column.""" 1202 1203 def _input_fn_train(): 1204 # Create 4 rows, one of them (y = x), three of them (y=Not(x)) 1205 # First row has more weight than others. Model should fit (y=x) better 1206 # than (y=Not(x)) due to the relative higher weight of the first row. 1207 labels = constant_op.constant([[1.], [0.], [0.], [0.]]) 1208 features = { 1209 'x': array_ops.ones( 1210 shape=[4, 1], dtype=dtypes.float32), 1211 'w': constant_op.constant([[100.], [3.], [2.], [2.]]) 1212 } 1213 return features, labels 1214 1215 def _input_fn_eval(): 1216 # Create 4 rows (y = x) 1217 labels = constant_op.constant([[1.], [1.], [1.], [1.]]) 1218 features = { 1219 'x': array_ops.ones( 1220 shape=[4, 1], dtype=dtypes.float32), 1221 'w': constant_op.constant([[1.], [1.], [1.], [1.]]) 1222 } 1223 return features, labels 1224 1225 regressor = linear.LinearRegressor( 1226 weight_column_name='w', 1227 feature_columns=[feature_column_lib.real_valued_column('x')], 1228 config=run_config.RunConfig(tf_random_seed=1)) 1229 1230 regressor.fit(input_fn=_input_fn_train, steps=100) 1231 scores = regressor.evaluate(input_fn=_input_fn_eval, steps=1) 1232 # The model should learn (y = x) because of the weights, so the loss should 1233 # be close to zero. 1234 self.assertLess(scores['loss'], 0.1) 1235 1236 def testPredict_AsIterableFalse(self): 1237 """Tests predict method with as_iterable=False.""" 1238 labels = [1.0, 0., 0.2] 1239 1240 def _input_fn(num_epochs=None): 1241 features = { 1242 'age': 1243 input_lib.limit_epochs( 1244 constant_op.constant([[0.8], [0.15], [0.]]), 1245 num_epochs=num_epochs), 1246 'language': 1247 sparse_tensor.SparseTensor( 1248 values=['en', 'fr', 'zh'], 1249 indices=[[0, 0], [0, 1], [2, 0]], 1250 dense_shape=[3, 2]) 1251 } 1252 return features, constant_op.constant(labels, dtype=dtypes.float32) 1253 1254 feature_columns = [ 1255 feature_column_lib.sparse_column_with_hash_bucket( 1256 'language', hash_bucket_size=20), 1257 feature_column_lib.real_valued_column('age') 1258 ] 1259 1260 regressor = linear.LinearRegressor( 1261 feature_columns=feature_columns, 1262 config=run_config.RunConfig(tf_random_seed=1)) 1263 1264 regressor.fit(input_fn=_input_fn, steps=100) 1265 1266 scores = regressor.evaluate(input_fn=_input_fn, steps=1) 1267 self.assertLess(scores['loss'], 0.1) 1268 predicted_scores = regressor.predict_scores( 1269 input_fn=_input_fn, as_iterable=False) 1270 self.assertAllClose(labels, predicted_scores, atol=0.1) 1271 predictions = regressor.predict(input_fn=_input_fn, as_iterable=False) 1272 self.assertAllClose(predicted_scores, predictions) 1273 1274 def testPredict_AsIterable(self): 1275 """Tests predict method with as_iterable=True.""" 1276 labels = [1.0, 0., 0.2] 1277 1278 def _input_fn(num_epochs=None): 1279 features = { 1280 'age': 1281 input_lib.limit_epochs( 1282 constant_op.constant([[0.8], [0.15], [0.]]), 1283 num_epochs=num_epochs), 1284 'language': 1285 sparse_tensor.SparseTensor( 1286 values=['en', 'fr', 'zh'], 1287 indices=[[0, 0], [0, 1], [2, 0]], 1288 dense_shape=[3, 2]) 1289 } 1290 return features, constant_op.constant(labels, dtype=dtypes.float32) 1291 1292 feature_columns = [ 1293 feature_column_lib.sparse_column_with_hash_bucket( 1294 'language', hash_bucket_size=20), 1295 feature_column_lib.real_valued_column('age') 1296 ] 1297 1298 regressor = linear.LinearRegressor( 1299 feature_columns=feature_columns, 1300 config=run_config.RunConfig(tf_random_seed=1)) 1301 1302 regressor.fit(input_fn=_input_fn, steps=100) 1303 1304 scores = regressor.evaluate(input_fn=_input_fn, steps=1) 1305 self.assertLess(scores['loss'], 0.1) 1306 predict_input_fn = functools.partial(_input_fn, num_epochs=1) 1307 predicted_scores = list( 1308 regressor.predict_scores( 1309 input_fn=predict_input_fn, as_iterable=True)) 1310 self.assertAllClose(labels, predicted_scores, atol=0.1) 1311 predictions = list( 1312 regressor.predict( 1313 input_fn=predict_input_fn, as_iterable=True)) 1314 self.assertAllClose(predicted_scores, predictions) 1315 1316 def testCustomMetrics(self): 1317 """Tests custom evaluation metrics.""" 1318 1319 def _input_fn(num_epochs=None): 1320 # Create 4 rows, one of them (y = x), three of them (y=Not(x)) 1321 labels = constant_op.constant([[1.], [0.], [0.], [0.]]) 1322 features = { 1323 'x': 1324 input_lib.limit_epochs( 1325 array_ops.ones( 1326 shape=[4, 1], dtype=dtypes.float32), 1327 num_epochs=num_epochs) 1328 } 1329 return features, labels 1330 1331 def _my_metric_op(predictions, labels): 1332 return math_ops.reduce_sum(math_ops.multiply(predictions, labels)) 1333 1334 regressor = linear.LinearRegressor( 1335 feature_columns=[feature_column_lib.real_valued_column('x')], 1336 config=run_config.RunConfig(tf_random_seed=1)) 1337 1338 regressor.fit(input_fn=_input_fn, steps=100) 1339 scores = regressor.evaluate( 1340 input_fn=_input_fn, 1341 steps=1, 1342 metrics={ 1343 'my_error': 1344 MetricSpec( 1345 metric_fn=metric_ops.streaming_mean_squared_error, 1346 prediction_key='scores'), 1347 'my_metric': 1348 MetricSpec( 1349 metric_fn=_my_metric_op, prediction_key='scores') 1350 }) 1351 self.assertIn('loss', set(scores.keys())) 1352 self.assertIn('my_error', set(scores.keys())) 1353 self.assertIn('my_metric', set(scores.keys())) 1354 predict_input_fn = functools.partial(_input_fn, num_epochs=1) 1355 predictions = np.array(list( 1356 regressor.predict_scores(input_fn=predict_input_fn))) 1357 self.assertAlmostEqual( 1358 _sklearn.mean_squared_error(np.array([1, 0, 0, 0]), predictions), 1359 scores['my_error']) 1360 1361 # Tests the case where the prediction_key is not "scores". 1362 with self.assertRaisesRegexp(KeyError, 'bad_type'): 1363 regressor.evaluate( 1364 input_fn=_input_fn, 1365 steps=1, 1366 metrics={ 1367 'bad_name': 1368 MetricSpec( 1369 metric_fn=metric_ops.streaming_auc, 1370 prediction_key='bad_type') 1371 }) 1372 1373 # Tests the case where the 2nd element of the key is not "scores". 1374 with self.assertRaises(KeyError): 1375 regressor.evaluate( 1376 input_fn=_input_fn, 1377 steps=1, 1378 metrics={ 1379 ('my_error', 'predictions'): 1380 metric_ops.streaming_mean_squared_error 1381 }) 1382 1383 # Tests the case where the tuple of the key doesn't have 2 elements. 1384 with self.assertRaises(ValueError): 1385 regressor.evaluate( 1386 input_fn=_input_fn, 1387 steps=1, 1388 metrics={ 1389 ('bad_length_name', 'scores', 'bad_length'): 1390 metric_ops.streaming_mean_squared_error 1391 }) 1392 1393 def testTrainSaveLoad(self): 1394 """Tests that insures you can save and reload a trained model.""" 1395 1396 def _input_fn(num_epochs=None): 1397 features = { 1398 'age': 1399 input_lib.limit_epochs( 1400 constant_op.constant([[0.8], [0.15], [0.]]), 1401 num_epochs=num_epochs), 1402 'language': 1403 sparse_tensor.SparseTensor( 1404 values=['en', 'fr', 'zh'], 1405 indices=[[0, 0], [0, 1], [2, 0]], 1406 dense_shape=[3, 2]) 1407 } 1408 return features, constant_op.constant( 1409 [1.0, 0., 0.2], dtype=dtypes.float32) 1410 1411 feature_columns = [ 1412 feature_column_lib.sparse_column_with_hash_bucket( 1413 'language', hash_bucket_size=20), 1414 feature_column_lib.real_valued_column('age') 1415 ] 1416 1417 model_dir = tempfile.mkdtemp() 1418 regressor = linear.LinearRegressor( 1419 model_dir=model_dir, 1420 feature_columns=feature_columns, 1421 config=run_config.RunConfig(tf_random_seed=1)) 1422 1423 regressor.fit(input_fn=_input_fn, steps=100) 1424 predict_input_fn = functools.partial(_input_fn, num_epochs=1) 1425 predictions = list(regressor.predict_scores(input_fn=predict_input_fn)) 1426 del regressor 1427 1428 regressor2 = linear.LinearRegressor( 1429 model_dir=model_dir, feature_columns=feature_columns) 1430 predictions2 = list(regressor2.predict_scores(input_fn=predict_input_fn)) 1431 self.assertAllClose(predictions, predictions2) 1432 1433 def testTrainWithPartitionedVariables(self): 1434 """Tests training with partitioned variables.""" 1435 1436 def _input_fn(num_epochs=None): 1437 features = { 1438 'age': 1439 input_lib.limit_epochs( 1440 constant_op.constant([[0.8], [0.15], [0.]]), 1441 num_epochs=num_epochs), 1442 'language': 1443 sparse_tensor.SparseTensor( 1444 values=['en', 'fr', 'zh'], 1445 indices=[[0, 0], [0, 1], [2, 0]], 1446 dense_shape=[3, 2]) 1447 } 1448 return features, constant_op.constant( 1449 [1.0, 0., 0.2], dtype=dtypes.float32) 1450 1451 feature_columns = [ 1452 # The given hash_bucket_size results in variables larger than the 1453 # default min_slice_size attribute, so the variables are partitioned. 1454 feature_column_lib.sparse_column_with_hash_bucket( 1455 'language', hash_bucket_size=2e7), 1456 feature_column_lib.real_valued_column('age') 1457 ] 1458 1459 tf_config = { 1460 'cluster': { 1461 run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1'] 1462 } 1463 } 1464 with test.mock.patch.dict('os.environ', 1465 {'TF_CONFIG': json.dumps(tf_config)}): 1466 config = run_config.RunConfig(tf_random_seed=1) 1467 # Because we did not start a distributed cluster, we need to pass an 1468 # empty ClusterSpec, otherwise the device_setter will look for 1469 # distributed jobs, such as "/job:ps" which are not present. 1470 config._cluster_spec = server_lib.ClusterSpec({}) 1471 1472 regressor = linear.LinearRegressor( 1473 feature_columns=feature_columns, config=config) 1474 1475 regressor.fit(input_fn=_input_fn, steps=100) 1476 1477 scores = regressor.evaluate(input_fn=_input_fn, steps=1) 1478 self.assertLess(scores['loss'], 0.1) 1479 1480 def testDisableCenteredBias(self): 1481 """Tests that we can disable centered bias.""" 1482 1483 def _input_fn(num_epochs=None): 1484 features = { 1485 'age': 1486 input_lib.limit_epochs( 1487 constant_op.constant([[0.8], [0.15], [0.]]), 1488 num_epochs=num_epochs), 1489 'language': 1490 sparse_tensor.SparseTensor( 1491 values=['en', 'fr', 'zh'], 1492 indices=[[0, 0], [0, 1], [2, 0]], 1493 dense_shape=[3, 2]) 1494 } 1495 return features, constant_op.constant( 1496 [1.0, 0., 0.2], dtype=dtypes.float32) 1497 1498 feature_columns = [ 1499 feature_column_lib.sparse_column_with_hash_bucket( 1500 'language', hash_bucket_size=20), 1501 feature_column_lib.real_valued_column('age') 1502 ] 1503 1504 regressor = linear.LinearRegressor( 1505 feature_columns=feature_columns, 1506 enable_centered_bias=False, 1507 config=run_config.RunConfig(tf_random_seed=1)) 1508 1509 regressor.fit(input_fn=_input_fn, steps=100) 1510 1511 scores = regressor.evaluate(input_fn=_input_fn, steps=1) 1512 self.assertLess(scores['loss'], 0.1) 1513 1514 def testRecoverWeights(self): 1515 rng = np.random.RandomState(67) 1516 n = 1000 1517 n_weights = 10 1518 bias = 2 1519 x = rng.uniform(-1, 1, (n, n_weights)) 1520 weights = 10 * rng.randn(n_weights) 1521 y = np.dot(x, weights) 1522 y += rng.randn(len(x)) * 0.05 + rng.normal(bias, 0.01) 1523 feature_columns = estimator.infer_real_valued_columns_from_input(x) 1524 regressor = linear.LinearRegressor( 1525 feature_columns=feature_columns, 1526 optimizer=ftrl.FtrlOptimizer(learning_rate=0.8)) 1527 regressor.fit(x, y, batch_size=64, steps=2000) 1528 self.assertIn('linear//weight', regressor.get_variable_names()) 1529 regressor_weights = regressor.get_variable_value('linear//weight') 1530 # Have to flatten weights since they come in (x, 1) shape. 1531 self.assertAllClose(weights, regressor_weights.flatten(), rtol=1) 1532 # TODO(ispir): Disable centered_bias. 1533 # assert abs(bias - regressor.bias_) < 0.1 1534 1535 def testSdcaOptimizerRealValuedLinearFeatures(self): 1536 """Tests LinearRegressor with SDCAOptimizer and real valued features.""" 1537 x = [[1.2, 2.0, -1.5], [-2.0, 3.0, -0.5], [1.0, -0.5, 4.0]] 1538 weights = [[3.0], [-1.2], [0.5]] 1539 y = np.dot(x, weights) 1540 1541 def input_fn(): 1542 return { 1543 'example_id': constant_op.constant(['1', '2', '3']), 1544 'x': constant_op.constant(x), 1545 'weights': constant_op.constant([[10.0], [10.0], [10.0]]) 1546 }, constant_op.constant(y) 1547 1548 x_column = feature_column_lib.real_valued_column('x', dimension=3) 1549 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1550 example_id_column='example_id') 1551 regressor = linear.LinearRegressor( 1552 feature_columns=[x_column], 1553 weight_column_name='weights', 1554 optimizer=sdca_optimizer) 1555 regressor.fit(input_fn=input_fn, steps=20) 1556 loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] 1557 self.assertLess(loss, 0.01) 1558 self.assertIn('linear/x/weight', regressor.get_variable_names()) 1559 regressor_weights = regressor.get_variable_value('linear/x/weight') 1560 self.assertAllClose( 1561 [w[0] for w in weights], regressor_weights.flatten(), rtol=0.1) 1562 1563 def testSdcaOptimizerMixedFeaturesArbitraryWeights(self): 1564 """Tests LinearRegressor with SDCAOptimizer and a mix of features.""" 1565 1566 def input_fn(): 1567 return { 1568 'example_id': 1569 constant_op.constant(['1', '2', '3']), 1570 'price': 1571 constant_op.constant([0.6, 0.8, 0.3]), 1572 'sq_footage': 1573 constant_op.constant([[900.0], [700.0], [600.0]]), 1574 'country': 1575 sparse_tensor.SparseTensor( 1576 values=['IT', 'US', 'GB'], 1577 indices=[[0, 0], [1, 3], [2, 1]], 1578 dense_shape=[3, 5]), 1579 'weights': 1580 constant_op.constant([[3.0], [5.0], [7.0]]) 1581 }, constant_op.constant([[1.55], [-1.25], [-3.0]]) 1582 1583 price = feature_column_lib.real_valued_column('price') 1584 sq_footage_bucket = feature_column_lib.bucketized_column( 1585 feature_column_lib.real_valued_column('sq_footage'), 1586 boundaries=[650.0, 800.0]) 1587 country = feature_column_lib.sparse_column_with_hash_bucket( 1588 'country', hash_bucket_size=5) 1589 sq_footage_country = feature_column_lib.crossed_column( 1590 [sq_footage_bucket, country], hash_bucket_size=10) 1591 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1592 example_id_column='example_id', symmetric_l2_regularization=1.0) 1593 regressor = linear.LinearRegressor( 1594 feature_columns=[price, sq_footage_bucket, country, sq_footage_country], 1595 weight_column_name='weights', 1596 optimizer=sdca_optimizer) 1597 regressor.fit(input_fn=input_fn, steps=20) 1598 loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] 1599 self.assertLess(loss, 0.05) 1600 1601 def testSdcaOptimizerPartitionedVariables(self): 1602 """Tests LinearRegressor with SDCAOptimizer with partitioned variables.""" 1603 1604 def input_fn(): 1605 return { 1606 'example_id': 1607 constant_op.constant(['1', '2', '3']), 1608 'price': 1609 constant_op.constant([0.6, 0.8, 0.3]), 1610 'sq_footage': 1611 constant_op.constant([[900.0], [700.0], [600.0]]), 1612 'country': 1613 sparse_tensor.SparseTensor( 1614 values=['IT', 'US', 'GB'], 1615 indices=[[0, 0], [1, 3], [2, 1]], 1616 dense_shape=[3, 5]), 1617 'weights': 1618 constant_op.constant([[3.0], [5.0], [7.0]]) 1619 }, constant_op.constant([[1.55], [-1.25], [-3.0]]) 1620 1621 price = feature_column_lib.real_valued_column('price') 1622 sq_footage_bucket = feature_column_lib.bucketized_column( 1623 feature_column_lib.real_valued_column('sq_footage'), 1624 boundaries=[650.0, 800.0]) 1625 country = feature_column_lib.sparse_column_with_hash_bucket( 1626 'country', hash_bucket_size=5) 1627 sq_footage_country = feature_column_lib.crossed_column( 1628 [sq_footage_bucket, country], hash_bucket_size=10) 1629 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1630 example_id_column='example_id', symmetric_l2_regularization=1.0, 1631 partitioner=partitioned_variables.fixed_size_partitioner( 1632 num_shards=2, axis=0)) 1633 tf_config = { 1634 'cluster': { 1635 run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1'] 1636 } 1637 } 1638 with test.mock.patch.dict('os.environ', 1639 {'TF_CONFIG': json.dumps(tf_config)}): 1640 config = run_config.RunConfig() 1641 # Because we did not start a distributed cluster, we need to pass an 1642 # empty ClusterSpec, otherwise the device_setter will look for 1643 # distributed jobs, such as "/job:ps" which are not present. 1644 config._cluster_spec = server_lib.ClusterSpec({}) 1645 1646 regressor = linear.LinearRegressor( 1647 feature_columns=[price, sq_footage_bucket, country, sq_footage_country], 1648 weight_column_name='weights', 1649 optimizer=sdca_optimizer, 1650 config=config) 1651 regressor.fit(input_fn=input_fn, steps=20) 1652 loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] 1653 self.assertLess(loss, 0.05) 1654 1655 def testSdcaOptimizerSparseFeaturesWithL1Reg(self): 1656 """Tests LinearClassifier with SDCAOptimizer and sparse features.""" 1657 1658 def input_fn(): 1659 return { 1660 'example_id': 1661 constant_op.constant(['1', '2', '3']), 1662 'price': 1663 constant_op.constant([[0.4], [0.6], [0.3]]), 1664 'country': 1665 sparse_tensor.SparseTensor( 1666 values=['IT', 'US', 'GB'], 1667 indices=[[0, 0], [1, 3], [2, 1]], 1668 dense_shape=[3, 5]), 1669 'weights': 1670 constant_op.constant([[10.0], [10.0], [10.0]]) 1671 }, constant_op.constant([[1.4], [-0.8], [2.6]]) 1672 1673 price = feature_column_lib.real_valued_column('price') 1674 country = feature_column_lib.sparse_column_with_hash_bucket( 1675 'country', hash_bucket_size=5) 1676 # Regressor with no L1 regularization. 1677 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1678 example_id_column='example_id') 1679 regressor = linear.LinearRegressor( 1680 feature_columns=[price, country], 1681 weight_column_name='weights', 1682 optimizer=sdca_optimizer) 1683 regressor.fit(input_fn=input_fn, steps=20) 1684 no_l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] 1685 variable_names = regressor.get_variable_names() 1686 self.assertIn('linear/price/weight', variable_names) 1687 self.assertIn('linear/country/weights', variable_names) 1688 no_l1_reg_weights = { 1689 'linear/price/weight': regressor.get_variable_value( 1690 'linear/price/weight'), 1691 'linear/country/weights': regressor.get_variable_value( 1692 'linear/country/weights'), 1693 } 1694 1695 # Regressor with L1 regularization. 1696 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1697 example_id_column='example_id', symmetric_l1_regularization=1.0) 1698 regressor = linear.LinearRegressor( 1699 feature_columns=[price, country], 1700 weight_column_name='weights', 1701 optimizer=sdca_optimizer) 1702 regressor.fit(input_fn=input_fn, steps=20) 1703 l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] 1704 l1_reg_weights = { 1705 'linear/price/weight': regressor.get_variable_value( 1706 'linear/price/weight'), 1707 'linear/country/weights': regressor.get_variable_value( 1708 'linear/country/weights'), 1709 } 1710 1711 # Unregularized loss is lower when there is no L1 regularization. 1712 self.assertLess(no_l1_reg_loss, l1_reg_loss) 1713 self.assertLess(no_l1_reg_loss, 0.05) 1714 1715 # But weights returned by the regressor with L1 regularization have smaller 1716 # L1 norm. 1717 l1_reg_weights_norm, no_l1_reg_weights_norm = 0.0, 0.0 1718 for var_name in sorted(l1_reg_weights): 1719 l1_reg_weights_norm += sum( 1720 np.absolute(l1_reg_weights[var_name].flatten())) 1721 no_l1_reg_weights_norm += sum( 1722 np.absolute(no_l1_reg_weights[var_name].flatten())) 1723 print('Var name: %s, value: %s' % 1724 (var_name, no_l1_reg_weights[var_name].flatten())) 1725 self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm) 1726 1727 def testSdcaOptimizerBiasOnly(self): 1728 """Tests LinearClassifier with SDCAOptimizer and validates bias weight.""" 1729 1730 def input_fn(): 1731 """Testing the bias weight when it's the only feature present. 1732 1733 All of the instances in this input only have the bias feature, and a 1734 1/4 of the labels are positive. This means that the expected weight for 1735 the bias should be close to the average prediction, i.e 0.25. 1736 Returns: 1737 Training data for the test. 1738 """ 1739 num_examples = 40 1740 return { 1741 'example_id': 1742 constant_op.constant([str(x + 1) for x in range(num_examples)]), 1743 # place_holder is an empty column which is always 0 (absent), because 1744 # LinearClassifier requires at least one column. 1745 'place_holder': 1746 constant_op.constant([[0.0]] * num_examples), 1747 }, constant_op.constant( 1748 [[1 if i % 4 == 0 else 0] for i in range(num_examples)]) 1749 1750 place_holder = feature_column_lib.real_valued_column('place_holder') 1751 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1752 example_id_column='example_id') 1753 regressor = linear.LinearRegressor( 1754 feature_columns=[place_holder], optimizer=sdca_optimizer) 1755 regressor.fit(input_fn=input_fn, steps=100) 1756 1757 self.assertNear( 1758 regressor.get_variable_value('linear/bias_weight')[0], 0.25, err=0.1) 1759 1760 def testSdcaOptimizerBiasAndOtherColumns(self): 1761 """Tests LinearClassifier with SDCAOptimizer and validates bias weight.""" 1762 1763 def input_fn(): 1764 """Testing the bias weight when there are other features present. 1765 1766 1/2 of the instances in this input have feature 'a', the rest have 1767 feature 'b', and we expect the bias to be added to each instance as well. 1768 0.4 of all instances that have feature 'a' are positive, and 0.2 of all 1769 instances that have feature 'b' are positive. The labels in the dataset 1770 are ordered to appear shuffled since SDCA expects shuffled data, and 1771 converges faster with this pseudo-random ordering. 1772 If the bias was centered we would expect the weights to be: 1773 bias: 0.3 1774 a: 0.1 1775 b: -0.1 1776 Until b/29339026 is resolved, the bias gets regularized with the same 1777 global value for the other columns, and so the expected weights get 1778 shifted and are: 1779 bias: 0.2 1780 a: 0.2 1781 b: 0.0 1782 Returns: 1783 The test dataset. 1784 """ 1785 num_examples = 200 1786 half = int(num_examples / 2) 1787 return { 1788 'example_id': 1789 constant_op.constant([str(x + 1) for x in range(num_examples)]), 1790 'a': 1791 constant_op.constant([[1]] * int(half) + [[0]] * int(half)), 1792 'b': 1793 constant_op.constant([[0]] * int(half) + [[1]] * int(half)), 1794 }, constant_op.constant( 1795 [[x] 1796 for x in [1, 0, 0, 1, 1, 0, 0, 0, 1, 0] * int(half / 10) + 1797 [0, 1, 0, 0, 0, 0, 0, 0, 1, 0] * int(half / 10)]) 1798 1799 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1800 example_id_column='example_id') 1801 regressor = linear.LinearRegressor( 1802 feature_columns=[ 1803 feature_column_lib.real_valued_column('a'), 1804 feature_column_lib.real_valued_column('b') 1805 ], 1806 optimizer=sdca_optimizer) 1807 1808 regressor.fit(input_fn=input_fn, steps=200) 1809 1810 variable_names = regressor.get_variable_names() 1811 self.assertIn('linear/bias_weight', variable_names) 1812 self.assertIn('linear/a/weight', variable_names) 1813 self.assertIn('linear/b/weight', variable_names) 1814 # TODO(b/29339026): Change the expected results to expect a centered bias. 1815 self.assertNear( 1816 regressor.get_variable_value('linear/bias_weight')[0], 0.2, err=0.05) 1817 self.assertNear( 1818 regressor.get_variable_value('linear/a/weight')[0], 0.2, err=0.05) 1819 self.assertNear( 1820 regressor.get_variable_value('linear/b/weight')[0], 0.0, err=0.05) 1821 1822 def testSdcaOptimizerBiasAndOtherColumnsFabricatedCentered(self): 1823 """Tests LinearClassifier with SDCAOptimizer and validates bias weight.""" 1824 1825 def input_fn(): 1826 """Testing the bias weight when there are other features present. 1827 1828 1/2 of the instances in this input have feature 'a', the rest have 1829 feature 'b', and we expect the bias to be added to each instance as well. 1830 0.1 of all instances that have feature 'a' have a label of 1, and 0.1 of 1831 all instances that have feature 'b' have a label of -1. 1832 We can expect the weights to be: 1833 bias: 0.0 1834 a: 0.1 1835 b: -0.1 1836 Returns: 1837 The test dataset. 1838 """ 1839 num_examples = 200 1840 half = int(num_examples / 2) 1841 return { 1842 'example_id': 1843 constant_op.constant([str(x + 1) for x in range(num_examples)]), 1844 'a': 1845 constant_op.constant([[1]] * int(half) + [[0]] * int(half)), 1846 'b': 1847 constant_op.constant([[0]] * int(half) + [[1]] * int(half)), 1848 }, constant_op.constant([[1 if x % 10 == 0 else 0] for x in range(half)] + 1849 [[-1 if x % 10 == 0 else 0] for x in range(half)]) 1850 1851 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1852 example_id_column='example_id') 1853 regressor = linear.LinearRegressor( 1854 feature_columns=[ 1855 feature_column_lib.real_valued_column('a'), 1856 feature_column_lib.real_valued_column('b') 1857 ], 1858 optimizer=sdca_optimizer) 1859 1860 regressor.fit(input_fn=input_fn, steps=100) 1861 1862 variable_names = regressor.get_variable_names() 1863 self.assertIn('linear/bias_weight', variable_names) 1864 self.assertIn('linear/a/weight', variable_names) 1865 self.assertIn('linear/b/weight', variable_names) 1866 self.assertNear( 1867 regressor.get_variable_value('linear/bias_weight')[0], 0.0, err=0.05) 1868 self.assertNear( 1869 regressor.get_variable_value('linear/a/weight')[0], 0.1, err=0.05) 1870 self.assertNear( 1871 regressor.get_variable_value('linear/b/weight')[0], -0.1, err=0.05) 1872 1873 1874class LinearEstimatorTest(test.TestCase): 1875 1876 def testExperimentIntegration(self): 1877 cont_features = [ 1878 feature_column_lib.real_valued_column( 1879 'feature', dimension=4) 1880 ] 1881 exp = experiment.Experiment( 1882 estimator=linear.LinearEstimator(feature_columns=cont_features, 1883 head=head_lib.regression_head()), 1884 train_input_fn=test_data.iris_input_logistic_fn, 1885 eval_input_fn=test_data.iris_input_logistic_fn) 1886 exp.test() 1887 1888 def testEstimatorContract(self): 1889 estimator_test_utils.assert_estimator_contract(self, 1890 linear.LinearEstimator) 1891 1892 def testLinearRegression(self): 1893 """Tests that loss goes down with training.""" 1894 1895 def input_fn(): 1896 return { 1897 'age': 1898 constant_op.constant([1]), 1899 'language': 1900 sparse_tensor.SparseTensor( 1901 values=['english'], indices=[[0, 0]], dense_shape=[1, 1]) 1902 }, constant_op.constant([[10.]]) 1903 1904 language = feature_column_lib.sparse_column_with_hash_bucket('language', 1905 100) 1906 age = feature_column_lib.real_valued_column('age') 1907 1908 linear_estimator = linear.LinearEstimator(feature_columns=[age, language], 1909 head=head_lib.regression_head()) 1910 linear_estimator.fit(input_fn=input_fn, steps=100) 1911 loss1 = linear_estimator.evaluate(input_fn=input_fn, steps=1)['loss'] 1912 linear_estimator.fit(input_fn=input_fn, steps=400) 1913 loss2 = linear_estimator.evaluate(input_fn=input_fn, steps=1)['loss'] 1914 1915 self.assertLess(loss2, loss1) 1916 self.assertLess(loss2, 0.5) 1917 1918 def testPoissonRegression(self): 1919 """Tests that loss goes down with training.""" 1920 1921 def input_fn(): 1922 return { 1923 'age': 1924 constant_op.constant([1]), 1925 'language': 1926 sparse_tensor.SparseTensor( 1927 values=['english'], indices=[[0, 0]], dense_shape=[1, 1]) 1928 }, constant_op.constant([[10.]]) 1929 1930 language = feature_column_lib.sparse_column_with_hash_bucket('language', 1931 100) 1932 age = feature_column_lib.real_valued_column('age') 1933 1934 linear_estimator = linear.LinearEstimator( 1935 feature_columns=[age, language], 1936 head=head_lib.poisson_regression_head()) 1937 linear_estimator.fit(input_fn=input_fn, steps=10) 1938 loss1 = linear_estimator.evaluate(input_fn=input_fn, steps=1)['loss'] 1939 linear_estimator.fit(input_fn=input_fn, steps=100) 1940 loss2 = linear_estimator.evaluate(input_fn=input_fn, steps=1)['loss'] 1941 1942 self.assertLess(loss2, loss1) 1943 # Here loss of 2.1 implies a prediction of ~9.9998 1944 self.assertLess(loss2, 2.1) 1945 1946 def testSDCANotSupported(self): 1947 """Tests that we detect error for SDCA.""" 1948 maintenance_cost = feature_column_lib.real_valued_column('maintenance_cost') 1949 sq_footage = feature_column_lib.real_valued_column('sq_footage') 1950 sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( 1951 example_id_column='example_id') 1952 with self.assertRaises(ValueError): 1953 linear.LinearEstimator( 1954 head=head_lib.regression_head(label_dimension=1), 1955 feature_columns=[maintenance_cost, sq_footage], 1956 optimizer=sdca_optimizer, 1957 _joint_weights=True) 1958 1959 1960def boston_input_fn(): 1961 boston = base.load_boston() 1962 features = math_ops.cast( 1963 array_ops.reshape(constant_op.constant(boston.data), [-1, 13]), 1964 dtypes.float32) 1965 labels = math_ops.cast( 1966 array_ops.reshape(constant_op.constant(boston.target), [-1, 1]), 1967 dtypes.float32) 1968 return features, labels 1969 1970 1971class FeatureColumnTest(test.TestCase): 1972 1973 def testTrain(self): 1974 feature_columns = estimator.infer_real_valued_columns_from_input_fn( 1975 boston_input_fn) 1976 est = linear.LinearRegressor(feature_columns=feature_columns) 1977 est.fit(input_fn=boston_input_fn, steps=1) 1978 _ = est.evaluate(input_fn=boston_input_fn, steps=1) 1979 1980 1981if __name__ == '__main__': 1982 test.main() 1983