#include "precomp.hpp"
#include <time.h>

#if 0

#define pCvSeq CvSeq*
#define pCvDTreeNode CvDTreeNode*

//===========================================================================
//----------------------------- CvGBTreesParams -----------------------------
//===========================================================================

CvGBTreesParams::CvGBTreesParams()
    : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    weak_count = 200;
    loss_function_type = CvGBTrees::SQUARED_LOSS;
    subsample_portion = 0.8f;
    shrinkage = 0.01f;
}

//===========================================================================

CvGBTreesParams::CvGBTreesParams( int _loss_function_type, int _weak_count,
                         float _shrinkage, float _subsample_portion,
                         int _max_depth, bool _use_surrogates )
    : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    loss_function_type = _loss_function_type;
    weak_count = _weak_count;
    shrinkage = _shrinkage;
    subsample_portion = _subsample_portion;
    max_depth = _max_depth;
    use_surrogates = _use_surrogates;
}

//===========================================================================
//------------------------------- CvGBTrees ---------------------------------
//===========================================================================

CvGBTrees::CvGBTrees()
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();
}

//===========================================================================

int CvGBTrees::get_len(const CvMat* mat) const
{
    return (mat->cols > mat->rows) ? mat->cols : mat->rows;
}
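
//===========================================================================
// A hedged overview, inferred from the code below: this file implements
// Friedman-style gradient boosting. The fitted model is the additive
// ensemble
//
//     F_k(x) = base_value + shrinkage * sum_{i=1..weak_count} tree_{k,i}(x),
//
// with one tree sequence per class k (class_count == 1 for regression).
// Each iteration fits a regression tree to the negative gradient of the
// chosen loss (see find_gradient) and then re-fits the leaf values by a
// per-loss line search (see find_optimal_value / change_values).
//===========================================================================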

//===========================================================================

void CvGBTrees::clear()
{
    if( weak )
    {
        CvSeqReader reader;
        CvSlice slice = CV_WHOLE_SEQ;
        CvDTree* tree;

        //data->shared = false;
        for (int i=0; i<class_count; ++i)
        {
            int weak_count = cvSliceLength( slice, weak[i] );
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, slice.start_index );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    //tree->clear();
                    delete tree;
                    tree = 0;
                }
            }
        }
        for (int i=0; i<class_count; ++i)
            if (weak[i]) cvReleaseMemStorage( &(weak[i]->storage) );
        delete[] weak;
    }
    if (data)
    {
        data->shared = false;
        delete data;
    }
    weak = 0;
    data = 0;
    delta = 0.0f;
    cvReleaseMat( &orig_response );
    cvReleaseMat( &sum_response );
    cvReleaseMat( &sum_response_tmp );
    cvReleaseMat( &subsample_train );
    cvReleaseMat( &subsample_test );
    cvReleaseMat( &sample_idx );
    cvReleaseMat( &missing );
    cvReleaseMat( &class_labels );
}

//===========================================================================

CvGBTrees::~CvGBTrees()
{
    clear();
}

//===========================================================================

CvGBTrees::CvGBTrees( const CvMat* _train_data, int _tflag,
                  const CvMat* _responses, const CvMat* _var_idx,
                  const CvMat* _sample_idx, const CvMat* _var_type,
                  const CvMat* _missing_mask, CvGBTreesParams _params )
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
           _var_type, _missing_mask, _params );
}

//===========================================================================

bool CvGBTrees::problem_type() const
{
    switch (params.loss_function_type)
    {
    case DEVIANCE_LOSS: return false;
    default: return true;
    }
}

//===========================================================================

bool
CvGBTrees::train( CvMLData* _data, CvGBTreesParams _params, bool update )
{
    bool result;
    result = train ( _data->get_values(), CV_ROW_SAMPLE,
            _data->get_responses(), _data->get_var_idx(),
            _data->get_train_sample_idx(), _data->get_var_types(),
            _data->get_missing(), _params, update);
    //update is not supported
    return result;
}

//===========================================================================


bool
CvGBTrees::train( const CvMat* _train_data, int _tflag,
              const CvMat* _responses, const CvMat* _var_idx,
              const CvMat* _sample_idx, const CvMat* _var_type,
              const CvMat* _missing_mask,
              CvGBTreesParams _params, bool /*_update*/ ) //update is not supported
{
    CvMemStorage* storage = 0;

    params = _params;
    bool is_regression = problem_type();

    clear();
    /*
      n - count of samples
      m - count of variables
    */
    int n = _train_data->rows;
    int m = _train_data->cols;
    if (_tflag != CV_ROW_SAMPLE)
    {
        int tmp;
        CV_SWAP(n,m,tmp);
    }

    CvMat* new_responses = cvCreateMat( n, 1, CV_32F);
    cvZero(new_responses);

    data = new CvDTreeTrainData( _train_data, _tflag, new_responses, _var_idx,
        _sample_idx, _var_type, _missing_mask, _params, true, true );
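    // A hedged note, inferred from the surrounding code: new_responses is all
    // zeros here and is aliased by data->responses; every boosting iteration
    // overwrites it with the negative-gradient targets in find_gradient(), so
    // the shared CvDTreeTrainData always trains trees on the current
    // pseudo-residuals rather than on the original responses.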
    if (_missing_mask)
    {
        missing = cvCreateMat(_missing_mask->rows, _missing_mask->cols,
                              _missing_mask->type);
        cvCopy( _missing_mask, missing);
    }

    orig_response = cvCreateMat( 1, n, CV_32F );
    int step = (_responses->cols > _responses->rows) ? 1 : _responses->step / CV_ELEM_SIZE(_responses->type);
    switch (CV_MAT_TYPE(_responses->type))
    {
        case CV_32FC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = _responses->data.fl[i*step];
        }; break;
        case CV_32SC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = (float) _responses->data.i[i*step];
        }; break;
        default:
            CV_Error(CV_StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
    }

    if (!is_regression)
    {
        class_count = 0;
        unsigned char * mask = new unsigned char[n];
        memset(mask, 0, n);
        // compute the count of different output classes
        for (int i=0; i<n; ++i)
            if (!mask[i])
            {
                class_count++;
                for (int j=i; j<n; ++j)
                    if (int(orig_response->data.fl[j]) == int(orig_response->data.fl[i]))
                        mask[j] = 1;
            }
        delete[] mask;

        class_labels = cvCreateMat(1, class_count, CV_32S);
        class_labels->data.i[0] = int(orig_response->data.fl[0]);
        int j = 1;
        for (int i=1; i<n; ++i)
        {
            int k = 0;
            while ((int(orig_response->data.fl[i]) - class_labels->data.i[k]) && (k<j))
                k++;
            if (k == j)
            {
                class_labels->data.i[k] = int(orig_response->data.fl[i]);
                j++;
            }
        }
    }

    // inside the gbt learning process only regression decision trees are built
    data->is_classifier = false;

    // preprocessing sample indices
    if (_sample_idx)
    {
        int sample_idx_len = get_len(_sample_idx);

        switch (CV_MAT_TYPE(_sample_idx->type))
        {
            case CV_32SC1:
            {
                sample_idx = cvCreateMat( 1, sample_idx_len, CV_32S );
                for (int i=0; i<sample_idx_len; ++i)
                    sample_idx->data.i[i] = _sample_idx->data.i[i];
                std::sort(sample_idx->data.i, sample_idx->data.i + sample_idx_len);
            } break;
            case CV_8S:
            case CV_8U:
            {
                int active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    active_samples_count += int( _sample_idx->data.ptr[i] );
                sample_idx = cvCreateMat( 1, active_samples_count, CV_32S );
                active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    if (int( _sample_idx->data.ptr[i] ))
                        sample_idx->data.i[active_samples_count++] = i;

            } break;
            default: CV_Error(CV_StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
        }
    }
    else
    {
        sample_idx = cvCreateMat( 1, n, CV_32S );
        for (int i=0; i<n; ++i)
            sample_idx->data.i[i] = i;
    }

    sum_response = cvCreateMat(class_count, n, CV_32F);
    sum_response_tmp = cvCreateMat(class_count, n, CV_32F);
    cvZero(sum_response);

    delta = 0.0f;
    /*
      in the case of a regression problem the initial guess (the zero term
      in the sum) is set to the mean of all the training responses, that is
      the best constant model
    */
    if (is_regression) base_value = find_optimal_value(sample_idx);
    /*
      in the case of a classification problem the initial guess (the zero term
      in the sum) is set to zero for all the tree sequences
    */
    else base_value = 0.0f;
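    /*
      A hedged formulation, inferred from find_optimal_value: the constant
      initial model is the loss minimizer

          F_0 = argmin_gamma sum_i L(y_i, gamma),

      i.e. the mean of the responses for SQUARED_LOSS and their median for
      ABSOLUTE_LOSS.
    */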
    /*
      the current prediction on all training samples is set to be
      equal to the base_value
    */
    cvSet( sum_response, cvScalar(base_value) );

    weak = new pCvSeq[class_count];
    for (int i=0; i<class_count; ++i)
    {
        storage = cvCreateMemStorage();
        weak[i] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );
        storage = 0;
    }

    // subsample params and data
    rng = &cv::theRNG();

    int samples_count = get_len(sample_idx);

    params.subsample_portion = params.subsample_portion <= FLT_EPSILON ||
        1 - params.subsample_portion <= FLT_EPSILON
        ? 1 : params.subsample_portion;
    int train_sample_count = cvFloor(params.subsample_portion * samples_count);
    if (train_sample_count == 0)
        train_sample_count = samples_count;
    int test_sample_count = samples_count - train_sample_count;
    int* idx_data = new int[samples_count];
    subsample_train = cvCreateMatHeader( 1, train_sample_count, CV_32SC1 );
    *subsample_train = cvMat( 1, train_sample_count, CV_32SC1, idx_data );
    if (test_sample_count)
    {
        subsample_test = cvCreateMatHeader( 1, test_sample_count, CV_32SC1 );
        *subsample_test = cvMat( 1, test_sample_count, CV_32SC1,
                                 idx_data + train_sample_count );
    }

    // training procedure

    for ( int i=0; i < params.weak_count; ++i )
    {
        do_subsample();
        for ( int k=0; k < class_count; ++k )
        {
            find_gradient(k);
            CvDTree* tree = new CvDTree;
            tree->train( data, subsample_train );
            change_values(tree, k);

            if (subsample_test)
            {
                CvMat x;
                CvMat x_miss;
                int* sample_data = sample_idx->data.i;
                int* subsample_data = subsample_test->data.i;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                for (int j=0; j<get_len(subsample_test); ++j)
                {
                    int idx = *(sample_data + subsample_data[j]*s_step);
                    float res = 0.0f;
                    if (_tflag == CV_ROW_SAMPLE)
                        cvGetRow( data->train_data, &x, idx);
                    else
                        cvGetCol( data->train_data, &x, idx);

                    if (missing)
                    {
                        if (_tflag == CV_ROW_SAMPLE)
                            cvGetRow( missing, &x_miss, idx);
                        else
                            cvGetCol( missing, &x_miss, idx);

                        res = (float)tree->predict(&x, &x_miss)->value;
                    }
                    else
                    {
                        res = (float)tree->predict(&x)->value;
                    }
                    sum_response_tmp->data.fl[idx + k*n] =
                        sum_response->data.fl[idx + k*n] +
                        params.shrinkage * res;
                }
            }

            cvSeqPush( weak[k], &tree );
            tree = 0;
        } // k=0..class_count
        CvMat* tmp;
        tmp = sum_response_tmp;
        sum_response_tmp = sum_response;
        sum_response = tmp;
        tmp = 0;
    } // i=0..params.weak_count

    delete[] idx_data;
    cvReleaseMat(&new_responses);
    data->free_train_data();

    return true;

} // CvGBTrees::train(...)
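
//===========================================================================
// A minimal usage sketch for the trainer above (a hedged example, assuming
// the OpenCV 2.4-era wrappers declared in this module; the function name
// example_train_gbt and the matrices X and y are hypothetical placeholders).
static void example_train_gbt()
{
    // 100 samples, 5 ordered features, one scalar response per sample.
    cv::Mat X( 100, 5, CV_32F ), y( 100, 1, CV_32F );
    cv::randu( X, cv::Scalar(0.0), cv::Scalar(1.0) );
    cv::randu( y, cv::Scalar(0.0), cv::Scalar(1.0) );

    CvGBTreesParams params( CvGBTrees::SQUARED_LOSS, /*weak_count=*/100,
                            /*shrinkage=*/0.1f, /*subsample_portion=*/0.8f,
                            /*max_depth=*/3, /*use_surrogates=*/false );

    CvGBTrees gbt;
    gbt.train( X, CV_ROW_SAMPLE, y, cv::Mat(), cv::Mat(), cv::Mat(),
               cv::Mat(), params, false );

    // Predict the response of the first training sample over all trees.
    float pred = gbt.predict( X.row(0), cv::Mat(), cv::Range::all(), -1 );
    (void)pred;
}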

//===========================================================================

inline float Sign(float x)
{
    if (x<0.0f) return -1.0f;
    else if (x>0.0f) return 1.0f;
    return 0.0f;
}

//===========================================================================

void CvGBTrees::find_gradient(const int k)
{
    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    float* grad_data = data->responses->data.fl;
    float* resp_data = orig_response->data.fl;
    float* current_data = sum_response->data.fl;

    switch (params.loss_function_type)
    // loss_function_type in
    // {SQUARED_LOSS, ABSOLUTE_LOSS, HUBER_LOSS, DEVIANCE_LOSS}
    {
        case SQUARED_LOSS:
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);
                grad_data[idx] = resp_data[idx] - current_data[idx];
            }
        }; break;

        case ABSOLUTE_LOSS:
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);
                grad_data[idx] = Sign(resp_data[idx] - current_data[idx]);
            }
        }; break;

        case HUBER_LOSS:
        {
            float alpha = 0.2f;
            int n = get_len(subsample_train);
            int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                         : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

            float* residuals = new float[n];
            for (int i=0; i<n; ++i)
            {
                int idx = *(sample_data + subsample_data[i]*s_step);
                residuals[i] = fabs(resp_data[idx] - current_data[idx]);
            }
            std::sort(residuals, residuals + n);

            // clamp the quantile index: ceil(n*alpha) can equal n for tiny n
            // (e.g. n == 1), which would read past the end of residuals[]
            delta = residuals[std::min(n-1, int(ceil(n*alpha)))];

            for (int i=0; i<n; ++i)
            {
                int idx = *(sample_data + subsample_data[i]*s_step);
                float r = resp_data[idx] - current_data[idx];
                grad_data[idx] = (fabs(r) > delta) ? delta*Sign(r) : r;
            }
            delete[] residuals;

        }; break;

        case DEVIANCE_LOSS:
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                double exp_fk = 0;
                double exp_sfi = 0;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);

                for (int j=0; j<class_count; ++j)
                {
                    double res;
                    res = current_data[idx + j*sum_response->cols];
                    res = exp(res);
                    if (j == k) exp_fk = res;
                    exp_sfi += res;
                }
                int orig_label = int(resp_data[idx]);
                /*
                grad_data[idx] = (float)(!(k-class_labels->data.i[orig_label]+1)) -
                                 (float)(exp_fk / exp_sfi);
                */
                int ensemble_label = 0;
                while (class_labels->data.i[ensemble_label] - orig_label)
                    ensemble_label++;

                grad_data[idx] = (float)(!(k-ensemble_label)) -
                                 (float)(exp_fk / exp_sfi);
            }
        }; break;

        default: break;
    }

} // CvGBTrees::find_gradient(...)
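
// A hedged summary of the cases above: find_gradient writes the negative
// loss gradient at the current prediction F into data->responses, so the
// next tree is fit to pseudo-residuals:
//
//   SQUARED_LOSS:  -dL/dF = y - F
//   ABSOLUTE_LOSS: -dL/dF = sign(y - F)
//   HUBER_LOSS:    -dL/dF = clip(y - F, +-delta), with delta the
//                  alpha-quantile (alpha = 0.2) of |y - F| over the subsample
//   DEVIANCE_LOSS: -dL/dF_k = 1{y == k} - exp(F_k) / sum_j exp(F_j)
//                  (the softmax form of the K-class multinomial deviance)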

//===========================================================================

void CvGBTrees::change_values(CvDTree* tree, const int _k)
{
    CvDTreeNode** predictions = new pCvDTreeNode[get_len(subsample_train)];

    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

    CvMat x;
    CvMat miss_x;

    for (int i=0; i<get_len(subsample_train); ++i)
    {
        int idx = *(sample_data + subsample_data[i]*s_step);
        if (data->tflag == CV_ROW_SAMPLE)
            cvGetRow( data->train_data, &x, idx);
        else
            cvGetCol( data->train_data, &x, idx);

        if (missing)
        {
            if (data->tflag == CV_ROW_SAMPLE)
                cvGetRow( missing, &miss_x, idx);
            else
                cvGetCol( missing, &miss_x, idx);

            predictions[i] = tree->predict(&x, &miss_x);
        }
        else
            predictions[i] = tree->predict(&x);
    }


    CvDTreeNode** leaves;
    int leaves_count = 0;
    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        int samples_in_leaf = 0;
        for (int j=0; j<get_len(subsample_train); ++j)
        {
            if (leaves[i] == predictions[j]) samples_in_leaf++;
        }

        if (!samples_in_leaf) // an empty leaf should never occur, but guard anyway
        {
            leaves[i]->value = 0.0;
            continue;
        }

        CvMat* leaf_idx = cvCreateMat(1, samples_in_leaf, CV_32S);
        int* leaf_idx_data = leaf_idx->data.i;

        for (int j=0; j<get_len(subsample_train); ++j)
        {
            int idx = *(sample_data + subsample_data[j]*s_step);
            if (leaves[i] == predictions[j])
                *leaf_idx_data++ = idx;
        }

        float value = find_optimal_value(leaf_idx);
        leaves[i]->value = value;

        leaf_idx_data = leaf_idx->data.i;

        int len = sum_response_tmp->cols;
        for (int j=0; j<get_len(leaf_idx); ++j)
        {
            int idx = leaf_idx_data[j];
            sum_response_tmp->data.fl[idx + _k*len] =
                sum_response->data.fl[idx + _k*len] +
                params.shrinkage * value;
        }
        leaf_idx_data = 0;
        cvReleaseMat(&leaf_idx);
    }

    // releasing the memory
    for (int i=0; i<get_len(subsample_train); ++i)
    {
        predictions[i] = 0;
    }
    delete[] predictions;

    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

}

//===========================================================================
/*
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{

    CvDTreeNode** leaves;
    int leaves_count = 0;
    int offset = _k*sum_response_tmp->cols;
    CvMat leaf_idx;
    leaf_idx.rows = 1;

    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        int n = leaves[i]->sample_count;
        int* leaf_idx_data = new int[n];
        data->get_sample_indices(leaves[i], leaf_idx_data);
        //CvMat* leaf_idx = new CvMat();
        //cvInitMatHeader(leaf_idx, n, 1, CV_32S, leaf_idx_data);
        leaf_idx.cols = n;
        leaf_idx.data.i = leaf_idx_data;

        float value = find_optimal_value(&leaf_idx);
        leaves[i]->value = value;
        float val = params.shrinkage * value;


        for (int j=0; j<n; ++j)
        {
            int idx = leaf_idx_data[j] + offset;
            sum_response_tmp->data.fl[idx] = sum_response->data.fl[idx] + val;
        }
        //leaf_idx_data = 0;
        //cvReleaseMat(&leaf_idx);
        leaf_idx.data.i = 0;
        //delete leaf_idx;
        delete[] leaf_idx_data;
    }

    // releasing the memory
    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

} //change_values(...);
*/
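
// A hedged summary of find_optimal_value below: it returns the constant
// gamma minimizing the loss over the samples in _Idx — the mean residual
// for SQUARED_LOSS, the median residual for ABSOLUTE_LOSS, a one-step
// M-estimate around the median for HUBER_LOSS, and the one-step Newton
// approximation (K-1)/K * sum(g) / sum(|g|(1-|g|)) for DEVIANCE_LOSS,
// where g are the gradients stored in data->responses.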
//===========================================================================

float CvGBTrees::find_optimal_value( const CvMat* _Idx )
{

    double gamma = (double)0.0;

    int* idx = _Idx->data.i;
    float* resp_data = orig_response->data.fl;
    float* cur_data = sum_response->data.fl;
    int n = get_len(_Idx);

    switch (params.loss_function_type)
    // SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3, DEVIANCE_LOSS=4
    {
    case SQUARED_LOSS:
        {
            for (int i=0; i<n; ++i)
                gamma += resp_data[idx[i]] - cur_data[idx[i]];
            gamma /= (double)n;
        }; break;

    case ABSOLUTE_LOSS:
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            std::sort(residuals, residuals + n);
            if (n % 2)
                gamma = residuals[n/2];
            else gamma = (residuals[n/2-1] + residuals[n/2]) / 2.0f;
            delete[] residuals;
        }; break;

    case HUBER_LOSS:
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            std::sort(residuals, residuals + n);

            int n_half = n >> 1;
            float r_median = (n == n_half<<1) ?
                        (residuals[n_half-1] + residuals[n_half]) / 2.0f :
                        residuals[n_half];

            for (int i=0; i<n; ++i)
            {
                float dif = residuals[i] - r_median;
                gamma += (delta < fabs(dif)) ? Sign(dif)*delta : dif;
            }
            gamma /= (double)n;
            gamma += r_median;
            delete[] residuals;

        }; break;

    case DEVIANCE_LOSS:
        {
            float* grad_data = data->responses->data.fl;
            double tmp1 = 0;
            double tmp2 = 0;
            double tmp  = 0;
            for (int i=0; i<n; ++i)
            {
                tmp = grad_data[idx[i]];
                tmp1 += tmp;
                tmp2 += fabs(tmp)*(1-fabs(tmp));
            };
            if (tmp2 == 0)
            {
                tmp2 = 1;
            }

            gamma = ((double)(class_count-1)) / (double)class_count * (tmp1/tmp2);
        }; break;

    default: break;
    }

    return float(gamma);

} // CvGBTrees::find_optimal_value

//===========================================================================


void CvGBTrees::leaves_get( CvDTreeNode** leaves, int& count, CvDTreeNode* node )
{
    if (node->left != NULL)  leaves_get(leaves, count, node->left);
    if (node->right != NULL) leaves_get(leaves, count, node->right);
    if ((node->left == NULL) && (node->right == NULL))
        leaves[count++] = node;
}

//---------------------------------------------------------------------------

CvDTreeNode** CvGBTrees::GetLeaves( const CvDTree* dtree, int& len )
{
    len = 0;
    CvDTreeNode** leaves = new pCvDTreeNode[(size_t)1 << params.max_depth];
    leaves_get(leaves, len, const_cast<pCvDTreeNode>(dtree->get_root()));
    return leaves;
}

//===========================================================================

void CvGBTrees::do_subsample()
{

    int n = get_len(sample_idx);
    int* idx = subsample_train->data.i;

    for (int i = 0; i < n; i++ )
        idx[i] = i;

    if (subsample_test)
        for (int i = 0; i < n; i++)
        {
            int a = (*rng)(n);
            int b = (*rng)(n);
            int t;
            CV_SWAP( idx[a], idx[b], t );
        }

    /*
    int n = get_len(sample_idx);
    if (subsample_train == 0)
        subsample_train = cvCreateMat(1, n, CV_32S);
    int* subsample_data = subsample_train->data.i;
    for (int i=0; i<n; ++i)
        subsample_data[i] = i;
    subsample_test = 0;
    */
}
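
// A hedged remark on the two routines adjoining this point: do_subsample
// permutes the indices 0..n-1 with n random transpositions (only when a
// held-out test portion exists); the first train_sample_count entries then
// form the stochastic-boosting subsample. The serial predictor below
// evaluates
//
//     sum_k(x) = base_value + shrinkage * sum_j tree_{k,j}(x)
//
// for every class k and, for classification, returns the class label with
// the largest sum.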

//===========================================================================

float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
        CvMat* weak_responses, CvSlice slice, int k) const
{
    float result = 0.0f;

    if (!weak) return 0.0f;

    CvSeqReader reader;
    int weak_count = cvSliceLength( slice, weak[class_count-1] );
    CvDTree* tree;

    if (weak_responses)
    {
        if (CV_MAT_TYPE(weak_responses->type) != CV_32F)
            return 0.0f;
        if ((k >= 0) && (k<class_count) && (weak_responses->rows != 1))
            return 0.0f;
        if ((k == -1) && (weak_responses->rows != class_count))
            return 0.0f;
        if (weak_responses->cols != weak_count)
            return 0.0f;
    }

    float* sum = new float[class_count];
    memset(sum, 0, class_count*sizeof(float));

    for (int i=0; i<class_count; ++i)
    {
        if ((weak[i]) && (weak_count))
        {
            cvStartReadSeq( weak[i], &reader );
            cvSetSeqReaderPos( &reader, slice.start_index );
            for (int j=0; j<weak_count; ++j)
            {
                CV_READ_SEQ_ELEM( tree, reader );
                float p = (float)(tree->predict(_sample, _missing)->value);
                sum[i] += params.shrinkage * p;
                if (weak_responses)
                    weak_responses->data.fl[i*weak_count+j] = p;
            }
        }
    }

    for (int i=0; i<class_count; ++i)
        sum[i] += base_value;

    if (class_count == 1)
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;

    /*
    int orig_class_label = -1;
    for (int i=0; i<get_len(class_labels); ++i)
        if (class_labels->data.i[i] == class_label+1)
            orig_class_label = i;
    */
    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}


class Tree_predictor : public cv::ParallelLoopBody
{
private:
    pCvSeq* weak;
    float* sum;
    const int k;
    const CvMat* sample;
    const CvMat* missing;
    const float shrinkage;

    static cv::Mutex SumMutex;


public:
    Tree_predictor() : weak(0), sum(0), k(0), sample(0), missing(0), shrinkage(1.0f) {}
    Tree_predictor(pCvSeq* _weak, const int _k, const float _shrinkage,
                   const CvMat* _sample, const CvMat* _missing, float* _sum ) :
                   weak(_weak), sum(_sum), k(_k), sample(_sample),
                   missing(_missing), shrinkage(_shrinkage)
    {}

    Tree_predictor( const Tree_predictor& p, cv::Split ) :
            weak(p.weak), sum(p.sum), k(p.k), sample(p.sample),
            missing(p.missing), shrinkage(p.shrinkage)
    {}

    Tree_predictor& operator=( const Tree_predictor& )
    { return *this; }

    virtual void operator()(const cv::Range& range) const
    {
        CvSeqReader reader;
        int begin = range.start;
        int end = range.end;

        int weak_count = end - begin;
        CvDTree* tree;

        for (int i=0; i<k; ++i)
        {
            float tmp_sum = 0.0f;
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, begin );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
                }
            }

            {
                cv::AutoLock lock(SumMutex);
                sum[i] += tmp_sum;
            }
        }
    } // Tree_predictor::operator()

    virtual ~Tree_predictor() {}

}; // class Tree_predictor

cv::Mutex Tree_predictor::SumMutex;
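
// A hedged remark on the parallel predict below: cv::parallel_for_ splits
// the tree range across workers; each worker accumulates its
// shrinkage-weighted partial sums locally and merges them into the shared
// sum[] under SumMutex, so tree evaluation order does not affect the result
// (floating-point rounding aside).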

float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
            CvMat* /*weak_responses*/, CvSlice slice, int k) const
{
    float result = 0.0f;
    if (!weak) return 0.0f;
    float* sum = new float[class_count];
    for (int i=0; i<class_count; ++i)
        sum[i] = 0.0f;
    int begin = slice.start_index;
    int end = begin + cvSliceLength( slice, weak[0] );

    pCvSeq* weak_seq = weak;
    Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
                                    params.shrinkage, _sample, _missing, sum);

    cv::parallel_for_(cv::Range(begin, end), predictor);

    for (int i=0; i<class_count; ++i)
        sum[i] = sum[i] /** params.shrinkage*/ + base_value;

    if (class_count == 1)
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;
    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}


//===========================================================================

void CvGBTrees::write_params( CvFileStorage* fs ) const
{
    const char* loss_function_type_str =
        params.loss_function_type == SQUARED_LOSS ? "SquaredLoss" :
        params.loss_function_type == ABSOLUTE_LOSS ? "AbsoluteLoss" :
        params.loss_function_type == HUBER_LOSS ? "HuberLoss" :
        params.loss_function_type == DEVIANCE_LOSS ? "DevianceLoss" : 0;


    if( loss_function_type_str )
        cvWriteString( fs, "loss_function", loss_function_type_str );
    else
        cvWriteInt( fs, "loss_function", params.loss_function_type );

    cvWriteInt( fs, "ensemble_length", params.weak_count );
    cvWriteReal( fs, "shrinkage", params.shrinkage );
    cvWriteReal( fs, "subsample_portion", params.subsample_portion );
    //cvWriteInt( fs, "max_tree_depth", params.max_depth );
    //cvWriteString( fs, "use_surrogate_splits", params.use_surrogates ? "true" : "false");
    if (class_labels) cvWrite( fs, "class_labels", class_labels);

    data->is_classifier = !problem_type();
    data->write_params( fs );
    data->is_classifier = 0;
}
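
// A hedged summary of the storage layout implied by write_params and
// write/read below: a serialized model stores loss_function,
// ensemble_length, shrinkage, subsample_portion, the training-data params,
// optional class_labels, then base_value, class_count and one trees_<k>
// sequence per class.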

//===========================================================================

void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode )
{
    CV_FUNCNAME( "CvGBTrees::read_params" );
    __BEGIN__;


    CvFileNode* temp;

    if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
        return;

    data = new CvDTreeTrainData();
    CV_CALL( data->read_params(fs, fnode));
    data->shared = true;

    params.max_depth = data->params.max_depth;
    params.min_sample_count = data->params.min_sample_count;
    params.max_categories = data->params.max_categories;
    params.priors = data->params.priors;
    params.regression_accuracy = data->params.regression_accuracy;
    params.use_surrogates = data->params.use_surrogates;

    temp = cvGetFileNodeByName( fs, fnode, "loss_function" );
    if( !temp )
        EXIT;

    if( temp && CV_NODE_IS_STRING(temp->tag) )
    {
        const char* loss_function_type_str = cvReadString( temp, "" );
        params.loss_function_type =
            strcmp( loss_function_type_str, "SquaredLoss" ) == 0 ? SQUARED_LOSS :
            strcmp( loss_function_type_str, "AbsoluteLoss" ) == 0 ? ABSOLUTE_LOSS :
            strcmp( loss_function_type_str, "HuberLoss" ) == 0 ? HUBER_LOSS :
            strcmp( loss_function_type_str, "DevianceLoss" ) == 0 ? DEVIANCE_LOSS : -1;
    }
    else
        params.loss_function_type = cvReadInt( temp, -1 );


    if( params.loss_function_type < SQUARED_LOSS || params.loss_function_type > DEVIANCE_LOSS || params.loss_function_type == 2)
        CV_ERROR( CV_StsBadArg, "Unknown loss function" );

    params.weak_count = cvReadIntByName( fs, fnode, "ensemble_length" );
    params.shrinkage = (float)cvReadRealByName( fs, fnode, "shrinkage", 0.1 );
    params.subsample_portion = (float)cvReadRealByName( fs, fnode, "subsample_portion", 1.0 );

    if (data->is_classifier)
    {
        class_labels = (CvMat*)cvReadByName( fs, fnode, "class_labels" );
        if( class_labels && !CV_IS_MAT(class_labels))
            CV_ERROR( CV_StsParseError, "class_labels must be stored as a matrix");
    }
    data->is_classifier = 0;

    __END__;
}




void CvGBTrees::write( CvFileStorage* fs, const char* name ) const
{
    CV_FUNCNAME( "CvGBTrees::write" );

    __BEGIN__;

    CvSeqReader reader;
    int i;
    cv::String s;

    cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_GBT );

    if( !weak )
        CV_ERROR( CV_StsBadArg, "The model has not been trained yet" );

    write_params( fs );
    cvWriteReal( fs, "base_value", base_value);
    cvWriteInt( fs, "class_count", class_count);

    for ( int j=0; j < class_count; ++j )
    {
        s = cv::format("trees_%d", j);
        cvStartWriteStruct( fs, s.c_str(), CV_NODE_SEQ );

        cvStartReadSeq( weak[j], &reader );

        for( i = 0; i < weak[j]->total; i++ )
        {
            CvDTree* tree;
            CV_READ_SEQ_ELEM( tree, reader );
            cvStartWriteStruct( fs, 0, CV_NODE_MAP );
            tree->write( fs );
            cvEndWriteStruct( fs );
        }

        cvEndWriteStruct( fs );
    }

    cvEndWriteStruct( fs );

    __END__;
}


//===========================================================================


void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
{

    CV_FUNCNAME( "CvGBTrees::read" );

    __BEGIN__;

    CvSeqReader reader;
    CvFileNode* trees_fnode;
    CvMemStorage* storage;
    int i, ntrees;
    cv::String s;

    clear();
    read_params( fs, node );

    if( !data )
        EXIT;

    base_value = (float)cvReadRealByName( fs, node, "base_value", 0.0 );
    class_count = cvReadIntByName( fs, node, "class_count", 1 );

    weak = new pCvSeq[class_count];


    for (int j=0; j<class_count; ++j)
    {
        s = cv::format("trees_%d", j);

        trees_fnode = cvGetFileNodeByName( fs, node, s.c_str() );
        if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
            CV_ERROR( CV_StsParseError, "<trees_x> tag is missing" );

        cvStartReadSeq( trees_fnode->data.seq, &reader );
        ntrees = trees_fnode->data.seq->total;

        if( ntrees != params.weak_count )
            CV_ERROR( CV_StsUnmatchedSizes,
            "The number of trees stored does not match <ntrees> tag value" );

        CV_CALL( storage = cvCreateMemStorage() );
        weak[j] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );

        for( i = 0; i < ntrees; i++ )
        {
            CvDTree* tree = new CvDTree();
            CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, data ));
            CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
            cvSeqPush( weak[j], &tree );
        }
    }

    __END__;
}
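
// A minimal save/load sketch (a hedged example, assuming the
// CvStatModel::save/load entry points of the 2.4-era API; the file name is
// hypothetical):
//
//     CvGBTrees gbt;
//     /* ... train ... */
//     gbt.save( "gbtrees_model.yml" );
//
//     CvGBTrees loaded;
//     loaded.load( "gbtrees_model.yml" );  // drives read()/read_params()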

//===========================================================================

class Sample_predictor : public cv::ParallelLoopBody
{
private:
    const CvGBTrees* gbt;
    float* predictions;
    const CvMat* samples;
    const CvMat* missing;
    const CvMat* idx;
    CvSlice slice;

public:
    Sample_predictor() : gbt(0), predictions(0), samples(0), missing(0),
                         idx(0), slice(CV_WHOLE_SEQ)
    {}

    Sample_predictor(const CvGBTrees* _gbt, float* _predictions,
                   const CvMat* _samples, const CvMat* _missing,
                   const CvMat* _idx, CvSlice _slice=CV_WHOLE_SEQ) :
                   gbt(_gbt), predictions(_predictions), samples(_samples),
                   missing(_missing), idx(_idx), slice(_slice)
    {}


    Sample_predictor( const Sample_predictor& p, cv::Split ) :
            gbt(p.gbt), predictions(p.predictions),
            samples(p.samples), missing(p.missing), idx(p.idx),
            slice(p.slice)
    {}


    virtual void operator()(const cv::Range& range) const
    {
        int begin = range.start;
        int end = range.end;

        CvMat x;
        CvMat miss;

        for (int i=begin; i<end; ++i)
        {
            int j = idx ? idx->data.i[i] : i;
            cvGetRow(samples, &x, j);
            if (!missing)
            {
                predictions[i] = gbt->predict_serial(&x,0,0,slice);
            }
            else
            {
                cvGetRow(missing, &miss, j);
                predictions[i] = gbt->predict_serial(&x,&miss,0,slice);
            }
        }
    } // Sample_predictor::operator()

    virtual ~Sample_predictor() {}

}; // class Sample_predictor
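
// A hedged summary of calc_error below: for classification the error is the
// misclassification rate in percent; for regression it is the mean squared
// error over the selected (train or test) sample set.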

// type in {CV_TRAIN_ERROR, CV_TEST_ERROR}
float
CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
{

    float err = 0.0f;
    const CvMat* _sample_idx = (type == CV_TRAIN_ERROR) ?
                              _data->get_train_sample_idx() :
                              _data->get_test_sample_idx();
    const CvMat* response = _data->get_responses();

    int n = _sample_idx ? get_len(_sample_idx) : 0;
    n = (type == CV_TRAIN_ERROR && n == 0) ? _data->get_values()->rows : n;

    if (!n)
        return -FLT_MAX;

    float* pred_resp = 0;
    if (resp)
    {
        resp->resize(n);
        pred_resp = &((*resp)[0]);
    }
    else
        pred_resp = new float[n];

    Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
            _data->get_missing(), _sample_idx);

    cv::parallel_for_(cv::Range(0,n), predictor);

    int* sidx = _sample_idx ? _sample_idx->data.i : 0;
    int r_step = CV_IS_MAT_CONT(response->type) ?
                1 : response->step / CV_ELEM_SIZE(response->type);


    if ( !problem_type() )
    {
        for( int i = 0; i < n; i++ )
        {
            int si = sidx ? sidx[i] : i;
            int d = fabs((double)pred_resp[i] - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
            err += d;
        }
        err = err / (float)n * 100.0f;
    }
    else
    {
        for( int i = 0; i < n; i++ )
        {
            int si = sidx ? sidx[i] : i;
            float d = pred_resp[i] - response->data.fl[si*r_step];
            err += d*d;
        }
        err = err / (float)n;
    }

    if (!resp)
        delete[] pred_resp; // locally allocated when the caller passed no resp

    return err;
}


CvGBTrees::CvGBTrees( const cv::Mat& trainData, int tflag,
          const cv::Mat& responses, const cv::Mat& varIdx,
          const cv::Mat& sampleIdx, const cv::Mat& varType,
          const cv::Mat& missingDataMask,
          CvGBTreesParams _params )
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();

    train(trainData, tflag, responses, varIdx, sampleIdx, varType, missingDataMask, _params, false);
}

bool CvGBTrees::train( const cv::Mat& trainData, int tflag,
                   const cv::Mat& responses, const cv::Mat& varIdx,
                   const cv::Mat& sampleIdx, const cv::Mat& varType,
                   const cv::Mat& missingDataMask,
                   CvGBTreesParams _params,
                   bool update )
{
    CvMat _trainData = trainData, _responses = responses;
    CvMat _varIdx = varIdx, _sampleIdx = sampleIdx, _varType = varType;
    CvMat _missingDataMask = missingDataMask;

    return train( &_trainData, tflag, &_responses, varIdx.empty() ? 0 : &_varIdx,
                  sampleIdx.empty() ? 0 : &_sampleIdx, varType.empty() ? 0 : &_varType,
                  missingDataMask.empty() ? 0 : &_missingDataMask, _params, update);
}

float CvGBTrees::predict( const cv::Mat& sample, const cv::Mat& _missing,
                          const cv::Range& slice, int k ) const
{
    CvMat _sample = sample, miss = _missing;
    return predict(&_sample, _missing.empty() ? 0 : &miss, 0,
                   slice==cv::Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end), k);
}

#endif