#include "precomp.hpp"
#include <time.h>

#if 0

#define pCvSeq CvSeq*
#define pCvDTreeNode CvDTreeNode*

//===========================================================================
//----------------------------- CvGBTreesParams -----------------------------
//===========================================================================

CvGBTreesParams::CvGBTreesParams()
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    weak_count = 200;
    loss_function_type = CvGBTrees::SQUARED_LOSS;
    subsample_portion = 0.8f;
    shrinkage = 0.01f;
}

//===========================================================================

CvGBTreesParams::CvGBTreesParams( int _loss_function_type, int _weak_count,
                         float _shrinkage, float _subsample_portion,
                         int _max_depth, bool _use_surrogates )
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    loss_function_type = _loss_function_type;
    weak_count = _weak_count;
    shrinkage = _shrinkage;
    subsample_portion = _subsample_portion;
    max_depth = _max_depth;
    use_surrogates = _use_surrogates;
}
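
// A minimal usage sketch (illustrative, not from the original sources):
// constructing parameters for a 100-tree regression ensemble with the
// constructor above.
/*
    CvGBTreesParams params( CvGBTrees::SQUARED_LOSS, // loss_function_type
                            100,                     // weak_count
                            0.1f,                    // shrinkage
                            0.8f,                    // subsample_portion
                            3,                       // max_depth
                            false );                 // use_surrogates
*/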

//===========================================================================
//------------------------------- CvGBTrees ---------------------------------
//===========================================================================

CvGBTrees::CvGBTrees()
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();
}

//===========================================================================

int CvGBTrees::get_len(const CvMat* mat) const
{
    return (mat->cols > mat->rows) ? mat->cols : mat->rows;
}

//===========================================================================

void CvGBTrees::clear()
{
    if( weak )
    {
        CvSeqReader reader;
        CvSlice slice = CV_WHOLE_SEQ;
        CvDTree* tree;

        //data->shared = false;
        for (int i=0; i<class_count; ++i)
        {
            int weak_count = cvSliceLength( slice, weak[i] );
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, slice.start_index );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    //tree->clear();
                    delete tree;
                    tree = 0;
                }
            }
        }
        for (int i=0; i<class_count; ++i)
            if (weak[i]) cvReleaseMemStorage( &(weak[i]->storage) );
        delete[] weak;
    }
    if (data)
    {
        data->shared = false;
        delete data;
    }
    weak = 0;
    data = 0;
    delta = 0.0f;
    cvReleaseMat( &orig_response );
    cvReleaseMat( &sum_response );
    cvReleaseMat( &sum_response_tmp );
    cvReleaseMat( &subsample_train );
    cvReleaseMat( &subsample_test );
    cvReleaseMat( &sample_idx );
    cvReleaseMat( &missing );
    cvReleaseMat( &class_labels );
}

//===========================================================================

CvGBTrees::~CvGBTrees()
{
    clear();
}

//===========================================================================

CvGBTrees::CvGBTrees( const CvMat* _train_data, int _tflag,
                  const CvMat* _responses, const CvMat* _var_idx,
                  const CvMat* _sample_idx, const CvMat* _var_type,
                  const CvMat* _missing_mask, CvGBTreesParams _params )
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
           _var_type, _missing_mask, _params );
}

//===========================================================================

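// Returns true for the regression losses (squared, absolute, Huber) and
// false for DEVIANCE_LOSS, the multi-class classification loss.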
bool CvGBTrees::problem_type() const
{
    switch (params.loss_function_type)
    {
    case DEVIANCE_LOSS: return false;
    default: return true;
    }
}

//===========================================================================

bool
CvGBTrees::train( CvMLData* _data, CvGBTreesParams _params, bool update )
{
    bool result;
    result = train ( _data->get_values(), CV_ROW_SAMPLE,
            _data->get_responses(), _data->get_var_idx(),
            _data->get_train_sample_idx(), _data->get_var_types(),
            _data->get_missing(), _params, update);
                                         //update is not supported
    return result;
}

//===========================================================================


bool
CvGBTrees::train( const CvMat* _train_data, int _tflag,
              const CvMat* _responses, const CvMat* _var_idx,
              const CvMat* _sample_idx, const CvMat* _var_type,
              const CvMat* _missing_mask,
              CvGBTreesParams _params, bool /*_update*/ ) //update is not supported
{
    CvMemStorage* storage = 0;

    params = _params;
    bool is_regression = problem_type();

    clear();
    /*
      n - count of samples
      m - count of variables
    */
    int n = _train_data->rows;
    int m = _train_data->cols;
    if (_tflag != CV_ROW_SAMPLE)
    {
        int tmp;
        CV_SWAP(n,m,tmp);
    }
    CvMat* new_responses = cvCreateMat( n, 1, CV_32F);
    cvZero(new_responses);

    data = new CvDTreeTrainData( _train_data, _tflag, new_responses, _var_idx,
        _sample_idx, _var_type, _missing_mask, _params, true, true );
    if (_missing_mask)
    {
        missing = cvCreateMat(_missing_mask->rows, _missing_mask->cols,
                              _missing_mask->type);
        cvCopy( _missing_mask, missing);
    }

    orig_response = cvCreateMat( 1, n, CV_32F );
    int step = (_responses->cols > _responses->rows) ? 1 : _responses->step / CV_ELEM_SIZE(_responses->type);
    switch (CV_MAT_TYPE(_responses->type))
    {
        case CV_32FC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = _responses->data.fl[i*step];
        }; break;
        case CV_32SC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = (float) _responses->data.i[i*step];
        }; break;
        default:
            CV_Error(CV_StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
    }

    if (!is_regression)
    {
        class_count = 0;
        unsigned char * mask = new unsigned char[n];
        memset(mask, 0, n);
        // compute the count of different output classes
        for (int i=0; i<n; ++i)
            if (!mask[i])
            {
                class_count++;
                for (int j=i; j<n; ++j)
                    if (int(orig_response->data.fl[j]) == int(orig_response->data.fl[i]))
                        mask[j] = 1;
            }
        delete[] mask;

        class_labels = cvCreateMat(1, class_count, CV_32S);
        class_labels->data.i[0] = int(orig_response->data.fl[0]);
        int j = 1;
        for (int i=1; i<n; ++i)
        {
            int k = 0;
            // check k < j before reading class_labels->data.i[k], so the scan
            // never indexes one past the filled part of the label array
            while ((k < j) && (int(orig_response->data.fl[i]) != class_labels->data.i[k]))
                k++;
            if (k == j)
            {
                class_labels->data.i[k] = int(orig_response->data.fl[i]);
                j++;
            }
        }
    }

    // inside the gbt learning process only regression decision trees are built
    data->is_classifier = false;

    // preprocessing sample indices
    if (_sample_idx)
    {
        int sample_idx_len = get_len(_sample_idx);

        switch (CV_MAT_TYPE(_sample_idx->type))
        {
            case CV_32SC1:
            {
                sample_idx = cvCreateMat( 1, sample_idx_len, CV_32S );
                for (int i=0; i<sample_idx_len; ++i)
                    sample_idx->data.i[i] = _sample_idx->data.i[i];
                std::sort(sample_idx->data.i, sample_idx->data.i + sample_idx_len);
            } break;
            case CV_8S:
            case CV_8U:
            {
                int active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    active_samples_count += int( _sample_idx->data.ptr[i] );
                sample_idx = cvCreateMat( 1, active_samples_count, CV_32S );
                active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    if (int( _sample_idx->data.ptr[i] ))
                        sample_idx->data.i[active_samples_count++] = i;

            } break;
            default: CV_Error(CV_StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
        }
    }
    else
    {
        sample_idx = cvCreateMat( 1, n, CV_32S );
        for (int i=0; i<n; ++i)
            sample_idx->data.i[i] = i;
    }

    sum_response = cvCreateMat(class_count, n, CV_32F);
    sum_response_tmp = cvCreateMat(class_count, n, CV_32F);
    cvZero(sum_response);

    delta = 0.0f;
    /*
      in the case of a regression problem the initial guess (the zero term
      in the sum) is set to the mean of all the training responses, i.e.
      the best constant model
    */
    if (is_regression) base_value = find_optimal_value(sample_idx);
    /*
      in the case of a classification problem the initial guess (the zero term
      in the sum) is set to zero for all the tree sequences
    */
    else base_value = 0.0f;
    /*
      the current prediction on all training samples is set equal
      to the base_value
    */
    cvSet( sum_response, cvScalar(base_value) );

    weak = new pCvSeq[class_count];
    for (int i=0; i<class_count; ++i)
    {
        storage = cvCreateMemStorage();
        weak[i] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );
        storage = 0;
    }

    // subsample params and data
    rng = &cv::theRNG();

    int samples_count = get_len(sample_idx);

    params.subsample_portion = params.subsample_portion <= FLT_EPSILON ||
        1 - params.subsample_portion <= FLT_EPSILON
        ? 1 : params.subsample_portion;
    int train_sample_count = cvFloor(params.subsample_portion * samples_count);
    if (train_sample_count == 0)
        train_sample_count = samples_count;
    int test_sample_count = samples_count - train_sample_count;
    int* idx_data = new int[samples_count];
    subsample_train = cvCreateMatHeader( 1, train_sample_count, CV_32SC1 );
    *subsample_train = cvMat( 1, train_sample_count, CV_32SC1, idx_data );
    if (test_sample_count)
    {
        subsample_test  = cvCreateMatHeader( 1, test_sample_count, CV_32SC1 );
        *subsample_test = cvMat( 1, test_sample_count, CV_32SC1,
                                 idx_data + train_sample_count );
    }

    // training procedure

    for ( int i=0; i < params.weak_count; ++i )
    {
        do_subsample();
        for ( int k=0; k < class_count; ++k )
        {
            find_gradient(k);
            CvDTree* tree = new CvDTree;
            tree->train( data, subsample_train );
            change_values(tree, k);

            if (subsample_test)
            {
                CvMat x;
                CvMat x_miss;
                int* sample_data = sample_idx->data.i;
                int* subsample_data = subsample_test->data.i;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                for (int j=0; j<get_len(subsample_test); ++j)
                {
                    int idx = *(sample_data + subsample_data[j]*s_step);
                    float res = 0.0f;
                    if (_tflag == CV_ROW_SAMPLE)
                        cvGetRow( data->train_data, &x, idx);
                    else
                        cvGetCol( data->train_data, &x, idx);

                    if (missing)
                    {
                        if (_tflag == CV_ROW_SAMPLE)
                            cvGetRow( missing, &x_miss, idx);
                        else
                            cvGetCol( missing, &x_miss, idx);

                        res = (float)tree->predict(&x, &x_miss)->value;
                    }
                    else
                    {
                        res = (float)tree->predict(&x)->value;
                    }
                    sum_response_tmp->data.fl[idx + k*n] =
                                    sum_response->data.fl[idx + k*n] +
                                    params.shrinkage * res;
                }
            }

            cvSeqPush( weak[k], &tree );
            tree = 0;
        } // k=0..class_count
        CvMat* tmp;
        tmp = sum_response_tmp;
        sum_response_tmp = sum_response;
        sum_response = tmp;
        tmp = 0;
    } // i=0..params.weak_count

    delete[] idx_data;
    cvReleaseMat(&new_responses);
    data->free_train_data();

    return true;

} // CvGBTrees::train(...)
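
// Summary of the boosting loop above (descriptive note, not original
// documentation): each iteration draws a fresh random subsample, then for
// every class k fits a regression tree to the negative gradient of the loss
// (find_gradient), replaces the leaf values with loss-optimal constants
// (change_values), and accumulates the shrunken tree outputs into the cached
// ensemble predictions held in sum_response / sum_response_tmp.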

//===========================================================================

inline float Sign(float x)
{
    if (x < 0.0f) return -1.0f;
    else if (x > 0.0f) return 1.0f;
    return 0.0f;
}

//===========================================================================

void CvGBTrees::find_gradient(const int k)
{
    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    float* grad_data = data->responses->data.fl;
    float* resp_data = orig_response->data.fl;
    float* current_data = sum_response->data.fl;

    switch (params.loss_function_type)
    // loss_function_type in
    // {SQUARED_LOSS, ABSOLUTE_LOSS, HUBER_LOSS, DEVIANCE_LOSS}
    {
        case SQUARED_LOSS:
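        // the negative gradient of the squared loss L = (y - F)^2 / 2
        // is simply the residual y - F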
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);
                grad_data[idx] = resp_data[idx] - current_data[idx];
            }
        }; break;

        case ABSOLUTE_LOSS:
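        // the negative gradient of the absolute loss L = |y - F|
        // is the sign of the residual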
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);
                grad_data[idx] = Sign(resp_data[idx] - current_data[idx]);
            }
        }; break;

        case HUBER_LOSS:
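        // Huber loss: residuals r inside the delta-tube keep the squared-loss
        // gradient r, larger ones are clipped to delta*Sign(r); delta is the
        // alpha-quantile of the absolute residuals (alpha = 0.2 here)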
        {
            float alpha = 0.2f;
            int n = get_len(subsample_train);
            int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                         : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

            float* residuals = new float[n];
            for (int i=0; i<n; ++i)
            {
                int idx = *(sample_data + subsample_data[i]*s_step);
                residuals[i] = fabs(resp_data[idx] - current_data[idx]);
            }
            std::sort(residuals, residuals + n);

            // clamp the quantile index: for very small n, ceil(n*alpha)
            // could otherwise index one past the end of the array
            delta = residuals[std::min(int(ceil(n*alpha)), n-1)];

            for (int i=0; i<n; ++i)
            {
                int idx = *(sample_data + subsample_data[i]*s_step);
                float r = resp_data[idx] - current_data[idx];
                grad_data[idx] = (fabs(r) > delta) ? delta*Sign(r) : r;
            }
            delete[] residuals;

        }; break;

        case DEVIANCE_LOSS:
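        // multinomial deviance: the gradient for class k at a sample is
        // I(y == k) - p_k(x), where p_k = exp(F_k) / sum_j exp(F_j) is the
        // softmax of the current ensemble outputs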
        {
            for (int i=0; i<get_len(subsample_train); ++i)
            {
                double exp_fk = 0;
                double exp_sfi = 0;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                int idx = *(sample_data + subsample_data[i]*s_step);

                for (int j=0; j<class_count; ++j)
                {
                    double res;
                    res = current_data[idx + j*sum_response->cols];
                    res = exp(res);
                    if (j == k) exp_fk = res;
                    exp_sfi += res;
                }
                int orig_label = int(resp_data[idx]);
                /*
                grad_data[idx] = (float)(!(k-class_labels->data.i[orig_label]+1)) -
                                 (float)(exp_fk / exp_sfi);
                */
                int ensemble_label = 0;
                while (class_labels->data.i[ensemble_label] - orig_label)
                    ensemble_label++;

                grad_data[idx] = (float)(!(k-ensemble_label)) -
                                 (float)(exp_fk / exp_sfi);
            }
        }; break;

        default: break;
    }

} // CvGBTrees::find_gradient(...)

//===========================================================================

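// change_values: after a tree has been fitted to the gradient, re-assign each
// leaf the constant that directly minimizes the loss over the training
// samples falling into it (Friedman's "line search" step), and update the
// cached predictions in sum_response_tmp for the next iteration.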
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{
    CvDTreeNode** predictions = new pCvDTreeNode[get_len(subsample_train)];

    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

    CvMat x;
    CvMat miss_x;

    for (int i=0; i<get_len(subsample_train); ++i)
    {
        int idx = *(sample_data + subsample_data[i]*s_step);
        if (data->tflag == CV_ROW_SAMPLE)
            cvGetRow( data->train_data, &x, idx);
        else
            cvGetCol( data->train_data, &x, idx);

        if (missing)
        {
            if (data->tflag == CV_ROW_SAMPLE)
                cvGetRow( missing, &miss_x, idx);
            else
                cvGetCol( missing, &miss_x, idx);

            predictions[i] = tree->predict(&x, &miss_x);
        }
        else
            predictions[i] = tree->predict(&x);
    }


    CvDTreeNode** leaves;
    int leaves_count = 0;
    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        int samples_in_leaf = 0;
        for (int j=0; j<get_len(subsample_train); ++j)
        {
            if (leaves[i] == predictions[j]) samples_in_leaf++;
        }

        if (!samples_in_leaf) // this should never happen, but just in case
        {
            leaves[i]->value = 0.0;
            continue;
        }

        CvMat* leaf_idx = cvCreateMat(1, samples_in_leaf, CV_32S);
        int* leaf_idx_data = leaf_idx->data.i;

        for (int j=0; j<get_len(subsample_train); ++j)
        {
            int idx = *(sample_data + subsample_data[j]*s_step);
            if (leaves[i] == predictions[j])
                *leaf_idx_data++ = idx;
        }

        float value = find_optimal_value(leaf_idx);
        leaves[i]->value = value;

        leaf_idx_data = leaf_idx->data.i;

        int len = sum_response_tmp->cols;
        for (int j=0; j<get_len(leaf_idx); ++j)
        {
            int idx = leaf_idx_data[j];
            sum_response_tmp->data.fl[idx + _k*len] =
                                    sum_response->data.fl[idx + _k*len] +
                                    params.shrinkage * value;
        }
        leaf_idx_data = 0;
        cvReleaseMat(&leaf_idx);
    }

    // releasing the memory
    for (int i=0; i<get_len(subsample_train); ++i)
    {
        predictions[i] = 0;
    }
    delete[] predictions;

    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

}

//===========================================================================
/*
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{

    CvDTreeNode** leaves;
    int leaves_count = 0;
    int offset = _k*sum_response_tmp->cols;
    CvMat leaf_idx;
    leaf_idx.rows = 1;

    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        int n = leaves[i]->sample_count;
        int* leaf_idx_data = new int[n];
        data->get_sample_indices(leaves[i], leaf_idx_data);
        //CvMat* leaf_idx = new CvMat();
        //cvInitMatHeader(leaf_idx, n, 1, CV_32S, leaf_idx_data);
        leaf_idx.cols = n;
        leaf_idx.data.i = leaf_idx_data;

        float value = find_optimal_value(&leaf_idx);
        leaves[i]->value = value;
        float val = params.shrinkage * value;


        for (int j=0; j<n; ++j)
        {
            int idx = leaf_idx_data[j] + offset;
            sum_response_tmp->data.fl[idx] = sum_response->data.fl[idx] + val;
        }
        //leaf_idx_data = 0;
        //cvReleaseMat(&leaf_idx);
        leaf_idx.data.i = 0;
        //delete leaf_idx;
        delete[] leaf_idx_data;
    }

    // releasing the memory
    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

}    //change_values(...);
*/
//===========================================================================

float CvGBTrees::find_optimal_value( const CvMat* _Idx )
{

    double gamma = (double)0.0;

    int* idx = _Idx->data.i;
    float* resp_data = orig_response->data.fl;
    float* cur_data = sum_response->data.fl;
    int n = get_len(_Idx);

    switch (params.loss_function_type)
    // SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3, DEVIANCE_LOSS=4
    {
    case SQUARED_LOSS:
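        // the constant minimizing sum (y - F - gamma)^2 is the mean residual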
        {
            for (int i=0; i<n; ++i)
                gamma += resp_data[idx[i]] - cur_data[idx[i]];
            gamma /= (double)n;
        }; break;

    case ABSOLUTE_LOSS:
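        // the constant minimizing sum |y - F - gamma| is the median residual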
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            std::sort(residuals, residuals + n);
            if (n % 2)
                gamma = residuals[n/2];
            else gamma = (residuals[n/2-1] + residuals[n/2]) / 2.0f;
            delete[] residuals;
        }; break;

    case HUBER_LOSS:
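        // Friedman's one-step approximation for the Huber loss: the median
        // of the residuals plus the mean of the deviations from the median,
        // each deviation clipped to [-delta, delta]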
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            std::sort(residuals, residuals + n);

            int n_half = n >> 1;
            float r_median = (n == n_half<<1) ?
                        (residuals[n_half-1] + residuals[n_half]) / 2.0f :
                        residuals[n_half];

            for (int i=0; i<n; ++i)
            {
                float dif = residuals[i] - r_median;
                gamma += (delta < fabs(dif)) ? Sign(dif)*delta : dif;
            }
            gamma /= (double)n;
            gamma += r_median;
            delete[] residuals;

        }; break;

    case DEVIANCE_LOSS:
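        // single Newton-Raphson step for the multinomial deviance:
        // gamma = (K-1)/K * sum(g_i) / sum(|g_i| * (1 - |g_i|)),
        // where g_i are the gradient values stored by find_gradient()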
        {
            float* grad_data = data->responses->data.fl;
            double tmp1 = 0;
            double tmp2 = 0;
            double tmp  = 0;
            for (int i=0; i<n; ++i)
            {
                tmp = grad_data[idx[i]];
                tmp1 += tmp;
                tmp2 += fabs(tmp)*(1-fabs(tmp));
            };
            if (tmp2 == 0)
            {
                tmp2 = 1;
            }

            gamma = ((double)(class_count-1)) / (double)class_count * (tmp1/tmp2);
        }; break;

    default: break;
    }

    return float(gamma);

} // CvGBTrees::find_optimal_value

//===========================================================================


void CvGBTrees::leaves_get( CvDTreeNode** leaves, int& count, CvDTreeNode* node )
{
    if (node->left != NULL)  leaves_get(leaves, count, node->left);
    if (node->right != NULL) leaves_get(leaves, count, node->right);
    if ((node->left == NULL) && (node->right == NULL))
        leaves[count++] = node;
}

//---------------------------------------------------------------------------

CvDTreeNode** CvGBTrees::GetLeaves( const CvDTree* dtree, int& len )
{
    len = 0;
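    // a binary tree of depth max_depth has at most 2^max_depth leaves,
    // so this buffer is always large enough for leaves_get()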
    CvDTreeNode** leaves = new pCvDTreeNode[(size_t)1 << params.max_depth];
    leaves_get(leaves, len, const_cast<pCvDTreeNode>(dtree->get_root()));
    return leaves;
}

//===========================================================================

void CvGBTrees::do_subsample()
{

    int n = get_len(sample_idx);
    int* idx = subsample_train->data.i;

    for (int i = 0; i < n; i++ )
        idx[i] = i;

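    // shuffle by n random transpositions; the first train_sample_count
    // entries of idx then form the training subsample, and the remainder
    // (aliased by subsample_test) the held-out portion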
    if (subsample_test)
        for (int i = 0; i < n; i++)
        {
            int a = (*rng)(n);
            int b = (*rng)(n);
            int t;
            CV_SWAP( idx[a], idx[b], t );
        }

/*
    int n = get_len(sample_idx);
    if (subsample_train == 0)
        subsample_train = cvCreateMat(1, n, CV_32S);
    int* subsample_data = subsample_train->data.i;
    for (int i=0; i<n; ++i)
        subsample_data[i] = i;
    subsample_test = 0;
*/
}

//===========================================================================

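// Serial prediction over the slice of weak learners selected by `slice`.
// For regression (class_count == 1) the raw ensemble output is returned;
// for classification the original label of the class with the largest
// ensemble sum is returned.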
float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
        CvMat* weak_responses, CvSlice slice, int k) const
{
    float result = 0.0f;

    if (!weak) return 0.0f;

    CvSeqReader reader;
    int weak_count = cvSliceLength( slice, weak[class_count-1] );
    CvDTree* tree;

    if (weak_responses)
    {
        if (CV_MAT_TYPE(weak_responses->type) != CV_32F)
            return 0.0f;
        if ((k >= 0) && (k<class_count) && (weak_responses->rows != 1))
            return 0.0f;
        if ((k == -1) && (weak_responses->rows != class_count))
            return 0.0f;
        if (weak_responses->cols != weak_count)
            return 0.0f;
    }

    float* sum = new float[class_count];
    memset(sum, 0, class_count*sizeof(float));

    for (int i=0; i<class_count; ++i)
    {
        if ((weak[i]) && (weak_count))
        {
            cvStartReadSeq( weak[i], &reader );
            cvSetSeqReaderPos( &reader, slice.start_index );
            for (int j=0; j<weak_count; ++j)
            {
                CV_READ_SEQ_ELEM( tree, reader );
                float p = (float)(tree->predict(_sample, _missing)->value);
                sum[i] += params.shrinkage * p;
                if (weak_responses)
                    weak_responses->data.fl[i*weak_count+j] = p;
            }
        }
    }

    for (int i=0; i<class_count; ++i)
        sum[i] += base_value;

    if (class_count == 1)
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;

    /*
    int orig_class_label = -1;
    for (int i=0; i<get_len(class_labels); ++i)
        if (class_labels->data.i[i] == class_label+1)
            orig_class_label = i;
    */
    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}


class Tree_predictor : public cv::ParallelLoopBody
{
private:
    pCvSeq* weak;
    float* sum;
    const int k;
    const CvMat* sample;
    const CvMat* missing;
    const float shrinkage;

    static cv::Mutex SumMutex;


public:
    Tree_predictor() : weak(0), sum(0), k(0), sample(0), missing(0), shrinkage(1.0f) {}
    Tree_predictor(pCvSeq* _weak, const int _k, const float _shrinkage,
                   const CvMat* _sample, const CvMat* _missing, float* _sum ) :
                   weak(_weak), sum(_sum), k(_k), sample(_sample),
                   missing(_missing), shrinkage(_shrinkage)
    {}

    Tree_predictor( const Tree_predictor& p, cv::Split ) :
            weak(p.weak), sum(p.sum), k(p.k), sample(p.sample),
            missing(p.missing), shrinkage(p.shrinkage)
    {}

    Tree_predictor& operator=( const Tree_predictor& )
    { return *this; }

    virtual void operator()(const cv::Range& range) const
    {
        CvSeqReader reader;
        int begin = range.start;
        int end = range.end;

        int weak_count = end - begin;
        CvDTree* tree;

        for (int i=0; i<k; ++i)
        {
            float tmp_sum = 0.0f;
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, begin );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
                }
            }

            {
                cv::AutoLock lock(SumMutex);
                sum[i] += tmp_sum;
            }
        }
    } // Tree_predictor::operator()

    virtual ~Tree_predictor() {}

}; // class Tree_predictor

cv::Mutex Tree_predictor::SumMutex;


float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
            CvMat* /*weak_responses*/, CvSlice slice, int k) const
    {
        float result = 0.0f;
        if (!weak) return 0.0f;
        float* sum = new float[class_count];
        for (int i=0; i<class_count; ++i)
            sum[i] = 0.0f;
        int begin = slice.start_index;
        int end = begin + cvSliceLength( slice, weak[0] );

        pCvSeq* weak_seq = weak;
        Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
                                    params.shrinkage, _sample, _missing, sum);

        cv::parallel_for_(cv::Range(begin, end), predictor);

        for (int i=0; i<class_count; ++i)
            sum[i] = sum[i] /** params.shrinkage*/ + base_value;

        if (class_count == 1)
        {
            result = sum[0];
            delete[] sum;
            return result;
        }

        if ((k>=0) && (k<class_count))
        {
            result = sum[k];
            delete[] sum;
            return result;
        }

        float max = sum[0];
        int class_label = 0;
        for (int i=1; i<class_count; ++i)
            if (sum[i] > max)
            {
                max = sum[i];
                class_label = i;
            }

        delete[] sum;
        int orig_class_label = class_labels->data.i[class_label];

        return float(orig_class_label);
    }


//===========================================================================

void CvGBTrees::write_params( CvFileStorage* fs ) const
{
    const char* loss_function_type_str =
        params.loss_function_type == SQUARED_LOSS ? "SquaredLoss" :
        params.loss_function_type == ABSOLUTE_LOSS ? "AbsoluteLoss" :
        params.loss_function_type == HUBER_LOSS ? "HuberLoss" :
        params.loss_function_type == DEVIANCE_LOSS ? "DevianceLoss" : 0;


    if( loss_function_type_str )
        cvWriteString( fs, "loss_function", loss_function_type_str );
    else
        cvWriteInt( fs, "loss_function", params.loss_function_type );

    cvWriteInt( fs, "ensemble_length", params.weak_count );
    cvWriteReal( fs, "shrinkage", params.shrinkage );
    cvWriteReal( fs, "subsample_portion", params.subsample_portion );
    //cvWriteInt( fs, "max_tree_depth", params.max_depth );
    //cvWriteString( fs, "use_surrogate_splits", params.use_surrogates ? "true" : "false");
    if (class_labels) cvWrite( fs, "class_labels", class_labels);

    data->is_classifier = !problem_type();
    data->write_params( fs );
    data->is_classifier = 0;
}


//===========================================================================

void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode )
{
    CV_FUNCNAME( "CvGBTrees::read_params" );
    __BEGIN__;


    CvFileNode* temp;

    if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
        return;

    data = new CvDTreeTrainData();
    CV_CALL( data->read_params(fs, fnode));
    data->shared = true;

    params.max_depth = data->params.max_depth;
    params.min_sample_count = data->params.min_sample_count;
    params.max_categories = data->params.max_categories;
    params.priors = data->params.priors;
    params.regression_accuracy = data->params.regression_accuracy;
    params.use_surrogates = data->params.use_surrogates;

    temp = cvGetFileNodeByName( fs, fnode, "loss_function" );
    if( !temp )
        EXIT;

    if( temp && CV_NODE_IS_STRING(temp->tag) )
    {
        const char* loss_function_type_str = cvReadString( temp, "" );
        params.loss_function_type = strcmp( loss_function_type_str, "SquaredLoss" ) == 0 ? SQUARED_LOSS :
                            strcmp( loss_function_type_str, "AbsoluteLoss" ) == 0 ? ABSOLUTE_LOSS :
                            strcmp( loss_function_type_str, "HuberLoss" ) == 0 ? HUBER_LOSS :
                            strcmp( loss_function_type_str, "DevianceLoss" ) == 0 ? DEVIANCE_LOSS : -1;
    }
    else
        params.loss_function_type = cvReadInt( temp, -1 );


    // valid values are SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3 and
    // DEVIANCE_LOSS=4; the value 2 is unused in the enumeration
    if( params.loss_function_type < SQUARED_LOSS || params.loss_function_type > DEVIANCE_LOSS ||  params.loss_function_type == 2)
        CV_ERROR( CV_StsBadArg, "Unknown loss function" );

    params.weak_count = cvReadIntByName( fs, fnode, "ensemble_length" );
    params.shrinkage = (float)cvReadRealByName( fs, fnode, "shrinkage", 0.1 );
    params.subsample_portion = (float)cvReadRealByName( fs, fnode, "subsample_portion", 1.0 );

    if (data->is_classifier)
    {
        class_labels = (CvMat*)cvReadByName( fs, fnode, "class_labels" );
        if( class_labels && !CV_IS_MAT(class_labels))
            CV_ERROR( CV_StsParseError, "class_labels must be stored as a matrix");
    }
    data->is_classifier = 0;

    __END__;
}

void CvGBTrees::write( CvFileStorage* fs, const char* name ) const
{
    CV_FUNCNAME( "CvGBTrees::write" );

    __BEGIN__;

    CvSeqReader reader;
    int i;
    cv::String s;

    cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_GBT );

    if( !weak )
        CV_ERROR( CV_StsBadArg, "The model has not been trained yet" );

    write_params( fs );
    cvWriteReal( fs, "base_value", base_value);
    cvWriteInt( fs, "class_count", class_count);

    for ( int j=0; j < class_count; ++j )
    {
        s = cv::format("trees_%d", j);
        cvStartWriteStruct( fs, s.c_str(), CV_NODE_SEQ );

        cvStartReadSeq( weak[j], &reader );

        for( i = 0; i < weak[j]->total; i++ )
        {
            CvDTree* tree;
            CV_READ_SEQ_ELEM( tree, reader );
            cvStartWriteStruct( fs, 0, CV_NODE_MAP );
            tree->write( fs );
            cvEndWriteStruct( fs );
        }

        cvEndWriteStruct( fs );
    }

    cvEndWriteStruct( fs );

    __END__;
}


//===========================================================================


void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
{

    CV_FUNCNAME( "CvGBTrees::read" );

    __BEGIN__;

    CvSeqReader reader;
    CvFileNode* trees_fnode;
    CvMemStorage* storage;
    int i, ntrees;
    cv::String s;

    clear();
    read_params( fs, node );

    if( !data )
        EXIT;

    base_value = (float)cvReadRealByName( fs, node, "base_value", 0.0 );
    class_count = cvReadIntByName( fs, node, "class_count", 1 );

    weak = new pCvSeq[class_count];


    for (int j=0; j<class_count; ++j)
    {
        s = cv::format("trees_%d", j);

        trees_fnode = cvGetFileNodeByName( fs, node, s.c_str() );
        if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
            CV_ERROR( CV_StsParseError, "<trees_x> tag is missing" );

        cvStartReadSeq( trees_fnode->data.seq, &reader );
        ntrees = trees_fnode->data.seq->total;

        if( ntrees != params.weak_count )
            CV_ERROR( CV_StsUnmatchedSizes,
            "The number of trees stored does not match <ntrees> tag value" );

        CV_CALL( storage = cvCreateMemStorage() );
        weak[j] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );

        for( i = 0; i < ntrees; i++ )
        {
            CvDTree* tree = new CvDTree();
            CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, data ));
            CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
            cvSeqPush( weak[j], &tree );
        }
    }

    __END__;
}

//===========================================================================

class Sample_predictor : public cv::ParallelLoopBody
{
private:
    const CvGBTrees* gbt;
    float* predictions;
    const CvMat* samples;
    const CvMat* missing;
    const CvMat* idx;
    CvSlice slice;

public:
    Sample_predictor() : gbt(0), predictions(0), samples(0), missing(0),
                         idx(0), slice(CV_WHOLE_SEQ)
    {}

    Sample_predictor(const CvGBTrees* _gbt, float* _predictions,
                   const CvMat* _samples, const CvMat* _missing,
                   const CvMat* _idx, CvSlice _slice=CV_WHOLE_SEQ) :
                   gbt(_gbt), predictions(_predictions), samples(_samples),
                   missing(_missing), idx(_idx), slice(_slice)
    {}


    Sample_predictor( const Sample_predictor& p, cv::Split ) :
            gbt(p.gbt), predictions(p.predictions),
            samples(p.samples), missing(p.missing), idx(p.idx),
            slice(p.slice)
    {}


    virtual void operator()(const cv::Range& range) const
    {
        int begin = range.start;
        int end = range.end;

        CvMat x;
        CvMat miss;

        for (int i=begin; i<end; ++i)
        {
            int j = idx ? idx->data.i[i] : i;
            cvGetRow(samples, &x, j);
            if (!missing)
            {
                predictions[i] = gbt->predict_serial(&x,0,0,slice);
            }
            else
            {
                cvGetRow(missing, &miss, j);
                predictions[i] = gbt->predict_serial(&x,&miss,0,slice);
            }
        }
    } // Sample_predictor::operator()

    virtual ~Sample_predictor() {}

}; // class Sample_predictor



// type in {CV_TRAIN_ERROR, CV_TEST_ERROR}
float
CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
{

    float err = 0.0f;
    const CvMat* _sample_idx = (type == CV_TRAIN_ERROR) ?
                              _data->get_train_sample_idx() :
                              _data->get_test_sample_idx();
    const CvMat* response = _data->get_responses();

    int n = _sample_idx ? get_len(_sample_idx) : 0;
    n = (type == CV_TRAIN_ERROR && n == 0) ? _data->get_values()->rows : n;

    if (!n)
        return -FLT_MAX;

    float* pred_resp = 0;
    if (resp)
    {
        resp->resize(n);
        pred_resp = &((*resp)[0]);
    }
    else
        pred_resp = new float[n];

    Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
            _data->get_missing(), _sample_idx);

    cv::parallel_for_(cv::Range(0,n), predictor);

    int* sidx = _sample_idx ? _sample_idx->data.i : 0;
    int r_step = CV_IS_MAT_CONT(response->type) ?
                1 : response->step / CV_ELEM_SIZE(response->type);


    if ( !problem_type() )
    {
        // classification: percentage of misclassified samples
        for( int i = 0; i < n; i++ )
        {
            int si = sidx ? sidx[i] : i;
            int d = fabs((double)pred_resp[i] - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
            err += d;
        }
        err = err / (float)n * 100.0f;
    }
    else
    {
        // regression: mean squared error
        for( int i = 0; i < n; i++ )
        {
            int si = sidx ? sidx[i] : i;
            float d = pred_resp[i] - response->data.fl[si*r_step];
            err += d*d;
        }
        err = err / (float)n;
    }

    // release the temporary buffer when the caller did not supply one
    if (!resp)
        delete[] pred_resp;

    return err;
}


CvGBTrees::CvGBTrees( const cv::Mat& trainData, int tflag,
          const cv::Mat& responses, const cv::Mat& varIdx,
          const cv::Mat& sampleIdx, const cv::Mat& varType,
          const cv::Mat& missingDataMask,
          CvGBTreesParams _params )
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    missing = sample_idx = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();

    train(trainData, tflag, responses, varIdx, sampleIdx, varType, missingDataMask, _params, false);
}

bool CvGBTrees::train( const cv::Mat& trainData, int tflag,
                   const cv::Mat& responses, const cv::Mat& varIdx,
                   const cv::Mat& sampleIdx, const cv::Mat& varType,
                   const cv::Mat& missingDataMask,
                   CvGBTreesParams _params,
                   bool update )
{
    CvMat _trainData = trainData, _responses = responses;
    CvMat _varIdx = varIdx, _sampleIdx = sampleIdx, _varType = varType;
    CvMat _missingDataMask = missingDataMask;

    return train( &_trainData, tflag, &_responses, varIdx.empty() ? 0 : &_varIdx,
                  sampleIdx.empty() ? 0 : &_sampleIdx, varType.empty() ? 0 : &_varType,
                  missingDataMask.empty() ? 0 : &_missingDataMask, _params, update);
}

float CvGBTrees::predict( const cv::Mat& sample, const cv::Mat& _missing,
                          const cv::Range& slice, int k ) const
{
    CvMat _sample = sample, miss = _missing;
    return predict(&_sample, _missing.empty() ? 0 : &miss, 0,
                   slice==cv::Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end), k);
}
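
// A minimal end-to-end sketch using the cv::Mat wrappers above (illustrative
// only; the matrices and values are hypothetical, not from the sources):
/*
    cv::Mat X(100, 5, CV_32F);   // 100 samples with 5 features each
    cv::Mat y(100, 1, CV_32F);   // one response per sample
    // ... fill X and y ...

    CvGBTreesParams params( CvGBTrees::SQUARED_LOSS, 100, 0.1f, 0.8f, 3, false );
    CvGBTrees gbt;
    gbt.train( X, CV_ROW_SAMPLE, y, cv::Mat(), cv::Mat(), cv::Mat(),
               cv::Mat(), params, false );

    float y_hat = gbt.predict( X.row(0), cv::Mat(), cv::Range::all(), -1 );
*/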

#endif