1 /******************************************************************************
2  **	Filename:	cluster.c
3  **	Purpose:	Routines for clustering points in N-D space
4  **	Author:		Dan Johnson
5  **	History:	5/29/89, DSJ, Created.
6  **
7  **	(c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 #include "oldheap.h"
19 #include "const.h"
20 #include "cluster.h"
21 #include "emalloc.h"
22 #include "tprintf.h"
23 #include "danerror.h"
24 #include "freelist.h"
25 #include <math.h>
26 
27 #define HOTELLING 1  // If true use Hotelling's test to decide where to split.
28 #define FTABLE_X 10  // Size of FTable.
29 #define FTABLE_Y 100  // Size of FTable.
30 
31 // Table of critical values of the F-distribution at a significance level of 1%.
32 double FTable[FTABLE_Y][FTABLE_X] = {
33  {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
34   {98.502,  99.000,  99.166,  99.249,  99.300,  99.333,  99.356,  99.374,  99.388,  99.399,},
35   {34.116,  30.816,  29.457,  28.710,  28.237,  27.911,  27.672,  27.489,  27.345,  27.229,},
36   {21.198,  18.000,  16.694,  15.977,  15.522,  15.207,  14.976,  14.799,  14.659,  14.546,},
37   {16.258,  13.274,  12.060,  11.392,  10.967,  10.672,  10.456,  10.289,  10.158,  10.051,},
38   {13.745,  10.925,   9.780,   9.148,   8.746,   8.466,   8.260,   8.102,   7.976,   7.874,},
39   {12.246,   9.547,   8.451,   7.847,   7.460,   7.191,   6.993,   6.840,   6.719,   6.620,},
40   {11.259,   8.649,   7.591,   7.006,   6.632,   6.371,   6.178,   6.029,   5.911,   5.814,},
41   {10.561,   8.022,   6.992,   6.422,   6.057,   5.802,   5.613,   5.467,   5.351,   5.257,},
42   {10.044,   7.559,   6.552,   5.994,   5.636,   5.386,   5.200,   5.057,   4.942,   4.849,},
43   { 9.646,   7.206,   6.217,   5.668,   5.316,   5.069,   4.886,   4.744,   4.632,   4.539,},
44   { 9.330,   6.927,   5.953,   5.412,   5.064,   4.821,   4.640,   4.499,   4.388,   4.296,},
45   { 9.074,   6.701,   5.739,   5.205,   4.862,   4.620,   4.441,   4.302,   4.191,   4.100,},
46   { 8.862,   6.515,   5.564,   5.035,   4.695,   4.456,   4.278,   4.140,   4.030,   3.939,},
47   { 8.683,   6.359,   5.417,   4.893,   4.556,   4.318,   4.142,   4.004,   3.895,   3.805,},
48   { 8.531,   6.226,   5.292,   4.773,   4.437,   4.202,   4.026,   3.890,   3.780,   3.691,},
49   { 8.400,   6.112,   5.185,   4.669,   4.336,   4.102,   3.927,   3.791,   3.682,   3.593,},
50   { 8.285,   6.013,   5.092,   4.579,   4.248,   4.015,   3.841,   3.705,   3.597,   3.508,},
51   { 8.185,   5.926,   5.010,   4.500,   4.171,   3.939,   3.765,   3.631,   3.523,   3.434,},
52   { 8.096,   5.849,   4.938,   4.431,   4.103,   3.871,   3.699,   3.564,   3.457,   3.368,},
53   { 8.017,   5.780,   4.874,   4.369,   4.042,   3.812,   3.640,   3.506,   3.398,   3.310,},
54   { 7.945,   5.719,   4.817,   4.313,   3.988,   3.758,   3.587,   3.453,   3.346,   3.258,},
55   { 7.881,   5.664,   4.765,   4.264,   3.939,   3.710,   3.539,   3.406,   3.299,   3.211,},
56   { 7.823,   5.614,   4.718,   4.218,   3.895,   3.667,   3.496,   3.363,   3.256,   3.168,},
57   { 7.770,   5.568,   4.675,   4.177,   3.855,   3.627,   3.457,   3.324,   3.217,   3.129,},
58   { 7.721,   5.526,   4.637,   4.140,   3.818,   3.591,   3.421,   3.288,   3.182,   3.094,},
59   { 7.677,   5.488,   4.601,   4.106,   3.785,   3.558,   3.388,   3.256,   3.149,   3.062,},
60   { 7.636,   5.453,   4.568,   4.074,   3.754,   3.528,   3.358,   3.226,   3.120,   3.032,},
61   { 7.598,   5.420,   4.538,   4.045,   3.725,   3.499,   3.330,   3.198,   3.092,   3.005,},
62   { 7.562,   5.390,   4.510,   4.018,   3.699,   3.473,   3.305,   3.173,   3.067,   2.979,},
63   { 7.530,   5.362,   4.484,   3.993,   3.675,   3.449,   3.281,   3.149,   3.043,   2.955,},
64   { 7.499,   5.336,   4.459,   3.969,   3.652,   3.427,   3.258,   3.127,   3.021,   2.934,},
65   { 7.471,   5.312,   4.437,   3.948,   3.630,   3.406,   3.238,   3.106,   3.000,   2.913,},
66   { 7.444,   5.289,   4.416,   3.927,   3.611,   3.386,   3.218,   3.087,   2.981,   2.894,},
67   { 7.419,   5.268,   4.396,   3.908,   3.592,   3.368,   3.200,   3.069,   2.963,   2.876,},
68   { 7.396,   5.248,   4.377,   3.890,   3.574,   3.351,   3.183,   3.052,   2.946,   2.859,},
69   { 7.373,   5.229,   4.360,   3.873,   3.558,   3.334,   3.167,   3.036,   2.930,   2.843,},
70   { 7.353,   5.211,   4.343,   3.858,   3.542,   3.319,   3.152,   3.021,   2.915,   2.828,},
71   { 7.333,   5.194,   4.327,   3.843,   3.528,   3.305,   3.137,   3.006,   2.901,   2.814,},
72   { 7.314,   5.179,   4.313,   3.828,   3.514,   3.291,   3.124,   2.993,   2.888,   2.801,},
73   { 7.296,   5.163,   4.299,   3.815,   3.501,   3.278,   3.111,   2.980,   2.875,   2.788,},
74   { 7.280,   5.149,   4.285,   3.802,   3.488,   3.266,   3.099,   2.968,   2.863,   2.776,},
75   { 7.264,   5.136,   4.273,   3.790,   3.476,   3.254,   3.087,   2.957,   2.851,   2.764,},
76   { 7.248,   5.123,   4.261,   3.778,   3.465,   3.243,   3.076,   2.946,   2.840,   2.754,},
77   { 7.234,   5.110,   4.249,   3.767,   3.454,   3.232,   3.066,   2.935,   2.830,   2.743,},
78   { 7.220,   5.099,   4.238,   3.757,   3.444,   3.222,   3.056,   2.925,   2.820,   2.733,},
79   { 7.207,   5.087,   4.228,   3.747,   3.434,   3.213,   3.046,   2.916,   2.811,   2.724,},
80   { 7.194,   5.077,   4.218,   3.737,   3.425,   3.204,   3.037,   2.907,   2.802,   2.715,},
81   { 7.182,   5.066,   4.208,   3.728,   3.416,   3.195,   3.028,   2.898,   2.793,   2.706,},
82   { 7.171,   5.057,   4.199,   3.720,   3.408,   3.186,   3.020,   2.890,   2.785,   2.698,},
83   { 7.159,   5.047,   4.191,   3.711,   3.400,   3.178,   3.012,   2.882,   2.777,   2.690,},
84   { 7.149,   5.038,   4.182,   3.703,   3.392,   3.171,   3.005,   2.874,   2.769,   2.683,},
85   { 7.139,   5.030,   4.174,   3.695,   3.384,   3.163,   2.997,   2.867,   2.762,   2.675,},
86   { 7.129,   5.021,   4.167,   3.688,   3.377,   3.156,   2.990,   2.860,   2.755,   2.668,},
87   { 7.119,   5.013,   4.159,   3.681,   3.370,   3.149,   2.983,   2.853,   2.748,   2.662,},
88   { 7.110,   5.006,   4.152,   3.674,   3.363,   3.143,   2.977,   2.847,   2.742,   2.655,},
89   { 7.102,   4.998,   4.145,   3.667,   3.357,   3.136,   2.971,   2.841,   2.736,   2.649,},
90   { 7.093,   4.991,   4.138,   3.661,   3.351,   3.130,   2.965,   2.835,   2.730,   2.643,},
91   { 7.085,   4.984,   4.132,   3.655,   3.345,   3.124,   2.959,   2.829,   2.724,   2.637,},
92   { 7.077,   4.977,   4.126,   3.649,   3.339,   3.119,   2.953,   2.823,   2.718,   2.632,},
93   { 7.070,   4.971,   4.120,   3.643,   3.333,   3.113,   2.948,   2.818,   2.713,   2.626,},
94   { 7.062,   4.965,   4.114,   3.638,   3.328,   3.108,   2.942,   2.813,   2.708,   2.621,},
95   { 7.055,   4.959,   4.109,   3.632,   3.323,   3.103,   2.937,   2.808,   2.703,   2.616,},
96   { 7.048,   4.953,   4.103,   3.627,   3.318,   3.098,   2.932,   2.803,   2.698,   2.611,},
97   { 7.042,   4.947,   4.098,   3.622,   3.313,   3.093,   2.928,   2.798,   2.693,   2.607,},
98   { 7.035,   4.942,   4.093,   3.618,   3.308,   3.088,   2.923,   2.793,   2.689,   2.602,},
99   { 7.029,   4.937,   4.088,   3.613,   3.304,   3.084,   2.919,   2.789,   2.684,   2.598,},
100   { 7.023,   4.932,   4.083,   3.608,   3.299,   3.080,   2.914,   2.785,   2.680,   2.593,},
101   { 7.017,   4.927,   4.079,   3.604,   3.295,   3.075,   2.910,   2.781,   2.676,   2.589,},
102   { 7.011,   4.922,   4.074,   3.600,   3.291,   3.071,   2.906,   2.777,   2.672,   2.585,},
103   { 7.006,   4.917,   4.070,   3.596,   3.287,   3.067,   2.902,   2.773,   2.668,   2.581,},
104   { 7.001,   4.913,   4.066,   3.591,   3.283,   3.063,   2.898,   2.769,   2.664,   2.578,},
105   { 6.995,   4.908,   4.062,   3.588,   3.279,   3.060,   2.895,   2.765,   2.660,   2.574,},
106   { 6.990,   4.904,   4.058,   3.584,   3.275,   3.056,   2.891,   2.762,   2.657,   2.570,},
107   { 6.985,   4.900,   4.054,   3.580,   3.272,   3.052,   2.887,   2.758,   2.653,   2.567,},
108   { 6.981,   4.896,   4.050,   3.577,   3.268,   3.049,   2.884,   2.755,   2.650,   2.563,},
109   { 6.976,   4.892,   4.047,   3.573,   3.265,   3.046,   2.881,   2.751,   2.647,   2.560,},
110   { 6.971,   4.888,   4.043,   3.570,   3.261,   3.042,   2.877,   2.748,   2.644,   2.557,},
111   { 6.967,   4.884,   4.040,   3.566,   3.258,   3.039,   2.874,   2.745,   2.640,   2.554,},
112   { 6.963,   4.881,   4.036,   3.563,   3.255,   3.036,   2.871,   2.742,   2.637,   2.551,},
113   { 6.958,   4.877,   4.033,   3.560,   3.252,   3.033,   2.868,   2.739,   2.634,   2.548,},
114   { 6.954,   4.874,   4.030,   3.557,   3.249,   3.030,   2.865,   2.736,   2.632,   2.545,},
115   { 6.950,   4.870,   4.027,   3.554,   3.246,   3.027,   2.863,   2.733,   2.629,   2.542,},
116   { 6.947,   4.867,   4.024,   3.551,   3.243,   3.025,   2.860,   2.731,   2.626,   2.539,},
117   { 6.943,   4.864,   4.021,   3.548,   3.240,   3.022,   2.857,   2.728,   2.623,   2.537,},
118   { 6.939,   4.861,   4.018,   3.545,   3.238,   3.019,   2.854,   2.725,   2.621,   2.534,},
119   { 6.935,   4.858,   4.015,   3.543,   3.235,   3.017,   2.852,   2.723,   2.618,   2.532,},
120   { 6.932,   4.855,   4.012,   3.540,   3.233,   3.014,   2.849,   2.720,   2.616,   2.529,},
121   { 6.928,   4.852,   4.010,   3.538,   3.230,   3.012,   2.847,   2.718,   2.613,   2.527,},
122   { 6.925,   4.849,   4.007,   3.535,   3.228,   3.009,   2.845,   2.715,   2.611,   2.524,},
123   { 6.922,   4.846,   4.004,   3.533,   3.225,   3.007,   2.842,   2.713,   2.609,   2.522,},
124   { 6.919,   4.844,   4.002,   3.530,   3.223,   3.004,   2.840,   2.711,   2.606,   2.520,},
125   { 6.915,   4.841,   3.999,   3.528,   3.221,   3.002,   2.838,   2.709,   2.604,   2.518,},
126   { 6.912,   4.838,   3.997,   3.525,   3.218,   3.000,   2.835,   2.706,   2.602,   2.515,},
127   { 6.909,   4.836,   3.995,   3.523,   3.216,   2.998,   2.833,   2.704,   2.600,   2.513,},
128   { 6.906,   4.833,   3.992,   3.521,   3.214,   2.996,   2.831,   2.702,   2.598,   2.511,},
129   { 6.904,   4.831,   3.990,   3.519,   3.212,   2.994,   2.829,   2.700,   2.596,   2.509,},
130   { 6.901,   4.829,   3.988,   3.517,   3.210,   2.992,   2.827,   2.698,   2.594,   2.507,},
131   { 6.898,   4.826,   3.986,   3.515,   3.208,   2.990,   2.825,   2.696,   2.592,   2.505,},
132   { 6.895,   4.824,   3.984,   3.513,   3.206,   2.988,   2.823,   2.694,   2.590,   2.503}
133 };
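// Layout note (illustrative): FTable[Fy][Fx] holds the approximate 1% critical
// value of the F-distribution with numerator degrees of freedom Fx + 1 and
// denominator degrees of freedom Fy + 1.  For example, FTable[0][0] = 4052.19
// is roughly the 1% point of F(1, 1).  This is the indexing used by
// TestEllipticalProto() below.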
134 
135 /* define the variance which will be used as a minimum variance for any
136   dimension of any feature. Since most features are calculated from numbers
137   with a precision no better than 1 in 128, the variance should never be
138   less than the square of this number for parameters whose range is 1. */
139 #define MINVARIANCE     0.0001
140 
141 /* define the absolute minimum number of samples which must be present in
142   order to accurately test hypotheses about underlying probability
143   distributions.  Define separately the minimum samples that are needed
144   before a statistical analysis is attempted; this number should be
145   equal to MINSAMPLES but can be set to a lower number for early testing
146   when very few samples are available. */
147 #define MINBUCKETS      5
148 #define MINSAMPLESPERBUCKET 5
149 #define MINSAMPLES    (MINBUCKETS * MINSAMPLESPERBUCKET)
150 #define MINSAMPLESNEEDED  1
151 
152 /* define the size of the table which maps normalized samples to
153   histogram buckets.  Also define the number of standard deviations
154   in a normal distribution which are considered to be significant.
155   The mapping table will be defined in such a way that it covers
156   the specified number of standard deviations on either side of
157   the mean.  BUCKETTABLESIZE should always be even. */
158 #define BUCKETTABLESIZE   1024
159 #define NORMALEXTENT    3.0
160 
161 typedef struct
162 {
163   CLUSTER *Cluster;
164   CLUSTER *Neighbor;
165 }
166 
167 
168 TEMPCLUSTER;
169 
170 typedef struct
171 {
172   FLOAT32 AvgVariance;
173   FLOAT32 *CoVariance;
174   FLOAT32 *Min;                  // largest negative distance from the mean
175   FLOAT32 *Max;                  // largest positive distance from the mean
176 }
177 
178 
179 STATISTICS;
180 
181 typedef struct
182 {
183   DISTRIBUTION Distribution;     // distribution being tested for
184   uinT32 SampleCount;            // # of samples in histogram
185   FLOAT64 Confidence;            // confidence level of test
186   FLOAT64 ChiSquared;            // test threshold
187   uinT16 NumberOfBuckets;        // number of cells in histogram
188   uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
189   uinT32 *Count;                 // frequency of occurrence histogram
190   FLOAT32 *ExpectedCount;        // expected histogram
191 }
192 
193 
194 BUCKETS;
195 
196 typedef struct
197 {
198   uinT16 DegreesOfFreedom;
199   FLOAT64 Alpha;
200   FLOAT64 ChiSquared;
201 }
202 
203 
204 CHISTRUCT;
205 
206 typedef FLOAT64 (*DENSITYFUNC) (inT32);
207 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
208 
209 #define Odd(N) ((N)%2)
210 #define Mirror(N,R) ((R) - (N) - 1)
211 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
212 
213 //--------------Global Data Definitions and Declarations----------------------
214 /* the following variables are declared as global so that routines which
215 are called from the kd-tree walker can get to them. */
216 static HEAP *Heap;
217 static TEMPCLUSTER *TempCluster;
218 static KDTREE *Tree;
219 static inT32 CurrentTemp;
220 
221 /* the following variables describe a discrete normal distribution
222   which is used by NormalDensity() and NormalBucket().  The
223   constant NORMALEXTENT determines how many standard
224   deviations of the distribution are mapped onto the fixed
225   discrete range of x.  x=0 is mapped to -NORMALEXTENT standard
226   deviations and x=BUCKETTABLESIZE is mapped to
227   +NORMALEXTENT standard deviations. */
228 #define SqrtOf2Pi     2.506628275
229 static FLOAT64 NormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
230 static FLOAT64 NormalVariance =
231 (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
232 static FLOAT64 NormalMagnitude =
233 (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
234 static FLOAT64 NormalMean = BUCKETTABLESIZE / 2;
235 
236 // keep a list of histogram buckets to minimize recomputing them
237 static LIST OldBuckets[] = { NIL, NIL, NIL };
238 
239 /* define lookup tables used to compute the number of histogram buckets
240   that should be used for a given number of samples. */
241 #define LOOKUPTABLESIZE   8
242 #define MAXBUCKETS      39
243 #define MAXDEGREESOFFREEDOM MAXBUCKETS
244 
245 static uinT32 CountTable[LOOKUPTABLESIZE] = {
246   MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
247 };
248 static uinT16 BucketsTable[LOOKUPTABLESIZE] = {
249   MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
250 };
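// Example: 600 samples map to 24 histogram buckets.  Sample counts that fall
// between two CountTable entries are presumably interpolated by
// OptimumNumberOfBuckets(), declared below, to a bucket count between the two
// neighboring BucketsTable entries.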
251 
252 /*-------------------------------------------------------------------------
253           Private Function Prototypes
254 --------------------------------------------------------------------------*/
255 void CreateClusterTree(CLUSTERER *Clusterer);
256 
257 void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, inT32 Level);
258 
259 CLUSTER *FindNearestNeighbor(KDTREE *Tree,
260                              CLUSTER *Cluster,
261                              FLOAT32 *Distance);
262 
263 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
264 
265 inT32 MergeClusters (inT16 N,
266 register PARAM_DESC ParamDesc[],
267 register inT32 n1,
268 register inT32 n2,
269 register FLOAT32 m[],
270 register FLOAT32 m1[], register FLOAT32 m2[]);
271 
272 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
273 
274 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
275                          CLUSTERCONFIG *Config,
276                          CLUSTER *Cluster);
277 
278 PROTOTYPE *MakeDegenerateProto(uinT16 N,
279                                CLUSTER *Cluster,
280                                STATISTICS *Statistics,
281                                PROTOSTYLE Style,
282                                inT32 MinSamples);
283 
284 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
285                                CLUSTERCONFIG *Config,
286                                CLUSTER *Cluster,
287                                STATISTICS *Statistics);
288 
289 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
290                               CLUSTER *Cluster,
291                               STATISTICS *Statistics,
292                               BUCKETS *Buckets);
293 
294 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
295                                CLUSTER *Cluster,
296                                STATISTICS *Statistics,
297                                BUCKETS *Buckets);
298 
299 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
300                           CLUSTER *Cluster,
301                           STATISTICS *Statistics,
302                           BUCKETS *NormalBuckets,
303                           FLOAT64 Confidence);
304 
305 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
306 
307 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
308 
309 STATISTICS *ComputeStatistics (inT16 N,
310 PARAM_DESC ParamDesc[], CLUSTER * Cluster);
311 
312 PROTOTYPE *NewSphericalProto(uinT16 N,
313                              CLUSTER *Cluster,
314                              STATISTICS *Statistics);
315 
316 PROTOTYPE *NewEllipticalProto(inT16 N,
317                               CLUSTER *Cluster,
318                               STATISTICS *Statistics);
319 
320 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
321 
322 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
323 
324 BOOL8 Independent (PARAM_DESC ParamDesc[],
325 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
326 
327 BUCKETS *GetBuckets(DISTRIBUTION Distribution,
328                     uinT32 SampleCount,
329                     FLOAT64 Confidence);
330 
331 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
332                      uinT32 SampleCount,
333                      FLOAT64 Confidence);
334 
335 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount);
336 
337 FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
338 
339 FLOAT64 NormalDensity(inT32 x);
340 
341 FLOAT64 UniformDensity(inT32 x);
342 
343 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx);
344 
345 void FillBuckets(BUCKETS *Buckets,
346                  CLUSTER *Cluster,
347                  uinT16 Dim,
348                  PARAM_DESC *ParamDesc,
349                  FLOAT32 Mean,
350                  FLOAT32 StdDev);
351 
352 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
353                     FLOAT32 x,
354                     FLOAT32 Mean,
355                     FLOAT32 StdDev);
356 
357 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
358                      FLOAT32 x,
359                      FLOAT32 Mean,
360                      FLOAT32 StdDev);
361 
362 BOOL8 DistributionOK(BUCKETS *Buckets);
363 
364 void FreeStatistics(STATISTICS *Statistics);
365 
366 void FreeBuckets(BUCKETS *Buckets);
367 
368 void FreeCluster(CLUSTER *Cluster);
369 
370 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
371 
372 int NumBucketsMatch(void *arg1,   //BUCKETS                                       *Histogram,
373                     void *arg2);  //uinT16                        *DesiredNumberOfBuckets);
374 
375 int ListEntryMatch(void *arg1, void *arg2);
376 
377 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
378 
379 void InitBuckets(BUCKETS *Buckets);
380 
381 int AlphaMatch(void *arg1,   //CHISTRUCT                             *ChiStruct,
382                void *arg2);  //CHISTRUCT                             *SearchKey);
383 
384 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
385 
386 FLOAT64 Solve(SOLVEFUNC Function,
387               void *FunctionParams,
388               FLOAT64 InitialGuess,
389               FLOAT64 Accuracy);
390 
391 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
392 
393 BOOL8 MultipleCharSamples(CLUSTERER *Clusterer,
394                           CLUSTER *Cluster,
395                           FLOAT32 MaxIllegal);
396 
397 double InvertMatrix(const float* input, int size, float* inv);
398 
399 //--------------------------Public Code--------------------------------------
400 /** MakeClusterer **********************************************************
401 Parameters:	SampleSize	number of dimensions in feature space
402       ParamDesc	description of each dimension
403 Globals:	None
404 Operation:	This routine creates a new clusterer data structure,
405       initializes it, and returns a pointer to it.
406 Return:		pointer to the new clusterer data structure
407 Exceptions:	None
408 History:	5/29/89, DSJ, Created.
409 ****************************************************************************/
410 CLUSTERER *
411 MakeClusterer (inT16 SampleSize, PARAM_DESC ParamDesc[]) {
412   CLUSTERER *Clusterer;
413   int i;
414 
415   // allocate main clusterer data structure and init simple fields
416   Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
417   Clusterer->SampleSize = SampleSize;
418   Clusterer->NumberOfSamples = 0;
419   Clusterer->NumChar = 0;
420 
421   // init fields which will not be used initially
422   Clusterer->Root = NULL;
423   Clusterer->ProtoList = NIL;
424 
425   // maintain a copy of param descriptors in the clusterer data structure
426   Clusterer->ParamDesc =
427     (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
428   for (i = 0; i < SampleSize; i++) {
429     Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
430     Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
431     Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
432     Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
433     Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
434     Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
435     Clusterer->ParamDesc[i].MidRange =
436       (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
437   }
438 
439   // allocate a kd tree to hold the samples
440   Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
441 
442   // execute hook for monitoring clustering operation
443   // (*ClustererCreationHook)( Clusterer );
444 
445   return (Clusterer);
446 }                                // MakeClusterer
447 
448 
449 /** MakeSample ***********************************************************
450 Parameters:	Clusterer	clusterer data structure to add sample to
451       Feature		feature to be added to clusterer
452       CharID		unique ident. of char that sample came from
453 Globals:	None
454 Operation:	This routine creates a new sample data structure to hold
455       the specified feature.  This sample is added to the clusterer
456       data structure (so that it knows which samples are to be
457       clustered later), and a pointer to the sample is returned to
458       the caller.
459 Return:		Pointer to the new sample data structure
460 Exceptions:	ALREADYCLUSTERED	MakeSample can't be called after
461       ClusterSamples has been called
462 History:	5/29/89, DSJ, Created.
463 *****************************************************************************/
464 SAMPLE *
465 MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID) {
466   SAMPLE *Sample;
467   int i;
468 
469   // see if the samples have already been clustered - if so trap an error
470   if (Clusterer->Root != NULL)
471     DoError (ALREADYCLUSTERED,
472       "Can't add samples after they have been clustered");
473 
474   // allocate the new sample and initialize it
475   Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
476     (Clusterer->SampleSize -
477     1) * sizeof (FLOAT32));
478   Sample->Clustered = FALSE;
479   Sample->Prototype = FALSE;
480   Sample->SampleCount = 1;
481   Sample->Left = NULL;
482   Sample->Right = NULL;
483   Sample->CharID = CharID;
484 
485   for (i = 0; i < Clusterer->SampleSize; i++)
486     Sample->Mean[i] = Feature[i];
487 
488   // add the sample to the KD tree - keep track of the total # of samples
489   Clusterer->NumberOfSamples++;
490   KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
491   if (CharID >= Clusterer->NumChar)
492     Clusterer->NumChar = CharID + 1;
493 
494   // execute hook for monitoring clustering operation
495   // (*SampleCreationHook)( Sample );
496 
497   return (Sample);
498 }                                // MakeSample
499 
500 
501 /** ClusterSamples ***********************************************************
502 Parameters:	Clusterer	data struct containing samples to be clustered
503       Config		parameters which control clustering process
504 Globals:	None
505 Operation:	This routine first checks to see if the samples in this
506       clusterer have already been clustered before; if so, it does
507       not bother to recreate the cluster tree.  It simply recomputes
508       the prototypes based on the new Config info.
509         If the samples have not been clustered before, the
510       samples in the KD tree are formed into a cluster tree and then
511       the prototypes are computed from the cluster tree.
512         In either case this routine returns a pointer to a
513       list of prototypes that best represent the samples given
514       the constraints specified in Config.
515 Return:		Pointer to a list of prototypes
516 Exceptions:	None
517 History:	5/29/89, DSJ, Created.
518 *******************************************************************************/
519 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
520   //only create cluster tree if samples have never been clustered before
521   if (Clusterer->Root == NULL)
522     CreateClusterTree(Clusterer);
523 
524   //deallocate the old prototype list if one exists
525   FreeProtoList (&Clusterer->ProtoList);
526   Clusterer->ProtoList = NIL;
527 
528   //compute prototypes starting at the root node in the tree
529   ComputePrototypes(Clusterer, Config);
530   return (Clusterer->ProtoList);
531 }                                // ClusterSamples
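/* Illustrative usage sketch of the public interface above; the configuration
   values are arbitrary examples and CLUSTERCONFIG itself is defined in
   cluster.h (only the fields referenced in this file are shown):

     PARAM_DESC ParamDesc[2];          // fill in Circular, NonEssential, Min, Max
     CLUSTERCONFIG Config;
     Config.ProtoStyle = elliptical;
     Config.MinSamples = 0.5;
     Config.MaxIllegal = 0.05;
     Config.Independence = 0.75;
     Config.Confidence = 0.05;
     Config.MagicSamples = 0;

     CLUSTERER *Clusterer = MakeClusterer (2, ParamDesc);
     for (each training sample)
       MakeSample (Clusterer, Feature, CharID);
     LIST ProtoList = ClusterSamples (Clusterer, &Config);
     ... use ProtoList ...
     FreeClusterer (Clusterer);
     FreeProtoList (&ProtoList);
*/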
532 
533 
534 /** FreeClusterer *************************************************************
535 Parameters:	Clusterer	pointer to data structure to be freed
536 Globals:	None
537 Operation:	This routine frees all of the memory allocated to the
538       specified data structure.  It will not, however, free
539       the memory used by the prototype list.  The pointers to
540       the clusters for each prototype in the list will be set
541       to NULL to indicate that the cluster data structures no
542       longer exist.  Any sample lists that have been obtained
543       via calls to GetSamples are no longer valid.
544 Return:		None
545 Exceptions:	None
546 History:	6/6/89, DSJ, Created.
547 *******************************************************************************/
548 void FreeClusterer(CLUSTERER *Clusterer) {
549   if (Clusterer != NULL) {
550     memfree (Clusterer->ParamDesc);
551     if (Clusterer->KDTree != NULL)
552       FreeKDTree (Clusterer->KDTree);
553     if (Clusterer->Root != NULL)
554       FreeCluster (Clusterer->Root);
555     iterate (Clusterer->ProtoList) {
556       ((PROTOTYPE *) (first_node (Clusterer->ProtoList)))->Cluster = NULL;
557     }
558     memfree(Clusterer);
559   }
560 }                                // FreeClusterer
561 
562 
563 /** FreeProtoList ************************************************************
564 Parameters:	ProtoList	pointer to list of prototypes to be freed
565 Globals:	None
566 Operation:	This routine frees all of the memory allocated to the
567       specified list of prototypes.  The clusters which are
568       pointed to by the prototypes are not freed.
569 Return:		None
570 Exceptions:	None
571 History:	6/6/89, DSJ, Created.
572 *****************************************************************************/
573 void FreeProtoList(LIST *ProtoList) {
574   destroy_nodes(*ProtoList, FreePrototype);
575 }                                // FreeProtoList
576 
577 
578 /** FreePrototype ************************************************************
579 Parameters:	Prototype	prototype data structure to be deallocated
580 Globals:	None
581 Operation:	This routine deallocates the memory consumed by the specified
582       prototype and modifies the corresponding cluster so that it
583       is no longer marked as a prototype.  The cluster is NOT
584       deallocated by this routine.
585 Return:		None
586 Exceptions:	None
587 History:	5/30/89, DSJ, Created.
588 *******************************************************************************/
589 void FreePrototype(void *arg) {  //PROTOTYPE     *Prototype)
590   PROTOTYPE *Prototype = (PROTOTYPE *) arg;
591 
592   // unmark the corresponding cluster (if there is one)
593   if (Prototype->Cluster != NULL)
594     Prototype->Cluster->Prototype = FALSE;
595 
596   // deallocate the prototype statistics and then the prototype itself
597   if (Prototype->Distrib != NULL)
598     memfree (Prototype->Distrib);
599   if (Prototype->Mean != NULL)
600     memfree (Prototype->Mean);
601   if (Prototype->Style != spherical) {
602     if (Prototype->Variance.Elliptical != NULL)
603       memfree (Prototype->Variance.Elliptical);
604     if (Prototype->Magnitude.Elliptical != NULL)
605       memfree (Prototype->Magnitude.Elliptical);
606     if (Prototype->Weight.Elliptical != NULL)
607       memfree (Prototype->Weight.Elliptical);
608   }
609   memfree(Prototype);
610 }                                // FreePrototype
611 
612 
613 /** NextSample ************************************************************
614 Parameters:	SearchState	ptr to list containing clusters to be searched
615 Globals:	None
616 Operation:	This routine is used to find all of the samples which
617       belong to a cluster.  It starts by removing the top
618       cluster on the cluster list (SearchState).  If this cluster is
619       a leaf it is returned.  Otherwise, the right subcluster
620       is pushed on the list and we continue the search in the
621       left subcluster.  This continues until a leaf is found.
622       If all samples have been found, NULL is returned.
623       InitSampleSearch() must be called
624       before NextSample() to initialize the search.
625 Return:		Pointer to the next leaf cluster (sample) or NULL.
626 Exceptions:	None
627 History:	6/16/89, DSJ, Created.
628 ****************************************************************************/
629 CLUSTER *NextSample(LIST *SearchState) {
630   CLUSTER *Cluster;
631 
632   if (*SearchState == NIL)
633     return (NULL);
634   Cluster = (CLUSTER *) first_node (*SearchState);
635   *SearchState = pop (*SearchState);
636   while (TRUE) {
637     if (Cluster->Left == NULL)
638       return (Cluster);
639     *SearchState = push (*SearchState, Cluster->Right);
640     Cluster = Cluster->Left;
641   }
642 }                                // NextSample
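/* Illustrative sketch of the usual search loop; InitSampleSearch() is the
   initialization macro referred to above (see cluster.h):

     LIST SearchState;
     CLUSTER *Sample;
     InitSampleSearch (SearchState, Cluster);
     while ((Sample = NextSample (&SearchState)) != NULL) {
       ... process one leaf sample ...
     }
*/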
643 
644 
645 /** Mean ***********************************************************
646 Parameters:	Proto		prototype to return mean of
647       Dimension	dimension whose mean is to be returned
648 Globals:	none
649 Operation:	This routine returns the mean of the specified
650       prototype in the indicated dimension.
651 Return:		Mean of Prototype in Dimension
652 Exceptions: none
653 History:	7/6/89, DSJ, Created.
654 *********************************************************************/
655 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
656   return (Proto->Mean[Dimension]);
657 }                                // Mean
658 
659 
660 /** StandardDeviation *************************************************
661 Parameters:	Proto		prototype to return standard deviation of
662       Dimension	dimension whose stddev is to be returned
663 Globals:	none
664 Operation:	This routine returns the standard deviation of the
665       prototype in the indicated dimension.
666 Return:		Standard deviation of Prototype in Dimension
667 Exceptions: none
668 History:	7/6/89, DSJ, Created.
669 **********************************************************************/
670 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
671   switch (Proto->Style) {
672     case spherical:
673       return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
674     case elliptical:
675       return ((FLOAT32)
676         sqrt ((double) Proto->Variance.Elliptical[Dimension]));
677     case mixed:
678       switch (Proto->Distrib[Dimension]) {
679         case normal:
680           return ((FLOAT32)
681             sqrt ((double) Proto->Variance.Elliptical[Dimension]));
682         case uniform:
683         case D_random:
684           return (Proto->Variance.Elliptical[Dimension]);
685       }
686   }
687   return 0.0f;
688 }                                // StandardDeviation
689 
690 
691 /*---------------------------------------------------------------------------
692             Private Code
693 ----------------------------------------------------------------------------*/
694 /** CreateClusterTree *******************************************************
695 Parameters:	Clusterer	data structure holding samples to be clustered
696 Globals:	Tree		kd-tree holding samples
697       TempCluster	array of temporary clusters
698       CurrentTemp	index of next temp cluster to be used
699       Heap		heap used to hold temp clusters - "best" on top
700 Operation:	This routine performs a bottom-up clustering on the samples
701       held in the kd-tree of the Clusterer data structure.  The
702       result is a cluster tree.  Each node in the tree represents
703       a cluster which conceptually contains a subset of the samples.
704       More precisely, the cluster contains all of the samples which
705       are contained in its two sub-clusters.  The leaves of the
706       tree are the individual samples themselves; they have no
707       sub-clusters.  The root node of the tree conceptually contains
708       all of the samples.
709 Return:		None (the Clusterer data structure is changed)
710 Exceptions:	None
711 History:	5/29/89, DSJ, Created.
712 ******************************************************************************/
713 void CreateClusterTree(CLUSTERER *Clusterer) {
714   HEAPENTRY HeapEntry;
715   TEMPCLUSTER *PotentialCluster;
716 
717   // save the kd-tree in a global variable so kd-tree walker can get at it
718   Tree = Clusterer->KDTree;
719 
720   // allocate memory to hold all of the "potential" clusters
721   TempCluster = (TEMPCLUSTER *)
722     Emalloc (Clusterer->NumberOfSamples * sizeof (TEMPCLUSTER));
723   CurrentTemp = 0;
724 
725   // each sample and its nearest neighbor form a "potential" cluster
726   // save these in a heap with the "best" potential clusters on top
727   Heap = MakeHeap (Clusterer->NumberOfSamples);
728   KDWalk (Tree, (void_proc) MakePotentialClusters);
729 
730   // form potential clusters into actual clusters - always do "best" first
731   while (GetTopOfHeap (Heap, &HeapEntry) != EMPTY) {
732     PotentialCluster = (TEMPCLUSTER *) (HeapEntry.Data);
733 
734     // if main cluster of potential cluster is already in another cluster
735     // then we don't need to worry about it
736     if (PotentialCluster->Cluster->Clustered) {
737       continue;
738     }
739 
740     // if main cluster is not yet clustered, but its nearest neighbor is
741     // then we must find a new nearest neighbor
742     else if (PotentialCluster->Neighbor->Clustered) {
743       PotentialCluster->Neighbor =
744         FindNearestNeighbor (Tree, PotentialCluster->Cluster,
745         &(HeapEntry.Key));
746       if (PotentialCluster->Neighbor != NULL) {
747         HeapStore(Heap, &HeapEntry);
748       }
749     }
750 
751     // if neither cluster is already clustered, form permanent cluster
752     else {
753       PotentialCluster->Cluster =
754         MakeNewCluster(Clusterer, PotentialCluster);
755       PotentialCluster->Neighbor =
756         FindNearestNeighbor (Tree, PotentialCluster->Cluster,
757         &(HeapEntry.Key));
758       if (PotentialCluster->Neighbor != NULL) {
759         HeapStore(Heap, &HeapEntry);
760       }
761     }
762   }
763 
764   // the root node in the cluster tree is now the only node in the kd-tree
765   Clusterer->Root = (CLUSTER *) RootOf (Clusterer->KDTree);
766 
767   // free up the memory used by the K-D tree, heap, and temp clusters
768   FreeKDTree(Tree);
769   Clusterer->KDTree = NULL;
770   FreeHeap(Heap);
771   memfree(TempCluster);
772 }                                // CreateClusterTree
773 
774 
775 /** MakePotentialClusters **************************************************
776 Parameters:	Cluster	current cluster being visited in kd-tree walk
777       Order	order in which cluster is being visited
778       Level	level of this cluster in the kd-tree
779 Globals:	Tree		kd-tree to be searched for neighbors
780       TempCluster	array of temporary clusters
781       CurrentTemp	index of next temp cluster to be used
782       Heap		heap used to hold temp clusters - "best" on top
783 Operation:	This routine is designed to be used in concert with the
784       KDWalk routine.  It will create a potential cluster for
785       each sample in the kd-tree that is being walked.  This
786       potential cluster will then be pushed on the heap.
787 Return:		none
788 Exceptions: none
789 History:	5/29/89, DSJ, Created.
790       7/13/89, DSJ, Removed visibility of kd-tree node data struct.
791 ******************************************************************************/
792 void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, inT32 Level) {
793   HEAPENTRY HeapEntry;
794 
795   if ((Order == preorder) || (Order == leaf)) {
796     TempCluster[CurrentTemp].Cluster = Cluster;
797     HeapEntry.Data = (char *) &(TempCluster[CurrentTemp]);
798     TempCluster[CurrentTemp].Neighbor =
799       FindNearestNeighbor (Tree, TempCluster[CurrentTemp].Cluster,
800       &(HeapEntry.Key));
801     if (TempCluster[CurrentTemp].Neighbor != NULL) {
802       HeapStore(Heap, &HeapEntry);
803       CurrentTemp++;
804     }
805   }
806 }                                // MakePotentialClusters
807 
808 
809 /** FindNearestNeighbor *********************************************************
810 Parameters:	Tree		kd-tree to search in for nearest neighbor
811       Cluster		cluster whose nearest neighbor is to be found
812       Distance	ptr to variable to report distance found
813 Globals:	none
814 Operation:	This routine searches the specified kd-tree for the nearest
815       neighbor of the specified cluster.  It actually uses the
816       kd routines to find the 2 nearest neighbors since one of them
817       will be the original cluster.  A pointer to the nearest
818       neighbor is returned, if it can be found, otherwise NULL is
819       returned.  The distance between the 2 nodes is placed
820       in the specified variable.
821 Return:		Pointer to the nearest neighbor of Cluster, or NULL
822 Exceptions: none
823 History:	5/29/89, DSJ, Created.
824       7/13/89, DSJ, Removed visibility of kd-tree node data struct
825 ********************************************************************************/
826 CLUSTER *
827 FindNearestNeighbor (KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
828 #define MAXNEIGHBORS  2
829 #define MAXDISTANCE   MAX_FLOAT32
830 {
831   CLUSTER *Neighbor[MAXNEIGHBORS];
832   FLOAT32 Dist[MAXNEIGHBORS];
833   inT32 NumberOfNeighbors;
834   inT32 i;
835   CLUSTER *BestNeighbor;
836 
837   // find the 2 nearest neighbors of the cluster
838   NumberOfNeighbors = KDNearestNeighborSearch
839     (Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE, Neighbor, Dist);
840 
841   // search for the nearest neighbor that is not the cluster itself
842   *Distance = MAXDISTANCE;
843   BestNeighbor = NULL;
844   for (i = 0; i < NumberOfNeighbors; i++) {
845     if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
846       *Distance = Dist[i];
847       BestNeighbor = Neighbor[i];
848     }
849   }
850   return (BestNeighbor);
851 }                                // FindNearestNeighbor
852 
853 
854 /** MakeNewCluster *************************************************************
855 Parameters:	Clusterer	current clustering environment
856       TempCluster	potential cluster to make permanent
857 Globals:	none
858 Operation:	This routine creates a new permanent cluster from the
859       clusters specified in TempCluster.  The 2 clusters in
860       TempCluster are marked as "clustered" and deleted from
861       the kd-tree.  The new cluster is then added to the kd-tree.
862 Return:		Pointer to the new permanent cluster
863 Exceptions:	none
864 History:	5/29/89, DSJ, Created.
865       7/13/89, DSJ, Removed visibility of kd-tree node data struct
866 ********************************************************************************/
867 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
868   CLUSTER *Cluster;
869 
870   // allocate the new cluster and initialize it
871   Cluster = (CLUSTER *) Emalloc (sizeof (CLUSTER) +
872     (Clusterer->SampleSize -
873     1) * sizeof (FLOAT32));
874   Cluster->Clustered = FALSE;
875   Cluster->Prototype = FALSE;
876   Cluster->Left = TempCluster->Cluster;
877   Cluster->Right = TempCluster->Neighbor;
878   Cluster->CharID = -1;
879 
880   // mark the old clusters as "clustered" and delete them from the kd-tree
881   Cluster->Left->Clustered = TRUE;
882   Cluster->Right->Clustered = TRUE;
883   KDDelete (Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
884   KDDelete (Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
885 
886   // compute the mean and sample count for the new cluster
887   Cluster->SampleCount =
888     MergeClusters (Clusterer->SampleSize, Clusterer->ParamDesc,
889     Cluster->Left->SampleCount, Cluster->Right->SampleCount,
890     Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
891 
892   // add the new cluster to the KD tree
893   KDStore (Clusterer->KDTree, Cluster->Mean, Cluster);
894   return (Cluster);
895 }                                // MakeNewCluster
896 
897 
898 /** MergeClusters ************************************************************
899 Parameters:	N	# of dimensions (size of arrays)
900       ParamDesc	array of dimension descriptions
901       n1, n2	number of samples in each old cluster
902       m	array to hold mean of new cluster
903       m1, m2	arrays containing means of old clusters
904 Globals:	None
905 Operation:	This routine merges two clusters into one larger cluster.
906       To do this it computes the number of samples in the new
907       cluster and the mean of the new cluster.  The ParamDesc
908       information is used to ensure that circular dimensions
909       are handled correctly.
910 Return:		The number of samples in the new cluster.
911 Exceptions:	None
912 History:	5/31/89, DSJ, Created.
913 *********************************************************************************/
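/* Worked example (illustrative): for a circular dimension with Min = 0 and
   Range = 360, merging equally weighted clusters with means 350 and 10 gives
   a merged mean of 0 (the short way around the circle), not the naive
   average of 180. */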
914 inT32
915 MergeClusters (inT16 N,
916 register PARAM_DESC ParamDesc[],
917 register inT32 n1,
918 register inT32 n2,
919 register FLOAT32 m[],
920 register FLOAT32 m1[], register FLOAT32 m2[]) {
921   register inT32 i, n;
922 
923   n = n1 + n2;
924   for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
925     if (ParamDesc->Circular) {
926       // if distance between means is greater than allowed
927       // reduce upper point by one "rotation" to compute mean
928       // then normalize the mean back into the accepted range
929       if ((*m2 - *m1) > ParamDesc->HalfRange) {
930         *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
931         if (*m < ParamDesc->Min)
932           *m += ParamDesc->Range;
933       }
934       else if ((*m1 - *m2) > ParamDesc->HalfRange) {
935         *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
936         if (*m < ParamDesc->Min)
937           *m += ParamDesc->Range;
938       }
939       else
940         *m = (n1 * *m1 + n2 * *m2) / n;
941     }
942     else
943       *m = (n1 * *m1 + n2 * *m2) / n;
944   }
945   return (n);
946 }                                // MergeClusters
947 
948 
949 /** ComputePrototypes *******************************************************
950 Parameters:	Clusterer	data structure holding cluster tree
951       Config		parameters used to control prototype generation
952 Globals:	None
953 Operation:	This routine decides which clusters in the cluster tree
954       should be represented by prototypes, forms a list of these
955       prototypes, and places the list in the Clusterer data
956       structure.
957 Return:		None
958 Exceptions:	None
959 History:	5/30/89, DSJ, Created.
960 *******************************************************************************/
961 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
962   LIST ClusterStack = NIL;
963   CLUSTER *Cluster;
964   PROTOTYPE *Prototype;
965 
966   // use a stack to keep track of clusters waiting to be processed
967   // initially the only cluster on the stack is the root cluster
968   if (Clusterer->Root != NULL)
969     ClusterStack = push (NIL, Clusterer->Root);
970 
971   // loop until we have analyzed all clusters which are potential prototypes
972   while (ClusterStack != NIL) {
973     // remove the next cluster to be analyzed from the stack
974     // try to make a prototype from the cluster
975     // if successful, put it on the proto list, else split the cluster
976     Cluster = (CLUSTER *) first_node (ClusterStack);
977     ClusterStack = pop (ClusterStack);
978     Prototype = MakePrototype (Clusterer, Config, Cluster);
979     if (Prototype != NULL) {
980       Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
981     }
982     else {
983       ClusterStack = push (ClusterStack, Cluster->Right);
984       ClusterStack = push (ClusterStack, Cluster->Left);
985     }
986   }
987 }                                // ComputePrototypes
988 
989 
990 /** MakePrototype ***********************************************************
991 Parameters:	Clusterer	data structure holding cluster tree
992       Config		parameters used to control prototype generation
993       Cluster		cluster to be made into a prototype
994 Globals:	None
995 Operation:	This routine attempts to create a prototype from the
996       specified cluster that conforms to the distribution
997       specified in Config.  If there are too few samples in the
998       cluster to perform a statistical analysis, then a prototype
999       is generated but labelled as insignificant.  If the
1000       dimensions of the cluster are not independent, no prototype
1001       is generated and NULL is returned.  If a prototype can be
1002       found that matches the desired distribution then a pointer
1003       to it is returned, otherwise NULL is returned.
1004 Return:		Pointer to new prototype or NULL
1005 Exceptions:	None
1006 History:	6/19/89, DSJ, Created.
1007 *******************************************************************************/
1008 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
1009                          CLUSTERCONFIG *Config,
1010                          CLUSTER *Cluster) {
1011   STATISTICS *Statistics;
1012   PROTOTYPE *Proto;
1013   BUCKETS *Buckets;
1014 
1015   // filter out clusters which contain samples from the same character
1016   if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
1017     return (NULL);
1018 
1019   // compute the covariance matrix and ranges for the cluster
1020   Statistics =
1021     ComputeStatistics (Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
1022 
1023   // check for degenerate clusters which need not be analyzed further
1024   // note that the MinSamples test assumes that all clusters with multiple
1025   // character samples have been removed (as above)
1026   Proto = MakeDegenerateProto (Clusterer->SampleSize, Cluster, Statistics,
1027     Config->ProtoStyle,
1028     (inT32) (Config->MinSamples *
1029     Clusterer->NumChar));
1030   if (Proto != NULL) {
1031     FreeStatistics(Statistics);
1032     return (Proto);
1033   }
1034   // check to ensure that all dimensions are independent
1035   if (!Independent (Clusterer->ParamDesc, Clusterer->SampleSize,
1036   Statistics->CoVariance, Config->Independence)) {
1037     FreeStatistics(Statistics);
1038     return (NULL);
1039   }
1040 
1041   if (HOTELLING && Config->ProtoStyle == elliptical) {
1042     Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1043     if (Proto != NULL) {
1044       FreeStatistics(Statistics);
1045       return Proto;
1046     }
1047   }
1048 
1049   // create a histogram data structure used to evaluate distributions
1050   Buckets = GetBuckets (normal, Cluster->SampleCount, Config->Confidence);
1051 
1052   // create a prototype based on the statistics and test it
1053   switch (Config->ProtoStyle) {
1054     case spherical:
1055       Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
1056       break;
1057     case elliptical:
1058       Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
1059       break;
1060     case mixed:
1061       Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
1062         Config->Confidence);
1063       break;
1064     case automatic:
1065       Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
1066       if (Proto != NULL)
1067         break;
1068       Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
1069       if (Proto != NULL)
1070         break;
1071       Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
1072         Config->Confidence);
1073       break;
1074   }
1075   FreeBuckets(Buckets);
1076   FreeStatistics(Statistics);
1077   return (Proto);
1078 }                                // MakePrototype
1079 
1080 
1081 /** MakeDegenerateProto ******************************************************
1082 Parameters:	N		number of dimensions
1083       Cluster		cluster being analyzed
1084       Statistics	statistical info about cluster
1085       Style		type of prototype to be generated
1086       MinSamples	minimum number of samples in a cluster
1087 Globals:	None
1088 Operation:	This routine checks for clusters which are degenerate and
1089       therefore cannot be analyzed in a statistically valid way.
1090       A cluster is defined as degenerate if it does not have at
1091       least MINSAMPLESNEEDED samples in it.  If the cluster is
1092       found to be degenerate, a prototype of the specified style
1093       is generated and marked as insignificant.  A cluster is
1094       also degenerate if it does not have at least MinSamples
1095       samples in it.
1096       If the cluster is not degenerate, NULL is returned.
1097 Return:		Pointer to degenerate prototype or NULL.
1098 Exceptions:	None
1099 History:	6/20/89, DSJ, Created.
1100       7/12/89, DSJ, Changed name and added check for 0 stddev.
1101       8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).
1102 ********************************************************************************/
1103 PROTOTYPE *MakeDegenerateProto(  //this was MinSample
1104                                uinT16 N,
1105                                CLUSTER *Cluster,
1106                                STATISTICS *Statistics,
1107                                PROTOSTYLE Style,
1108                                inT32 MinSamples) {
1109   PROTOTYPE *Proto = NULL;
1110 
1111   if (MinSamples < MINSAMPLESNEEDED)
1112     MinSamples = MINSAMPLESNEEDED;
1113 
1114   if (Cluster->SampleCount < MinSamples) {
1115     switch (Style) {
1116       case spherical:
1117         Proto = NewSphericalProto (N, Cluster, Statistics);
1118         break;
1119       case elliptical:
1120       case automatic:
1121         Proto = NewEllipticalProto (N, Cluster, Statistics);
1122         break;
1123       case mixed:
1124         Proto = NewMixedProto (N, Cluster, Statistics);
1125         break;
1126     }
1127     Proto->Significant = FALSE;
1128   }
1129   return (Proto);
1130 }                                // MakeDegenerateProto
1131 
1132 /** TestEllipticalProto ****************************************************
1133 Parameters:	Clusterer	data struct containing samples being clustered
1134       Config provides the magic number of samples that make a good cluster
1135       Cluster		cluster to be made into an elliptical prototype
1136       Statistics	statistical info about cluster
1137 Globals:	None
1138 Operation:	This routine tests the specified cluster to see if
1139       there is a statistically significant difference between
1140       the sub-clusters that would be made if the cluster were to
1141       be split.  If not, then a new prototype is formed and
1142       returned to the caller.  If there is, then NULL is returned
1143       to the caller.
1144 Return:		Pointer to new elliptical prototype or NULL.
1145 ****************************************************************************/
1146 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
1147                                CLUSTERCONFIG *Config,
1148                                CLUSTER *Cluster,
1149                                STATISTICS *Statistics) {
1150   // Fraction of the number of samples used as a range around 1 within
1151   // which a cluster has the magic size that allows a boost to the
1152   // FTable by kFTableBoostMargin, thus allowing clusters near the
1153   // magic size (equal to the number of sample characters) to be more
1154   // likely to stay together.
1155   const double kMagicSampleMargin = 0.0625;
1156   const double kFTableBoostMargin = 2.0;
1157 
1158   int N = Clusterer->SampleSize;
1159   CLUSTER* Left = Cluster->Left;
1160   CLUSTER* Right = Cluster->Right;
1161   if (Left == NULL || Right == NULL)
1162     return NULL;
1163   int TotalDims = Left->SampleCount + Right->SampleCount;
1164   if (TotalDims < N + 1 || TotalDims < 2)
1165     return NULL;
1166   const int kMatrixSize = N * N * sizeof(FLOAT32);
1167   FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1168   FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1169   FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1170   // Compute a new covariance matrix that only uses essential features.
1171   for (int i = 0; i < N; ++i) {
1172     int row_offset = i * N;
1173     if (!Clusterer->ParamDesc[i].NonEssential) {
1174       for (int j = 0; j < N; ++j) {
1175         if (!Clusterer->ParamDesc[j].NonEssential)
1176           Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1177         else
1178           Covariance[j + row_offset] = 0.0f;
1179       }
1180     } else {
1181       for (int j = 0; j < N; ++j) {
1182         if (i == j)
1183           Covariance[j + row_offset] = 1.0f;
1184         else
1185           Covariance[j + row_offset] = 0.0f;
1186       }
1187     }
1188   }
1189   double err = InvertMatrix(Covariance, N, Inverse);
1190   if (err > 1) {
1191     tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1192   }
1193   int EssentialN = 0;
1194   for (int dim = 0; dim < N; ++dim) {
1195     if (!Clusterer->ParamDesc[dim].NonEssential) {
1196       Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1197       ++EssentialN;
1198     } else {
1199       Delta[dim] = 0.0f;
1200     }
1201   }
1202   // Compute Hotelling's T-squared.
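  // i.e. Tsq = Delta' * Inverse * Delta, where Delta is the difference
  // between the two sub-cluster means and Inverse is the (masked) inverse
  // covariance matrix computed above.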
1203   double Tsq = 0.0;
1204   for (int x = 0; x < N; ++x) {
1205     double temp = 0.0;
1206     for (int y = 0; y < N; ++y) {
1207       temp += Inverse[y + N*x] * Delta[y];
1208     }
1209     Tsq += Delta[x] * temp;
1210   }
1211   memfree(Covariance);
1212   memfree(Inverse);
1213   memfree(Delta);
1214   // Changed this function to match the formula in
1215   // Statistical Methods in Medical Research p 473
1216   // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1217   // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1218   double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
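  // For a two-sample Hotelling's test with EssentialN essential dimensions
  // and TotalDims samples, F is (approximately) referred to an F distribution
  // with (EssentialN, TotalDims - EssentialN - 1) degrees of freedom; the
  // FTable lookup below uses those indices, clipped to the table size.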
1219   int Fx = EssentialN;
1220   if (Fx > FTABLE_X)
1221     Fx = FTABLE_X;
1222   --Fx;
1223   int Fy = TotalDims - EssentialN - 1;
1224   if (Fy > FTABLE_Y)
1225     Fy = FTABLE_Y;
1226   --Fy;
1227   double FTarget = FTable[Fy][Fx];
1228   if (Config->MagicSamples > 0 &&
1229       TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1230       TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1231     // Give magic-sized clusters a magic FTable boost.
1232     FTarget += kFTableBoostMargin;
1233   }
1234   if (F < FTarget) {
1235     return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1236   }
1237   return NULL;
1238 }
1239 
1240 /* MakeSphericalProto *******************************************************
1241 Parameters:	Clusterer	data struct containing samples being clustered
1242       Cluster		cluster to be made into a spherical prototype
1243       Statistics	statistical info about cluster
1244       Buckets		histogram struct used to analyze distribution
1245 Globals:	None
1246 Operation:	This routine tests the specified cluster to see if it can
1247       be approximated by a spherical normal distribution.  If it
1248       can be, then a new prototype is formed and returned to the
1249       caller.  If it can't be, then NULL is returned to the caller.
1250 Return:		Pointer to new spherical prototype or NULL.
1251 Exceptions:	None
1252 History:	6/1/89, DSJ, Created.
1253 ******************************************************************************/
1254 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
1255                               CLUSTER *Cluster,
1256                               STATISTICS *Statistics,
1257                               BUCKETS *Buckets) {
1258   PROTOTYPE *Proto = NULL;
1259   int i;
1260 
1261   // check that each dimension is a normal distribution
1262   for (i = 0; i < Clusterer->SampleSize; i++) {
1263     if (Clusterer->ParamDesc[i].NonEssential)
1264       continue;
1265 
1266     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1267       Cluster->Mean[i],
1268       sqrt ((FLOAT64) (Statistics->AvgVariance)));
1269     if (!DistributionOK (Buckets))
1270       break;
1271   }
1272   // if all dimensions matched a normal distribution, make a proto
1273   if (i >= Clusterer->SampleSize)
1274     Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1275   return (Proto);
1276 }                                // MakeSphericalProto
1277 
1278 
1279 /** MakeEllipticalProto ****************************************************
1280 Parameters:	Clusterer	data struct containing samples being clustered
1281       Cluster		cluster to be made into an elliptical prototype
1282       Statistics	statistical info about cluster
1283       Buckets		histogram struct used to analyze distribution
1284 Globals:	None
1285 Operation:	This routine tests the specified cluster to see if it can
1286       be approximated by an elliptical normal distribution.  If it
1287       can be, then a new prototype is formed and returned to the
1288       caller.  If it can't be, then NULL is returned to the caller.
1289 Return:		Pointer to new elliptical prototype or NULL.
1290 Exceptions:	None
1291 History:	6/12/89, DSJ, Created.
1292 ****************************************************************************/
1293 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
1294                                CLUSTER *Cluster,
1295                                STATISTICS *Statistics,
1296                                BUCKETS *Buckets) {
1297   PROTOTYPE *Proto = NULL;
1298   int i;
1299 
1300   // check that each dimension is a normal distribution
1301   for (i = 0; i < Clusterer->SampleSize; i++) {
1302     if (Clusterer->ParamDesc[i].NonEssential)
1303       continue;
1304 
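    // CoVariance[i * (SampleSize + 1)] indexes the (i,i) diagonal entry of
    // the row-major N x N covariance matrix, i.e. the variance of dimension i.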
1305     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1306       Cluster->Mean[i],
1307       sqrt ((FLOAT64) Statistics->
1308       CoVariance[i * (Clusterer->SampleSize + 1)]));
1309     if (!DistributionOK (Buckets))
1310       break;
1311   }
1312   // if all dimensions matched a normal distribution, make a proto
1313   if (i >= Clusterer->SampleSize)
1314     Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1315   return (Proto);
1316 }                                // MakeEllipticalProto
1317 
1318 
1319 /** MakeMixedProto ***********************************************************
1320 Parameters:	Clusterer	data struct containing samples being clustered
1321       Cluster		cluster to be made into a prototype
1322       Statistics	statistical info about cluster
1323       NormalBuckets	histogram struct used to analyze distribution
1324       Confidence	confidence level for alternate distributions
1325 Globals:	None
1326 Operation:	This routine tests each dimension of the specified cluster to
1327       see what distribution would best approximate that dimension.
1328       Each dimension is compared to the following distributions
1329       in order: normal, random, uniform.  If each dimension can
1330       be represented by one of these distributions,
1331       then a new prototype is formed and returned to the
1332       caller.  If it can't be, then NULL is returned to the caller.
1333 Return:		Pointer to new mixed prototype or NULL.
1334 Exceptions:	None
1335 History:	6/12/89, DSJ, Created.
1336 ********************************************************************************/
1337 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
1338                           CLUSTER *Cluster,
1339                           STATISTICS *Statistics,
1340                           BUCKETS *NormalBuckets,
1341                           FLOAT64 Confidence) {
1342   PROTOTYPE *Proto;
1343   int i;
1344   BUCKETS *UniformBuckets = NULL;
1345   BUCKETS *RandomBuckets = NULL;
1346 
1347   // create a mixed proto to work on - initially assume all dimensions normal
1348   Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1349 
1350   // find the proper distribution for each dimension
1351   for (i = 0; i < Clusterer->SampleSize; i++) {
1352     if (Clusterer->ParamDesc[i].NonEssential)
1353       continue;
1354 
1355     FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1356       Proto->Mean[i],
1357       sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1358     if (DistributionOK (NormalBuckets))
1359       continue;
1360 
1361     if (RandomBuckets == NULL)
1362       RandomBuckets =
1363         GetBuckets (D_random, Cluster->SampleCount, Confidence);
1364     MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1365     FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1366       Proto->Mean[i], Proto->Variance.Elliptical[i]);
1367     if (DistributionOK (RandomBuckets))
1368       continue;
1369 
1370     if (UniformBuckets == NULL)
1371       UniformBuckets =
1372         GetBuckets (uniform, Cluster->SampleCount, Confidence);
1373     MakeDimUniform(i, Proto, Statistics);
1374     FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1375       Proto->Mean[i], Proto->Variance.Elliptical[i]);
1376     if (DistributionOK (UniformBuckets))
1377       continue;
1378     break;
1379   }
1380   // if any dimension failed to match a distribution, discard the proto
1381   if (i < Clusterer->SampleSize) {
1382     FreePrototype(Proto);
1383     Proto = NULL;
1384   }
1385   if (UniformBuckets != NULL)
1386     FreeBuckets(UniformBuckets);
1387   if (RandomBuckets != NULL)
1388     FreeBuckets(RandomBuckets);
1389   return (Proto);
1390 }                                // MakeMixedProto
1391 
1392 
1393 /* MakeDimRandom *************************************************************
1394 Parameters:	i		index of dimension to be changed
1395       Proto		prototype whose dimension is to be altered
1396       ParamDesc	description of specified dimension
1397 Globals:	None
1398 Operation:	This routine alters the ith dimension of the specified
1399       mixed prototype to be D_random.
1400 Return:		None
1401 Exceptions:	None
1402 History:	6/20/89, DSJ, Created.
1403 ******************************************************************************/
1404 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
1405   Proto->Distrib[i] = D_random;
1406   Proto->Mean[i] = ParamDesc->MidRange;
1407   Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1408 
1409   // divide out this dimension's previous magnitude (the total is a product)
1410   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1411   Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1412   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1413   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1414 
1415   // note that the proto Weight is irrelevant for D_random protos
1416 }                                // MakeDimRandom
1417 
1418 
1419 /** MakeDimUniform ***********************************************************
1420 Parameters:	i		index of dimension to be changed
1421       Proto		prototype whose dimension is to be altered
1422       Statistics	statistical info about prototype
1423 Globals:	None
1424 Operation:	This routine alters the ith dimension of the specified
1425       mixed prototype to be uniform.
1426 Return:		None
1427 Exceptions:	None
1428 History:	6/20/89, DSJ, Created.
1429 ******************************************************************************/
1430 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
1431   Proto->Distrib[i] = uniform;
1432   Proto->Mean[i] = Proto->Cluster->Mean[i] +
1433     (Statistics->Min[i] + Statistics->Max[i]) / 2;
1434   Proto->Variance.Elliptical[i] =
1435     (Statistics->Max[i] - Statistics->Min[i]) / 2;
1436   if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1437     Proto->Variance.Elliptical[i] = MINVARIANCE;
1438 
1439   // divide out this dimension's previous magnitude (the total is a product)
1440   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1441   Proto->Magnitude.Elliptical[i] =
1442     1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1443   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1444   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1445 
1446   // note that the proto Weight is irrelevant for uniform protos
1447 }                                // MakeDimUniform
1448 
1449 
1450 /** ComputeStatistics *********************************************************
1451 Parameters:	N		number of dimensions
1452       ParamDesc	array of dimension descriptions
1453       Cluster		cluster whose stats are to be computed
1454 Globals:	None
1455 Operation:	This routine searches the cluster tree for all leaf nodes
1456       which are samples in the specified cluster.  It computes
1457       a full covariance matrix for these samples as well as
1458       keeping track of the ranges (min and max) for each
1459       dimension.  A special data structure is allocated to
1460       return this information to the caller.  An incremental
1461       algorithm for computing statistics is not used because
1462       it will not work with circular dimensions.
1463 Return:		Pointer to new data structure containing statistics
1464 Exceptions:	None
1465 History:	6/2/89, DSJ, Created.
1466 *********************************************************************************/
1467 STATISTICS *
1468 ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
1469   STATISTICS *Statistics;
1470   int i, j;
1471   FLOAT32 *CoVariance;
1472   FLOAT32 *Distance;
1473   LIST SearchState;
1474   SAMPLE *Sample;
1475   uinT32 SampleCountAdjustedForBias;
1476 
1477   // allocate memory to hold the statistics results
1478   Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1479   Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1480   Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1481   Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1482 
1483   // allocate temporary memory to hold the sample to mean distances
1484   Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1485 
1486   // initialize the statistics
1487   Statistics->AvgVariance = 1.0;
1488   CoVariance = Statistics->CoVariance;
1489   for (i = 0; i < N; i++) {
1490     Statistics->Min[i] = 0.0;
1491     Statistics->Max[i] = 0.0;
1492     for (j = 0; j < N; j++, CoVariance++)
1493       *CoVariance = 0;
1494   }
1495   // find each sample in the cluster and merge it into the statistics
1496   InitSampleSearch(SearchState, Cluster);
1497   while ((Sample = NextSample (&SearchState)) != NULL) {
1498     for (i = 0; i < N; i++) {
1499       Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1500       if (ParamDesc[i].Circular) {
1501         if (Distance[i] > ParamDesc[i].HalfRange)
1502           Distance[i] -= ParamDesc[i].Range;
1503         if (Distance[i] < -ParamDesc[i].HalfRange)
1504           Distance[i] += ParamDesc[i].Range;
1505       }
1506       if (Distance[i] < Statistics->Min[i])
1507         Statistics->Min[i] = Distance[i];
1508       if (Distance[i] > Statistics->Max[i])
1509         Statistics->Max[i] = Distance[i];
1510     }
1511     CoVariance = Statistics->CoVariance;
1512     for (i = 0; i < N; i++)
1513       for (j = 0; j < N; j++, CoVariance++)
1514         *CoVariance += Distance[i] * Distance[j];
1515   }
1516   // normalize the variances by the total number of samples
1517   // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1518   // also compute the geometric mean of the diagonal variances
1519   // ensure that clusters with only 1 sample are handled correctly
1520   if (Cluster->SampleCount > 1)
1521     SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1522   else
1523     SampleCountAdjustedForBias = 1;
1524   CoVariance = Statistics->CoVariance;
1525   for (i = 0; i < N; i++)
1526   for (j = 0; j < N; j++, CoVariance++) {
1527     *CoVariance /= SampleCountAdjustedForBias;
1528     if (j == i) {
1529       if (*CoVariance < MINVARIANCE)
1530         *CoVariance = MINVARIANCE;
1531       Statistics->AvgVariance *= *CoVariance;
1532     }
1533   }
1534   Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1535                                        1.0 / N);
1536 
1537   // release temporary memory and return
1538   memfree(Distance);
1539   return (Statistics);
1540 }                                // ComputeStatistics
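/* A sketch (not a call site in this file) of how these statistics are
   typically consumed by the proto-building routines above; Clusterer,
   Cluster and Confidence are assumed to be in scope:

     STATISTICS *Stats = ComputeStatistics (Clusterer->SampleSize,
                                            Clusterer->ParamDesc, Cluster);
     BUCKETS *Buckets = GetBuckets (normal, Cluster->SampleCount, Confidence);
     PROTOTYPE *Proto = MakeSphericalProto (Clusterer, Cluster, Stats, Buckets);
     FreeBuckets (Buckets);
     FreeStatistics (Stats);
*/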
1541 
1542 
1543 /** NewSphericalProto ********************************************************
1544 Parameters:	N		number of dimensions
1545       Cluster		cluster to be made into a spherical prototype
1546       Statistics	statistical info about samples in cluster
1547 Globals:	None
1548 Operation:	This routine creates a spherical prototype data structure to
1549       approximate the samples in the specified cluster.
1550       Spherical prototypes have a single variance which is
1551       common across all dimensions.  All dimensions are normally
1552       distributed and independent.
1553 Return:		Pointer to a new spherical prototype data structure
1554 Exceptions:	None
1555 History:	6/19/89, DSJ, Created.
1556 ******************************************************************************/
1557 PROTOTYPE *NewSphericalProto(uinT16 N,
1558                              CLUSTER *Cluster,
1559                              STATISTICS *Statistics) {
1560   PROTOTYPE *Proto;
1561 
1562   Proto = NewSimpleProto (N, Cluster);
1563 
1564   Proto->Variance.Spherical = Statistics->AvgVariance;
1565   if (Proto->Variance.Spherical < MINVARIANCE)
1566     Proto->Variance.Spherical = MINVARIANCE;
1567 
1568   Proto->Magnitude.Spherical =
1569     1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1570   Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1571                                      (double) N);
1572   Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1573   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1574 
1575   return (Proto);
1576 }                                // NewSphericalProto
1577 
1578 
1579 /** NewEllipticalProto *******************************************************
1580 Parameters:	N		number of dimensions
1581       Cluster		cluster to be made into an elliptical prototype
1582       Statistics	statistical info about samples in cluster
1583 Globals:	None
1584 Operation:	This routine creates an elliptical prototype data structure to
1585       approximate the samples in the specified cluster.
1586       Elliptical prototypes have a variance for each dimension.
1587       All dimensions are normally distributed and independent.
1588 Return:		Pointer to a new elliptical prototype data structure
1589 Exceptions:	None
1590 History:	6/19/89, DSJ, Created.
1591 *******************************************************************************/
1592 PROTOTYPE *NewEllipticalProto(inT16 N,
1593                               CLUSTER *Cluster,
1594                               STATISTICS *Statistics) {
1595   PROTOTYPE *Proto;
1596   FLOAT32 *CoVariance;
1597   int i;
1598 
1599   Proto = NewSimpleProto (N, Cluster);
1600   Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1601   Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1602   Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1603 
1604   CoVariance = Statistics->CoVariance;
1605   Proto->TotalMagnitude = 1.0;
1606   for (i = 0; i < N; i++, CoVariance += N + 1) {
1607     Proto->Variance.Elliptical[i] = *CoVariance;
1608     if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1609       Proto->Variance.Elliptical[i] = MINVARIANCE;
1610 
1611     Proto->Magnitude.Elliptical[i] =
1612       1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1613     Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1614     Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1615   }
1616   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1617   Proto->Style = elliptical;
1618   return (Proto);
1619 }                                // NewEllipticalProto
1620 
1621 
1622 /** NewMixedProto ************************************************************
1623 Parameters:	N		number of dimensions
1624       Cluster		cluster to be made into a mixed prototype
1625       Statistics	statistical info about samples in cluster
1626 Globals:	None
1627 Operation:	This routine creates a mixed prototype data structure to
1628       approximate the samples in the specified cluster.
1629       Mixed prototypes can have different distributions for
1630       each dimension.  All dimensions are independent.  The
1631       structure is initially filled in as though it were an
1632       elliptical prototype.  The actual distributions of the
1633       dimensions can be altered by other routines.
1634 Return:		Pointer to a new mixed prototype data structure
1635 Exceptions:	None
1636 History:	6/19/89, DSJ, Created.
1637 ********************************************************************************/
1638 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
1639   PROTOTYPE *Proto;
1640   int i;
1641 
1642   Proto = NewEllipticalProto (N, Cluster, Statistics);
1643   Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1644 
1645   for (i = 0; i < N; i++) {
1646     Proto->Distrib[i] = normal;
1647   }
1648   Proto->Style = mixed;
1649   return (Proto);
1650 }                                // NewMixedProto
1651 
1652 
1653 /** NewSimpleProto ***********************************************************
1654 Parameters:	N		number of dimensions
1655       Cluster		cluster to be made into a prototype
1656 Globals:	None
1657 Operation:	This routine allocates memory to hold a simple prototype
1658       data structure, i.e. one without independent distributions
1659       and variances for each dimension.
1660 Return:		Pointer to new simple prototype
1661 Exceptions:	None
1662 History:	6/19/89, DSJ, Created.
1663 *******************************************************************************/
1664 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) {
1665   PROTOTYPE *Proto;
1666   int i;
1667 
1668   Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1669   Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1670 
1671   for (i = 0; i < N; i++)
1672     Proto->Mean[i] = Cluster->Mean[i];
1673   Proto->Distrib = NULL;
1674 
1675   Proto->Significant = TRUE;
1676   Proto->Style = spherical;
1677   Proto->NumSamples = Cluster->SampleCount;
1678   Proto->Cluster = Cluster;
1679   Proto->Cluster->Prototype = TRUE;
1680   return (Proto);
1681 }                                // NewSimpleProto
1682 
1683 
1684 /** Independent ***************************************************************
1685 Parameters:	ParamDesc	descriptions of each feature space dimension
1686       N		number of dimensions
1687       CoVariance	ptr to a covariance matrix
1688       Independence	max off-diagonal correlation coefficient
1689 Globals:	None
1690 Operation:	This routine returns TRUE if the specified covariance
1691       matrix indicates that all N dimensions are independent of
1692       one another.  One dimension is judged to be independent of
1693       another when the magnitude of the corresponding correlation
1694       coefficient is
1695       less than the specified Independence factor.  The
1696       correlation coefficient is calculated as: (see Duda and
1697       Hart, pg. 247)
1698       coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])
1699       The covariance matrix is assumed to be symmetric (which
1700       should always be true).
1701 Return:		TRUE if dimensions are independent, FALSE otherwise
1702 Exceptions:	None
1703 History:	6/4/89, DSJ, Created.
1704 *******************************************************************************/
1705 BOOL8
1706 Independent (PARAM_DESC ParamDesc[],
1707 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
1708   int i, j;
1709   FLOAT32 *VARii;                // points to ith on-diagonal element
1710   FLOAT32 *VARjj;                // points to jth on-diagonal element
1711   FLOAT32 CorrelationCoeff;
1712 
1713   VARii = CoVariance;
1714   for (i = 0; i < N; i++, VARii += N + 1) {
1715     if (ParamDesc[i].NonEssential)
1716       continue;
1717 
1718     VARjj = VARii + N + 1;
1719     CoVariance = VARii + 1;
1720     for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1721       if (ParamDesc[j].NonEssential)
1722         continue;
1723 
1724       if ((*VARii == 0.0) || (*VARjj == 0.0))
1725         CorrelationCoeff = 0.0;
1726       else
1727         CorrelationCoeff =
1728           sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
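      // As written this is the square root of the magnitude of the
      // correlation coefficient rather than the coefficient itself, which
      // makes the comparison below stricter than testing |r| directly
      // whenever Independence is less than 1.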
1729       if (CorrelationCoeff > Independence)
1730         return (FALSE);
1731     }
1732   }
1733   return (TRUE);
1734 }                                // Independent
1735 
1736 
1737 /** GetBuckets **************************************************************
1738 Parameters:	Distribution	type of probability distribution to test for
1739       SampleCount	number of samples that are available
1740       Confidence	probability of a Type I error
1741 Globals:	none
1742 Operation:	This routine returns a histogram data structure which can
1743       be used by other routines to place samples into histogram
1744       buckets, and then apply a goodness of fit test to the
1745       histogram data to determine if the samples belong to the
1746       specified probability distribution.  The routine keeps
1747       a list of bucket data structures which have already been
1748       created so that it minimizes the computation time needed
1749       to create a new bucket.
1750 Return:		Bucket data structure
1751 Exceptions: none
1752 History:	Thu Aug  3 12:58:10 1989, DSJ, Created.
1753 *****************************************************************************/
1754 BUCKETS *GetBuckets(DISTRIBUTION Distribution,
1755                     uinT32 SampleCount,
1756                     FLOAT64 Confidence) {
1757   uinT16 NumberOfBuckets;
1758   BUCKETS *Buckets;
1759 
1760   // search for an old bucket structure with the same number of buckets
1761   NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
1762   Buckets = (BUCKETS *) first_node (search (OldBuckets[(int) Distribution],
1763     &NumberOfBuckets, NumBucketsMatch));
1764 
1765   // if a matching bucket structure is found, delete it from the list
1766   if (Buckets != NULL) {
1767     OldBuckets[(int) Distribution] =
1768       delete_d (OldBuckets[(int) Distribution], Buckets, ListEntryMatch);
1769     if (SampleCount != Buckets->SampleCount)
1770       AdjustBuckets(Buckets, SampleCount);
1771     if (Confidence != Buckets->Confidence) {
1772       Buckets->Confidence = Confidence;
1773       Buckets->ChiSquared = ComputeChiSquared
1774         (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets),
1775         Confidence);
1776     }
1777     InitBuckets(Buckets);
1778   }
1779   else                           // otherwise create a new structure
1780     Buckets = MakeBuckets (Distribution, SampleCount, Confidence);
1781   return (Buckets);
1782 }                                // GetBuckets
1783 
1784 
1785 /** MakeBuckets *************************************************************
1786 Parameters:	Distribution	type of probability distribution to test for
1787       SampleCount	number of samples that are available
1788       Confidence	probability of a Type I error
1789 Globals:	None
1790 Operation:	This routine creates a histogram data structure which can
1791       be used by other routines to place samples into histogram
1792       buckets, and then apply a goodness of fit test to the
1793       histogram data to determine if the samples belong to the
1794       specified probability distribution.  The buckets are
1795       allocated in such a way that the expected frequency of
1796       samples in each bucket is approximately the same.  In
1797       order to make this possible, a mapping table is
1798       computed which maps "normalized" samples into the
1799       appropriate bucket.
1800 Return:		Pointer to new histogram data structure
1801 Exceptions:	None
1802 History:	6/4/89, DSJ, Created.
1803 *****************************************************************************/
1804 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
1805                      uinT32 SampleCount,
1806                      FLOAT64 Confidence) {
1807   static DENSITYFUNC DensityFunction[] =
1808     { NormalDensity, UniformDensity, UniformDensity };
1809   int i, j;
1810   BUCKETS *Buckets;
1811   FLOAT64 BucketProbability;
1812   FLOAT64 NextBucketBoundary;
1813   FLOAT64 Probability;
1814   FLOAT64 ProbabilityDelta;
1815   FLOAT64 LastProbDensity;
1816   FLOAT64 ProbDensity;
1817   uinT16 CurrentBucket;
1818   BOOL8 Symmetrical;
1819 
1820   // allocate memory needed for data structure
1821   Buckets = (BUCKETS *) Emalloc (sizeof (BUCKETS));
1822   Buckets->NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
1823   Buckets->SampleCount = SampleCount;
1824   Buckets->Confidence = Confidence;
1825   Buckets->Count =
1826     (uinT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (uinT32));
1827   Buckets->ExpectedCount =
1828     (FLOAT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (FLOAT32));
1829 
1830   // initialize simple fields
1831   Buckets->Distribution = Distribution;
1832   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1833     Buckets->Count[i] = 0;
1834     Buckets->ExpectedCount[i] = 0.0;
1835   }
1836 
1837   // all currently defined distributions are symmetrical
1838   Symmetrical = TRUE;
1839   Buckets->ChiSquared = ComputeChiSquared
1840     (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets), Confidence);
1841 
1842   if (Symmetrical) {
1843     // allocate buckets so that all have approx. equal probability
1844     BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1845 
1846     // distribution is symmetric so fill in upper half then copy
1847     CurrentBucket = Buckets->NumberOfBuckets / 2;
1848     if (Odd (Buckets->NumberOfBuckets))
1849       NextBucketBoundary = BucketProbability / 2;
1850     else
1851       NextBucketBoundary = BucketProbability;
1852 
1853     Probability = 0.0;
1854     LastProbDensity =
1855       (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
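    // Walk the upper half of the lookup table, integrating the density with
    // the trapezoid rule; each time the cumulative probability crosses the
    // next bucket boundary, move on to the next bucket so that every bucket
    // ends up with roughly equal expected probability mass.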
1856     for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1857       ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1858       ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1859       Probability += ProbabilityDelta;
1860       if (Probability > NextBucketBoundary) {
1861         if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1862           CurrentBucket++;
1863         NextBucketBoundary += BucketProbability;
1864       }
1865       Buckets->Bucket[i] = CurrentBucket;
1866       Buckets->ExpectedCount[CurrentBucket] +=
1867         (FLOAT32) (ProbabilityDelta * SampleCount);
1868       LastProbDensity = ProbDensity;
1869     }
1870     // place any leftover probability into the last bucket
1871     Buckets->ExpectedCount[CurrentBucket] +=
1872       (FLOAT32) ((0.5 - Probability) * SampleCount);
1873 
1874     // copy upper half of distribution to lower half
1875     for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1876       Buckets->Bucket[i] =
1877         Mirror (Buckets->Bucket[j], Buckets->NumberOfBuckets);
1878 
1879     // copy upper half of expected counts to lower half
1880     for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1881       Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1882   }
1883   return (Buckets);
1884 }                                // MakeBuckets
1885 
1886 
1887 //---------------------------------------------------------------------------
1888 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) {
1889 /*
1890  **	Parameters:
1891  **		SampleCount	number of samples to be tested
1892  **	Globals:
1893  **		CountTable	lookup table for number of samples
1894  **		BucketsTable	lookup table for necessary number of buckets
1895  **	Operation:
1896  **		This routine computes the optimum number of histogram
1897  **		buckets that should be used in a chi-squared goodness of
1898  **		fit test for the specified number of samples.  The optimum
1899  **		number is computed based on Table 4.1 on pg. 147 of
1900  **		"Measurement and Analysis of Random Data" by Bendat & Piersol.
1901  **		Linear interpolation is used to interpolate between table
1902  **		values.  The table is intended for a 0.05 level of
1903  **		significance (alpha).  This routine assumes that it is
1904  **		equally valid for other alpha's, which may not be true.
1905  **	Return:
1906  **		Optimum number of histogram buckets
1907  **	Exceptions:
1908  **		None
1909  **	History:
1910  **		6/5/89, DSJ, Created.
1911  */
1912   uinT8 Last, Next;
1913   FLOAT32 Slope;
1914 
1915   if (SampleCount < CountTable[0])
1916     return (BucketsTable[0]);
1917 
1918   for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1919     if (SampleCount <= CountTable[Next]) {
1920       Slope = (FLOAT32) (BucketsTable[Next] - BucketsTable[Last]) /
1921         (FLOAT32) (CountTable[Next] - CountTable[Last]);
1922       return ((uinT16) (BucketsTable[Last] +
1923         Slope * (SampleCount - CountTable[Last])));
1924     }
1925   }
1926   return (BucketsTable[Last]);
1927 }                                // OptimumNumberOfBuckets
1928 
1929 
1930 //---------------------------------------------------------------------------
1931 FLOAT64
1932 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
1933 /*
1934  **	Parameters:
1935  **		DegreesOfFreedom	determines shape of distribution
1936  **		Alpha			probability of right tail
1937  **	Globals: none
1938  **	Operation:
1939  **		This routine computes the chi-squared value which will
1940  **		leave a cumulative probability of Alpha in the right tail
1941  **		of a chi-squared distribution with the specified number of
1942  **		degrees of freedom.  Alpha must be between 0 and 1.
1943  **		DegreesOfFreedom must be even.  The routine maintains an
1944  **		array of lists.  Each list corresponds to a different
1945  **		number of degrees of freedom.  Each entry in the list
1946  **		corresponds to a different alpha value and its corresponding
1947  **		chi-squared value.  Therefore, once a particular chi-squared
1948  **		value is computed, it is stored in the list and never
1949  **		needs to be computed again.
1950  **	Return: Desired chi-squared value
1951  **	Exceptions: none
1952  **	History: 6/5/89, DSJ, Created.
1953  */
1954 #define CHIACCURACY     0.01
1955 #define MINALPHA  (1e-200)
1956 {
1957   static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1958 
1959   CHISTRUCT *OldChiSquared;
1960   CHISTRUCT SearchKey;
1961 
1962   // limit the minimum alpha that can be used - if alpha is too small
1963   //      it may not be possible to compute chi-squared.
1964   if (Alpha < MINALPHA)
1965     Alpha = MINALPHA;
1966   if (Alpha > 1.0)
1967     Alpha = 1.0;
1968   if (Odd (DegreesOfFreedom))
1969     DegreesOfFreedom++;
1970 
1971   /* find the list of chi-squared values which have already been computed
1972      for the specified number of degrees of freedom.  Search the list for
1973      the desired chi-squared. */
1974   SearchKey.Alpha = Alpha;
1975   OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1976     &SearchKey, AlphaMatch));
1977 
1978   if (OldChiSquared == NULL) {
1979     OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1980     OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1981       (FLOAT64) DegreesOfFreedom,
1982       (FLOAT64) CHIACCURACY);
1983     ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1984       OldChiSquared);
1985   }
1986   else {
1987     // further optimization might move OldChiSquared to front of list
1988   }
1989 
1990   return (OldChiSquared->ChiSquared);
1991 
1992 }                                // ComputeChiSquared
1993 
1994 
1995 //---------------------------------------------------------------------------
1996 FLOAT64 NormalDensity(inT32 x) {
1997 /*
1998  **	Parameters:
1999  **		x	number to compute the normal probability density for
2000  **	Globals:
2001  **		NormalMean	mean of a discrete normal distribution
2002  **		NormalVariance	variance of a discrete normal distribution
2003  **		NormalMagnitude	magnitude of a discrete normal distribution
2004  **	Operation:
2005  **		This routine computes the probability density function
2006  **		of a discrete normal distribution defined by the global
2007  **		variables NormalMean, NormalVariance, and NormalMagnitude.
2008  **		Normal magnitude could, of course, be computed in terms of
2009  **		the normal variance but it is precomputed for efficiency.
2010  **	Return:
2011  **		The value of the normal distribution at x.
2012  **	Exceptions:
2013  **		None
2014  **	History:
2015  **		6/4/89, DSJ, Created.
2016  */
2017   FLOAT64 Distance;
2018 
2019   Distance = x - NormalMean;
2020   return (NormalMagnitude *
2021     exp (-0.5 * Distance * Distance / NormalVariance));
2022 }                                // NormalDensity
2023 
2024 
2025 //---------------------------------------------------------------------------
2026 FLOAT64 UniformDensity(inT32 x) {
2027 /*
2028  **	Parameters:
2029  **		x	number to compute the uniform probability density for
2030  **	Globals:
2031  **		BUCKETTABLESIZE		determines range of distribution
2032  **	Operation:
2033  **		This routine computes the probability density function
2034  **		of a uniform distribution at the specified point.  The
2035  **		range of the distribution is from 0 to BUCKETTABLESIZE.
2036  **	Return:
2037  **		The value of the uniform distribution at x.
2038  **	Exceptions:
2039  **		None
2040  **	History:
2041  **		6/5/89, DSJ, Created.
2042  */
2043   static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
2044 
2045   if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
2046     return (UniformDistributionDensity);
2047   else
2048     return ((FLOAT64) 0.0);
2049 }                                // UniformDensity
2050 
2051 
2052 //---------------------------------------------------------------------------
2053 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) {
2054 /*
2055  **	Parameters:
2056  **		f1	value of function at x1
2057  **		f2	value of function at x2
2058  **		Dx	x2 - x1 (should always be positive)
2059  **	Globals:
2060  **		None
2061  **	Operation:
2062  **		This routine computes a trapezoidal approximation to the
2063  **		integral of a function over a small delta in x.
2064  **	Return:
2065  **		Approximation of the integral of the function from x1 to x2.
2066  **	Exceptions:
2067  **		None
2068  **	History:
2069  **		6/5/89, DSJ, Created.
2070  */
2071   return ((f1 + f2) * Dx / 2.0);
2072 }                                // Integral
2073 
2074 
2075 //---------------------------------------------------------------------------
2076 void FillBuckets(BUCKETS *Buckets,
2077                  CLUSTER *Cluster,
2078                  uinT16 Dim,
2079                  PARAM_DESC *ParamDesc,
2080                  FLOAT32 Mean,
2081                  FLOAT32 StdDev) {
2082 /*
2083  **	Parameters:
2084  **		Buckets		histogram buckets to count samples
2085  **		Cluster		cluster whose samples are being analyzed
2086  **		Dim		dimension of samples which is being analyzed
2087  **		ParamDesc	description of the dimension
2088  **		Mean		"mean" of the distribution
2089  **		StdDev		"standard deviation" of the distribution
2090  **	Globals:
2091  **		None
2092  **	Operation:
2093  **		This routine counts the number of cluster samples which
2094  **		fall within the various histogram buckets in Buckets.  Only
2095  **		one dimension of each sample is examined.  The exact meaning
2096  **		of the Mean and StdDev parameters depends on the
2097  **		distribution which is being analyzed (this info is in the
2098  **		Buckets data structure).  For normal distributions, Mean
2099  **		and StdDev have the expected meanings.  For uniform and
2100  **		random distributions the Mean is the center point of the
2101  **		range and the StdDev is 1/2 the range.  A dimension with
2102  **		zero standard deviation cannot be statistically analyzed.
2103  **		In this case, a pseudo-analysis is used.
2104  **	Return:
2105  **		None (the Buckets data structure is filled in)
2106  **	Exceptions:
2107  **		None
2108  **	History:
2109  **		6/5/89, DSJ, Created.
2110  */
2111   uinT16 BucketID;
2112   int i;
2113   LIST SearchState;
2114   SAMPLE *Sample;
2115 
2116   // initialize the histogram bucket counts to 0
2117   for (i = 0; i < Buckets->NumberOfBuckets; i++)
2118     Buckets->Count[i] = 0;
2119 
2120   if (StdDev == 0.0) {
2121     /* if the standard deviation is zero, then we can't statistically
2122        analyze the cluster.  Use a pseudo-analysis: samples exactly on
2123        the mean are distributed evenly across all buckets.  Samples greater
2124        than the mean are placed in the last bucket; samples less than the
2125        mean are placed in the first bucket. */
2126 
2127     InitSampleSearch(SearchState, Cluster);
2128     i = 0;
2129     while ((Sample = NextSample (&SearchState)) != NULL) {
2130       if (Sample->Mean[Dim] > Mean)
2131         BucketID = Buckets->NumberOfBuckets - 1;
2132       else if (Sample->Mean[Dim] < Mean)
2133         BucketID = 0;
2134       else
2135         BucketID = i;
2136       Buckets->Count[BucketID] += 1;
2137       i++;
2138       if (i >= Buckets->NumberOfBuckets)
2139         i = 0;
2140     }
2141   }
2142   else {
2143     // search for all samples in the cluster and add to histogram buckets
2144     InitSampleSearch(SearchState, Cluster);
2145     while ((Sample = NextSample (&SearchState)) != NULL) {
2146       switch (Buckets->Distribution) {
2147         case normal:
2148           BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2149             Mean, StdDev);
2150           break;
2151         case D_random:
2152         case uniform:
2153           BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2154             Mean, StdDev);
2155           break;
2156         default:
2157           BucketID = 0;
2158       }
2159       Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2160     }
2161   }
2162 }                                // FillBuckets
2163 
2164 
2165 //---------------------------------------------------------------------------
2166 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
2167                     FLOAT32 x,
2168                     FLOAT32 Mean,
2169                     FLOAT32 StdDev) {
2170 /*
2171  **	Parameters:
2172  **		ParamDesc	used to identify circular dimensions
2173  **		x		value to be normalized
2174  **		Mean		mean of normal distribution
2175  **		StdDev		standard deviation of normal distribution
2176  **	Globals:
2177  **		NormalMean	mean of discrete normal distribution
2178  **		NormalStdDev	standard deviation of discrete normal dist.
2179  **		BUCKETTABLESIZE	range of the discrete distribution
2180  **	Operation:
2181  **		This routine determines which bucket x falls into in the
2182  **		discrete normal distribution defined by NormalMean
2183  **		and NormalStdDev.  x values which exceed the range of
2184  **		the discrete distribution are clipped.
2185  **	Return:
2186  **		Bucket number into which x falls
2187  **	Exceptions:
2188  **		None
2189  **	History:
2190  **		6/5/89, DSJ, Created.
2191  */
2192   FLOAT32 X;
2193 
2194   // wraparound circular parameters if necessary
2195   if (ParamDesc->Circular) {
2196     if (x - Mean > ParamDesc->HalfRange)
2197       x -= ParamDesc->Range;
2198     else if (x - Mean < -ParamDesc->HalfRange)
2199       x += ParamDesc->Range;
2200   }
2201 
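  // Convert x to a standard score and rescale it into the coordinate system
  // of the discrete normal distribution (mean NormalMean, standard deviation
  // NormalStdDev) that was used to build the bucket mapping table.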
2202   X = ((x - Mean) / StdDev) * NormalStdDev + NormalMean;
2203   if (X < 0)
2204     return ((uinT16) 0);
2205   if (X > BUCKETTABLESIZE - 1)
2206     return ((uinT16) (BUCKETTABLESIZE - 1));
2207   return ((uinT16) floor ((FLOAT64) X));
2208 }                                // NormalBucket
2209 
2210 
2211 //---------------------------------------------------------------------------
2212 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
2213                      FLOAT32 x,
2214                      FLOAT32 Mean,
2215                      FLOAT32 StdDev) {
2216 /*
2217  **	Parameters:
2218  **		ParamDesc	used to identify circular dimensions
2219  **		x		value to be normalized
2220  **		Mean		center of range of uniform distribution
2221  **		StdDev		1/2 the range of the uniform distribution
2222  **	Globals:
2223  **		BUCKETTABLESIZE	range of the discrete distribution
2224  **	Operation:
2225  **		This routine determines which bucket x falls into in the
2226  **		discrete uniform distribution defined by
2227  **		BUCKETTABLESIZE.  x values which exceed the range of
2228  **		the discrete distribution are clipped.
2229  **	Return:
2230  **		Bucket number into which x falls
2231  **	Exceptions:
2232  **		None
2233  **	History:
2234  **		6/5/89, DSJ, Created.
2235  */
2236   FLOAT32 X;
2237 
2238   // wraparound circular parameters if necessary
2239   if (ParamDesc->Circular) {
2240     if (x - Mean > ParamDesc->HalfRange)
2241       x -= ParamDesc->Range;
2242     else if (x - Mean < -ParamDesc->HalfRange)
2243       x += ParamDesc->Range;
2244   }
2245 
2246   X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2247   if (X < 0)
2248     return ((uinT16) 0);
2249   if (X > BUCKETTABLESIZE - 1)
2250     return ((uinT16) (BUCKETTABLESIZE - 1));
2251   return ((uinT16) floor ((FLOAT64) X));
2252 }                                // UniformBucket
2253 
2254 
2255 //---------------------------------------------------------------------------
2256 BOOL8 DistributionOK(BUCKETS *Buckets) {
2257 /*
2258  **	Parameters:
2259  **		Buckets		histogram data to perform chi-square test on
2260  **	Globals:
2261  **		None
2262  **	Operation:
2263  **		This routine performs a chi-square goodness of fit test
2264  **		on the histogram data in the Buckets data structure.  TRUE
2265  **		is returned if the histogram matches the probability
2266  **		distribution which was specified when the Buckets
2267  **		structure was originally created.  Otherwise FALSE is
2268  **		returned.
2269  **	Return:
2270  **		TRUE if samples match distribution, FALSE otherwise
2271  **	Exceptions:
2272  **		None
2273  **	History:
2274  **		6/5/89, DSJ, Created.
2275  */
2276   FLOAT32 FrequencyDifference;
2277   FLOAT32 TotalDifference;
2278   int i;
2279 
2280   // compute how well the histogram matches the expected histogram
2281   TotalDifference = 0.0;
2282   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2283     FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2284     TotalDifference += (FrequencyDifference * FrequencyDifference) /
2285       Buckets->ExpectedCount[i];
2286   }
2287 
2288   // test to see if the difference is more than expected
2289   if (TotalDifference > Buckets->ChiSquared)
2290     return (FALSE);
2291   else
2292     return (TRUE);
2293 }                                // DistributionOK
2294 
2295 
2296 //---------------------------------------------------------------------------
2297 void FreeStatistics(STATISTICS *Statistics) {
2298 /*
2299  **	Parameters:
2300  **		Statistics	pointer to data structure to be freed
2301  **	Globals:
2302  **		None
2303  **	Operation:
2304  **		This routine frees the memory used by the statistics
2305  **		data structure.
2306  **	Return:
2307  **		None
2308  **	Exceptions:
2309  **		None
2310  **	History:
2311  **		6/5/89, DSJ, Created.
2312  */
2313   memfree (Statistics->CoVariance);
2314   memfree (Statistics->Min);
2315   memfree (Statistics->Max);
2316   memfree(Statistics);
2317 }                                // FreeStatistics
2318 
2319 
2320 //---------------------------------------------------------------------------
2321 void FreeBuckets(BUCKETS *Buckets) {
2322 /*
2323  **	Parameters:
2324  **		Buckets		pointer to data structure to be freed
2325  **	Globals: none
2326  **	Operation:
2327  **		This routine places the specified histogram data structure
2328  **		at the front of a list of histograms so that it can be
2329  **		reused later if necessary.  A separate list is maintained
2330  **		for each different type of distribution.
2331  **	Return: none
2332  **	Exceptions: none
2333  **	History: 6/5/89, DSJ, Created.
2334  */
2335   int Dist;
2336 
2337   if (Buckets != NULL) {
2338     Dist = (int) Buckets->Distribution;
2339     OldBuckets[Dist] = (LIST) push (OldBuckets[Dist], Buckets);
2340   }
2341 
2342 }                                // FreeBuckets
2343 
2344 
2345 //---------------------------------------------------------------------------
2346 void FreeCluster(CLUSTER *Cluster) {
2347 /*
2348  **	Parameters:
2349  **		Cluster		pointer to cluster to be freed
2350  **	Globals:
2351  **		None
2352  **	Operation:
2353  **		This routine frees the memory consumed by the specified
2354  **		cluster and all of its subclusters.  This is done by
2355  **		recursive calls to FreeCluster().
2356  **	Return:
2357  **		None
2358  **	Exceptions:
2359  **		None
2360  **	History:
2361  **		6/6/89, DSJ, Created.
2362  */
2363   if (Cluster != NULL) {
2364     FreeCluster (Cluster->Left);
2365     FreeCluster (Cluster->Right);
2366     memfree(Cluster);
2367   }
2368 }                                // FreeCluster
2369 
2370 
2371 //---------------------------------------------------------------------------
2372 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
2373 /*
2374  **	Parameters:
2375  **		Distribution		distribution being tested for
2376  **		HistogramBuckets	number of buckets in chi-square test
2377  **	Globals: none
2378  **	Operation:
2379  **		This routine computes the degrees of freedom that should
2380  **		be used in a chi-squared test with the specified number of
2381  **		histogram buckets.  The result is always rounded up to
2382  **		the next even number so that the value of chi-squared can be
2383  **		computed more easily.  This will cause the value of
2384  **		chi-squared to be higher than the optimum value, resulting
2385  **		in the chi-square test being more lenient than optimum.
2386  **	Return: The number of degrees of freedom for a chi-square test
2387  **	Exceptions: none
2388  **	History: Thu Aug  3 14:04:18 1989, DSJ, Created.
2389  */
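  // These offsets appear to follow the usual chi-square rule
  // df = (#buckets - 1 - #parameters estimated from the data), with two
  // parameters estimated for the normal and uniform cases and none for
  // D_random; the result is then rounded up to an even number below.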
2390   static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2391 
2392   uinT16 AdjustedNumBuckets;
2393 
2394   AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2395   if (Odd (AdjustedNumBuckets))
2396     AdjustedNumBuckets++;
2397   return (AdjustedNumBuckets);
2398 
2399 }                                // DegreesOfFreedom
2400 
2401 
2402 //---------------------------------------------------------------------------
2403 int NumBucketsMatch(void *arg1,    //BUCKETS *Histogram,
2404                     void *arg2) {  //uinT16                                        *DesiredNumberOfBuckets)
2405 /*
2406  **	Parameters:
2407  **		Histogram	current histogram being tested for a match
2408  **		DesiredNumberOfBuckets	match key
2409  **	Globals: none
2410  **	Operation:
2411  **		This routine is used to search a list of histogram data
2412  **		structures to find one with the specified number of
2413  **		buckets.  It is called by the list search routines.
2414  **	Return: TRUE if Histogram matches DesiredNumberOfBuckets
2415  **	Exceptions: none
2416  **	History: Thu Aug  3 14:17:33 1989, DSJ, Created.
2417  */
2418   BUCKETS *Histogram = (BUCKETS *) arg1;
2419   uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2420 
2421   return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2422 
2423 }                                // NumBucketsMatch
2424 
2425 
2426 //---------------------------------------------------------------------------
2427 int ListEntryMatch(void *arg1,    //ListNode
2428                    void *arg2) {  //Key
2429 /*
2430  **	Parameters: none
2431  **	Globals: none
2432  **	Operation:
2433  **		This routine is used to search a list for a list node
2434  **		whose contents match Key.  It is called by the list
2435  **		delete_d routine.
2436  **	Return: TRUE if ListNode matches Key
2437  **	Exceptions: none
2438  **	History: Thu Aug  3 14:23:58 1989, DSJ, Created.
2439  */
2440   return (arg1 == arg2);
2441 
2442 }                                // ListEntryMatch
2443 
2444 
2445 //---------------------------------------------------------------------------
2446 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
2447 /*
2448  **	Parameters:
2449  **		Buckets		histogram data structure to adjust
2450  **		NewSampleCount	new sample count to adjust to
2451  **	Globals: none
2452  **	Operation:
2453  **		This routine multiplies each ExpectedCount histogram entry
2454  **		by NewSampleCount/OldSampleCount so that the histogram
2455  **		is now adjusted to the new sample count.
2456  **	Return: none
2457  **	Exceptions: none
2458  **	History: Thu Aug  3 14:31:14 1989, DSJ, Created.
2459  */
2460   int i;
2461   FLOAT64 AdjustFactor;
2462 
2463   AdjustFactor = (((FLOAT64) NewSampleCount) /
2464     ((FLOAT64) Buckets->SampleCount));
2465 
2466   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2467     Buckets->ExpectedCount[i] *= AdjustFactor;
2468   }
2469 
2470   Buckets->SampleCount = NewSampleCount;
2471 
2472 }                                // AdjustBuckets
2473 
2474 
2475 //---------------------------------------------------------------------------
2476 void InitBuckets(BUCKETS *Buckets) {
2477 /*
2478  **	Parameters:
2479  **		Buckets		histogram data structure to init
2480  **	Globals: none
2481  **	Operation:
2482  **		This routine sets the bucket counts in the specified histogram
2483  **		to zero.
2484  **	Return: none
2485  **	Exceptions: none
2486  **	History: Thu Aug  3 14:31:14 1989, DSJ, Created.
2487  */
2488   int i;
2489 
2490   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2491     Buckets->Count[i] = 0;
2492   }
2493 
2494 }                                // InitBuckets
2495 
2496 
2497 //---------------------------------------------------------------------------
2498 int AlphaMatch(void *arg1,    //CHISTRUCT *ChiStruct,
2499                void *arg2) {  //CHISTRUCT                             *SearchKey)
2500 /*
2501  **	Parameters:
2502  **		ChiStruct	chi-squared struct being tested for a match
2503  **		SearchKey	chi-squared struct that is the search key
2504  **	Globals: none
2505  **	Operation:
2506  **		This routine is used to search a list of structures which
2507  **		hold pre-computed chi-squared values for a chi-squared
2508  **		value whose corresponding alpha field matches the alpha
2509  **		field of SearchKey.
2510  **		It is called by the list search routines.
2511  **	Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
2512  **	Exceptions: none
2513  **	History: Thu Aug  3 14:17:33 1989, DSJ, Created.
2514  */
2515   CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2516   CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2517 
2518   return (ChiStruct->Alpha == SearchKey->Alpha);
2519 
2520 }                                // AlphaMatch
2521 
2522 
//---------------------------------------------------------------------------
CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
/*
 **	Parameters:
 **		DegreesOfFreedom	degrees of freedom for new chi value
 **		Alpha			confidence level for new chi value
 **	Globals: none
 **	Operation:
 **		This routine allocates a new data structure which is used
 **		to hold a chi-squared value along with its associated
 **		number of degrees of freedom and alpha value.
 **	Return: Pointer to the newly allocated CHISTRUCT.
 **	Exceptions: none
 **	History: Fri Aug  4 11:04:59 1989, DSJ, Created.
 */
  CHISTRUCT *NewChiStruct;

  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
  NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
  NewChiStruct->Alpha = Alpha;
  return (NewChiStruct);

}                                // NewChiStruct


//---------------------------------------------------------------------------
FLOAT64
Solve (SOLVEFUNC Function,
void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
/*
 **	Parameters:
 **		Function	function whose zero is to be found
 **		FunctionParams	arbitrary data to pass to function
 **		InitialGuess	point to start solution search at
 **		Accuracy	maximum allowed error
 **	Globals: none
 **	Operation:
 **		This routine attempts to find an x value at which Function
 **		goes to zero (i.e. a root of the function).  It will only
 **		work correctly if a solution actually exists and there
 **		are no extrema between the solution and the InitialGuess.
 **		The algorithm is essentially Newton's method with a
 **		numerically estimated slope, and is therefore quite primitive.
 **	Return: Solution of function (x for which f(x) = 0).
 **	Exceptions: none
 **	History: Fri Aug  4 11:08:59 1989, DSJ, Created.
 */
#define INITIALDELTA    0.1
#define DELTARATIO      0.1
{
  FLOAT64 x;
  FLOAT64 f;
  FLOAT64 Slope;
  FLOAT64 Delta;
  FLOAT64 NewDelta;
  FLOAT64 xDelta;
  FLOAT64 LastPosX, LastNegX;

  x = InitialGuess;
  Delta = INITIALDELTA;
  LastPosX = MAX_FLOAT32;
  LastNegX = -MAX_FLOAT32;
  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
  while (Abs (LastPosX - LastNegX) > Accuracy) {
    // keep track of outer bounds of current estimate
    if (f < 0)
      LastNegX = x;
    else
      LastPosX = x;

    // compute the approx. slope of f(x) at the current point
    Slope =
      ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;

    // compute the next solution guess
    xDelta = f / Slope;
    x -= xDelta;

    // reduce the delta used for computing slope to be a fraction of
    // the amount moved to get to the new guess
    NewDelta = Abs (xDelta) * DELTARATIO;
    if (NewDelta < Delta)
      Delta = NewDelta;

    // compute the value of the function at the new guess
    f = (*Function) ((CHISTRUCT *) FunctionParams, x);
  }
  return (x);

}                                // Solve


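// Illustrative usage sketch (not part of the original code): one plausible
// way to use Solve() to find the chi-squared value whose right-tail area
// equals Alpha, via ChiArea() defined below (assumed to be declared earlier
// in this file).  The initial guess of DegreesOfFreedom (the mean of a
// chi-squared distribution) and the 0.001 accuracy are assumptions chosen
// for illustration only.
#if 0
static FLOAT64 ExampleChiThreshold(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
  CHISTRUCT *ChiParams = NewChiStruct(DegreesOfFreedom, Alpha);
  FLOAT64 Threshold = Solve(ChiArea, ChiParams,
                            (FLOAT64) DegreesOfFreedom, 0.001);
  memfree(ChiParams);
  return Threshold;
}
#endif
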
//---------------------------------------------------------------------------
FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) {
/*
 **	Parameters:
 **		ChiParams	contains degrees of freedom and alpha
 **		x		value of chi-squared to evaluate
 **	Globals: none
 **	Operation:
 **		This routine computes the area under the right tail of a
 **		chi-squared density curve (from x to infinity), minus the
 **		desired tail area.  The number of degrees of freedom of the
 **		chi-squared curve is specified in the ChiParams structure.
 **		The desired tail area is also specified in the ChiParams
 **		structure as Alpha.  This routine is intended to be passed
 **		to the Solve() function to find the value of chi-squared
 **		which will yield a desired area under the right tail of
 **		the chi-squared density curve.  The function only works for
 **		even degrees of freedom D, for which integrating the density
 **		by parts gives the closed-form series
 **			P(X > x) = exp(-x/2) * sum(i = 0 .. D/2-1) (x/2)^i / i!
 **		which is what the loop below evaluates.
 **	Return: Difference between the actual and desired right-tail areas.
 **	Exceptions: none
 **	History: Fri Aug  4 12:48:41 1989, DSJ, Created.
 */
  int i, N;
  FLOAT64 SeriesTotal;
  FLOAT64 Denominator;
  FLOAT64 PowerOfx;

  N = ChiParams->DegreesOfFreedom / 2 - 1;
  SeriesTotal = 1;
  Denominator = 1;
  PowerOfx = 1;
  for (i = 1; i <= N; i++) {
    Denominator *= 2 * i;                  // accumulates 2^i * i!
    PowerOfx *= x;                         // accumulates x^i
    SeriesTotal += PowerOfx / Denominator; // adds (x/2)^i / i!
  }
  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);

}                                // ChiArea


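// Illustrative check (not part of the original code): with Alpha set to 0,
// ChiArea() returns the right-tail area itself.  For 4 degrees of freedom the
// series reduces to the closed form exp(-x/2) * (1 + x/2), so the two values
// printed below should agree.  The test point x = 3.0 is an arbitrary choice.
#if 0
static void ExampleChiAreaCheck(void) {
  CHISTRUCT Params;
  FLOAT64 x = 3.0;
  FLOAT64 FromSeries, ClosedForm;

  Params.DegreesOfFreedom = 4;
  Params.Alpha = 0.0;
  FromSeries = ChiArea(&Params, x);
  ClosedForm = exp(-0.5 * x) * (1.0 + 0.5 * x);
  tprintf("series = %g, closed form = %g\n", FromSeries, ClosedForm);
}
#endif
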
//---------------------------------------------------------------------------
BOOL8
MultipleCharSamples (CLUSTERER * Clusterer,
CLUSTER * Cluster, FLOAT32 MaxIllegal)
/*
 **	Parameters:
 **		Clusterer	data structure holding cluster tree
 **		Cluster		cluster containing samples to be tested
 **		MaxIllegal	max percentage of samples allowed to have
 **				more than 1 feature in the cluster
 **	Globals: none
 **	Operation:
 **		This routine looks at all samples in the specified cluster.
 **		It computes a running estimate of the percentage of the
 **		characters which have more than 1 sample in the cluster.
 **		When this percentage exceeds MaxIllegal, TRUE is returned.
 **		Otherwise FALSE is returned.  The CharID
 **		fields must contain integers which identify the training
 **		characters which were used to generate the sample.  One
 **		integer is used for each sample.  The NumChar field in
 **		the Clusterer must contain the number of characters in the
 **		training set.  All CharID fields must be between 0 and
 **		NumChar-1.  The main function of this routine is to help
 **		identify clusters which need to be split further, i.e. if
 **		numerous training characters have 2 or more features which are
 **		contained in the same cluster, then the cluster should be
 **		split.
 **	Return: TRUE if the cluster should be split, FALSE otherwise.
 **	Exceptions: none
 **	History: Wed Aug 30 11:13:05 1989, DSJ, Created.
 **		2/22/90, DSJ, Added MaxIllegal control rather than always
 **				splitting illegal clusters.
 */
#define ILLEGAL_CHAR    2
{
  static BOOL8 *CharFlags = NULL;
  static inT32 NumFlags = 0;
  int i;
  LIST SearchState;
  SAMPLE *Sample;
  inT32 CharID;
  inT32 NumCharInCluster;
  inT32 NumIllegalInCluster;
  FLOAT32 PercentIllegal;

  // initial estimate assumes that no illegal chars exist in the cluster
  NumCharInCluster = Cluster->SampleCount;
  NumIllegalInCluster = 0;

  if (Clusterer->NumChar > NumFlags) {
    if (CharFlags != NULL)
      memfree(CharFlags);
    NumFlags = Clusterer->NumChar;
    CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
  }

  for (i = 0; i < NumFlags; i++)
    CharFlags[i] = FALSE;

  // find each sample in the cluster and check if we have seen it before
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample (&SearchState)) != NULL) {
    CharID = Sample->CharID;
    if (CharFlags[CharID] == FALSE) {
      // first sample seen for this character
      CharFlags[CharID] = TRUE;
    }
    else {
      if (CharFlags[CharID] == TRUE) {
        // second sample for this character: count it as illegal exactly once
        NumIllegalInCluster++;
        CharFlags[CharID] = ILLEGAL_CHAR;
      }
      // every repeated sample reduces the count of distinct characters
      NumCharInCluster--;
      PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
      if (PercentIllegal > MaxIllegal)
        return (TRUE);
    }
  }
  return (FALSE);

}                                // MultipleCharSamples

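// Worked example (illustrative, not from the original code): suppose a
// cluster holds 10 samples drawn from 8 distinct characters, with 2 of those
// characters contributing 2 samples each.  By the time the last duplicate is
// seen, NumCharInCluster is 8 and NumIllegalInCluster is 2, so PercentIllegal
// reaches 2/8 = 0.25 and the cluster is flagged for splitting whenever
// MaxIllegal is below 0.25.
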
// Compute the inverse of a matrix using LU decomposition with partial pivoting.
// The return value is the sum of norms of the off-diagonal terms of the
// product of input and inv. (A measure of the error.)
double InvertMatrix(const float* input, int size, float* inv) {
  double** U;  // The upper triangular array.
  double* Umem;
  double** U_inv;  // The inverse of U.
  double* U_invmem;
  double** L;  // The lower triangular array.
  double* Lmem;

  // Allocate memory for the 2D arrays.
  ALLOC_2D_ARRAY(size, size, Umem, U, double);
  ALLOC_2D_ARRAY(size, size, U_invmem, U_inv, double);
  ALLOC_2D_ARRAY(size, size, Lmem, L, double);

  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
  int row;
  int col;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      U[row][col] = input[row*size + col];
      L[row][col] = row == col ? 1.0 : 0.0;
      U_inv[row][col] = 0.0;
    }
  }

  // Reduce U to upper-triangular form by Gaussian elimination with partial
  // pivoting, accumulating the same row operations in L so that L*input = U.
  for (col = 0; col < size; ++col) {
    // Find the best pivot in this column.
    int best_row = 0;
    double best_pivot = -1.0;
    for (row = col; row < size; ++row) {
      if (Abs(U[row][col]) > best_pivot) {
        best_pivot = Abs(U[row][col]);
        best_row = row;
      }
    }
    // Exchange pivot rows.
    if (best_row != col) {
      for (int k = 0; k < size; ++k) {
        double tmp = U[best_row][k];
        U[best_row][k] = U[col][k];
        U[col][k] = tmp;
        tmp = L[best_row][k];
        L[best_row][k] = L[col][k];
        L[col][k] = tmp;
      }
    }
    // Eliminate the entries below the pivot.
    for (row = col + 1; row < size; ++row) {
      double ratio = -U[row][col] / U[col][col];
      for (int j = col; j < size; ++j) {
        U[row][j] += U[col][j] * ratio;
      }
      for (int k = 0; k < size; ++k) {
        L[row][k] += L[col][k] * ratio;
      }
    }
  }
  // Next invert U by back substitution.
  for (col = 0; col < size; ++col) {
    U_inv[col][col] = 1.0 / U[col][col];
    for (row = col - 1; row >= 0; --row) {
      double total = 0.0;
      for (int k = col; k > row; --k) {
        total += U[row][k] * U_inv[k][col];
      }
      U_inv[row][col] = -total / U[row][row];
    }
  }
  // Now the answer is U_inv.L.
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = row; k < size; ++k) {
        sum += U_inv[row][k] * L[k][col];
      }
      inv[row*size + col] = sum;
    }
  }
  // Check the result by summing the off-diagonal terms of input * inv.
  double error_sum = 0.0;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = 0; k < size; ++k) {
        sum += input[row*size + k] * inv[k * size + col];
      }
      if (row != col) {
        error_sum += Abs(sum);
      }
    }
  }
  return error_sum;
}

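// Illustrative usage (not part of the original code): inverting a 2x2 matrix
// and checking the returned error measure.  The expected inverse of
// [[4, 7], [2, 6]] is [[0.6, -0.7], [-0.2, 0.4]], and the error value should
// be close to zero.
#if 0
static void ExampleInvertMatrix(void) {
  float input[4] = { 4.0f, 7.0f, 2.0f, 6.0f };
  float inv[4];
  double error = InvertMatrix(input, 2, inv);
  tprintf("inv = [%g %g; %g %g], error = %g\n",
          inv[0], inv[1], inv[2], inv[3], error);
}
#endif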