1 /******************************************************************************
2 ** Filename: cluster.c
3 ** Purpose: Routines for clustering points in N-D space
4 ** Author: Dan Johnson
5 ** History: 5/29/89, DSJ, Created.
6 **
7 ** (c) Copyright Hewlett-Packard Company, 1988.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 ******************************************************************************/
18 #include "oldheap.h"
19 #include "const.h"
20 #include "cluster.h"
21 #include "emalloc.h"
22 #include "tprintf.h"
23 #include "danerror.h"
24 #include "freelist.h"
25 #include <math.h>
26
27 #define HOTELLING 1 // If true use Hotelling's test to decide where to split.
28 #define FTABLE_X 10 // Size of FTable.
29 #define FTABLE_Y 100 // Size of FTable.
30
31 // Table of critical values of the F-distribution at a significance level of 1% (99% confidence).
32 double FTable[FTABLE_Y][FTABLE_X] = {
33 {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
34 {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,},
35 {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,},
36 {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,},
37 {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,},
38 {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,},
39 {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,},
40 {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,},
41 {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,},
42 {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,},
43 { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,},
44 { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,},
45 { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,},
46 { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,},
47 { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,},
48 { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,},
49 { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,},
50 { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,},
51 { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,},
52 { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,},
53 { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,},
54 { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,},
55 { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,},
56 { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,},
57 { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,},
58 { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,},
59 { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,},
60 { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,},
61 { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,},
62 { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,},
63 { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,},
64 { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,},
65 { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,},
66 { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,},
67 { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,},
68 { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,},
69 { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,},
70 { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,},
71 { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,},
72 { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,},
73 { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,},
74 { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,},
75 { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,},
76 { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,},
77 { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,},
78 { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,},
79 { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,},
80 { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,},
81 { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,},
82 { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,},
83 { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,},
84 { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,},
85 { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,},
86 { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,},
87 { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,},
88 { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,},
89 { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,},
90 { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,},
91 { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,},
92 { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,},
93 { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,},
94 { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,},
95 { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,},
96 { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,},
97 { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,},
98 { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,},
99 { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,},
100 { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,},
101 { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,},
102 { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,},
103 { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,},
104 { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,},
105 { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,},
106 { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,},
107 { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,},
108 { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,},
109 { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,},
110 { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,},
111 { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,},
112 { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,},
113 { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,},
114 { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,},
115 { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,},
116 { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,},
117 { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,},
118 { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,},
119 { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,},
120 { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,},
121 { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,},
122 { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,},
123 { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,},
124 { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,},
125 { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,},
126 { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,},
127 { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,},
128 { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,},
129 { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,},
130 { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,},
131 { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,},
132 { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}
133 };
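/* Editorial note: the table is indexed as FTable[denominator dof - 1]
   [numerator dof - 1], with both degrees of freedom clamped to the table
   dimensions.  For example, TestEllipticalProto() below looks up:

     int Fx = EssentialN;                      // numerator dof
     if (Fx > FTABLE_X) Fx = FTABLE_X;
     --Fx;
     int Fy = TotalDims - EssentialN - 1;      // denominator dof
     if (Fy > FTABLE_Y) Fy = FTABLE_Y;
     --Fy;
     double FTarget = FTable[Fy][Fx];          // 1% critical value
*/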
134
135 /* define the variance which will be used as a minimum variance for any
136 dimension of any feature. Since most features are calculated from numbers
137 with a precision no better than 1 in 128, the variance should never be
138 less than the square of this number for parameters whose range is 1. */
139 #define MINVARIANCE 0.0001
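/* Worked arithmetic (editorial note): a precision of 1 part in 128 over a
   range of 1 corresponds to a quantization step of 1/128 ~= 0.0078, whose
   square is 1/16384 ~= 0.000061; MINVARIANCE (0.0001) is a round number of
   the same order of magnitude used as the floor for any variance. */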
140
141 /* define the absolute minimum number of samples (MINSAMPLES) which must be
142 present in order to accurately test hypotheses about underlying
143 probability distributions. Define separately (MINSAMPLESNEEDED) the
144 minimum number of samples that must be present before a statistical
145 analysis is even attempted; this should normally equal MINSAMPLES but
146 can be set lower for early testing when very few samples are available. */
147 #define MINBUCKETS 5
148 #define MINSAMPLESPERBUCKET 5
149 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
150 #define MINSAMPLESNEEDED 1
151
152 /* define the size of the table which maps normalized samples to
153 histogram buckets. Also define the number of standard deviations
154 in a normal distribution which are considered to be significant.
155 The mapping table will be defined in such a way that it covers
156 the specified number of standard deviations on either side of
157 the mean. BUCKETTABLESIZE should always be even. */
158 #define BUCKETTABLESIZE 1024
159 #define NORMALEXTENT 3.0
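/* Sketch of the mapping described above (illustrative only; NormalBucket()
   implements the real computation, including circular wrap-around): a sample
   X is first normalized to (X - Mean) / StdDev, then scaled so that the
   interval [-NORMALEXTENT, +NORMALEXTENT] covers table cells
   [0, BUCKETTABLESIZE]; the Bucket[] array of a BUCKETS struct then folds
   those cells into the much smaller set of histogram buckets. */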
160
161 typedef struct
162 {
163 CLUSTER *Cluster;
164 CLUSTER *Neighbor;
165 }
166
167
168 TEMPCLUSTER;
169
170 typedef struct
171 {
172 FLOAT32 AvgVariance;
173 FLOAT32 *CoVariance;
174 FLOAT32 *Min; // largest negative distance from the mean
175 FLOAT32 *Max; // largest positive distance from the mean
176 }
177
178
179 STATISTICS;
180
181 typedef struct
182 {
183 DISTRIBUTION Distribution; // distribution being tested for
184 uinT32 SampleCount; // # of samples in histogram
185 FLOAT64 Confidence; // confidence level of test
186 FLOAT64 ChiSquared; // test threshold
187 uinT16 NumberOfBuckets; // number of cells in histogram
188 uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
189 uinT32 *Count; // frequency of occurrence histogram
190 FLOAT32 *ExpectedCount; // expected histogram
191 }
192
193
194 BUCKETS;
195
196 typedef struct
197 {
198 uinT16 DegreesOfFreedom;
199 FLOAT64 Alpha;
200 FLOAT64 ChiSquared;
201 }
202
203
204 CHISTRUCT;
205
206 typedef FLOAT64 (*DENSITYFUNC) (inT32);
207 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
208
209 #define Odd(N) ((N)%2)
210 #define Mirror(N,R) ((R) - (N) - 1)
211 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
212
213 //--------------Global Data Definitions and Declarations----------------------
214 /* the following variables are declared as global so that routines which
215 are called from the kd-tree walker can get to them. */
216 static HEAP *Heap;
217 static TEMPCLUSTER *TempCluster;
218 static KDTREE *Tree;
219 static inT32 CurrentTemp;
220
221 /* the following variables describe a discrete normal distribution
222 which is used by NormalDensity() and NormalBucket(). The
223 constant NORMALEXTENT determines how many standard
224 deviations of the distribution are mapped onto the fixed
225 discrete range of x. x=0 is mapped to -NORMALEXTENT standard
226 deviations and x=BUCKETTABLESIZE is mapped to
227 +NORMALEXTENT standard deviations. */
228 #define SqrtOf2Pi 2.506628275
229 static FLOAT64 NormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
230 static FLOAT64 NormalVariance =
231 (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
232 static FLOAT64 NormalMagnitude =
233 (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
234 static FLOAT64 NormalMean = BUCKETTABLESIZE / 2;
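/* Editorial note: with BUCKETTABLESIZE = 1024 and NORMALEXTENT = 3.0 these
   constants work out to

     NormalStdDev    = 1024 / 6          ~= 170.67  (table cells per std dev)
     NormalVariance  = NormalStdDev^2    ~= 29127.1
     NormalMagnitude = 1 / (SqrtOf2Pi * NormalStdDev) ~= 0.00234
     NormalMean      = 512

   i.e. NormalMagnitude is the peak value 1/(sigma * sqrt(2 * pi)) of a
   normal density whose sigma is measured in table cells. */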
235
236 // keep a list of histogram buckets to minimize recomputing them
237 static LIST OldBuckets[] = { NIL, NIL, NIL };
238
239 /* define lookup tables used to compute the number of histogram buckets
240 that should be used for a given number of samples. */
241 #define LOOKUPTABLESIZE 8
242 #define MAXBUCKETS 39
243 #define MAXDEGREESOFFREEDOM MAXBUCKETS
244
245 static uinT32 CountTable[LOOKUPTABLESIZE] = {
246 MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
247 };
248 static uinT16 BucketsTable[LOOKUPTABLESIZE] = {
249 MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
250 };
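/* Illustrative reading of the tables (editorial note; see
   OptimumNumberOfBuckets() for the authoritative logic): a sample count is
   located in CountTable and mapped to the corresponding entry of
   BucketsTable, so roughly 200 samples use 16 histogram buckets, 1000
   samples use 30, and 2000 or more samples are capped at MAXBUCKETS (39). */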
251
252 /*-------------------------------------------------------------------------
253 Private Function Prototypes
254 --------------------------------------------------------------------------*/
255 void CreateClusterTree(CLUSTERER *Clusterer);
256
257 void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, inT32 Level);
258
259 CLUSTER *FindNearestNeighbor(KDTREE *Tree,
260 CLUSTER *Cluster,
261 FLOAT32 *Distance);
262
263 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
264
265 inT32 MergeClusters (inT16 N,
266 register PARAM_DESC ParamDesc[],
267 register inT32 n1,
268 register inT32 n2,
269 register FLOAT32 m[],
270 register FLOAT32 m1[], register FLOAT32 m2[]);
271
272 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
273
274 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
275 CLUSTERCONFIG *Config,
276 CLUSTER *Cluster);
277
278 PROTOTYPE *MakeDegenerateProto(uinT16 N,
279 CLUSTER *Cluster,
280 STATISTICS *Statistics,
281 PROTOSTYLE Style,
282 inT32 MinSamples);
283
284 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
285 CLUSTERCONFIG *Config,
286 CLUSTER *Cluster,
287 STATISTICS *Statistics);
288
289 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
290 CLUSTER *Cluster,
291 STATISTICS *Statistics,
292 BUCKETS *Buckets);
293
294 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
295 CLUSTER *Cluster,
296 STATISTICS *Statistics,
297 BUCKETS *Buckets);
298
299 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
300 CLUSTER *Cluster,
301 STATISTICS *Statistics,
302 BUCKETS *NormalBuckets,
303 FLOAT64 Confidence);
304
305 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
306
307 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
308
309 STATISTICS *ComputeStatistics (inT16 N,
310 PARAM_DESC ParamDesc[], CLUSTER * Cluster);
311
312 PROTOTYPE *NewSphericalProto(uinT16 N,
313 CLUSTER *Cluster,
314 STATISTICS *Statistics);
315
316 PROTOTYPE *NewEllipticalProto(inT16 N,
317 CLUSTER *Cluster,
318 STATISTICS *Statistics);
319
320 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
321
322 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
323
324 BOOL8 Independent (PARAM_DESC ParamDesc[],
325 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
326
327 BUCKETS *GetBuckets(DISTRIBUTION Distribution,
328 uinT32 SampleCount,
329 FLOAT64 Confidence);
330
331 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
332 uinT32 SampleCount,
333 FLOAT64 Confidence);
334
335 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount);
336
337 FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
338
339 FLOAT64 NormalDensity(inT32 x);
340
341 FLOAT64 UniformDensity(inT32 x);
342
343 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx);
344
345 void FillBuckets(BUCKETS *Buckets,
346 CLUSTER *Cluster,
347 uinT16 Dim,
348 PARAM_DESC *ParamDesc,
349 FLOAT32 Mean,
350 FLOAT32 StdDev);
351
352 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
353 FLOAT32 x,
354 FLOAT32 Mean,
355 FLOAT32 StdDev);
356
357 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
358 FLOAT32 x,
359 FLOAT32 Mean,
360 FLOAT32 StdDev);
361
362 BOOL8 DistributionOK(BUCKETS *Buckets);
363
364 void FreeStatistics(STATISTICS *Statistics);
365
366 void FreeBuckets(BUCKETS *Buckets);
367
368 void FreeCluster(CLUSTER *Cluster);
369
370 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
371
372 int NumBucketsMatch(void *arg1, //BUCKETS *Histogram,
373 void *arg2); //uinT16 *DesiredNumberOfBuckets);
374
375 int ListEntryMatch(void *arg1, void *arg2);
376
377 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
378
379 void InitBuckets(BUCKETS *Buckets);
380
381 int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
382 void *arg2); //CHISTRUCT *SearchKey);
383
384 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
385
386 FLOAT64 Solve(SOLVEFUNC Function,
387 void *FunctionParams,
388 FLOAT64 InitialGuess,
389 FLOAT64 Accuracy);
390
391 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
392
393 BOOL8 MultipleCharSamples(CLUSTERER *Clusterer,
394 CLUSTER *Cluster,
395 FLOAT32 MaxIllegal);
396
397 double InvertMatrix(const float* input, int size, float* inv);
398
399 //--------------------------Public Code--------------------------------------
400 /** MakeClusterer **********************************************************
401 Parameters: SampleSize number of dimensions in feature space
402 ParamDesc description of each dimension
403 Globals: None
404 Operation: This routine creates a new clusterer data structure,
405 initializes it, and returns a pointer to it.
406 Return: pointer to the new clusterer data structure
407 Exceptions: None
408 History: 5/29/89, DSJ, Created.
409 ****************************************************************************/
410 CLUSTERER *
411 MakeClusterer (inT16 SampleSize, PARAM_DESC ParamDesc[]) {
412 CLUSTERER *Clusterer;
413 int i;
414
415 // allocate main clusterer data structure and init simple fields
416 Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
417 Clusterer->SampleSize = SampleSize;
418 Clusterer->NumberOfSamples = 0;
419 Clusterer->NumChar = 0;
420
421 // init fields which will not be used initially
422 Clusterer->Root = NULL;
423 Clusterer->ProtoList = NIL;
424
425 // maintain a copy of param descriptors in the clusterer data structure
426 Clusterer->ParamDesc =
427 (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
428 for (i = 0; i < SampleSize; i++) {
429 Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
430 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
431 Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
432 Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
433 Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
434 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
435 Clusterer->ParamDesc[i].MidRange =
436 (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
437 }
438
439 // allocate a kd tree to hold the samples
440 Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
441
442 // execute hook for monitoring clustering operation
443 // (*ClustererCreationHook)( Clusterer );
444
445 return (Clusterer);
446 } // MakeClusterer
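/* Usage sketch (illustrative only; the dimension descriptions are
   hypothetical - only the fields read by MakeClusterer are set):

     PARAM_DESC Desc[2];
     Desc[0].Circular = FALSE;  Desc[0].NonEssential = FALSE;
     Desc[0].Min = 0.0f;        Desc[0].Max = 1.0f;
     Desc[1].Circular = TRUE;   Desc[1].NonEssential = FALSE;
     Desc[1].Min = 0.0f;        Desc[1].Max = 1.0f;
     CLUSTERER *Clusterer = MakeClusterer (2, Desc);
*/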
447
448
449 /** MakeSample ***********************************************************
450 Parameters: Clusterer clusterer data structure to add sample to
451 Feature feature to be added to clusterer
452 CharID unique identifier of the character the sample came from
453 Globals: None
454 Operation: This routine creates a new sample data structure to hold
455 the specified feature. This sample is added to the clusterer
456 data structure (so that it knows which samples are to be
457 clustered later), and a pointer to the sample is returned to
458 the caller.
459 Return: Pointer to the new sample data structure
460 Exceptions: ALREADYCLUSTERED MakeSample can't be called after
461 ClusterSamples has been called
462 History: 5/29/89, DSJ, Created.
463 *****************************************************************************/
464 SAMPLE *
465 MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID) {
466 SAMPLE *Sample;
467 int i;
468
469 // see if the samples have already been clustered - if so trap an error
470 if (Clusterer->Root != NULL)
471 DoError (ALREADYCLUSTERED,
472 "Can't add samples after they have been clustered");
473
474 // allocate the new sample and initialize it
475 Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
476 (Clusterer->SampleSize -
477 1) * sizeof (FLOAT32));
478 Sample->Clustered = FALSE;
479 Sample->Prototype = FALSE;
480 Sample->SampleCount = 1;
481 Sample->Left = NULL;
482 Sample->Right = NULL;
483 Sample->CharID = CharID;
484
485 for (i = 0; i < Clusterer->SampleSize; i++)
486 Sample->Mean[i] = Feature[i];
487
488 // add the sample to the KD tree - keep track of the total # of samples
489 Clusterer->NumberOfSamples++;
490 KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
491 if (CharID >= Clusterer->NumChar)
492 Clusterer->NumChar = CharID + 1;
493
494 // execute hook for monitoring clustering operation
495 // (*SampleCreationHook)( Sample );
496
497 return (Sample);
498 } // MakeSample
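/* Usage sketch (illustrative only): one call per feature vector, tagging
   each sample with the id of the character it came from:

     FLOAT32 Feature[2] = { 0.25f, 0.75f };
     MakeSample (Clusterer, Feature, CharID);
*/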
499
500
501 /** ClusterSamples ***********************************************************
502 Parameters: Clusterer data struct containing samples to be clustered
503 Config parameters which control clustering process
504 Globals: None
505 Operation: This routine first checks to see if the samples in this
506 clusterer have already been clustered before; if so, it does
507 not bother to recreate the cluster tree. It simply recomputes
508 the prototypes based on the new Config info.
509 If the samples have not been clustered before, the
510 samples in the KD tree are formed into a cluster tree and then
511 the prototypes are computed from the cluster tree.
512 In either case this routine returns a pointer to a
513 list of prototypes that best represent the samples given
514 the constraints specified in Config.
515 Return: Pointer to a list of prototypes
516 Exceptions: None
517 History: 5/29/89, DSJ, Created.
518 *******************************************************************************/
519 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
520 //only create cluster tree if samples have never been clustered before
521 if (Clusterer->Root == NULL)
522 CreateClusterTree(Clusterer);
523
524 //deallocate the old prototype list if one exists
525 FreeProtoList (&Clusterer->ProtoList);
526 Clusterer->ProtoList = NIL;
527
528 //compute prototypes starting at the root node in the tree
529 ComputePrototypes(Clusterer, Config);
530 return (Clusterer->ProtoList);
531 } // ClusterSamples
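/* End-to-end sketch (illustrative only; the CLUSTERCONFIG values below are
   hypothetical - see cluster.h for the field definitions):

     CLUSTERCONFIG Config;
     Config.ProtoStyle = elliptical;   // or spherical / mixed / automatic
     Config.MinSamples = 0.5;
     Config.MaxIllegal = 0.05;
     Config.Independence = 0.8;
     Config.Confidence = 0.05;
     Config.MagicSamples = 0;

     LIST ProtoList = ClusterSamples (Clusterer, &Config);
     LIST Protos = ProtoList;
     iterate (Protos) {
       PROTOTYPE *Proto = (PROTOTYPE *) first_node (Protos);
       // ... use Proto->Mean, Proto->Variance, Proto->Significant, ...
     }
     FreeClusterer (Clusterer);     // prototypes stay valid, but their
     FreeProtoList (&ProtoList);    // Cluster pointers are reset to NULL
*/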
532
533
534 /** FreeClusterer *************************************************************
535 Parameters: Clusterer pointer to data structure to be freed
536 Globals: None
537 Operation: This routine frees all of the memory allocated to the
538 specified data structure. It will not, however, free
539 the memory used by the prototype list. The pointers to
540 the clusters for each prototype in the list will be set
541 to NULL to indicate that the cluster data structures no
542 longer exist. Any sample lists that have been obtained
543 via calls to GetSamples are no longer valid.
544 Return: None
545 Exceptions: None
546 History: 6/6/89, DSJ, Created.
547 *******************************************************************************/
548 void FreeClusterer(CLUSTERER *Clusterer) {
549 if (Clusterer != NULL) {
550 memfree (Clusterer->ParamDesc);
551 if (Clusterer->KDTree != NULL)
552 FreeKDTree (Clusterer->KDTree);
553 if (Clusterer->Root != NULL)
554 FreeCluster (Clusterer->Root);
555 iterate (Clusterer->ProtoList) {
556 ((PROTOTYPE *) (first_node (Clusterer->ProtoList)))->Cluster = NULL;
557 }
558 memfree(Clusterer);
559 }
560 } // FreeClusterer
561
562
563 /** FreeProtoList ************************************************************
564 Parameters: ProtoList pointer to list of prototypes to be freed
565 Globals: None
566 Operation: This routine frees all of the memory allocated to the
567 specified list of prototypes. The clusters which are
568 pointed to by the prototypes are not freed.
569 Return: None
570 Exceptions: None
571 History: 6/6/89, DSJ, Created.
572 *****************************************************************************/
573 void FreeProtoList(LIST *ProtoList) {
574 destroy_nodes(*ProtoList, FreePrototype);
575 } // FreeProtoList
576
577
578 /** FreePrototype ************************************************************
579 Parameters: Prototype prototype data structure to be deallocated
580 Globals: None
581 Operation: This routine deallocates the memory consumed by the specified
582 prototype and modifies the corresponding cluster so that it
583 is no longer marked as a prototype. The cluster is NOT
584 deallocated by this routine.
585 Return: None
586 Exceptions: None
587 History: 5/30/89, DSJ, Created.
588 *******************************************************************************/
589 void FreePrototype(void *arg) { //PROTOTYPE *Prototype)
590 PROTOTYPE *Prototype = (PROTOTYPE *) arg;
591
592 // unmark the corresponding cluster (if there is one)
593 if (Prototype->Cluster != NULL)
594 Prototype->Cluster->Prototype = FALSE;
595
596 // deallocate the prototype statistics and then the prototype itself
597 if (Prototype->Distrib != NULL)
598 memfree (Prototype->Distrib);
599 if (Prototype->Mean != NULL)
600 memfree (Prototype->Mean);
601 if (Prototype->Style != spherical) {
602 if (Prototype->Variance.Elliptical != NULL)
603 memfree (Prototype->Variance.Elliptical);
604 if (Prototype->Magnitude.Elliptical != NULL)
605 memfree (Prototype->Magnitude.Elliptical);
606 if (Prototype->Weight.Elliptical != NULL)
607 memfree (Prototype->Weight.Elliptical);
608 }
609 memfree(Prototype);
610 } // FreePrototype
611
612
613 /** NextSample ************************************************************
614 Parameters: SearchState ptr to list containing clusters to be searched
615 Globals: None
616 Operation: This routine is used to find all of the samples which
617 belong to a cluster. It starts by removing the top
618 cluster on the cluster list (SearchState). If this cluster is
619 a leaf it is returned. Otherwise, the right subcluster
620 is pushed on the list and we continue the search in the
621 left subcluster. This continues until a leaf is found.
622 If all samples have been found, NULL is returned.
623 InitSampleSearch() must be called
624 before NextSample() to initialize the search.
625 Return: Pointer to the next leaf cluster (sample) or NULL.
626 Exceptions: None
627 History: 6/16/89, DSJ, Created.
628 ****************************************************************************/
629 CLUSTER *NextSample(LIST *SearchState) {
630 CLUSTER *Cluster;
631
632 if (*SearchState == NIL)
633 return (NULL);
634 Cluster = (CLUSTER *) first_node (*SearchState);
635 *SearchState = pop (*SearchState);
636 while (TRUE) {
637 if (Cluster->Left == NULL)
638 return (Cluster);
639 *SearchState = push (*SearchState, Cluster->Right);
640 Cluster = Cluster->Left;
641 }
642 } // NextSample
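/* Usage sketch (illustrative only): walk every sample in a cluster using the
   InitSampleSearch() macro mentioned above (declared in cluster.h):

     LIST SearchState;
     SAMPLE *Sample;
     InitSampleSearch (SearchState, Cluster);
     while ((Sample = NextSample (&SearchState)) != NULL) {
       // ... Sample->Mean[] holds the feature vector ...
     }
*/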
643
644
645 /** Mean ***********************************************************
646 Parameters: Proto prototype to return mean of
647 Dimension dimension whose mean is to be returned
648 Globals: none
649 Operation: This routine returns the mean of the specified
650 prototype in the indicated dimension.
651 Return: Mean of Prototype in Dimension
652 Exceptions: none
653 History: 7/6/89, DSJ, Created.
654 *********************************************************************/
655 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
656 return (Proto->Mean[Dimension]);
657 } // Mean
658
659
660 /** StandardDeviation *************************************************
661 Parameters: Proto prototype to return standard deviation of
662 Dimension dimension whose stddev is to be returned
663 Globals: none
664 Operation: This routine returns the standard deviation of the
665 prototype in the indicated dimension.
666 Return: Standard deviation of Prototype in Dimension
667 Exceptions: none
668 History: 7/6/89, DSJ, Created.
669 **********************************************************************/
670 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
671 switch (Proto->Style) {
672 case spherical:
673 return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
674 case elliptical:
675 return ((FLOAT32)
676 sqrt ((double) Proto->Variance.Elliptical[Dimension]));
677 case mixed:
678 switch (Proto->Distrib[Dimension]) {
679 case normal:
680 return ((FLOAT32)
681 sqrt ((double) Proto->Variance.Elliptical[Dimension]));
682 case uniform:
683 case D_random:
684 return (Proto->Variance.Elliptical[Dimension]);
685 }
686 }
687 return 0.0f;
688 } // StandardDeviation
689
690
691 /*---------------------------------------------------------------------------
692 Private Code
693 ----------------------------------------------------------------------------*/
694 /** CreateClusterTree *******************************************************
695 Parameters: Clusterer data structure holding samples to be clustered
696 Globals: Tree kd-tree holding samples
697 TempCluster array of temporary clusters
698 CurrentTemp index of next temp cluster to be used
699 Heap heap used to hold temp clusters - "best" on top
700 Operation: This routine performs a bottom-up clustering on the samples
701 held in the kd-tree of the Clusterer data structure. The
702 result is a cluster tree. Each node in the tree represents
703 a cluster which conceptually contains a subset of the samples.
704 More precisely, the cluster contains all of the samples which
705 are contained in its two sub-clusters. The leaves of the
706 tree are the individual samples themselves; they have no
707 sub-clusters. The root node of the tree conceptually contains
708 all of the samples.
709 Return: None (the Clusterer data structure is changed)
710 Exceptions: None
711 History: 5/29/89, DSJ, Created.
712 ******************************************************************************/
713 void CreateClusterTree(CLUSTERER *Clusterer) {
714 HEAPENTRY HeapEntry;
715 TEMPCLUSTER *PotentialCluster;
716
717 // save the kd-tree in a global variable so kd-tree walker can get at it
718 Tree = Clusterer->KDTree;
719
720 // allocate memory to hold all of the "potential" clusters
721 TempCluster = (TEMPCLUSTER *)
722 Emalloc (Clusterer->NumberOfSamples * sizeof (TEMPCLUSTER));
723 CurrentTemp = 0;
724
725 // each sample and its nearest neighbor form a "potential" cluster
726 // save these in a heap with the "best" potential clusters on top
727 Heap = MakeHeap (Clusterer->NumberOfSamples);
728 KDWalk (Tree, (void_proc) MakePotentialClusters);
729
730 // form potential clusters into actual clusters - always do "best" first
731 while (GetTopOfHeap (Heap, &HeapEntry) != EMPTY) {
732 PotentialCluster = (TEMPCLUSTER *) (HeapEntry.Data);
733
734 // if main cluster of potential cluster is already in another cluster
735 // then we don't need to worry about it
736 if (PotentialCluster->Cluster->Clustered) {
737 continue;
738 }
739
740 // if main cluster is not yet clustered, but its nearest neighbor is
741 // then we must find a new nearest neighbor
742 else if (PotentialCluster->Neighbor->Clustered) {
743 PotentialCluster->Neighbor =
744 FindNearestNeighbor (Tree, PotentialCluster->Cluster,
745 &(HeapEntry.Key));
746 if (PotentialCluster->Neighbor != NULL) {
747 HeapStore(Heap, &HeapEntry);
748 }
749 }
750
751 // if neither cluster is already clustered, form permanent cluster
752 else {
753 PotentialCluster->Cluster =
754 MakeNewCluster(Clusterer, PotentialCluster);
755 PotentialCluster->Neighbor =
756 FindNearestNeighbor (Tree, PotentialCluster->Cluster,
757 &(HeapEntry.Key));
758 if (PotentialCluster->Neighbor != NULL) {
759 HeapStore(Heap, &HeapEntry);
760 }
761 }
762 }
763
764 // the root node in the cluster tree is now the only node in the kd-tree
765 Clusterer->Root = (CLUSTER *) RootOf (Clusterer->KDTree);
766
767 // free up the memory used by the K-D tree, heap, and temp clusters
768 FreeKDTree(Tree);
769 Clusterer->KDTree = NULL;
770 FreeHeap(Heap);
771 memfree(TempCluster);
772 } // CreateClusterTree
773
774
775 /** MakePotentialClusters **************************************************
776 Parameters: Cluster current cluster being visited in kd-tree walk
777 Order order in which cluster is being visited
778 Level level of this cluster in the kd-tree
779 Globals: Tree kd-tree to be searched for neighbors
780 TempCluster array of temporary clusters
781 CurrentTemp index of next temp cluster to be used
782 Heap heap used to hold temp clusters - "best" on top
783 Operation: This routine is designed to be used in concert with the
784 KDWalk routine. It will create a potential cluster for
785 each sample in the kd-tree that is being walked. This
786 potential cluster will then be pushed on the heap.
787 Return: none
788 Exceptions: none
789 History: 5/29/89, DSJ, Created.
790 7/13/89, DSJ, Removed visibility of kd-tree node data struct.
791 ******************************************************************************/
792 void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, inT32 Level) {
793 HEAPENTRY HeapEntry;
794
795 if ((Order == preorder) || (Order == leaf)) {
796 TempCluster[CurrentTemp].Cluster = Cluster;
797 HeapEntry.Data = (char *) &(TempCluster[CurrentTemp]);
798 TempCluster[CurrentTemp].Neighbor =
799 FindNearestNeighbor (Tree, TempCluster[CurrentTemp].Cluster,
800 &(HeapEntry.Key));
801 if (TempCluster[CurrentTemp].Neighbor != NULL) {
802 HeapStore(Heap, &HeapEntry);
803 CurrentTemp++;
804 }
805 }
806 } // MakePotentialClusters
807
808
809 /** FindNearestNeighbor *********************************************************
810 Parameters: Tree kd-tree to search in for nearest neighbor
811 Cluster cluster whose nearest neighbor is to be found
812 Distance ptr to variable to report distance found
813 Globals: none
814 Operation: This routine searches the specified kd-tree for the nearest
815 neighbor of the specified cluster. It actually uses the
816 kd routines to find the 2 nearest neighbors since one of them
817 will be the original cluster. A pointer to the nearest
818 neighbor is returned, if it can be found, otherwise NULL is
819 returned. The distance between the 2 nodes is placed
820 in the specified variable.
821 Return: Pointer to the nearest neighbor of Cluster, or NULL
822 Exceptions: none
823 History: 5/29/89, DSJ, Created.
824 7/13/89, DSJ, Removed visibility of kd-tree node data struct
825 ********************************************************************************/
826 CLUSTER *
827 FindNearestNeighbor (KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
828 #define MAXNEIGHBORS 2
829 #define MAXDISTANCE MAX_FLOAT32
830 {
831 CLUSTER *Neighbor[MAXNEIGHBORS];
832 FLOAT32 Dist[MAXNEIGHBORS];
833 inT32 NumberOfNeighbors;
834 inT32 i;
835 CLUSTER *BestNeighbor;
836
837 // find the 2 nearest neighbors of the cluster
838 NumberOfNeighbors = KDNearestNeighborSearch
839 (Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE, Neighbor, Dist);
840
841 // search for the nearest neighbor that is not the cluster itself
842 *Distance = MAXDISTANCE;
843 BestNeighbor = NULL;
844 for (i = 0; i < NumberOfNeighbors; i++) {
845 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
846 *Distance = Dist[i];
847 BestNeighbor = Neighbor[i];
848 }
849 }
850 return (BestNeighbor);
851 } // FindNearestNeighbor
852
853
854 /** MakeNewCluster *************************************************************
855 Parameters: Clusterer current clustering environment
856 TempCluster potential cluster to make permanent
857 Globals: none
858 Operation: This routine creates a new permanent cluster from the
859 clusters specified in TempCluster. The 2 clusters in
860 TempCluster are marked as "clustered" and deleted from
861 the kd-tree. The new cluster is then added to the kd-tree.
862 Return: Pointer to the new permanent cluster
863 Exceptions: none
864 History: 5/29/89, DSJ, Created.
865 7/13/89, DSJ, Removed visibility of kd-tree node data struct
866 ********************************************************************************/
867 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
868 CLUSTER *Cluster;
869
870 // allocate the new cluster and initialize it
871 Cluster = (CLUSTER *) Emalloc (sizeof (CLUSTER) +
872 (Clusterer->SampleSize -
873 1) * sizeof (FLOAT32));
874 Cluster->Clustered = FALSE;
875 Cluster->Prototype = FALSE;
876 Cluster->Left = TempCluster->Cluster;
877 Cluster->Right = TempCluster->Neighbor;
878 Cluster->CharID = -1;
879
880 // mark the old clusters as "clustered" and delete them from the kd-tree
881 Cluster->Left->Clustered = TRUE;
882 Cluster->Right->Clustered = TRUE;
883 KDDelete (Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
884 KDDelete (Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
885
886 // compute the mean and sample count for the new cluster
887 Cluster->SampleCount =
888 MergeClusters (Clusterer->SampleSize, Clusterer->ParamDesc,
889 Cluster->Left->SampleCount, Cluster->Right->SampleCount,
890 Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
891
892 // add the new cluster to the KD tree
893 KDStore (Clusterer->KDTree, Cluster->Mean, Cluster);
894 return (Cluster);
895 } // MakeNewCluster
896
897
898 /** MergeClusters ************************************************************
899 Parameters: N # of dimensions (size of arrays)
900 ParamDesc array of dimension descriptions
901 n1, n2 number of samples in each old cluster
902 m array to hold mean of new cluster
903 m1, m2 arrays containing means of old clusters
904 Globals: None
905 Operation: This routine merges two clusters into one larger cluster.
906 To do this it computes the number of samples in the new
907 cluster and the mean of the new cluster. The ParamDesc
908 information is used to ensure that circular dimensions
909 are handled correctly.
910 Return: The number of samples in the new cluster.
911 Exceptions: None
912 History: 5/31/89, DSJ, Created.
913 *********************************************************************************/
914 inT32
915 MergeClusters (inT16 N,
916 register PARAM_DESC ParamDesc[],
917 register inT32 n1,
918 register inT32 n2,
919 register FLOAT32 m[],
920 register FLOAT32 m1[], register FLOAT32 m2[]) {
921 register inT32 i, n;
922
923 n = n1 + n2;
924 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
925 if (ParamDesc->Circular) {
926 // if distance between means is greater than allowed
927 // reduce upper point by one "rotation" to compute mean
928 // then normalize the mean back into the accepted range
929 if ((*m2 - *m1) > ParamDesc->HalfRange) {
930 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
931 if (*m < ParamDesc->Min)
932 *m += ParamDesc->Range;
933 }
934 else if ((*m1 - *m2) > ParamDesc->HalfRange) {
935 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
936 if (*m < ParamDesc->Min)
937 *m += ParamDesc->Range;
938 }
939 else
940 *m = (n1 * *m1 + n2 * *m2) / n;
941 }
942 else
943 *m = (n1 * *m1 + n2 * *m2) / n;
944 }
945 return (n);
946 } // MergeClusters
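/* Worked example (editorial note) for a circular dimension with Min = 0,
   Range = 256 and HalfRange = 128: merging n1 = 1 sample at m1 = 250 with
   n2 = 1 sample at m2 = 10 takes the (m1 - m2) > HalfRange branch, so

     m = (1 * (250 - 256) + 1 * 10) / 2 = 2

   which is the correct wrap-around midpoint and is already >= Min, so no
   re-normalization is needed. */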
947
948
949 /** ComputePrototypes *******************************************************
950 Parameters: Clusterer data structure holding cluster tree
951 Config parameters used to control prototype generation
952 Globals: None
953 Operation: This routine decides which clusters in the cluster tree
954 should be represented by prototypes, forms a list of these
955 prototypes, and places the list in the Clusterer data
956 structure.
957 Return: None
958 Exceptions: None
959 History: 5/30/89, DSJ, Created.
960 *******************************************************************************/
961 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
962 LIST ClusterStack = NIL;
963 CLUSTER *Cluster;
964 PROTOTYPE *Prototype;
965
966 // use a stack to keep track of clusters waiting to be processed
967 // initially the only cluster on the stack is the root cluster
968 if (Clusterer->Root != NULL)
969 ClusterStack = push (NIL, Clusterer->Root);
970
971 // loop until we have analyzed all clusters which are potential prototypes
972 while (ClusterStack != NIL) {
973 // remove the next cluster to be analyzed from the stack
974 // try to make a prototype from the cluster
975 // if successful, put it on the proto list, else split the cluster
976 Cluster = (CLUSTER *) first_node (ClusterStack);
977 ClusterStack = pop (ClusterStack);
978 Prototype = MakePrototype (Clusterer, Config, Cluster);
979 if (Prototype != NULL) {
980 Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
981 }
982 else {
983 ClusterStack = push (ClusterStack, Cluster->Right);
984 ClusterStack = push (ClusterStack, Cluster->Left);
985 }
986 }
987 } // ComputePrototypes
988
989
990 /** MakePrototype ***********************************************************
991 Parameters: Clusterer data structure holding cluster tree
992 Config parameters used to control prototype generation
993 Cluster cluster to be made into a prototype
994 Globals: None
995 Operation: This routine attempts to create a prototype from the
996 specified cluster that conforms to the distribution
997 specified in Config. If there are too few samples in the
998 cluster to perform a statistical analysis, then a prototype
999 is generated but labelled as insignificant. If the
1000 dimensions of the cluster are not independent, no prototype
1001 is generated and NULL is returned. If a prototype can be
1002 found that matches the desired distribution then a pointer
1003 to it is returned, otherwise NULL is returned.
1004 Return: Pointer to new prototype or NULL
1005 Exceptions: None
1006 History: 6/19/89, DSJ, Created.
1007 *******************************************************************************/
1008 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
1009 CLUSTERCONFIG *Config,
1010 CLUSTER *Cluster) {
1011 STATISTICS *Statistics;
1012 PROTOTYPE *Proto;
1013 BUCKETS *Buckets;
1014
1015 // filter out clusters which contain samples from the same character
1016 if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
1017 return (NULL);
1018
1019 // compute the covariance matrix and ranges for the cluster
1020 Statistics =
1021 ComputeStatistics (Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
1022
1023 // check for degenerate clusters which need not be analyzed further
1024 // note that the MinSamples test assumes that all clusters with multiple
1025 // character samples have been removed (as above)
1026 Proto = MakeDegenerateProto (Clusterer->SampleSize, Cluster, Statistics,
1027 Config->ProtoStyle,
1028 (inT32) (Config->MinSamples *
1029 Clusterer->NumChar));
1030 if (Proto != NULL) {
1031 FreeStatistics(Statistics);
1032 return (Proto);
1033 }
1034 // check to ensure that all dimensions are independent
1035 if (!Independent (Clusterer->ParamDesc, Clusterer->SampleSize,
1036 Statistics->CoVariance, Config->Independence)) {
1037 FreeStatistics(Statistics);
1038 return (NULL);
1039 }
1040
1041 if (HOTELLING && Config->ProtoStyle == elliptical) {
1042 Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1043 if (Proto != NULL) {
1044 FreeStatistics(Statistics);
1045 return Proto;
1046 }
1047 }
1048
1049 // create a histogram data structure used to evaluate distributions
1050 Buckets = GetBuckets (normal, Cluster->SampleCount, Config->Confidence);
1051
1052 // create a prototype based on the statistics and test it
1053 switch (Config->ProtoStyle) {
1054 case spherical:
1055 Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
1056 break;
1057 case elliptical:
1058 Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
1059 break;
1060 case mixed:
1061 Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
1062 Config->Confidence);
1063 break;
1064 case automatic:
1065 Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
1066 if (Proto != NULL)
1067 break;
1068 Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
1069 if (Proto != NULL)
1070 break;
1071 Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
1072 Config->Confidence);
1073 break;
1074 }
1075 FreeBuckets(Buckets);
1076 FreeStatistics(Statistics);
1077 return (Proto);
1078 } // MakePrototype
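/* Editorial note: when Config->ProtoStyle is automatic, the styles above are
   tried in order of increasing generality (spherical, then elliptical, then
   mixed) and the first one whose per-dimension distribution tests succeed is
   returned; NULL is returned only if all three fail. */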
1079
1080
1081 /** MakeDegenerateProto ******************************************************
1082 Parameters: N number of dimensions
1083 Cluster cluster being analyzed
1084 Statistics statistical info about cluster
1085 Style type of prototype to be generated
1086 MinSamples minimum number of samples in a cluster
1087 Globals: None
1088 Operation: This routine checks for clusters which are degenerate and
1089 therefore cannot be analyzed in a statistically valid way.
1090 A cluster is defined as degenerate if it does not have at
1091 least MINSAMPLESNEEDED samples in it. If the cluster is
1092 found to be degenerate, a prototype of the specified style
1093 is generated and marked as insignificant. A cluster is
1094 also degenerate if it does not have at least MinSamples
1095 samples in it.
1096 If the cluster is not degenerate, NULL is returned.
1097 Return: Pointer to degenerate prototype or NULL.
1098 Exceptions: None
1099 History: 6/20/89, DSJ, Created.
1100 7/12/89, DSJ, Changed name and added check for 0 stddev.
1101 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).
1102 ********************************************************************************/
1103 PROTOTYPE *MakeDegenerateProto( //this was MinSample
1104 uinT16 N,
1105 CLUSTER *Cluster,
1106 STATISTICS *Statistics,
1107 PROTOSTYLE Style,
1108 inT32 MinSamples) {
1109 PROTOTYPE *Proto = NULL;
1110
1111 if (MinSamples < MINSAMPLESNEEDED)
1112 MinSamples = MINSAMPLESNEEDED;
1113
1114 if (Cluster->SampleCount < MinSamples) {
1115 switch (Style) {
1116 case spherical:
1117 Proto = NewSphericalProto (N, Cluster, Statistics);
1118 break;
1119 case elliptical:
1120 case automatic:
1121 Proto = NewEllipticalProto (N, Cluster, Statistics);
1122 break;
1123 case mixed:
1124 Proto = NewMixedProto (N, Cluster, Statistics);
1125 break;
1126 }
1127 Proto->Significant = FALSE;
1128 }
1129 return (Proto);
1130 } // MakeDegenerateProto
1131
1132 /** TestEllipticalProto ****************************************************
1133 Parameters: Clusterer data struct containing samples being clustered
1134 Config provides the magic number of samples that make a good cluster
1135 Cluster cluster to be made into an elliptical prototype
1136 Statistics statistical info about cluster
1137 Globals: None
1138 Operation: This routine tests the specified cluster to see if
1139 there is a statistically significant difference between
1140 the sub-clusters that would be made if the cluster were to
1141 be split. If not, then a new prototype is formed and
1142 returned to the caller. If there is, then NULL is returned
1143 to the caller.
1144 Return: Pointer to new elliptical prototype or NULL.
1145 ****************************************************************************/
1146 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
1147 CLUSTERCONFIG *Config,
1148 CLUSTER *Cluster,
1149 STATISTICS *Statistics) {
1150 // Fraction of the number of samples used as a range around 1 within
1151 // which a cluster has the magic size that allows a boost to the
1152 // FTable by kFTableBoostMargin, thus allowing clusters near the
1153 // magic size (equal to the number of sample characters) to be more
1154 // likely to stay together.
1155 const double kMagicSampleMargin = 0.0625;
1156 const double kFTableBoostMargin = 2.0;
1157
1158 int N = Clusterer->SampleSize;
1159 CLUSTER* Left = Cluster->Left;
1160 CLUSTER* Right = Cluster->Right;
1161 if (Left == NULL || Right == NULL)
1162 return NULL;
1163 int TotalDims = Left->SampleCount + Right->SampleCount;
1164 if (TotalDims < N + 1 || TotalDims < 2)
1165 return NULL;
1166 const int kMatrixSize = N * N * sizeof(FLOAT32);
1167 FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1168 FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1169 FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1170 // Compute a new covariance matrix that only uses essential features.
1171 for (int i = 0; i < N; ++i) {
1172 int row_offset = i * N;
1173 if (!Clusterer->ParamDesc[i].NonEssential) {
1174 for (int j = 0; j < N; ++j) {
1175 if (!Clusterer->ParamDesc[j].NonEssential)
1176 Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1177 else
1178 Covariance[j + row_offset] = 0.0f;
1179 }
1180 } else {
1181 for (int j = 0; j < N; ++j) {
1182 if (i == j)
1183 Covariance[j + row_offset] = 1.0f;
1184 else
1185 Covariance[j + row_offset] = 0.0f;
1186 }
1187 }
1188 }
1189 double err = InvertMatrix(Covariance, N, Inverse);
1190 if (err > 1) {
1191 tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1192 }
1193 int EssentialN = 0;
1194 for (int dim = 0; dim < N; ++dim) {
1195 if (!Clusterer->ParamDesc[dim].NonEssential) {
1196 Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1197 ++EssentialN;
1198 } else {
1199 Delta[dim] = 0.0f;
1200 }
1201 }
1202 // Compute Hotelling's T-squared.
1203 double Tsq = 0.0;
1204 for (int x = 0; x < N; ++x) {
1205 double temp = 0.0;
1206 for (int y = 0; y < N; ++y) {
1207 temp += Inverse[y + N*x] * Delta[y];
1208 }
1209 Tsq += Delta[x] * temp;
1210 }
1211 memfree(Covariance);
1212 memfree(Inverse);
1213 memfree(Delta);
1214 // Changed this function to match the formula in
1215 // Statistical Methods in Medical Research p 473
1216 // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1217 // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1218 double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1219 int Fx = EssentialN;
1220 if (Fx > FTABLE_X)
1221 Fx = FTABLE_X;
1222 --Fx;
1223 int Fy = TotalDims - EssentialN - 1;
1224 if (Fy > FTABLE_Y)
1225 Fy = FTABLE_Y;
1226 --Fy;
1227 double FTarget = FTable[Fy][Fx];
1228 if (Config->MagicSamples > 0 &&
1229 TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1230 TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1231 // Give magic-sized clusters a magic FTable boost.
1232 FTarget += kFTableBoostMargin;
1233 }
1234 if (F < FTarget) {
1235 return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1236 }
1237 return NULL;
1238 }
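/* Editorial note on the test above: with p = EssentialN essential dimensions
   and n = TotalDims = Left->SampleCount + Right->SampleCount, the statistic
   derived from the quadratic form Tsq = Delta' * Inverse * Delta is

     F = Tsq * (n - p - 1) / ((n - 2) * p)

   and is compared against the 1% critical value FTable[n - p - 2][p - 1]
   (with both indices clamped to the table size); the cluster is kept whole
   (an elliptical prototype is returned) only when F is below that
   threshold. */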
1239
1240 /* MakeSphericalProto *******************************************************
1241 Parameters: Clusterer data struct containing samples being clustered
1242 Cluster cluster to be made into a spherical prototype
1243 Statistics statistical info about cluster
1244 Buckets histogram struct used to analyze distribution
1245 Globals: None
1246 Operation: This routine tests the specified cluster to see if it can
1247 be approximated by a spherical normal distribution. If it
1248 can be, then a new prototype is formed and returned to the
1249 caller. If it can't be, then NULL is returned to the caller.
1250 Return: Pointer to new spherical prototype or NULL.
1251 Exceptions: None
1252 History: 6/1/89, DSJ, Created.
1253 ******************************************************************************/
1254 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
1255 CLUSTER *Cluster,
1256 STATISTICS *Statistics,
1257 BUCKETS *Buckets) {
1258 PROTOTYPE *Proto = NULL;
1259 int i;
1260
1261 // check that each dimension is a normal distribution
1262 for (i = 0; i < Clusterer->SampleSize; i++) {
1263 if (Clusterer->ParamDesc[i].NonEssential)
1264 continue;
1265
1266 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1267 Cluster->Mean[i],
1268 sqrt ((FLOAT64) (Statistics->AvgVariance)));
1269 if (!DistributionOK (Buckets))
1270 break;
1271 }
1272 // if all dimensions matched a normal distribution, make a proto
1273 if (i >= Clusterer->SampleSize)
1274 Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1275 return (Proto);
1276 } // MakeSphericalProto
1277
1278
1279 /** MakeEllipticalProto ****************************************************
1280 Parameters: Clusterer data struct containing samples being clustered
1281 Cluster cluster to be made into an elliptical prototype
1282 Statistics statistical info about cluster
1283 Buckets histogram struct used to analyze distribution
1284 Globals: None
1285 Operation: This routine tests the specified cluster to see if it can
1286 be approximated by an elliptical normal distribution. If it
1287 can be, then a new prototype is formed and returned to the
1288 caller. If it can't be, then NULL is returned to the caller.
1289 Return: Pointer to new elliptical prototype or NULL.
1290 Exceptions: None
1291 History: 6/12/89, DSJ, Created.
1292 ****************************************************************************/
1293 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
1294 CLUSTER *Cluster,
1295 STATISTICS *Statistics,
1296 BUCKETS *Buckets) {
1297 PROTOTYPE *Proto = NULL;
1298 int i;
1299
1300 // check that each dimension is a normal distribution
1301 for (i = 0; i < Clusterer->SampleSize; i++) {
1302 if (Clusterer->ParamDesc[i].NonEssential)
1303 continue;
1304
1305 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1306 Cluster->Mean[i],
1307 sqrt ((FLOAT64) Statistics->
1308 CoVariance[i * (Clusterer->SampleSize + 1)]));
1309 if (!DistributionOK (Buckets))
1310 break;
1311 }
1312 // if all dimensions matched a normal distribution, make a proto
1313 if (i >= Clusterer->SampleSize)
1314 Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1315 return (Proto);
1316 } // MakeEllipticalProto
1317
1318
1319 /** MakeMixedProto ***********************************************************
1320 Parameters: Clusterer data struct containing samples being clustered
1321 Cluster cluster to be made into a prototype
1322 Statistics statistical info about cluster
1323 NormalBuckets histogram struct used to analyze distribution
1324 Confidence confidence level for alternate distributions
1325 Globals: None
1326 Operation: This routine tests each dimension of the specified cluster to
1327 see what distribution would best approximate that dimension.
1328 Each dimension is compared to the following distributions
1329 in order: normal, random, uniform. If each dimension can
1330 be represented by one of these distributions,
1331 then a new prototype is formed and returned to the
1332 caller. If it can't be, then NULL is returned to the caller.
1333 Return: Pointer to new mixed prototype or NULL.
1334 Exceptions: None
1335 History: 6/12/89, DSJ, Created.
1336 ********************************************************************************/
1337 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
1338 CLUSTER *Cluster,
1339 STATISTICS *Statistics,
1340 BUCKETS *NormalBuckets,
1341 FLOAT64 Confidence) {
1342 PROTOTYPE *Proto;
1343 int i;
1344 BUCKETS *UniformBuckets = NULL;
1345 BUCKETS *RandomBuckets = NULL;
1346
1347 // create a mixed proto to work on - initially assume all dimensions normal
1348 Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1349
1350 // find the proper distribution for each dimension
1351 for (i = 0; i < Clusterer->SampleSize; i++) {
1352 if (Clusterer->ParamDesc[i].NonEssential)
1353 continue;
1354
1355 FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1356 Proto->Mean[i],
1357 sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1358 if (DistributionOK (NormalBuckets))
1359 continue;
1360
1361 if (RandomBuckets == NULL)
1362 RandomBuckets =
1363 GetBuckets (D_random, Cluster->SampleCount, Confidence);
1364 MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1365 FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1366 Proto->Mean[i], Proto->Variance.Elliptical[i]);
1367 if (DistributionOK (RandomBuckets))
1368 continue;
1369
1370 if (UniformBuckets == NULL)
1371 UniformBuckets =
1372 GetBuckets (uniform, Cluster->SampleCount, Confidence);
1373 MakeDimUniform(i, Proto, Statistics);
1374 FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1375 Proto->Mean[i], Proto->Variance.Elliptical[i]);
1376 if (DistributionOK (UniformBuckets))
1377 continue;
1378 break;
1379 }
1380 // if any dimension failed to match a distribution, discard the proto
1381 if (i < Clusterer->SampleSize) {
1382 FreePrototype(Proto);
1383 Proto = NULL;
1384 }
1385 if (UniformBuckets != NULL)
1386 FreeBuckets(UniformBuckets);
1387 if (RandomBuckets != NULL)
1388 FreeBuckets(RandomBuckets);
1389 return (Proto);
1390 } // MakeMixedProto
1391
1392
1393 /** MakeDimRandom ************************************************************
1394 Parameters: i index of dimension to be changed
1395 Proto prototype whose dimension is to be altered
1396 ParamDesc description of specified dimension
1397 Globals: None
1398 Operation: This routine alters the ith dimension of the specified
1399 mixed prototype to be D_random.
1400 Return: None
1401 Exceptions: None
1402 History: 6/20/89, DSJ, Created.
1403 ******************************************************************************/
1404 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
1405 Proto->Distrib[i] = D_random;
1406 Proto->Mean[i] = ParamDesc->MidRange;
1407 Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1408
1409 // subtract out the previous magnitude of this dimension from the total
1410 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1411 Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1412 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1413 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1414
1415 // note that the proto Weight is irrelevant for D_random protos
1416 } // MakeDimRandom
1417
1418
1419 /** MakeDimUniform ***********************************************************
1420 Parameters: i index of dimension to be changed
1421 Proto prototype whose dimension is to be altered
1422 Statistics statistical info about prototype
1423 Globals: None
1424 Operation: This routine alters the ith dimension of the specified
1425 mixed prototype to be uniform.
1426 Return: None
1427 Exceptions: None
1428 History: 6/20/89, DSJ, Created.
1429 ******************************************************************************/
1430 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
1431 Proto->Distrib[i] = uniform;
1432 Proto->Mean[i] = Proto->Cluster->Mean[i] +
1433 (Statistics->Min[i] + Statistics->Max[i]) / 2;
1434 Proto->Variance.Elliptical[i] =
1435 (Statistics->Max[i] - Statistics->Min[i]) / 2;
1436 if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1437 Proto->Variance.Elliptical[i] = MINVARIANCE;
1438
1439 // subtract out the previous magnitude of this dimension from the total
1440 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1441 Proto->Magnitude.Elliptical[i] =
1442 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1443 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1444 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1445
1446 // note that the proto Weight is irrelevant for uniform protos
1447 } // MakeDimUniform
1448
1449
1450 /** ComputeStatistics *********************************************************
1451 Parameters: N number of dimensions
1452 ParamDesc array of dimension descriptions
1453 Cluster cluster whose stats are to be computed
1454 Globals: None
1455 Operation: This routine searches the cluster tree for all leaf nodes
1456 which are samples in the specified cluster. It computes
1457 a full covariance matrix for these samples as well as
1458 keeping track of the ranges (min and max) for each
1459 dimension. A special data structure is allocated to
1460 return this information to the caller. An incremental
1461 algorithm for computing statistics is not used because
1462 it will not work with circular dimensions.
1463 Return: Pointer to new data structure containing statistics
1464 Exceptions: None
1465 History: 6/2/89, DSJ, Created.
1466 *********************************************************************************/
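/* In symbols (as derived from the loops below): with d[k][i] the difference
   between sample k and the cluster mean in dimension i, wrapped for circular
   dimensions, the code accumulates
       CoVariance[i][j] = (1 / (SampleCount - 1)) * sum over k of d[k][i] * d[k][j]
   and AvgVariance ends up as the geometric mean of the diagonal entries,
       AvgVariance = (CoVariance[0][0] * ... * CoVariance[N-1][N-1]) ^ (1/N).  */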
1467 STATISTICS *
1468 ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
1469 STATISTICS *Statistics;
1470 int i, j;
1471 FLOAT32 *CoVariance;
1472 FLOAT32 *Distance;
1473 LIST SearchState;
1474 SAMPLE *Sample;
1475 uinT32 SampleCountAdjustedForBias;
1476
1477 // allocate memory to hold the statistics results
1478 Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1479 Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1480 Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1481 Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1482
1483 // allocate temporary memory to hold the sample to mean distances
1484 Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1485
1486 // initialize the statistics
1487 Statistics->AvgVariance = 1.0;
1488 CoVariance = Statistics->CoVariance;
1489 for (i = 0; i < N; i++) {
1490 Statistics->Min[i] = 0.0;
1491 Statistics->Max[i] = 0.0;
1492 for (j = 0; j < N; j++, CoVariance++)
1493 *CoVariance = 0;
1494 }
1495 // find each sample in the cluster and merge it into the statistics
1496 InitSampleSearch(SearchState, Cluster);
1497 while ((Sample = NextSample (&SearchState)) != NULL) {
1498 for (i = 0; i < N; i++) {
1499 Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1500 if (ParamDesc[i].Circular) {
1501 if (Distance[i] > ParamDesc[i].HalfRange)
1502 Distance[i] -= ParamDesc[i].Range;
1503 if (Distance[i] < -ParamDesc[i].HalfRange)
1504 Distance[i] += ParamDesc[i].Range;
1505 }
1506 if (Distance[i] < Statistics->Min[i])
1507 Statistics->Min[i] = Distance[i];
1508 if (Distance[i] > Statistics->Max[i])
1509 Statistics->Max[i] = Distance[i];
1510 }
1511 CoVariance = Statistics->CoVariance;
1512 for (i = 0; i < N; i++)
1513 for (j = 0; j < N; j++, CoVariance++)
1514 *CoVariance += Distance[i] * Distance[j];
1515 }
1516 // normalize the variances by the total number of samples
1517 // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1518 // also compute the geometric mean of the diagonal variances
1519 // ensure that clusters with only 1 sample are handled correctly
1520 if (Cluster->SampleCount > 1)
1521 SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1522 else
1523 SampleCountAdjustedForBias = 1;
1524 CoVariance = Statistics->CoVariance;
1525 for (i = 0; i < N; i++)
1526 for (j = 0; j < N; j++, CoVariance++) {
1527 *CoVariance /= SampleCountAdjustedForBias;
1528 if (j == i) {
1529 if (*CoVariance < MINVARIANCE)
1530 *CoVariance = MINVARIANCE;
1531 Statistics->AvgVariance *= *CoVariance;
1532 }
1533 }
1534 Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1535 1.0 / N);
1536
1537 // release temporary memory and return
1538 memfree(Distance);
1539 return (Statistics);
1540 } // ComputeStatistics
1541
1542
1543 /** NewSphericalProto ********************************************************
1544 Parameters: N number of dimensions
1545 Cluster cluster to be made into a spherical prototype
1546 Statistics statistical info about samples in cluster
1547 Globals: None
1548 Operation: This routine creates a spherical prototype data structure to
1549 approximate the samples in the specified cluster.
1550 Spherical prototypes have a single variance which is
1551 common across all dimensions. All dimensions are normally
1552 distributed and independent.
1553 Return: Pointer to a new spherical prototype data structure
1554 Exceptions: None
1555 History: 6/19/89, DSJ, Created.
1556 ******************************************************************************/
1557 PROTOTYPE *NewSphericalProto(uinT16 N,
1558 CLUSTER *Cluster,
1559 STATISTICS *Statistics) {
1560 PROTOTYPE *Proto;
1561
1562 Proto = NewSimpleProto (N, Cluster);
1563
1564 Proto->Variance.Spherical = Statistics->AvgVariance;
1565 if (Proto->Variance.Spherical < MINVARIANCE)
1566 Proto->Variance.Spherical = MINVARIANCE;
1567
1568 Proto->Magnitude.Spherical =
1569 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1570 Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1571 (double) N);
1572 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1573 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1574
1575 return (Proto);
1576 } // NewSphericalProto
1577
1578
1579 /** NewEllipticalProto *******************************************************
1580 Parameters: N number of dimensions
1581 Cluster cluster to be made into an elliptical prototype
1582 Statistics statistical info about samples in cluster
1583 Globals: None
1584 Operation: This routine creates an elliptical prototype data structure to
1585 approximate the samples in the specified cluster.
1586 Elliptical prototypes have a variance for each dimension.
1587 All dimensions are normally distributed and independent.
1588 Return: Pointer to a new elliptical prototype data structure
1589 Exceptions: None
1590 History: 6/19/89, DSJ, Created.
1591 *******************************************************************************/
1592 PROTOTYPE *NewEllipticalProto(inT16 N,
1593 CLUSTER *Cluster,
1594 STATISTICS *Statistics) {
1595 PROTOTYPE *Proto;
1596 FLOAT32 *CoVariance;
1597 int i;
1598
1599 Proto = NewSimpleProto (N, Cluster);
1600 Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1601 Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1602 Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1603
1604 CoVariance = Statistics->CoVariance;
1605 Proto->TotalMagnitude = 1.0;
1606 for (i = 0; i < N; i++, CoVariance += N + 1) {
1607 Proto->Variance.Elliptical[i] = *CoVariance;
1608 if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1609 Proto->Variance.Elliptical[i] = MINVARIANCE;
1610
1611 Proto->Magnitude.Elliptical[i] =
1612 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1613 Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1614 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1615 }
1616 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1617 Proto->Style = elliptical;
1618 return (Proto);
1619 } // NewEllipticalProto
1620
1621
1622 /** NewMixedProto ************************************************************
1623 Parameters: N number of dimensions
1624 Cluster cluster to be made into a mixed prototype
1625 Statistics statistical info about samples in cluster
1626 Globals: None
1627 Operation: This routine creates a mixed prototype data structure to
1628 approximate the samples in the specified cluster.
1629 Mixed prototypes can have different distributions for
1630 each dimension. All dimensions are independent. The
1631 structure is initially filled in as though it were an
1632 elliptical prototype. The actual distributions of the
1633 dimensions can be altered by other routines.
1634 Return: Pointer to a new mixed prototype data structure
1635 Exceptions: None
1636 History: 6/19/89, DSJ, Created.
1637 ********************************************************************************/
1638 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
1639 PROTOTYPE *Proto;
1640 int i;
1641
1642 Proto = NewEllipticalProto (N, Cluster, Statistics);
1643 Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1644
1645 for (i = 0; i < N; i++) {
1646 Proto->Distrib[i] = normal;
1647 }
1648 Proto->Style = mixed;
1649 return (Proto);
1650 } // NewMixedProto
1651
1652
1653 /** NewSimpleProto ***********************************************************
1654 Parameters: N number of dimensions
1655 Cluster cluster to be made into a prototype
1656 Globals: None
1657 Operation: This routine allocates memory to hold a simple prototype
1658 data structure, i.e. one without independent distributions
1659 and variances for each dimension.
1660 Return: Pointer to new simple prototype
1661 Exceptions: None
1662 History: 6/19/89, DSJ, Created.
1663 *******************************************************************************/
1664 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) {
1665 PROTOTYPE *Proto;
1666 int i;
1667
1668 Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1669 Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1670
1671 for (i = 0; i < N; i++)
1672 Proto->Mean[i] = Cluster->Mean[i];
1673 Proto->Distrib = NULL;
1674
1675 Proto->Significant = TRUE;
1676 Proto->Style = spherical;
1677 Proto->NumSamples = Cluster->SampleCount;
1678 Proto->Cluster = Cluster;
1679 Proto->Cluster->Prototype = TRUE;
1680 return (Proto);
1681 } // NewSimpleProto
1682
1683
1684 /** Independent ***************************************************************
1685 Parameters: ParamDesc descriptions of each feature space dimension
1686 N number of dimensions
1687 CoVariance ptr to a covariance matrix
1688 Independence max off-diagonal correlation coefficient
1689 Globals: None
1690 Operation: This routine returns TRUE if the specified covariance
1691 matrix indicates that all N dimensions are independent of
1692 one another. One dimension is judged to be independent of
1693 another when the magnitude of the corresponding correlation
1694 coefficient is
1695 less than the specified Independence factor. The
1696 correlation coefficient is calculated as: (see Duda and
1697 Hart, pg. 247)
1698 coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])
1699 The covariance matrix is assumed to be symmetric (which
1700 should always be true).
1701 Return: TRUE if dimensions are independent, FALSE otherwise
1702 Exceptions: None
1703 History: 6/4/89, DSJ, Created.
1704 *******************************************************************************/
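/* Note that the quantity actually computed in the loop below is
       sqrt(|CoVariance[i][j]| / sqrt(CoVariance[i][i] * CoVariance[j][j]))
   i.e. the square root of the magnitude of the correlation coefficient
   described above; it is this value that is compared against Independence. */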
1705 BOOL8
1706 Independent (PARAM_DESC ParamDesc[],
1707 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
1708 int i, j;
1709 FLOAT32 *VARii; // points to ith on-diagonal element
1710 FLOAT32 *VARjj; // points to jth on-diagonal element
1711 FLOAT32 CorrelationCoeff;
1712
1713 VARii = CoVariance;
1714 for (i = 0; i < N; i++, VARii += N + 1) {
1715 if (ParamDesc[i].NonEssential)
1716 continue;
1717
1718 VARjj = VARii + N + 1;
1719 CoVariance = VARii + 1;
1720 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1721 if (ParamDesc[j].NonEssential)
1722 continue;
1723
1724 if ((*VARii == 0.0) || (*VARjj == 0.0))
1725 CorrelationCoeff = 0.0;
1726 else
1727 CorrelationCoeff =
1728 sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1729 if (CorrelationCoeff > Independence)
1730 return (FALSE);
1731 }
1732 }
1733 return (TRUE);
1734 } // Independent
1735
1736
1737 /** GetBuckets **************************************************************
1738 Parameters: Distribution type of probability distribution to test for
1739 SampleCount number of samples that are available
1740 Confidence probability of a Type I error
1741 Globals: none
1742 Operation: This routine returns a histogram data structure which can
1743 be used by other routines to place samples into histogram
1744 buckets, and then apply a goodness of fit test to the
1745 histogram data to determine if the samples belong to the
1746 specified probability distribution. The routine keeps
1747 a list of bucket data structures which have already been
1748 created so that it minimizes the computation time needed
1749 to create a new bucket.
1750 Return: Bucket data structure
1751 Exceptions: none
1752 History: Thu Aug 3 12:58:10 1989, DSJ, Created.
1753 *****************************************************************************/
1754 BUCKETS *GetBuckets(DISTRIBUTION Distribution,
1755 uinT32 SampleCount,
1756 FLOAT64 Confidence) {
1757 uinT16 NumberOfBuckets;
1758 BUCKETS *Buckets;
1759
1760 // search for an old bucket structure with the same number of buckets
1761 NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
1762 Buckets = (BUCKETS *) first_node (search (OldBuckets[(int) Distribution],
1763 &NumberOfBuckets, NumBucketsMatch));
1764
1765 // if a matching bucket structure is found, delete it from the list
1766 if (Buckets != NULL) {
1767 OldBuckets[(int) Distribution] =
1768 delete_d (OldBuckets[(int) Distribution], Buckets, ListEntryMatch);
1769 if (SampleCount != Buckets->SampleCount)
1770 AdjustBuckets(Buckets, SampleCount);
1771 if (Confidence != Buckets->Confidence) {
1772 Buckets->Confidence = Confidence;
1773 Buckets->ChiSquared = ComputeChiSquared
1774 (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets),
1775 Confidence);
1776 }
1777 InitBuckets(Buckets);
1778 }
1779 else // otherwise create a new structure
1780 Buckets = MakeBuckets (Distribution, SampleCount, Confidence);
1781 return (Buckets);
1782 } // GetBuckets
1783
1784
1785 /** MakeBuckets *************************************************************
1786 Parameters: Distribution type of probability distribution to test for
1787 SampleCount number of samples that are available
1788 Confidence probability of a Type I error
1789 Globals: None
1790 Operation: This routine creates a histogram data structure which can
1791 be used by other routines to place samples into histogram
1792 buckets, and then apply a goodness of fit test to the
1793 histogram data to determine if the samples belong to the
1794 specified probability distribution. The buckets are
1795 allocated in such a way that the expected frequency of
1796 samples in each bucket is approximately the same. In
1797 order to make this possible, a mapping table is
1798 computed which maps "normalized" samples into the
1799 appropriate bucket.
1800 Return: Pointer to new histogram data structure
1801 Exceptions: None
1802 History: 6/4/89, DSJ, Created.
1803 *****************************************************************************/
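/* Sketch of the mapping built below (as derived from the code): the density of
   the requested distribution is integrated cell by cell over the upper half of
   a table of BUCKETTABLESIZE cells using the trapezoid rule; consecutive cells
   are assigned to the same bucket until that bucket holds roughly
   1/NumberOfBuckets of the total probability, and the result is then mirrored
   onto the lower half since all currently supported distributions are
   symmetric.  Bucket[cell] records the cell-to-bucket map and
   ExpectedCount[bucket] the number of the SampleCount samples expected to land
   in each bucket. */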
1804 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
1805 uinT32 SampleCount,
1806 FLOAT64 Confidence) {
1807 static DENSITYFUNC DensityFunction[] =
1808 { NormalDensity, UniformDensity, UniformDensity };
1809 int i, j;
1810 BUCKETS *Buckets;
1811 FLOAT64 BucketProbability;
1812 FLOAT64 NextBucketBoundary;
1813 FLOAT64 Probability;
1814 FLOAT64 ProbabilityDelta;
1815 FLOAT64 LastProbDensity;
1816 FLOAT64 ProbDensity;
1817 uinT16 CurrentBucket;
1818 BOOL8 Symmetrical;
1819
1820 // allocate memory needed for data structure
1821 Buckets = (BUCKETS *) Emalloc (sizeof (BUCKETS));
1822 Buckets->NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
1823 Buckets->SampleCount = SampleCount;
1824 Buckets->Confidence = Confidence;
1825 Buckets->Count =
1826 (uinT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (uinT32));
1827 Buckets->ExpectedCount =
1828 (FLOAT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (FLOAT32));
1829
1830 // initialize simple fields
1831 Buckets->Distribution = Distribution;
1832 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1833 Buckets->Count[i] = 0;
1834 Buckets->ExpectedCount[i] = 0.0;
1835 }
1836
1837 // all currently defined distributions are symmetrical
1838 Symmetrical = TRUE;
1839 Buckets->ChiSquared = ComputeChiSquared
1840 (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets), Confidence);
1841
1842 if (Symmetrical) {
1843 // allocate buckets so that all have approx. equal probability
1844 BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1845
1846 // distribution is symmetric so fill in upper half then copy
1847 CurrentBucket = Buckets->NumberOfBuckets / 2;
1848 if (Odd (Buckets->NumberOfBuckets))
1849 NextBucketBoundary = BucketProbability / 2;
1850 else
1851 NextBucketBoundary = BucketProbability;
1852
1853 Probability = 0.0;
1854 LastProbDensity =
1855 (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1856 for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1857 ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1858 ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1859 Probability += ProbabilityDelta;
1860 if (Probability > NextBucketBoundary) {
1861 if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1862 CurrentBucket++;
1863 NextBucketBoundary += BucketProbability;
1864 }
1865 Buckets->Bucket[i] = CurrentBucket;
1866 Buckets->ExpectedCount[CurrentBucket] +=
1867 (FLOAT32) (ProbabilityDelta * SampleCount);
1868 LastProbDensity = ProbDensity;
1869 }
1870 // place any leftover probability into the last bucket
1871 Buckets->ExpectedCount[CurrentBucket] +=
1872 (FLOAT32) ((0.5 - Probability) * SampleCount);
1873
1874 // copy upper half of distribution to lower half
1875 for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1876 Buckets->Bucket[i] =
1877 Mirror (Buckets->Bucket[j], Buckets->NumberOfBuckets);
1878
1879 // fold the expected counts of the upper half into the mirrored lower half
1880 for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1881 Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1882 }
1883 return (Buckets);
1884 } // MakeBuckets
1885
1886
1887 //---------------------------------------------------------------------------
1888 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) {
1889 /*
1890 ** Parameters:
1891 ** SampleCount number of samples to be tested
1892 ** Globals:
1893 ** CountTable lookup table for number of samples
1894 ** BucketsTable lookup table for necessary number of buckets
1895 ** Operation:
1896 ** This routine computes the optimum number of histogram
1897 ** buckets that should be used in a chi-squared goodness of
1898 ** fit test for the specified number of samples. The optimum
1899 ** number is computed based on Table 4.1 on pg. 147 of
1900 ** "Measurement and Analysis of Random Data" by Bendat & Piersol.
1901 ** Linear interpolation is used to interpolate between table
1902 ** values. The table is intended for a 0.05 level of
1903 ** significance (alpha). This routine assumes that it is
1904 ** equally valid for other alpha's, which may not be true.
1905 ** Return:
1906 ** Optimum number of histogram buckets
1907 ** Exceptions:
1908 ** None
1909 ** History:
1910 ** 6/5/89, DSJ, Created.
1911 */
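/* Illustration only (hypothetical table entries, not the real CountTable /
   BucketsTable values): if CountTable[Last] = 100, CountTable[Next] = 200,
   BucketsTable[Last] = 10 and BucketsTable[Next] = 20, then a SampleCount of
   150 interpolates to 10 + (20 - 10) / (200 - 100) * (150 - 100) = 15 buckets. */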
1912 uinT8 Last, Next;
1913 FLOAT32 Slope;
1914
1915 if (SampleCount < CountTable[0])
1916 return (BucketsTable[0]);
1917
1918 for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1919 if (SampleCount <= CountTable[Next]) {
1920 Slope = (FLOAT32) (BucketsTable[Next] - BucketsTable[Last]) /
1921 (FLOAT32) (CountTable[Next] - CountTable[Last]);
1922 return ((uinT16) (BucketsTable[Last] +
1923 Slope * (SampleCount - CountTable[Last])));
1924 }
1925 }
1926 return (BucketsTable[Last]);
1927 } // OptimumNumberOfBuckets
1928
1929
1930 //---------------------------------------------------------------------------
1931 FLOAT64
1932 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
1933 /*
1934 ** Parameters:
1935 ** DegreesOfFreedom determines shape of distribution
1936 ** Alpha probability of right tail
1937 ** Globals: none
1938 ** Operation:
1939 ** This routine computes the chi-squared value which will
1940 ** leave a cumulative probability of Alpha in the right tail
1941 ** of a chi-squared distribution with the specified number of
1942 ** degrees of freedom. Alpha must be between 0 and 1.
1943 ** DegreesOfFreedom must be even. The routine maintains an
1944 ** array of lists. Each list corresponds to a different
1945 ** number of degrees of freedom. Each entry in the list
1946 ** corresponds to a different alpha value and its corresponding
1947 ** chi-squared value. Therefore, once a particular chi-squared
1948 ** value is computed, it is stored in the list and never
1949 ** needs to be computed again.
1950 ** Return: Desired chi-squared value
1951 ** Exceptions: none
1952 ** History: 6/5/89, DSJ, Created.
1953 */
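/* For example, a call such as ComputeChiSquared(4, 0.05) should return a value
   close to 9.49, the chi-squared value that leaves a 0.05 right-tail
   probability with 4 degrees of freedom (subject to the CHIACCURACY tolerance
   defined below). */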
1954 #define CHIACCURACY 0.01
1955 #define MINALPHA (1e-200)
1956 {
1957 static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1958
1959 CHISTRUCT *OldChiSquared;
1960 CHISTRUCT SearchKey;
1961
1962 // limit the minimum alpha that can be used - if alpha is too small
1963 // it may not be possible to compute chi-squared.
1964 if (Alpha < MINALPHA)
1965 Alpha = MINALPHA;
1966 if (Alpha > 1.0)
1967 Alpha = 1.0;
1968 if (Odd (DegreesOfFreedom))
1969 DegreesOfFreedom++;
1970
1971 /* find the list of chi-squared values which have already been computed
1972 for the specified number of degrees of freedom. Search the list for
1973 the desired chi-squared. */
1974 SearchKey.Alpha = Alpha;
1975 OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1976 &SearchKey, AlphaMatch));
1977
1978 if (OldChiSquared == NULL) {
1979 OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1980 OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1981 (FLOAT64) DegreesOfFreedom,
1982 (FLOAT64) CHIACCURACY);
1983 ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1984 OldChiSquared);
1985 }
1986 else {
1987 // further optimization might move OldChiSquared to front of list
1988 }
1989
1990 return (OldChiSquared->ChiSquared);
1991
1992 } // ComputeChiSquared
1993
1994
1995 //---------------------------------------------------------------------------
1996 FLOAT64 NormalDensity(inT32 x) {
1997 /*
1998 ** Parameters:
1999 ** x number to compute the normal probability density for
2000 ** Globals:
2001 ** NormalMean mean of a discrete normal distribution
2002 ** NormalVariance variance of a discrete normal distribution
2003 ** NormalMagnitude magnitude of a discrete normal distribution
2004 ** Operation:
2005 ** This routine computes the probability density function
2006 ** of a discrete normal distribution defined by the global
2007 ** variables NormalMean, NormalVariance, and NormalMagnitude.
2008 ** Normal magnitude could, of course, be computed in terms of
2009 ** the normal variance but it is precomputed for efficiency.
2010 ** Return:
2011 ** The value of the normal distribution at x.
2012 ** Exceptions:
2013 ** None
2014 ** History:
2015 ** 6/4/89, DSJ, Created.
2016 */
2017 FLOAT64 Distance;
2018
2019 Distance = x - NormalMean;
2020 return (NormalMagnitude *
2021 exp (-0.5 * Distance * Distance / NormalVariance));
2022 } // NormalDensity
2023
2024
2025 //---------------------------------------------------------------------------
2026 FLOAT64 UniformDensity(inT32 x) {
2027 /*
2028 ** Parameters:
2029 ** x number to compute the uniform probability density for
2030 ** Globals:
2031 ** BUCKETTABLESIZE determines range of distribution
2032 ** Operation:
2033 ** This routine computes the probability density function
2034 ** of a uniform distribution at the specified point. The
2035 ** range of the distribution is from 0 to BUCKETTABLESIZE.
2036 ** Return:
2037 ** The value of the uniform distribution at x.
2038 ** Exceptions:
2039 ** None
2040 ** History:
2041 ** 6/5/89, DSJ, Created.
2042 */
2043 static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
2044
2045 if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
2046 return (UniformDistributionDensity);
2047 else
2048 return ((FLOAT64) 0.0);
2049 } // UniformDensity
2050
2051
2052 //---------------------------------------------------------------------------
2053 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) {
2054 /*
2055 ** Parameters:
2056 ** f1 value of function at x1
2057 ** f2 value of function at x2
2058 ** Dx x2 - x1 (should always be positive)
2059 ** Globals:
2060 ** None
2061 ** Operation:
2062 ** This routine computes a trapezoidal approximation to the
2063 ** integral of a function over a small delta in x.
2064 ** Return:
2065 ** Approximation of the integral of the function from x1 to x2.
2066 ** Exceptions:
2067 ** None
2068 ** History:
2069 ** 6/5/89, DSJ, Created.
2070 */
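/* e.g. f1 = 0.2, f2 = 0.4, Dx = 1.0 gives (0.2 + 0.4) * 1.0 / 2.0 = 0.3 */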
2071 return ((f1 + f2) * Dx / 2.0);
2072 } // Integral
2073
2074
2075 //---------------------------------------------------------------------------
2076 void FillBuckets(BUCKETS *Buckets,
2077 CLUSTER *Cluster,
2078 uinT16 Dim,
2079 PARAM_DESC *ParamDesc,
2080 FLOAT32 Mean,
2081 FLOAT32 StdDev) {
2082 /*
2083 ** Parameters:
2084 ** Buckets histogram buckets to count samples
2085 ** Cluster cluster whose samples are being analyzed
2086 ** Dim dimension of samples which is being analyzed
2087 ** ParamDesc description of the dimension
2088 ** Mean "mean" of the distribution
2089 ** StdDev "standard deviation" of the distribution
2090 ** Globals:
2091 ** None
2092 ** Operation:
2093 ** This routine counts the number of cluster samples which
2094 ** fall within the various histogram buckets in Buckets. Only
2095 ** one dimension of each sample is examined. The exact meaning
2096 ** of the Mean and StdDev parameters depends on the
2097 ** distribution which is being analyzed (this info is in the
2098 ** Buckets data structure). For normal distributions, Mean
2099 ** and StdDev have the expected meanings. For uniform and
2100 ** random distributions the Mean is the center point of the
2101 ** range and the StdDev is 1/2 the range. A dimension with
2102 ** zero standard deviation cannot be statistically analyzed.
2103 ** In this case, a pseudo-analysis is used.
2104 ** Return:
2105 ** None (the Buckets data structure is filled in)
2106 ** Exceptions:
2107 ** None
2108 ** History:
2109 ** 6/5/89, DSJ, Created.
2110 */
2111 uinT16 BucketID;
2112 int i;
2113 LIST SearchState;
2114 SAMPLE *Sample;
2115
2116 // initialize the histogram bucket counts to 0
2117 for (i = 0; i < Buckets->NumberOfBuckets; i++)
2118 Buckets->Count[i] = 0;
2119
2120 if (StdDev == 0.0) {
2121 /* if the standard deviation is zero, then we can't statistically
2122 analyze the cluster. Use a pseudo-analysis: samples exactly on
2123 the mean are distributed evenly across all buckets. Samples greater
2124 than the mean are placed in the last bucket; samples less than the
2125 mean are placed in the first bucket. */
2126
2127 InitSampleSearch(SearchState, Cluster);
2128 i = 0;
2129 while ((Sample = NextSample (&SearchState)) != NULL) {
2130 if (Sample->Mean[Dim] > Mean)
2131 BucketID = Buckets->NumberOfBuckets - 1;
2132 else if (Sample->Mean[Dim] < Mean)
2133 BucketID = 0;
2134 else
2135 BucketID = i;
2136 Buckets->Count[BucketID] += 1;
2137 i++;
2138 if (i >= Buckets->NumberOfBuckets)
2139 i = 0;
2140 }
2141 }
2142 else {
2143 // search for all samples in the cluster and add to histogram buckets
2144 InitSampleSearch(SearchState, Cluster);
2145 while ((Sample = NextSample (&SearchState)) != NULL) {
2146 switch (Buckets->Distribution) {
2147 case normal:
2148 BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2149 Mean, StdDev);
2150 break;
2151 case D_random:
2152 case uniform:
2153 BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2154 Mean, StdDev);
2155 break;
2156 default:
2157 BucketID = 0;
2158 }
2159 Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2160 }
2161 }
2162 } // FillBuckets
2163
2164
2165 //---------------------------------------------------------------------------
2166 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
2167 FLOAT32 x,
2168 FLOAT32 Mean,
2169 FLOAT32 StdDev) {
2170 /*
2171 ** Parameters:
2172 ** ParamDesc used to identify circular dimensions
2173 ** x value to be normalized
2174 ** Mean mean of normal distribution
2175 ** StdDev standard deviation of normal distribution
2176 ** Globals:
2177 ** NormalMean mean of discrete normal distribution
2178 ** NormalStdDev standard deviation of discrete normal dist.
2179 ** BUCKETTABLESIZE range of the discrete distribution
2180 ** Operation:
2181 ** This routine determines which bucket x falls into in the
2182 ** discrete normal distribution defined by NormalMean
2183 ** and NormalStdDev. x values which exceed the range of
2184 ** the discrete distribution are clipped.
2185 ** Return:
2186 ** Bucket number into which x falls
2187 ** Exceptions:
2188 ** None
2189 ** History:
2190 ** 6/5/89, DSJ, Created.
2191 */
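/* The mapping below rescales the z-score of x onto the discrete distribution:
       X = ((x - Mean) / StdDev) * NormalStdDev + NormalMean
   so a sample exactly at Mean lands at the center cell NormalMean, and a
   sample one standard deviation above Mean lands NormalStdDev cells higher. */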
2192 FLOAT32 X;
2193
2194 // wraparound circular parameters if necessary
2195 if (ParamDesc->Circular) {
2196 if (x - Mean > ParamDesc->HalfRange)
2197 x -= ParamDesc->Range;
2198 else if (x - Mean < -ParamDesc->HalfRange)
2199 x += ParamDesc->Range;
2200 }
2201
2202 X = ((x - Mean) / StdDev) * NormalStdDev + NormalMean;
2203 if (X < 0)
2204 return ((uinT16) 0);
2205 if (X > BUCKETTABLESIZE - 1)
2206 return ((uinT16) (BUCKETTABLESIZE - 1));
2207 return ((uinT16) floor ((FLOAT64) X));
2208 } // NormalBucket
2209
2210
2211 //---------------------------------------------------------------------------
2212 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
2213 FLOAT32 x,
2214 FLOAT32 Mean,
2215 FLOAT32 StdDev) {
2216 /*
2217 ** Parameters:
2218 ** ParamDesc used to identify circular dimensions
2219 ** x value to be normalized
2220 ** Mean center of range of uniform distribution
2221 ** StdDev 1/2 the range of the uniform distribution
2222 ** Globals:
2223 ** BUCKETTABLESIZE range of the discrete distribution
2224 ** Operation:
2225 ** This routine determines which bucket x falls into in the
2226 ** discrete uniform distribution defined by
2227 ** BUCKETTABLESIZE. x values which exceed the range of
2228 ** the discrete distribution are clipped.
2229 ** Return:
2230 ** Bucket number into which x falls
2231 ** Exceptions:
2232 ** None
2233 ** History:
2234 ** 6/5/89, DSJ, Created.
2235 */
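/* The mapping below sends Mean to the center cell and Mean +/- StdDev (the
   ends of the range, since StdDev is half the range here) to the ends of the
   table:
       X = ((x - Mean) / (2 * StdDev)) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2 */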
2236 FLOAT32 X;
2237
2238 // wraparound circular parameters if necessary
2239 if (ParamDesc->Circular) {
2240 if (x - Mean > ParamDesc->HalfRange)
2241 x -= ParamDesc->Range;
2242 else if (x - Mean < -ParamDesc->HalfRange)
2243 x += ParamDesc->Range;
2244 }
2245
2246 X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2247 if (X < 0)
2248 return ((uinT16) 0);
2249 if (X > BUCKETTABLESIZE - 1)
2250 return ((uinT16) (BUCKETTABLESIZE - 1));
2251 return ((uinT16) floor ((FLOAT64) X));
2252 } // UniformBucket
2253
2254
2255 //---------------------------------------------------------------------------
2256 BOOL8 DistributionOK(BUCKETS *Buckets) {
2257 /*
2258 ** Parameters:
2259 ** Buckets histogram data to perform chi-square test on
2260 ** Globals:
2261 ** None
2262 ** Operation:
2263 ** This routine performs a chi-square goodness of fit test
2264 ** on the histogram data in the Buckets data structure. TRUE
2265 ** is returned if the histogram matches the probability
2266 ** distribution which was specified when the Buckets
2267 ** structure was originally created. Otherwise FALSE is
2268 ** returned.
2269 ** Return:
2270 ** TRUE if samples match distribution, FALSE otherwise
2271 ** Exceptions:
2272 ** None
2273 ** History:
2274 ** 6/5/89, DSJ, Created.
2275 */
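/* The statistic accumulated below is the usual chi-square goodness-of-fit sum,
       X^2 = sum over buckets of (Observed - Expected)^2 / Expected,
   which is then compared against the precomputed critical value
   Buckets->ChiSquared. */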
2276 FLOAT32 FrequencyDifference;
2277 FLOAT32 TotalDifference;
2278 int i;
2279
2280 // compute how well the histogram matches the expected histogram
2281 TotalDifference = 0.0;
2282 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2283 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2284 TotalDifference += (FrequencyDifference * FrequencyDifference) /
2285 Buckets->ExpectedCount[i];
2286 }
2287
2288 // test to see if the difference is more than expected
2289 if (TotalDifference > Buckets->ChiSquared)
2290 return (FALSE);
2291 else
2292 return (TRUE);
2293 } // DistributionOK
2294
2295
2296 //---------------------------------------------------------------------------
2297 void FreeStatistics(STATISTICS *Statistics) {
2298 /*
2299 ** Parameters:
2300 ** Statistics pointer to data structure to be freed
2301 ** Globals:
2302 ** None
2303 ** Operation:
2304 ** This routine frees the memory used by the statistics
2305 ** data structure.
2306 ** Return:
2307 ** None
2308 ** Exceptions:
2309 ** None
2310 ** History:
2311 ** 6/5/89, DSJ, Created.
2312 */
2313 memfree (Statistics->CoVariance);
2314 memfree (Statistics->Min);
2315 memfree (Statistics->Max);
2316 memfree(Statistics);
2317 } // FreeStatistics
2318
2319
2320 //---------------------------------------------------------------------------
2321 void FreeBuckets(BUCKETS *Buckets) {
2322 /*
2323 ** Parameters:
2324 ** Buckets pointer to data structure to be freed
2325 ** Globals: none
2326 ** Operation:
2327 ** This routine places the specified histogram data structure
2328 ** at the front of a list of histograms so that it can be
2329 ** reused later if necessary. A separate list is maintained
2330 ** for each different type of distribution.
2331 ** Return: none
2332 ** Exceptions: none
2333 ** History: 6/5/89, DSJ, Created.
2334 */
2335 int Dist;
2336
2337 if (Buckets != NULL) {
2338 Dist = (int) Buckets->Distribution;
2339 OldBuckets[Dist] = (LIST) push (OldBuckets[Dist], Buckets);
2340 }
2341
2342 } // FreeBuckets
2343
2344
2345 //---------------------------------------------------------------------------
2346 void FreeCluster(CLUSTER *Cluster) {
2347 /*
2348 ** Parameters:
2349 ** Cluster pointer to cluster to be freed
2350 ** Globals:
2351 ** None
2352 ** Operation:
2353 ** This routine frees the memory consumed by the specified
2354 ** cluster and all of its subclusters. This is done by
2355 ** recursive calls to FreeCluster().
2356 ** Return:
2357 ** None
2358 ** Exceptions:
2359 ** None
2360 ** History:
2361 ** 6/6/89, DSJ, Created.
2362 */
2363 if (Cluster != NULL) {
2364 FreeCluster (Cluster->Left);
2365 FreeCluster (Cluster->Right);
2366 memfree(Cluster);
2367 }
2368 } // FreeCluster
2369
2370
2371 //---------------------------------------------------------------------------
2372 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
2373 /*
2374 ** Parameters:
2375 ** Distribution distribution being tested for
2376 ** HistogramBuckets number of buckets in chi-square test
2377 ** Globals: none
2378 ** Operation:
2379 ** This routine computes the degrees of freedom that should
2380 ** be used in a chi-squared test with the specified number of
2381 ** histogram buckets. The result is always rounded up to
2382 ** the next even number so that the value of chi-squared can be
2383 ** computed more easily. This will cause the value of
2384 ** chi-squared to be higher than the optimum value, resulting
2385 ** in the chi-square test being more lenient than optimum.
2386 ** Return: The number of degrees of freedom for a chi-square test
2387 ** Exceptions: none
2388 ** History: Thu Aug 3 14:04:18 1989, DSJ, Created.
2389 */
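/* The offsets below appear to follow the usual rule of thumb that the degrees
   of freedom for a chi-square goodness-of-fit test are roughly
       (number of buckets) - 1 - (number of parameters estimated from the data),
   with the result then rounded up to an even number as described above. */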
2390 static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2391
2392 uinT16 AdjustedNumBuckets;
2393
2394 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2395 if (Odd (AdjustedNumBuckets))
2396 AdjustedNumBuckets++;
2397 return (AdjustedNumBuckets);
2398
2399 } // DegreesOfFreedom
2400
2401
2402 //---------------------------------------------------------------------------
2403 int NumBucketsMatch(void *arg1, //BUCKETS *Histogram,
2404 void *arg2) { //uinT16 *DesiredNumberOfBuckets)
2405 /*
2406 ** Parameters:
2407 ** Histogram current histogram being tested for a match
2408 ** DesiredNumberOfBuckets match key
2409 ** Globals: none
2410 ** Operation:
2411 ** This routine is used to search a list of histogram data
2412 ** structures to find one with the specified number of
2413 ** buckets. It is called by the list search routines.
2414 ** Return: TRUE if Histogram matches DesiredNumberOfBuckets
2415 ** Exceptions: none
2416 ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2417 */
2418 BUCKETS *Histogram = (BUCKETS *) arg1;
2419 uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2420
2421 return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2422
2423 } // NumBucketsMatch
2424
2425
2426 //---------------------------------------------------------------------------
2427 int ListEntryMatch(void *arg1, //ListNode
2428 void *arg2) { //Key
2429 /*
2430 ** Parameters: none
2431 ** Globals: none
2432 ** Operation:
2433 ** This routine is used to search a list for a list node
2434 ** whose contents match Key. It is called by the list
2435 ** delete_d routine.
2436 ** Return: TRUE if ListNode matches Key
2437 ** Exceptions: none
2438 ** History: Thu Aug 3 14:23:58 1989, DSJ, Created.
2439 */
2440 return (arg1 == arg2);
2441
2442 } // ListEntryMatch
2443
2444
2445 //---------------------------------------------------------------------------
2446 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
2447 /*
2448 ** Parameters:
2449 ** Buckets histogram data structure to adjust
2450 ** NewSampleCount new sample count to adjust to
2451 ** Globals: none
2452 ** Operation:
2453 ** This routine multiplies each ExpectedCount histogram entry
2454 ** by NewSampleCount/OldSampleCount so that the histogram
2455 ** is now adjusted to the new sample count.
2456 ** Return: none
2457 ** Exceptions: none
2458 ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2459 */
2460 int i;
2461 FLOAT64 AdjustFactor;
2462
2463 AdjustFactor = (((FLOAT64) NewSampleCount) /
2464 ((FLOAT64) Buckets->SampleCount));
2465
2466 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2467 Buckets->ExpectedCount[i] *= AdjustFactor;
2468 }
2469
2470 Buckets->SampleCount = NewSampleCount;
2471
2472 } // AdjustBuckets
2473
2474
2475 //---------------------------------------------------------------------------
2476 void InitBuckets(BUCKETS *Buckets) {
2477 /*
2478 ** Parameters:
2479 ** Buckets histogram data structure to init
2480 ** Globals: none
2481 ** Operation:
2482 ** This routine sets the bucket counts in the specified histogram
2483 ** to zero.
2484 ** Return: none
2485 ** Exceptions: none
2486 ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2487 */
2488 int i;
2489
2490 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2491 Buckets->Count[i] = 0;
2492 }
2493
2494 } // InitBuckets
2495
2496
2497 //---------------------------------------------------------------------------
2498 int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
2499 void *arg2) { //CHISTRUCT *SearchKey)
2500 /*
2501 ** Parameters:
2502 ** ChiStruct chi-squared struct being tested for a match
2503 ** SearchKey chi-squared struct that is the search key
2504 ** Globals: none
2505 ** Operation:
2506 ** This routine is used to search a list of structures which
2507 ** hold pre-computed chi-squared values for a chi-squared
2508 ** value whose corresponding alpha field matches the alpha
2509 ** field of SearchKey.
2510 ** It is called by the list search routines.
2511 ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
2512 ** Exceptions: none
2513 ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2514 */
2515 CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2516 CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2517
2518 return (ChiStruct->Alpha == SearchKey->Alpha);
2519
2520 } // AlphaMatch
2521
2522
2523 //---------------------------------------------------------------------------
2524 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
2525 /*
2526 ** Parameters:
2527 ** DegreesOfFreedom degrees of freedom for new chi value
2528 ** Alpha confidence level for new chi value
2529 ** Globals: none
2530 ** Operation:
2531 ** This routine allocates a new data structure which is used
2532 ** to hold a chi-squared value along with its associated
2533 ** number of degrees of freedom and alpha value.
2534 ** Return: Pointer to the new chi-squared data structure
2535 ** Exceptions: none
2536 ** History: Fri Aug 4 11:04:59 1989, DSJ, Created.
2537 */
2538 CHISTRUCT *NewChiStruct;
2539
2540 NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2541 NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2542 NewChiStruct->Alpha = Alpha;
2543 return (NewChiStruct);
2544
2545 } // NewChiStruct
2546
2547
2548 //---------------------------------------------------------------------------
2549 FLOAT64
2550 Solve (SOLVEFUNC Function,
2551 void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
2552 /*
2553 ** Parameters:
2554 ** Function function whose zero is to be found
2555 ** FunctionParams arbitrary data to pass to function
2556 ** InitialGuess point to start solution search at
2557 ** Accuracy maximum allowed error
2558 ** Globals: none
2559 ** Operation:
2560 ** This routine attempts to find an x value at which Function
2561 ** goes to zero (i.e. a root of the function ). It will only
2562 ** work correctly if a solution actually exists and there
2563 ** are no extrema between the solution and the InitialGuess.
2564 ** The algorithms used are extremely primitive.
2565 ** Return: Solution of function ( x for which f(x) = 0 ).
2566 ** Exceptions: none
2567 ** History: Fri Aug 4 11:08:59 1989, DSJ, Created.
2568 */
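/* The iteration below is essentially Newton's method with a finite-difference
   slope estimate:
       slope  ~= (f(x + Delta) - f(x)) / Delta
       x_next  = x - f(x) / slope
   with Delta shrunk on each step and the bracket [LastNegX, LastPosX] used as
   the stopping criterion. */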
2569 #define INITIALDELTA 0.1
2570 #define DELTARATIO 0.1
2571 {
2572 FLOAT64 x;
2573 FLOAT64 f;
2574 FLOAT64 Slope;
2575 FLOAT64 Delta;
2576 FLOAT64 NewDelta;
2577 FLOAT64 xDelta;
2578 FLOAT64 LastPosX, LastNegX;
2579
2580 x = InitialGuess;
2581 Delta = INITIALDELTA;
2582 LastPosX = MAX_FLOAT32;
2583 LastNegX = -MAX_FLOAT32;
2584 f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2585 while (Abs (LastPosX - LastNegX) > Accuracy) {
2586 // keep track of outer bounds of current estimate
2587 if (f < 0)
2588 LastNegX = x;
2589 else
2590 LastPosX = x;
2591
2592 // compute the approx. slope of f(x) at the current point
2593 Slope =
2594 ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2595
2596 // compute the next solution guess
2597 xDelta = f / Slope;
2598 x -= xDelta;
2599
2600 // reduce the delta used for computing slope to be a fraction of
2601 // the amount moved to get to the new guess
2602 NewDelta = Abs (xDelta) * DELTARATIO;
2603 if (NewDelta < Delta)
2604 Delta = NewDelta;
2605
2606 // compute the value of the function at the new guess
2607 f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2608 }
2609 return (x);
2610
2611 } // Solve
2612
2613
2614 //---------------------------------------------------------------------------
2615 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) {
2616 /*
2617 ** Parameters:
2618 ** ChiParams contains degrees of freedom and alpha
2619 ** x value of chi-squared to evaluate
2620 ** Globals: none
2621 ** Operation:
2622 ** This routine computes the area under a chi density curve
2623 ** from 0 to x, minus the desired area under the curve. The
2624 ** number of degrees of freedom of the chi curve is specified
2625 ** in the ChiParams structure. The desired area is also
2626 ** specified in the ChiParams structure as Alpha ( or 1 minus
2627 ** the desired area ). This routine is intended to be passed
2628 ** to the Solve() function to find the value of chi-squared
2629 ** which will yield a desired area under the right tail of
2630 ** the chi density curve. The function will only work for
2631 ** even degrees of freedom. The equations are based on
2632 ** integrating the chi density curve in parts to obtain
2633 ** a series that can be used to compute the area under the
2634 ** curve.
2635 ** Return: Error between actual and desired area under the chi curve.
2636 ** Exceptions: none
2637 ** History: Fri Aug 4 12:48:41 1989, DSJ, Created.
2638 */
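/* For even degrees of freedom 2 * (N + 1), the right-tail probability of the
   chi-squared distribution has the closed form
       P(X > x) = exp(-x / 2) * sum for i = 0..N of (x / 2)^i / i!
   The loop below builds that sum (each term x^i / (2^i * i!) equals
   (x/2)^i / i!), so the return value is this tail probability minus the
   desired tail area Alpha. */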
2639 int i, N;
2640 FLOAT64 SeriesTotal;
2641 FLOAT64 Denominator;
2642 FLOAT64 PowerOfx;
2643
2644 N = ChiParams->DegreesOfFreedom / 2 - 1;
2645 SeriesTotal = 1;
2646 Denominator = 1;
2647 PowerOfx = 1;
2648 for (i = 1; i <= N; i++) {
2649 Denominator *= 2 * i;
2650 PowerOfx *= x;
2651 SeriesTotal += PowerOfx / Denominator;
2652 }
2653 return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2654
2655 } // ChiArea
2656
2657
2658 //---------------------------------------------------------------------------
2659 BOOL8
2660 MultipleCharSamples (CLUSTERER * Clusterer,
2661 CLUSTER * Cluster, FLOAT32 MaxIllegal)
2662 /*
2663 ** Parameters:
2664 ** Clusterer data structure holding cluster tree
2665 ** Cluster cluster containing samples to be tested
2666 ** MaxIllegal max percentage of samples allowed to have
2667 ** more than 1 feature in the cluster
2668 ** Globals: none
2669 ** Operation:
2670 ** This routine looks at all samples in the specified cluster.
2671 ** It computes a running estimate of the percentage of the
2672 ** characters which have more than 1 sample in the cluster.
2673 ** When this percentage exceeds MaxIllegal, TRUE is returned.
2674 ** Otherwise FALSE is returned. The CharID
2675 ** fields must contain integers which identify the training
2676 ** characters which were used to generate the sample. One
2677 ** integer is used for each sample. The NumChar field in
2678 ** the Clusterer must contain the number of characters in the
2679 ** training set. All CharID fields must be between 0 and
2680 ** NumChar-1. The main function of this routine is to help
2681 ** identify clusters which need to be split further, i.e. if
2682 ** numerous training characters have 2 or more features which are
2683 ** contained in the same cluster, then the cluster should be
2684 ** split.
2685 ** Return: TRUE if the cluster should be split, FALSE otherwise.
2686 ** Exceptions: none
2687 ** History: Wed Aug 30 11:13:05 1989, DSJ, Created.
2688 ** 2/22/90, DSJ, Added MaxIllegal control rather than always
2689 ** splitting illegal clusters.
2690 */
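/* As the loop below walks the samples, NumCharInCluster is decremented for
   every repeated CharID, so
       PercentIllegal = NumIllegalInCluster / NumCharInCluster
   is a running estimate of the fraction of distinct training characters that
   contribute more than one sample to this cluster. */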
2691 #define ILLEGAL_CHAR 2
2692 {
2693 static BOOL8 *CharFlags = NULL;
2694 static inT32 NumFlags = 0;
2695 int i;
2696 LIST SearchState;
2697 SAMPLE *Sample;
2698 inT32 CharID;
2699 inT32 NumCharInCluster;
2700 inT32 NumIllegalInCluster;
2701 FLOAT32 PercentIllegal;
2702
2703 // initial estimate assumes that no illegal chars exist in the cluster
2704 NumCharInCluster = Cluster->SampleCount;
2705 NumIllegalInCluster = 0;
2706
2707 if (Clusterer->NumChar > NumFlags) {
2708 if (CharFlags != NULL)
2709 memfree(CharFlags);
2710 NumFlags = Clusterer->NumChar;
2711 CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2712 }
2713
2714 for (i = 0; i < NumFlags; i++)
2715 CharFlags[i] = FALSE;
2716
2717 // find each sample in the cluster and check if we have seen it before
2718 InitSampleSearch(SearchState, Cluster);
2719 while ((Sample = NextSample (&SearchState)) != NULL) {
2720 CharID = Sample->CharID;
2721 if (CharFlags[CharID] == FALSE) {
2722 CharFlags[CharID] = TRUE;
2723 }
2724 else {
2725 if (CharFlags[CharID] == TRUE) {
2726 NumIllegalInCluster++;
2727 CharFlags[CharID] = ILLEGAL_CHAR;
2728 }
2729 NumCharInCluster--;
2730 PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2731 if (PercentIllegal > MaxIllegal)
2732 return (TRUE);
2733 }
2734 }
2735 return (FALSE);
2736
2737 } // MultipleCharSamples
2738
2739 // Compute the inverse of a matrix using LU decomposition with partial pivoting.
2740 // The return value is the sum of norms of the off-diagonal terms of the
2741 // product of a and inv. (A measure of the error.)
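// Note on the variable names used below: the matrix called L accumulates the
// row exchanges and elimination steps applied to the copy of the input held in
// U, so that L * input == U after the elimination phase.  The inverse is then
// recovered as inv = U_inv * L, since input^-1 == U^-1 * L.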
2742 double InvertMatrix(const float* input, int size, float* inv) {
2743 double** U; // The upper triangular array.
2744 double* Umem;
2745 double** U_inv; // The inverse of U.
2746 double* U_invmem;
2747 double** L; // The lower triangular array.
2748 double* Lmem;
2749
2750 // Allocate memory for the 2D arrays.
2751 ALLOC_2D_ARRAY(size, size, Umem, U, double);
2752 ALLOC_2D_ARRAY(size, size, U_invmem, U_inv, double);
2753 ALLOC_2D_ARRAY(size, size, Lmem, L, double);
2754
2755 // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2756 int row;
2757 int col;
2758 for (row = 0; row < size; row++) {
2759 for (col = 0; col < size; col++) {
2760 U[row][col] = input[row*size + col];
2761 L[row][col] = row == col ? 1.0 : 0.0;
2762 U_inv[row][col] = 0.0;
2763 }
2764 }
2765
2766 // Compute forward matrix by inversion by LU decomposition of input.
2767 for (col = 0; col < size; ++col) {
2768 // Find best pivot
2769 int best_row = 0;
2770 double best_pivot = -1.0;
2771 for (row = col; row < size; ++row) {
2772 if (Abs(U[row][col]) > best_pivot) {
2773 best_pivot = Abs(U[row][col]);
2774 best_row = row;
2775 }
2776 }
2777 // Exchange pivot rows.
2778 if (best_row != col) {
2779 for (int k = 0; k < size; ++k) {
2780 double tmp = U[best_row][k];
2781 U[best_row][k] = U[col][k];
2782 U[col][k] = tmp;
2783 tmp = L[best_row][k];
2784 L[best_row][k] = L[col][k];
2785 L[col][k] = tmp;
2786 }
2787 }
2788 // Now do the pivot itself.
2789 for (row = col + 1; row < size; ++row) {
2790 double ratio = -U[row][col] / U[col][col];
2791 for (int j = col; j < size; ++j) {
2792 U[row][j] += U[col][j] * ratio;
2793 }
2794 for (int k = 0; k < size; ++k) {
2795 L[row][k] += L[col][k] * ratio;
2796 }
2797 }
2798 }
2799 // Next invert U.
2800 for (col = 0; col < size; ++col) {
2801 U_inv[col][col] = 1.0 / U[col][col];
2802 for (row = col - 1; row >= 0; --row) {
2803 double total = 0.0;
2804 for (int k = col; k > row; --k) {
2805 total += U[row][k] * U_inv[k][col];
2806 }
2807 U_inv[row][col] = -total / U[row][row];
2808 }
2809 }
2810 // Now the answer is U_inv.L.
2811 for (row = 0; row < size; row++) {
2812 for (col = 0; col < size; col++) {
2813 double sum = 0.0;
2814 for (int k = row; k < size; ++k) {
2815 sum += U_inv[row][k] * L[k][col];
2816 }
2817 inv[row*size + col] = sum;
2818 }
2819 }
2820 // Check matrix product.
2821 double error_sum = 0.0;
2822 for (row = 0; row < size; row++) {
2823 for (col = 0; col < size; col++) {
2824 double sum = 0.0;
2825 for (int k = 0; k < size; ++k) {
2826 sum += input[row*size + k] * inv[k *size + col];
2827 }
2828 if (row != col) {
2829 error_sum += Abs(sum);
2830 }
2831 }
2832 }
2833 return error_sum;
2834 }
2835