1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 - This software is distributed in the hope that it will be
4 - useful, but with NO WARRANTY OF ANY KIND.
5 - No author or distributor accepts responsibility to anyone for the
6 - consequences of using this software, or for whether it serves any
7 - particular purpose or works at all, unless he or she says so in
8 - writing. Everyone is granted permission to copy, modify and
9 - redistribute this source code, for commercial or non-commercial
10 - purposes, with the following restrictions: (1) the origin of this
11 - source code must not be misrepresented; (2) modified versions must
12 - be plainly marked as such; and (3) this notice may not be removed
13 - or altered from any source or modified source distribution.
14 *====================================================================*/
15
16 /*
17 * numafunc2.c
18 *
19 * Transformations
20 * NUMA *numaTransform()
21 * NUMA *numaConvolve()
22 * NUMA *numaConvertToInt()
23 *
24 * Histogram generation and statistics
25 * NUMA *numaMakeHistogram()
26 * NUMA *numaMakeHistogramAuto()
27 * NUMA *numaMakeHistogramClipped()
28 * NUMA *numaRebinHistogram()
29 * NUMA *numaNormalizeHistogram()
30 * l_int32 numaGetStatsUsingHistogram()
31 * l_int32 numaGetHistogramStats()
32 * l_int32 numaGetHistogramStatsOnInterval()
33 * l_int32 numaMakeRankFromHistogram()
34 * l_int32 numaHistogramGetRankFromVal()
35 * l_int32 numaHistogramGetValFromRank()
36 *
37 * Splitting a distribution
38 * l_int32 numaSplitDistribution()
39 *
40 * Extrema finding
41 * NUMA *numaFindPeaks()
42 * NUMA *numaFindExtrema()
43 *
44 * Threshold crossings and frequency analysis
45 * l_int32 numaSelectCrossingThreshold()
46 * NUMA *numaCrossingsByThreshold()
47 * NUMA *numaCrossingsByPeaks()
48 * NUMA *numaEvalBestHaarParameters()
49 * l_int32 numaEvalHaarSum()
50 *
51 * Things to remember when using the Numa:
52 *
53 * (1) The numa is a struct, not an array. Always use accessors
54 * (see numabasic.c), never the fields directly.
55 *
56 * (2) The number array holds l_float32 values. It can also
57 * be used to store l_int32 values. See numabasic.c for
58 * details on using the accessors.
59 *
60 * (3) Occasionally, in the comments we denote the i-th element of a
61 * numa by na[i]. This is conceptual only -- the numa is not an array!
62 *
63 * Some general comments on histograms:
64 *
65 * (1) Histograms are the generic statistical representation of
66 * the data about some attribute. Typically they're not
67 * normalized -- they simply give the number of occurrences
68 * within each range of values of the attribute. This range
69 * of values is referred to as a 'bucket'. For example,
70 * the histogram could specify how many connected components
71 * are found for each value of their width; in that case,
72 * the bucket size is 1.
73 *
74 * (2) In leptonica, all buckets have the same size. Histograms
75 * are therefore specified by a numa of occurrences, along
76 * with two other numbers: the 'value' associated with the
77 * occupants of the first bucket and the size (i.e., 'width')
78 * of each bucket. These two numbers then allow us to calculate
79 * the value associated with the occupants of each bucket.
80 * These numbers are fields in the numa, initialized to
81 * a startx value of 0.0 and a binsize of 1.0. Accessors for
82 * these fields are functions numa*XParameters(). All histograms
83 * must have these two numbers properly set.
84 */
85
86 #include <stdio.h>
87 #include <string.h>
88 #include <stdlib.h>
89 #include <math.h>
90 #include "allheaders.h"
91
92 /* bin sizes in numaMakeHistogram() */
93 static const l_int32 BinSizeArray[] = {2, 5, 10, 20, 50, 100, 200, 500, 1000,\
94 2000, 5000, 10000, 20000, 50000, 100000, 200000,\
95 500000, 1000000, 2000000, 5000000, 10000000,\
96 200000000, 50000000, 100000000};
97 static const l_int32 NBinSizes = 24;
98
99
100 #ifndef NO_CONSOLE_IO
101 #define DEBUG_HISTO 0
102 #define DEBUG_CROSSINGS 0
103 #define DEBUG_FREQUENCY 0
104 #endif /* ~NO_CONSOLE_IO */
105
106
107 /*----------------------------------------------------------------------*
108 * Transformations *
109 *----------------------------------------------------------------------*/
110 /*!
111 * numaTransform()
112 *
113 * Input: nas
114 * shift (add this to each number)
115 * scale (multiply each number by this)
116 * Return: na with all values shifted and scaled, or null on error
117 *
118 * Notes:
119 * (1) Each number is shifted before scaling.
120 * (2) The operation sequence is opposite to that for Box and Pta:
121 * scale first, then shift.
122 */
123 NUMA *
numaTransform(NUMA * nas,l_float32 shift,l_float32 scale)124 numaTransform(NUMA *nas,
125 l_float32 shift,
126 l_float32 scale)
127 {
128 l_int32 i, n;
129 l_float32 val;
130 NUMA *nad;
131
132 PROCNAME("numaTransform");
133
134 if (!nas)
135 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
136 n = numaGetCount(nas);
137 if ((nad = numaCreate(n)) == NULL)
138 return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
139 for (i = 0; i < n; i++) {
140 numaGetFValue(nas, i, &val);
141 val = scale * val + shift;
142 numaAddNumber(nad, val);
143 }
144 return nad;
145 }
146
147
148 /*!
149 * numaConvolve()
150 *
151 * Input: na
152 * halfwidth (of rectangular filter, minus the center)
153 * Return: na (after low-pass filtering), or null on error
154 *
155 * Notes:
156 * (1) Full convolution takes place only from i = halfwidth to
157 * i = n - halfwidth - 1. We do the end parts using only
158 * the partial array available. We do not pad the ends with 0.
159 * (2) This implementation assumes specific fields in the Numa!
160 */
161 NUMA *
numaConvolve(NUMA * na,l_int32 halfwidth)162 numaConvolve(NUMA *na,
163 l_int32 halfwidth)
164 {
165 l_int32 i, n, rval;
166 l_float32 sum, norm;
167 l_float32 *array, *carray, *sumarray;
168 NUMA *nac;
169
170 PROCNAME("numaConvolve");
171
172 if (!na)
173 return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
174 n = numaGetCount(na);
175 if (2 * halfwidth + 1 > n)
176 L_WARNING("filter wider than input array!", procName);
177 array = na->array;
178
179 if ((nac = numaCreate(n)) == NULL)
180 return (NUMA *)ERROR_PTR("nac not made", procName, NULL);
181 carray = nac->array;
182 nac->n = n; /* fill with zeroes */
183
184 /* Make sum array; note the indexing */
185 if ((sumarray = (l_float32 *)CALLOC(n + 1, sizeof(l_float32))) == NULL)
186 return (NUMA *)ERROR_PTR("sumarray not made", procName, NULL);
187 sum = 0.0;
188 sumarray[0] = 0.0;
189 for (i = 0; i < n; i++) {
190 sum += array[i];
191 sumarray[i + 1] = sum;
192 }
193
194 /* Central part */
195 norm = 1. / (2 * halfwidth + 1);
196 rval = n - halfwidth;
197 for (i = halfwidth; i < rval; ++i)
198 carray[i] = norm *
199 (sumarray[i + halfwidth + 1] - sumarray[i - halfwidth]);
200
201 /* Left side */
202 for (i = 0; i < halfwidth; i++)
203 carray[i] = sumarray[i + halfwidth + 1] / (halfwidth + i + 1);
204
205 /* Right side */
206 for (i = rval; i < n; i++)
207 carray[i] = (1. / (n - i + halfwidth)) *
208 (sumarray[n] - sumarray[i - halfwidth]);
209
210 FREE(sumarray);
211 return nac;
212 }
213
214
215 /*!
216 * numaConvertToInt()
217 *
218 * Input: na
219 * Return: na with all values rounded to nearest integer, or
220 * null on error
221 */
222 NUMA *
numaConvertToInt(NUMA * nas)223 numaConvertToInt(NUMA *nas)
224 {
225 l_int32 i, n, ival;
226 NUMA *nad;
227
228 PROCNAME("numaConvertToInt");
229
230 if (!nas)
231 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
232
233 n = numaGetCount(nas);
234 if ((nad = numaCreate(n)) == NULL)
235 return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
236 for (i = 0; i < n; i++) {
237 numaGetIValue(nas, i, &ival);
238 numaAddNumber(nad, ival);
239 }
240 return nad;
241 }
242
243
244 /*----------------------------------------------------------------------*
245 * Histogram generation and statistics *
246 *----------------------------------------------------------------------*/
247 /*!
248 * numaMakeHistogram()
249 *
250 * Input: na
251 * maxbins (max number of histogram bins)
252 * &binsize (<return> size of histogram bins)
253 * &binstart (<optional return> start val of minimum bin;
254 * input NULL to force start at 0)
255 * Return: na consisiting of histogram of integerized values,
256 * or null on error.
257 *
258 * Note:
259 * (1) This simple interface is designed for integer data.
260 * The bins are of integer width and start on integer boundaries,
261 * so the results on float data will not have high precision.
262 * (2) Specify the max number of input bins. Then @binsize,
263 * the size of bins necessary to accommodate the input data,
264 * is returned. It is one of the sequence:
265 * {1, 2, 5, 10, 20, 50, ...}.
266 * (3) If &binstart is given, all values are accommodated,
267 * and the min value of the starting bin is returned.
268 * Otherwise, all negative values are discarded and
269 * the histogram bins start at 0.
270 */
271 NUMA *
numaMakeHistogram(NUMA * na,l_int32 maxbins,l_int32 * pbinsize,l_int32 * pbinstart)272 numaMakeHistogram(NUMA *na,
273 l_int32 maxbins,
274 l_int32 *pbinsize,
275 l_int32 *pbinstart)
276 {
277 l_int32 i, n, ival, hval;
278 l_int32 iminval, imaxval, range, binsize, nbins, ibin;
279 l_float32 val, ratio;
280 NUMA *nai, *nahist;
281
282 PROCNAME("numaMakeHistogram");
283
284 if (!na)
285 return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
286 if (!pbinsize)
287 return (NUMA *)ERROR_PTR("&binsize not defined", procName, NULL);
288
289 /* Determine input range */
290 numaGetMin(na, &val, NULL);
291 iminval = (l_int32)(val + 0.5);
292 numaGetMax(na, &val, NULL);
293 imaxval = (l_int32)(val + 0.5);
294 if (pbinstart == NULL) { /* clip negative vals; start from 0 */
295 iminval = 0;
296 if (imaxval < 0)
297 return (NUMA *)ERROR_PTR("all values < 0", procName, NULL);
298 }
299
300 /* Determine binsize */
301 range = imaxval - iminval + 1;
302 if (range > maxbins - 1) {
303 ratio = (l_float64)range / (l_float64)maxbins;
304 binsize = 0;
305 for (i = 0; i < NBinSizes; i++) {
306 if (ratio < BinSizeArray[i]) {
307 binsize = BinSizeArray[i];
308 break;
309 }
310 }
311 if (binsize == 0)
312 return (NUMA *)ERROR_PTR("numbers too large", procName, NULL);
313 }
314 else
315 binsize = 1;
316 *pbinsize = binsize;
317 nbins = 1 + range / binsize; /* +1 seems to be sufficient */
318
319 /* Redetermine iminval */
320 if (pbinstart && binsize > 1) {
321 if (iminval >= 0)
322 iminval = binsize * (iminval / binsize);
323 else
324 iminval = binsize * ((iminval - binsize + 1) / binsize);
325 }
326 if (pbinstart)
327 *pbinstart = iminval;
328
329 #if DEBUG_HISTO
330 fprintf(stderr, " imaxval = %d, range = %d, nbins = %d\n",
331 imaxval, range, nbins);
332 #endif /* DEBUG_HISTO */
333
334 /* Use integerized data for input */
335 if ((nai = numaConvertToInt(na)) == NULL)
336 return (NUMA *)ERROR_PTR("nai not made", procName, NULL);
337 n = numaGetCount(nai);
338
339 /* Make histogram, converting value in input array
340 * into a bin number for this histogram array. */
341 if ((nahist = numaCreate(nbins)) == NULL)
342 return (NUMA *)ERROR_PTR("nahist not made", procName, NULL);
343 numaSetCount(nahist, nbins);
344 numaSetXParameters(nahist, iminval, binsize);
345 for (i = 0; i < n; i++) {
346 numaGetIValue(nai, i, &ival);
347 ibin = (ival - iminval) / binsize;
348 if (ibin >= 0 && ibin < nbins) {
349 numaGetIValue(nahist, ibin, &hval);
350 numaSetValue(nahist, ibin, hval + 1.0);
351 }
352 }
353
354 numaDestroy(&nai);
355 return nahist;
356 }
357
358
359 /*!
360 * numaMakeHistogramAuto()
361 *
362 * Input: na (numa of floats; these may be integers)
363 * maxbins (max number of histogram bins; >= 1)
364 * Return: na consisiting of histogram of quantized float values,
365 * or null on error.
366 *
367 * Notes:
368 * (1) This simple interface is designed for accurate binning
369 * of both integer and float data.
370 * (2) If the array data is integers, and the range of integers
371 * is smaller than @maxbins, they are binned as they fall,
372 * with binsize = 1.
373 * (3) If the range of data, (maxval - minval), is larger than
374 * @maxbins, or if the data is floats, they are binned into
375 * exactly @maxbins bins.
376 * (4) Unlike numaMakeHistogram(), these bins in general have
377 * non-integer location and width, even for integer data.
378 */
379 NUMA *
numaMakeHistogramAuto(NUMA * na,l_int32 maxbins)380 numaMakeHistogramAuto(NUMA *na,
381 l_int32 maxbins)
382 {
383 l_int32 i, n, imin, imax, irange, ibin, ival, allints;
384 l_float32 minval, maxval, range, binsize, fval;
385 NUMA *nah;
386
387 PROCNAME("numaMakeHistogramAuto");
388
389 if (!na)
390 return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
391 maxbins = L_MAX(1, maxbins);
392
393 /* Determine input range */
394 numaGetMin(na, &minval, NULL);
395 numaGetMax(na, &maxval, NULL);
396
397 /* Determine if values are all integers */
398 n = numaGetCount(na);
399 numaHasOnlyIntegers(na, maxbins, &allints);
400
401 /* Do simple integer binning if possible */
402 if (allints && (maxval - minval < maxbins)) {
403 imin = (l_int32)minval;
404 imax = (l_int32)maxval;
405 irange = imax - imin + 1;
406 nah = numaCreate(irange);
407 numaSetCount(nah, irange); /* init */
408 numaSetXParameters(nah, minval, 1.0);
409 for (i = 0; i < n; i++) {
410 numaGetIValue(na, i, &ival);
411 ibin = ival - imin;
412 numaGetIValue(nah, ibin, &ival);
413 numaSetValue(nah, ibin, ival + 1.0);
414 }
415
416 return nah;
417 }
418
419 /* Do float binning, even if the data is integers. */
420 range = maxval - minval;
421 binsize = range / (l_float32)maxbins;
422 if (range == 0.0) {
423 nah = numaCreate(1);
424 numaSetXParameters(nah, minval, binsize);
425 numaAddNumber(nah, n);
426 return nah;
427 }
428 nah = numaCreate(maxbins);
429 numaSetCount(nah, maxbins);
430 numaSetXParameters(nah, minval, binsize);
431 for (i = 0; i < n; i++) {
432 numaGetFValue(na, i, &fval);
433 ibin = (l_int32)((fval - minval) / binsize);
434 numaGetIValue(nah, ibin, &ival);
435 numaSetValue(nah, ibin, ival + 1.0);
436 }
437
438 return nah;
439 }
440
441
442 /*!
443 * numaMakeHistogramClipped()
444 *
445 * Input: na
446 * binsize (typically 1.0)
447 * maxsize (of histogram ordinate)
448 * Return: na (histogram of bins of size @binsize, starting with
449 * the na[0] (x = 0.0) and going up to a maximum of
450 * x = @maxsize, by increments of @binsize), or null on error
451 *
452 * Notes:
453 * (1) This simple function generates a histogram of values
454 * from na, discarding all values < 0.0 or greater than
455 * min(@maxsize, maxval), where maxval is the maximum value in na.
456 * The histogram data is put in bins of size delx = @binsize,
457 * starting at x = 0.0. We use as many bins as are
458 * needed to hold the data.
459 */
460 NUMA *
numaMakeHistogramClipped(NUMA * na,l_float32 binsize,l_float32 maxsize)461 numaMakeHistogramClipped(NUMA *na,
462 l_float32 binsize,
463 l_float32 maxsize)
464 {
465 l_int32 i, n, nbins, ival, ibin;
466 l_float32 val, maxval;
467 NUMA *nad;
468
469 PROCNAME("numaMakeHistogramClipped");
470
471 if (!na)
472 return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
473 if (binsize <= 0.0)
474 return (NUMA *)ERROR_PTR("binsize must be > 0.0", procName, NULL);
475 if (binsize > maxsize)
476 binsize = maxsize; /* just one bin */
477
478 numaGetMax(na, &maxval, NULL);
479 n = numaGetCount(na);
480 maxsize = L_MIN(maxsize, maxval);
481 nbins = (l_int32)(maxsize / binsize) + 1;
482
483 /* fprintf(stderr, "maxsize = %7.3f, nbins = %d\n", maxsize, nbins); */
484
485 if ((nad = numaCreate(nbins)) == NULL)
486 return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
487 numaSetXParameters(nad, 0.0, binsize);
488 numaSetCount(nad, nbins); /* interpret zeroes in bins as data */
489 for (i = 0; i < n; i++) {
490 numaGetFValue(na, i, &val);
491 ibin = (l_int32)(val / binsize);
492 if (ibin >= 0 && ibin < nbins) {
493 numaGetIValue(nad, ibin, &ival);
494 numaSetValue(nad, ibin, ival + 1.0);
495 }
496 }
497
498 return nad;
499 }
500
501
502 /*!
503 * numaRebinHistogram()
504 *
505 * Input: nas (input histogram)
506 * newsize (number of old bins contained in each new bin)
507 * Return: nad (more coarsely re-binned histogram), or null on error
508 */
509 NUMA *
numaRebinHistogram(NUMA * nas,l_int32 newsize)510 numaRebinHistogram(NUMA *nas,
511 l_int32 newsize)
512 {
513 l_int32 i, j, ns, nd, index, count, val;
514 l_float32 start, oldsize;
515 NUMA *nad;
516
517 PROCNAME("numaRebinHistogram");
518
519 if (!nas)
520 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
521 if (newsize <= 1)
522 return (NUMA *)ERROR_PTR("newsize must be > 1", procName, NULL);
523 if ((ns = numaGetCount(nas)) == 0)
524 return (NUMA *)ERROR_PTR("no bins in nas", procName, NULL);
525
526 nd = (ns + newsize - 1) / newsize;
527 if ((nad = numaCreate(nd)) == NULL)
528 return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
529 numaGetXParameters(nad, &start, &oldsize);
530 numaSetXParameters(nad, start, oldsize * newsize);
531
532 for (i = 0; i < nd; i++) { /* new bins */
533 count = 0;
534 index = i * newsize;
535 for (j = 0; j < newsize; j++) {
536 if (index < ns) {
537 numaGetIValue(nas, index, &val);
538 count += val;
539 index++;
540 }
541 }
542 numaAddNumber(nad, count);
543 }
544
545 return nad;
546 }
547
548
549 /*!
550 * numaNormalizeHistogram()
551 *
552 * Input: nas (input histogram)
553 * area (target sum of all numbers in dest histogram;
554 * e.g., use area = 1.0 if this represents a
555 * probability distribution)
556 * Return: nad (normalized histogram), or null on error
557 */
558 NUMA *
numaNormalizeHistogram(NUMA * nas,l_float32 area)559 numaNormalizeHistogram(NUMA *nas,
560 l_float32 area)
561 {
562 l_int32 i, ns;
563 l_float32 sum, factor, fval;
564 NUMA *nad;
565
566 PROCNAME("numaNormalizeHistogram");
567
568 if (!nas)
569 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
570 if (area <= 0.0)
571 return (NUMA *)ERROR_PTR("area must be > 0.0", procName, NULL);
572 if ((ns = numaGetCount(nas)) == 0)
573 return (NUMA *)ERROR_PTR("no bins in nas", procName, NULL);
574
575 numaGetSum(nas, &sum);
576 factor = area / sum;
577
578 if ((nad = numaCreate(ns)) == NULL)
579 return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
580 numaCopyXParameters(nad, nas);
581
582 for (i = 0; i < ns; i++) {
583 numaGetFValue(nas, i, &fval);
584 fval *= factor;
585 numaAddNumber(nad, fval);
586 }
587
588 return nad;
589 }
590
591
592 /*!
593 * numaGetStatsUsingHistogram()
594 *
595 * Input: na (an arbitrary set of numbers; not ordered and not
596 * a histogram)
597 * maxbins (the maximum number of bins to be allowed in
598 * the histogram; use 0 for consecutive integer bins)
599 * &min (<optional return> min value of set)
600 * &max (<optional return> max value of set)
601 * &mean (<optional return> mean value of set)
602 * &variance (<optional return> variance)
603 * &median (<optional return> median value of set)
604 * rank (in [0.0 ... 1.0]; median has a rank 0.5; ignored
605 * if &rval == NULL)
606 * &rval (<optional return> value in na corresponding to @rank)
607 * &histo (<optional return> Numa histogram; use NULL to prevent)
608 * Return: 0 if OK, 1 on error
609 *
610 * Notes:
611 * (1) This is a simple interface for gathering statistics
612 * from a numa, where a histogram is used 'under the covers'
613 * to avoid sorting if a rank value is requested. In that case,
614 * by using a histogram we are trading speed for accuracy, because
615 * the values in @na are quantized to the center of a set of bins.
616 * (2) If the median, other rank value, or histogram are not requested,
617 * the calculation is all performed on the input Numa.
618 * (3) The variance is the average of the square of the
619 * difference from the mean. The median is the value in na
620 * with rank 0.5.
621 * (4) There are two situations where this gives rank results with
622 * accuracy comparable to computing stastics directly on the input
623 * data, without binning into a histogram:
624 * (a) the data is integers and the range of data is less than
625 * @maxbins, and
626 * (b) the data is floats and the range is small compared to
627 * @maxbins, so that the binsize is much less than 1.
628 * (5) If a histogram is used and the numbers in the Numa extend
629 * over a large range, you can limit the required storage by
630 * specifying the maximum number of bins in the histogram.
631 * Use @maxbins == 0 to force the bin size to be 1.
632 * (6) This optionally returns the median and one arbitrary rank value.
633 * If you need several rank values, return the histogram and use
634 * numaHistogramGetValFromRank(nah, rank, &rval)
635 * multiple times.
636 */
637 l_int32
numaGetStatsUsingHistogram(NUMA * na,l_int32 maxbins,l_float32 * pmin,l_float32 * pmax,l_float32 * pmean,l_float32 * pvariance,l_float32 * pmedian,l_float32 rank,l_float32 * prval,NUMA ** phisto)638 numaGetStatsUsingHistogram(NUMA *na,
639 l_int32 maxbins,
640 l_float32 *pmin,
641 l_float32 *pmax,
642 l_float32 *pmean,
643 l_float32 *pvariance,
644 l_float32 *pmedian,
645 l_float32 rank,
646 l_float32 *prval,
647 NUMA **phisto)
648 {
649 l_int32 i, n;
650 l_float32 minval, maxval, fval, mean, sum;
651 NUMA *nah;
652
653 PROCNAME("numaGetStatsUsingHistogram");
654
655 if (pmin) *pmin = 0.0;
656 if (pmax) *pmax = 0.0;
657 if (pmean) *pmean = 0.0;
658 if (pmedian) *pmedian = 0.0;
659 if (pvariance) *pvariance = 0.0;
660 if (!na)
661 return ERROR_INT("na not defined", procName, 1);
662 if ((n = numaGetCount(na)) == 0)
663 return ERROR_INT("numa is empty", procName, 1);
664
665 numaGetMin(na, &minval, NULL);
666 numaGetMax(na, &maxval, NULL);
667 if (pmin) *pmin = minval;
668 if (pmax) *pmax = maxval;
669 if (pmean || pvariance) {
670 sum = 0.0;
671 for (i = 0; i < n; i++) {
672 numaGetFValue(na, i, &fval);
673 sum += fval;
674 }
675 mean = sum / (l_float32)n;
676 if (pmean) *pmean = mean;
677 }
678 if (pvariance) {
679 sum = 0.0;
680 for (i = 0; i < n; i++) {
681 numaGetFValue(na, i, &fval);
682 sum += fval * fval;
683 }
684 *pvariance = sum / (l_float32)n - mean * mean;
685 }
686
687 if (!pmedian && !prval && !phisto)
688 return 0;
689
690 nah = numaMakeHistogramAuto(na, maxbins);
691 if (pmedian)
692 numaHistogramGetValFromRank(nah, 0.5, pmedian);
693 if (prval)
694 numaHistogramGetValFromRank(nah, rank, prval);
695 if (phisto)
696 *phisto = nah;
697 else
698 numaDestroy(&nah);
699 return 0;
700 }
701
702
703 /*!
704 * numaGetHistogramStats()
705 *
706 * Input: nahisto (histogram: y(x(i)), i = 0 ... nbins - 1)
707 * startx (x value of first bin: x(0))
708 * deltax (x increment between bins; the bin size; x(1) - x(0))
709 * &xmean (<optional return> mean value of histogram)
710 * &xmedian (<optional return> median value of histogram)
711 * &xmode (<optional return> mode value of histogram:
712 * xmode = x(imode), where y(xmode) >= y(x(i)) for
713 * all i != imode)
714 * &xvariance (<optional return> variance of x)
715 * Return: 0 if OK, 1 on error
716 *
717 * Notes:
718 * (1) If the histogram represents the relation y(x), the
719 * computed values that are returned are the x values.
720 * These are NOT the bucket indices i; they are related to the
721 * bucket indices by
722 * x(i) = startx + i * deltax
723 */
724 l_int32
numaGetHistogramStats(NUMA * nahisto,l_float32 startx,l_float32 deltax,l_float32 * pxmean,l_float32 * pxmedian,l_float32 * pxmode,l_float32 * pxvariance)725 numaGetHistogramStats(NUMA *nahisto,
726 l_float32 startx,
727 l_float32 deltax,
728 l_float32 *pxmean,
729 l_float32 *pxmedian,
730 l_float32 *pxmode,
731 l_float32 *pxvariance)
732 {
733 PROCNAME("numaGetHistogramStats");
734
735 if (pxmean) *pxmean = 0.0;
736 if (pxmedian) *pxmedian = 0.0;
737 if (pxmode) *pxmode = 0.0;
738 if (pxvariance) *pxvariance = 0.0;
739 if (!nahisto)
740 return ERROR_INT("nahisto not defined", procName, 1);
741
742 return numaGetHistogramStatsOnInterval(nahisto, startx, deltax, 0, 0,
743 pxmean, pxmedian, pxmode,
744 pxvariance);
745 }
746
747
748 /*!
749 * numaGetHistogramStatsOnInterval()
750 *
751 * Input: nahisto (histogram: y(x(i)), i = 0 ... nbins - 1)
752 * startx (x value of first bin: x(0))
753 * deltax (x increment between bins; the bin size; x(1) - x(0))
754 * ifirst (first bin to use for collecting stats)
755 * ilast (last bin for collecting stats; use 0 to go to the end)
756 * &xmean (<optional return> mean value of histogram)
757 * &xmedian (<optional return> median value of histogram)
758 * &xmode (<optional return> mode value of histogram:
759 * xmode = x(imode), where y(xmode) >= y(x(i)) for
760 * all i != imode)
761 * &xvariance (<optional return> variance of x)
762 * Return: 0 if OK, 1 on error
763 *
764 * Notes:
765 * (1) If the histogram represents the relation y(x), the
766 * computed values that are returned are the x values.
767 * These are NOT the bucket indices i; they are related to the
768 * bucket indices by
769 * x(i) = startx + i * deltax
770 */
771 l_int32
numaGetHistogramStatsOnInterval(NUMA * nahisto,l_float32 startx,l_float32 deltax,l_int32 ifirst,l_int32 ilast,l_float32 * pxmean,l_float32 * pxmedian,l_float32 * pxmode,l_float32 * pxvariance)772 numaGetHistogramStatsOnInterval(NUMA *nahisto,
773 l_float32 startx,
774 l_float32 deltax,
775 l_int32 ifirst,
776 l_int32 ilast,
777 l_float32 *pxmean,
778 l_float32 *pxmedian,
779 l_float32 *pxmode,
780 l_float32 *pxvariance)
781 {
782 l_int32 i, n, imax;
783 l_float32 sum, sumval, halfsum, moment, var, x, y, ymax;
784
785 PROCNAME("numaGetHistogramStats");
786
787 if (pxmean) *pxmean = 0.0;
788 if (pxmedian) *pxmedian = 0.0;
789 if (pxmode) *pxmode = 0.0;
790 if (pxvariance) *pxvariance = 0.0;
791 if (!nahisto)
792 return ERROR_INT("nahisto not defined", procName, 1);
793 if (!pxmean && !pxmedian && !pxmode && !pxvariance)
794 return ERROR_INT("nothing to compute", procName, 1);
795
796 n = numaGetCount(nahisto);
797 if (ilast <= 0) ilast = n - 1;
798 if (ifirst < 0) ifirst = 0;
799 if (ifirst > ilast || ifirst > n - 1)
800 return ERROR_INT("ifirst is too large", procName, 1);
801 for (sum = 0.0, moment = 0.0, var = 0.0, i = ifirst; i <= ilast ; i++) {
802 x = startx + i * deltax;
803 numaGetFValue(nahisto, i, &y);
804 sum += y;
805 moment += x * y;
806 var += x * x * y;
807 }
808 if (sum == 0.0)
809 return ERROR_INT("sum is 0", procName, 1);
810
811 if (pxmean)
812 *pxmean = moment / sum;
813 if (pxvariance)
814 *pxvariance = var / sum - moment * moment / (sum * sum);
815
816 if (pxmedian) {
817 halfsum = sum / 2.0;
818 for (sumval = 0.0, i = ifirst; i <= ilast; i++) {
819 numaGetFValue(nahisto, i, &y);
820 sumval += y;
821 if (sumval >= halfsum) {
822 *pxmedian = startx + i * deltax;
823 break;
824 }
825 }
826 }
827
828 if (pxmode) {
829 ymax = -1.0e10;
830 for (i = ifirst; i <= ilast; i++) {
831 numaGetFValue(nahisto, i, &y);
832 if (y > ymax) {
833 ymax = y;
834 imax = i;
835 }
836 }
837 *pxmode = startx + imax * deltax;
838 }
839
840 return 0;
841 }
842
843
844 /*!
845 * numaMakeRankFromHistogram()
846 *
847 * Input: startx (xval corresponding to first element in nay)
848 * deltax (x increment between array elements in nay)
849 * nasy (input histogram, assumed equally spaced)
850 * npts (number of points to evaluate rank function)
851 * &nax (<optional return> array of x values in range)
852 * &nay (<return> rank array of specified npts)
853 * Return: 0 if OK, 1 on error
854 */
855 l_int32
numaMakeRankFromHistogram(l_float32 startx,l_float32 deltax,NUMA * nasy,l_int32 npts,NUMA ** pnax,NUMA ** pnay)856 numaMakeRankFromHistogram(l_float32 startx,
857 l_float32 deltax,
858 NUMA *nasy,
859 l_int32 npts,
860 NUMA **pnax,
861 NUMA **pnay)
862 {
863 l_int32 i, n;
864 l_float32 sum, fval;
865 NUMA *nan, *nar;
866
867 PROCNAME("numaMakeRankFromHistogram");
868
869 if (pnax) *pnax = NULL;
870 if (!pnay)
871 return ERROR_INT("&nay not defined", procName, 1);
872 *pnay = NULL;
873 if (!nasy)
874 return ERROR_INT("nasy not defined", procName, 1);
875 if (!pnay)
876 return ERROR_INT("&nay not defined", procName, 1);
877 if ((n = numaGetCount(nasy)) == 0)
878 return ERROR_INT("no bins in nas", procName, 1);
879
880 /* Normalize and generate the rank array corresponding to
881 * the binned histogram. */
882 nan = numaNormalizeHistogram(nasy, 1.0);
883 nar = numaCreate(n + 1); /* rank numa corresponding to nan */
884 sum = 0.0;
885 numaAddNumber(nar, sum); /* first element is 0.0 */
886 for (i = 0; i < n; i++) {
887 numaGetFValue(nan, i, &fval);
888 sum += fval;
889 numaAddNumber(nar, sum);
890 }
891
892 /* Compute rank array on full range with specified
893 * number of points and correspondence to x-values. */
894 numaInterpolateEqxInterval(startx, deltax, nar, L_LINEAR_INTERP,
895 startx, startx + n * deltax, npts,
896 pnax, pnay);
897 numaDestroy(&nan);
898 numaDestroy(&nar);
899 return 0;
900 }
901
902
903 /*!
904 * numaHistogramGetRankFromVal()
905 *
906 * Input: na (histogram)
907 * rval (value of input sample for which we want the rank)
908 * &rank (<return> fraction of total samples below rval)
909 * Return: 0 if OK, 1 on error
910 *
911 * Notes:
912 * (1) If we think of the histogram as a function y(x), normalized
913 * to 1, for a given input value of x, this computes the
914 * rank of x, which is the integral of y(x) from the start
915 * value of x to the input value.
916 * (2) This function only makes sense when applied to a Numa that
917 * is a histogram. The values in the histogram can be ints and
918 * floats, and are computed as floats. The rank is returned
919 * as a float between 0.0 and 1.0.
920 * (3) The numa parameters startx and binsize are used to
921 * compute x from the Numa index i.
922 */
923 l_int32
numaHistogramGetRankFromVal(NUMA * na,l_float32 rval,l_float32 * prank)924 numaHistogramGetRankFromVal(NUMA *na,
925 l_float32 rval,
926 l_float32 *prank)
927 {
928 l_int32 i, ibinval, n;
929 l_float32 startval, binsize, binval, maxval, fractval, total, sum, val;
930
931 PROCNAME("numaHistogramGetRankFromVal");
932
933 if (!prank)
934 return ERROR_INT("prank not defined", procName, 1);
935 *prank = 0.0;
936 if (!na)
937 return ERROR_INT("na not defined", procName, 1);
938 numaGetXParameters(na, &startval, &binsize);
939 n = numaGetCount(na);
940 if (rval < startval)
941 return 0;
942 maxval = startval + n * binsize;
943 if (rval > maxval) {
944 *prank = 1.0;
945 return 0;
946 }
947
948 binval = (rval - startval) / binsize;
949 ibinval = (l_int32)binval;
950 if (ibinval >= n) {
951 *prank = 1.0;
952 return 0;
953 }
954 fractval = binval - (l_float32)ibinval;
955
956 sum = 0.0;
957 for (i = 0; i < ibinval; i++) {
958 numaGetFValue(na, i, &val);
959 sum += val;
960 }
961 numaGetFValue(na, ibinval, &val);
962 sum += fractval * val;
963 numaGetSum(na, &total);
964 *prank = sum / total;
965
966 /* fprintf(stderr, "binval = %7.3f, rank = %7.3f\n", binval, *prank); */
967
968 return 0;
969 }
970
971
972 /*!
973 * numaHistogramGetValFromRank()
974 *
975 * Input: na (histogram)
976 * rank (fraction of total samples)
977 * &rval (<return> approx. to the bin value)
978 * Return: 0 if OK, 1 on error
979 *
980 * Notes:
981 * (1) If we think of the histogram as a function y(x), this returns
982 * the value x such that the integral of y(x) from the start
983 * value to x gives the fraction 'rank' of the integral
984 * of y(x) over all bins.
985 * (2) This function only makes sense when applied to a Numa that
986 * is a histogram. The values in the histogram can be ints and
987 * floats, and are computed as floats. The val is returned
988 * as a float, even though the buckets are of integer width.
989 * (3) The numa parameters startx and binsize are used to
990 * compute x from the Numa index i.
991 */
992 l_int32
numaHistogramGetValFromRank(NUMA * na,l_float32 rank,l_float32 * prval)993 numaHistogramGetValFromRank(NUMA *na,
994 l_float32 rank,
995 l_float32 *prval)
996 {
997 l_int32 i, n;
998 l_float32 startval, binsize, rankcount, total, sum, fract, val;
999
1000 PROCNAME("numaHistogramGetValFromRank");
1001
1002 if (!prval)
1003 return ERROR_INT("prval not defined", procName, 1);
1004 *prval = 0.0;
1005 if (!na)
1006 return ERROR_INT("na not defined", procName, 1);
1007 if (rank < 0.0) {
1008 L_WARNING("rank < 0; setting to 0.0", procName);
1009 rank = 0.0;
1010 }
1011 if (rank > 1.0) {
1012 L_WARNING("rank > 1.0; setting to 1.0", procName);
1013 rank = 1.0;
1014 }
1015
1016 n = numaGetCount(na);
1017 numaGetXParameters(na, &startval, &binsize);
1018 numaGetSum(na, &total);
1019 rankcount = rank * total; /* count that corresponds to rank */
1020 sum = 0.0;
1021 for (i = 0; i < n; i++) {
1022 numaGetFValue(na, i, &val);
1023 if (sum + val >= rankcount)
1024 break;
1025 sum += val;
1026 }
1027 if (val <= 0.0) /* can == 0 if rank == 0.0 */
1028 fract = 0.0;
1029 else /* sum + fract * val = rankcount */
1030 fract = (rankcount - sum) / val;
1031
1032 /* The use of the fraction of a bin allows a simple calculation
1033 * for the histogram value at the given rank. */
1034 *prval = startval + binsize * ((l_float32)i + fract);
1035
1036 /* fprintf(stderr, "rank = %7.3f, val = %7.3f\n", rank, *prval); */
1037
1038 return 0;
1039 }
1040
1041
1042 /*----------------------------------------------------------------------*
1043 * Splitting a distribution *
1044 *----------------------------------------------------------------------*/
1045 /*!
1046 * numaSplitDistribution()
1047 *
1048 * Input: na (histogram)
1049 * scorefract (fraction of the max score, used to determine
1050 * the range over which the histogram min is searched)
1051 * &splitindex (<optional return> index for splitting)
1052 * &ave1 (<optional return> average of lower distribution)
1053 * &ave2 (<optional return> average of upper distribution)
1054 * &num1 (<optional return> population of lower distribution)
1055 * &num2 (<optional return> population of upper distribution)
1056 * &nascore (<optional return> for debugging; otherwise use null)
1057 * Return: 0 if OK, 1 on error
1058 *
1059 * Notes:
1060 * (1) This function is intended to be used on a distribution of
1061 * values that represent two sets, such as a histogram of
1062 * pixel values, and the goal is to determine the means of
1063 * the two sets and the best splitting point.
1064 * (2) The Otsu method finds a split point that divides the distribution
1065 * into two parts by maximizing a score function that is the
1066 * product of two terms:
1067 * (a) the square of the difference of centroids, (ave1 - ave2)^2
1068 * (b) fract1 * (1 - fract1)
1069 * where fract1 is the fraction in the lower distribution.
1070 * This biases the split point into the larger "bump" (i.e., toward
1071 * the point where the (b) term reaches its maximum of 0.25 at
1072 * fract1 = 0.5. To avoid this, we define a range of values near
1073 * the maximum of the score function, and choose the value within
1074 * this range such that the histogram itself has a minimum value.
1075 * The range is determined by scorefract: we include all abscissa
1076 * values to the left and right of the value that maximizes the
1077 * score, such that the score stays above (1 - scorefract) * maxscore.
1078 * (3) We normalize the score so that if the two distributions
1079 * were of equal size and at opposite ends of the numa, the
1080 * score would be 1.0.
1081 */
1082 l_int32
numaSplitDistribution(NUMA * na,l_float32 scorefract,l_int32 * psplitindex,l_float32 * pave1,l_float32 * pave2,l_float32 * pnum1,l_float32 * pnum2,NUMA ** pnascore)1083 numaSplitDistribution(NUMA *na,
1084 l_float32 scorefract,
1085 l_int32 *psplitindex,
1086 l_float32 *pave1,
1087 l_float32 *pave2,
1088 l_float32 *pnum1,
1089 l_float32 *pnum2,
1090 NUMA **pnascore)
1091 {
1092 l_int32 i, n, bestsplit, minrange, maxrange, maxindex;
1093 l_float32 ave1, ave2, ave1prev, ave2prev;
1094 l_float32 num1, num2, num1prev, num2prev;
1095 l_float32 val, minval, sum, fract1;
1096 l_float32 norm, score, minscore, maxscore;
1097 NUMA *nascore, *naave1, *naave2, *nanum1, *nanum2;
1098
1099 PROCNAME("numaSplitDistribution");
1100
1101 if (!na)
1102 return ERROR_INT("na not defined", procName, 1);
1103
1104 n = numaGetCount(na);
1105 if (n <= 1)
1106 return ERROR_INT("n = 1 in histogram", procName, 1);
1107 numaGetSum(na, &sum);
1108 if (sum <= 0.0)
1109 return ERROR_INT("sum <= 0.0", procName, 1);
1110 norm = 4.0 / ((n - 1) * (n - 1));
1111 ave1prev = 0.0;
1112 numaGetHistogramStats(na, 0.0, 1.0, &ave2prev, NULL, NULL, NULL);
1113 num1prev = 0.0;
1114 num2prev = sum;
1115 maxindex = n / 2; /* initialize with something */
1116
1117 /* Split the histogram with [0 ... i] in the lower part
1118 * and [i+1 ... n-1] in upper part. First, compute an otsu
1119 * score for each possible splitting. */
1120 nascore = numaCreate(n);
1121 if (pave2) naave1 = numaCreate(n);
1122 if (pave2) naave2 = numaCreate(n);
1123 if (pnum1) nanum1 = numaCreate(n);
1124 if (pnum2) nanum2 = numaCreate(n);
1125 maxscore = 0.0;
1126 for (i = 0; i < n; i++) {
1127 numaGetFValue(na, i, &val);
1128 num1 = num1prev + val;
1129 if (num1 == 0)
1130 ave1 = ave1prev;
1131 else
1132 ave1 = (num1prev * ave1prev + i * val) / num1;
1133 num2 = num2prev - val;
1134 if (num2 == 0)
1135 ave2 = ave2prev;
1136 else
1137 ave2 = (num2prev * ave2prev - i * val) / num2;
1138 fract1 = num1 / sum;
1139 score = norm * (fract1 * (1 - fract1)) * (ave2 - ave1) * (ave2 - ave1);
1140 numaAddNumber(nascore, score);
1141 if (pave1) numaAddNumber(naave1, ave1);
1142 if (pave2) numaAddNumber(naave2, ave2);
1143 if (pnum1) numaAddNumber(nanum1, num1);
1144 if (pnum1) numaAddNumber(nanum2, num2);
1145 if (score > maxscore) {
1146 maxscore = score;
1147 maxindex = i;
1148 }
1149 num1prev = num1;
1150 num2prev = num2;
1151 ave1prev = ave1;
1152 ave2prev = ave2;
1153 }
1154
1155 /* Next, for all contiguous scores within a specified fraction
1156 * of the max, choose the split point as the value with the
1157 * minimum in the histogram. */
1158 minscore = (1. - scorefract) * maxscore;
1159 for (i = maxindex - 1; i >= 0; i--) {
1160 numaGetFValue(nascore, i, &val);
1161 if (val < minscore)
1162 break;
1163 }
1164 minrange = i + 1;
1165 for (i = maxindex + 1; i < n; i++) {
1166 numaGetFValue(nascore, i, &val);
1167 if (val < minscore)
1168 break;
1169 }
1170 maxrange = i - 1;
1171 numaGetFValue(na, minrange, &minval);
1172 bestsplit = minrange;
1173 for (i = minrange + 1; i <= maxrange; i++) {
1174 numaGetFValue(na, i, &val);
1175 if (val < minval) {
1176 minval = val;
1177 bestsplit = i;
1178 }
1179 }
1180
1181 if (psplitindex) *psplitindex = bestsplit;
1182 if (pave1) numaGetFValue(naave1, bestsplit, pave1);
1183 if (pave2) numaGetFValue(naave2, bestsplit, pave2);
1184 if (pnum1) numaGetFValue(nanum1, bestsplit, pnum1);
1185 if (pnum2) numaGetFValue(nanum2, bestsplit, pnum2);
1186
1187 if (pnascore) { /* debug mode */
1188 fprintf(stderr, "minrange = %d, maxrange = %d\n", minrange, maxrange);
1189 fprintf(stderr, "minval = %10.0f\n", minval);
1190 gplotSimple1(nascore, GPLOT_X11, "junkoutroot",
1191 "Score for split distribution");
1192 *pnascore = nascore;
1193 }
1194 else
1195 numaDestroy(&nascore);
1196
1197 if (pave1) numaDestroy(&naave1);
1198 if (pave2) numaDestroy(&naave2);
1199 if (pnum1) numaDestroy(&nanum1);
1200 if (pnum2) numaDestroy(&nanum2);
1201 return 0;
1202 }
1203
1204
1205 /*----------------------------------------------------------------------*
1206 * Extrema finding *
1207 *----------------------------------------------------------------------*/
1208 /*!
1209 * numaFindPeaks()
1210 *
1211 * Input: source na
1212 * max number of peaks to be found
1213 * fract1 (min fraction of peak value)
1214 * fract2 (min slope)
1215 * Return: peak na, or null on error.
1216 *
1217 * Notes:
1218 * (1) The returned na consists of sets of four numbers representing
1219 * the peak, in the following order:
1220 * left edge; peak center; right edge; normalized peak area
1221 */
1222 NUMA *
numaFindPeaks(NUMA * nas,l_int32 nmax,l_float32 fract1,l_float32 fract2)1223 numaFindPeaks(NUMA *nas,
1224 l_int32 nmax,
1225 l_float32 fract1,
1226 l_float32 fract2)
1227 {
1228 l_int32 i, k, n, maxloc, lloc, rloc;
1229 l_float32 fmaxval, sum, total, newtotal, val, lastval;
1230 l_float32 peakfract;
1231 NUMA *na, *napeak;
1232
1233 PROCNAME("numaFindPeaks");
1234
1235 if (!nas)
1236 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
1237 n = numaGetCount(nas);
1238 numaGetSum(nas, &total);
1239
1240 /* We munge this copy */
1241 if ((na = numaCopy(nas)) == NULL)
1242 return (NUMA *)ERROR_PTR("na not made", procName, NULL);
1243 if ((napeak = numaCreate(4 * nmax)) == NULL)
1244 return (NUMA *)ERROR_PTR("napeak not made", procName, NULL);
1245
1246 for (k = 0; k < nmax; k++) {
1247 numaGetSum(na, &newtotal);
1248 if (newtotal == 0.0) /* sanity check */
1249 break;
1250 numaGetMax(na, &fmaxval, &maxloc);
1251 sum = fmaxval;
1252 lastval = fmaxval;
1253 lloc = 0;
1254 for (i = maxloc - 1; i >= 0; --i) {
1255 numaGetFValue(na, i, &val);
1256 if (val == 0.0) {
1257 lloc = i + 1;
1258 break;
1259 }
1260 if (val > fract1 * fmaxval) {
1261 sum += val;
1262 lastval = val;
1263 continue;
1264 }
1265 if (lastval - val > fract2 * lastval) {
1266 sum += val;
1267 lastval = val;
1268 continue;
1269 }
1270 lloc = i;
1271 break;
1272 }
1273 lastval = fmaxval;
1274 rloc = n - 1;
1275 for (i = maxloc + 1; i < n; ++i) {
1276 numaGetFValue(na, i, &val);
1277 if (val == 0.0) {
1278 rloc = i - 1;
1279 break;
1280 }
1281 if (val > fract1 * fmaxval) {
1282 sum += val;
1283 lastval = val;
1284 continue;
1285 }
1286 if (lastval - val > fract2 * lastval) {
1287 sum += val;
1288 lastval = val;
1289 continue;
1290 }
1291 rloc = i;
1292 break;
1293 }
1294 peakfract = sum / total;
1295 numaAddNumber(napeak, lloc);
1296 numaAddNumber(napeak, maxloc);
1297 numaAddNumber(napeak, rloc);
1298 numaAddNumber(napeak, peakfract);
1299
1300 for (i = lloc; i <= rloc; i++)
1301 numaSetValue(na, i, 0.0);
1302 }
1303
1304 numaDestroy(&na);
1305 return napeak;
1306 }
1307
1308
1309 /*!
1310 * numaFindExtrema()
1311 *
1312 * Input: nas (input values)
1313 * delta (relative amount to resolve peaks and valleys)
1314 * Return: nad (locations of extrema), or null on error
1315 *
1316 * Notes:
1317 * (1) This returns a sequence of extrema (peaks and valleys).
1318 * (2) The algorithm is analogous to that for determining
1319 * mountain peaks. Suppose we have a local peak, with
1320 * bumps on the side. Under what conditions can we consider
1321 * those 'bumps' to be actual peaks? The answer: if the
1322 * bump is separated from the peak by a saddle that is at
1323 * least 500 feet below the bump.
1324 * (3) Operationally, suppose we are looking for a peak.
1325 * We are keeping the largest value we've seen since the
1326 * last valley, and are looking for a value that is delta
1327 * BELOW our current peak. When we find such a value,
1328 * we label the peak, use the current value to label the
1329 * valley, and then do the same operation in reverse (looking
1330 * for a valley).
1331 */
1332 NUMA *
numaFindExtrema(NUMA * nas,l_float32 delta)1333 numaFindExtrema(NUMA *nas,
1334 l_float32 delta)
1335 {
1336 l_int32 i, n, found, loc, direction;
1337 l_float32 startval, val, maxval, minval;
1338 NUMA *nad;
1339
1340 PROCNAME("numaFindExtrema");
1341
1342 if (!nas)
1343 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
1344
1345 n = numaGetCount(nas);
1346 nad = numaCreate(0);
1347
1348 /* We don't know if we'll find a peak or valley first,
1349 * but use the first element of nas as the reference point.
1350 * Break when we deviate by 'delta' from the first point. */
1351 numaGetFValue(nas, 0, &startval);
1352 found = FALSE;
1353 for (i = 1; i < n; i++) {
1354 numaGetFValue(nas, i, &val);
1355 if (L_ABS(val - startval) >= delta) {
1356 found = TRUE;
1357 break;
1358 }
1359 }
1360
1361 if (!found)
1362 return nad; /* it's empty */
1363
1364 /* Are we looking for a peak or a valley? */
1365 if (val > startval) { /* peak */
1366 direction = 1;
1367 maxval = val;
1368 }
1369 else {
1370 direction = -1;
1371 minval = val;
1372 }
1373 loc = i;
1374
1375 /* Sweep through the rest of the array, recording alternating
1376 * peak/valley extrema. */
1377 for (i = i + 1; i < n; i++) {
1378 numaGetFValue(nas, i, &val);
1379 if (direction == 1 && val > maxval ) { /* new local max */
1380 maxval = val;
1381 loc = i;
1382 }
1383 else if (direction == -1 && val < minval ) { /* new local min */
1384 minval = val;
1385 loc = i;
1386 }
1387 else if (direction == 1 && (maxval - val >= delta)) {
1388 numaAddNumber(nad, loc); /* save the current max location */
1389 direction = -1; /* reverse: start looking for a min */
1390 minval = val;
1391 loc = i; /* current min location */
1392 }
1393 else if (direction == -1 && (val - minval >= delta)) {
1394 numaAddNumber(nad, loc); /* save the current min location */
1395 direction = 1; /* reverse: start looking for a max */
1396 maxval = val;
1397 loc = i; /* current max location */
1398 }
1399 }
1400
1401 /* Save the final extremum */
1402 /* numaAddNumber(nad, loc); */
1403 return nad;
1404 }
1405
1406
1407 /*----------------------------------------------------------------------*
1408 * Threshold crossings and frequency analysis *
1409 *----------------------------------------------------------------------*/
1410 /*!
1411 * numaSelectCrossingThreshold()
1412 *
1413 * Input: nax (<optional> numa of abscissa values; can be NULL)
1414 * nay (signal)
1415 * estthresh (estimated pixel threshold for crossing: e.g., for
1416 * images, white <--> black; typ. ~120)
1417 * &bestthresh (<return> robust estimate of threshold to use)
1418 * Return: 0 if OK, 1 on error
1419 *
1420 * Note:
1421 * (1) When a valid threshold is used, the number of crossings is
1422 * a maximum, because none are missed. If no threshold intersects
1423 * all the crossings, the crossings must be determined with
1424 * numaCrossingsByPeaks().
1425 * (2) @estthresh is an input estimate of the threshold that should
1426 * be used. We compute the crossings with 41 thresholds
1427 * (20 below and 20 above). There is a range in which the
1428 * number of crossings is a maximum. Return a threshold
1429 * in the center of this stable plateau of crossings.
1430 * This can then be used with numaCrossingsByThreshold()
1431 * to get a good estimate of crossing locations.
1432 */
1433 l_int32
numaSelectCrossingThreshold(NUMA * nax,NUMA * nay,l_float32 estthresh,l_float32 * pbestthresh)1434 numaSelectCrossingThreshold(NUMA *nax,
1435 NUMA *nay,
1436 l_float32 estthresh,
1437 l_float32 *pbestthresh)
1438 {
1439 l_int32 i, inrun, istart, iend, maxstart, maxend, runlen, maxrunlen;
1440 l_int32 val, maxval, nmax, count;
1441 l_float32 thresh, fmaxval, fmodeval;
1442 NUMA *nat, *nac;
1443
1444 PROCNAME("numaSelectCrossingThreshold");
1445
1446 if (!nay)
1447 return ERROR_INT("nay not defined", procName, 1);
1448
1449 /* Compute the number of crossings for different thresholds */
1450 nat = numaCreate(41);
1451 for (i = 0; i < 41; i++) {
1452 thresh = estthresh - 80.0 + 4.0 * i;
1453 nac = numaCrossingsByThreshold(nax, nay, thresh);
1454 numaAddNumber(nat, numaGetCount(nac));
1455 numaDestroy(&nac);
1456 }
1457
1458 /* Find the center of the plateau of max crossings, which
1459 * extends from thresh[istart] to thresh[iend]. */
1460 numaGetMax(nat, &fmaxval, NULL);
1461 maxval = (l_int32)fmaxval;
1462 nmax = 0;
1463 for (i = 0; i < 41; i++) {
1464 numaGetIValue(nat, i, &val);
1465 if (val == maxval)
1466 nmax++;
1467 }
1468 if (nmax < 3) { /* likely accidental max; try the mode */
1469 numaGetMode(nat, &fmodeval, &count);
1470 if (count > nmax && fmodeval > 0.5 * fmaxval)
1471 maxval = (l_int32)fmodeval; /* use the mode */
1472 }
1473
1474 inrun = FALSE;
1475 iend = 40;
1476 maxrunlen = 0;
1477 for (i = 0; i < 41; i++) {
1478 numaGetIValue(nat, i, &val);
1479 if (val == maxval) {
1480 if (!inrun) {
1481 istart = i;
1482 inrun = TRUE;
1483 }
1484 continue;
1485 }
1486 if (inrun && (val != maxval)) {
1487 iend = i - 1;
1488 runlen = iend - istart + 1;
1489 inrun = FALSE;
1490 if (runlen > maxrunlen) {
1491 maxstart = istart;
1492 maxend = iend;
1493 maxrunlen = runlen;
1494 }
1495 }
1496 }
1497 if (inrun) {
1498 runlen = i - istart;
1499 if (runlen > maxrunlen) {
1500 maxstart = istart;
1501 maxend = i - 1;
1502 maxrunlen = runlen;
1503 }
1504 }
1505
1506 #if 0
1507 foundfirst = FALSE;
1508 iend = 40;
1509 for (i = 0; i < 41; i++) {
1510 numaGetIValue(nat, i, &val);
1511 if (val == maxval) {
1512 if (!foundfirst) {
1513 istart = i;
1514 foundfirst = TRUE;
1515 }
1516 }
1517 if ((val != maxval) && foundfirst) {
1518 iend = i - 1;
1519 break;
1520 }
1521 }
1522 nmax = iend - istart + 1;
1523 #endif
1524
1525 *pbestthresh = estthresh - 80.0 + 2.0 * (l_float32)(maxstart + maxend);
1526
1527 #if DEBUG_CROSSINGS
1528 fprintf(stderr, "\nCrossings attain a maximum at %d thresholds, between:\n"
1529 " thresh[%d] = %5.1f and thresh[%d] = %5.1f\n",
1530 nmax, maxstart, estthresh - 80.0 + 4.0 * maxstart,
1531 maxend, estthresh - 80.0 + 4.0 * maxend);
1532 fprintf(stderr, "The best choice: %5.1f\n", *pbestthresh);
1533 fprintf(stderr, "Number of crossings at the 41 thresholds:");
1534 numaWriteStream(stderr, nat);
1535 #endif /* DEBUG_CROSSINGS */
1536
1537 numaDestroy(&nat);
1538 return 0;
1539 }
1540
1541
1542 /*!
1543 * numaCrossingsByThreshold()
1544 *
1545 * Input: nax (<optional> numa of abscissa values; can be NULL)
1546 * nay (numa of ordinate values, corresponding to nax)
1547 * thresh (threshold value for nay)
1548 * Return: nad (abscissa pts at threshold), or null on error
1549 *
1550 * Notes:
1551 * (1) If nax == NULL, we use startx and delx from nay to compute
1552 * the crossing values in nad.
1553 */
1554 NUMA *
numaCrossingsByThreshold(NUMA * nax,NUMA * nay,l_float32 thresh)1555 numaCrossingsByThreshold(NUMA *nax,
1556 NUMA *nay,
1557 l_float32 thresh)
1558 {
1559 l_int32 i, n;
1560 l_float32 startx, delx;
1561 l_float32 xval1, xval2, yval1, yval2, delta1, delta2, crossval, fract;
1562 NUMA *nad;
1563
1564 PROCNAME("numaCrossingsByThreshold");
1565
1566 if (!nay)
1567 return (NUMA *)ERROR_PTR("nay not defined", procName, NULL);
1568 n = numaGetCount(nay);
1569
1570 if (nax && (numaGetCount(nax) != n))
1571 return (NUMA *)ERROR_PTR("nax and nay sizes differ", procName, NULL);
1572
1573 nad = numaCreate(0);
1574 numaGetFValue(nay, 0, &yval1);
1575 numaGetXParameters(nay, &startx, &delx);
1576 if (nax)
1577 numaGetFValue(nax, 0, &xval1);
1578 else
1579 xval1 = startx;
1580 for (i = 1; i < n; i++) {
1581 numaGetFValue(nay, i, &yval2);
1582 if (nax)
1583 numaGetFValue(nax, i, &xval2);
1584 else
1585 xval2 = startx + i * delx;
1586 delta1 = yval1 - thresh;
1587 delta2 = yval2 - thresh;
1588 if (delta1 == 0.0)
1589 numaAddNumber(nad, xval1);
1590 else if (delta2 == 0.0)
1591 numaAddNumber(nad, xval2);
1592 else if (delta1 * delta2 < 0.0) { /* crossing */
1593 fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
1594 crossval = xval1 + fract * (xval2 - xval1);
1595 numaAddNumber(nad, crossval);
1596 }
1597 xval1 = xval2;
1598 yval1 = yval2;
1599 }
1600
1601 return nad;
1602 }
1603
1604
1605 /*!
1606 * numaCrossingsByPeaks()
1607 *
1608 * Input: nax (<optional> numa of abscissa values)
1609 * nay (numa of ordinate values, corresponding to nax)
1610 * delta (parameter used to identify when a new peak can be found)
1611 * Return: nad (abscissa pts at threshold), or null on error
1612 *
1613 * Notes:
1614 * (1) If nax == NULL, we use startx and delx from nay to compute
1615 * the crossing values in nad.
1616 */
1617 NUMA *
numaCrossingsByPeaks(NUMA * nax,NUMA * nay,l_float32 delta)1618 numaCrossingsByPeaks(NUMA *nax,
1619 NUMA *nay,
1620 l_float32 delta)
1621 {
1622 l_int32 i, j, n, np, previndex, curindex;
1623 l_float32 startx, delx;
1624 l_float32 xval1, xval2, yval1, yval2, delta1, delta2;
1625 l_float32 prevval, curval, thresh, crossval, fract;
1626 NUMA *nap, *nad;
1627
1628 PROCNAME("numaCrossingsByPeaks");
1629
1630 if (!nax)
1631 return (NUMA *)ERROR_PTR("nax not defined", procName, NULL);
1632 if (!nay)
1633 return (NUMA *)ERROR_PTR("nay not defined", procName, NULL);
1634
1635 n = numaGetCount(nax);
1636 if (numaGetCount(nay) != n)
1637 return (NUMA *)ERROR_PTR("nax and nay sizes differ", procName, NULL);
1638
1639 /* Find the extrema. Also add last point in nay to get
1640 * the last transition (from the last peak to the end).
1641 * The number of crossings is 1 more than the number of extrema. */
1642 nap = numaFindExtrema(nay, delta);
1643 numaAddNumber(nap, n - 1);
1644 np = numaGetCount(nap);
1645 L_INFO_INT("Number of crossings: %d", procName, np);
1646
1647 /* Do all computation in index units of nax */
1648 nad = numaCreate(np); /* output crossings, in nax units */
1649 previndex = 0; /* prime the search with 1st point */
1650 numaGetFValue(nay, 0, &prevval); /* prime the search with 1st point */
1651 numaGetXParameters(nay, &startx, &delx);
1652 for (i = 0; i < np; i++) {
1653 numaGetIValue(nap, i, &curindex);
1654 numaGetFValue(nay, curindex, &curval);
1655 thresh = (prevval + curval) / 2.0;
1656 /* fprintf(stderr, "thresh[%d] = %7.3f\n", i, thresh); */
1657 if (nax)
1658 numaGetFValue(nax, previndex, &xval1);
1659 else
1660 xval1 = startx + previndex * delx;
1661 numaGetFValue(nay, previndex, &yval1);
1662 for (j = previndex + 1; j <= curindex; j++) {
1663 if (nax)
1664 numaGetFValue(nax, j, &xval2);
1665 else
1666 xval2 = startx + j * delx;
1667 numaGetFValue(nay, j, &yval2);
1668 delta1 = yval1 - thresh;
1669 delta2 = yval2 - thresh;
1670 if (delta1 == 0.0) {
1671 numaAddNumber(nad, xval1);
1672 break;
1673 }
1674 else if (delta2 == 0.0) {
1675 numaAddNumber(nad, xval2);
1676 break;
1677 }
1678 else if (delta1 * delta2 < 0.0) { /* crossing */
1679 fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
1680 crossval = xval1 + fract * (xval2 - xval1);
1681 numaAddNumber(nad, crossval);
1682 break;
1683 }
1684 xval1 = xval2;
1685 yval1 = yval2;
1686 }
1687 previndex = curindex;
1688 prevval = curval;
1689 }
1690
1691 numaDestroy(&nap);
1692 return nad;
1693 }
1694
1695
1696 /*!
1697 * numaEvalBestHaarParameters()
1698 *
1699 * Input: nas (numa of non-negative signal values)
1700 * relweight (relative weight of (-1 comb) / (+1 comb)
1701 * contributions to the 'convolution'. In effect,
1702 * the convolution kernel is a comb consisting of
1703 * alternating +1 and -weight.)
1704 * nwidth (number of widths to consider)
1705 * nshift (number of shifts to consider for each width)
1706 * minwidth (smallest width to consider)
1707 * maxwidth (largest width to consider)
1708 * &bestwidth (<return> width giving largest score)
1709 * &bestshift (<return> shift giving largest score)
1710 * &bestscore (<optional return> convolution with
1711 * "Haar"-like comb)
1712 * Return: 0 if OK, 1 on error
1713 *
1714 * Notes:
1715 * (1) This does a linear sweep of widths, evaluating at @nshift
1716 * shifts for each width, computing the score from a convolution
1717 * with a long comb, and finding the (width, shift) pair that
1718 * gives the maximum score. The best width is the "half-wavelength"
1719 * of the signal.
1720 * (2) The convolving function is a comb of alternating values
1721 * +1 and -1 * relweight, separated by the width and phased by
1722 * the shift. This is similar to a Haar transform, except
1723 * there the convolution is performed with a square wave.
1724 * (3) The function is useful for finding the line spacing
1725 * and strength of line signal from pixel sum projections.
1726 * (4) The score is normalized to the size of nas divided by
1727 * the number of half-widths. For image applications, the input is
1728 * typically an array of pixel projections, so one should
1729 * normalize by dividing the score by the image width in the
1730 * pixel projection direction.
1731 */
1732 l_int32
numaEvalBestHaarParameters(NUMA * nas,l_float32 relweight,l_int32 nwidth,l_int32 nshift,l_float32 minwidth,l_float32 maxwidth,l_float32 * pbestwidth,l_float32 * pbestshift,l_float32 * pbestscore)1733 numaEvalBestHaarParameters(NUMA *nas,
1734 l_float32 relweight,
1735 l_int32 nwidth,
1736 l_int32 nshift,
1737 l_float32 minwidth,
1738 l_float32 maxwidth,
1739 l_float32 *pbestwidth,
1740 l_float32 *pbestshift,
1741 l_float32 *pbestscore)
1742 {
1743 l_int32 i, j;
1744 l_float32 delwidth, delshift, width, shift, score;
1745 l_float32 bestwidth, bestshift, bestscore;
1746
1747 PROCNAME("numaEvalBestHaarParameters");
1748
1749 if (!nas)
1750 return ERROR_INT("nas not defined", procName, 1);
1751 if (!pbestwidth || !pbestshift)
1752 return ERROR_INT("&bestwidth and &bestshift not defined", procName, 1);
1753
1754 bestscore = 0.0;
1755 delwidth = (maxwidth - minwidth) / (nwidth - 1.0);
1756 for (i = 0; i < nwidth; i++) {
1757 width = minwidth + delwidth * i;
1758 delshift = width / (l_float32)(nshift);
1759 for (j = 0; j < nshift; j++) {
1760 shift = j * delshift;
1761 numaEvalHaarSum(nas, width, shift, relweight, &score);
1762 if (score > bestscore) {
1763 bestscore = score;
1764 bestwidth = width;
1765 bestshift = shift;
1766 #if DEBUG_FREQUENCY
1767 fprintf(stderr, "width = %7.3f, shift = %7.3f, score = %7.3f\n",
1768 width, shift, score);
1769 #endif /* DEBUG_FREQUENCY */
1770 }
1771 }
1772 }
1773
1774 *pbestwidth = bestwidth;
1775 *pbestshift = bestshift;
1776 if (pbestscore)
1777 *pbestscore = bestscore;
1778 return 0;
1779 }
1780
1781
1782 /*!
1783 * numaEvalHaarSum()
1784 *
1785 * Input: nas (numa of non-negative signal values)
1786 * width (distance between +1 and -1 in convolution comb)
1787 * shift (phase of the comb: location of first +1)
1788 * relweight (relative weight of (-1 comb) / (+1 comb)
1789 * contributions to the 'convolution'. In effect,
1790 * the convolution kernel is a comb consisting of
1791 * alternating +1 and -weight.)
1792 * &score (<return> convolution with "Haar"-like comb)
1793 * Return: 0 if OK, 1 on error
1794 *
1795 * Notes:
1796 * (1) This does a convolution with a comb of alternating values
1797 * +1 and -relweight, separated by the width and phased by the shift.
1798 * This is similar to a Haar transform, except that for Haar,
1799 * (1) the convolution kernel is symmetric about 0, so the
1800 * relweight is 1.0, and
1801 * (2) the convolution is performed with a square wave.
1802 * (2) The score is normalized to the size of nas divided by
1803 * twice the "width". For image applications, the input is
1804 * typically an array of pixel projections, so one should
1805 * normalize by dividing the score by the image width in the
1806 * pixel projection direction.
1807 * (3) To get a Haar-like result, use relweight = 1.0. For detecting
1808 * signals where you expect every other sample to be close to
1809 * zero, as with barcodes or filtered text lines, you can
1810 * use relweight > 1.0.
1811 */
1812 l_int32
numaEvalHaarSum(NUMA * nas,l_float32 width,l_float32 shift,l_float32 relweight,l_float32 * pscore)1813 numaEvalHaarSum(NUMA *nas,
1814 l_float32 width,
1815 l_float32 shift,
1816 l_float32 relweight,
1817 l_float32 *pscore)
1818 {
1819 l_int32 i, n, nsamp, index;
1820 l_float32 score, weight, val;
1821
1822 PROCNAME("numaEvalHaarSum");
1823
1824 if (!pscore)
1825 return ERROR_INT("&score not defined", procName, 1);
1826 *pscore = 0.0;
1827 if (!nas)
1828 return ERROR_INT("nas not defined", procName, 1);
1829 if ((n = numaGetCount(nas)) < 2 * width)
1830 return ERROR_INT("nas size too small", procName, 1);
1831
1832 score = 0.0;
1833 nsamp = (l_int32)((n - shift) / width);
1834 for (i = 0; i < nsamp; i++) {
1835 index = (l_int32)(shift + i * width);
1836 weight = (i % 2) ? 1.0 : -1.0 * relweight;
1837 numaGetFValue(nas, index, &val);
1838 score += weight * val;
1839 }
1840
1841 *pscore = 2.0 * width * score / (l_float32)n;
1842 return 0;
1843 }
1844
1845