• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -  This software is distributed in the hope that it will be
4  -  useful, but with NO WARRANTY OF ANY KIND.
5  -  No author or distributor accepts responsibility to anyone for the
6  -  consequences of using this software, or for whether it serves any
7  -  particular purpose or works at all, unless he or she says so in
8  -  writing.  Everyone is granted permission to copy, modify and
9  -  redistribute this source code, for commercial or non-commercial
10  -  purposes, with the following restrictions: (1) the origin of this
11  -  source code must not be misrepresented; (2) modified versions must
12  -  be plainly marked as such; and (3) this notice may not be removed
13  -  or altered from any source or modified source distribution.
14  *====================================================================*/
15 
16 /*
17  *   numafunc2.c
18  *
19  *      Transformations
20  *          NUMA        *numaTransform()
21  *          NUMA        *numaConvolve()
22  *          NUMA        *numaConvertToInt()
23  *
24  *      Histogram generation and statistics
25  *          NUMA        *numaMakeHistogram()
26  *          NUMA        *numaMakeHistogramAuto()
27  *          NUMA        *numaMakeHistogramClipped()
28  *          NUMA        *numaRebinHistogram()
29  *          NUMA        *numaNormalizeHistogram()
30  *          l_int32      numaGetStatsUsingHistogram()
31  *          l_int32      numaGetHistogramStats()
32  *          l_int32      numaGetHistogramStatsOnInterval()
33  *          l_int32      numaMakeRankFromHistogram()
34  *          l_int32      numaHistogramGetRankFromVal()
35  *          l_int32      numaHistogramGetValFromRank()
36  *
37  *      Splitting a distribution
38  *          l_int32      numaSplitDistribution()
39  *
40  *      Extrema finding
41  *          NUMA        *numaFindPeaks()
42  *          NUMA        *numaFindExtrema()
43  *
44  *      Threshold crossings and frequency analysis
45  *          l_int32      numaSelectCrossingThreshold()
46  *          NUMA        *numaCrossingsByThreshold()
47  *          NUMA        *numaCrossingsByPeaks()
48  *          NUMA        *numaEvalBestHaarParameters()
49  *          l_int32      numaEvalHaarSum()
50  *
51  *    Things to remember when using the Numa:
52  *
53  *    (1) The numa is a struct, not an array.  Always use accessors
54  *        (see numabasic.c), never the fields directly.
55  *
56  *    (2) The number array holds l_float32 values.  It can also
57  *        be used to store l_int32 values.  See numabasic.c for
58  *        details on using the accessors.
59  *
60  *    (3) Occasionally, in the comments we denote the i-th element of a
61  *        numa by na[i].  This is conceptual only -- the numa is not an array!
62  *
63  *    Some general comments on histograms:
64  *
65  *    (1) Histograms are the generic statistical representation of
66  *        the data about some attribute.  Typically they're not
67  *        normalized -- they simply give the number of occurrences
68  *        within each range of values of the attribute.  This range
69  *        of values is referred to as a 'bucket'.  For example,
70  *        the histogram could specify how many connected components
71  *        are found for each value of their width; in that case,
72  *        the bucket size is 1.
73  *
74  *    (2) In leptonica, all buckets have the same size.  Histograms
75  *        are therefore specified by a numa of occurrences, along
76  *        with two other numbers: the 'value' associated with the
77  *        occupants of the first bucket and the size (i.e., 'width')
78  *        of each bucket.  These two numbers then allow us to calculate
79  *        the value associated with the occupants of each bucket.
80  *        These numbers are fields in the numa, initialized to
81  *        a startx value of 0.0 and a binsize of 1.0.  Accessors for
82  *        these fields are functions numa*XParameters().  All histograms
83  *        must have these two numbers properly set.
84  */
85 
86 #include <stdio.h>
87 #include <string.h>
88 #include <stdlib.h>
89 #include <math.h>
90 #include "allheaders.h"
91 
92     /* bin sizes in numaMakeHistogram() */
93 static const l_int32 BinSizeArray[] = {2, 5, 10, 20, 50, 100, 200, 500, 1000,\
94                       2000, 5000, 10000, 20000, 50000, 100000, 200000,\
95                       500000, 1000000, 2000000, 5000000, 10000000,\
96                       200000000, 50000000, 100000000};
97 static const l_int32 NBinSizes = 24;
98 
99 
100 #ifndef  NO_CONSOLE_IO
101 #define  DEBUG_HISTO        0
102 #define  DEBUG_CROSSINGS    0
103 #define  DEBUG_FREQUENCY    0
104 #endif  /* ~NO_CONSOLE_IO */
105 
106 
107 /*----------------------------------------------------------------------*
108  *                             Transformations                          *
109  *----------------------------------------------------------------------*/
110 /*!
111  *  numaTransform()
112  *
113  *      Input:  nas
114  *              shift (add this to each number)
115  *              scale (multiply each number by this)
116  *      Return: na with all values shifted and scaled, or null on error
117  *
118  *  Notes:
119  *      (1) Each number is shifted before scaling.
120  *      (2) The operation sequence is opposite to that for Box and Pta:
121  *          scale first, then shift.
122  */
123 NUMA *
numaTransform(NUMA * nas,l_float32 shift,l_float32 scale)124 numaTransform(NUMA      *nas,
125               l_float32  shift,
126               l_float32  scale)
127 {
128 l_int32    i, n;
129 l_float32  val;
130 NUMA      *nad;
131 
132     PROCNAME("numaTransform");
133 
134     if (!nas)
135         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
136     n = numaGetCount(nas);
137     if ((nad = numaCreate(n)) == NULL)
138         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
139     for (i = 0; i < n; i++) {
140         numaGetFValue(nas, i, &val);
141         val = scale * val + shift;
142         numaAddNumber(nad, val);
143     }
144     return nad;
145 }
146 
147 
148 /*!
149  *  numaConvolve()
150  *
151  *      Input:  na
152  *              halfwidth (of rectangular filter, minus the center)
153  *      Return: na (after low-pass filtering), or null on error
154  *
155  *  Notes:
156  *      (1) Full convolution takes place only from i = halfwidth to
157  *          i = n - halfwidth - 1.  We do the end parts using only
158  *          the partial array available.  We do not pad the ends with 0.
159  *      (2) This implementation assumes specific fields in the Numa!
160  */
161 NUMA *
numaConvolve(NUMA * na,l_int32 halfwidth)162 numaConvolve(NUMA    *na,
163              l_int32  halfwidth)
164 {
165 l_int32     i, n, rval;
166 l_float32   sum, norm;
167 l_float32  *array, *carray, *sumarray;
168 NUMA       *nac;
169 
170     PROCNAME("numaConvolve");
171 
172     if (!na)
173         return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
174     n = numaGetCount(na);
175     if (2 * halfwidth + 1 > n)
176         L_WARNING("filter wider than input array!", procName);
177     array = na->array;
178 
179     if ((nac = numaCreate(n)) == NULL)
180         return (NUMA *)ERROR_PTR("nac not made", procName, NULL);
181     carray = nac->array;
182     nac->n = n;  /* fill with zeroes */
183 
184         /* Make sum array; note the indexing */
185     if ((sumarray = (l_float32 *)CALLOC(n + 1, sizeof(l_float32))) == NULL)
186         return (NUMA *)ERROR_PTR("sumarray not made", procName, NULL);
187     sum = 0.0;
188     sumarray[0] = 0.0;
189     for (i = 0; i < n; i++) {
190         sum += array[i];
191         sumarray[i + 1] = sum;
192     }
193 
194         /* Central part */
195     norm = 1. / (2 * halfwidth + 1);
196     rval = n - halfwidth;
197     for (i = halfwidth; i < rval; ++i)
198         carray[i] = norm *
199                       (sumarray[i + halfwidth + 1] - sumarray[i - halfwidth]);
200 
201         /* Left side */
202     for (i = 0; i < halfwidth; i++)
203         carray[i] = sumarray[i + halfwidth + 1] / (halfwidth + i + 1);
204 
205         /* Right side */
206     for (i = rval; i < n; i++)
207         carray[i] = (1. / (n - i + halfwidth)) *
208                        (sumarray[n] - sumarray[i - halfwidth]);
209 
210     FREE(sumarray);
211     return nac;
212 }
213 
214 
215 /*!
216  *  numaConvertToInt()
217  *
218  *      Input:  na
219  *      Return: na with all values rounded to nearest integer, or
220  *              null on error
221  */
222 NUMA *
numaConvertToInt(NUMA * nas)223 numaConvertToInt(NUMA  *nas)
224 {
225 l_int32  i, n, ival;
226 NUMA    *nad;
227 
228     PROCNAME("numaConvertToInt");
229 
230     if (!nas)
231         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
232 
233     n = numaGetCount(nas);
234     if ((nad = numaCreate(n)) == NULL)
235         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
236     for (i = 0; i < n; i++) {
237         numaGetIValue(nas, i, &ival);
238         numaAddNumber(nad, ival);
239     }
240     return nad;
241 }
242 
243 
244 /*----------------------------------------------------------------------*
245  *                 Histogram generation and statistics                  *
246  *----------------------------------------------------------------------*/
247 /*!
248  *  numaMakeHistogram()
249  *
250  *      Input:  na
251  *              maxbins (max number of histogram bins)
252  *              &binsize  (<return> size of histogram bins)
253  *              &binstart (<optional return> start val of minimum bin;
254  *                         input NULL to force start at 0)
255  *      Return: na consisiting of histogram of integerized values,
256  *              or null on error.
257  *
258  *  Note:
259  *      (1) This simple interface is designed for integer data.
260  *          The bins are of integer width and start on integer boundaries,
261  *          so the results on float data will not have high precision.
262  *      (2) Specify the max number of input bins.   Then @binsize,
263  *          the size of bins necessary to accommodate the input data,
264  *          is returned.  It is one of the sequence:
265  *                {1, 2, 5, 10, 20, 50, ...}.
266  *      (3) If &binstart is given, all values are accommodated,
267  *          and the min value of the starting bin is returned.
268  *          Otherwise, all negative values are discarded and
269  *          the histogram bins start at 0.
270  */
271 NUMA *
numaMakeHistogram(NUMA * na,l_int32 maxbins,l_int32 * pbinsize,l_int32 * pbinstart)272 numaMakeHistogram(NUMA     *na,
273                   l_int32   maxbins,
274                   l_int32  *pbinsize,
275                   l_int32  *pbinstart)
276 {
277 l_int32    i, n, ival, hval;
278 l_int32    iminval, imaxval, range, binsize, nbins, ibin;
279 l_float32  val, ratio;
280 NUMA      *nai, *nahist;
281 
282     PROCNAME("numaMakeHistogram");
283 
284     if (!na)
285         return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
286     if (!pbinsize)
287         return (NUMA *)ERROR_PTR("&binsize not defined", procName, NULL);
288 
289         /* Determine input range */
290     numaGetMin(na, &val, NULL);
291     iminval = (l_int32)(val + 0.5);
292     numaGetMax(na, &val, NULL);
293     imaxval = (l_int32)(val + 0.5);
294     if (pbinstart == NULL) {  /* clip negative vals; start from 0 */
295         iminval = 0;
296         if (imaxval < 0)
297             return (NUMA *)ERROR_PTR("all values < 0", procName, NULL);
298     }
299 
300         /* Determine binsize */
301     range = imaxval - iminval + 1;
302     if (range > maxbins - 1) {
303         ratio = (l_float64)range / (l_float64)maxbins;
304         binsize = 0;
305         for (i = 0; i < NBinSizes; i++) {
306             if (ratio < BinSizeArray[i]) {
307                 binsize = BinSizeArray[i];
308                 break;
309             }
310         }
311         if (binsize == 0)
312             return (NUMA *)ERROR_PTR("numbers too large", procName, NULL);
313     }
314     else
315         binsize = 1;
316     *pbinsize = binsize;
317     nbins = 1 + range / binsize;  /* +1 seems to be sufficient */
318 
319         /* Redetermine iminval */
320     if (pbinstart && binsize > 1) {
321         if (iminval >= 0)
322             iminval = binsize * (iminval / binsize);
323         else
324             iminval = binsize * ((iminval - binsize + 1) / binsize);
325     }
326     if (pbinstart)
327         *pbinstart = iminval;
328 
329 #if  DEBUG_HISTO
330     fprintf(stderr, " imaxval = %d, range = %d, nbins = %d\n",
331             imaxval, range, nbins);
332 #endif  /* DEBUG_HISTO */
333 
334         /* Use integerized data for input */
335     if ((nai = numaConvertToInt(na)) == NULL)
336         return (NUMA *)ERROR_PTR("nai not made", procName, NULL);
337     n = numaGetCount(nai);
338 
339         /* Make histogram, converting value in input array
340          * into a bin number for this histogram array. */
341     if ((nahist = numaCreate(nbins)) == NULL)
342         return (NUMA *)ERROR_PTR("nahist not made", procName, NULL);
343     numaSetCount(nahist, nbins);
344     numaSetXParameters(nahist, iminval, binsize);
345     for (i = 0; i < n; i++) {
346         numaGetIValue(nai, i, &ival);
347         ibin = (ival - iminval) / binsize;
348         if (ibin >= 0 && ibin < nbins) {
349             numaGetIValue(nahist, ibin, &hval);
350             numaSetValue(nahist, ibin, hval + 1.0);
351         }
352     }
353 
354     numaDestroy(&nai);
355     return nahist;
356 }
357 
358 
359 /*!
360  *  numaMakeHistogramAuto()
361  *
362  *      Input:  na (numa of floats; these may be integers)
363  *              maxbins (max number of histogram bins; >= 1)
364  *      Return: na consisiting of histogram of quantized float values,
365  *              or null on error.
366  *
367  *  Notes:
368  *      (1) This simple interface is designed for accurate binning
369  *          of both integer and float data.
370  *      (2) If the array data is integers, and the range of integers
371  *          is smaller than @maxbins, they are binned as they fall,
372  *          with binsize = 1.
373  *      (3) If the range of data, (maxval - minval), is larger than
374  *          @maxbins, or if the data is floats, they are binned into
375  *          exactly @maxbins bins.
376  *      (4) Unlike numaMakeHistogram(), these bins in general have
377  *          non-integer location and width, even for integer data.
378  */
379 NUMA *
numaMakeHistogramAuto(NUMA * na,l_int32 maxbins)380 numaMakeHistogramAuto(NUMA    *na,
381                       l_int32  maxbins)
382 {
383 l_int32    i, n, imin, imax, irange, ibin, ival, allints;
384 l_float32  minval, maxval, range, binsize, fval;
385 NUMA      *nah;
386 
387     PROCNAME("numaMakeHistogramAuto");
388 
389     if (!na)
390         return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
391     maxbins = L_MAX(1, maxbins);
392 
393         /* Determine input range */
394     numaGetMin(na, &minval, NULL);
395     numaGetMax(na, &maxval, NULL);
396 
397         /* Determine if values are all integers */
398     n = numaGetCount(na);
399     numaHasOnlyIntegers(na, maxbins, &allints);
400 
401         /* Do simple integer binning if possible */
402     if (allints && (maxval - minval < maxbins)) {
403         imin = (l_int32)minval;
404         imax = (l_int32)maxval;
405         irange = imax - imin + 1;
406         nah = numaCreate(irange);
407         numaSetCount(nah, irange);  /* init */
408         numaSetXParameters(nah, minval, 1.0);
409         for (i = 0; i < n; i++) {
410             numaGetIValue(na, i, &ival);
411             ibin = ival - imin;
412             numaGetIValue(nah, ibin, &ival);
413             numaSetValue(nah, ibin, ival + 1.0);
414         }
415 
416         return nah;
417     }
418 
419         /* Do float binning, even if the data is integers. */
420     range = maxval - minval;
421     binsize = range / (l_float32)maxbins;
422     if (range == 0.0) {
423         nah = numaCreate(1);
424         numaSetXParameters(nah, minval, binsize);
425         numaAddNumber(nah, n);
426         return nah;
427     }
428     nah = numaCreate(maxbins);
429     numaSetCount(nah, maxbins);
430     numaSetXParameters(nah, minval, binsize);
431     for (i = 0; i < n; i++) {
432         numaGetFValue(na, i, &fval);
433         ibin = (l_int32)((fval - minval) / binsize);
434         numaGetIValue(nah, ibin, &ival);
435         numaSetValue(nah, ibin, ival + 1.0);
436     }
437 
438     return nah;
439 }
440 
441 
442 /*!
443  *  numaMakeHistogramClipped()
444  *
445  *      Input:  na
446  *              binsize (typically 1.0)
447  *              maxsize (of histogram ordinate)
448  *      Return: na (histogram of bins of size @binsize, starting with
449  *                  the na[0] (x = 0.0) and going up to a maximum of
450  *                  x = @maxsize, by increments of @binsize), or null on error
451  *
452  *  Notes:
453  *      (1) This simple function generates a histogram of values
454  *          from na, discarding all values < 0.0 or greater than
455  *          min(@maxsize, maxval), where maxval is the maximum value in na.
456  *          The histogram data is put in bins of size delx = @binsize,
457  *          starting at x = 0.0.  We use as many bins as are
458  *          needed to hold the data.
459  */
460 NUMA *
numaMakeHistogramClipped(NUMA * na,l_float32 binsize,l_float32 maxsize)461 numaMakeHistogramClipped(NUMA      *na,
462                          l_float32  binsize,
463                          l_float32  maxsize)
464 {
465 l_int32    i, n, nbins, ival, ibin;
466 l_float32  val, maxval;
467 NUMA      *nad;
468 
469     PROCNAME("numaMakeHistogramClipped");
470 
471     if (!na)
472         return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
473     if (binsize <= 0.0)
474         return (NUMA *)ERROR_PTR("binsize must be > 0.0", procName, NULL);
475     if (binsize > maxsize)
476         binsize = maxsize;  /* just one bin */
477 
478     numaGetMax(na, &maxval, NULL);
479     n = numaGetCount(na);
480     maxsize = L_MIN(maxsize, maxval);
481     nbins = (l_int32)(maxsize / binsize) + 1;
482 
483 /*    fprintf(stderr, "maxsize = %7.3f, nbins = %d\n", maxsize, nbins); */
484 
485     if ((nad = numaCreate(nbins)) == NULL)
486         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
487     numaSetXParameters(nad, 0.0, binsize);
488     numaSetCount(nad, nbins);  /* interpret zeroes in bins as data */
489     for (i = 0; i < n; i++) {
490         numaGetFValue(na, i, &val);
491         ibin = (l_int32)(val / binsize);
492         if (ibin >= 0 && ibin < nbins) {
493             numaGetIValue(nad, ibin, &ival);
494             numaSetValue(nad, ibin, ival + 1.0);
495         }
496     }
497 
498     return nad;
499 }
500 
501 
502 /*!
503  *  numaRebinHistogram()
504  *
505  *      Input:  nas (input histogram)
506  *              newsize (number of old bins contained in each new bin)
507  *      Return: nad (more coarsely re-binned histogram), or null on error
508  */
509 NUMA *
numaRebinHistogram(NUMA * nas,l_int32 newsize)510 numaRebinHistogram(NUMA    *nas,
511                    l_int32  newsize)
512 {
513 l_int32    i, j, ns, nd, index, count, val;
514 l_float32  start, oldsize;
515 NUMA      *nad;
516 
517     PROCNAME("numaRebinHistogram");
518 
519     if (!nas)
520         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
521     if (newsize <= 1)
522         return (NUMA *)ERROR_PTR("newsize must be > 1", procName, NULL);
523     if ((ns = numaGetCount(nas)) == 0)
524         return (NUMA *)ERROR_PTR("no bins in nas", procName, NULL);
525 
526     nd = (ns + newsize - 1) / newsize;
527     if ((nad = numaCreate(nd)) == NULL)
528         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
529     numaGetXParameters(nad, &start, &oldsize);
530     numaSetXParameters(nad, start, oldsize * newsize);
531 
532     for (i = 0; i < nd; i++) {  /* new bins */
533         count = 0;
534         index = i * newsize;
535         for (j = 0; j < newsize; j++) {
536             if (index < ns) {
537                 numaGetIValue(nas, index, &val);
538                 count += val;
539                 index++;
540             }
541         }
542         numaAddNumber(nad, count);
543     }
544 
545     return nad;
546 }
547 
548 
549 /*!
550  *  numaNormalizeHistogram()
551  *
552  *      Input:  nas (input histogram)
553  *              area (target sum of all numbers in dest histogram;
554  *                    e.g., use area = 1.0 if this represents a
555  *                    probability distribution)
556  *      Return: nad (normalized histogram), or null on error
557  */
558 NUMA *
numaNormalizeHistogram(NUMA * nas,l_float32 area)559 numaNormalizeHistogram(NUMA      *nas,
560                        l_float32  area)
561 {
562 l_int32    i, ns;
563 l_float32  sum, factor, fval;
564 NUMA      *nad;
565 
566     PROCNAME("numaNormalizeHistogram");
567 
568     if (!nas)
569         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
570     if (area <= 0.0)
571         return (NUMA *)ERROR_PTR("area must be > 0.0", procName, NULL);
572     if ((ns = numaGetCount(nas)) == 0)
573         return (NUMA *)ERROR_PTR("no bins in nas", procName, NULL);
574 
575     numaGetSum(nas, &sum);
576     factor = area / sum;
577 
578     if ((nad = numaCreate(ns)) == NULL)
579         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
580     numaCopyXParameters(nad, nas);
581 
582     for (i = 0; i < ns; i++) {
583         numaGetFValue(nas, i, &fval);
584         fval *= factor;
585         numaAddNumber(nad, fval);
586     }
587 
588     return nad;
589 }
590 
591 
592 /*!
593  *  numaGetStatsUsingHistogram()
594  *
595  *      Input:  na (an arbitrary set of numbers; not ordered and not
596  *                  a histogram)
597  *              maxbins (the maximum number of bins to be allowed in
598  *                       the histogram; use 0 for consecutive integer bins)
599  *              &min (<optional return> min value of set)
600  *              &max (<optional return> max value of set)
601  *              &mean (<optional return> mean value of set)
602  *              &variance (<optional return> variance)
603  *              &median (<optional return> median value of set)
604  *              rank (in [0.0 ... 1.0]; median has a rank 0.5; ignored
605  *                    if &rval == NULL)
606  *              &rval (<optional return> value in na corresponding to @rank)
607  *              &histo (<optional return> Numa histogram; use NULL to prevent)
608  *      Return: 0 if OK, 1 on error
609  *
610  *  Notes:
611  *      (1) This is a simple interface for gathering statistics
612  *          from a numa, where a histogram is used 'under the covers'
613  *          to avoid sorting if a rank value is requested.  In that case,
614  *          by using a histogram we are trading speed for accuracy, because
615  *          the values in @na are quantized to the center of a set of bins.
616  *      (2) If the median, other rank value, or histogram are not requested,
617  *          the calculation is all performed on the input Numa.
618  *      (3) The variance is the average of the square of the
619  *          difference from the mean.  The median is the value in na
620  *          with rank 0.5.
621  *      (4) There are two situations where this gives rank results with
622  *          accuracy comparable to computing stastics directly on the input
623  *          data, without binning into a histogram:
624  *           (a) the data is integers and the range of data is less than
625  *               @maxbins, and
626  *           (b) the data is floats and the range is small compared to
627  *               @maxbins, so that the binsize is much less than 1.
628  *      (5) If a histogram is used and the numbers in the Numa extend
629  *          over a large range, you can limit the required storage by
630  *          specifying the maximum number of bins in the histogram.
631  *          Use @maxbins == 0 to force the bin size to be 1.
632  *      (6) This optionally returns the median and one arbitrary rank value.
633  *          If you need several rank values, return the histogram and use
634  *               numaHistogramGetValFromRank(nah, rank, &rval)
635  *          multiple times.
636  */
637 l_int32
numaGetStatsUsingHistogram(NUMA * na,l_int32 maxbins,l_float32 * pmin,l_float32 * pmax,l_float32 * pmean,l_float32 * pvariance,l_float32 * pmedian,l_float32 rank,l_float32 * prval,NUMA ** phisto)638 numaGetStatsUsingHistogram(NUMA       *na,
639                            l_int32     maxbins,
640                            l_float32  *pmin,
641                            l_float32  *pmax,
642                            l_float32  *pmean,
643                            l_float32  *pvariance,
644                            l_float32  *pmedian,
645                            l_float32   rank,
646                            l_float32  *prval,
647                            NUMA      **phisto)
648 {
649 l_int32    i, n;
650 l_float32  minval, maxval, fval, mean, sum;
651 NUMA      *nah;
652 
653     PROCNAME("numaGetStatsUsingHistogram");
654 
655     if (pmin) *pmin = 0.0;
656     if (pmax) *pmax = 0.0;
657     if (pmean) *pmean = 0.0;
658     if (pmedian) *pmedian = 0.0;
659     if (pvariance) *pvariance = 0.0;
660     if (!na)
661         return ERROR_INT("na not defined", procName, 1);
662     if ((n = numaGetCount(na)) == 0)
663         return ERROR_INT("numa is empty", procName, 1);
664 
665     numaGetMin(na, &minval, NULL);
666     numaGetMax(na, &maxval, NULL);
667     if (pmin) *pmin = minval;
668     if (pmax) *pmax = maxval;
669     if (pmean || pvariance) {
670         sum = 0.0;
671         for (i = 0; i < n; i++) {
672             numaGetFValue(na, i, &fval);
673             sum += fval;
674         }
675         mean = sum / (l_float32)n;
676         if (pmean) *pmean = mean;
677     }
678     if (pvariance) {
679         sum = 0.0;
680         for (i = 0; i < n; i++) {
681             numaGetFValue(na, i, &fval);
682             sum += fval * fval;
683         }
684         *pvariance = sum / (l_float32)n - mean * mean;
685     }
686 
687     if (!pmedian && !prval && !phisto)
688         return 0;
689 
690     nah = numaMakeHistogramAuto(na, maxbins);
691     if (pmedian)
692         numaHistogramGetValFromRank(nah, 0.5, pmedian);
693     if (prval)
694         numaHistogramGetValFromRank(nah, rank, prval);
695     if (phisto)
696         *phisto = nah;
697     else
698         numaDestroy(&nah);
699     return 0;
700 }
701 
702 
703 /*!
704  *  numaGetHistogramStats()
705  *
706  *      Input:  nahisto (histogram: y(x(i)), i = 0 ... nbins - 1)
707  *              startx (x value of first bin: x(0))
708  *              deltax (x increment between bins; the bin size; x(1) - x(0))
709  *              &xmean (<optional return> mean value of histogram)
710  *              &xmedian (<optional return> median value of histogram)
711  *              &xmode (<optional return> mode value of histogram:
712  *                     xmode = x(imode), where y(xmode) >= y(x(i)) for
713  *                     all i != imode)
714  *              &xvariance (<optional return> variance of x)
715  *      Return: 0 if OK, 1 on error
716  *
717  *  Notes:
718  *      (1) If the histogram represents the relation y(x), the
719  *          computed values that are returned are the x values.
720  *          These are NOT the bucket indices i; they are related to the
721  *          bucket indices by
722  *                x(i) = startx + i * deltax
723  */
724 l_int32
numaGetHistogramStats(NUMA * nahisto,l_float32 startx,l_float32 deltax,l_float32 * pxmean,l_float32 * pxmedian,l_float32 * pxmode,l_float32 * pxvariance)725 numaGetHistogramStats(NUMA       *nahisto,
726                       l_float32   startx,
727                       l_float32   deltax,
728                       l_float32  *pxmean,
729                       l_float32  *pxmedian,
730                       l_float32  *pxmode,
731                       l_float32  *pxvariance)
732 {
733     PROCNAME("numaGetHistogramStats");
734 
735     if (pxmean) *pxmean = 0.0;
736     if (pxmedian) *pxmedian = 0.0;
737     if (pxmode) *pxmode = 0.0;
738     if (pxvariance) *pxvariance = 0.0;
739     if (!nahisto)
740         return ERROR_INT("nahisto not defined", procName, 1);
741 
742     return numaGetHistogramStatsOnInterval(nahisto, startx, deltax, 0, 0,
743                                            pxmean, pxmedian, pxmode,
744                                            pxvariance);
745 }
746 
747 
748 /*!
749  *  numaGetHistogramStatsOnInterval()
750  *
751  *      Input:  nahisto (histogram: y(x(i)), i = 0 ... nbins - 1)
752  *              startx (x value of first bin: x(0))
753  *              deltax (x increment between bins; the bin size; x(1) - x(0))
754  *              ifirst (first bin to use for collecting stats)
755  *              ilast (last bin for collecting stats; use 0 to go to the end)
756  *              &xmean (<optional return> mean value of histogram)
757  *              &xmedian (<optional return> median value of histogram)
758  *              &xmode (<optional return> mode value of histogram:
759  *                     xmode = x(imode), where y(xmode) >= y(x(i)) for
760  *                     all i != imode)
761  *              &xvariance (<optional return> variance of x)
762  *      Return: 0 if OK, 1 on error
763  *
764  *  Notes:
765  *      (1) If the histogram represents the relation y(x), the
766  *          computed values that are returned are the x values.
767  *          These are NOT the bucket indices i; they are related to the
768  *          bucket indices by
769  *                x(i) = startx + i * deltax
770  */
771 l_int32
numaGetHistogramStatsOnInterval(NUMA * nahisto,l_float32 startx,l_float32 deltax,l_int32 ifirst,l_int32 ilast,l_float32 * pxmean,l_float32 * pxmedian,l_float32 * pxmode,l_float32 * pxvariance)772 numaGetHistogramStatsOnInterval(NUMA       *nahisto,
773                                 l_float32   startx,
774                                 l_float32   deltax,
775                                 l_int32     ifirst,
776                                 l_int32     ilast,
777                                 l_float32  *pxmean,
778                                 l_float32  *pxmedian,
779                                 l_float32  *pxmode,
780                                 l_float32  *pxvariance)
781 {
782 l_int32    i, n, imax;
783 l_float32  sum, sumval, halfsum, moment, var, x, y, ymax;
784 
785     PROCNAME("numaGetHistogramStats");
786 
787     if (pxmean) *pxmean = 0.0;
788     if (pxmedian) *pxmedian = 0.0;
789     if (pxmode) *pxmode = 0.0;
790     if (pxvariance) *pxvariance = 0.0;
791     if (!nahisto)
792         return ERROR_INT("nahisto not defined", procName, 1);
793     if (!pxmean && !pxmedian && !pxmode && !pxvariance)
794         return ERROR_INT("nothing to compute", procName, 1);
795 
796     n = numaGetCount(nahisto);
797     if (ilast <= 0) ilast = n - 1;
798     if (ifirst < 0) ifirst = 0;
799     if (ifirst > ilast || ifirst > n - 1)
800         return ERROR_INT("ifirst is too large", procName, 1);
801     for (sum = 0.0, moment = 0.0, var = 0.0, i = ifirst; i <= ilast ; i++) {
802         x = startx + i * deltax;
803         numaGetFValue(nahisto, i, &y);
804         sum += y;
805         moment += x * y;
806         var += x * x * y;
807     }
808     if (sum == 0.0)
809         return ERROR_INT("sum is 0", procName, 1);
810 
811     if (pxmean)
812         *pxmean = moment / sum;
813     if (pxvariance)
814         *pxvariance = var / sum - moment * moment / (sum * sum);
815 
816     if (pxmedian) {
817         halfsum = sum / 2.0;
818         for (sumval = 0.0, i = ifirst; i <= ilast; i++) {
819             numaGetFValue(nahisto, i, &y);
820             sumval += y;
821             if (sumval >= halfsum) {
822                 *pxmedian = startx + i * deltax;
823                 break;
824             }
825         }
826     }
827 
828     if (pxmode) {
829         ymax = -1.0e10;
830         for (i = ifirst; i <= ilast; i++) {
831             numaGetFValue(nahisto, i, &y);
832             if (y > ymax) {
833                 ymax = y;
834                 imax = i;
835             }
836         }
837         *pxmode = startx + imax * deltax;
838     }
839 
840     return 0;
841 }
842 
843 
844 /*!
845  *  numaMakeRankFromHistogram()
846  *
847  *      Input:  startx (xval corresponding to first element in nay)
848  *              deltax (x increment between array elements in nay)
849  *              nasy (input histogram, assumed equally spaced)
850  *              npts (number of points to evaluate rank function)
851  *              &nax (<optional return> array of x values in range)
852  *              &nay (<return> rank array of specified npts)
853  *      Return: 0 if OK, 1 on error
854  */
855 l_int32
numaMakeRankFromHistogram(l_float32 startx,l_float32 deltax,NUMA * nasy,l_int32 npts,NUMA ** pnax,NUMA ** pnay)856 numaMakeRankFromHistogram(l_float32  startx,
857                           l_float32  deltax,
858                           NUMA      *nasy,
859                           l_int32    npts,
860                           NUMA     **pnax,
861                           NUMA     **pnay)
862 {
863 l_int32    i, n;
864 l_float32  sum, fval;
865 NUMA      *nan, *nar;
866 
867     PROCNAME("numaMakeRankFromHistogram");
868 
869     if (pnax) *pnax = NULL;
870     if (!pnay)
871         return ERROR_INT("&nay not defined", procName, 1);
872     *pnay = NULL;
873     if (!nasy)
874         return ERROR_INT("nasy not defined", procName, 1);
875     if (!pnay)
876         return ERROR_INT("&nay not defined", procName, 1);
877     if ((n = numaGetCount(nasy)) == 0)
878         return ERROR_INT("no bins in nas", procName, 1);
879 
880         /* Normalize and generate the rank array corresponding to
881          * the binned histogram. */
882     nan = numaNormalizeHistogram(nasy, 1.0);
883     nar = numaCreate(n + 1);  /* rank numa corresponding to nan */
884     sum = 0.0;
885     numaAddNumber(nar, sum);  /* first element is 0.0 */
886     for (i = 0; i < n; i++) {
887         numaGetFValue(nan, i, &fval);
888         sum += fval;
889         numaAddNumber(nar, sum);
890     }
891 
892         /* Compute rank array on full range with specified
893          * number of points and correspondence to x-values. */
894     numaInterpolateEqxInterval(startx, deltax, nar, L_LINEAR_INTERP,
895                                startx, startx + n * deltax, npts,
896                                pnax, pnay);
897     numaDestroy(&nan);
898     numaDestroy(&nar);
899     return 0;
900 }
901 
902 
903 /*!
904  *  numaHistogramGetRankFromVal()
905  *
906  *      Input:  na (histogram)
907  *              rval (value of input sample for which we want the rank)
908  *              &rank (<return> fraction of total samples below rval)
909  *      Return: 0 if OK, 1 on error
910  *
911  *  Notes:
912  *      (1) If we think of the histogram as a function y(x), normalized
913  *          to 1, for a given input value of x, this computes the
914  *          rank of x, which is the integral of y(x) from the start
915  *          value of x to the input value.
916  *      (2) This function only makes sense when applied to a Numa that
917  *          is a histogram.  The values in the histogram can be ints and
918  *          floats, and are computed as floats.  The rank is returned
919  *          as a float between 0.0 and 1.0.
920  *      (3) The numa parameters startx and binsize are used to
921  *          compute x from the Numa index i.
922  */
923 l_int32
numaHistogramGetRankFromVal(NUMA * na,l_float32 rval,l_float32 * prank)924 numaHistogramGetRankFromVal(NUMA       *na,
925                             l_float32   rval,
926                             l_float32  *prank)
927 {
928 l_int32    i, ibinval, n;
929 l_float32  startval, binsize, binval, maxval, fractval, total, sum, val;
930 
931     PROCNAME("numaHistogramGetRankFromVal");
932 
933     if (!prank)
934         return ERROR_INT("prank not defined", procName, 1);
935     *prank = 0.0;
936     if (!na)
937         return ERROR_INT("na not defined", procName, 1);
938     numaGetXParameters(na, &startval, &binsize);
939     n = numaGetCount(na);
940     if (rval < startval)
941         return 0;
942     maxval = startval + n * binsize;
943     if (rval > maxval) {
944         *prank = 1.0;
945         return 0;
946     }
947 
948     binval = (rval - startval) / binsize;
949     ibinval = (l_int32)binval;
950     if (ibinval >= n) {
951         *prank = 1.0;
952         return 0;
953     }
954     fractval = binval - (l_float32)ibinval;
955 
956     sum = 0.0;
957     for (i = 0; i < ibinval; i++) {
958         numaGetFValue(na, i, &val);
959         sum += val;
960     }
961     numaGetFValue(na, ibinval, &val);
962     sum += fractval * val;
963     numaGetSum(na, &total);
964     *prank = sum / total;
965 
966 /*    fprintf(stderr, "binval = %7.3f, rank = %7.3f\n", binval, *prank); */
967 
968     return 0;
969 }
970 
971 
972 /*!
973  *  numaHistogramGetValFromRank()
974  *
975  *      Input:  na (histogram)
976  *              rank (fraction of total samples)
977  *              &rval (<return> approx. to the bin value)
978  *      Return: 0 if OK, 1 on error
979  *
980  *  Notes:
981  *      (1) If we think of the histogram as a function y(x), this returns
982  *          the value x such that the integral of y(x) from the start
983  *          value to x gives the fraction 'rank' of the integral
984  *          of y(x) over all bins.
985  *      (2) This function only makes sense when applied to a Numa that
986  *          is a histogram.  The values in the histogram can be ints and
987  *          floats, and are computed as floats.  The val is returned
988  *          as a float, even though the buckets are of integer width.
989  *      (3) The numa parameters startx and binsize are used to
990  *          compute x from the Numa index i.
991  */
992 l_int32
numaHistogramGetValFromRank(NUMA * na,l_float32 rank,l_float32 * prval)993 numaHistogramGetValFromRank(NUMA       *na,
994                             l_float32   rank,
995                             l_float32  *prval)
996 {
997 l_int32    i, n;
998 l_float32  startval, binsize, rankcount, total, sum, fract, val;
999 
1000     PROCNAME("numaHistogramGetValFromRank");
1001 
1002     if (!prval)
1003         return ERROR_INT("prval not defined", procName, 1);
1004     *prval = 0.0;
1005     if (!na)
1006         return ERROR_INT("na not defined", procName, 1);
1007     if (rank < 0.0) {
1008         L_WARNING("rank < 0; setting to 0.0", procName);
1009         rank = 0.0;
1010     }
1011     if (rank > 1.0) {
1012         L_WARNING("rank > 1.0; setting to 1.0", procName);
1013         rank = 1.0;
1014     }
1015 
1016     n = numaGetCount(na);
1017     numaGetXParameters(na, &startval, &binsize);
1018     numaGetSum(na, &total);
1019     rankcount = rank * total;  /* count that corresponds to rank */
1020     sum = 0.0;
1021     for (i = 0; i < n; i++) {
1022         numaGetFValue(na, i, &val);
1023         if (sum + val >= rankcount)
1024             break;
1025         sum += val;
1026     }
1027     if (val <= 0.0)  /* can == 0 if rank == 0.0 */
1028         fract = 0.0;
1029     else  /* sum + fract * val = rankcount */
1030         fract = (rankcount - sum) / val;
1031 
1032     /* The use of the fraction of a bin allows a simple calculation
1033      * for the histogram value at the given rank. */
1034     *prval = startval + binsize * ((l_float32)i + fract);
1035 
1036 /*    fprintf(stderr, "rank = %7.3f, val = %7.3f\n", rank, *prval); */
1037 
1038     return 0;
1039 }
1040 
1041 
1042 /*----------------------------------------------------------------------*
1043  *                      Splitting a distribution                        *
1044  *----------------------------------------------------------------------*/
1045 /*!
1046  *  numaSplitDistribution()
1047  *
1048  *      Input:  na (histogram)
1049  *              scorefract (fraction of the max score, used to determine
1050  *                          the range over which the histogram min is searched)
1051  *              &splitindex (<optional return> index for splitting)
1052  *              &ave1 (<optional return> average of lower distribution)
1053  *              &ave2 (<optional return> average of upper distribution)
1054  *              &num1 (<optional return> population of lower distribution)
1055  *              &num2 (<optional return> population of upper distribution)
1056  *              &nascore (<optional return> for debugging; otherwise use null)
1057  *      Return: 0 if OK, 1 on error
1058  *
1059  *  Notes:
1060  *      (1) This function is intended to be used on a distribution of
1061  *          values that represent two sets, such as a histogram of
1062  *          pixel values, and the goal is to determine the means of
1063  *          the two sets and the best splitting point.
1064  *      (2) The Otsu method finds a split point that divides the distribution
1065  *          into two parts by maximizing a score function that is the
1066  *          product of two terms:
1067  *            (a) the square of the difference of centroids, (ave1 - ave2)^2
1068  *            (b) fract1 * (1 - fract1)
1069  *          where fract1 is the fraction in the lower distribution.
1070  *          This biases the split point into the larger "bump" (i.e., toward
1071  *          the point where the (b) term reaches its maximum of 0.25 at
1072  *          fract1 = 0.5.  To avoid this, we define a range of values near
1073  *          the maximum of the score function, and choose the value within
1074  *          this range such that the histogram itself has a minimum value.
1075  *          The range is determined by scorefract: we include all abscissa
1076  *          values to the left and right of the value that maximizes the
1077  *          score, such that the score stays above (1 - scorefract) * maxscore.
1078  *      (3) We normalize the score so that if the two distributions
1079  *          were of equal size and at opposite ends of the numa, the
1080  *          score would be 1.0.
1081  */
1082 l_int32
numaSplitDistribution(NUMA * na,l_float32 scorefract,l_int32 * psplitindex,l_float32 * pave1,l_float32 * pave2,l_float32 * pnum1,l_float32 * pnum2,NUMA ** pnascore)1083 numaSplitDistribution(NUMA       *na,
1084                       l_float32   scorefract,
1085                       l_int32    *psplitindex,
1086                       l_float32  *pave1,
1087                       l_float32  *pave2,
1088                       l_float32  *pnum1,
1089                       l_float32  *pnum2,
1090                       NUMA      **pnascore)
1091 {
1092 l_int32    i, n, bestsplit, minrange, maxrange, maxindex;
1093 l_float32  ave1, ave2, ave1prev, ave2prev;
1094 l_float32  num1, num2, num1prev, num2prev;
1095 l_float32  val, minval, sum, fract1;
1096 l_float32  norm, score, minscore, maxscore;
1097 NUMA      *nascore, *naave1, *naave2, *nanum1, *nanum2;
1098 
1099     PROCNAME("numaSplitDistribution");
1100 
1101     if (!na)
1102         return ERROR_INT("na not defined", procName, 1);
1103 
1104     n = numaGetCount(na);
1105     if (n <= 1)
1106         return ERROR_INT("n = 1 in histogram", procName, 1);
1107     numaGetSum(na, &sum);
1108     if (sum <= 0.0)
1109         return ERROR_INT("sum <= 0.0", procName, 1);
1110     norm = 4.0 / ((n - 1) * (n - 1));
1111     ave1prev = 0.0;
1112     numaGetHistogramStats(na, 0.0, 1.0, &ave2prev, NULL, NULL, NULL);
1113     num1prev = 0.0;
1114     num2prev = sum;
1115     maxindex = n / 2;  /* initialize with something */
1116 
1117         /* Split the histogram with [0 ... i] in the lower part
1118          * and [i+1 ... n-1] in upper part.  First, compute an otsu
1119          * score for each possible splitting.  */
1120     nascore = numaCreate(n);
1121     if (pave2) naave1 = numaCreate(n);
1122     if (pave2) naave2 = numaCreate(n);
1123     if (pnum1) nanum1 = numaCreate(n);
1124     if (pnum2) nanum2 = numaCreate(n);
1125     maxscore = 0.0;
1126     for (i = 0; i < n; i++) {
1127         numaGetFValue(na, i, &val);
1128         num1 = num1prev + val;
1129         if (num1 == 0)
1130             ave1 = ave1prev;
1131         else
1132             ave1 = (num1prev * ave1prev + i * val) / num1;
1133         num2 = num2prev - val;
1134         if (num2 == 0)
1135             ave2 = ave2prev;
1136         else
1137             ave2 = (num2prev * ave2prev - i * val) / num2;
1138         fract1 = num1 / sum;
1139         score = norm * (fract1 * (1 - fract1)) * (ave2 - ave1) * (ave2 - ave1);
1140         numaAddNumber(nascore, score);
1141         if (pave1) numaAddNumber(naave1, ave1);
1142         if (pave2) numaAddNumber(naave2, ave2);
1143         if (pnum1) numaAddNumber(nanum1, num1);
1144         if (pnum1) numaAddNumber(nanum2, num2);
1145         if (score > maxscore) {
1146             maxscore = score;
1147             maxindex = i;
1148         }
1149         num1prev = num1;
1150         num2prev = num2;
1151         ave1prev = ave1;
1152         ave2prev = ave2;
1153     }
1154 
1155         /* Next, for all contiguous scores within a specified fraction
1156          * of the max, choose the split point as the value with the
1157          * minimum in the histogram. */
1158     minscore = (1. - scorefract) * maxscore;
1159     for (i = maxindex - 1; i >= 0; i--) {
1160         numaGetFValue(nascore, i, &val);
1161         if (val < minscore)
1162             break;
1163     }
1164     minrange = i + 1;
1165     for (i = maxindex + 1; i < n; i++) {
1166         numaGetFValue(nascore, i, &val);
1167         if (val < minscore)
1168             break;
1169     }
1170     maxrange = i - 1;
1171     numaGetFValue(na, minrange, &minval);
1172     bestsplit = minrange;
1173     for (i = minrange + 1; i <= maxrange; i++) {
1174         numaGetFValue(na, i, &val);
1175         if (val < minval) {
1176             minval = val;
1177             bestsplit = i;
1178         }
1179     }
1180 
1181     if (psplitindex) *psplitindex = bestsplit;
1182     if (pave1) numaGetFValue(naave1, bestsplit, pave1);
1183     if (pave2) numaGetFValue(naave2, bestsplit, pave2);
1184     if (pnum1) numaGetFValue(nanum1, bestsplit, pnum1);
1185     if (pnum2) numaGetFValue(nanum2, bestsplit, pnum2);
1186 
1187     if (pnascore) {  /* debug mode */
1188         fprintf(stderr, "minrange = %d, maxrange = %d\n", minrange, maxrange);
1189         fprintf(stderr, "minval = %10.0f\n", minval);
1190         gplotSimple1(nascore, GPLOT_X11, "junkoutroot",
1191                      "Score for split distribution");
1192         *pnascore = nascore;
1193     }
1194     else
1195         numaDestroy(&nascore);
1196 
1197     if (pave1) numaDestroy(&naave1);
1198     if (pave2) numaDestroy(&naave2);
1199     if (pnum1) numaDestroy(&nanum1);
1200     if (pnum2) numaDestroy(&nanum2);
1201     return 0;
1202 }
1203 
1204 
1205 /*----------------------------------------------------------------------*
1206  *                             Extrema finding                          *
1207  *----------------------------------------------------------------------*/
1208 /*!
1209  *  numaFindPeaks()
1210  *
1211  *      Input:  source na
1212  *              max number of peaks to be found
1213  *              fract1  (min fraction of peak value)
1214  *              fract2  (min slope)
1215  *      Return: peak na, or null on error.
1216  *
1217  * Notes:
1218  *     (1) The returned na consists of sets of four numbers representing
1219  *         the peak, in the following order:
1220  *            left edge; peak center; right edge; normalized peak area
1221  */
1222 NUMA *
numaFindPeaks(NUMA * nas,l_int32 nmax,l_float32 fract1,l_float32 fract2)1223 numaFindPeaks(NUMA      *nas,
1224               l_int32    nmax,
1225               l_float32  fract1,
1226               l_float32  fract2)
1227 {
1228 l_int32    i, k, n, maxloc, lloc, rloc;
1229 l_float32  fmaxval, sum, total, newtotal, val, lastval;
1230 l_float32  peakfract;
1231 NUMA      *na, *napeak;
1232 
1233     PROCNAME("numaFindPeaks");
1234 
1235     if (!nas)
1236         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
1237     n = numaGetCount(nas);
1238     numaGetSum(nas, &total);
1239 
1240         /* We munge this copy */
1241     if ((na = numaCopy(nas)) == NULL)
1242         return (NUMA *)ERROR_PTR("na not made", procName, NULL);
1243     if ((napeak = numaCreate(4 * nmax)) == NULL)
1244         return (NUMA *)ERROR_PTR("napeak not made", procName, NULL);
1245 
1246     for (k = 0; k < nmax; k++) {
1247         numaGetSum(na, &newtotal);
1248         if (newtotal == 0.0)   /* sanity check */
1249             break;
1250         numaGetMax(na, &fmaxval, &maxloc);
1251         sum = fmaxval;
1252         lastval = fmaxval;
1253         lloc = 0;
1254         for (i = maxloc - 1; i >= 0; --i) {
1255             numaGetFValue(na, i, &val);
1256             if (val == 0.0) {
1257                 lloc = i + 1;
1258                 break;
1259             }
1260             if (val > fract1 * fmaxval) {
1261                 sum += val;
1262                 lastval = val;
1263                 continue;
1264             }
1265             if (lastval - val > fract2 * lastval) {
1266                 sum += val;
1267                 lastval = val;
1268                 continue;
1269             }
1270             lloc = i;
1271             break;
1272         }
1273         lastval = fmaxval;
1274         rloc = n - 1;
1275         for (i = maxloc + 1; i < n; ++i) {
1276             numaGetFValue(na, i, &val);
1277             if (val == 0.0) {
1278                 rloc = i - 1;
1279                 break;
1280             }
1281             if (val > fract1 * fmaxval) {
1282                 sum += val;
1283                 lastval = val;
1284                 continue;
1285             }
1286             if (lastval - val > fract2 * lastval) {
1287                 sum += val;
1288                 lastval = val;
1289                 continue;
1290             }
1291             rloc = i;
1292             break;
1293         }
1294         peakfract = sum / total;
1295         numaAddNumber(napeak, lloc);
1296         numaAddNumber(napeak, maxloc);
1297         numaAddNumber(napeak, rloc);
1298         numaAddNumber(napeak, peakfract);
1299 
1300         for (i = lloc; i <= rloc; i++)
1301             numaSetValue(na, i, 0.0);
1302     }
1303 
1304     numaDestroy(&na);
1305     return napeak;
1306 }
1307 
1308 
1309 /*!
1310  *  numaFindExtrema()
1311  *
1312  *      Input:  nas (input values)
1313  *              delta (relative amount to resolve peaks and valleys)
1314  *      Return: nad (locations of extrema), or null on error
1315  *
1316  *  Notes:
1317  *      (1) This returns a sequence of extrema (peaks and valleys).
1318  *      (2) The algorithm is analogous to that for determining
1319  *          mountain peaks.  Suppose we have a local peak, with
1320  *          bumps on the side.  Under what conditions can we consider
1321  *          those 'bumps' to be actual peaks?  The answer: if the
1322  *          bump is separated from the peak by a saddle that is at
1323  *          least 500 feet below the bump.
1324  *      (3) Operationally, suppose we are looking for a peak.
1325  *          We are keeping the largest value we've seen since the
1326  *          last valley, and are looking for a value that is delta
1327  *          BELOW our current peak.  When we find such a value,
1328  *          we label the peak, use the current value to label the
1329  *          valley, and then do the same operation in reverse (looking
1330  *          for a valley).
1331  */
1332 NUMA *
numaFindExtrema(NUMA * nas,l_float32 delta)1333 numaFindExtrema(NUMA      *nas,
1334                 l_float32  delta)
1335 {
1336 l_int32    i, n, found, loc, direction;
1337 l_float32  startval, val, maxval, minval;
1338 NUMA      *nad;
1339 
1340     PROCNAME("numaFindExtrema");
1341 
1342     if (!nas)
1343         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
1344 
1345     n = numaGetCount(nas);
1346     nad = numaCreate(0);
1347 
1348         /* We don't know if we'll find a peak or valley first,
1349          * but use the first element of nas as the reference point.
1350          * Break when we deviate by 'delta' from the first point. */
1351     numaGetFValue(nas, 0, &startval);
1352     found = FALSE;
1353     for (i = 1; i < n; i++) {
1354         numaGetFValue(nas, i, &val);
1355         if (L_ABS(val - startval) >= delta) {
1356             found = TRUE;
1357             break;
1358         }
1359     }
1360 
1361     if (!found)
1362         return nad;  /* it's empty */
1363 
1364         /* Are we looking for a peak or a valley? */
1365     if (val > startval) {  /* peak */
1366         direction = 1;
1367         maxval = val;
1368     }
1369     else {
1370         direction = -1;
1371         minval = val;
1372     }
1373     loc = i;
1374 
1375         /* Sweep through the rest of the array, recording alternating
1376          * peak/valley extrema. */
1377     for (i = i + 1; i < n; i++) {
1378         numaGetFValue(nas, i, &val);
1379         if (direction == 1 && val > maxval ) {  /* new local max */
1380             maxval = val;
1381             loc = i;
1382         }
1383         else if (direction == -1 && val < minval ) {  /* new local min */
1384             minval = val;
1385             loc = i;
1386         }
1387         else if (direction == 1 && (maxval - val >= delta)) {
1388             numaAddNumber(nad, loc);  /* save the current max location */
1389             direction = -1;  /* reverse: start looking for a min */
1390             minval = val;
1391             loc = i;  /* current min location */
1392         }
1393         else if (direction == -1 && (val - minval >= delta)) {
1394             numaAddNumber(nad, loc);  /* save the current min location */
1395             direction = 1;  /* reverse: start looking for a max */
1396             maxval = val;
1397             loc = i;  /* current max location */
1398         }
1399     }
1400 
1401         /* Save the final extremum */
1402 /*    numaAddNumber(nad, loc); */
1403     return nad;
1404 }
1405 
1406 
1407 /*----------------------------------------------------------------------*
1408  *                Threshold crossings and frequency analysis            *
1409  *----------------------------------------------------------------------*/
1410 /*!
1411  *  numaSelectCrossingThreshold()
1412  *
1413  *      Input:  nax (<optional> numa of abscissa values; can be NULL)
1414  *              nay (signal)
1415  *              estthresh (estimated pixel threshold for crossing: e.g., for
1416  *                         images, white <--> black; typ. ~120)
1417  *              &bestthresh (<return> robust estimate of threshold to use)
1418  *      Return: 0 if OK, 1 on error
1419  *
1420  *  Note:
1421  *     (1) When a valid threshold is used, the number of crossings is
1422  *         a maximum, because none are missed.  If no threshold intersects
1423  *         all the crossings, the crossings must be determined with
1424  *         numaCrossingsByPeaks().
1425  *     (2) @estthresh is an input estimate of the threshold that should
1426  *         be used.  We compute the crossings with 41 thresholds
1427  *         (20 below and 20 above).  There is a range in which the
1428  *         number of crossings is a maximum.  Return a threshold
1429  *         in the center of this stable plateau of crossings.
1430  *         This can then be used with numaCrossingsByThreshold()
1431  *         to get a good estimate of crossing locations.
1432  */
1433 l_int32
numaSelectCrossingThreshold(NUMA * nax,NUMA * nay,l_float32 estthresh,l_float32 * pbestthresh)1434 numaSelectCrossingThreshold(NUMA       *nax,
1435                             NUMA       *nay,
1436                             l_float32   estthresh,
1437                             l_float32  *pbestthresh)
1438 {
1439 l_int32    i, inrun, istart, iend, maxstart, maxend, runlen, maxrunlen;
1440 l_int32    val, maxval, nmax, count;
1441 l_float32  thresh, fmaxval, fmodeval;
1442 NUMA      *nat, *nac;
1443 
1444     PROCNAME("numaSelectCrossingThreshold");
1445 
1446     if (!nay)
1447         return ERROR_INT("nay not defined", procName, 1);
1448 
1449         /* Compute the number of crossings for different thresholds */
1450     nat = numaCreate(41);
1451     for (i = 0; i < 41; i++) {
1452         thresh = estthresh - 80.0 + 4.0 * i;
1453         nac = numaCrossingsByThreshold(nax, nay, thresh);
1454         numaAddNumber(nat, numaGetCount(nac));
1455         numaDestroy(&nac);
1456     }
1457 
1458         /* Find the center of the plateau of max crossings, which
1459          * extends from thresh[istart] to thresh[iend]. */
1460     numaGetMax(nat, &fmaxval, NULL);
1461     maxval = (l_int32)fmaxval;
1462     nmax = 0;
1463     for (i = 0; i < 41; i++) {
1464         numaGetIValue(nat, i, &val);
1465         if (val == maxval)
1466             nmax++;
1467     }
1468     if (nmax < 3) {  /* likely accidental max; try the mode */
1469         numaGetMode(nat, &fmodeval, &count);
1470         if (count > nmax && fmodeval > 0.5 * fmaxval)
1471             maxval = (l_int32)fmodeval;  /* use the mode */
1472     }
1473 
1474     inrun = FALSE;
1475     iend = 40;
1476     maxrunlen = 0;
1477     for (i = 0; i < 41; i++) {
1478         numaGetIValue(nat, i, &val);
1479         if (val == maxval) {
1480             if (!inrun) {
1481                 istart = i;
1482                 inrun = TRUE;
1483             }
1484 	    continue;
1485         }
1486         if (inrun && (val != maxval)) {
1487             iend = i - 1;
1488             runlen = iend - istart + 1;
1489             inrun = FALSE;
1490             if (runlen > maxrunlen) {
1491                 maxstart = istart;
1492                 maxend = iend;
1493                 maxrunlen = runlen;
1494             }
1495         }
1496     }
1497     if (inrun) {
1498         runlen = i - istart;
1499         if (runlen > maxrunlen) {
1500             maxstart = istart;
1501             maxend = i - 1;
1502             maxrunlen = runlen;
1503         }
1504     }
1505 
1506 #if 0
1507     foundfirst = FALSE;
1508     iend = 40;
1509     for (i = 0; i < 41; i++) {
1510         numaGetIValue(nat, i, &val);
1511         if (val == maxval) {
1512             if (!foundfirst) {
1513                 istart = i;
1514                 foundfirst = TRUE;
1515             }
1516         }
1517         if ((val != maxval) && foundfirst) {
1518             iend = i - 1;
1519             break;
1520         }
1521     }
1522     nmax = iend - istart + 1;
1523 #endif
1524 
1525     *pbestthresh = estthresh - 80.0 + 2.0 * (l_float32)(maxstart + maxend);
1526 
1527 #if  DEBUG_CROSSINGS
1528     fprintf(stderr, "\nCrossings attain a maximum at %d thresholds, between:\n"
1529                     "  thresh[%d] = %5.1f and thresh[%d] = %5.1f\n",
1530 		    nmax, maxstart, estthresh - 80.0 + 4.0 * maxstart,
1531 		    maxend, estthresh - 80.0 + 4.0 * maxend);
1532     fprintf(stderr, "The best choice: %5.1f\n", *pbestthresh);
1533     fprintf(stderr, "Number of crossings at the 41 thresholds:");
1534     numaWriteStream(stderr, nat);
1535 #endif  /* DEBUG_CROSSINGS */
1536 
1537     numaDestroy(&nat);
1538     return 0;
1539 }
1540 
1541 
1542 /*!
1543  *  numaCrossingsByThreshold()
1544  *
1545  *      Input:  nax (<optional> numa of abscissa values; can be NULL)
1546  *              nay (numa of ordinate values, corresponding to nax)
1547  *              thresh (threshold value for nay)
1548  *      Return: nad (abscissa pts at threshold), or null on error
1549  *
1550  *  Notes:
1551  *      (1) If nax == NULL, we use startx and delx from nay to compute
1552  *          the crossing values in nad.
1553  */
1554 NUMA *
numaCrossingsByThreshold(NUMA * nax,NUMA * nay,l_float32 thresh)1555 numaCrossingsByThreshold(NUMA      *nax,
1556                          NUMA      *nay,
1557                          l_float32  thresh)
1558 {
1559 l_int32    i, n;
1560 l_float32  startx, delx;
1561 l_float32  xval1, xval2, yval1, yval2, delta1, delta2, crossval, fract;
1562 NUMA      *nad;
1563 
1564     PROCNAME("numaCrossingsByThreshold");
1565 
1566     if (!nay)
1567         return (NUMA *)ERROR_PTR("nay not defined", procName, NULL);
1568     n = numaGetCount(nay);
1569 
1570     if (nax && (numaGetCount(nax) != n))
1571         return (NUMA *)ERROR_PTR("nax and nay sizes differ", procName, NULL);
1572 
1573     nad = numaCreate(0);
1574     numaGetFValue(nay, 0, &yval1);
1575     numaGetXParameters(nay, &startx, &delx);
1576     if (nax)
1577         numaGetFValue(nax, 0, &xval1);
1578     else
1579         xval1 = startx;
1580     for (i = 1; i < n; i++) {
1581         numaGetFValue(nay, i, &yval2);
1582         if (nax)
1583             numaGetFValue(nax, i, &xval2);
1584         else
1585             xval2 = startx + i * delx;
1586         delta1 = yval1 - thresh;
1587         delta2 = yval2 - thresh;
1588         if (delta1 == 0.0)
1589             numaAddNumber(nad, xval1);
1590         else if (delta2 == 0.0)
1591             numaAddNumber(nad, xval2);
1592         else if (delta1 * delta2 < 0.0) {  /* crossing */
1593             fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
1594             crossval = xval1 + fract * (xval2 - xval1);
1595             numaAddNumber(nad, crossval);
1596         }
1597         xval1 = xval2;
1598         yval1 = yval2;
1599     }
1600 
1601     return nad;
1602 }
1603 
1604 
1605 /*!
1606  *  numaCrossingsByPeaks()
1607  *
1608  *      Input:  nax (<optional> numa of abscissa values)
1609  *              nay (numa of ordinate values, corresponding to nax)
1610  *              delta (parameter used to identify when a new peak can be found)
1611  *      Return: nad (abscissa pts at threshold), or null on error
1612  *
1613  *  Notes:
1614  *      (1) If nax == NULL, we use startx and delx from nay to compute
1615  *          the crossing values in nad.
1616  */
1617 NUMA *
numaCrossingsByPeaks(NUMA * nax,NUMA * nay,l_float32 delta)1618 numaCrossingsByPeaks(NUMA      *nax,
1619                      NUMA      *nay,
1620                      l_float32  delta)
1621 {
1622 l_int32    i, j, n, np, previndex, curindex;
1623 l_float32  startx, delx;
1624 l_float32  xval1, xval2, yval1, yval2, delta1, delta2;
1625 l_float32  prevval, curval, thresh, crossval, fract;
1626 NUMA      *nap, *nad;
1627 
1628     PROCNAME("numaCrossingsByPeaks");
1629 
1630     if (!nax)
1631         return (NUMA *)ERROR_PTR("nax not defined", procName, NULL);
1632     if (!nay)
1633         return (NUMA *)ERROR_PTR("nay not defined", procName, NULL);
1634 
1635     n = numaGetCount(nax);
1636     if (numaGetCount(nay) != n)
1637         return (NUMA *)ERROR_PTR("nax and nay sizes differ", procName, NULL);
1638 
1639         /* Find the extrema.  Also add last point in nay to get
1640          * the last transition (from the last peak to the end).
1641          * The number of crossings is 1 more than the number of extrema. */
1642     nap = numaFindExtrema(nay, delta);
1643     numaAddNumber(nap, n - 1);
1644     np = numaGetCount(nap);
1645     L_INFO_INT("Number of crossings: %d", procName, np);
1646 
1647         /* Do all computation in index units of nax */
1648     nad = numaCreate(np);  /* output crossings, in nax units */
1649     previndex = 0;  /* prime the search with 1st point */
1650     numaGetFValue(nay, 0, &prevval);  /* prime the search with 1st point */
1651     numaGetXParameters(nay, &startx, &delx);
1652     for (i = 0; i < np; i++) {
1653         numaGetIValue(nap, i, &curindex);
1654         numaGetFValue(nay, curindex, &curval);
1655         thresh = (prevval + curval) / 2.0;
1656 /*        fprintf(stderr, "thresh[%d] = %7.3f\n", i, thresh); */
1657         if (nax)
1658             numaGetFValue(nax, previndex, &xval1);
1659         else
1660             xval1 = startx + previndex * delx;
1661         numaGetFValue(nay, previndex, &yval1);
1662         for (j = previndex + 1; j <= curindex; j++) {
1663             if (nax)
1664                 numaGetFValue(nax, j, &xval2);
1665             else
1666                 xval2 = startx + j * delx;
1667             numaGetFValue(nay, j, &yval2);
1668             delta1 = yval1 - thresh;
1669             delta2 = yval2 - thresh;
1670             if (delta1 == 0.0) {
1671                 numaAddNumber(nad, xval1);
1672                 break;
1673             }
1674             else if (delta2 == 0.0) {
1675                 numaAddNumber(nad, xval2);
1676                 break;
1677             }
1678             else if (delta1 * delta2 < 0.0) {  /* crossing */
1679                 fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
1680                 crossval = xval1 + fract * (xval2 - xval1);
1681                 numaAddNumber(nad, crossval);
1682                 break;
1683             }
1684             xval1 = xval2;
1685             yval1 = yval2;
1686         }
1687         previndex = curindex;
1688         prevval = curval;
1689     }
1690 
1691     numaDestroy(&nap);
1692     return nad;
1693 }
1694 
1695 
1696 /*!
1697  *  numaEvalBestHaarParameters()
1698  *
1699  *      Input:  nas (numa of non-negative signal values)
1700  *              relweight (relative weight of (-1 comb) / (+1 comb)
1701  *                         contributions to the 'convolution'.  In effect,
1702  *                         the convolution kernel is a comb consisting of
1703  *                         alternating +1 and -weight.)
1704  *              nwidth (number of widths to consider)
1705  *              nshift (number of shifts to consider for each width)
1706  *              minwidth (smallest width to consider)
1707  *              maxwidth (largest width to consider)
1708  *              &bestwidth (<return> width giving largest score)
1709  *              &bestshift (<return> shift giving largest score)
1710  *              &bestscore (<optional return> convolution with
1711  *                          "Haar"-like comb)
1712  *      Return: 0 if OK, 1 on error
1713  *
1714  *  Notes:
1715  *      (1) This does a linear sweep of widths, evaluating at @nshift
1716  *          shifts for each width, computing the score from a convolution
1717  *          with a long comb, and finding the (width, shift) pair that
1718  *          gives the maximum score.  The best width is the "half-wavelength"
1719  *          of the signal.
1720  *      (2) The convolving function is a comb of alternating values
1721  *          +1 and -1 * relweight, separated by the width and phased by
1722  *          the shift.  This is similar to a Haar transform, except
1723  *          there the convolution is performed with a square wave.
1724  *      (3) The function is useful for finding the line spacing
1725  *          and strength of line signal from pixel sum projections.
1726  *      (4) The score is normalized to the size of nas divided by
1727  *          the number of half-widths.  For image applications, the input is
1728  *          typically an array of pixel projections, so one should
1729  *          normalize by dividing the score by the image width in the
1730  *          pixel projection direction.
1731  */
1732 l_int32
numaEvalBestHaarParameters(NUMA * nas,l_float32 relweight,l_int32 nwidth,l_int32 nshift,l_float32 minwidth,l_float32 maxwidth,l_float32 * pbestwidth,l_float32 * pbestshift,l_float32 * pbestscore)1733 numaEvalBestHaarParameters(NUMA       *nas,
1734                            l_float32   relweight,
1735                            l_int32     nwidth,
1736                            l_int32     nshift,
1737                            l_float32   minwidth,
1738                            l_float32   maxwidth,
1739                            l_float32  *pbestwidth,
1740                            l_float32  *pbestshift,
1741                            l_float32  *pbestscore)
1742 {
1743 l_int32    i, j;
1744 l_float32  delwidth, delshift, width, shift, score;
1745 l_float32  bestwidth, bestshift, bestscore;
1746 
1747     PROCNAME("numaEvalBestHaarParameters");
1748 
1749     if (!nas)
1750         return ERROR_INT("nas not defined", procName, 1);
1751     if (!pbestwidth || !pbestshift)
1752         return ERROR_INT("&bestwidth and &bestshift not defined", procName, 1);
1753 
1754     bestscore = 0.0;
1755     delwidth = (maxwidth - minwidth) / (nwidth - 1.0);
1756     for (i = 0; i < nwidth; i++) {
1757         width = minwidth + delwidth * i;
1758         delshift = width / (l_float32)(nshift);
1759         for (j = 0; j < nshift; j++) {
1760             shift = j * delshift;
1761             numaEvalHaarSum(nas, width, shift, relweight, &score);
1762             if (score > bestscore) {
1763                 bestscore = score;
1764                 bestwidth = width;
1765                 bestshift = shift;
1766 #if  DEBUG_FREQUENCY
1767                 fprintf(stderr, "width = %7.3f, shift = %7.3f, score = %7.3f\n",
1768                         width, shift, score);
1769 #endif  /* DEBUG_FREQUENCY */
1770             }
1771         }
1772     }
1773 
1774     *pbestwidth = bestwidth;
1775     *pbestshift = bestshift;
1776     if (pbestscore)
1777         *pbestscore = bestscore;
1778     return 0;
1779 }
1780 
1781 
1782 /*!
1783  *  numaEvalHaarSum()
1784  *
1785  *      Input:  nas (numa of non-negative signal values)
1786  *              width (distance between +1 and -1 in convolution comb)
1787  *              shift (phase of the comb: location of first +1)
1788  *              relweight (relative weight of (-1 comb) / (+1 comb)
1789  *                         contributions to the 'convolution'.  In effect,
1790  *                         the convolution kernel is a comb consisting of
1791  *                         alternating +1 and -weight.)
1792  *              &score (<return> convolution with "Haar"-like comb)
1793  *      Return: 0 if OK, 1 on error
1794  *
1795  *  Notes:
1796  *      (1) This does a convolution with a comb of alternating values
1797  *          +1 and -relweight, separated by the width and phased by the shift.
1798  *          This is similar to a Haar transform, except that for Haar,
1799  *            (1) the convolution kernel is symmetric about 0, so the
1800  *                relweight is 1.0, and
1801  *            (2) the convolution is performed with a square wave.
1802  *      (2) The score is normalized to the size of nas divided by
1803  *          twice the "width".  For image applications, the input is
1804  *          typically an array of pixel projections, so one should
1805  *          normalize by dividing the score by the image width in the
1806  *          pixel projection direction.
1807  *      (3) To get a Haar-like result, use relweight = 1.0.  For detecting
1808  *          signals where you expect every other sample to be close to
1809  *          zero, as with barcodes or filtered text lines, you can
1810  *          use relweight > 1.0.
1811  */
1812 l_int32
numaEvalHaarSum(NUMA * nas,l_float32 width,l_float32 shift,l_float32 relweight,l_float32 * pscore)1813 numaEvalHaarSum(NUMA       *nas,
1814                 l_float32   width,
1815                 l_float32   shift,
1816                 l_float32   relweight,
1817                 l_float32  *pscore)
1818 {
1819 l_int32    i, n, nsamp, index;
1820 l_float32  score, weight, val;
1821 
1822     PROCNAME("numaEvalHaarSum");
1823 
1824     if (!pscore)
1825         return ERROR_INT("&score not defined", procName, 1);
1826     *pscore = 0.0;
1827     if (!nas)
1828         return ERROR_INT("nas not defined", procName, 1);
1829     if ((n = numaGetCount(nas)) < 2 * width)
1830         return ERROR_INT("nas size too small", procName, 1);
1831 
1832     score = 0.0;
1833     nsamp = (l_int32)((n - shift) / width);
1834     for (i = 0; i < nsamp; i++) {
1835         index = (l_int32)(shift + i * width);
1836         weight = (i % 2) ? 1.0 : -1.0 * relweight;
1837         numaGetFValue(nas, index, &val);
1838         score += weight * val;
1839     }
1840 
1841     *pscore = 2.0 * width * score / (l_float32)n;
1842     return 0;
1843 }
1844 
1845