Coverage Report

Created: 2024-06-18 06:05

/src/leptonica/src/numafunc2.c
Line    Count    Source
1
/*====================================================================*
2
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
3
 -
4
 -  Redistribution and use in source and binary forms, with or without
5
 -  modification, are permitted provided that the following conditions
6
 -  are met:
7
 -  1. Redistributions of source code must retain the above copyright
8
 -     notice, this list of conditions and the following disclaimer.
9
 -  2. Redistributions in binary form must reproduce the above
10
 -     copyright notice, this list of conditions and the following
11
 -     disclaimer in the documentation and/or other materials
12
 -     provided with the distribution.
13
 -
14
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
 *====================================================================*/
26
27
/*!
28
 * \file  numafunc2.c
29
 * <pre>
30
 *
31
 *      --------------------------------------
32
 *      This file has these Numa utilities:
33
 *         - morphological operations
34
 *         - arithmetic transforms
35
 *         - windowed statistical operations
36
 *         - histogram extraction
37
 *         - histogram comparison
38
 *         - extrema finding
39
 *         - frequency and crossing analysis
40
 *      --------------------------------------
41
42
 *      Morphological (min/max) operations
43
 *          NUMA        *numaErode()
44
 *          NUMA        *numaDilate()
45
 *          NUMA        *numaOpen()
46
 *          NUMA        *numaClose()
47
 *
48
 *      Other transforms
49
 *          NUMA        *numaTransform()
50
 *          l_int32      numaSimpleStats()
51
 *          l_int32      numaWindowedStats()
52
 *          NUMA        *numaWindowedMean()
53
 *          NUMA        *numaWindowedMeanSquare()
54
 *          l_int32      numaWindowedVariance()
55
 *          NUMA        *numaWindowedMedian()
56
 *          NUMA        *numaConvertToInt()
57
 *
58
 *      Histogram generation and statistics
59
 *          NUMA        *numaMakeHistogram()
60
 *          NUMA        *numaMakeHistogramAuto()
61
 *          NUMA        *numaMakeHistogramClipped()
62
 *          NUMA        *numaRebinHistogram()
63
 *          NUMA        *numaNormalizeHistogram()
64
 *          l_int32      numaGetStatsUsingHistogram()
65
 *          l_int32      numaGetHistogramStats()
66
 *          l_int32      numaGetHistogramStatsOnInterval()
67
 *          l_int32      numaMakeRankFromHistogram()
68
 *          l_int32      numaHistogramGetRankFromVal()
69
 *          l_int32      numaHistogramGetValFromRank()
70
 *          l_int32      numaDiscretizeSortedInBins()
71
 *          l_int32      numaDiscretizeHistoInBins()
72
 *          l_int32      numaGetRankBinValues()
73
 *          NUMA        *numaGetUniformBinSizes()
74
 *
75
 *      Splitting a distribution
76
 *          l_int32      numaSplitDistribution()
77
 *
78
 *      Comparing histograms
79
 *          l_int32      grayHistogramsToEMD()
80
 *          l_int32      numaEarthMoverDistance()
81
 *          l_int32      grayInterHistogramStats()
82
 *
83
 *      Extrema finding
84
 *          NUMA        *numaFindPeaks()
85
 *          NUMA        *numaFindExtrema()
86
 *          NUMA        *numaFindLocForThreshold()
87
 *          l_int32     *numaCountReversals()
88
 *
89
 *      Threshold crossings and frequency analysis
90
 *          l_int32      numaSelectCrossingThreshold()
91
 *          NUMA        *numaCrossingsByThreshold()
92
 *          NUMA        *numaCrossingsByPeaks()
93
 *          NUMA        *numaEvalBestHaarParameters()
94
 *          l_int32      numaEvalHaarSum()
95
 *
96
 *      Generating numbers in a range under constraints
97
 *          NUMA        *genConstrainedNumaInRange()
98
 *
99
 *    Things to remember when using the Numa:
100
 *
101
 *    (1) The numa is a struct, not an array.  Always use accessors
102
 *        (see numabasic.c), never the fields directly.
103
 *
104
 *    (2) The number array holds l_float32 values.  It can also
105
 *        be used to store l_int32 values.  See numabasic.c for
106
 *        details on using the accessors.  Integers larger than
107
 *        about 10M will lose accuracy on retrieval due to round-off.
108
 *        For large integers, use the dna (array of l_float64) instead.
109
 *
110
 *    (3) Occasionally, in the comments we denote the i-th element of a
111
 *        numa by na[i].  This is conceptual only -- the numa is not an array!
112
 *
113
 *    Some general comments on histograms:
114
 *
115
 *    (1) Histograms are the generic statistical representation of
116
 *        the data about some attribute.  Typically they're not
117
 *        normalized -- they simply give the number of occurrences
118
 *        within each range of values of the attribute.  This range
119
 *        of values is referred to as a 'bucket'.  For example,
120
 *        the histogram could specify how many connected components
121
 *        are found for each value of their width; in that case,
122
 *        the bucket size is 1.
123
 *
124
 *    (2) In leptonica, all buckets have the same size.  Histograms
125
 *        are therefore specified by a numa of occurrences, along
126
 *        with two other numbers: the 'value' associated with the
127
 *        occupants of the first bucket and the size (i.e., 'width')
128
 *        of each bucket.  These two numbers then allow us to calculate
129
 *        the value associated with the occupants of each bucket.
130
 *        These numbers are fields in the numa, initialized to
131
 *        a startx value of 0.0 and a binsize of 1.0.  Accessors for
132
 *        these fields are functions numa*Parameters().  All histograms
133
 *        must have these two numbers properly set.
134
 * </pre>
135
 */
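/* Illustrative sketch (added for this report; not part of numafunc2.c).
 * The hypothetical helper below shows how the startx and binsize fields
 * of a histogram numa map a bin index i to its attribute value, using
 * only accessors declared in allheaders.h. */
#if 0
static void
ExampleHistogramParameters(NUMA  *nahisto)
{
l_int32    i, n, count;
l_float32  startx, binsize, binval;

    numaGetParameters(nahisto, &startx, &binsize);
    n = numaGetCount(nahisto);
    for (i = 0; i < n; i++) {
        binval = startx + i * binsize;  /* value for occupants of bin i */
        numaGetIValue(nahisto, i, &count);
        lept_stderr("bin %d: x = %7.3f, count = %d\n", i, binval, count);
    }
}
#endif  /* usage sketch */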
136
137
#ifdef HAVE_CONFIG_H
138
#include <config_auto.h>
139
#endif  /* HAVE_CONFIG_H */
140
141
#include <math.h>
142
#include "allheaders.h"
143
144
    /* bin sizes in numaMakeHistogram() */
145
static const l_int32 BinSizeArray[] = {2, 5, 10, 20, 50, 100, 200, 500, 1000,
146
                      2000, 5000, 10000, 20000, 50000, 100000, 200000,
147
                      500000, 1000000, 2000000, 5000000, 10000000,
148
                      20000000, 50000000, 100000000};
149
static const l_int32 NBinSizes = 24;
150
151
152
#ifndef  NO_CONSOLE_IO
153
#define  DEBUG_HISTO        0
154
#define  DEBUG_CROSSINGS    0
155
#define  DEBUG_FREQUENCY    0
156
#endif  /* ~NO_CONSOLE_IO */
157
158
/*----------------------------------------------------------------------*
159
 *                     Morphological operations                         *
160
 *----------------------------------------------------------------------*/
161
/*!
162
 * \brief   numaErode()
163
 *
164
 * \param[in]    nas
165
 * \param[in]    size   of sel; greater than 0, odd.  The origin
166
 *                      is implicitly in the center.
167
 * \return  nad eroded, or NULL on error
168
 *
169
 * <pre>
170
 * Notes:
171
 *      (1) The structuring element (sel) is linear, all "hits"
172
 *      (2) If size == 1, this returns a copy
173
 *      (3) General comment.  The morphological operations are equivalent
174
 *          to those that would be performed on a 1-dimensional fpix.
175
 *          However, because we have not implemented morphological
176
 *          operations on fpix, we do this here.  Because it is only
177
 *          1 dimensional, there is no reason to use the more
178
 *          complicated van Herk/Gil-Werman algorithm, and we do it
179
 *          by brute force.
180
 * </pre>
181
 */
182
NUMA *
183
numaErode(NUMA    *nas,
184
          l_int32  size)
185
0
{
186
0
l_int32     i, j, n, hsize, len;
187
0
l_float32   minval;
188
0
l_float32  *fa, *fas, *fad;
189
0
NUMA       *nad;
190
191
0
    if (!nas)
192
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
193
0
    if (size <= 0)
194
0
        return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL);
195
0
    if ((size & 1) == 0 ) {
196
0
        L_WARNING("sel size must be odd; increasing by 1\n", __func__);
197
0
        size++;
198
0
    }
199
200
0
    if (size == 1)
201
0
        return numaCopy(nas);
202
203
        /* Make a source fa (fas) that has an added (size / 2) boundary
204
         * on left and right, contains a copy of nas in the interior region
205
         * (between 'hsize' and 'hsize + n'), and has large values
206
         * inserted in the boundary (because it is an erosion). */
207
0
    n = numaGetCount(nas);
208
0
    hsize = size / 2;
209
0
    len = n + 2 * hsize;
210
0
    if ((fas = (l_float32 *)LEPT_CALLOC(len, sizeof(l_float32))) == NULL)
211
0
        return (NUMA *)ERROR_PTR("fas not made", __func__, NULL);
212
0
    for (i = 0; i < hsize; i++)
213
0
         fas[i] = 1.0e37f;
214
0
    for (i = hsize + n; i < len; i++)
215
0
         fas[i] = 1.0e37f;
216
0
    fa = numaGetFArray(nas, L_NOCOPY);
217
0
    for (i = 0; i < n; i++)
218
0
         fas[hsize + i] = fa[i];
219
220
0
    nad = numaMakeConstant(0, n);
221
0
    numaCopyParameters(nad, nas);
222
0
    fad = numaGetFArray(nad, L_NOCOPY);
223
0
    for (i = 0; i < n; i++) {
224
0
        minval = 1.0e37f;  /* start big */
225
0
        for (j = 0; j < size; j++)
226
0
            minval = L_MIN(minval, fas[i + j]);
227
0
        fad[i] = minval;
228
0
    }
229
230
0
    LEPT_FREE(fas);
231
0
    return nad;
232
0
}
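/* Illustrative usage sketch (added; not part of numafunc2.c).  The
 * hypothetical helper below builds a short signal with a single spike
 * and applies the linear min/max filters defined in this file: erosion
 * removes the spike, dilation widens it. */
#if 0
static void
ExampleErodeDilate(void)
{
l_int32  i;
NUMA    *nas, *nae, *nad;

    nas = numaCreate(9);
    for (i = 0; i < 9; i++)
        numaAddNumber(nas, (i == 4) ? 10.0f : 1.0f);
    nae = numaErode(nas, 3);   /* min over each window: spike is removed */
    nad = numaDilate(nas, 3);  /* max over each window: spike is widened */
    numaDestroy(&nas);
    numaDestroy(&nae);
    numaDestroy(&nad);
}
#endif  /* usage sketch */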
233
234
235
/*!
236
 * \brief   numaDilate()
237
 *
238
 * \param[in]    nas
239
 * \param[in]    size   of sel; greater than 0, odd.  The origin
240
 *                      is implicitly in the center.
241
 * \return  nad dilated, or NULL on error
242
 *
243
 * <pre>
244
 * Notes:
245
 *      (1) The structuring element (sel) is linear, all "hits"
246
 *      (2) If size == 1, this returns a copy
247
 * </pre>
248
 */
249
NUMA *
250
numaDilate(NUMA    *nas,
251
           l_int32  size)
252
0
{
253
0
l_int32     i, j, n, hsize, len;
254
0
l_float32   maxval;
255
0
l_float32  *fa, *fas, *fad;
256
0
NUMA       *nad;
257
258
0
    if (!nas)
259
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
260
0
    if (size <= 0)
261
0
        return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL);
262
0
    if ((size & 1) == 0 ) {
263
0
        L_WARNING("sel size must be odd; increasing by 1\n", __func__);
264
0
        size++;
265
0
    }
266
267
0
    if (size == 1)
268
0
        return numaCopy(nas);
269
270
        /* Make a source fa (fas) that has an added (size / 2) boundary
271
         * on left and right, contains a copy of nas in the interior region
272
         * (between 'hsize' and 'hsize + n'), and has small values
273
         * inserted in the boundary (because it is a dilation). */
274
0
    n = numaGetCount(nas);
275
0
    hsize = size / 2;
276
0
    len = n + 2 * hsize;
277
0
    if ((fas = (l_float32 *)LEPT_CALLOC(len, sizeof(l_float32))) == NULL)
278
0
        return (NUMA *)ERROR_PTR("fas not made", __func__, NULL);
279
0
    for (i = 0; i < hsize; i++)
280
0
         fas[i] = -1.0e37f;
281
0
    for (i = hsize + n; i < len; i++)
282
0
         fas[i] = -1.0e37f;
283
0
    fa = numaGetFArray(nas, L_NOCOPY);
284
0
    for (i = 0; i < n; i++)
285
0
         fas[hsize + i] = fa[i];
286
287
0
    nad = numaMakeConstant(0, n);
288
0
    numaCopyParameters(nad, nas);
289
0
    fad = numaGetFArray(nad, L_NOCOPY);
290
0
    for (i = 0; i < n; i++) {
291
0
        maxval = -1.0e37f;  /* start small */
292
0
        for (j = 0; j < size; j++)
293
0
            maxval = L_MAX(maxval, fas[i + j]);
294
0
        fad[i] = maxval;
295
0
    }
296
297
0
    LEPT_FREE(fas);
298
0
    return nad;
299
0
}
300
301
302
/*!
303
 * \brief   numaOpen()
304
 *
305
 * \param[in]    nas
306
 * \param[in]    size   of sel; greater than 0, odd.  The origin
307
 *                      is implicitly in the center.
308
 * \return  nad opened, or NULL on error
309
 *
310
 * <pre>
311
 * Notes:
312
 *      (1) The structuring element (sel) is linear, all "hits"
313
 *      (2) If size == 1, this returns a copy
314
 * </pre>
315
 */
316
NUMA *
317
numaOpen(NUMA    *nas,
318
         l_int32  size)
319
0
{
320
0
NUMA  *nat, *nad;
321
322
0
    if (!nas)
323
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
324
0
    if (size <= 0)
325
0
        return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL);
326
0
    if ((size & 1) == 0 ) {
327
0
        L_WARNING("sel size must be odd; increasing by 1\n", __func__);
328
0
        size++;
329
0
    }
330
331
0
    if (size == 1)
332
0
        return numaCopy(nas);
333
334
0
    nat = numaErode(nas, size);
335
0
    nad = numaDilate(nat, size);
336
0
    numaDestroy(&nat);
337
0
    return nad;
338
0
}
339
340
341
/*!
342
 * \brief   numaClose()
343
 *
344
 * \param[in]    nas
345
 * \param[in]    size   of sel; greater than 0, odd.  The origin
346
 *                      is implicitly in the center.
347
 * \return  nad  closed, or NULL on error
348
 *
349
 * <pre>
350
 * Notes:
351
 *      (1) The structuring element (sel) is linear, all "hits"
352
 *      (2) If size == 1, this returns a copy
353
 *      (3) We add a border before doing this operation, for the same
354
 *          reason that we add a border to a pix before doing a safe closing.
355
 *          Without the border, a small component near the border gets
356
 *          clipped at the border on dilation, and can be entirely removed
357
 *          by the following erosion, violating the basic extensivity
358
 *          property of closing.
359
 * </pre>
360
 */
361
NUMA *
362
numaClose(NUMA    *nas,
363
          l_int32  size)
364
0
{
365
0
NUMA  *nab, *nat1, *nat2, *nad;
366
367
0
    if (!nas)
368
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
369
0
    if (size <= 0)
370
0
        return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL);
371
0
    if ((size & 1) == 0 ) {
372
0
        L_WARNING("sel size must be odd; increasing by 1\n", __func__);
373
0
        size++;
374
0
    }
375
376
0
    if (size == 1)
377
0
        return numaCopy(nas);
378
379
0
    nab = numaAddBorder(nas, size, size, 0);  /* to preserve extensivity */
380
0
    nat1 = numaDilate(nab, size);
381
0
    nat2 = numaErode(nat1, size);
382
0
    nad = numaRemoveBorder(nat2, size, size);
383
0
    numaDestroy(&nab);
384
0
    numaDestroy(&nat1);
385
0
    numaDestroy(&nat2);
386
0
    return nad;
387
0
}
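/* Illustrative usage sketch (added; not part of numafunc2.c).  Opening
 * suppresses narrow positive spikes and closing fills narrow dips, while
 * both leave features wider than the sel approximately unchanged; the
 * helper below is hypothetical. */
#if 0
static void
ExampleOpenClose(NUMA  *nas)
{
NUMA  *nao, *nac;

    nao = numaOpen(nas, 5);   /* erode, then dilate */
    nac = numaClose(nas, 5);  /* dilate, then erode, with added border */
    numaDestroy(&nao);
    numaDestroy(&nac);
}
#endif  /* usage sketch */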
388
389
390
/*----------------------------------------------------------------------*
391
 *                            Other transforms                          *
392
 *----------------------------------------------------------------------*/
393
/*!
394
 * \brief   numaTransform()
395
 *
396
 * \param[in]    nas
397
 * \param[in]    shift    add this to each number
398
 * \param[in]    scale    multiply each number by this
399
 * \return  nad with all values shifted and scaled, or NULL on error
400
 *
401
 * <pre>
402
 * Notes:
403
 *      (1) Each number is shifted before scaling.
404
 * </pre>
405
 */
406
NUMA *
407
numaTransform(NUMA      *nas,
408
              l_float32  shift,
409
              l_float32  scale)
410
0
{
411
0
l_int32    i, n;
412
0
l_float32  val;
413
0
NUMA      *nad;
414
415
0
    if (!nas)
416
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
417
0
    n = numaGetCount(nas);
418
0
    if ((nad = numaCreate(n)) == NULL)
419
0
        return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
420
0
    numaCopyParameters(nad, nas);
421
0
    for (i = 0; i < n; i++) {
422
0
        numaGetFValue(nas, i, &val);
423
0
        val = scale * (val + shift);
424
0
        numaAddNumber(nad, val);
425
0
    }
426
0
    return nad;
427
0
}
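/* Illustrative usage sketch (added; not part of numafunc2.c), showing the
 * shift-before-scale order of numaTransform(). */
#if 0
static void
ExampleTransform(NUMA  *nas)
{
NUMA  *nad;

        /* Each element x becomes 2.0 * (x + 3.0): shift first, then scale */
    nad = numaTransform(nas, 3.0f, 2.0f);
    numaDestroy(&nad);
}
#endif  /* usage sketch */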
428
429
430
/*!
431
 * \brief   numaSimpleStats()
432
 *
433
 * \param[in]    na       input numa
434
 * \param[in]    first    first element to use
435
 * \param[in]    last     last element to use; -1 to go to the end
436
 * \param[out]   pmean    [optional] mean value
437
 * \param[out]   pvar     [optional] variance
438
 * \param[out]   prvar    [optional] rms deviation from the mean
439
 * \return  0 if OK, 1 on error
440
 */
441
l_ok
442
numaSimpleStats(NUMA       *na,
443
                l_int32     first,
444
                l_int32     last,
445
                l_float32  *pmean,
446
                l_float32  *pvar,
447
                l_float32  *prvar)
448
0
{
449
0
l_int32    i, n, ni;
450
0
l_float32  sum, sumsq, val, mean, var;
451
452
0
    if (pmean) *pmean = 0.0;
453
0
    if (pvar) *pvar = 0.0;
454
0
    if (prvar) *prvar = 0.0;
455
0
    if (!pmean && !pvar && !prvar)
456
0
        return ERROR_INT("nothing requested", __func__, 1);
457
0
    if (!na)
458
0
        return ERROR_INT("na not defined", __func__, 1);
459
0
    if ((n = numaGetCount(na)) == 0)
460
0
        return ERROR_INT("na is empty", __func__, 1);
461
0
    first = L_MAX(0, first);
462
0
    if (last < 0) last = n - 1;
463
0
    if (first >= n)
464
0
        return ERROR_INT("invalid first", __func__, 1);
465
0
    if (last >= n) {
466
0
        L_WARNING("last = %d is beyond max index = %d; adjusting\n",
467
0
                  __func__, last, n - 1);
468
0
        last = n - 1;
469
0
    }
470
0
    if (first > last)
471
0
        return ERROR_INT("first > last\n", __func__, 1);
472
0
    ni = last - first + 1;
473
0
    sum = sumsq = 0.0;
474
0
    for (i = first; i <= last; i++) {
475
0
        numaGetFValue(na, i, &val);
476
0
        sum += val;
477
0
        sumsq += val * val;
478
0
    }
479
480
0
    mean = sum / ni;
481
0
    if (pmean)
482
0
        *pmean = mean;
483
0
    if (pvar || prvar) {
484
0
        var = sumsq / ni - mean * mean;
485
0
        if (pvar) *pvar = var;
486
0
        if (prvar) *prvar = sqrtf(var);
487
0
    }
488
489
0
    return 0;
490
0
}
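/* Illustrative usage sketch (added; not part of numafunc2.c): statistics
 * over the full array, using first = 0 and last = -1. */
#if 0
static void
ExampleSimpleStats(NUMA  *na)
{
l_float32  mean, var, rvar;

    if (numaSimpleStats(na, 0, -1, &mean, &var, &rvar) == 0)
        lept_stderr("mean = %f, var = %f, stdev = %f\n", mean, var, rvar);
}
#endif  /* usage sketch */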
491
492
493
/*!
494
 * \brief   numaWindowedStats()
495
 *
496
 * \param[in]    nas     input numa
497
 * \param[in]    wc      half width of the window
498
 * \param[out]   pnam    [optional] mean value in window
499
 * \param[out]   pnams   [optional] mean square value in window
500
 * \param[out]   pnav    [optional] variance in window
501
 * \param[out]   pnarv   [optional] rms deviation from the mean
502
 * \return  0 if OK, 1 on error
503
 *
504
 * <pre>
505
 * Notes:
506
 *      (1) This is a high-level convenience function for calculating
507
 *          any or all of these derived arrays.
508
 *      (2) These statistical measures over the values in the
509
 *          rectangular window are:
510
 *            ~ average value: [x]  (nam)
511
 *            ~ average squared value: [x*x] (nams)
512
 *            ~ variance: [(x - [x])*(x - [x])] = [x*x] - [x]*[x]  (nav)
513
 *            ~ square-root of variance: (narv)
514
 *          where the brackets [ .. ] indicate that the average value is
515
 *          to be taken over the window.
516
 *      (3) Note that the variance is just the mean square difference from
517
 *          the mean value; and the square root of the variance is the
518
 *          root mean square difference from the mean, sometimes also
519
 *          called the 'standard deviation'.
520
 *      (4) Internally, use mirrored borders to handle values near the
521
 *          end of each array.
522
 * </pre>
523
 */
524
l_ok
525
numaWindowedStats(NUMA    *nas,
526
                  l_int32  wc,
527
                  NUMA   **pnam,
528
                  NUMA   **pnams,
529
                  NUMA   **pnav,
530
                  NUMA   **pnarv)
531
0
{
532
0
NUMA  *nam, *nams;
533
534
0
    if (!nas)
535
0
        return ERROR_INT("nas not defined", __func__, 1);
536
0
    if (2 * wc + 1 > numaGetCount(nas))
537
0
        L_WARNING("filter wider than input array!\n", __func__);
538
539
0
    if (!pnav && !pnarv) {
540
0
        if (pnam) *pnam = numaWindowedMean(nas, wc);
541
0
        if (pnams) *pnams = numaWindowedMeanSquare(nas, wc);
542
0
        return 0;
543
0
    }
544
545
0
    nam = numaWindowedMean(nas, wc);
546
0
    nams = numaWindowedMeanSquare(nas, wc);
547
0
    numaWindowedVariance(nam, nams, pnav, pnarv);
548
0
    if (pnam)
549
0
        *pnam = nam;
550
0
    else
551
0
        numaDestroy(&nam);
552
0
    if (pnams)
553
0
        *pnams = nams;
554
0
    else
555
0
        numaDestroy(&nams);
556
0
    return 0;
557
0
}
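/* Illustrative usage sketch (added; not part of numafunc2.c).  Only the
 * windowed mean and variance are requested here; the variance satisfies
 * nav[i] = [x*x] - [x]*[x] over each window of width 2 * wc + 1 = 7. */
#if 0
static void
ExampleWindowedStats(NUMA  *nas)
{
NUMA  *nam, *nav;

    numaWindowedStats(nas, 3, &nam, NULL, &nav, NULL);
    numaDestroy(&nam);
    numaDestroy(&nav);
}
#endif  /* usage sketch */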
558
559
560
/*!
561
 * \brief   numaWindowedMean()
562
 *
563
 * \param[in]    nas
564
 * \param[in]    wc    half width of the convolution window
565
 * \return  nad after low-pass filtering, or NULL on error
566
 *
567
 * <pre>
568
 * Notes:
569
 *      (1) This is a convolution.  The window has width = 2 * %wc + 1.
570
 *      (2) We add a mirrored border of size %wc to each end of the array.
571
 * </pre>
572
 */
573
NUMA *
574
numaWindowedMean(NUMA    *nas,
575
                 l_int32  wc)
576
0
{
577
0
l_int32     i, n, n1, width;
578
0
l_float32   sum, norm;
579
0
l_float32  *fa1, *fad, *suma;
580
0
NUMA       *na1, *nad;
581
582
0
    if (!nas)
583
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
584
0
    n = numaGetCount(nas);
585
0
    width = 2 * wc + 1;  /* filter width */
586
0
    if (width > n)
587
0
        L_WARNING("filter wider than input array!\n", __func__);
588
589
0
    na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER);
590
0
    n1 = n + 2 * wc;
591
0
    fa1 = numaGetFArray(na1, L_NOCOPY);
592
0
    nad = numaMakeConstant(0, n);
593
0
    fad = numaGetFArray(nad, L_NOCOPY);
594
595
        /* Make sum array; note the indexing */
596
0
    if ((suma = (l_float32 *)LEPT_CALLOC(n1 + 1, sizeof(l_float32))) == NULL) {
597
0
        numaDestroy(&na1);
598
0
        numaDestroy(&nad);
599
0
        return (NUMA *)ERROR_PTR("suma not made", __func__, NULL);
600
0
    }
601
0
    sum = 0.0;
602
0
    suma[0] = 0.0;
603
0
    for (i = 0; i < n1; i++) {
604
0
        sum += fa1[i];
605
0
        suma[i + 1] = sum;
606
0
    }
607
608
0
    norm = 1.f / (2 * wc + 1);
609
0
    for (i = 0; i < n; i++)
610
0
        fad[i] = norm * (suma[width + i] - suma[i]);
611
612
0
    LEPT_FREE(suma);
613
0
    numaDestroy(&na1);
614
0
    return nad;
615
0
}
616
617
618
/*!
619
 * \brief   numaWindowedMeanSquare()
620
 *
621
 * \param[in]    nas
622
 * \param[in]    wc    half width of the window
623
 * \return  nad containing windowed mean square values, or NULL on error
624
 *
625
 * <pre>
626
 * Notes:
627
 *      (1) The window has width = 2 * %wc + 1.
628
 *      (2) We add a mirrored border of size %wc to each end of the array.
629
 * </pre>
630
 */
631
NUMA *
632
numaWindowedMeanSquare(NUMA    *nas,
633
                       l_int32  wc)
634
0
{
635
0
l_int32     i, n, n1, width;
636
0
l_float32   sum, norm;
637
0
l_float32  *fa1, *fad, *suma;
638
0
NUMA       *na1, *nad;
639
640
0
    if (!nas)
641
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
642
0
    n = numaGetCount(nas);
643
0
    width = 2 * wc + 1;  /* filter width */
644
0
    if (width > n)
645
0
        L_WARNING("filter wider than input array!\n", __func__);
646
647
0
    na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER);
648
0
    n1 = n + 2 * wc;
649
0
    fa1 = numaGetFArray(na1, L_NOCOPY);
650
0
    nad = numaMakeConstant(0, n);
651
0
    fad = numaGetFArray(nad, L_NOCOPY);
652
653
        /* Make sum array; note the indexing */
654
0
    if ((suma = (l_float32 *)LEPT_CALLOC(n1 + 1, sizeof(l_float32))) == NULL) {
655
0
        numaDestroy(&na1);
656
0
        numaDestroy(&nad);
657
0
        return (NUMA *)ERROR_PTR("suma not made", __func__, NULL);
658
0
    }
659
0
    sum = 0.0;
660
0
    suma[0] = 0.0;
661
0
    for (i = 0; i < n1; i++) {
662
0
        sum += fa1[i] * fa1[i];
663
0
        suma[i + 1] = sum;
664
0
    }
665
666
0
    norm = 1.f / (2 * wc + 1);
667
0
    for (i = 0; i < n; i++)
668
0
        fad[i] = norm * (suma[width + i] - suma[i]);
669
670
0
    LEPT_FREE(suma);
671
0
    numaDestroy(&na1);
672
0
    return nad;
673
0
}
674
675
676
/*!
677
 * \brief   numaWindowedVariance()
678
 *
679
 * \param[in]    nam    windowed mean values
680
 * \param[in]    nams   windowed mean square values
681
 * \param[out]   pnav   [optional] numa of variance -- the ms deviation
682
 *                      from the mean
683
 * \param[out]   pnarv  [optional] numa of rms deviation from the mean
684
 * \return  0 if OK, 1 on error
685
 *
686
 * <pre>
687
 * Notes:
688
 *      (1) The numas of windowed mean and mean square are precomputed,
689
 *          using numaWindowedMean() and numaWindowedMeanSquare().
690
 *      (2) Either or both of the variance and square-root of variance
691
 *          are returned, where the variance is the average over the
692
 *          window of the mean square difference of the pixel value
693
 *          from the mean:
694
 *                [(x - [x])*(x - [x])] = [x*x] - [x]*[x]
695
 * </pre>
696
 */
697
l_ok
698
numaWindowedVariance(NUMA   *nam,
699
                     NUMA   *nams,
700
                     NUMA  **pnav,
701
                     NUMA  **pnarv)
702
0
{
703
0
l_int32     i, nm, nms;
704
0
l_float32   var;
705
0
l_float32  *fam, *fams, *fav = NULL, *farv = NULL;
706
0
NUMA       *nav, *narv;  /* variance and square root of variance */
707
708
0
    if (pnav) *pnav = NULL;
709
0
    if (pnarv) *pnarv = NULL;
710
0
    if (!pnav && !pnarv)
711
0
        return ERROR_INT("neither &nav nor &narv are defined", __func__, 1);
712
0
    if (!nam)
713
0
        return ERROR_INT("nam not defined", __func__, 1);
714
0
    if (!nams)
715
0
        return ERROR_INT("nams not defined", __func__, 1);
716
0
    nm = numaGetCount(nam);
717
0
    nms = numaGetCount(nams);
718
0
    if (nm != nms)
719
0
        return ERROR_INT("sizes of nam and nams differ", __func__, 1);
720
721
0
    if (pnav) {
722
0
        nav = numaMakeConstant(0, nm);
723
0
        *pnav = nav;
724
0
        fav = numaGetFArray(nav, L_NOCOPY);
725
0
    }
726
0
    if (pnarv) {
727
0
        narv = numaMakeConstant(0, nm);
728
0
        *pnarv = narv;
729
0
        farv = numaGetFArray(narv, L_NOCOPY);
730
0
    }
731
0
    fam = numaGetFArray(nam, L_NOCOPY);
732
0
    fams = numaGetFArray(nams, L_NOCOPY);
733
734
0
    for (i = 0; i < nm; i++) {
735
0
        var = fams[i] - fam[i] * fam[i];
736
0
        if (pnav)
737
0
            fav[i] = var;
738
0
        if (pnarv)
739
0
            farv[i] = sqrtf(var);
740
0
    }
741
742
0
    return 0;
743
0
}
744
745
746
/*!
747
 * \brief   numaWindowedMedian()
748
 *
749
 * \param[in]    nas
750
 * \param[in]    halfwin   half width of window over which the median is found
751
 * \return  nad after windowed median filtering, or NULL on error
752
 *
753
 * <pre>
754
 * Notes:
755
 *      (1) The requested window has width = 2 * %halfwin + 1.
756
 *      (2) If the input nas has fewer than 3 elements, return a copy.
757
 *      (3) If the filter is too small (%halfwin <= 0), return a copy.
758
 *      (4) If the filter is too large, it is reduced in size.
759
 *      (5) We add a mirrored border of size %halfwin to each end of
760
 *          the array to simplify the calculation by avoiding end-effects.
761
 * </pre>
762
 */
763
NUMA *
764
numaWindowedMedian(NUMA    *nas,
765
                   l_int32  halfwin)
766
0
{
767
0
l_int32    i, n;
768
0
l_float32  medval;
769
0
NUMA      *na1, *na2, *nad;
770
771
0
    if (!nas)
772
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
773
0
    if ((n = numaGetCount(nas)) < 3)
774
0
        return numaCopy(nas);
775
0
    if (halfwin <= 0) {
776
0
        L_ERROR("filter too small; returning a copy\n", __func__);
777
0
        return numaCopy(nas);
778
0
    }
779
780
0
    if (halfwin > (n - 1) / 2) {
781
0
        halfwin = (n - 1) / 2;
782
0
        L_INFO("reducing filter to halfwin = %d\n", __func__, halfwin);
783
0
    }
784
785
        /* Add a border to both ends */
786
0
    na1 = numaAddSpecifiedBorder(nas, halfwin, halfwin, L_MIRRORED_BORDER);
787
788
        /* Get the median value at the center of each window, corresponding
789
         * to locations in the input nas. */
790
0
    nad = numaCreate(n);
791
0
    for (i = 0; i < n; i++) {
792
0
        na2 = numaClipToInterval(na1, i, i + 2 * halfwin);
793
0
        numaGetMedian(na2, &medval);
794
0
        numaAddNumber(nad, medval);
795
0
        numaDestroy(&na2);
796
0
    }
797
798
0
    numaDestroy(&na1);
799
0
    return nad;
800
0
}
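/* Illustrative usage sketch (added; not part of numafunc2.c).  A windowed
 * median of width 2 * 2 + 1 = 5 removes isolated outliers while preserving
 * step edges better than the windowed mean. */
#if 0
static void
ExampleWindowedMedian(NUMA  *nas)
{
NUMA  *nad;

    nad = numaWindowedMedian(nas, 2);
    numaDestroy(&nad);
}
#endif  /* usage sketch */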
801
802
803
/*!
804
 * \brief   numaConvertToInt()
805
 *
806
 * \param[in]    nas   source numa
807
 * \return  na with all values rounded to nearest integer, or
808
 *              NULL on error
809
 */
810
NUMA *
811
numaConvertToInt(NUMA  *nas)
812
0
{
813
0
l_int32  i, n, ival;
814
0
NUMA    *nad;
815
816
0
    if (!nas)
817
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
818
819
0
    n = numaGetCount(nas);
820
0
    if ((nad = numaCreate(n)) == NULL)
821
0
        return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
822
0
    numaCopyParameters(nad, nas);
823
0
    for (i = 0; i < n; i++) {
824
0
        numaGetIValue(nas, i, &ival);
825
0
        numaAddNumber(nad, ival);
826
0
    }
827
0
    return nad;
828
0
}
829
830
831
/*----------------------------------------------------------------------*
832
 *                 Histogram generation and statistics                  *
833
 *----------------------------------------------------------------------*/
834
/*!
835
 * \brief   numaMakeHistogram()
836
 *
837
 * \param[in]    na
838
 * \param[in]    maxbins    max number of histogram bins
839
 * \param[out]   pbinsize   [optional] size of histogram bins
840
 * \param[out]   pbinstart  [optional] start val of minimum bin;
841
 *                          input NULL to force start at 0
842
 * \return  na consisting of histogram of integerized values,
843
 *              or NULL on error.
844
 *
845
 * <pre>
846
 * Notes:
847
 *      (1) This simple interface is designed for integer data.
848
 *          The bins are of integer width and start on integer boundaries,
849
 *          so the results on float data will not have high precision.
850
 *      (2) Specify the max number of input bins.   Then %binsize,
851
 *          the size of bins necessary to accommodate the input data,
852
 *          is returned.  It is optionally returned and one of the sequence:
853
 *                {1, 2, 5, 10, 20, 50, ...}.
854
 *      (3) If &binstart is given, all values are accommodated,
855
 *          and the min value of the starting bin is returned.
856
 *          Otherwise, all negative values are discarded and
857
 *          the histogram bins start at 0.
858
 * </pre>
859
 */
860
NUMA *
861
numaMakeHistogram(NUMA     *na,
862
                  l_int32   maxbins,
863
                  l_int32  *pbinsize,
864
                  l_int32  *pbinstart)
865
0
{
866
0
l_int32    i, n, ival, hval;
867
0
l_int32    iminval, imaxval, range, binsize, nbins, ibin;
868
0
l_float32  val, ratio;
869
0
NUMA      *nai, *nahist;
870
871
0
    if (pbinsize) *pbinsize = 0;
872
0
    if (pbinstart) *pbinstart = 0;
873
0
    if (!na)
874
0
        return (NUMA *)ERROR_PTR("na not defined", __func__, NULL);
875
0
    if (maxbins < 1)
876
0
        return (NUMA *)ERROR_PTR("maxbins < 1", __func__, NULL);
877
878
        /* Determine input range */
879
0
    numaGetMin(na, &val, NULL);
880
0
    iminval = (l_int32)(val + 0.5);
881
0
    numaGetMax(na, &val, NULL);
882
0
    imaxval = (l_int32)(val + 0.5);
883
0
    if (pbinstart == NULL) {  /* clip negative vals; start from 0 */
884
0
        iminval = 0;
885
0
        if (imaxval < 0)
886
0
            return (NUMA *)ERROR_PTR("all values < 0", __func__, NULL);
887
0
    }
888
889
        /* Determine binsize */
890
0
    range = imaxval - iminval + 1;
891
0
    if (range > maxbins - 1) {
892
0
        ratio = (l_float32)range / (l_float32)maxbins;
893
0
        binsize = 0;
894
0
        for (i = 0; i < NBinSizes; i++) {
895
0
            if (ratio < BinSizeArray[i]) {
896
0
                binsize = BinSizeArray[i];
897
0
                break;
898
0
            }
899
0
        }
900
0
        if (binsize == 0)
901
0
            return (NUMA *)ERROR_PTR("numbers too large", __func__, NULL);
902
0
    } else {
903
0
        binsize = 1;
904
0
    }
905
0
    if (pbinsize) *pbinsize = binsize;
906
0
    nbins = 1 + range / binsize;  /* +1 seems to be sufficient */
907
908
        /* Redetermine iminval */
909
0
    if (pbinstart && binsize > 1) {
910
0
        if (iminval >= 0)
911
0
            iminval = binsize * (iminval / binsize);
912
0
        else
913
0
            iminval = binsize * ((iminval - binsize + 1) / binsize);
914
0
    }
915
0
    if (pbinstart) *pbinstart = iminval;
916
917
#if  DEBUG_HISTO
918
    lept_stderr(" imaxval = %d, range = %d, nbins = %d\n",
919
                imaxval, range, nbins);
920
#endif  /* DEBUG_HISTO */
921
922
        /* Use integerized data for input */
923
0
    if ((nai = numaConvertToInt(na)) == NULL)
924
0
        return (NUMA *)ERROR_PTR("nai not made", __func__, NULL);
925
0
    n = numaGetCount(nai);
926
927
        /* Make histogram, converting value in input array
928
         * into a bin number for this histogram array. */
929
0
    if ((nahist = numaCreate(nbins)) == NULL) {
930
0
        numaDestroy(&nai);
931
0
        return (NUMA *)ERROR_PTR("nahist not made", __func__, NULL);
932
0
    }
933
0
    numaSetCount(nahist, nbins);
934
0
    numaSetParameters(nahist, iminval, binsize);
935
0
    for (i = 0; i < n; i++) {
936
0
        numaGetIValue(nai, i, &ival);
937
0
        ibin = (ival - iminval) / binsize;
938
0
        if (ibin >= 0 && ibin < nbins) {
939
0
            numaGetIValue(nahist, ibin, &hval);
940
0
            numaSetValue(nahist, ibin, hval + 1.0f);
941
0
        }
942
0
    }
943
944
0
    numaDestroy(&nai);
945
0
    return nahist;
946
0
}
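/* Illustrative usage sketch (added; not part of numafunc2.c).  With
 * &binstart given, negative values are kept; the returned binsize is one
 * of the sequence {1, 2, 5, 10, 20, 50, ...}. */
#if 0
static void
ExampleMakeHistogram(NUMA  *na)
{
l_int32  binsize, binstart;
NUMA    *nahist;

    nahist = numaMakeHistogram(na, 256, &binsize, &binstart);
    if (nahist)
        lept_stderr("binsize = %d, binstart = %d\n", binsize, binstart);
    numaDestroy(&nahist);
}
#endif  /* usage sketch */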
947
948
949
/*!
950
 * \brief   numaMakeHistogramAuto()
951
 *
952
 * \param[in]    na       numa of floats; these may be integers
953
 * \param[in]    maxbins  max number of histogram bins; >= 1
954
 * \return  na consisting of histogram of quantized float values,
955
 *              or NULL on error.
956
 *
957
 * <pre>
958
 * Notes:
959
 *      (1) This simple interface is designed for accurate binning
960
 *          of both integer and float data.
961
 *      (2) If the array data is integers, and the range of integers
962
 *          is smaller than %maxbins, they are binned as they fall,
963
 *          with binsize = 1.
964
 *      (3) If the range of data, (maxval - minval), is larger than
965
 *          %maxbins, or if the data is floats, they are binned into
966
 *          exactly %maxbins bins.
967
 *      (4) Unlike numaMakeHistogram(), these bins in general have
968
 *          non-integer location and width, even for integer data.
969
 * </pre>
970
 */
971
NUMA *
972
numaMakeHistogramAuto(NUMA    *na,
973
                      l_int32  maxbins)
974
0
{
975
0
l_int32    i, n, imin, imax, irange, ibin, ival, allints;
976
0
l_float32  minval, maxval, range, binsize, fval;
977
0
NUMA      *nah;
978
979
0
    if (!na)
980
0
        return (NUMA *)ERROR_PTR("na not defined", __func__, NULL);
981
0
    maxbins = L_MAX(1, maxbins);
982
983
        /* Determine input range */
984
0
    numaGetMin(na, &minval, NULL);
985
0
    numaGetMax(na, &maxval, NULL);
986
987
        /* Determine if values are all integers */
988
0
    n = numaGetCount(na);
989
0
    numaHasOnlyIntegers(na, &allints);
990
991
        /* Do simple integer binning if possible */
992
0
    if (allints && (maxval - minval < maxbins)) {
993
0
        imin = (l_int32)minval;
994
0
        imax = (l_int32)maxval;
995
0
        irange = imax - imin + 1;
996
0
        nah = numaCreate(irange);
997
0
        numaSetCount(nah, irange);  /* init */
998
0
        numaSetParameters(nah, minval, 1.0);
999
0
        for (i = 0; i < n; i++) {
1000
0
            numaGetIValue(na, i, &ival);
1001
0
            ibin = ival - imin;
1002
0
            numaGetIValue(nah, ibin, &ival);
1003
0
            numaSetValue(nah, ibin, ival + 1.0f);
1004
0
        }
1005
1006
0
        return nah;
1007
0
    }
1008
1009
        /* Do float binning, even if the data is integers. */
1010
0
    range = maxval - minval;
1011
0
    binsize = range / (l_float32)maxbins;
1012
0
    if (range == 0.0) {
1013
0
        nah = numaCreate(1);
1014
0
        numaSetParameters(nah, minval, binsize);
1015
0
        numaAddNumber(nah, n);
1016
0
        return nah;
1017
0
    }
1018
0
    nah = numaCreate(maxbins);
1019
0
    numaSetCount(nah, maxbins);
1020
0
    numaSetParameters(nah, minval, binsize);
1021
0
    for (i = 0; i < n; i++) {
1022
0
        numaGetFValue(na, i, &fval);
1023
0
        ibin = (l_int32)((fval - minval) / binsize);
1024
0
        ibin = L_MIN(ibin, maxbins - 1);  /* "edge" case; stay in bounds */
1025
0
        numaGetIValue(nah, ibin, &ival);
1026
0
        numaSetValue(nah, ibin, ival + 1.0f);
1027
0
    }
1028
1029
0
    return nah;
1030
0
}
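/* Illustrative usage sketch (added; not part of numafunc2.c).  Integer
 * data with a small range is binned with binsize = 1; float data or a
 * wide range is binned into exactly maxbins (here 100) bins of
 * non-integer width. */
#if 0
static void
ExampleMakeHistogramAuto(NUMA  *na)
{
l_float32  startx, binsize;
NUMA      *nah;

    if ((nah = numaMakeHistogramAuto(na, 100)) == NULL)
        return;
    numaGetParameters(nah, &startx, &binsize);
    lept_stderr("startx = %f, binsize = %f\n", startx, binsize);
    numaDestroy(&nah);
}
#endif  /* usage sketch */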
1031
1032
1033
/*!
1034
 * \brief   numaMakeHistogramClipped()
1035
 *
1036
 * \param[in]    na
1037
 * \param[in]    binsize    typically 1.0
1038
 * \param[in]    maxsize    of histogram ordinate
1039
 * \return  na histogram of bins of size %binsize, starting with
1040
 *                  the na[0] (x = 0.0 and going up to a maximum of
1041
 *                  x = %maxsize, by increments of %binsize), or NULL on error
1042
 *
1043
 * <pre>
1044
 * Notes:
1045
 *      (1) This simple function generates a histogram of values
1046
 *          from na, discarding all values < 0.0 or greater than
1047
 *          min(%maxsize, maxval), where maxval is the maximum value in na.
1048
 *          The histogram data is put in bins of size delx = %binsize,
1049
 *          starting at x = 0.0.  We use as many bins as are
1050
 *          needed to hold the data.
1051
 * </pre>
1052
 */
1053
NUMA *
1054
numaMakeHistogramClipped(NUMA      *na,
1055
                         l_float32  binsize,
1056
                         l_float32  maxsize)
1057
0
{
1058
0
l_int32    i, n, nbins, ival, ibin;
1059
0
l_float32  val, maxval;
1060
0
NUMA      *nad;
1061
1062
0
    if (!na)
1063
0
        return (NUMA *)ERROR_PTR("na not defined", __func__, NULL);
1064
0
    if (binsize <= 0.0)
1065
0
        return (NUMA *)ERROR_PTR("binsize must be > 0.0", __func__, NULL);
1066
0
    if (binsize > maxsize)
1067
0
        binsize = maxsize;  /* just one bin */
1068
1069
0
    numaGetMax(na, &maxval, NULL);
1070
0
    n = numaGetCount(na);
1071
0
    maxsize = L_MIN(maxsize, maxval);
1072
0
    nbins = (l_int32)(maxsize / binsize) + 1;
1073
1074
/*    lept_stderr("maxsize = %7.3f, nbins = %d\n", maxsize, nbins); */
1075
1076
0
    if ((nad = numaCreate(nbins)) == NULL)
1077
0
        return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
1078
0
    numaSetParameters(nad, 0.0, binsize);
1079
0
    numaSetCount(nad, nbins);  /* interpret zeroes in bins as data */
1080
0
    for (i = 0; i < n; i++) {
1081
0
        numaGetFValue(na, i, &val);
1082
0
        ibin = (l_int32)(val / binsize);
1083
0
        if (ibin >= 0 && ibin < nbins) {
1084
0
            numaGetIValue(nad, ibin, &ival);
1085
0
            numaSetValue(nad, ibin, ival + 1.0f);
1086
0
        }
1087
0
    }
1088
1089
0
    return nad;
1090
0
}
1091
1092
1093
/*!
1094
 * \brief   numaRebinHistogram()
1095
 *
1096
 * \param[in]    nas      input histogram
1097
 * \param[in]    newsize  number of old bins contained in each new bin
1098
 * \return  nad more coarsely re-binned histogram, or NULL on error
1099
 */
1100
NUMA *
1101
numaRebinHistogram(NUMA    *nas,
1102
                   l_int32  newsize)
1103
0
{
1104
0
l_int32    i, j, ns, nd, index, count, val;
1105
0
l_float32  start, oldsize;
1106
0
NUMA      *nad;
1107
1108
0
    if (!nas)
1109
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
1110
0
    if (newsize <= 1)
1111
0
        return (NUMA *)ERROR_PTR("newsize must be > 1", __func__, NULL);
1112
0
    if ((ns = numaGetCount(nas)) == 0)
1113
0
        return (NUMA *)ERROR_PTR("no bins in nas", __func__, NULL);
1114
1115
0
    nd = (ns + newsize - 1) / newsize;
1116
0
    if ((nad = numaCreate(nd)) == NULL)
1117
0
        return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
1118
0
    numaGetParameters(nad, &start, &oldsize);
1119
0
    numaSetParameters(nad, start, oldsize * newsize);
1120
1121
0
    for (i = 0; i < nd; i++) {  /* new bins */
1122
0
        count = 0;
1123
0
        index = i * newsize;
1124
0
        for (j = 0; j < newsize; j++) {
1125
0
            if (index < ns) {
1126
0
                numaGetIValue(nas, index, &val);
1127
0
                count += val;
1128
0
                index++;
1129
0
            }
1130
0
        }
1131
0
        numaAddNumber(nad, count);
1132
0
    }
1133
1134
0
    return nad;
1135
0
}
1136
1137
1138
/*!
1139
 * \brief   numaNormalizeHistogram()
1140
 *
1141
 * \param[in]    nas   input histogram
1142
 * \param[in]    tsum  target sum of all numbers in dest histogram; e.g., use
1143
 *                     %tsum = 1.0 if this represents a probability distribution
1144
 * \return  nad normalized histogram, or NULL on error
1145
 */
1146
NUMA *
1147
numaNormalizeHistogram(NUMA      *nas,
1148
                       l_float32  tsum)
1149
0
{
1150
0
l_int32    i, ns;
1151
0
l_float32  sum, factor, fval;
1152
0
NUMA      *nad;
1153
1154
0
    if (!nas)
1155
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
1156
0
    if (tsum <= 0.0)
1157
0
        return (NUMA *)ERROR_PTR("tsum must be > 0.0", __func__, NULL);
1158
0
    if ((ns = numaGetCount(nas)) == 0)
1159
0
        return (NUMA *)ERROR_PTR("no bins in nas", __func__, NULL);
1160
1161
0
    numaGetSum(nas, &sum);
1162
0
    factor = tsum / sum;
1163
1164
0
    if ((nad = numaCreate(ns)) == NULL)
1165
0
        return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
1166
0
    numaCopyParameters(nad, nas);
1167
1168
0
    for (i = 0; i < ns; i++) {
1169
0
        numaGetFValue(nas, i, &fval);
1170
0
        fval *= factor;
1171
0
        numaAddNumber(nad, fval);
1172
0
    }
1173
1174
0
    return nad;
1175
0
}
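/* Illustrative usage sketch (added; not part of numafunc2.c).  With
 * tsum = 1.0 the result is a discrete probability distribution: the
 * bin values sum to 1.0. */
#if 0
static void
ExampleNormalizeHistogram(NUMA  *nahist)
{
NUMA  *naprob;

    naprob = numaNormalizeHistogram(nahist, 1.0);
    numaDestroy(&naprob);
}
#endif  /* usage sketch */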
1176
1177
1178
/*!
1179
 * \brief   numaGetStatsUsingHistogram()
1180
 *
1181
 * \param[in]    na        an arbitrary set of numbers; not ordered and not
1182
 *                         a histogram
1183
 * \param[in]    maxbins   the maximum number of bins to be allowed in
1184
 *                         the histogram; use an integer larger than the
1185
 *                         largest number in %na for consecutive integer bins
1186
 * \param[out]   pmin      [optional] min value of set
1187
 * \param[out]   pmax      [optional] max value of set
1188
 * \param[out]   pmean     [optional] mean value of set
1189
 * \param[out]   pvariance [optional] variance
1190
 * \param[out]   pmedian   [optional] median value of set
1191
 * \param[in]    rank      in [0.0 ... 1.0]; median has a rank 0.5;
1192
 *                         ignored if &rval == NULL
1193
 * \param[out]   prval     [optional] value in na corresponding to %rank
1194
 * \param[out]   phisto    [optional] Numa histogram; use NULL to prevent
1195
 * \return  0 if OK, 1 on error
1196
 *
1197
 * <pre>
1198
 * Notes:
1199
 *      (1) This is a simple interface for gathering statistics
1200
 *          from a numa, where a histogram is used 'under the covers'
1201
 *          to avoid sorting if a rank value is requested.  In that case,
1202
 *          by using a histogram we are trading speed for accuracy, because
1203
 *          the values in %na are quantized to the center of a set of bins.
1204
 *      (2) If the median, other rank value, or histogram are not requested,
1205
 *          the calculation is all performed on the input Numa.
1206
 *      (3) The variance is the average of the square of the
1207
 *          difference from the mean.  The median is the value in na
1208
 *          with rank 0.5.
1209
 *      (4) There are two situations where this gives rank results with
1210
 *          accuracy comparable to computing stastics directly on the input
1211
 *          data, without binning into a histogram:
1212
 *           (a) the data is integers and the range of data is less than
1213
 *               %maxbins, and
1214
 *           (b) the data is floats and the range is small compared to
1215
 *               %maxbins, so that the binsize is much less than 1.
1216
 *      (5) If a histogram is used and the numbers in the Numa extend
1217
 *          over a large range, you can limit the required storage by
1218
 *          specifying the maximum number of bins in the histogram.
1219
 *          Use %maxbins == 0 to force the bin size to be 1.
1220
 *      (6) This optionally returns the median and one arbitrary rank value.
1221
 *          If you need several rank values, return the histogram and use
1222
 *               numaHistogramGetValFromRank(nah, rank, &rval)
1223
 *          multiple times.
1224
 * </pre>
1225
 */
1226
l_ok
1227
numaGetStatsUsingHistogram(NUMA       *na,
1228
                           l_int32     maxbins,
1229
                           l_float32  *pmin,
1230
                           l_float32  *pmax,
1231
                           l_float32  *pmean,
1232
                           l_float32  *pvariance,
1233
                           l_float32  *pmedian,
1234
                           l_float32   rank,
1235
                           l_float32  *prval,
1236
                           NUMA      **phisto)
1237
0
{
1238
0
l_int32    i, n;
1239
0
l_float32  minval, maxval, fval, mean, sum;
1240
0
NUMA      *nah;
1241
1242
0
    if (pmin) *pmin = 0.0;
1243
0
    if (pmax) *pmax = 0.0;
1244
0
    if (pmean) *pmean = 0.0;
1245
0
    if (pvariance) *pvariance = 0.0;
1246
0
    if (pmedian) *pmedian = 0.0;
1247
0
    if (prval) *prval = 0.0;
1248
0
    if (phisto) *phisto = NULL;
1249
0
    if (!na)
1250
0
        return ERROR_INT("na not defined", __func__, 1);
1251
0
    if ((n = numaGetCount(na)) == 0)
1252
0
        return ERROR_INT("numa is empty", __func__, 1);
1253
1254
0
    numaGetMin(na, &minval, NULL);
1255
0
    numaGetMax(na, &maxval, NULL);
1256
0
    if (pmin) *pmin = minval;
1257
0
    if (pmax) *pmax = maxval;
1258
0
    if (pmean || pvariance) {
1259
0
        sum = 0.0;
1260
0
        for (i = 0; i < n; i++) {
1261
0
            numaGetFValue(na, i, &fval);
1262
0
            sum += fval;
1263
0
        }
1264
0
        mean = sum / (l_float32)n;
1265
0
        if (pmean) *pmean = mean;
1266
0
    }
1267
0
    if (pvariance) {
1268
0
        sum = 0.0;
1269
0
        for (i = 0; i < n; i++) {
1270
0
            numaGetFValue(na, i, &fval);
1271
0
            sum += fval * fval;
1272
0
        }
1273
0
        *pvariance = sum / (l_float32)n - mean * mean;
1274
0
    }
1275
1276
0
    if (!pmedian && !prval && !phisto)
1277
0
        return 0;
1278
1279
0
    nah = numaMakeHistogramAuto(na, maxbins);
1280
0
    if (pmedian)
1281
0
        numaHistogramGetValFromRank(nah, 0.5, pmedian);
1282
0
    if (prval)
1283
0
        numaHistogramGetValFromRank(nah, rank, prval);
1284
0
    if (phisto)
1285
0
        *phisto = nah;
1286
0
    else
1287
0
        numaDestroy(&nah);
1288
0
    return 0;
1289
0
}
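/* Illustrative usage sketch (added; not part of numafunc2.c).  The median
 * and the rank-0.9 value are estimated from an internal histogram with at
 * most 1000 bins, so no sorting of the input data is required. */
#if 0
static void
ExampleStatsUsingHistogram(NUMA  *na)
{
l_float32  minval, maxval, mean, var, median, rval90;

    numaGetStatsUsingHistogram(na, 1000, &minval, &maxval, &mean, &var,
                               &median, 0.9f, &rval90, NULL);
    lept_stderr("median = %f, rank 0.9 value = %f\n", median, rval90);
}
#endif  /* usage sketch */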
1290
1291
1292
/*!
1293
 * \brief   numaGetHistogramStats()
1294
 *
1295
 * \param[in]    nahisto     histogram: y(x(i)), i = 0 ... nbins - 1
1296
 * \param[in]    startx      x value of first bin: x(0)
1297
 * \param[in]    deltax      x increment between bins; the bin size; x(1) - x(0)
1298
 * \param[out]   pxmean      [optional] mean value of histogram
1299
 * \param[out]   pxmedian    [optional] median value of histogram
1300
 * \param[out]   pxmode      [optional] mode value of histogram:
1301
 *                           xmode = x(imode), where y(xmode) >= y(x(i)) for
1302
 *                           all i != imode
1303
 * \param[out]   pxvariance  [optional] variance of x
1304
 * \return  0 if OK, 1 on error
1305
 *
1306
 * <pre>
1307
 * Notes:
1308
 *      (1) If the histogram represents the relation y(x), the
1309
 *          computed values that are returned are the x values.
1310
 *          These are NOT the bucket indices i; they are related to the
1311
 *          bucket indices by
1312
 *                x(i) = startx + i * deltax
1313
 * </pre>
1314
 */
1315
l_ok
1316
numaGetHistogramStats(NUMA       *nahisto,
1317
                      l_float32   startx,
1318
                      l_float32   deltax,
1319
                      l_float32  *pxmean,
1320
                      l_float32  *pxmedian,
1321
                      l_float32  *pxmode,
1322
                      l_float32  *pxvariance)
1323
0
{
1324
0
    if (pxmean) *pxmean = 0.0;
1325
0
    if (pxmedian) *pxmedian = 0.0;
1326
0
    if (pxmode) *pxmode = 0.0;
1327
0
    if (pxvariance) *pxvariance = 0.0;
1328
0
    if (!nahisto)
1329
0
        return ERROR_INT("nahisto not defined", __func__, 1);
1330
1331
0
    return numaGetHistogramStatsOnInterval(nahisto, startx, deltax, 0, -1,
1332
0
                                           pxmean, pxmedian, pxmode,
1333
0
                                           pxvariance);
1334
0
}
1335
1336
1337
/*!
1338
 * \brief   numaGetHistogramStatsOnInterval()
1339
 *
1340
 * \param[in]    nahisto    histogram: y(x(i)), i = 0 ... nbins - 1
1341
 * \param[in]    startx     x value of first bin: x(0)
1342
 * \param[in]    deltax     x increment between bins; the bin size; x(1) - x(0)
1343
 * \param[in]    ifirst     first bin to use for collecting stats
1344
 * \param[in]    ilast      last bin for collecting stats; -1 to go to the end
1345
 * \param[out]   pxmean     [optional] mean value of histogram
1346
 * \param[out]   pxmedian   [optional] median value of histogram
1347
 * \param[out]   pxmode     [optional] mode value of histogram:
1348
 *                          xmode = x(imode), where y(xmode) >= y(x(i)) for
1349
 *                          all i != imode
1350
 * \param[out]   pxvariance [optional] variance of x
1351
 * \return  0 if OK, 1 on error
1352
 *
1353
 * <pre>
1354
 * Notes:
1355
 *      (1) If the histogram represents the relation y(x), the
1356
 *          computed values that are returned are the x values.
1357
 *          These are NOT the bucket indices i; they are related to the
1358
 *          bucket indices by
1359
 *                x(i) = startx + i * deltax
1360
 * </pre>
1361
 */
1362
l_ok
1363
numaGetHistogramStatsOnInterval(NUMA       *nahisto,
1364
                                l_float32   startx,
1365
                                l_float32   deltax,
1366
                                l_int32     ifirst,
1367
                                l_int32     ilast,
1368
                                l_float32  *pxmean,
1369
                                l_float32  *pxmedian,
1370
                                l_float32  *pxmode,
1371
                                l_float32  *pxvariance)
1372
0
{
1373
0
l_int32    i, n, imax;
1374
0
l_float32  sum, sumval, halfsum, moment, var, x, y, ymax;
1375
1376
0
    if (pxmean) *pxmean = 0.0;
1377
0
    if (pxmedian) *pxmedian = 0.0;
1378
0
    if (pxmode) *pxmode = 0.0;
1379
0
    if (pxvariance) *pxvariance = 0.0;
1380
0
    if (!nahisto)
1381
0
        return ERROR_INT("nahisto not defined", __func__, 1);
1382
0
    if (!pxmean && !pxmedian && !pxmode && !pxvariance)
1383
0
        return ERROR_INT("nothing to compute", __func__, 1);
1384
1385
0
    n = numaGetCount(nahisto);
1386
0
    ifirst = L_MAX(0, ifirst);
1387
0
    if (ilast < 0) ilast = n - 1;
1388
0
    if (ifirst >= n)
1389
0
        return ERROR_INT("invalid ifirst", __func__, 1);
1390
0
    if (ilast >= n) {
1391
0
        L_WARNING("ilast = %d is beyond max index = %d; adjusting\n",
1392
0
                  __func__, ilast, n - 1);
1393
0
        ilast = n - 1;
1394
0
    }
1395
0
    if (ifirst > ilast)
1396
0
        return ERROR_INT("ifirst > ilast", __func__, 1);
1397
0
    for (sum = 0.0, moment = 0.0, var = 0.0, i = ifirst; i <= ilast ; i++) {
1398
0
        x = startx + i * deltax;
1399
0
        numaGetFValue(nahisto, i, &y);
1400
0
        sum += y;
1401
0
        moment += x * y;
1402
0
        var += x * x * y;
1403
0
    }
1404
0
    if (sum == 0.0) {
1405
0
        L_INFO("sum is 0\n", __func__);
1406
0
        return 0;
1407
0
    }
1408
1409
0
    if (pxmean)
1410
0
        *pxmean = moment / sum;
1411
0
    if (pxvariance)
1412
0
        *pxvariance = var / sum - moment * moment / (sum * sum);
1413
1414
0
    if (pxmedian) {
1415
0
        halfsum = sum / 2.0f;
1416
0
        for (sumval = 0.0, i = ifirst; i <= ilast; i++) {
1417
0
            numaGetFValue(nahisto, i, &y);
1418
0
            sumval += y;
1419
0
            if (sumval >= halfsum) {
1420
0
                *pxmedian = startx + i * deltax;
1421
0
                break;
1422
0
            }
1423
0
        }
1424
0
    }
1425
1426
0
    if (pxmode) {
1427
0
        imax = -1;
1428
0
        ymax = -1.0e10;
1429
0
        for (i = ifirst; i <= ilast; i++) {
1430
0
            numaGetFValue(nahisto, i, &y);
1431
0
            if (y > ymax) {
1432
0
                ymax = y;
1433
0
                imax = i;
1434
0
            }
1435
0
        }
1436
0
        *pxmode = startx + imax * deltax;
1437
0
    }
1438
1439
0
    return 0;
1440
0
}
1441
1442
1443
/*!
1444
 * \brief   numaMakeRankFromHistogram()
1445
 *
1446
 * \param[in]    startx   xval corresponding to first element in nay
1447
 * \param[in]    deltax   x increment between array elements in nay
1448
 * \param[in]    nasy     input histogram, assumed equally spaced
1449
 * \param[in]    npts     number of points to evaluate rank function
1450
 * \param[out]   pnax     [optional] array of x values in range
1451
 * \param[out]   pnay     rank array of specified npts
1452
 * \return  0 if OK, 1 on error
1453
 */
1454
l_ok
1455
numaMakeRankFromHistogram(l_float32  startx,
1456
                          l_float32  deltax,
1457
                          NUMA      *nasy,
1458
                          l_int32    npts,
1459
                          NUMA     **pnax,
1460
                          NUMA     **pnay)
1461
0
{
1462
0
l_int32    i, n;
1463
0
l_float32  sum, fval;
1464
0
NUMA      *nan, *nar;
1465
1466
0
    if (pnax) *pnax = NULL;
1467
0
    if (!pnay)
1468
0
        return ERROR_INT("&nay not defined", __func__, 1);
1469
0
    *pnay = NULL;
1470
0
    if (!nasy)
1471
0
        return ERROR_INT("nasy not defined", __func__, 1);
1472
0
    if ((n = numaGetCount(nasy)) == 0)
1473
0
        return ERROR_INT("no bins in nas", __func__, 1);
1474
1475
        /* Normalize and generate the rank array corresponding to
1476
         * the binned histogram. */
1477
0
    nan = numaNormalizeHistogram(nasy, 1.0);
1478
0
    nar = numaCreate(n + 1);  /* rank numa corresponding to nan */
1479
0
    sum = 0.0;
1480
0
    numaAddNumber(nar, sum);  /* first element is 0.0 */
1481
0
    for (i = 0; i < n; i++) {
1482
0
        numaGetFValue(nan, i, &fval);
1483
0
        sum += fval;
1484
0
        numaAddNumber(nar, sum);
1485
0
    }
1486
1487
        /* Compute rank array on full range with specified
1488
         * number of points and correspondence to x-values. */
1489
0
    numaInterpolateEqxInterval(startx, deltax, nar, L_LINEAR_INTERP,
1490
0
                               startx, startx + n * deltax, npts,
1491
0
                               pnax, pnay);
1492
0
    numaDestroy(&nan);
1493
0
    numaDestroy(&nar);
1494
0
    return 0;
1495
0
}
1496
1497
1498
/*!
1499
 * \brief   numaHistogramGetRankFromVal()
1500
 *
1501
 * \param[in]    na     histogram
1502
 * \param[in]    rval   value of input sample for which we want the rank
1503
 * \param[out]   prank  fraction of total samples below rval
1504
 * \return  0 if OK, 1 on error
1505
 *
1506
 * <pre>
1507
 * Notes:
1508
 *      (1) If we think of the histogram as a function y(x), normalized
1509
 *          to 1, for a given input value of x, this computes the
1510
 *          rank of x, which is the integral of y(x) from the start
1511
 *          value of x to the input value.
1512
 *      (2) This function only makes sense when applied to a Numa that
1513
 *          is a histogram.  The values in the histogram can be ints and
1514
 *          floats, and are computed as floats.  The rank is returned
1515
 *          as a float between 0.0 and 1.0.
1516
 *      (3) The numa parameters startx and binsize are used to
1517
 *          compute x from the Numa index i.
1518
 * </pre>
1519
 */
1520
l_ok
1521
numaHistogramGetRankFromVal(NUMA       *na,
1522
                            l_float32   rval,
1523
                            l_float32  *prank)
1524
0
{
1525
0
l_int32    i, ibinval, n;
1526
0
l_float32  startval, binsize, binval, maxval, fractval, total, sum, val;
1527
1528
0
    if (!prank)
1529
0
        return ERROR_INT("prank not defined", __func__, 1);
1530
0
    *prank = 0.0;
1531
0
    if (!na)
1532
0
        return ERROR_INT("na not defined", __func__, 1);
1533
0
    numaGetParameters(na, &startval, &binsize);
1534
0
    n = numaGetCount(na);
1535
0
    if (rval < startval)
1536
0
        return 0;
1537
0
    maxval = startval + n * binsize;
1538
0
    if (rval > maxval) {
1539
0
        *prank = 1.0;
1540
0
        return 0;
1541
0
    }
1542
1543
0
    binval = (rval - startval) / binsize;
1544
0
    ibinval = (l_int32)binval;
1545
0
    if (ibinval >= n) {
1546
0
        *prank = 1.0;
1547
0
        return 0;
1548
0
    }
1549
0
    fractval = binval - (l_float32)ibinval;
1550
1551
0
    sum = 0.0;
1552
0
    for (i = 0; i < ibinval; i++) {
1553
0
        numaGetFValue(na, i, &val);
1554
0
        sum += val;
1555
0
    }
1556
0
    numaGetFValue(na, ibinval, &val);
1557
0
    sum += fractval * val;
1558
0
    numaGetSum(na, &total);
1559
0
    *prank = sum / total;
1560
1561
/*    lept_stderr("binval = %7.3f, rank = %7.3f\n", binval, *prank); */
1562
1563
0
    return 0;
1564
0
}
1565
1566
1567
/*!
1568
 * \brief   numaHistogramGetValFromRank()
1569
 *
1570
 * \param[in]    na     histogram
1571
 * \param[in]    rank   fraction of total samples
1572
 * \param[out]   prval  approx. to the bin value
1573
 * \return  0 if OK, 1 on error
1574
 *
1575
 * <pre>
1576
 * Notes:
1577
 *      (1) If we think of the histogram as a function y(x), this returns
1578
 *          the value x such that the integral of y(x) from the start
1579
 *          value to x gives the fraction 'rank' of the integral
1580
 *          of y(x) over all bins.
1581
 *      (2) This function only makes sense when applied to a Numa that
1582
 *          is a histogram.  The values in the histogram can be ints and
1583
 *          floats, and are computed as floats.  The val is returned
1584
 *          as a float, even though the buckets are of integer width.
1585
 *      (3) The numa parameters startx and binsize are used to
1586
 *          compute x from the Numa index i.
1587
 * </pre>
1588
 */
1589
l_ok
1590
numaHistogramGetValFromRank(NUMA       *na,
1591
                            l_float32   rank,
1592
                            l_float32  *prval)
1593
0
{
1594
0
l_int32    i, n;
1595
0
l_float32  startval, binsize, rankcount, total, sum, fract, val;
1596
1597
0
    if (!prval)
1598
0
        return ERROR_INT("prval not defined", __func__, 1);
1599
0
    *prval = 0.0;
1600
0
    if (!na)
1601
0
        return ERROR_INT("na not defined", __func__, 1);
1602
0
    if (rank < 0.0) {
1603
0
        L_WARNING("rank < 0; setting to 0.0\n", __func__);
1604
0
        rank = 0.0;
1605
0
    }
1606
0
    if (rank > 1.0) {
1607
0
        L_WARNING("rank > 1.0; setting to 1.0\n", __func__);
1608
0
        rank = 1.0;
1609
0
    }
1610
1611
0
    n = numaGetCount(na);
1612
0
    numaGetParameters(na, &startval, &binsize);
1613
0
    numaGetSum(na, &total);
1614
0
    rankcount = rank * total;  /* count that corresponds to rank */
1615
0
    sum = 0.0;
1616
0
    val = 0.0;
1617
0
    for (i = 0; i < n; i++) {
1618
0
        numaGetFValue(na, i, &val);
1619
0
        if (sum + val >= rankcount)
1620
0
            break;
1621
0
        sum += val;
1622
0
    }
1623
0
    if (val <= 0.0)  /* can == 0 if rank == 0.0 */
1624
0
        fract = 0.0;
1625
0
    else  /* sum + fract * val = rankcount */
1626
0
        fract = (rankcount - sum) / val;
1627
1628
    /* The use of the fraction of a bin allows a simple calculation
1629
     * for the histogram value at the given rank. */
1630
0
    *prval = startval + binsize * ((l_float32)i + fract);
1631
1632
/*    lept_stderr("rank = %7.3f, val = %7.3f\n", rank, *prval); */
1633
1634
0
    return 0;
1635
0
}
1636
1637
1638
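Together with numaHistogramGetRankFromVal(), this gives a simple rank/value round trip. A hedged sketch, assuming the histogram %nahisto is supplied by the caller (e.g., from numaMakeHistogram()):

#include "allheaders.h"

/* Value at rank 0.5 (the median), then its rank, which should be ~0.5. */
static l_float32
histoMedianSketch(NUMA *nahisto)
{
    l_float32  median, rank;

    numaHistogramGetValFromRank(nahisto, 0.5, &median);
    numaHistogramGetRankFromVal(nahisto, median, &rank);
    return median;
}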
/*!
1639
 * \brief   numaDiscretizeSortedInBins()
1640
 *
1641
 * \param[in]    na          sorted
1642
 * \param[in]    nbins       number of equal population bins (> 1)
1643
 * \param[out]   pnabinval   average "gray" values in each bin
1644
 * \return  0 if OK, 1 on error
1645
 *
1646
 * <pre>
1647
 * Notes:
1648
 *      (1) The input %na is sorted in increasing value.
1649
 *      (2) The output array has the following mapping:
1650
 *             bin number  -->  average array value in bin (nabinval)
1651
 *      (3) With %nbins == 100, nabinval is the average gray value in
1652
 *          each of the 100 equally populated bins.  It is the function
1653
 *                gray[100 * rank].
1654
 *          Thus it is the inverse of
1655
 *                rank[gray]
1656
 *      (4) Contrast with numaDiscretizeHistoInBins(), where the input %na
1657
 *          is a histogram.
1658
 * </pre>
1659
 */
1660
l_ok
1661
numaDiscretizeSortedInBins(NUMA    *na,
1662
                           l_int32  nbins,
1663
                           NUMA   **pnabinval)
1664
0
{
1665
0
NUMA      *nabinval;  /* average gray value in the bins */
1666
0
NUMA      *naeach;
1667
0
l_int32    i, ntot, bincount, binindex, binsize;
1668
0
l_float32  sum, val, ave;
1669
1670
0
    if (!pnabinval)
1671
0
        return ERROR_INT("&nabinval not defined", __func__, 1);
1672
0
    *pnabinval = NULL;
1673
0
    if (!na)
1674
0
        return ERROR_INT("na not defined", __func__, 1);
1675
0
    if (nbins < 2)
1676
0
        return ERROR_INT("nbins must be > 1", __func__, 1);
1677
1678
        /* Get the number of items in each bin */
1679
0
    ntot = numaGetCount(na);
1680
0
    if ((naeach = numaGetUniformBinSizes(ntot, nbins)) == NULL)
1681
0
        return ERROR_INT("naeach not made", __func__, 1);
1682
1683
        /* Get the average value in each bin */
1684
0
    sum = 0.0;
1685
0
    bincount = 0;
1686
0
    binindex = 0;
1687
0
    numaGetIValue(naeach, 0, &binsize);
1688
0
    nabinval = numaCreate(nbins);
1689
0
    for (i = 0; i < ntot; i++) {
1690
0
        numaGetFValue(na, i, &val);
1691
0
        bincount++;
1692
0
        sum += val;
1693
0
        if (bincount == binsize) {  /* add bin entry */
1694
0
            ave = sum / binsize;
1695
0
            numaAddNumber(nabinval, ave);
1696
0
            sum = 0.0;
1697
0
            bincount = 0;
1698
0
            binindex++;
1699
0
            if (binindex == nbins) break;
1700
0
            numaGetIValue(naeach, binindex, &binsize);
1701
0
        }
1702
0
    }
1703
0
    *pnabinval = nabinval;
1704
1705
0
    numaDestroy(&naeach);
1706
0
    return 0;
1707
0
}
1708
1709
1710
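A hedged usage sketch: sort an arbitrary input array and get the mean value in each of 10 equally populated bins (the deciles). numaSort() is used here the same way as in numaGetRankBinValues() below.

#include "allheaders.h"

static NUMA *
decileAveragesSketch(NUMA *na)   /* na: unsorted input values */
{
    NUMA  *nasort, *nabinval = NULL;

    nasort = numaSort(NULL, na, L_SORT_INCREASING);
    numaDiscretizeSortedInBins(nasort, 10, &nabinval);
    numaDestroy(&nasort);
    return nabinval;   /* nabinval[k] = mean value of the k-th decile */
}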
/*!
1711
 * \brief   numaDiscretizeHistoInBins()
1712
 *
1713
 * \param[in]    na          histogram
1714
 * \param[in]    nbins       number of equal population bins (> 1)
1715
 * \param[out]   pnabinval   average "gray" values in each bin
1716
 * \param[out]   pnarank     [optional] rank value of input histogram;
1717
 *                           this is a cumulative norm histogram.
1718
 * \return  0 if OK, 1 on error
1719
 *
1720
 * <pre>
1721
 * Notes:
1722
 *      (1) With %nbins == 100, nabinval is the average gray value in
1723
 *          each of the 100 equally populated bins.  It is the function
1724
 *                gray[100 * rank].
1725
 *          Thus it is the inverse of
1726
 *                rank[gray]
1727
 *          which is optionally returned in narank.
1728
 *      (2) The "gray value" is the index into the input histogram.
1729
 *      (3) The two output arrays give the following mappings, where the
1730
 *          input is an un-normalized histogram of array values:
1731
 *             bin number  -->  average array value in bin (nabinval)
1732
 *             array values     -->  cumulative normalized histogram (narank)
1733
 * </pre>
1734
 */
1735
l_ok
1736
numaDiscretizeHistoInBins(NUMA    *na,
1737
                          l_int32  nbins,
1738
                          NUMA   **pnabinval,
1739
                          NUMA   **pnarank)
1740
0
{
1741
0
NUMA      *nabinval;  /* average gray value in the bins */
1742
0
NUMA      *naeach, *nan;
1743
0
l_int32    i, j, nxvals, occup, count, bincount, binindex, binsize;
1744
0
l_float32  sum, ave, ntot;
1745
1746
0
    if (pnarank) *pnarank = NULL;
1747
0
    if (!pnabinval)
1748
0
        return ERROR_INT("&nabinval not defined", __func__, 1);
1749
0
    *pnabinval = NULL;
1750
0
    if (!na)
1751
0
        return ERROR_INT("na not defined", __func__, 1);
1752
0
    if (nbins < 2)
1753
0
        return ERROR_INT("nbins must be > 1", __func__, 1);
1754
1755
0
    nxvals = numaGetCount(na);
1756
0
    numaGetSum(na, &ntot);
1757
0
    occup = ntot / nxvals;
1758
0
    if (occup < 1) L_INFO("average occupancy %d < 1\n", __func__, occup);
1759
1760
        /* Get the number of items in each bin */
1761
0
    if ((naeach = numaGetUniformBinSizes(ntot, nbins)) == NULL)
1762
0
        return ERROR_INT("naeach not made", __func__, 1);
1763
1764
        /* Get the average value in each bin */
1765
0
    sum = 0.0;
1766
0
    bincount = 0;
1767
0
    binindex = 0;
1768
0
    numaGetIValue(naeach, 0, &binsize);
1769
0
    nabinval = numaCreate(nbins);
1770
0
    for (i = 0; i < nxvals; i++) {
1771
0
        numaGetIValue(na, i, &count);
1772
0
        for (j = 0; j < count; j++) {
1773
0
            bincount++;
1774
0
            sum += i;
1775
0
            if (bincount == binsize) {  /* add bin entry */
1776
0
                ave = sum / binsize;
1777
0
                numaAddNumber(nabinval, ave);
1778
0
                sum = 0.0;
1779
0
                bincount = 0;
1780
0
                binindex++;
1781
0
                if (binindex == nbins) break;
1782
0
                numaGetIValue(naeach, binindex, &binsize);
1783
0
            }
1784
0
        }
1785
0
        if (binindex == nbins) break;
1786
0
    }
1787
0
    *pnabinval = nabinval;
1788
0
    if (binindex != nbins)
1789
0
        L_ERROR("binindex = %d != nbins = %d\n", __func__, binindex, nbins);
1790
1791
        /* Get cumulative normalized histogram (rank[gray value]).
1792
         * This is the partial sum operating on the normalized histogram. */
1793
0
    if (pnarank) {
1794
0
        nan = numaNormalizeHistogram(na, 1.0);
1795
0
        *pnarank = numaGetPartialSums(nan);
1796
0
        numaDestroy(&nan);
1797
0
    }
1798
0
    numaDestroy(&naeach);
1799
0
    return 0;
1800
0
}
1801
1802
1803
/*!
1804
 * \brief   numaGetRankBinValues()
1805
 *
1806
 * \param[in]    na       an array of values
1807
 * \param[in]    nbins    number of bins at which the rank is divided
1808
 * \param[out]   pnam     mean intensity in a bin vs rank bin value,
1809
 *                        with %nbins of discretized rank values
1810
 * \return  0 if OK, 1 on error
1811
 *
1812
 * <pre>
1813
 * Notes:
1814
 *      (1) Simple interface for getting a binned rank representation
1815
 *          of an input array of values.  This returns:
1816
 *             rank bin number -->  average array value in each rank bin (nam)
1817
 *      (2) Uses bins either a sorted array or a histogram, depending on
1818
 *          the values in the array and the size of the array.
1819
 * </pre>
1820
 */
1821
l_ok
1822
numaGetRankBinValues(NUMA    *na,
1823
                     l_int32  nbins,
1824
                     NUMA   **pnam)
1825
0
{
1826
0
NUMA      *na1;
1827
0
l_int32    maxbins, type;
1828
0
l_float32  maxval, delx;
1829
1830
0
    if (!pnam)
1831
0
        return ERROR_INT("&pnam not defined", __func__, 1);
1832
0
    *pnam = NULL;
1833
0
    if (!na)
1834
0
        return ERROR_INT("na not defined", __func__, 1);
1835
0
    if (numaGetCount(na) == 0)
1836
0
        return ERROR_INT("na is empty", __func__, 1);
1837
0
    if (nbins < 2)
1838
0
        return ERROR_INT("nbins must be > 1", __func__, 1);
1839
1840
        /* Choose between sorted array and a histogram.
1841
         * If the input array has a small number of values with
1842
         * a large maximum, we will sort it.  At the other extreme, if
1843
         * the array has many numbers with a small maximum, such as the
1844
         * values of pixels in an 8 bpp grayscale image, generate a histogram.
1845
         * If type comes back as L_BIN_SORT, use a histogram. */
1846
0
    type = numaChooseSortType(na);
1847
0
    if (type == L_SHELL_SORT) {  /* sort the array */
1848
0
        L_INFO("sort the array: input size = %d\n", __func__, numaGetCount(na));
1849
0
        na1 = numaSort(NULL, na, L_SORT_INCREASING);
1850
0
        numaDiscretizeSortedInBins(na1, nbins, pnam);
1851
0
        numaDestroy(&na1);
1852
0
        return 0;
1853
0
    }
1854
1855
        /* Make the histogram.  Assuming there are no negative values
1856
         * in the array, if the max value in the array does not exceed
1857
         * about 100000, the bin size for generating the histogram will
1858
         * be 1; maxbins refers to the number of entries in the histogram. */
1859
0
    L_INFO("use a histogram: input size = %d\n", __func__, numaGetCount(na));
1860
0
    numaGetMax(na, &maxval, NULL);
1861
0
    maxbins = L_MIN(100002, (l_int32)maxval + 2);
1862
0
    na1 = numaMakeHistogram(na, maxbins, NULL, NULL);
1863
1864
        /* Warn if there is a scale change.  This shouldn't happen
1865
         * unless the max value is above 100000.  */
1866
0
    numaGetParameters(na1, NULL, &delx);
1867
0
    if (delx > 1.0)
1868
0
        L_WARNING("scale change: delx = %6.2f\n", __func__, delx);
1869
1870
        /* Rank bin the results */
1871
0
    numaDiscretizeHistoInBins(na1, nbins, pnam, NULL);
1872
0
    numaDestroy(&na1);
1873
0
    return 0;
1874
0
}
1875
1876
1877
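A hedged usage sketch of numaGetRankBinValues() with synthetic data: split the values into 4 rank bins (quartiles) and print the mean of each.

#include "allheaders.h"

int main(void)
{
    NUMA      *na, *nam;
    l_int32    i;
    l_float32  val;

    na = numaCreate(0);
    for (i = 0; i < 500; i++)                     /* synthetic values */
        numaAddNumber(na, (l_float32)((i * i) % 997));
    numaGetRankBinValues(na, 4, &nam);
    for (i = 0; i < 4; i++) {
        numaGetFValue(nam, i, &val);
        lept_stderr("quartile %d: mean value = %7.2f\n", i, val);
    }
    numaDestroy(&na);
    numaDestroy(&nam);
    return 0;
}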
/*!
1878
 * \brief   numaGetUniformBinSizes()
1879
 *
1880
 * \param[in]    ntotal   number of values to be split up
1881
 * \param[in]    nbins    number of bins
1882
 * \return  naeach   number of values to go in each bin, or NULL on error
1883
 *
1884
 * <pre>
1885
 * Notes:
1886
 *      (1) The numbers in the bins can differ by 1.  The sum of
1887
 *          bin numbers in %naeach is %ntotal.
1888
 * </pre>
1889
 */
1890
NUMA *
1891
numaGetUniformBinSizes(l_int32  ntotal,
1892
                       l_int32  nbins)
1893
0
{
1894
0
l_int32  i, start, end;
1895
0
NUMA    *naeach;
1896
1897
0
    if (ntotal <= 0)
1898
0
        return (NUMA *)ERROR_PTR("ntotal <= 0", __func__, NULL);
1899
0
    if (nbins <= 0)
1900
0
        return (NUMA *)ERROR_PTR("nbins <= 0", __func__, NULL);
1901
1902
0
    if ((naeach = numaCreate(nbins)) == NULL)
1903
0
        return (NUMA *)ERROR_PTR("naeach not made", __func__, NULL);
1904
1905
0
    if (ntotal < nbins) {  /* put 1 in each of %ntotal bins */
1906
0
        for (i = 0; i < ntotal; i++)
1907
0
            numaAddNumber(naeach, 1);
1908
0
        return naeach;
1909
0
    }
1910
1911
0
    start = 0;
1912
0
    for (i = 0; i < nbins; i++) {
1913
0
        end = ntotal * (i + 1) / nbins;
1914
0
        numaAddNumber(naeach, end - start);
1915
0
        start = end;
1916
0
    }
1917
0
    return naeach;
1918
0
}
1919
1920
1921
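For example, ntotal = 10 and nbins = 3 gives bin sizes 3, 3, 4: the running ends are 10*1/3 = 3, 10*2/3 = 6 and 10*3/3 = 10. A standalone sketch of just that splitting rule:

#include <stdio.h>

int main(void)
{
    int  ntotal = 10, nbins = 3, i, start, end;

    start = 0;
    for (i = 0; i < nbins; i++) {
        end = ntotal * (i + 1) / nbins;
        printf("bin %d: %d items\n", i, end - start);   /* 3, 3, 4 */
        start = end;
    }
    return 0;
}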
/*----------------------------------------------------------------------*
1922
 *                      Splitting a distribution                        *
1923
 *----------------------------------------------------------------------*/
1924
/*!
1925
 * \brief   numaSplitDistribution()
1926
 *
1927
 * \param[in]    na           histogram
1928
 * \param[in]    scorefract   fraction of the max score, used to determine
1929
 *                            range over which the histogram min is searched
1930
 * \param[out]   psplitindex  [optional] index for splitting
1931
 * \param[out]   pave1        [optional] average of lower distribution
1932
 * \param[out]   pave2        [optional] average of upper distribution
1933
 * \param[out]   pnum1        [optional] population of lower distribution
1934
 * \param[out]   pnum2        [optional] population of upper distribution
1935
 * \param[out]   pnascore     [optional] for debugging; otherwise use NULL
1936
 * \return  0 if OK, 1 on error
1937
 *
1938
 * <pre>
1939
 * Notes:
1940
 *      (1) This function is intended to be used on a distribution of
1941
 *          values that represent two sets, such as a histogram of
1942
 *          pixel values for an image with a fg and bg, and the goal
1943
 *          is to determine the averages of the two sets and the
1944
 *          best splitting point.
1945
 *      (2) The Otsu method finds a split point that divides the distribution
1946
 *          into two parts by maximizing a score function that is the
1947
 *          product of two terms:
1948
 *            (a) the square of the difference of centroids, (ave1 - ave2)^2
1949
 *            (b) fract1 * (1 - fract1)
1950
 *          where fract1 is the fraction in the lower distribution.
1951
 *      (3) This works well for images where the fg and bg are
1952
 *          each relatively homogeneous and well-separated in color.
1953
 *          However, if the actual fg and bg sets are very different
1954
 *          in size, and the bg is highly varied, as can occur in some
1955
 *          scanned document images, this will bias the split point
1956
 *          into the larger "bump" (i.e., toward the point where the
1957
 *          (b) term reaches its maximum of 0.25 at fract1 = 0.5).
1958
 *          To avoid this, we define a range of values near the
1959
 *          maximum of the score function, and choose the value within
1960
 *          this range such that the histogram itself has a minimum value.
1961
 *          The range is determined by scorefract: we include all abscissa
1962
 *          values to the left and right of the value that maximizes the
1963
 *          score, such that the score stays above (1 - scorefract) * maxscore.
1964
 *          The intuition behind this modification is to try to find
1965
 *          a split point that both has a high variance score and is
1966
 *          at or near a minimum in the histogram, so that the histogram
1967
 *          slope is small at the split point.
1968
 *      (4) We normalize the score so that if the two distributions
1969
 *          were of equal size and at opposite ends of the numa, the
1970
 *          score would be 1.0.
1971
 * </pre>
1972
 */
1973
l_ok
1974
numaSplitDistribution(NUMA       *na,
1975
                      l_float32   scorefract,
1976
                      l_int32    *psplitindex,
1977
                      l_float32  *pave1,
1978
                      l_float32  *pave2,
1979
                      l_float32  *pnum1,
1980
                      l_float32  *pnum2,
1981
                      NUMA      **pnascore)
1982
0
{
1983
0
l_int32    i, n, bestsplit, minrange, maxrange, maxindex;
1984
0
l_float32  ave1, ave2, ave1prev, ave2prev;
1985
0
l_float32  num1, num2, num1prev, num2prev;
1986
0
l_float32  val, minval, sum, fract1;
1987
0
l_float32  norm, score, minscore, maxscore;
1988
0
NUMA      *nascore, *naave1, *naave2, *nanum1, *nanum2;
1989
1990
0
    if (psplitindex) *psplitindex = 0;
1991
0
    if (pave1) *pave1 = 0.0;
1992
0
    if (pave2) *pave2 = 0.0;
1993
0
    if (pnum1) *pnum1 = 0.0;
1994
0
    if (pnum2) *pnum2 = 0.0;
1995
0
    if (pnascore) *pnascore = NULL;
1996
0
    if (!na)
1997
0
        return ERROR_INT("na not defined", __func__, 1);
1998
1999
0
    n = numaGetCount(na);
2000
0
    if (n <= 1)
2001
0
        return ERROR_INT("n = 1 in histogram", __func__, 1);
2002
0
    numaGetSum(na, &sum);
2003
0
    if (sum <= 0.0)
2004
0
        return ERROR_INT("sum <= 0.0", __func__, 1);
2005
0
    norm = 4.0f / ((l_float32)(n - 1) * (n - 1));
2006
0
    ave1prev = 0.0;
2007
0
    numaGetHistogramStats(na, 0.0, 1.0, &ave2prev, NULL, NULL, NULL);
2008
0
    num1prev = 0.0;
2009
0
    num2prev = sum;
2010
0
    maxindex = n / 2;  /* initialize with something */
2011
2012
        /* Split the histogram with [0 ... i] in the lower part
2013
         * and [i+1 ... n-1] in upper part.  First, compute an otsu
2014
         * score for each possible splitting.  */
2015
0
    if ((nascore = numaCreate(n)) == NULL)
2016
0
        return ERROR_INT("nascore not made", __func__, 1);
2017
0
    naave1 = (pave1) ? numaCreate(n) : NULL;
2018
0
    naave2 = (pave2) ? numaCreate(n) : NULL;
2019
0
    nanum1 = (pnum1) ? numaCreate(n) : NULL;
2020
0
    nanum2 = (pnum2) ? numaCreate(n) : NULL;
2021
0
    maxscore = 0.0;
2022
0
    for (i = 0; i < n; i++) {
2023
0
        numaGetFValue(na, i, &val);
2024
0
        num1 = num1prev + val;
2025
0
        if (num1 == 0)
2026
0
            ave1 = ave1prev;
2027
0
        else
2028
0
            ave1 = (num1prev * ave1prev + i * val) / num1;
2029
0
        num2 = num2prev - val;
2030
0
        if (num2 == 0)
2031
0
            ave2 = ave2prev;
2032
0
        else
2033
0
            ave2 = (num2prev * ave2prev - i * val) / num2;
2034
0
        fract1 = num1 / sum;
2035
0
        score = norm * (fract1 * (1 - fract1)) * (ave2 - ave1) * (ave2 - ave1);
2036
0
        numaAddNumber(nascore, score);
2037
0
        if (pave1) numaAddNumber(naave1, ave1);
2038
0
        if (pave2) numaAddNumber(naave2, ave2);
2039
0
        if (pnum1) numaAddNumber(nanum1, num1);
2040
0
        if (pnum2) numaAddNumber(nanum2, num2);
2041
0
        if (score > maxscore) {
2042
0
            maxscore = score;
2043
0
            maxindex = i;
2044
0
        }
2045
0
        num1prev = num1;
2046
0
        num2prev = num2;
2047
0
        ave1prev = ave1;
2048
0
        ave2prev = ave2;
2049
0
    }
2050
2051
        /* Next, for all contiguous scores within a specified fraction
2052
         * of the max, choose the split point as the value with the
2053
         * minimum in the histogram. */
2054
0
    minscore = (1.f - scorefract) * maxscore;
2055
0
    for (i = maxindex - 1; i >= 0; i--) {
2056
0
        numaGetFValue(nascore, i, &val);
2057
0
        if (val < minscore)
2058
0
            break;
2059
0
    }
2060
0
    minrange = i + 1;
2061
0
    for (i = maxindex + 1; i < n; i++) {
2062
0
        numaGetFValue(nascore, i, &val);
2063
0
        if (val < minscore)
2064
0
            break;
2065
0
    }
2066
0
    maxrange = i - 1;
2067
0
    numaGetFValue(na, minrange, &minval);
2068
0
    bestsplit = minrange;
2069
0
    for (i = minrange + 1; i <= maxrange; i++) {
2070
0
        numaGetFValue(na, i, &val);
2071
0
        if (val < minval) {
2072
0
            minval = val;
2073
0
            bestsplit = i;
2074
0
        }
2075
0
    }
2076
2077
        /* Add one to the bestsplit value to get the threshold value,
2078
         * because when we take a threshold, as in pixThresholdToBinary(),
2079
         * we always choose the set with values below the threshold. */
2080
0
    bestsplit = L_MIN(255, bestsplit + 1);
2081
2082
0
    if (psplitindex) *psplitindex = bestsplit;
2083
0
    if (pave1) numaGetFValue(naave1, bestsplit, pave1);
2084
0
    if (pave2) numaGetFValue(naave2, bestsplit, pave2);
2085
0
    if (pnum1) numaGetFValue(nanum1, bestsplit, pnum1);
2086
0
    if (pnum2) numaGetFValue(nanum2, bestsplit, pnum2);
2087
2088
0
    if (pnascore) {  /* debug mode */
2089
0
        lept_stderr("minrange = %d, maxrange = %d\n", minrange, maxrange);
2090
0
        lept_stderr("minval = %10.0f\n", minval);
2091
0
        gplotSimple1(nascore, GPLOT_PNG, "/tmp/lept/nascore",
2092
0
                     "Score for split distribution");
2093
0
        *pnascore = nascore;
2094
0
    } else {
2095
0
        numaDestroy(&nascore);
2096
0
    }
2097
2098
0
    if (pave1) numaDestroy(&naave1);
2099
0
    if (pave2) numaDestroy(&naave2);
2100
0
    if (pnum1) numaDestroy(&nanum1);
2101
0
    if (pnum2) numaDestroy(&nanum2);
2102
0
    return 0;
2103
0
}
2104
2105
2106
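The score of note (2), with the normalization of note (4), can be written directly for a candidate split at index i. A brute-force standalone sketch of that score (the library code above computes the same quantities incrementally):

#include <stdio.h>

/* Normalized Otsu-style score for splitting histogram y[0..n-1]
 * into [0..i] and [i+1..n-1]. */
static float
splitScoreSketch(const float *y, int n, int i)
{
    int    j;
    float  num1, num2, ave1, ave2, fract1, norm;

    num1 = num2 = ave1 = ave2 = 0.0f;
    for (j = 0; j <= i; j++)     { num1 += y[j]; ave1 += j * y[j]; }
    for (j = i + 1; j < n; j++)  { num2 += y[j]; ave2 += j * y[j]; }
    if (num1 + num2 <= 0.0f) return 0.0f;
    if (num1 > 0.0f) ave1 /= num1;
    if (num2 > 0.0f) ave2 /= num2;
    fract1 = num1 / (num1 + num2);
    norm = 4.0f / ((float)(n - 1) * (n - 1));
    return norm * fract1 * (1.0f - fract1) * (ave2 - ave1) * (ave2 - ave1);
}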
/*----------------------------------------------------------------------*
2107
 *                         Comparing histograms                         *
2108
 *----------------------------------------------------------------------*/
2109
/*!
2110
 * \brief   grayHistogramsToEMD()
2111
 *
2112
 * \param[in]    naa1, naa2    two numaa, each with one or more 256-element
2113
 *                             histograms
2114
 * \param[out]   pnad          nad of EM distances for each histogram
2115
 * \return  0 if OK, 1 on error
2116
 *
2117
 * <pre>
2118
 * Notes:
2119
 *     (1) The two numaas must be the same size and have corresponding
2120
 *         256-element histograms.  Pairs do not need to be normalized
2121
 *         to the same sum.
2122
 *     (2) This is typically used on two sets of histograms from
2123
 *         corresponding tiles of two images.  The similarity of two
2124
 *         images can be found with the scoring function used in
2125
 *         pixCompareGrayByHisto():
2126
 *             score S = 1.0 - k * D, where
2127
 *                 k is a constant, say in the range 5-10
2128
 *                 D = EMD
2129
 *             for each tile; for multiple tiles, take the Min(S) over
2130
 *             the set of tiles to be the final score.
2131
 * </pre>
2132
 */
2133
l_ok
2134
grayHistogramsToEMD(NUMAA  *naa1,
2135
                    NUMAA  *naa2,
2136
                    NUMA  **pnad)
2137
0
{
2138
0
l_int32     i, n, nt;
2139
0
l_float32   dist;
2140
0
NUMA       *na1, *na2, *nad;
2141
2142
0
    if (!pnad)
2143
0
        return ERROR_INT("&nad not defined", __func__, 1);
2144
0
    *pnad = NULL;
2145
0
    if (!naa1 || !naa2)
2146
0
        return ERROR_INT("na1 and na2 not both defined", __func__, 1);
2147
0
    n = numaaGetCount(naa1);
2148
0
    if (n != numaaGetCount(naa2))
2149
0
        return ERROR_INT("naa1 and naa2 numa counts differ", __func__, 1);
2150
0
    nt = numaaGetNumberCount(naa1);
2151
0
    if (nt != numaaGetNumberCount(naa2))
2152
0
        return ERROR_INT("naa1 and naa2 number counts differ", __func__, 1);
2153
0
    if (256 * n != nt)  /* good enough check */
2154
0
        return ERROR_INT("na sizes must be 256", __func__, 1);
2155
2156
0
    nad = numaCreate(n);
2157
0
    *pnad = nad;
2158
0
    for (i = 0; i < n; i++) {
2159
0
        na1 = numaaGetNuma(naa1, i, L_CLONE);
2160
0
        na2 = numaaGetNuma(naa2, i, L_CLONE);
2161
0
        numaEarthMoverDistance(na1, na2, &dist);
2162
0
        numaAddNumber(nad, dist / 255.f);  /* normalize to [0.0 - 1.0] */
2163
0
        numaDestroy(&na1);
2164
0
        numaDestroy(&na2);
2165
0
    }
2166
0
    return 0;
2167
0
}
2168
2169
2170
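A hedged sketch of the scoring in note (2): convert the per-tile EM distances into similarity scores S = 1 - k*D and keep the minimum over tiles. The constant k = 7.0 is just one arbitrary choice from the suggested 5-10 range.

#include "allheaders.h"

static l_float32
minTileScoreSketch(NUMAA *naa1, NUMAA *naa2)
{
    NUMA      *nad;
    l_int32    i, n;
    l_float32  d, score, minscore;

    if (grayHistogramsToEMD(naa1, naa2, &nad))
        return 0.0;
    minscore = 1.0;
    n = numaGetCount(nad);
    for (i = 0; i < n; i++) {
        numaGetFValue(nad, i, &d);
        score = 1.0f - 7.0f * d;              /* k = 7.0, assumed */
        if (score < minscore) minscore = score;
    }
    numaDestroy(&nad);
    return minscore;
}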
/*!
2171
 * \brief   numaEarthMoverDistance()
2172
 *
2173
 * \param[in]    na1, na2    two numas of the same size, typically histograms
2174
 * \param[out]   pdist       earthmover distance
2175
 * \return  0 if OK, 1 on error
2176
 *
2177
 * <pre>
2178
 * Notes:
2179
 *     (1) The two numas must have the same size.  They do not need to be
2180
 *         normalized to the same sum before applying the function.
2181
 *     (2) For a 1D discrete function, the implementation of the EMD
2182
 *         is trivial.  Just keep filling or emptying buckets in one numa
2183
 *         to match the amount in the other, moving sequentially along
2184
 *         both arrays.
2185
 *     (3) We divide the sum of the absolute value of everything moved
2186
 *         (by 1 unit at a time) by the sum of the numa (amount of "earth")
2187
 *         to get the average distance that the "earth" was moved.
2188
 *         This is the value returned here.
2189
 *     (4) The caller can do a further normalization, by the number of
2190
 *         buckets (minus 1), to get the EM distance as a fraction of
2191
 *         the maximum possible distance, which is n-1.  This fraction
2192
 *         is 1.0 for the situation where all the 'earth' in the first
2193
 *         array is at one end, and all in the second array is at the
2194
 *         other end.
2195
 * </pre>
2196
 */
2197
l_ok
2198
numaEarthMoverDistance(NUMA       *na1,
2199
                       NUMA       *na2,
2200
                       l_float32  *pdist)
2201
0
{
2202
0
l_int32     n, norm, i;
2203
0
l_float32   sum1, sum2, diff, total;
2204
0
l_float32  *array1, *array3;
2205
0
NUMA       *na3;
2206
2207
0
    if (!pdist)
2208
0
        return ERROR_INT("&dist not defined", __func__, 1);
2209
0
    *pdist = 0.0;
2210
0
    if (!na1 || !na2)
2211
0
        return ERROR_INT("na1 and na2 not both defined", __func__, 1);
2212
0
    n = numaGetCount(na1);
2213
0
    if (n != numaGetCount(na2))
2214
0
        return ERROR_INT("na1 and na2 have different size", __func__, 1);
2215
2216
        /* Generate na3; normalize to na1 if necessary */
2217
0
    numaGetSum(na1, &sum1);
2218
0
    numaGetSum(na2, &sum2);
2219
0
    norm = (L_ABS(sum1 - sum2) < 0.00001 * L_ABS(sum1)) ? 1 : 0;
2220
0
    if (!norm)
2221
0
        na3 = numaTransform(na2, 0, sum1 / sum2);
2222
0
    else
2223
0
        na3 = numaCopy(na2);
2224
0
    array1 = numaGetFArray(na1, L_NOCOPY);
2225
0
    array3 = numaGetFArray(na3, L_NOCOPY);
2226
2227
        /* Move earth in n3 from array elements, to match n1 */
2228
0
    total = 0;
2229
0
    for (i = 1; i < n; i++) {
2230
0
        diff = array1[i - 1] - array3[i - 1];
2231
0
        array3[i] -= diff;
2232
0
        total += L_ABS(diff);
2233
0
    }
2234
0
    *pdist = total / sum1;
2235
2236
0
    numaDestroy(&na3);
2237
0
    return 0;
2238
0
}
2239
2240
2241
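A hedged worked example of note (4): all the "earth" at one end of a 3-bin numa versus the other end. The raw EMD is 2.0 buckets; dividing by n - 1 = 2 gives the maximum possible fraction, 1.0.

#include "allheaders.h"

int main(void)
{
    NUMA      *na1, *na2;
    l_float32  dist;

    na1 = numaCreate(3);
    na2 = numaCreate(3);
    numaAddNumber(na1, 1.0); numaAddNumber(na1, 0.0); numaAddNumber(na1, 0.0);
    numaAddNumber(na2, 0.0); numaAddNumber(na2, 0.0); numaAddNumber(na2, 1.0);
    numaEarthMoverDistance(na1, na2, &dist);
    lept_stderr("EMD = %4.1f buckets; fraction of max = %4.1f\n",
                dist, dist / 2.0);
    numaDestroy(&na1);
    numaDestroy(&na2);
    return 0;
}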
/*!
2242
 * \brief   grayInterHistogramStats()
2243
 *
2244
 * \param[in]    naa      numaa with two or more 256-element histograms
2245
 * \param[in]    wc       half-width of the smoothing window
2246
 * \param[out]   pnam     [optional] mean values
2247
 * \param[out]   pnams    [optional] mean square values
2248
 * \param[out]   pnav     [optional] variances
2249
 * \param[out]   pnarv    [optional] rms deviations from the mean
2250
 * \return  0 if OK, 1 on error
2251
 *
2252
 * <pre>
2253
 * Notes:
2254
 *     (1) The %naa has two or more 256-element numa histograms, which
2255
 *         are to be compared value-wise at each of the 256 gray levels.
2256
 *         The results are stats (mean, mean square, variance, root variance)
2257
 *         aggregated across the set of histograms, and each is output
2258
 *         as a 256 entry numa.  Think of these histograms as a matrix,
2259
 *         where each histogram is one row of the array.  The stats are
2260
 *         then aggregated column-wise, between the histograms.
2261
 *     (2) These stats are:
2262
 *            ~ average value: <v>  (nam)
2263
 *            ~ average squared value: <v*v> (nams)
2264
 *            ~ variance: <(v - <v>)*(v - <v>)> = <v*v> - <v>*<v>  (nav)
2265
 *            ~ square-root of variance: (narv)
2266
 *         where the brackets < .. > indicate that the average value is
2267
 *         to be taken over each column of the array.
2268
 *     (3) The input histograms are optionally smoothed before these
2269
 *         statistical operations.
2270
 *     (4) The input histograms are normalized to a sum of 10000.  By
2271
 *         doing this, the resulting numbers are independent of the
2272
 *         number of samples used in building the individual histograms.
2273
 *     (5) A typical application is on a set of histograms from tiles
2274
 *         of an image, to distinguish between text/tables and photo
2275
 *         regions.  If the tiles are much larger than the text line
2276
 *         spacing, text/table regions typically have smaller variance
2277
 *         across tiles than photo regions.  For this application, it
2278
 *         may be useful to ignore values near white, which are large for
2279
 *         text and would magnify the variance due to variations in
2280
 *         illumination.  However, because the variance of a drawing or
2281
 *         a light photo can be similar to that of grayscale text, this
2282
 *         function is only a discriminator between darker photos/drawings
2283
 *         and light photos/text/line-graphics.
2284
 * </pre>
2285
 */
2286
l_ok
2287
grayInterHistogramStats(NUMAA   *naa,
2288
                        l_int32  wc,
2289
                        NUMA   **pnam,
2290
                        NUMA   **pnams,
2291
                        NUMA   **pnav,
2292
                        NUMA   **pnarv)
2293
0
{
2294
0
l_int32      i, j, n, nn;
2295
0
l_float32  **arrays;
2296
0
l_float32    mean, var, rvar;
2297
0
NUMA        *na1, *na2, *na3, *na4;
2298
2299
0
    if (pnam) *pnam = NULL;
2300
0
    if (pnams) *pnams = NULL;
2301
0
    if (pnav) *pnav = NULL;
2302
0
    if (pnarv) *pnarv = NULL;
2303
0
    if (!pnam && !pnams && !pnav && !pnarv)
2304
0
        return ERROR_INT("nothing requested", __func__, 1);
2305
0
    if (!naa)
2306
0
        return ERROR_INT("naa not defined", __func__, 1);
2307
0
    n = numaaGetCount(naa);
2308
0
    for (i = 0; i < n; i++) {
2309
0
        nn = numaaGetNumaCount(naa, i);
2310
0
        if (nn != 256) {
2311
0
            L_ERROR("%d numbers in numa[%d]\n", __func__, nn, i);
2312
0
            return 1;
2313
0
        }
2314
0
    }
2315
2316
0
    if (pnam) *pnam = numaCreate(256);
2317
0
    if (pnams) *pnams = numaCreate(256);
2318
0
    if (pnav) *pnav = numaCreate(256);
2319
0
    if (pnarv) *pnarv = numaCreate(256);
2320
2321
        /* First, use mean smoothing, normalize each histogram,
2322
         * and save all results in a 2D matrix. */
2323
0
    arrays = (l_float32 **)LEPT_CALLOC(n, sizeof(l_float32 *));
2324
0
    for (i = 0; i < n; i++) {
2325
0
        na1 = numaaGetNuma(naa, i, L_CLONE);
2326
0
        na2 = numaWindowedMean(na1, wc);
2327
0
        na3 = numaNormalizeHistogram(na2, 10000.);
2328
0
        arrays[i] = numaGetFArray(na3, L_COPY);
2329
0
        numaDestroy(&na1);
2330
0
        numaDestroy(&na2);
2331
0
        numaDestroy(&na3);
2332
0
    }
2333
2334
        /* Get stats between histograms */
2335
0
    for (j = 0; j < 256; j++) {
2336
0
        na4 = numaCreate(n);
2337
0
        for (i = 0; i < n; i++) {
2338
0
            numaAddNumber(na4, arrays[i][j]);
2339
0
        }
2340
0
        numaSimpleStats(na4, 0, -1, &mean, &var, &rvar);
2341
0
        if (pnam) numaAddNumber(*pnam, mean);
2342
0
        if (pnams) numaAddNumber(*pnams, mean * mean);
2343
0
        if (pnav) numaAddNumber(*pnav, var);
2344
0
        if (pnarv) numaAddNumber(*pnarv, rvar);
2345
0
        numaDestroy(&na4);
2346
0
    }
2347
2348
0
    for (i = 0; i < n; i++)
2349
0
        LEPT_FREE(arrays[i]);
2350
0
    LEPT_FREE(arrays);
2351
0
    return 0;
2352
0
}
2353
2354
2355
/*----------------------------------------------------------------------*
2356
 *                             Extrema finding                          *
2357
 *----------------------------------------------------------------------*/
2358
/*!
2359
 * \brief   numaFindPeaks()
2360
 *
2361
 * \param[in]    nas      source numa
2362
 * \param[in]    nmax     max number of peaks to be found
2363
 * \param[in]    fract1   min fraction of peak value
2364
 * \param[in]    fract2   min slope
2365
 * \return  peak na, or NULL on error.
2366
 *
2367
 * <pre>
2368
 * Notes:
2369
 *     (1) The returned na consists of sets of four numbers representing
2370
 *         the peak, in the following order:
2371
 *            left edge; peak center; right edge; normalized peak area
2372
 * </pre>
2373
 */
2374
NUMA *
2375
numaFindPeaks(NUMA      *nas,
2376
              l_int32    nmax,
2377
              l_float32  fract1,
2378
              l_float32  fract2)
2379
0
{
2380
0
l_int32    i, k, n, maxloc, lloc, rloc;
2381
0
l_float32  fmaxval, sum, total, newtotal, val, lastval;
2382
0
l_float32  peakfract;
2383
0
NUMA      *na, *napeak;
2384
2385
0
    if (!nas)
2386
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
2387
0
    n = numaGetCount(nas);
2388
0
    numaGetSum(nas, &total);
2389
2390
        /* We munge this copy */
2391
0
    if ((na = numaCopy(nas)) == NULL)
2392
0
        return (NUMA *)ERROR_PTR("na not made", __func__, NULL);
2393
0
    if ((napeak = numaCreate(4 * nmax)) == NULL) {
2394
0
        numaDestroy(&na);
2395
0
        return (NUMA *)ERROR_PTR("napeak not made", __func__, NULL);
2396
0
    }
2397
2398
0
    for (k = 0; k < nmax; k++) {
2399
0
        numaGetSum(na, &newtotal);
2400
0
        if (newtotal == 0.0)   /* sanity check */
2401
0
            break;
2402
0
        numaGetMax(na, &fmaxval, &maxloc);
2403
0
        sum = fmaxval;
2404
0
        lastval = fmaxval;
2405
0
        lloc = 0;
2406
0
        for (i = maxloc - 1; i >= 0; --i) {
2407
0
            numaGetFValue(na, i, &val);
2408
0
            if (val == 0.0) {
2409
0
                lloc = i + 1;
2410
0
                break;
2411
0
            }
2412
0
            if (val > fract1 * fmaxval) {
2413
0
                sum += val;
2414
0
                lastval = val;
2415
0
                continue;
2416
0
            }
2417
0
            if (lastval - val > fract2 * lastval) {
2418
0
                sum += val;
2419
0
                lastval = val;
2420
0
                continue;
2421
0
            }
2422
0
            lloc = i;
2423
0
            break;
2424
0
        }
2425
0
        lastval = fmaxval;
2426
0
        rloc = n - 1;
2427
0
        for (i = maxloc + 1; i < n; ++i) {
2428
0
            numaGetFValue(na, i, &val);
2429
0
            if (val == 0.0) {
2430
0
                rloc = i - 1;
2431
0
                break;
2432
0
            }
2433
0
            if (val > fract1 * fmaxval) {
2434
0
                sum += val;
2435
0
                lastval = val;
2436
0
                continue;
2437
0
            }
2438
0
            if (lastval - val > fract2 * lastval) {
2439
0
                sum += val;
2440
0
                lastval = val;
2441
0
                continue;
2442
0
            }
2443
0
            rloc = i;
2444
0
            break;
2445
0
        }
2446
0
        peakfract = sum / total;
2447
0
        numaAddNumber(napeak, lloc);
2448
0
        numaAddNumber(napeak, maxloc);
2449
0
        numaAddNumber(napeak, rloc);
2450
0
        numaAddNumber(napeak, peakfract);
2451
2452
0
        for (i = lloc; i <= rloc; i++)
2453
0
            numaSetValue(na, i, 0.0);
2454
0
    }
2455
2456
0
    numaDestroy(&na);
2457
0
    return napeak;
2458
0
}
2459
2460
2461
/*!
2462
 * \brief   numaFindExtrema()
2463
 *
2464
 * \param[in]    nas     input values
2465
 * \param[in]    delta   relative amount to resolve peaks and valleys
2466
 * \param[out]   pnav    [optional] values of extrema
2467
 * \return  nad (locations of extrema), or NULL on error
2468
 *
2469
 * <pre>
2470
 * Notes:
2471
 *      (1) This returns a sequence of extrema (peaks and valleys).
2472
 *      (2) The algorithm is analogous to that for determining
2473
 *          mountain peaks.  Suppose we have a local peak, with
2474
 *          bumps on the side.  Under what conditions can we consider
2475
 *          those 'bumps' to be actual peaks?  The answer: if the
2476
 *          bump is separated from the peak by a saddle that is at
2477
 *          least 500 feet below the bump.
2478
 *      (3) Operationally, suppose we are trying to identify a peak.
2479
 *          We have a previous valley, and also the largest value that
2480
 *          we have seen since that valley.  We can identify this as
2481
 *          a peak if we find a value that is delta BELOW it.  When
2482
 *          we find such a value, label the peak, use the current
2483
 *          value to label the starting point for the search for
2484
 *          a valley, and do the same operation in reverse.  Namely,
2485
 *          keep track of the lowest point seen, and look for a value
2486
 *          that is delta ABOVE it.  Once found, the lowest point is
2487
 *          labeled the valley, and continue, looking for the next peak.
2488
 * </pre>
2489
 */
2490
NUMA *
2491
numaFindExtrema(NUMA      *nas,
2492
                l_float32  delta,
2493
                NUMA     **pnav)
2494
0
{
2495
0
l_int32    i, n, found, loc, direction;
2496
0
l_float32  startval, val, maxval, minval;
2497
0
NUMA      *nav, *nad;
2498
2499
0
    if (pnav) *pnav = NULL;
2500
0
    if (!nas)
2501
0
        return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
2502
0
    if (delta < 0.0)
2503
0
        return (NUMA *)ERROR_PTR("delta < 0", __func__, NULL);
2504
2505
0
    n = numaGetCount(nas);
2506
0
    nad = numaCreate(0);
2507
0
    nav = NULL;
2508
0
    if (pnav) {
2509
0
        nav = numaCreate(0);
2510
0
        *pnav = nav;
2511
0
    }
2512
2513
        /* We don't know if we'll find a peak or valley first,
2514
         * but use the first element of nas as the reference point.
2515
         * Break when we deviate by 'delta' from the first point. */
2516
0
    numaGetFValue(nas, 0, &startval);
2517
0
    found = FALSE;
2518
0
    for (i = 1; i < n; i++) {
2519
0
        numaGetFValue(nas, i, &val);
2520
0
        if (L_ABS(val - startval) >= delta) {
2521
0
            found = TRUE;
2522
0
            break;
2523
0
        }
2524
0
    }
2525
2526
0
    if (!found)
2527
0
        return nad;  /* it's empty */
2528
2529
        /* Are we looking for a peak or a valley? */
2530
0
    if (val > startval) {  /* peak */
2531
0
        direction = 1;
2532
0
        maxval = val;
2533
0
    } else {
2534
0
        direction = -1;
2535
0
        minval = val;
2536
0
    }
2537
0
    loc = i;
2538
2539
        /* Sweep through the rest of the array, recording alternating
2540
         * peak/valley extrema. */
2541
0
    for (i = i + 1; i < n; i++) {
2542
0
        numaGetFValue(nas, i, &val);
2543
0
        if (direction == 1 && val > maxval ) {  /* new local max */
2544
0
            maxval = val;
2545
0
            loc = i;
2546
0
        } else if (direction == -1 && val < minval ) {  /* new local min */
2547
0
            minval = val;
2548
0
            loc = i;
2549
0
        } else if (direction == 1 && (maxval - val >= delta)) {
2550
0
            numaAddNumber(nad, loc);  /* save the current max location */
2551
0
            if (nav) numaAddNumber(nav, maxval);
2552
0
            direction = -1;  /* reverse: start looking for a min */
2553
0
            minval = val;
2554
0
            loc = i;  /* current min location */
2555
0
        } else if (direction == -1 && (val - minval >= delta)) {
2556
0
            numaAddNumber(nad, loc);  /* save the current min location */
2557
0
            if (nav) numaAddNumber(nav, minval);
2558
0
            direction = 1;  /* reverse: start looking for a max */
2559
0
            maxval = val;
2560
0
            loc = i;  /* current max location */
2561
0
        }
2562
0
    }
2563
2564
        /* Save the final extremum */
2565
/*    numaAddNumber(nad, loc); */
2566
0
    return nad;
2567
0
}
2568
2569
2570
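A hedged sketch with a tiny synthetic signal and delta = 2.0: the extrema found are at indices 2 (peak, value 5), 4 (valley, value 1) and 6 (peak, value 6).

#include "allheaders.h"

int main(void)
{
    static const l_float32  data[8] = {0, 3, 5, 2, 1, 4, 6, 3};
    NUMA      *nas, *nad, *nav;
    l_int32    i;

    nas = numaCreate(8);
    for (i = 0; i < 8; i++)
        numaAddNumber(nas, data[i]);
    nad = numaFindExtrema(nas, 2.0, &nav);   /* locations; values in nav */
    numaWriteStderr(nad);                    /* expect 2, 4, 6 */
    numaDestroy(&nas);
    numaDestroy(&nad);
    numaDestroy(&nav);
    return 0;
}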
/*!
2571
 * \brief   numaFindLocForThreshold()
2572
 *
2573
 * \param[in]    na        input histogram
2574
 * \param[in]    skip      look-ahead distance to avoid false minima;
2575
 *                         use 0 for default
2576
 * \param[out]   pthresh   threshold value
2577
 * \param[out]   pfract    [optional] fraction below or at threshold
2578
 * \return  0 if OK, 1 on error or if no threshold can be found
2579
 *
2580
 * <pre>
2581
 * Notes:
2582
 *      (1) This finds a good place to set a threshold for a histogram
2583
 *          of values that has two peaks.  The peaks can differ greatly
2584
 *          in area underneath them.  The number of buckets in the
2585
 *          histogram is expected to be 256 (e.g., from an 8 bpp gray image).
2586
 *      (2) The input histogram should have been smoothed with a window
2587
 *          to avoid false peak and valley detection due to noise.  For
2588
 *          example, see pixThresholdByHisto().
2589
 *      (3) A skip value can be input to determine the look-ahead distance
2590
 *          to ignore a false peak on the rise or descent from the first peak.
2591
 *          Input 0 to use the default value (it assumes a histo size of 256).
2592
 *      (4) Optionally, the fractional area under the first peak can
2593
 *          be returned.
2594
 * </pre>
2595
 */
2596
l_ok
2597
numaFindLocForThreshold(NUMA       *na,
2598
                        l_int32     skip,
2599
                        l_int32    *pthresh,
2600
                        l_float32  *pfract)
2601
0
{
2602
0
l_int32     i, n, start, index, minloc, found;
2603
0
l_float32   val, pval, jval, minval, maxval, sum, partsum;
2604
0
l_float32  *fa;
2605
2606
0
    if (pfract) *pfract = 0.0;
2607
0
    if (!pthresh)
2608
0
        return ERROR_INT("&thresh not defined", __func__, 1);
2609
0
    *pthresh = 0;
2610
0
    if (!na)
2611
0
        return ERROR_INT("na not defined", __func__, 1);
2612
0
    if (skip <= 0) skip = 20;
2613
2614
        /* Test for constant value */
2615
0
    numaGetMin(na, &minval, NULL);
2616
0
    numaGetMax(na, &maxval, NULL);
2617
0
    if (minval == maxval)
2618
0
       return ERROR_INT("all array values are the same", __func__, 1);
2619
2620
        /* Look for the top of the first peak */
2621
0
    n = numaGetCount(na);
2622
0
    if (n < 256)
2623
0
        L_WARNING("array size %d < 256\n", __func__, n);
2624
0
    fa = numaGetFArray(na, L_NOCOPY);
2625
0
    pval = fa[0];
2626
0
    for (i = 1; i < n; i++) {
2627
0
        val = fa[i];
2628
0
        index = L_MIN(i + skip, n - 1);
2629
0
        jval = fa[index];
2630
0
        if (val < pval && jval < pval)  /* near the top if not there */
2631
0
            break;
2632
0
        pval = val;
2633
0
    }
2634
2635
0
    if (i > n - 5)  /* just an increasing function */
2636
0
       return ERROR_INT("top of first peak not found", __func__, 1);
2637
2638
        /* Look for the low point in the valley */
2639
0
    found = FALSE;
2640
0
    start = i;
2641
0
    pval = fa[start];
2642
0
    for (i = start + 1; i < n; i++) {
2643
0
        val = fa[i];
2644
0
        if (val <= pval) {  /* appears to be going down */
2645
0
            pval = val;
2646
0
        } else {  /* appears to be going up */
2647
0
            index = L_MIN(i + skip, n - 1);
2648
0
            jval = fa[index];  /* jump ahead by 'skip' */
2649
0
            if (val > jval) {  /* still going down; jump ahead */
2650
0
                pval = jval;
2651
0
                i = index;
2652
0
            } else {  /* really going up; passed the min */
2653
0
                found = TRUE;
2654
0
                break;
2655
0
            }
2656
0
        }
2657
0
    }
2658
0
    if (!found)
2659
0
       return ERROR_INT("no minimum found", __func__, 1);
2660
2661
        /* Find the location of the minimum in the interval */
2662
0
    minloc = index;  /* likely passed the min; look backward */
2663
0
    minval = fa[index];
2664
0
    for (i = index - 1; i > index - skip; i--) {
2665
0
        if (fa[i] < minval) {
2666
0
            minval = fa[i];
2667
0
            minloc = i;
2668
0
        }
2669
0
    }
2670
2671
        /* Is the minimum very near the end of the array? */
2672
0
    if (minloc > n - 10)
2673
0
       return ERROR_INT("minimum at end of array; invalid", __func__, 1);
2674
0
    *pthresh = minloc;
2675
2676
        /* Find the fraction under the first peak */
2677
0
    if (pfract) {
2678
0
        numaGetSumOnInterval(na, 0, minloc, &partsum);
2679
0
        numaGetSum(na, &sum);
2680
0
        if (sum > 0.0)
2681
0
           *pfract = partsum / sum;
2682
0
    }
2683
0
    return 0;
2684
0
}
2685
2686
2687
/*!
2688
 * \brief   numaCountReversals()
2689
 *
2690
 * \param[in]    nas          input values
2691
 * \param[in]    minreversal  relative amount to resolve peaks and valleys
2692
 * \param[out]   pnr          [optional] number of reversals
2693
 * \param[out]   prd          [optional] reversal density: reversals/length
2694
 * \return  0 if OK, 1 on error
2695
 *
2696
 * <pre>
2697
 * Notes:
2698
 *      (1) The input numa can be generated from pixExtractAlongLine().
2699
 *          If so, the x parameters can be used to find the reversal
2700
 *          frequency along a line.
2701
 *      (2) If the input numa was generated from a 1 bpp pix, the
2702
 *          values will be 0 and 1.  Use %minreversal == 1 to get
2703
 *          the number of pixel flips.  If the only values are 0 and 1,
2704
 *          but %minreversal > 1, set the reversal count to 0 and
2705
 *          issue a warning.
2706
 * </pre>
2707
 */
2708
l_ok
2709
numaCountReversals(NUMA       *nas,
2710
                   l_float32   minreversal,
2711
                   l_int32    *pnr,
2712
                   l_float32  *prd)
2713
0
{
2714
0
l_int32    i, n, nr, ival, binvals;
2715
0
l_int32   *ia;
2716
0
l_float32  fval, delx, len;
2717
0
NUMA      *nat;
2718
2719
0
    if (pnr) *pnr = 0;
2720
0
    if (prd) *prd = 0.0;
2721
0
    if (!pnr && !prd)
2722
0
        return ERROR_INT("neither &nr nor &rd are defined", __func__, 1);
2723
0
    if (!nas)
2724
0
        return ERROR_INT("nas not defined", __func__, 1);
2725
0
    if ((n = numaGetCount(nas)) == 0) {
2726
0
        L_INFO("nas is empty\n", __func__);
2727
0
        return 0;
2728
0
    }
2729
0
    if (minreversal < 0.0)
2730
0
        return ERROR_INT("minreversal < 0", __func__, 1);
2731
2732
        /* Decide if the only values are 0 and 1 */
2733
0
    binvals = TRUE;
2734
0
    for (i = 0; i < n; i++) {
2735
0
        numaGetFValue(nas, i, &fval);
2736
0
        if (fval != 0.0 && fval != 1.0) {
2737
0
            binvals = FALSE;
2738
0
            break;
2739
0
        }
2740
0
    }
2741
2742
0
    nr = 0;
2743
0
    if (binvals) {
2744
0
        if (minreversal > 1.0) {
2745
0
            L_WARNING("binary values but minreversal > 1\n", __func__);
2746
0
        } else {
2747
0
            ia = numaGetIArray(nas);
2748
0
            ival = ia[0];
2749
0
            for (i = 1; i < n; i++) {
2750
0
                if (ia[i] != ival) {
2751
0
                    nr++;
2752
0
                    ival = ia[i];
2753
0
                }
2754
0
            }
2755
0
            LEPT_FREE(ia);
2756
0
        }
2757
0
    } else {
2758
0
        nat = numaFindExtrema(nas, minreversal, NULL);
2759
0
        nr = numaGetCount(nat);
2760
0
        numaDestroy(&nat);
2761
0
    }
2762
0
    if (pnr) *pnr = nr;
2763
0
    if (prd) {
2764
0
        numaGetParameters(nas, NULL, &delx);
2765
0
        len = delx * n;
2766
0
        *prd = (l_float32)nr / len;
2767
0
    }
2768
2769
0
    return 0;
2770
0
}
2771
2772
2773
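A hedged sketch of note (2): with binary values and %minreversal = 1.0, the sequence 0,0,1,1,0,1 has 3 flips; with the default sampling (delx = 1) the reversal density is 0.5.

#include "allheaders.h"

int main(void)
{
    static const l_int32  seq[6] = {0, 0, 1, 1, 0, 1};
    NUMA      *nas;
    l_int32    i, nr;
    l_float32  rd;

    nas = numaCreate(6);
    for (i = 0; i < 6; i++)
        numaAddNumber(nas, seq[i]);
    numaCountReversals(nas, 1.0, &nr, &rd);
    lept_stderr("reversals = %d, density = %4.2f\n", nr, rd);  /* 3, 0.50 */
    numaDestroy(&nas);
    return 0;
}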
/*----------------------------------------------------------------------*
2774
 *                Threshold crossings and frequency analysis            *
2775
 *----------------------------------------------------------------------*/
2776
/*!
2777
 * \brief   numaSelectCrossingThreshold()
2778
 *
2779
 * \param[in]    nax          [optional] numa of abscissa values; can be NULL
2780
 * \param[in]    nay          signal
2781
 * \param[in]    estthresh    estimated pixel threshold for crossing:
2782
 *                            e.g., for images, white <--> black; typ. ~120
2783
 * \param[out]   pbestthresh  robust estimate of threshold to use
2784
 * \return  0 if OK, 1 on error or warning
2785
 *
2786
 * <pre>
2787
 * Notes:
2788
 *     (1) When a valid threshold is used, the number of crossings is
2789
 *         a maximum, because none are missed.  If no threshold intersects
2790
 *         all the crossings, the crossings must be determined with
2791
 *         numaCrossingsByPeaks().
2792
 *     (2) %estthresh is an input estimate of the threshold that should
2793
 *         be used.  We compute the crossings with 41 thresholds
2794
 *         (20 below and 20 above).  There is a range in which the
2795
 *         number of crossings is a maximum.  Return a threshold
2796
 *         in the center of this stable plateau of crossings.
2797
 *         This can then be used with numaCrossingsByThreshold()
2798
 *         to get a good estimate of crossing locations.
2799
 *     (3) If the count of nay is less than 2, a warning is issued.
2800
 * </pre>
2801
 */
2802
l_ok
2803
numaSelectCrossingThreshold(NUMA       *nax,
2804
                            NUMA       *nay,
2805
                            l_float32   estthresh,
2806
                            l_float32  *pbestthresh)
2807
0
{
2808
0
l_int32    i, inrun, istart, iend, maxstart, maxend, runlen, maxrunlen;
2809
0
l_int32    val, maxval, nmax, count;
2810
0
l_float32  thresh, fmaxval, fmodeval;
2811
0
NUMA      *nat, *nac;
2812
2813
0
    if (!pbestthresh)
2814
0
        return ERROR_INT("&bestthresh not defined", __func__, 1);
2815
0
    *pbestthresh = 0.0;
2816
0
    if (!nay)
2817
0
        return ERROR_INT("nay not defined", __func__, 1);
2818
0
    if (numaGetCount(nay) < 2) {
2819
0
        L_WARNING("nay count < 2; no threshold crossing\n", __func__);
2820
0
        return 1;
2821
0
    }
2822
2823
        /* Compute the number of crossings for different thresholds */
2824
0
    nat = numaCreate(41);
2825
0
    for (i = 0; i < 41; i++) {
2826
0
        thresh = estthresh - 80.0f + 4.0f * i;
2827
0
        nac = numaCrossingsByThreshold(nax, nay, thresh);
2828
0
        numaAddNumber(nat, numaGetCount(nac));
2829
0
        numaDestroy(&nac);
2830
0
    }
2831
2832
        /* Find the center of the plateau of max crossings, which
2833
         * extends from thresh[istart] to thresh[iend]. */
2834
0
    numaGetMax(nat, &fmaxval, NULL);
2835
0
    maxval = (l_int32)fmaxval;
2836
0
    nmax = 0;
2837
0
    for (i = 0; i < 41; i++) {
2838
0
        numaGetIValue(nat, i, &val);
2839
0
        if (val == maxval)
2840
0
            nmax++;
2841
0
    }
2842
0
    if (nmax < 3) {  /* likely accidental max; try the mode */
2843
0
        numaGetMode(nat, &fmodeval, &count);
2844
0
        if (count > nmax && fmodeval > 0.5 * fmaxval)
2845
0
            maxval = (l_int32)fmodeval;  /* use the mode */
2846
0
    }
2847
2848
0
    inrun = FALSE;
2849
0
    iend = 40;
2850
0
    maxrunlen = 0, maxstart = 0, maxend = 0;
2851
0
    for (i = 0; i < 41; i++) {
2852
0
        numaGetIValue(nat, i, &val);
2853
0
        if (val == maxval) {
2854
0
            if (!inrun) {
2855
0
                istart = i;
2856
0
                inrun = TRUE;
2857
0
            }
2858
0
            continue;
2859
0
        }
2860
0
        if (inrun && (val != maxval)) {
2861
0
            iend = i - 1;
2862
0
            runlen = iend - istart + 1;
2863
0
            inrun = FALSE;
2864
0
            if (runlen > maxrunlen) {
2865
0
                maxstart = istart;
2866
0
                maxend = iend;
2867
0
                maxrunlen = runlen;
2868
0
            }
2869
0
        }
2870
0
    }
2871
0
    if (inrun) {
2872
0
        runlen = i - istart;
2873
0
        if (runlen > maxrunlen) {
2874
0
            maxstart = istart;
2875
0
            maxend = i - 1;
2876
0
            maxrunlen = runlen;
2877
0
        }
2878
0
    }
2879
2880
0
    *pbestthresh = estthresh - 80.0f + 2.0f * (l_float32)(maxstart + maxend);
2881
2882
#if  DEBUG_CROSSINGS
2883
    lept_stderr("\nCrossings attain a maximum at %d thresholds, between:\n"
2884
                "  thresh[%d] = %5.1f and thresh[%d] = %5.1f\n",
2885
                nmax, maxstart, estthresh - 80.0 + 4.0 * maxstart,
2886
                maxend, estthresh - 80.0 + 4.0 * maxend);
2887
    lept_stderr("The best choice: %5.1f\n", *pbestthresh);
2888
    lept_stderr("Number of crossings at the 41 thresholds:");
2889
    numaWriteStderr(nat);
2890
#endif  /* DEBUG_CROSSINGS */
2891
2892
0
    numaDestroy(&nat);
2893
0
    return 0;
2894
0
}
2895
2896
2897
/*!
2898
 * \brief   numaCrossingsByThreshold()
2899
 *
2900
 * \param[in]    nax     [optional] numa of abscissa values; can be NULL
2901
 * \param[in]    nay     numa of ordinate values, corresponding to nax
2902
 * \param[in]    thresh  threshold value for nay
2903
 * \return  nad abscissa pts at threshold, or NULL on error
2904
 *
2905
 * <pre>
2906
 * Notes:
2907
 *      (1) If nax == NULL, we use startx and delx from nay to compute
2908
 *          the crossing values in nad.
2909
 * </pre>
2910
 */
2911
NUMA *
2912
numaCrossingsByThreshold(NUMA      *nax,
2913
                         NUMA      *nay,
2914
                         l_float32  thresh)
2915
0
{
2916
0
l_int32    i, n;
2917
0
l_float32  startx, delx;
2918
0
l_float32  xval1, xval2, yval1, yval2, delta1, delta2, crossval, fract;
2919
0
NUMA      *nad;
2920
2921
0
    if (!nay)
2922
0
        return (NUMA *)ERROR_PTR("nay not defined", __func__, NULL);
2923
0
    n = numaGetCount(nay);
2924
2925
0
    if (nax && (numaGetCount(nax) != n))
2926
0
        return (NUMA *)ERROR_PTR("nax and nay sizes differ", __func__, NULL);
2927
2928
0
    nad = numaCreate(0);
2929
0
    if (n < 2) return nad;
2930
0
    numaGetFValue(nay, 0, &yval1);
2931
0
    numaGetParameters(nay, &startx, &delx);
2932
0
    if (nax)
2933
0
        numaGetFValue(nax, 0, &xval1);
2934
0
    else
2935
0
        xval1 = startx;
2936
0
    for (i = 1; i < n; i++) {
2937
0
        numaGetFValue(nay, i, &yval2);
2938
0
        if (nax)
2939
0
            numaGetFValue(nax, i, &xval2);
2940
0
        else
2941
0
            xval2 = startx + i * delx;
2942
0
        delta1 = yval1 - thresh;
2943
0
        delta2 = yval2 - thresh;
2944
0
        if (delta1 == 0.0) {
2945
0
            numaAddNumber(nad, xval1);
2946
0
        } else if (delta2 == 0.0) {
2947
0
            numaAddNumber(nad, xval2);
2948
0
        } else if (delta1 * delta2 < 0.0) {  /* crossing */
2949
0
            fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
2950
0
            crossval = xval1 + fract * (xval2 - xval1);
2951
0
            numaAddNumber(nad, crossval);
2952
0
        }
2953
0
        xval1 = xval2;
2954
0
        yval1 = yval2;
2955
0
    }
2956
2957
0
    return nad;
2958
0
}
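
A minimal usage sketch for numaCrossingsByThreshold() follows (not library code; the data and the helper name DemoCrossingsByThreshold are invented for illustration). With nax == NULL the crossings come back in the implicit x units of nay, and each crossing is located by the linear interpolation above: going from y = 2 to y = 8 across a threshold of 5 puts the crossing halfway between the two samples.

#include "allheaders.h"

/* Sketch: fabricated 6-sample signal; crossings of thresh = 5.0 are
 * expected near x = 0.5, 2.67 and 4.67 with the interpolation above. */
void
DemoCrossingsByThreshold(void)
{
l_int32    i, n;
l_float32  xc;
l_float32  samples[6] = {2.0f, 8.0f, 9.0f, 3.0f, 1.0f, 7.0f};
NUMA      *nay, *nad;

    nay = numaCreate(6);
    for (i = 0; i < 6; i++)
        numaAddNumber(nay, samples[i]);
    nad = numaCrossingsByThreshold(NULL, nay, 5.0f);
    n = numaGetCount(nad);
    for (i = 0; i < n; i++) {
        numaGetFValue(nad, i, &xc);
        lept_stderr("crossing %d at x = %6.3f\n", i, xc);
    }
    numaDestroy(&nay);
    numaDestroy(&nad);
}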
2959
2960
2961
/*!
2962
 * \brief   numaCrossingsByPeaks()
2963
 *
2964
 * \param[in]    nax     [optional] numa of abscissa values
2965
 * \param[in]    nay     numa of ordinate values, corresponding to nax
2966
 * \param[in]    delta   parameter used to identify when a new peak can be found
2967
 * \return  nad abscissa pts at crossings midway between adjacent extrema,
 *              or NULL on error
2968
 *
2969
 * <pre>
2970
 * Notes:
2971
 *      (1) If nax == NULL, we use startx and delx from nay to compute
2972
 *          the crossing values in nad.
2973
 * </pre>
2974
 */
2975
NUMA *
2976
numaCrossingsByPeaks(NUMA      *nax,
2977
                     NUMA      *nay,
2978
                     l_float32  delta)
2979
0
{
2980
0
l_int32    i, j, n, np, previndex, curindex;
2981
0
l_float32  startx, delx;
2982
0
l_float32  xval1, xval2, yval1, yval2, delta1, delta2;
2983
0
l_float32  prevval, curval, thresh, crossval, fract;
2984
0
NUMA      *nap, *nad;
2985
2986
0
    if (!nay)
2987
0
        return (NUMA *)ERROR_PTR("nay not defined", __func__, NULL);
2988
2989
0
    n = numaGetCount(nay);
2990
0
    if (nax && (numaGetCount(nax) != n))
2991
0
        return (NUMA *)ERROR_PTR("nax and nay sizes differ", __func__, NULL);
2992
2993
        /* Find the extrema.  Also add last point in nay to get
2994
         * the last transition (from the last peak to the end).
2995
         * The number of crossings is 1 more than the number of extrema. */
2996
0
    nap = numaFindExtrema(nay, delta, NULL);
2997
0
    numaAddNumber(nap, n - 1);
2998
0
    np = numaGetCount(nap);
2999
0
    L_INFO("Number of crossings: %d\n", __func__, np);
3000
3001
        /* Do all computation in index units of nax or the delx of nay */
3002
0
    nad = numaCreate(np);  /* output crossing locations, in nax units */
3003
0
    previndex = 0;  /* prime the search with 1st point */
3004
0
    numaGetFValue(nay, 0, &prevval);  /* value at the 1st point */
3005
0
    numaGetParameters(nay, &startx, &delx);
3006
0
    for (i = 0; i < np; i++) {
3007
0
        numaGetIValue(nap, i, &curindex);
3008
0
        numaGetFValue(nay, curindex, &curval);
3009
0
        thresh = (prevval + curval) / 2.0f;
3010
0
        if (nax)
3011
0
            numaGetFValue(nax, previndex, &xval1);
3012
0
        else
3013
0
            xval1 = startx + previndex * delx;
3014
0
        numaGetFValue(nay, previndex, &yval1);
3015
0
        for (j = previndex + 1; j <= curindex; j++) {
3016
0
            if (nax)
3017
0
                numaGetFValue(nax, j, &xval2);
3018
0
            else
3019
0
                xval2 = startx + j * delx;
3020
0
            numaGetFValue(nay, j, &yval2);
3021
0
            delta1 = yval1 - thresh;
3022
0
            delta2 = yval2 - thresh;
3023
0
            if (delta1 == 0.0) {
3024
0
                numaAddNumber(nad, xval1);
3025
0
                break;
3026
0
            } else if (delta2 == 0.0) {
3027
0
                numaAddNumber(nad, xval2);
3028
0
                break;
3029
0
            } else if (delta1 * delta2 < 0.0) {  /* crossing */
3030
0
                fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
3031
0
                crossval = xval1 + fract * (xval2 - xval1);
3032
0
                numaAddNumber(nad, crossval);
3033
0
                break;
3034
0
            }
3035
0
            xval1 = xval2;
3036
0
            yval1 = yval2;
3037
0
        }
3038
0
        previndex = curindex;
3039
0
        prevval = curval;
3040
0
    }
3041
3042
0
    numaDestroy(&nap);
3043
0
    return nad;
3044
0
}
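
A similar sketch for numaCrossingsByPeaks() (illustrative data and helper name, not library code). No fixed threshold is supplied here; each crossing is taken at the midpoint between a pair of adjacent extrema found by numaFindExtrema() with the given delta.

#include "allheaders.h"

/* Sketch: two peaks with a valley between them; delta = 3.0 requires an
 * excursion of at least 3 before an extremum is accepted. */
void
DemoCrossingsByPeaks(void)
{
l_int32    i, n;
l_float32  xc;
l_float32  samples[9] = {1.0f, 4.0f, 9.0f, 5.0f, 2.0f, 6.0f, 10.0f, 4.0f, 1.0f};
NUMA      *nay, *nad;

    nay = numaCreate(9);
    for (i = 0; i < 9; i++)
        numaAddNumber(nay, samples[i]);
    nad = numaCrossingsByPeaks(NULL, nay, 3.0f);
    n = numaGetCount(nad);
    for (i = 0; i < n; i++) {
        numaGetFValue(nad, i, &xc);
        lept_stderr("crossing %d at x = %6.3f\n", i, xc);
    }
    numaDestroy(&nay);
    numaDestroy(&nad);
}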
3045
3046
3047
/*!
3048
 * \brief   numaEvalBestHaarParameters()
3049
 *
3050
 * \param[in]    nas         numa of non-negative signal values
3051
 * \param[in]    relweight   relative weight of (-1 comb) / (+1 comb)
3052
 *                           contributions to the 'convolution'.  In effect,
3053
 *                           the convolution kernel is a comb consisting of
3054
 *                           alternating +1 and -weight.
3055
 * \param[in]    nwidth      number of widths to consider
3056
 * \param[in]    nshift      number of shifts to consider for each width
3057
 * \param[in]    minwidth    smallest width to consider
3058
 * \param[in]    maxwidth    largest width to consider
3059
 * \param[out]   pbestwidth  width giving largest score
3060
 * \param[out]   pbestshift  shift giving largest score
3061
 * \param[out]   pbestscore  [optional] convolution with "Haar"-like comb
3062
 * \return  0 if OK, 1 on error
3063
 *
3064
 * <pre>
3065
 * Notes:
3066
 *      (1) This does a linear sweep of widths, evaluating at %nshift
3067
 *          shifts for each width, computing the score from a convolution
3068
 *          with a long comb, and finding the (width, shift) pair that
3069
 *          gives the maximum score.  The best width is the "half-wavelength"
3070
 *          of the signal.
3071
 *      (2) The convolving function is a comb of alternating values
3072
 *          +1 and -1 * relweight, separated by the width and phased by
3073
 *          the shift.  This is similar to a Haar transform, except
3074
 *          there the convolution is performed with a square wave.
3075
 *      (3) The function is useful for finding the line spacing
3076
 *          and strength of line signal from pixel sum projections.
3077
 *      (4) The score is normalized to the size of nas divided by
3078
 *          twice the width.  For image applications, the input is
3079
 *          typically an array of pixel projections, so one should
3080
 *          normalize by dividing the score by the image width in the
3081
 *          pixel projection direction.
3082
 * </pre>
3083
 */
3084
l_ok
3085
numaEvalBestHaarParameters(NUMA       *nas,
3086
                           l_float32   relweight,
3087
                           l_int32     nwidth,
3088
                           l_int32     nshift,
3089
                           l_float32   minwidth,
3090
                           l_float32   maxwidth,
3091
                           l_float32  *pbestwidth,
3092
                           l_float32  *pbestshift,
3093
                           l_float32  *pbestscore)
3094
0
{
3095
0
l_int32    i, j;
3096
0
l_float32  delwidth, delshift, width, shift, score;
3097
0
l_float32  bestwidth, bestshift, bestscore;
3098
3099
0
    if (pbestscore) *pbestscore = 0.0;
3100
0
    if (pbestwidth) *pbestwidth = 0.0;
3101
0
    if (pbestshift) *pbestshift = 0.0;
3102
0
    if (!pbestwidth || !pbestshift)
3103
0
        return ERROR_INT("&bestwidth and &bestshift not defined", __func__, 1);
3104
0
    if (!nas)
3105
0
        return ERROR_INT("nas not defined", __func__, 1);
3106
3107
0
    bestscore = bestwidth = bestshift = 0.0;
3108
0
    delwidth = (maxwidth - minwidth) / (nwidth - 1.0f);
3109
0
    for (i = 0; i < nwidth; i++) {
3110
0
        width = minwidth + delwidth * i;
3111
0
        delshift = width / (l_float32)(nshift);
3112
0
        for (j = 0; j < nshift; j++) {
3113
0
            shift = j * delshift;
3114
0
            numaEvalHaarSum(nas, width, shift, relweight, &score);
3115
0
            if (score > bestscore) {
3116
0
                bestscore = score;
3117
0
                bestwidth = width;
3118
0
                bestshift = shift;
3119
#if  DEBUG_FREQUENCY
3120
                lept_stderr("width = %7.3f, shift = %7.3f, score = %7.3f\n",
3121
                            width, shift, score);
3122
#endif  /* DEBUG_FREQUENCY */
3123
0
            }
3124
0
        }
3125
0
    }
3126
3127
0
    *pbestwidth = bestwidth;
3128
0
    *pbestshift = bestshift;
3129
0
    if (pbestscore)
3130
0
        *pbestscore = bestscore;
3131
0
    return 0;
3132
0
}
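
A usage sketch for numaEvalBestHaarParameters() on a synthetic block-periodic "projection" signal (all values and the helper name are illustrative, not prescribed by the library). The signal alternates 8 low samples with 8 high samples, so with the comb convention shown in numaEvalHaarSum() below, the sweep should report a best width near the 8-sample half-period.

#include "allheaders.h"

/* Sketch: 160-sample signal, 8 low (2.0) then 8 high (20.0), repeating.
 * Sweep 25 widths in [4, 16] and 10 shifts per width. */
void
DemoBestHaarParameters(void)
{
l_int32    i;
l_float32  bestwidth, bestshift, bestscore;
NUMA      *nas;

    nas = numaCreate(160);
    for (i = 0; i < 160; i++)
        numaAddNumber(nas, ((i / 8) % 2 == 0) ? 2.0f : 20.0f);
    numaEvalBestHaarParameters(nas, 1.0f, 25, 10, 4.0f, 16.0f,
                               &bestwidth, &bestshift, &bestscore);
    lept_stderr("best width = %5.2f, shift = %5.2f, score = %7.3f\n",
                bestwidth, bestshift, bestscore);
    numaDestroy(&nas);
}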
3133
3134
3135
/*!
3136
 * \brief   numaEvalHaarSum()
3137
 *
3138
 * \param[in]    nas         numa of non-negative signal values
3139
 * \param[in]    width       distance between +1 and -1 in convolution comb
3140
 * \param[in]    shift       phase of the comb: location of first +1
3141
 * \param[in]    relweight   relative weight of (-1 comb) / (+1 comb)
3142
 *                           contributions to the 'convolution'.  In effect,
3143
 *                           the convolution kernel is a comb consisting of
3144
 *                           alternating +1 and -weight.
3145
 * \param[out]   pscore      convolution with "Haar"-like comb
3146
 * \return  0 if OK, 1 on error
3147
 *
3148
 * <pre>
3149
 * Notes:
3150
 *      (1) This does a convolution with a comb of alternating values
3151
 *          +1 and -relweight, separated by the width and phased by the shift.
3152
 *          This is similar to a Haar transform, except that for Haar,
3153
 *            (1) the convolution kernel is symmetric about 0, so the
3154
 *                relweight is 1.0, and
3155
 *            (2) the convolution is performed with a square wave.
3156
 *      (2) The score is normalized to the size of nas divided by
3157
 *          twice the "width".  For image applications, the input is
3158
 *          typically an array of pixel projections, so one should
3159
 *          normalize by dividing the score by the image width in the
3160
 *          pixel projection direction.
3161
 *      (3) To get a Haar-like result, use relweight = 1.0.  For detecting
3162
 *          signals where you expect every other sample to be close to
3163
 *          zero, as with barcodes or filtered text lines, you can
3164
 *          use relweight > 1.0.
3165
 * </pre>
3166
 */
3167
l_ok
3168
numaEvalHaarSum(NUMA       *nas,
3169
                l_float32   width,
3170
                l_float32   shift,
3171
                l_float32   relweight,
3172
                l_float32  *pscore)
3173
0
{
3174
0
l_int32    i, n, nsamp, index;
3175
0
l_float32  score, weight, val;
3176
3177
0
    if (!pscore)
3178
0
        return ERROR_INT("&score not defined", __func__, 1);
3179
0
    *pscore = 0.0;
3180
0
    if (!nas)
3181
0
        return ERROR_INT("nas not defined", __func__, 1);
3182
0
    if ((n = numaGetCount(nas)) < 2 * width)
3183
0
        return ERROR_INT("nas size too small", __func__, 1);
3184
3185
0
    score = 0.0;
3186
0
    nsamp = (l_int32)((n - shift) / width);
3187
0
    for (i = 0; i < nsamp; i++) {
3188
0
        index = (l_int32)(shift + i * width);
3189
0
        weight = (i % 2) ? 1.0f : -1.0f * relweight;
3190
0
        numaGetFValue(nas, index, &val);
3191
0
        score += weight * val;
3192
0
    }
3193
3194
0
    *pscore = 2.0f * width * score / (l_float32)n;
3195
0
    return 0;
3196
0
}
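
A worked check of the normalization (illustrative, not library code): for the same low-first block signal, with width = 8, shift = 0 and relweight = 1.0, the comb above places its -1 samples on the low blocks and its +1 samples on the high blocks, so the raw sum is 10 * 20 - 10 * 2 = 180 and the reported score is 2 * 8 * 180 / 160 = 18.

#include "allheaders.h"

/* Sketch: evaluate the comb score at a single (width, shift) pair. */
void
DemoHaarSum(void)
{
l_int32    i;
l_float32  score;
NUMA      *nas;

    nas = numaCreate(160);
    for (i = 0; i < 160; i++)
        numaAddNumber(nas, ((i / 8) % 2 == 0) ? 2.0f : 20.0f);
    numaEvalHaarSum(nas, 8.0f, 0.0f, 1.0f, &score);
    lept_stderr("score = %7.3f\n", score);   /* expect about 18.0 */
    numaDestroy(&nas);
}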
3197
3198
3199
/*----------------------------------------------------------------------*
3200
 *            Generating numbers in a range under constraints           *
3201
 *----------------------------------------------------------------------*/
3202
/*!
3203
 * \brief   genConstrainedNumaInRange()
3204
 *
3205
 * \param[in]    first     first number to choose; >= 0
3206
 * \param[in]    last      biggest possible number to reach; >= first
3207
 * \param[in]    nmax      maximum number of numbers to select; > 0
3208
 * \param[in]    use_pairs 1 = select pairs of adjacent numbers;
3209
 *                         0 = select individual numbers
3210
 * \return  na containing the selected numbers, or NULL on error
3211
 *
3212
 * <pre>
3213
 * Notes:
3214
 *     (1) Selection is made uniformly in the range.  This can be used
3215
 *         to select pages distributed as uniformly as possible
3216
 *         through a book, where you are constrained to:
3217
 *          ~ choose between [first, ... last],
3218
 *          ~ choose no more than nmax numbers,
3219
 *         and you have the option of requiring pairs of adjacent numbers.
3220
 * </pre>
3221
 */
3222
NUMA *
3223
genConstrainedNumaInRange(l_int32  first,
3224
                          l_int32  last,
3225
                          l_int32  nmax,
3226
                          l_int32  use_pairs)
3227
0
{
3228
0
l_int32    i, nsets, val;
3229
0
l_float32  delta;
3230
0
NUMA      *na;
3231
3232
0
    first = L_MAX(0, first);
3233
0
    if (last < first)
3234
0
        return (NUMA *)ERROR_PTR("last < first!", __func__, NULL);
3235
0
    if (nmax < 1)
3236
0
        return (NUMA *)ERROR_PTR("nmax < 1!", __func__, NULL);
3237
3238
0
    nsets = L_MIN(nmax, last - first + 1);
3239
0
    if (use_pairs == 1)
3240
0
        nsets = nsets / 2;
3241
0
    if (nsets == 0)
3242
0
        return (NUMA *)ERROR_PTR("nsets == 0", __func__, NULL);
3243
3244
        /* Select delta so that selection covers the full range if possible */
3245
0
    if (nsets == 1) {
3246
0
        delta = 0.0;
3247
0
    } else {
3248
0
        if (use_pairs == 0)
3249
0
            delta = (l_float32)(last - first) / (nsets - 1);
3250
0
        else
3251
0
            delta = (l_float32)(last - first - 1) / (nsets - 1);
3252
0
    }
3253
3254
0
    na = numaCreate(nsets);
3255
0
    for (i = 0; i < nsets; i++) {
3256
0
        val = (l_int32)(first + i * delta + 0.5);
3257
0
        numaAddNumber(na, val);
3258
0
        if (use_pairs == 1)
3259
0
            numaAddNumber(na, val + 1);
3260
0
    }
3261
3262
0
    return na;
3263
0
}
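
A usage sketch for genConstrainedNumaInRange() (invented values and helper name): select at most 10 numbers from [1 ... 200] as adjacent pairs. With nmax = 10 and use_pairs = 1, the delta arithmetic above should yield 5 pairs spread across the range: 1,2, 51,52, 100,101, 150,151, 199,200.

#include "allheaders.h"

/* Sketch: pick up to 10 page numbers in [1 ... 200], as adjacent pairs. */
void
DemoConstrainedRange(void)
{
l_int32  i, n, val;
NUMA    *na;

    na = genConstrainedNumaInRange(1, 200, 10, 1);
    n = numaGetCount(na);   /* 5 pairs -> 10 numbers */
    for (i = 0; i < n; i++) {
        numaGetIValue(na, i, &val);
        lept_stderr(" %d", val);
    }
    lept_stderr("\n");
    numaDestroy(&na);
}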