Coverage Report

Created: 2026-06-07 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/leptonica/src/classapp.c
Line
Count
Source
1
/*====================================================================*
2
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
3
 -
4
 -  Redistribution and use in source and binary forms, with or without
5
 -  modification, are permitted provided that the following conditions
6
 -  are met:
7
 -  1. Redistributions of source code must retain the above copyright
8
 -     notice, this list of conditions and the following disclaimer.
9
 -  2. Redistributions in binary form must reproduce the above
10
 -     copyright notice, this list of conditions and the following
11
 -     disclaimer in the documentation and/or other materials
12
 -     provided with the distribution.
13
 -
14
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
 *====================================================================*/
26
27
/*!
28
 * \file classapp.c
29
 * <pre>
30
 *
31
 *      Top-level jb2 correlation and rank-hausdorff
32
 *         l_int32         jbCorrelation()
33
 *         l_int32         jbRankHaus()
34
 *
35
 *      Extract and classify words in textline order
36
 *         JBCLASSER      *jbWordsInTextlines()
37
 *         l_int32         pixGetWordsInTextlines()
38
 *         l_int32         pixGetWordBoxesInTextlines()
39
 *
40
 *      Extract word and character bounding boxes
41
 *         l_int32         pixFindWordAndCharacterBoxes()
42
 *
43
 *      Use word bounding boxes to compare page images
44
 *         NUMAA          *boxaExtractSortedPattern()
45
 *         l_int32         numaaCompareImagesByBoxes()
46
 *         static l_int32  testLineAlignmentX()
47
 *         static l_int32  countAlignedMatches()
48
 *         static void     printRowIndices()
49
 * </pre>
50
 */
51
52
#ifdef HAVE_CONFIG_H
53
#include <config_auto.h>
54
#endif  /* HAVE_CONFIG_H */
55
56
#include <string.h>
57
#include "allheaders.h"
58
59
0
#define L_BUF_SIZE 512                        /*!< size of filename buffer    */
60
static const l_int32 JB_WORDS_MIN_WIDTH = 5;  /*!< min. word width in pixels  */
61
static const l_int32 JB_WORDS_MIN_HEIGHT = 3; /*!< min. word height in pixels */
62
63
    /* Static comparison functions */
64
static l_int32 testLineAlignmentX(NUMA *na1, NUMA *na2, l_int32 shiftx,
65
                                  l_int32 delx, l_int32 nperline);
66
static l_int32 countAlignedMatches(NUMA *nai1, NUMA *nai2, NUMA *nasx,
67
                                   NUMA *nasy, l_int32 n1, l_int32 n2,
68
                                   l_int32 delx, l_int32 dely,
69
                                   l_int32 nreq, l_int32 *psame,
70
                                   l_int32 debugflag);
71
static void printRowIndices(l_int32 *index1, l_int32 n1,
72
                            l_int32 *index2, l_int32 n2);
73
74
/*------------------------------------------------------------------*
75
 *          Top-level jb2 correlation and rank-hausdorff            *
76
 *------------------------------------------------------------------*/
77
/*!
78
 * \brief   jbCorrelation()
79
 *
80
 * \param[in]    dirin        directory of input images
81
 * \param[in]    thresh       typically ~0.8
82
 * \param[in]    weight       typically ~0.6
83
 * \param[in]    components   JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS
84
 * \param[in]    rootname     for output files
85
 * \param[in]    firstpage    0-based
86
 * \param[in]    npages       use 0 for all pages in dirin
87
 * \param[in]    renderflag   1 to render from templates; 0 to skip
88
 * \return  0 if OK, 1 on error
89
 *
90
 * <pre>
91
 * Notes:
92
 *      (1) The images must be 1 bpp.  If they are not, you can convert
93
 *          them using convertFilesTo1bpp().
94
 *      (2) See prog/jbcorrelation for generating more output (e.g.,
95
 *          for debugging)
96
 * </pre>
97
 */
98
l_ok
99
jbCorrelation(const char  *dirin,
100
              l_float32    thresh,
101
              l_float32    weight,
102
              l_int32      components,
103
              const char  *rootname,
104
              l_int32      firstpage,
105
              l_int32      npages,
106
              l_int32      renderflag)
107
0
{
108
0
char        filename[L_BUF_SIZE];
109
0
l_int32     nfiles, i, numpages;
110
0
JBDATA     *data;
111
0
JBCLASSER  *classer;
112
0
PIX        *pix;
113
0
PIXA       *pixa;
114
0
SARRAY     *safiles;
115
116
0
    if (!dirin)
117
0
        return ERROR_INT("dirin not defined", __func__, 1);
118
0
    if (!rootname)
119
0
        return ERROR_INT("rootname not defined", __func__, 1);
120
0
    if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
121
0
        components != JB_WORDS)
122
0
        return ERROR_INT("components invalid", __func__, 1);
123
124
0
    safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages);
125
0
    nfiles = sarrayGetCount(safiles);
126
127
        /* Classify components */
128
0
    classer = jbCorrelationInit(components, 0, 0, thresh, weight);
129
0
    jbAddPages(classer, safiles);
130
131
        /* Save data */
132
0
    data = jbDataSave(classer);
133
0
    jbDataWrite(rootname, data);
134
135
        /* Optionally, render pages using class templates */
136
0
    if (renderflag) {
137
0
        pixa = jbDataRender(data, FALSE);
138
0
        numpages = pixaGetCount(pixa);
139
0
        if (numpages != nfiles)
140
0
            lept_stderr("numpages = %d, nfiles = %d, not equal!\n",
141
0
                        numpages, nfiles);
142
0
        for (i = 0; i < numpages; i++) {
143
0
            pix = pixaGetPix(pixa, i, L_CLONE);
144
0
            snprintf(filename, L_BUF_SIZE, "%s.%04d", rootname, i);
145
0
            lept_stderr("filename: %s\n", filename);
146
0
            pixWrite(filename, pix, IFF_PNG);
147
0
            pixDestroy(&pix);
148
0
        }
149
0
        pixaDestroy(&pixa);
150
0
    }
151
152
0
    sarrayDestroy(&safiles);
153
0
    jbClasserDestroy(&classer);
154
0
    jbDataDestroy(&data);
155
0
    return 0;
156
0
}
157
158
159
/*!
160
 * \brief   jbRankHaus()
161
 *
162
 * \param[in]    dirin         directory of input images
163
 * \param[in]    size          of Sel used for dilation; typ. 2
164
 * \param[in]    rank          rank value of match; typ. 0.97
165
 * \param[in]    components    JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS
166
 * \param[in]    rootname      for output files
167
 * \param[in]    firstpage     0-based
168
 * \param[in]    npages        use 0 for all pages in dirin
169
 * \param[in]    renderflag    1 to render from templates; 0 to skip
170
 * \return  0 if OK, 1 on error
171
 *
172
 * <pre>
173
 * Notes:
174
 *      (1) See prog/jbrankhaus for generating more output (e.g.,
175
 *          for debugging)
176
 * </pre>
177
 */
178
l_ok
179
jbRankHaus(const char  *dirin,
180
           l_int32      size,
181
           l_float32    rank,
182
           l_int32      components,
183
           const char  *rootname,
184
           l_int32      firstpage,
185
           l_int32      npages,
186
           l_int32      renderflag)
187
0
{
188
0
char        filename[L_BUF_SIZE];
189
0
l_int32     nfiles, i, numpages;
190
0
JBDATA     *data;
191
0
JBCLASSER  *classer;
192
0
PIX        *pix;
193
0
PIXA       *pixa;
194
0
SARRAY     *safiles;
195
196
0
    if (!dirin)
197
0
        return ERROR_INT("dirin not defined", __func__, 1);
198
0
    if (!rootname)
199
0
        return ERROR_INT("rootname not defined", __func__, 1);
200
0
    if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
201
0
        components != JB_WORDS)
202
0
        return ERROR_INT("components invalid", __func__, 1);
203
204
0
    safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages);
205
0
    nfiles = sarrayGetCount(safiles);
206
207
        /* Classify components */
208
0
    classer = jbRankHausInit(components, 0, 0, size, rank);
209
0
    jbAddPages(classer, safiles);
210
211
        /* Save data */
212
0
    data = jbDataSave(classer);
213
0
    jbDataWrite(rootname, data);
214
215
        /* Optionally, render pages using class templates */
216
0
    if (renderflag) {
217
0
        pixa = jbDataRender(data, FALSE);
218
0
        numpages = pixaGetCount(pixa);
219
0
        if (numpages != nfiles)
220
0
            lept_stderr("numpages = %d, nfiles = %d, not equal!\n",
221
0
                        numpages, nfiles);
222
0
        for (i = 0; i < numpages; i++) {
223
0
            pix = pixaGetPix(pixa, i, L_CLONE);
224
0
            snprintf(filename, L_BUF_SIZE, "%s.%04d", rootname, i);
225
0
            lept_stderr("filename: %s\n", filename);
226
0
            pixWrite(filename, pix, IFF_PNG);
227
0
            pixDestroy(&pix);
228
0
        }
229
0
        pixaDestroy(&pixa);
230
0
    }
231
232
0
    sarrayDestroy(&safiles);
233
0
    jbClasserDestroy(&classer);
234
0
    jbDataDestroy(&data);
235
0
    return 0;
236
0
}
237
238
239
240
/*------------------------------------------------------------------*
241
 *           Extract and classify words in textline order           *
242
 *------------------------------------------------------------------*/
243
/*!
244
 * \brief   jbWordsInTextlines()
245
 *
246
 * \param[in]    dirin       directory of input pages
247
 * \param[in]    reduction   1 for full res; 2 for half-res
248
 * \param[in]    maxwidth    of word mask components, to be kept
249
 * \param[in]    maxheight   of word mask components, to be kept
250
 * \param[in]    thresh      on correlation; 0.80 is reasonable
251
 * \param[in]    weight      for handling thick text; 0.6 is reasonable
252
 * \param[out]   pnatl       numa with textline index for each component
253
 * \param[in]    firstpage   0-based
254
 * \param[in]    npages      use 0 for all pages in dirin
255
 * \return  classer for the set of pages
256
 *
257
 * <pre>
258
 * Notes:
259
 *      (1) This is a high-level function.  See prog/jbwords for example
260
 *          of usage.
261
 *      (2) Typically, use input of 75 - 150 ppi for finding words.
262
 * </pre>
263
 */
264
JBCLASSER *
265
jbWordsInTextlines(const char  *dirin,
266
                   l_int32      reduction,
267
                   l_int32      maxwidth,
268
                   l_int32      maxheight,
269
                   l_float32    thresh,
270
                   l_float32    weight,
271
                   NUMA       **pnatl,
272
                   l_int32      firstpage,
273
                   l_int32      npages)
274
0
{
275
0
char       *fname;
276
0
l_int32     nfiles, i, w, h;
277
0
BOXA       *boxa;
278
0
JBCLASSER  *classer;
279
0
NUMA       *nai, *natl;
280
0
PIX        *pix1, *pix2;
281
0
PIXA       *pixa;
282
0
SARRAY     *safiles;
283
284
0
    if (!pnatl)
285
0
        return (JBCLASSER *)ERROR_PTR("&natl not defined", __func__, NULL);
286
0
    *pnatl = NULL;
287
0
    if (!dirin)
288
0
        return (JBCLASSER *)ERROR_PTR("dirin not defined", __func__, NULL);
289
0
    if (reduction != 1 && reduction != 2)
290
0
        return (JBCLASSER *)ERROR_PTR("reduction not in {1,2}", __func__, NULL);
291
292
0
    safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages);
293
0
    nfiles = sarrayGetCount(safiles);
294
295
        /* Classify components */
296
0
    classer = jbCorrelationInit(JB_WORDS, maxwidth, maxheight, thresh, weight);
297
0
    classer->safiles = sarrayCopy(safiles);
298
0
    natl = numaCreate(0);
299
0
    *pnatl = natl;
300
0
    for (i = 0; i < nfiles; i++) {
301
0
        fname = sarrayGetString(safiles, i, L_NOCOPY);
302
0
        if ((pix1 = pixRead(fname)) == NULL) {
303
0
            L_WARNING("image file %d not read\n", __func__, i);
304
0
            continue;
305
0
        }
306
0
        if (reduction == 1)
307
0
            pix2 = pixClone(pix1);
308
0
        else  /* reduction == 2 */
309
0
            pix2 = pixReduceRankBinaryCascade(pix1, 1, 0, 0, 0);
310
0
        pixGetWordsInTextlines(pix2, JB_WORDS_MIN_WIDTH,
311
0
                               JB_WORDS_MIN_HEIGHT, maxwidth, maxheight,
312
0
                               &boxa, &pixa, &nai);
313
0
        pixGetDimensions(pix2, &w, &h, NULL);
314
0
        classer->w = w;
315
0
        classer->h = h;
316
0
        jbAddPageComponents(classer, pix2, boxa, pixa);
317
0
        numaJoin(natl, nai, 0, -1);
318
0
        pixDestroy(&pix1);
319
0
        pixDestroy(&pix2);
320
0
        numaDestroy(&nai);
321
0
        boxaDestroy(&boxa);
322
0
        pixaDestroy(&pixa);
323
0
    }
324
325
0
    sarrayDestroy(&safiles);
326
0
    return classer;
327
0
}
328
329
330
/*!
331
 * \brief   pixGetWordsInTextlines()
332
 *
333
 * \param[in]    pixs        1 bpp, typ. 75 - 150 ppi
334
 * \param[in]    minwidth    of saved components; smaller are discarded
335
 * \param[in]    minheight   of saved components; smaller are discarded
336
 * \param[in]    maxwidth    of saved components; larger are discarded
337
 * \param[in]    maxheight   of saved components; larger are discarded
338
 * \param[out]   pboxad      word boxes sorted in textline line order
339
 * \param[out]   ppixad      word images sorted in textline line order
340
 * \param[out]   pnai        index of textline for each word
341
 * \return  0 if OK, 1 on error
342
 *
343
 * <pre>
344
 * Notes:
345
 *      (1) The input should be at a resolution of between 75 and 150 ppi.
346
 *      (2) The four size constraints on saved components are all
347
 *          scaled by %reduction.
348
 *      (3) The result are word images (and their b.b.), extracted in
349
 *          textline order, at either full res or 2x reduction,
350
 *          and with a numa giving the textline index for each word.
351
 *      (4) The pixa and boxa interfaces should make this type of
352
 *          application simple to put together.  The steps are:
353
 *           ~ generate first estimate of word masks
354
 *           ~ get b.b. of these, and remove the small and big ones
355
 *           ~ extract pixa of the word images, using the b.b.
356
 *           ~ sort actual word images in textline order (2d)
357
 *           ~ flatten them to a pixa (1d), saving the textline index
358
 *             for each pix
359
 *      (5) In an actual application, it may be desirable to pre-filter
360
 *          the input image to remove large components, to extract
361
 *          single columns of text, and to deskew them.  For example,
362
 *          to remove both large components and small noisy components
363
 *          that can interfere with the statistics used to estimate
364
 *          parameters for segmenting by words, but still retain text lines,
365
 *          the following image preprocessing can be done:
366
 *                Pix *pixt = pixMorphSequence(pixs, "c40.1", 0);
367
 *                Pix *pixf = pixSelectBySize(pixt, 0, 60, 8,
368
 *                                     L_SELECT_HEIGHT, L_SELECT_IF_LT, NULL);
369
 *                pixAnd(pixf, pixf, pixs);  // the filtered image
370
 *          The closing turns text lines into long blobs, but does not
371
 *          significantly increase their height.  But if there are many
372
 *          small connected components in a dense texture, this is likely
373
 *          to generate tall components that will be eliminated in pixf.
374
 * </pre>
375
 */
376
l_ok
377
pixGetWordsInTextlines(PIX     *pixs,
378
                       l_int32  minwidth,
379
                       l_int32  minheight,
380
                       l_int32  maxwidth,
381
                       l_int32  maxheight,
382
                       BOXA   **pboxad,
383
                       PIXA   **ppixad,
384
                       NUMA   **pnai)
385
0
{
386
0
BOXA    *boxa1, *boxad;
387
0
BOXAA   *baa;
388
0
NUMA    *nai;
389
0
NUMAA   *naa;
390
0
PIXA    *pixa1, *pixad;
391
0
PIXAA   *paa;
392
393
0
    if (!pboxad || !ppixad || !pnai)
394
0
        return ERROR_INT("&boxad, &pixad, &nai not all defined", __func__, 1);
395
0
    *pboxad = NULL;
396
0
    *ppixad = NULL;
397
0
    *pnai = NULL;
398
0
    if (!pixs)
399
0
        return ERROR_INT("pixs not defined", __func__, 1);
400
401
        /* Get the bounding boxes of the words from the word mask. */
402
0
    pixWordBoxesByDilation(pixs, minwidth, minheight, maxwidth, maxheight,
403
0
                           &boxa1, NULL, NULL);
404
405
        /* Generate a pixa of the word images */
406
0
    pixa1 = pixaCreateFromBoxa(pixs, boxa1, 0, 0, NULL);
407
408
        /* Sort the bounding boxes of these words by line.  We use the
409
         * index mapping to allow identical sorting of the pixa. */
410
0
    baa = boxaSort2d(boxa1, &naa, -1, -1, 4);
411
0
    paa = pixaSort2dByIndex(pixa1, naa, L_CLONE);
412
413
        /* Flatten the word paa */
414
0
    pixad = pixaaFlattenToPixa(paa, &nai, L_CLONE);
415
0
    boxad = pixaGetBoxa(pixad, L_COPY);
416
417
0
    *pnai = nai;
418
0
    *pboxad = boxad;
419
0
    *ppixad = pixad;
420
421
0
    pixaDestroy(&pixa1);
422
0
    boxaDestroy(&boxa1);
423
0
    boxaaDestroy(&baa);
424
0
    pixaaDestroy(&paa);
425
0
    numaaDestroy(&naa);
426
0
    return 0;
427
0
}
428
429
430
/*!
431
 * \brief   pixGetWordBoxesInTextlines()
432
 *
433
 * \param[in]    pixs        1 bpp, typ. 75 - 150 ppi
434
 * \param[in]    minwidth    of saved components; smaller are discarded
435
 * \param[in]    minheight   of saved components; smaller are discarded
436
 * \param[in]    maxwidth    of saved components; larger are discarded
437
 * \param[in]    maxheight   of saved components; larger are discarded
438
 * \param[out]   pboxad      word boxes sorted in textline line order
439
 * \param[out]   pnai        [optional] index of textline for each word
440
 * \return  0 if OK, 1 on error
441
 *
442
 * <pre>
443
 * Notes:
444
 *      (1) The input should be at a resolution of between 75 and 150 ppi.
445
 *      (2) This is a special version of pixGetWordsInTextlines(), that
446
 *          just finds the word boxes in line order, with a numa
447
 *          giving the textline index for each word.
448
 *          See pixGetWordsInTextlines() for more details.
449
 * </pre>
450
 */
451
l_ok
452
pixGetWordBoxesInTextlines(PIX     *pixs,
453
                           l_int32  minwidth,
454
                           l_int32  minheight,
455
                           l_int32  maxwidth,
456
                           l_int32  maxheight,
457
                           BOXA   **pboxad,
458
                           NUMA   **pnai)
459
699
{
460
699
BOXA    *boxa1;
461
699
BOXAA   *baa;
462
699
NUMA    *nai;
463
464
699
    if (pnai) *pnai = NULL;
465
699
    if (!pboxad)
466
0
        return ERROR_INT("&boxad and &nai not both defined", __func__, 1);
467
699
    *pboxad = NULL;
468
699
    if (!pixs)
469
0
        return ERROR_INT("pixs not defined", __func__, 1);
470
471
        /* Get the bounding boxes of the words from the word mask. */
472
699
    pixWordBoxesByDilation(pixs, minwidth, minheight, maxwidth, maxheight,
473
699
                           &boxa1, NULL, NULL);
474
475
        /* 2D sort the bounding boxes of these words. */
476
699
    baa = boxaSort2d(boxa1, NULL, 3, -5, 5);
477
478
        /* Flatten the boxaa, saving the boxa index for each box */
479
699
    *pboxad = boxaaFlattenToBoxa(baa, &nai, L_CLONE);
480
481
699
    if (pnai)
482
0
        *pnai = nai;
483
699
    else
484
699
        numaDestroy(&nai);
485
699
    boxaDestroy(&boxa1);
486
699
    boxaaDestroy(&baa);
487
699
    return 0;
488
699
}
489
490
491
/*------------------------------------------------------------------*
492
 *             Extract word and character bounding boxes            *
493
 *------------------------------------------------------------------*/
494
/*!
495
 * \brief   pixFindWordAndCharacterBoxes()
496
 *
497
 * \param[in]    pixs        2, 4, 8 or 32 bpp; colormap OK; typ. 300 ppi
498
 * \param[in]    boxs        [optional] region to select in pixs
499
 * \param[in]    thresh      binarization threshold (typ. 100 - 150)
500
 * \param[out]   pboxaw      return the word boxes
501
 * \param[out]   pboxaac     return the character boxes
502
 * \param[in]    debugdir    [optional] for debug images; use NULL to skip
503
 * \return  0 if OK, 1 on error
504
 *
505
 * <pre>
506
 * Notes:
507
 *      (1) If %boxs == NULL, the entire input image is used.
508
 *      (2) Having an input pix that is not 1bpp is necessary to reduce
509
 *          touching characters by using a low binarization threshold.
510
 *          Suggested thresholds are between 100 and 150.
511
 *      (3) The coordinates in the output boxes are global, with respect
512
 *          to the input image.
513
 * </pre>
514
 */
515
l_ok
516
pixFindWordAndCharacterBoxes(PIX         *pixs,
517
                             BOX         *boxs,
518
                             l_int32      thresh,
519
                             BOXA       **pboxaw,
520
                             BOXAA      **pboxaac,
521
                             const char  *debugdir)
522
808
{
523
808
char      *debugfile, *subdir;
524
808
l_int32    i, xs, ys, xb, yb, nb, loc;
525
808
l_float32  scalefact;
526
808
BOX       *box1, *box2;
527
808
BOXA      *boxa1, *boxa1a, *boxa2, *boxa3, *boxa4, *boxa5, *boxaw;
528
808
BOXAA     *boxaac;
529
808
PIX       *pix1, *pix2, *pix3, *pix3a, *pix4, *pix5;
530
531
808
    if (pboxaw) *pboxaw = NULL;
532
808
    if (pboxaac) *pboxaac = NULL;
533
808
    if (!pboxaw || !pboxaac)
534
0
        return ERROR_INT("&boxaw and &boxaac not defined", __func__, 1);
535
808
    if (!pixs || pixGetDepth(pixs) == 1)
536
32
        return ERROR_INT("pixs not defined or 1 bpp", __func__, 1);
537
776
    if (thresh > 150)
538
776
        L_WARNING("threshold is %d; may be too high\n", __func__, thresh);
539
540
776
    if (boxs) {
541
776
        if ((pix1 = pixClipRectangle(pixs, boxs, NULL)) == NULL)
542
77
            return ERROR_INT("pix1 not made", __func__, 1);
543
699
        boxGetGeometry(boxs, &xs, &ys, NULL, NULL);
544
699
    } else {
545
0
        pix1 = pixClone(pixs);
546
0
        xs = ys = 0;
547
0
    }
548
549
        /* Convert pix1 to 8 bpp gray if necessary */
550
699
    pix2 = pixConvertTo8(pix1, FALSE);
551
552
        /* To find the words and letters, work with 1 bpp images and use
553
         * a low threshold to reduce the number of touching characters. */
554
699
    pix3 = pixConvertTo1(pix2, thresh);
555
556
        /* Work at about 120 ppi to find the word bounding boxes. */
557
699
    pix3a = pixScaleToResolution(pix3, 120.0, 300.0, &scalefact);
558
559
        /* First find the words, removing the very small things like
560
         * dots over the 'i' that weren't included in word boxes. */
561
699
    pixGetWordBoxesInTextlines(pix3a, 1, 4, 150, 40, &boxa1a, NULL);
562
699
    boxa1 = boxaTransform(boxa1a, 0, 0, 1.0 / scalefact, 1.0 / scalefact);
563
699
    if (debugdir) {
564
0
        loc = 0;
565
0
        subdir = stringReplaceSubstr(debugdir, "/tmp/", "", &loc, NULL);
566
0
        lept_mkdir(subdir);
567
0
        LEPT_FREE(subdir);
568
0
        pix4 = pixConvertTo32(pix2);
569
0
        pixRenderBoxaArb(pix4, boxa1, 2, 255, 0, 0);
570
0
        debugfile = stringJoin(debugdir, "/words.png");
571
0
        pixWrite(debugfile, pix4, IFF_PNG);
572
0
        pixDestroy(&pix4);
573
0
        LEPT_FREE(debugfile);
574
0
    }
575
576
        /* Now find the letters at 300 ppi */
577
699
    nb = boxaGetCount(boxa1);
578
699
    boxaw = boxaCreate(nb);
579
699
    boxaac = boxaaCreate(nb);
580
699
    *pboxaw = boxaw;
581
699
    *pboxaac = boxaac;
582
61.7k
    for (i = 0; i < nb; i++) {
583
61.0k
        box1 = boxaGetBox(boxa1, i, L_COPY);
584
61.0k
        boxGetGeometry(box1, &xb, &yb, NULL, NULL);
585
61.0k
        pix4 = pixClipRectangle(pix3, box1, NULL);
586
            /* Join detached parts of characters vertically */
587
61.0k
        pix5 = pixMorphSequence(pix4, "c1.10", 0);
588
            /* The connected components should mostly be characters */
589
61.0k
        boxa2 = pixConnCompBB(pix5, 4);
590
            /* Remove very small pieces */
591
61.0k
        boxa3 = boxaSelectBySize(boxa2, 2, 5, L_SELECT_IF_BOTH,
592
61.0k
                                 L_SELECT_IF_GTE, NULL);
593
            /* Order left to right */
594
61.0k
        boxa4 = boxaSort(boxa3, L_SORT_BY_X, L_SORT_INCREASING, NULL);
595
            /* Express locations with reference to the full input image */
596
61.0k
        boxa5 = boxaTransform(boxa4, xs + xb, ys + yb, 1.0, 1.0);
597
61.0k
        box2 = boxTransform(box1, xs, ys, 1.0, 1.0);
598
599
            /* Ignore any boxa with no boxes after size filtering */
600
61.0k
        if (boxaGetCount(boxa5) > 0) {
601
15.3k
            boxaAddBox(boxaw, box2, L_INSERT);
602
15.3k
            boxaaAddBoxa(boxaac, boxa5, L_INSERT);
603
45.7k
        } else {
604
45.7k
            boxDestroy(&box2);
605
45.7k
            boxaDestroy(&boxa5);
606
45.7k
        }
607
61.0k
        boxDestroy(&box1);
608
61.0k
        pixDestroy(&pix4);
609
61.0k
        pixDestroy(&pix5);
610
61.0k
        boxaDestroy(&boxa2);
611
61.0k
        boxaDestroy(&boxa3);
612
61.0k
        boxaDestroy(&boxa4);
613
61.0k
    }
614
699
    pixDestroy(&pix1);
615
699
    pixDestroy(&pix2);
616
699
    pixDestroy(&pix3);
617
699
    pixDestroy(&pix3a);
618
699
    boxaDestroy(&boxa1);
619
699
    boxaDestroy(&boxa1a);
620
699
    if (debugdir) {
621
0
        pix4 = pixConvertTo32(pixs);
622
0
        boxa2 = boxaaFlattenToBoxa(boxaac, NULL, L_COPY);
623
0
        pixRenderBoxaArb(pix4, boxa2, 2, 255, 0, 0);
624
0
        boxa3 = boxaAdjustSides(boxaw, -2, 2, -2, 2);
625
0
        pixRenderBoxaArb(pix4, boxa3, 2, 0, 255, 0);
626
0
        debugfile = stringJoin(debugdir, "/chars.png");
627
0
        pixWrite(debugfile, pix4, IFF_PNG);
628
0
        pixDestroy(&pix4);
629
0
        boxaDestroy(&boxa2);
630
0
        boxaDestroy(&boxa3);
631
0
        LEPT_FREE(debugfile);
632
0
    }
633
699
    return 0;
634
776
}
635
636
637
/*------------------------------------------------------------------*
638
 *           Use word bounding boxes to compare page images         *
639
 *------------------------------------------------------------------*/
640
/*!
641
 * \brief   boxaExtractSortedPattern()
642
 *
643
 * \param[in]    boxa    typ. of word bounding boxes, in textline order
644
 * \param[in]    na      index of textline for each box in boxa
645
 * \return  naa NUMAA, where each numa represents one textline,
646
 *                   or NULL on error
647
 *
648
 * <pre>
649
 * Notes:
650
 *      (1) The input is expected to come from pixGetWordBoxesInTextlines().
651
 *      (2) Each numa in the output consists of an average y coordinate
652
 *          of the first box in the textline, followed by pairs of
653
 *          x coordinates representing the left and right edges of each
654
 *          of the boxes in the textline.
655
 * </pre>
656
 */
657
NUMAA *
658
boxaExtractSortedPattern(BOXA  *boxa,
659
                         NUMA  *na)
660
0
{
661
0
l_int32  index, nbox, row, prevrow, x, y, w, h;
662
0
BOX     *box;
663
0
NUMA    *nad = NULL;
664
0
NUMAA   *naa;
665
666
0
    if (!boxa)
667
0
        return (NUMAA *)ERROR_PTR("boxa not defined", __func__, NULL);
668
0
    if (!na)
669
0
        return (NUMAA *)ERROR_PTR("na not defined", __func__, NULL);
670
671
0
    naa = numaaCreate(0);
672
0
    nbox = boxaGetCount(boxa);
673
0
    if (nbox == 0)
674
0
        return naa;
675
676
0
    prevrow = -1;
677
0
    for (index = 0; index < nbox; index++) {
678
0
        box = boxaGetBox(boxa, index, L_CLONE);
679
0
        numaGetIValue(na, index, &row);
680
0
        if (row > prevrow) {
681
0
            if (index > 0)
682
0
                numaaAddNuma(naa, nad, L_INSERT);
683
0
            nad = numaCreate(0);
684
0
            prevrow = row;
685
0
            boxGetGeometry(box, NULL, &y, NULL, &h);
686
0
            numaAddNumber(nad, y + h / 2);
687
0
        }
688
0
        boxGetGeometry(box, &x, NULL, &w, NULL);
689
0
        numaAddNumber(nad, x);
690
0
        numaAddNumber(nad, x + w - 1);
691
0
        boxDestroy(&box);
692
0
    }
693
0
    numaaAddNuma(naa, nad, L_INSERT);
694
695
0
    return naa;
696
0
}
697
698
699
/*!
700
 * \brief   numaaCompareImagesByBoxes()
701
 *
702
 * \param[in]    naa1       for image 1, formatted by boxaExtractSortedPattern()
703
 * \param[in]    naa2       for image 2, formatted by boxaExtractSortedPattern()
704
 * \param[in]    nperline   number of box regions to be used in each textline
705
 * \param[in]    nreq       number of complete row matches required
706
 * \param[in]    maxshiftx  max allowed x shift between two patterns, in pixels
707
 * \param[in]    maxshifty  max allowed y shift between two patterns, in pixels
708
 * \param[in]    delx       max allowed difference in x data, after alignment
709
 * \param[in]    dely       max allowed difference in y data, after alignment
710
 * \param[out]   psame      1 if %nreq row matches are found; 0 otherwise
711
 * \param[in]    debugflag  1 for debug output
712
 * \return  0 if OK, 1 on error
713
 *
714
 * <pre>
715
 * Notes:
716
 *      (1) Each input numaa describes a set of sorted bounding boxes
717
 *          (sorted by textline and, within each textline, from
718
 *          left to right) in the images from which they are derived.
719
 *          See boxaExtractSortedPattern() for a description of the data
720
 *          format in each of the input numaa.
721
 *      (2) This function does an alignment between the input
722
 *          descriptions of bounding boxes for two images. The
723
 *          input parameter %nperline specifies the number of boxes
724
 *          to consider in each line when testing for a match, and
725
 *          %nreq is the required number of lines that must be well-aligned
726
 *          to get a match.
727
 *      (3) Testing by alignment has 3 steps:
728
 *          (a) Generating the location of word bounding boxes from the
729
 *              images (prior to calling this function).
730
 *          (b) Listing all possible pairs of aligned rows, based on
731
 *              tolerances in horizontal and vertical positions of
732
 *              the boxes.  Specifically, all pairs of rows are enumerated
733
 *              whose first %nperline boxes can be brought into close
734
 *              alignment, based on the delx parameter for boxes in the
735
 *              line and within the overall the %maxshiftx and %maxshifty
736
 *              constraints.
737
 *          (c) Each pair, starting with the first, is used to search
738
 *              for a set of %nreq - 1 other pairs that can all be aligned
739
 *              with a difference in global translation of not more
740
 *              than (%delx, %dely).
741
 * </pre>
742
 */
743
l_ok
744
numaaCompareImagesByBoxes(NUMAA    *naa1,
745
                          NUMAA    *naa2,
746
                          l_int32   nperline,
747
                          l_int32   nreq,
748
                          l_int32   maxshiftx,
749
                          l_int32   maxshifty,
750
                          l_int32   delx,
751
                          l_int32   dely,
752
                          l_int32  *psame,
753
                          l_int32   debugflag)
754
0
{
755
0
l_int32   n1, n2, i, j, nbox, y1, y2, xl1, xl2;
756
0
l_int32   shiftx, shifty, match;
757
0
l_int32  *line1, *line2;  /* indicator for sufficient boxes in a line */
758
0
l_int32  *yloc1, *yloc2;  /* arrays of y value for first box in a line */
759
0
l_int32  *xleft1, *xleft2;  /* arrays of x value for left side of first box */
760
0
NUMA     *na1, *na2, *nai1, *nai2, *nasx, *nasy;
761
762
0
    if (!psame)
763
0
        return ERROR_INT("&same not defined", __func__, 1);
764
0
    *psame = 0;
765
0
    if (!naa1)
766
0
        return ERROR_INT("naa1 not defined", __func__, 1);
767
0
    if (!naa2)
768
0
        return ERROR_INT("naa2 not defined", __func__, 1);
769
0
    if (nperline < 1)
770
0
        return ERROR_INT("nperline < 1", __func__, 1);
771
0
    if (nreq < 1)
772
0
        return ERROR_INT("nreq < 1", __func__, 1);
773
774
0
    n1 = numaaGetCount(naa1);
775
0
    n2 = numaaGetCount(naa2);
776
0
    if (n1 < nreq || n2 < nreq)
777
0
        return 0;
778
779
        /* Find the lines in naa1 and naa2 with sufficient boxes.
780
         * Also, find the y-values for each of the lines, and the
781
         * LH x-values of the first box in each line. */
782
0
    line1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32));
783
0
    line2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32));
784
0
    yloc1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32));
785
0
    yloc2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32));
786
0
    xleft1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32));
787
0
    xleft2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32));
788
0
    if (!line1 || !line2 || !yloc1 || !yloc2 || !xleft1 || !xleft2) {
789
0
        LEPT_FREE(line1);
790
0
        LEPT_FREE(line2);
791
0
        LEPT_FREE(yloc1);
792
0
        LEPT_FREE(yloc2);
793
0
        LEPT_FREE(xleft1);
794
0
        LEPT_FREE(xleft2);
795
0
        return ERROR_INT("calloc failure for an array", __func__, 1);
796
0
    }
797
0
    for (i = 0; i < n1; i++) {
798
0
        na1 = numaaGetNuma(naa1, i, L_CLONE);
799
0
        numaGetIValue(na1, 0, yloc1 + i);
800
0
        numaGetIValue(na1, 1, xleft1 + i);
801
0
        nbox = (numaGetCount(na1) - 1) / 2;
802
0
        if (nbox >= nperline)
803
0
            line1[i] = 1;
804
0
        numaDestroy(&na1);
805
0
    }
806
0
    for (i = 0; i < n2; i++) {
807
0
        na2 = numaaGetNuma(naa2, i, L_CLONE);
808
0
        numaGetIValue(na2, 0, yloc2 + i);
809
0
        numaGetIValue(na2, 1, xleft2 + i);
810
0
        nbox = (numaGetCount(na2) - 1) / 2;
811
0
        if (nbox >= nperline)
812
0
            line2[i] = 1;
813
0
        numaDestroy(&na2);
814
0
    }
815
816
        /* Enumerate all possible line matches.  A 'possible' line
817
         * match is one where the x and y shifts for the first box
818
         * in each line are within the maxshiftx and maxshifty
819
         * constraints, and the left and right sides of the remaining
820
         * (nperline - 1) successive boxes are within delx of each other.
821
         * The result is a set of four numas giving parameters of
822
         * each set of matching lines. */
823
0
    nai1 = numaCreate(0);  /* line index 1 of match */
824
0
    nai2 = numaCreate(0);  /* line index 2 of match */
825
0
    nasx = numaCreate(0);  /* shiftx for match */
826
0
    nasy = numaCreate(0);  /* shifty for match */
827
0
    for (i = 0; i < n1; i++) {
828
0
        if (line1[i] == 0) continue;
829
0
        y1 = yloc1[i];
830
0
        xl1 = xleft1[i];
831
0
        na1 = numaaGetNuma(naa1, i, L_CLONE);
832
0
        for (j = 0; j < n2; j++) {
833
0
            if (line2[j] == 0) continue;
834
0
            y2 = yloc2[j];
835
0
            if (L_ABS(y1 - y2) > maxshifty) continue;
836
0
            xl2 = xleft2[j];
837
0
            if (L_ABS(xl1 - xl2) > maxshiftx) continue;
838
0
            shiftx = xl1 - xl2;  /* shift to add to x2 values */
839
0
            shifty = y1 - y2;  /* shift to add to y2 values */
840
0
            na2 = numaaGetNuma(naa2, j, L_CLONE);
841
842
                /* Now check if 'nperline' boxes in the two lines match */
843
0
            match = testLineAlignmentX(na1, na2, shiftx, delx, nperline);
844
0
            if (match) {
845
0
                numaAddNumber(nai1, i);
846
0
                numaAddNumber(nai2, j);
847
0
                numaAddNumber(nasx, shiftx);
848
0
                numaAddNumber(nasy, shifty);
849
0
            }
850
0
            numaDestroy(&na2);
851
0
        }
852
0
        numaDestroy(&na1);
853
0
    }
854
855
        /* Determine if there are a sufficient number of mutually
856
         * aligned matches.  Mutually aligned matches place an additional
857
         * constraint on the 'possible' matches, where the relative
858
         * shifts must not exceed the (delx, dely) distances. */
859
0
    countAlignedMatches(nai1, nai2, nasx, nasy, n1, n2, delx, dely,
860
0
                        nreq, psame, debugflag);
861
862
0
    LEPT_FREE(line1);
863
0
    LEPT_FREE(line2);
864
0
    LEPT_FREE(yloc1);
865
0
    LEPT_FREE(yloc2);
866
0
    LEPT_FREE(xleft1);
867
0
    LEPT_FREE(xleft2);
868
0
    numaDestroy(&nai1);
869
0
    numaDestroy(&nai2);
870
0
    numaDestroy(&nasx);
871
0
    numaDestroy(&nasy);
872
0
    return 0;
873
0
}
874
875
876
static l_int32
877
testLineAlignmentX(NUMA    *na1,
878
                   NUMA    *na2,
879
                   l_int32  shiftx,
880
                   l_int32  delx,
881
                   l_int32  nperline)
882
0
{
883
0
l_int32  i, xl1, xr1, xl2, xr2, diffl, diffr;
884
885
0
    if (!na1)
886
0
        return ERROR_INT("na1 not defined", __func__, 1);
887
0
    if (!na2)
888
0
        return ERROR_INT("na2 not defined", __func__, 1);
889
890
0
    for (i = 0; i < nperline; i++) {
891
0
        numaGetIValue(na1, i + 1, &xl1);
892
0
        numaGetIValue(na1, i + 2, &xr1);
893
0
        numaGetIValue(na2, i + 1, &xl2);
894
0
        numaGetIValue(na2, i + 2, &xr2);
895
0
        diffl = L_ABS(xl1 - xl2 - shiftx);
896
0
        diffr = L_ABS(xr1 - xr2 - shiftx);
897
0
        if (diffl > delx || diffr > delx)
898
0
            return 0;
899
0
    }
900
901
0
    return 1;
902
0
}
903
904
905
/*
906
 * \brief   countAlignedMatches()
907
 *
908
 * \param[in]    nai1, nai2   numas of row pairs for matches
909
 * \param[in]    nasx, nasy   numas of x and y shifts for the matches
910
 * \param[in]    n1, n2       number of rows in images 1 and 2
911
 * \param[in]    delx, dely   allowed difference in shifts of the match,
912
 *                            compared to the reference match
913
 * \param[in]    nre1         number of required aligned matches
914
 * \param[out]   psame        return 1 if %nreq row matches are found;
915
 *                            0 otherwise
916
 * \return  0 if OK, 1 on error
917
 *
918
 * <pre>
919
 * Notes:
920
 *      (1) This takes 4 input arrays giving parameters of all the
921
 *          line matches.  It looks for the maximum set of aligned
922
 *          matches (matches with approximately the same overall shifts)
923
 *          that do not use rows from either image more than once.
924
 * </pre>
925
 */
926
static l_ok
927
countAlignedMatches(NUMA     *nai1,
928
                    NUMA     *nai2,
929
                    NUMA     *nasx,
930
                    NUMA     *nasy,
931
                    l_int32   n1,
932
                    l_int32   n2,
933
                    l_int32   delx,
934
                    l_int32   dely,
935
                    l_int32   nreq,
936
                    l_int32  *psame,
937
                    l_int32   debugflag)
938
0
{
939
0
l_int32   i, j, nm, shiftx, shifty, nmatch, diffx, diffy;
940
0
l_int32  *ia1, *ia2, *iasx, *iasy, *index1, *index2;
941
942
0
    if (!nai1 || !nai2 || !nasx || !nasy)
943
0
        return ERROR_INT("4 input numas not defined", __func__, 1);
944
0
    if (!psame)
945
0
        return ERROR_INT("&same not defined", __func__, 1);
946
0
    *psame = 0;
947
948
        /* Check for sufficient aligned matches, doing a double iteration
949
         * over the set of raw matches.  The row index arrays
950
         * are used to verify that the same rows in either image
951
         * are not used in more than one match.  Whenever there
952
         * is a match that is properly aligned, those rows are
953
         * marked in the index arrays.  */
954
0
    nm = numaGetCount(nai1);  /* number of matches */
955
0
    if (nm < nreq)
956
0
        return 0;
957
958
0
    ia1 = numaGetIArray(nai1);
959
0
    ia2 = numaGetIArray(nai2);
960
0
    iasx = numaGetIArray(nasx);
961
0
    iasy = numaGetIArray(nasy);
962
0
    index1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32));  /* watch rows */
963
0
    index2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32));
964
0
    if (!index1 || !index2)
965
0
        return ERROR_INT("calloc fail for array", __func__, 1);
966
0
    for (i = 0; i < nm; i++) {
967
0
        if (*psame == 1)
968
0
            break;
969
970
            /* Reset row index arrays */
971
0
        memset(index1, 0, 4 * n1);
972
0
        memset(index2, 0, 4 * n2);
973
0
        nmatch = 1;
974
0
        index1[ia1[i]] = nmatch;  /* mark these rows as taken */
975
0
        index2[ia2[i]] = nmatch;
976
0
        shiftx = iasx[i];  /* reference shift between two rows */
977
0
        shifty = iasy[i];  /* ditto */
978
0
        if (nreq == 1) {
979
0
            *psame = 1;
980
0
            break;
981
0
        }
982
0
        for (j = 0; j < nm; j++) {
983
0
            if (j == i) continue;
984
                /* Rows must both be different from any previously seen */
985
0
            if (index1[ia1[j]] > 0 || index2[ia2[j]] > 0) continue;
986
                /* Check the shift for this match */
987
0
            diffx = L_ABS(shiftx - iasx[j]);
988
0
            diffy = L_ABS(shifty - iasy[j]);
989
0
            if (diffx > delx || diffy > dely) continue;
990
                /* We have a match */
991
0
            nmatch++;
992
0
            index1[ia1[j]] = nmatch;  /* mark the rows */
993
0
            index2[ia2[j]] = nmatch;
994
0
            if (nmatch >= nreq) {
995
0
                *psame = 1;
996
0
                if (debugflag)
997
0
                    printRowIndices(index1, n1, index2, n2);
998
0
                break;
999
0
            }
1000
0
        }
1001
0
    }
1002
1003
0
    LEPT_FREE(ia1);
1004
0
    LEPT_FREE(ia2);
1005
0
    LEPT_FREE(iasx);
1006
0
    LEPT_FREE(iasy);
1007
0
    LEPT_FREE(index1);
1008
0
    LEPT_FREE(index2);
1009
0
    return 0;
1010
0
}
1011
1012
1013
static void
1014
printRowIndices(l_int32  *index1,
1015
                l_int32   n1,
1016
                l_int32  *index2,
1017
                l_int32   n2)
1018
0
{
1019
0
l_int32  i;
1020
1021
0
    lept_stderr("Index1: ");
1022
0
    for (i = 0; i < n1; i++) {
1023
0
        if (i && (i % 20 == 0))
1024
0
            lept_stderr("\n        ");
1025
0
        lept_stderr("%3d", index1[i]);
1026
0
    }
1027
0
    lept_stderr("\n");
1028
1029
0
    lept_stderr("Index2: ");
1030
0
    for (i = 0; i < n2; i++) {
1031
0
        if (i && (i % 20 == 0))
1032
0
            lept_stderr("\n        ");
1033
0
        lept_stderr("%3d", index2[i]);
1034
0
    }
1035
0
    lept_stderr("\n");
1036
0
    return;
1037
0
}