Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccmain/paragraphs.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        paragraphs.cpp
3
 * Description: Paragraph detection for tesseract.
4
 * Author:      David Eger
5
 *
6
 * (C) Copyright 2011, Google Inc.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#include "paragraphs.h"
20
21
#include "helpers.h"             // for UpdateRange, ClipToRange
22
#include "host.h"                // for NearlyEqual
23
#include "mutableiterator.h"     // for MutableIterator
24
#include "ocrblock.h"            // for BLOCK
25
#include "ocrpara.h"             // for ParagraphModel, PARA, PARA_IT, PARA...
26
#include "ocrrow.h"              // for ROW
27
#include "pageres.h"             // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
28
#include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
29
#include "pdblock.h"             // for PDBLK
30
#include "polyblk.h"             // for POLY_BLOCK
31
#include "ratngs.h"              // for WERD_CHOICE
32
#include "rect.h"                // for TBOX
33
#include "statistc.h"            // for STATS
34
#include "tesserrstream.h"       // for tesserr
35
#include "tprintf.h"             // for tprintf
36
#include "unicharset.h"          // for UNICHARSET
37
#include "werd.h"                // for WERD, W_REP_CHAR
38
39
#include <tesseract/pageiterator.h> // for PageIterator
40
#include <tesseract/publictypes.h>  // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
41
#include <tesseract/unichar.h>      // for UNICHAR, UNICHAR_ID
42
43
#include <algorithm> // for max
44
#include <cctype>    // for isspace
45
#include <cmath>     // for abs
46
#include <cstdio>    // for snprintf
47
#include <cstdlib>   // for abs
48
#include <cstring>   // for strchr, strlen
49
#include <memory>    // for unique_ptr
50
51
static const char *const kRLE = "\u202A"; // Right-to-Left Embedding
52
static const char *const kPDF = "\u202C"; // Pop Directional Formatting
53
54
namespace tesseract {
55
56
// Special "weak" ParagraphModels.
57
const ParagraphModel *kCrownLeft =
58
    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
59
const ParagraphModel *kCrownRight =
60
    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
61
62
// Do the text and geometry of two rows support a paragraph break between them?
63
static bool LikelyParagraphStart(const RowScratchRegisters &before,
64
                                 const RowScratchRegisters &after,
65
                                 tesseract::ParagraphJustification j);
66
67
// Given the width of a typical space between words, what is the threshold
68
// by which by which we think left and right alignments for paragraphs
69
// can vary and still be aligned.
70
21.0k
static int Epsilon(int space_pix) {
71
21.0k
  return space_pix * 4 / 5;
72
21.0k
}
73
74
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
75
                              const std::vector<RowScratchRegisters> *rows, int row_start,
76
123k
                              int row_end) {
77
123k
  if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
78
0
    tesserr << "Invalid arguments rows[" << row_start << ", " << row_end
79
0
            << ") while rows is of size " << rows->size() << ".\n";
80
0
    return false;
81
0
  }
82
123k
  if (row_end - row_start < min_num_rows) {
83
6.06k
    if (debug_level > 1) {
84
0
      tprintf("# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name);
85
0
    }
86
6.06k
    return false;
87
6.06k
  }
88
117k
  return true;
89
123k
}
90
91
// =============================== Debug Code ================================
92
93
// Given a row-major matrix of unicode text and a column separator, print
94
// a formatted table.  For ASCII, we get good column alignment.
95
0
static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
96
0
  std::vector<int> max_col_widths;
97
0
  for (const auto &row : rows) {
98
0
    auto num_columns = row.size();
99
0
    for (size_t c = 0; c < num_columns; c++) {
100
0
      int num_unicodes = 0;
101
0
      for (char i : row[c]) {
102
0
        if ((i & 0xC0) != 0x80) {
103
0
          num_unicodes++;
104
0
        }
105
0
      }
106
0
      if (c >= max_col_widths.size()) {
107
0
        max_col_widths.push_back(num_unicodes);
108
0
      } else {
109
0
        if (num_unicodes > max_col_widths[c]) {
110
0
          max_col_widths[c] = num_unicodes;
111
0
        }
112
0
      }
113
0
    }
114
0
  }
115
116
0
  std::vector<std::string> col_width_patterns;
117
0
  col_width_patterns.reserve(max_col_widths.size());
118
0
  for (int max_col_width : max_col_widths) {
119
0
    col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_width) + "s");
120
0
  }
121
122
0
  for (const auto &row : rows) {
123
0
    for (unsigned c = 0; c < row.size(); c++) {
124
0
      if (c > 0) {
125
0
        tprintf("%s", colsep);
126
0
      }
127
0
      tprintf(col_width_patterns[c].c_str(), row[c].c_str());
128
0
    }
129
0
    tprintf("\n");
130
0
  }
131
0
}
132
133
0
static std::string RtlEmbed(const std::string &word, bool rtlify) {
134
0
  if (rtlify) {
135
0
    return std::string(kRLE) + word + std::string(kPDF);
136
0
  }
137
0
  return word;
138
0
}
139
140
// Print the current thoughts of the paragraph detector.
141
static void PrintDetectorState(const ParagraphTheory &theory,
142
0
                               const std::vector<RowScratchRegisters> &rows) {
143
0
  std::vector<std::vector<std::string>> output;
144
0
  output.emplace_back();
145
0
  output.back().push_back("#row");
146
0
  output.back().push_back("space");
147
0
  output.back().push_back("..");
148
0
  output.back().push_back("lword[widthSEL]");
149
0
  output.back().push_back("rword[widthSEL]");
150
0
  RowScratchRegisters::AppendDebugHeaderFields(output.back());
151
0
  output.back().push_back("text");
152
153
0
  for (unsigned i = 0; i < rows.size(); i++) {
154
0
    output.emplace_back();
155
0
    std::vector<std::string> &row = output.back();
156
0
    const RowInfo &ri = *rows[i].ri_;
157
0
    row.push_back(std::to_string(i));
158
0
    row.push_back(std::to_string(ri.average_interword_space));
159
0
    row.emplace_back(ri.has_leaders ? ".." : " ");
160
0
    row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +
161
0
                  (ri.lword_likely_starts_idea ? "S" : "s") +
162
0
                  (ri.lword_likely_ends_idea ? "E" : "e") +
163
0
                  (ri.lword_indicates_list_item ? "L" : "l") + "]");
164
0
    row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +
165
0
                  (ri.rword_likely_starts_idea ? "S" : "s") +
166
0
                  (ri.rword_likely_ends_idea ? "E" : "e") +
167
0
                  (ri.rword_indicates_list_item ? "L" : "l") + "]");
168
0
    rows[i].AppendDebugInfo(theory, row);
169
0
    row.push_back(RtlEmbed(ri.text, !ri.ltr));
170
0
  }
171
0
  PrintTable(output, " ");
172
173
0
  tprintf("Active Paragraph Models:\n");
174
0
  unsigned m = 0;
175
0
  for (const auto &model : theory.models()) {
176
0
    tprintf(" %d: %s\n", ++m, model->ToString().c_str());
177
0
  }
178
0
}
179
180
static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
181
94.7k
                      const std::vector<RowScratchRegisters> &rows) {
182
94.7k
  if (!should_print) {
183
94.7k
    return;
184
94.7k
  }
185
0
  tprintf("# %s\n", phase);
186
0
  PrintDetectorState(theory, rows);
187
0
}
188
189
// Print out the text for rows[row_start, row_end)
190
static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,
191
0
                          int row_end) {
192
0
  tprintf("======================================\n");
193
0
  for (int row = row_start; row < row_end; row++) {
194
0
    tprintf("%s\n", rows[row].ri_->text.c_str());
195
0
  }
196
0
  tprintf("======================================\n");
197
0
}
198
199
// ============= Brain Dead Language Model (ASCII Version) ===================
200
201
0
static bool IsLatinLetter(int ch) {
202
0
  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
203
0
}
204
205
99.6k
static bool IsDigitLike(int ch) {
206
99.6k
  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
207
99.6k
}
208
209
0
static bool IsOpeningPunct(int ch) {
210
0
  return strchr("'\"({[", ch) != nullptr;
211
0
}
212
213
0
static bool IsTerminalPunct(int ch) {
214
0
  return strchr(":'\".?!]})", ch) != nullptr;
215
0
}
216
217
// Return a pointer after consuming as much text as qualifies as roman numeral.
218
0
static const char *SkipChars(const char *str, const char *toskip) {
219
0
  while (*str != '\0' && strchr(toskip, *str)) {
220
0
    str++;
221
0
  }
222
0
  return str;
223
0
}
224
225
0
static const char *SkipChars(const char *str, bool (*skip)(int)) {
226
0
  while (*str != '\0' && skip(*str)) {
227
0
    str++;
228
0
  }
229
0
  return str;
230
0
}
231
232
0
static const char *SkipOne(const char *str, const char *toskip) {
233
0
  if (*str != '\0' && strchr(toskip, *str)) {
234
0
    return str + 1;
235
0
  }
236
0
  return str;
237
0
}
238
239
// Return whether it is very likely that this is a numeral marker that could
240
// start a list item.  Some examples include:
241
//   A   I   iii.   VI   (2)   3.5.   [C-4]
242
0
static bool LikelyListNumeral(const std::string &word) {
243
0
  const char *kRomans = "ivxlmdIVXLMD";
244
0
  const char *kDigits = "012345789";
245
0
  const char *kOpen = "[{(";
246
0
  const char *kSep = ":;-.,";
247
0
  const char *kClose = "]})";
248
249
0
  int num_segments = 0;
250
0
  const char *pos = word.c_str();
251
0
  while (*pos != '\0' && num_segments < 3) {
252
    // skip up to two open parens.
253
0
    const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
254
0
    const char *numeral_end = SkipChars(numeral_start, kRomans);
255
0
    if (numeral_end != numeral_start) {
256
      // Got Roman Numeral. Great.
257
0
    } else {
258
0
      numeral_end = SkipChars(numeral_start, kDigits);
259
0
      if (numeral_end == numeral_start) {
260
        // If there's a single latin letter, we can use that.
261
0
        numeral_end = SkipChars(numeral_start, IsLatinLetter);
262
0
        if (numeral_end - numeral_start != 1) {
263
0
          break;
264
0
        }
265
0
      }
266
0
    }
267
    // We got some sort of numeral.
268
0
    num_segments++;
269
    // Skip any trailing parens or punctuation.
270
0
    pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
271
0
    if (pos == numeral_end) {
272
0
      break;
273
0
    }
274
0
  }
275
0
  return *pos == '\0';
276
0
}
277
278
209k
static bool LikelyListMark(const std::string &word) {
279
209k
  const char *kListMarks = "0Oo*.,+.";
280
209k
  return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
281
209k
}
282
283
0
bool AsciiLikelyListItem(const std::string &word) {
284
0
  return LikelyListMark(word) || LikelyListNumeral(word);
285
0
}
286
287
// ========== Brain Dead Language Model (Tesseract Version) ================
288
289
// Return the first Unicode Codepoint from werd[pos].
290
651k
static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
291
651k
  if (!u || !werd || pos > werd->length()) {
292
0
    return 0;
293
0
  }
294
651k
  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
295
651k
}
296
297
// A useful helper class for finding the first j >= i so that word[j]
298
// does not have given character type.
299
class UnicodeSpanSkipper {
300
public:
301
  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
302
274k
      : u_(unicharset), word_(word), wordlen_(word->length()) {
303
274k
  }
304
305
  // Given an input position, return the first position >= pos not punc.
306
  unsigned SkipPunc(unsigned pos);
307
  // Given an input position, return the first position >= pos not digit.
308
  unsigned SkipDigits(unsigned pos);
309
  // Given an input position, return the first position >= pos not roman.
310
  unsigned SkipRomans(unsigned pos);
311
  // Given an input position, return the first position >= pos not alpha.
312
  unsigned SkipAlpha(unsigned pos);
313
314
private:
315
  const UNICHARSET *u_;
316
  const WERD_CHOICE *word_;
317
  unsigned wordlen_;
318
};
319
320
459k
unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
321
572k
  while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
322
112k
    pos++;
323
112k
  }
324
459k
  return pos;
325
459k
}
326
327
163k
unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
328
193k
  while (pos < wordlen_ &&
329
193k
         (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
330
29.6k
    pos++;
331
29.6k
  }
332
163k
  return pos;
333
163k
}
334
335
278k
unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
336
278k
  const char *kRomans = "ivxlmdIVXLMD";
337
472k
  while (pos < wordlen_) {
338
336k
    int ch = UnicodeFor(u_, word_, pos);
339
336k
    if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) {
340
142k
      break;
341
142k
    }
342
193k
    pos++;
343
193k
  }
344
278k
  return pos;
345
278k
}
346
347
140k
unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
348
240k
  while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
349
99.6k
    pos++;
350
99.6k
  }
351
140k
  return pos;
352
140k
}
353
354
214k
static bool LikelyListMarkUnicode(int ch) {
355
214k
  if (ch < 0x80) {
356
209k
    std::string single_ch;
357
209k
    single_ch += ch;
358
209k
    return LikelyListMark(single_ch);
359
209k
  }
360
5.82k
  switch (ch) {
361
    // TODO(eger) expand this list of unicodes as needed.
362
6
    case 0x00B0: // degree sign
363
6
    case 0x2022: // bullet
364
6
    case 0x25E6: // white bullet
365
6
    case 0x00B7: // middle dot
366
6
    case 0x25A1: // white square
367
6
    case 0x25A0: // black square
368
6
    case 0x25AA: // black small square
369
6
    case 0x2B1D: // black very small square
370
6
    case 0x25BA: // black right-pointing pointer
371
6
    case 0x25CF: // black circle
372
6
    case 0x25CB: // white circle
373
6
      return true;
374
5.82k
    default:
375
5.82k
      break; // fall through
376
5.82k
  }
377
5.82k
  return false;
378
5.82k
}
379
380
// Return whether it is very likely that this is a numeral marker that could
381
// start a list item.  Some examples include:
382
//   A   I   iii.   VI   (2)   3.5.   [C-4]
383
285k
static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
384
285k
  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
385
10.4k
    return true;
386
10.4k
  }
387
388
274k
  UnicodeSpanSkipper m(u, werd);
389
274k
  int num_segments = 0;
390
274k
  unsigned pos = 0;
391
294k
  while (pos < werd->length() && num_segments < 3) {
392
285k
    auto numeral_start = m.SkipPunc(pos);
393
285k
    if (numeral_start > pos + 1) {
394
6.90k
      break;
395
6.90k
    }
396
278k
    auto numeral_end = m.SkipRomans(numeral_start);
397
278k
    if (numeral_end == numeral_start) {
398
163k
      numeral_end = m.SkipDigits(numeral_start);
399
163k
      if (numeral_end == numeral_start) {
400
        // If there's a single latin letter, we can use that.
401
140k
        numeral_end = m.SkipAlpha(numeral_start);
402
140k
        if (numeral_end - numeral_start != 1) {
403
104k
          break;
404
104k
        }
405
140k
      }
406
163k
    }
407
    // We got some sort of numeral.
408
174k
    num_segments++;
409
    // Skip any trailing punctuation.
410
174k
    pos = m.SkipPunc(numeral_end);
411
174k
    if (pos == numeral_end) {
412
154k
      break;
413
154k
    }
414
174k
  }
415
274k
  return pos == werd->length();
416
285k
}
417
418
template<class T>
419
873k
void push_back_new(std::vector<T> &vector, const T &data) {
420
873k
  if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
421
854k
    vector.push_back(data);
422
854k
  }
423
873k
}
void tesseract::push_back_new<tesseract::LineHypothesis>(std::__1::vector<tesseract::LineHypothesis, std::__1::allocator<tesseract::LineHypothesis> >&, tesseract::LineHypothesis const&)
Line
Count
Source
419
125k
void push_back_new(std::vector<T> &vector, const T &data) {
420
125k
  if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
421
123k
    vector.push_back(data);
422
123k
  }
423
125k
}
void tesseract::push_back_new<tesseract::ParagraphModel const*>(std::__1::vector<tesseract::ParagraphModel const*, std::__1::allocator<tesseract::ParagraphModel const*> >&, tesseract::ParagraphModel const* const&)
Line
Count
Source
419
744k
void push_back_new(std::vector<T> &vector, const T &data) {
420
744k
  if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
421
727k
    vector.push_back(data);
422
727k
  }
423
744k
}
void tesseract::push_back_new<tesseract::ParagraphModel*>(std::__1::vector<tesseract::ParagraphModel*, std::__1::allocator<tesseract::ParagraphModel*> >&, tesseract::ParagraphModel* const&)
Line
Count
Source
419
3.28k
void push_back_new(std::vector<T> &vector, const T &data) {
420
3.28k
  if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
421
3.28k
    vector.push_back(data);
422
3.28k
  }
423
3.28k
}
424
425
// ========= Brain Dead Language Model (combined entry points) ================
426
427
// Given the leftmost word of a line either as a Tesseract unicharset + werd
428
// or a utf8 string, set the following attributes for it:
429
//   is_list -      this word might be a list number or bullet.
430
//   starts_idea -  this word is likely to start a sentence.
431
//   ends_idea -    this word is likely to end a sentence.
432
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
433
142k
                        bool *is_list, bool *starts_idea, bool *ends_idea) {
434
142k
  *is_list = false;
435
142k
  *starts_idea = false;
436
142k
  *ends_idea = false;
437
142k
  if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
438
0
    *ends_idea = true;
439
0
    return;
440
0
  }
441
442
142k
  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
443
142k
    if (UniLikelyListItem(unicharset, werd)) {
444
79.9k
      *is_list = true;
445
79.9k
      *starts_idea = true;
446
79.9k
      *ends_idea = true;
447
79.9k
    }
448
142k
    if (unicharset->get_isupper(werd->unichar_id(0))) {
449
24.9k
      *starts_idea = true;
450
24.9k
    }
451
142k
    if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
452
32.8k
      *starts_idea = true;
453
32.8k
      *ends_idea = true;
454
32.8k
    }
455
142k
  } else { // Assume utf8 is mostly ASCII
456
0
    if (AsciiLikelyListItem(utf8)) {
457
0
      *is_list = true;
458
0
      *starts_idea = true;
459
0
    }
460
0
    int start_letter = utf8[0];
461
0
    if (IsOpeningPunct(start_letter)) {
462
0
      *starts_idea = true;
463
0
    }
464
0
    if (IsTerminalPunct(start_letter)) {
465
0
      *ends_idea = true;
466
0
    }
467
0
    if (start_letter >= 'A' && start_letter <= 'Z') {
468
0
      *starts_idea = true;
469
0
    }
470
0
  }
471
142k
}
472
473
// Given the rightmost word of a line either as a Tesseract unicharset + werd
474
// or a utf8 string, set the following attributes for it:
475
//   is_list -      this word might be a list number or bullet.
476
//   starts_idea -  this word is likely to start a sentence.
477
//   ends_idea -    this word is likely to end a sentence.
478
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
479
142k
                         bool *is_list, bool *starts_idea, bool *ends_idea) {
480
142k
  *is_list = false;
481
142k
  *starts_idea = false;
482
142k
  *ends_idea = false;
483
142k
  if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
484
0
    *ends_idea = true;
485
0
    return;
486
0
  }
487
488
142k
  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
489
142k
    if (UniLikelyListItem(unicharset, werd)) {
490
80.0k
      *is_list = true;
491
80.0k
      *starts_idea = true;
492
80.0k
    }
493
142k
    UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
494
142k
    if (unicharset->get_ispunctuation(last_letter)) {
495
33.3k
      *ends_idea = true;
496
33.3k
    }
497
142k
  } else { // Assume utf8 is mostly ASCII
498
0
    if (AsciiLikelyListItem(utf8)) {
499
0
      *is_list = true;
500
0
      *starts_idea = true;
501
0
    }
502
0
    int last_letter = utf8[utf8.size() - 1];
503
0
    if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
504
0
      *ends_idea = true;
505
0
    }
506
0
  }
507
142k
}
508
509
// =============== Implementation of RowScratchRegisters =====================
510
/* static */
511
0
void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) {
512
0
  header.emplace_back("[lmarg,lind;rind,rmarg]");
513
0
  header.emplace_back("model");
514
0
}
515
516
void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
517
0
                                          std::vector<std::string> &dbg) const {
518
0
  char s[60];
519
  // The largest (positive and negative) numbers are reported for lindent & rindent.
520
  // While the column header has widths 5,4,4,5, it is therefore opportune to slightly
521
  // offset the widths in the format string here to allow ample space for lindent & rindent
522
  // while keeping the final table output nicely readable: 4,5,5,4.
523
0
  snprintf(s, sizeof(s), "[%4d,%5d;%5d,%4d]", lmargin_, lindent_, rindent_, rmargin_);
524
0
  dbg.emplace_back(s);
525
0
  std::string model_string;
526
0
  model_string += static_cast<char>(GetLineType());
527
0
  model_string += ":";
528
529
0
  int model_numbers = 0;
530
0
  for (const auto &hypothese : hypotheses_) {
531
0
    if (hypothese.model == nullptr) {
532
0
      continue;
533
0
    }
534
0
    if (model_numbers > 0) {
535
0
      model_string += ",";
536
0
    }
537
0
    if (StrongModel(hypothese.model)) {
538
0
      model_string += std::to_string(1 + theory.IndexOf(hypothese.model));
539
0
    } else if (hypothese.model == kCrownLeft) {
540
0
      model_string += "CrL";
541
0
    } else if (hypothese.model == kCrownRight) {
542
0
      model_string += "CrR";
543
0
    }
544
0
    model_numbers++;
545
0
  }
546
0
  if (model_numbers == 0) {
547
0
    model_string += "0";
548
0
  }
549
550
0
  dbg.push_back(model_string);
551
0
}
552
553
142k
void RowScratchRegisters::Init(const RowInfo &row) {
554
142k
  ri_ = &row;
555
142k
  lmargin_ = 0;
556
142k
  lindent_ = row.pix_ldistance;
557
142k
  rmargin_ = 0;
558
142k
  rindent_ = row.pix_rdistance;
559
142k
}
560
561
680k
LineType RowScratchRegisters::GetLineType() const {
562
680k
  if (hypotheses_.empty()) {
563
475k
    return LT_UNKNOWN;
564
475k
  }
565
205k
  bool has_start = false;
566
205k
  bool has_body = false;
567
205k
  for (const auto &hypothese : hypotheses_) {
568
205k
    switch (hypothese.ty) {
569
54.4k
      case LT_START:
570
54.4k
        has_start = true;
571
54.4k
        break;
572
151k
      case LT_BODY:
573
151k
        has_body = true;
574
151k
        break;
575
0
      default:
576
0
        tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
577
0
        break;
578
205k
    }
579
205k
  }
580
205k
  if (has_start && has_body) {
581
271
    return LT_MULTIPLE;
582
271
  }
583
204k
  return has_start ? LT_START : LT_BODY;
584
205k
}
585
586
399k
LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
587
399k
  if (hypotheses_.empty()) {
588
9.52k
    return LT_UNKNOWN;
589
9.52k
  }
590
389k
  bool has_start = false;
591
389k
  bool has_body = false;
592
391k
  for (const auto &hypothese : hypotheses_) {
593
391k
    if (hypothese.model != model) {
594
68.1k
      continue;
595
68.1k
    }
596
322k
    switch (hypothese.ty) {
597
113k
      case LT_START:
598
113k
        has_start = true;
599
113k
        break;
600
209k
      case LT_BODY:
601
209k
        has_body = true;
602
209k
        break;
603
0
      default:
604
0
        tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
605
0
        break;
606
322k
    }
607
322k
  }
608
389k
  if (has_start && has_body) {
609
66
    return LT_MULTIPLE;
610
66
  }
611
389k
  return has_start ? LT_START : LT_BODY;
612
389k
}
613
614
10.6k
void RowScratchRegisters::SetStartLine() {
615
10.6k
  LineType current_lt = GetLineType();
616
10.6k
  if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
617
0
    tprintf("Trying to set a line to be START when it's already BODY.\n");
618
0
  }
619
10.6k
  if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
620
10.6k
    push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));
621
10.6k
  }
622
10.6k
}
623
624
22.4k
void RowScratchRegisters::SetBodyLine() {
625
22.4k
  LineType current_lt = GetLineType();
626
22.4k
  if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
627
0
    tprintf("Trying to set a line to be BODY when it's already START.\n");
628
0
  }
629
22.4k
  if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
630
22.4k
    push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));
631
22.4k
  }
632
22.4k
}
633
634
20.6k
void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
635
20.6k
  push_back_new(hypotheses_, LineHypothesis(LT_START, model));
636
20.6k
  auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));
637
20.6k
  if (found != hypotheses_.end()) {
638
1.61k
    hypotheses_.erase(found);
639
1.61k
  }
640
20.6k
}
641
642
71.8k
void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
643
71.8k
  push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));
644
71.8k
  auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));
645
71.8k
  if (found != hypotheses_.end()) {
646
1.80k
    hypotheses_.erase(found);
647
1.80k
  }
648
71.8k
}
649
650
571k
void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
651
571k
  for (const auto &hypothese : hypotheses_) {
652
224k
    if (hypothese.ty == LT_START && StrongModel(hypothese.model)) {
653
2.71k
      push_back_new(*models, hypothese.model);
654
2.71k
    }
655
224k
  }
656
571k
}
657
658
1.09M
void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
659
1.09M
  for (const auto &hypothese : hypotheses_) {
660
433k
    if (StrongModel(hypothese.model)) {
661
79.6k
      push_back_new(*models, hypothese.model);
662
79.6k
    }
663
433k
  }
664
1.09M
}
665
666
1.28M
void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
667
1.28M
  for (const auto &hypothese : hypotheses_) {
668
696k
    if (hypothese.model != nullptr) {
669
622k
      push_back_new(*models, hypothese.model);
670
622k
    }
671
696k
  }
672
1.28M
}
673
674
14.6k
const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
675
14.6k
  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) {
676
173
    return nullptr;
677
173
  }
678
14.4k
  return hypotheses_[0].model;
679
14.6k
}
680
681
149k
const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
682
149k
  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) {
683
118k
    return nullptr;
684
118k
  }
685
30.6k
  return hypotheses_[0].model;
686
149k
}
687
688
// Discard any hypotheses whose model is not in the given list.
689
0
void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models) {
690
0
  if (models.empty()) {
691
0
    return;
692
0
  }
693
0
  for (int h = hypotheses_.size() - 1; h >= 0; h--) {
694
0
    if (!contains(models, hypotheses_[h].model)) {
695
0
      hypotheses_.erase(hypotheses_.begin() + h);
696
0
    }
697
0
  }
698
0
}
699
700
// ============ Geometry based Paragraph Detection Algorithm =================
701
702
struct Cluster {
703
0
  Cluster() : center(0), count(0) {}
704
109k
  Cluster(int cen, int num) : center(cen), count(num) {}
705
706
  int center; // The center of the cluster.
707
  int count;  // The number of entries within the cluster.
708
};
709
710
class SimpleClusterer {
711
public:
712
39.7k
  explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {}
713
523k
  void Add(int value) {
714
523k
    values_.push_back(value);
715
523k
  }
716
0
  size_t size() const {
717
0
    return values_.size();
718
0
  }
719
  void GetClusters(std::vector<Cluster> *clusters);
720
721
private:
722
  int max_cluster_width_;
723
  std::vector<int> values_;
724
};
725
726
// Return the index of the cluster closest to value.
727
363k
static int ClosestCluster(const std::vector<Cluster> &clusters, int value) {
728
363k
  unsigned best_index = 0;
729
1.11M
  for (unsigned i = 0; i < clusters.size(); i++) {
730
752k
    if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) {
731
162k
      best_index = i;
732
162k
    }
733
752k
  }
734
363k
  return best_index;
735
363k
}
736
737
59.5k
void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
738
59.5k
  clusters->clear();
739
59.5k
  std::sort(values_.begin(), values_.end());
740
168k
  for (unsigned i = 0; i < values_.size();) {
741
109k
    int orig_i = i;
742
109k
    int lo = values_[i];
743
109k
    int hi = lo;
744
784k
    while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
745
675k
      hi = values_[i];
746
675k
    }
747
109k
    clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
748
109k
  }
749
59.5k
}
750
751
// Calculate left- and right-indent tab stop values seen in
752
// rows[row_start, row_end) given a tolerance of tolerance.
753
static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
754
                              int tolerance, std::vector<Cluster> *left_tabs,
755
9.92k
                              std::vector<Cluster> *right_tabs) {
756
9.92k
  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
757
0
    return;
758
0
  }
759
  // First pass: toss all left and right indents into clusterers.
760
9.92k
  SimpleClusterer initial_lefts(tolerance);
761
9.92k
  SimpleClusterer initial_rights(tolerance);
762
9.92k
  std::vector<Cluster> initial_left_tabs;
763
9.92k
  std::vector<Cluster> initial_right_tabs;
764
141k
  for (int i = row_start; i < row_end; i++) {
765
131k
    initial_lefts.Add((*rows)[i].lindent_);
766
131k
    initial_rights.Add((*rows)[i].rindent_);
767
131k
  }
768
9.92k
  initial_lefts.GetClusters(&initial_left_tabs);
769
9.92k
  initial_rights.GetClusters(&initial_right_tabs);
770
771
  // Second pass: cluster only lines that are not "stray"
772
  //   An example of a stray line is a page number -- a line whose start
773
  //   and end tab-stops are far outside the typical start and end tab-stops
774
  //   for the block.
775
  //   Put another way, we only cluster data from lines whose start or end
776
  //   tab stop is frequent.
777
9.92k
  SimpleClusterer lefts(tolerance);
778
9.92k
  SimpleClusterer rights(tolerance);
779
780
  // Outlier elimination.  We might want to switch this to test outlier-ness
781
  // based on how strange a position an outlier is in instead of or in addition
782
  // to how rare it is.  These outliers get re-added if we end up having too
783
  // few tab stops, to work with, however.
784
9.92k
  int infrequent_enough_to_ignore = 0;
785
9.92k
  if (row_end - row_start >= 8) {
786
7.11k
    infrequent_enough_to_ignore = 1;
787
7.11k
  }
788
9.92k
  if (row_end - row_start >= 20) {
789
2.07k
    infrequent_enough_to_ignore = 2;
790
2.07k
  }
791
792
141k
  for (int i = row_start; i < row_end; i++) {
793
131k
    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
794
131k
    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
795
131k
    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
796
131k
        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
797
130k
      lefts.Add((*rows)[i].lindent_);
798
130k
      rights.Add((*rows)[i].rindent_);
799
130k
    }
800
131k
  }
801
9.92k
  lefts.GetClusters(left_tabs);
802
9.92k
  rights.GetClusters(right_tabs);
803
804
9.92k
  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
805
9.92k
      (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
806
    // One side is really ragged, and the other only has one tab stop,
807
    // so those "insignificant outliers" are probably important, actually.
808
    // This often happens on a page of an index.  Add back in the ones
809
    // we omitted in the first pass.
810
3.91k
    for (int i = row_start; i < row_end; i++) {
811
3.69k
      int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
812
3.69k
      int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
813
3.69k
      if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
814
3.69k
            initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
815
60
        lefts.Add((*rows)[i].lindent_);
816
60
        rights.Add((*rows)[i].rindent_);
817
60
      }
818
3.69k
    }
819
227
  }
820
9.92k
  lefts.GetClusters(left_tabs);
821
9.92k
  rights.GetClusters(right_tabs);
822
823
  // If one side is almost a two-indent aligned side, and the other clearly
824
  // isn't, try to prune out the least frequent tab stop from that side.
825
9.92k
  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
826
171
    int to_prune = -1;
827
684
    for (int i = left_tabs->size() - 1; i >= 0; i--) {
828
513
      if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
829
195
        to_prune = i;
830
195
      }
831
513
    }
832
171
    if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
833
100
      left_tabs->erase(left_tabs->begin() + to_prune);
834
100
    }
835
171
  }
836
9.92k
  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
837
111
    int to_prune = -1;
838
444
    for (int i = right_tabs->size() - 1; i >= 0; i--) {
839
333
      if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
840
130
        to_prune = i;
841
130
      }
842
333
    }
843
111
    if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
844
87
      right_tabs->erase(right_tabs->begin() + to_prune);
845
87
    }
846
111
  }
847
9.92k
}
848
849
// Given a paragraph model mark rows[row_start, row_end) as said model
850
// start or body lines.
851
//
852
// Case 1: model->first_indent_ != model->body_indent_
853
//   Differentiating the paragraph start lines from the paragraph body lines in
854
//   this case is easy, we just see how far each line is indented.
855
//
856
// Case 2: model->first_indent_ == model->body_indent_
857
//   Here, we find end-of-paragraph lines by looking for "short lines."
858
//   What constitutes a "short line" changes depending on whether the text
859
//   ragged-right[left] or fully justified (aligned left and right).
860
//
861
//   Case 2a: Ragged Right (or Left) text.  (eop_threshold == 0)
862
//     We have a new paragraph it the first word would have at the end
863
//     of the previous line.
864
//
865
//   Case 2b: Fully Justified.  (eop_threshold > 0)
866
//     We mark a line as short (end of paragraph) if the offside indent
867
//     is greater than eop_threshold.
868
static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
869
2.16k
                              const ParagraphModel *model, bool ltr, int eop_threshold) {
870
2.16k
  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
871
0
    return;
872
0
  }
873
31.0k
  for (int row = row_start; row < row_end; row++) {
874
28.8k
    bool valid_first = ValidFirstLine(rows, row, model);
875
28.8k
    bool valid_body = ValidBodyLine(rows, row, model);
876
28.8k
    if (valid_first && !valid_body) {
877
2.27k
      (*rows)[row].AddStartLine(model);
878
26.5k
    } else if (valid_body && !valid_first) {
879
8.65k
      (*rows)[row].AddBodyLine(model);
880
17.9k
    } else if (valid_body && valid_first) {
881
17.6k
      bool after_eop = (row == row_start);
882
17.6k
      if (row > row_start) {
883
16.3k
        if (eop_threshold > 0) {
884
6.66k
          if (model->justification() == JUSTIFICATION_LEFT) {
885
6.32k
            after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
886
6.32k
          } else {
887
336
            after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
888
336
          }
889
9.73k
        } else {
890
9.73k
          after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification());
891
9.73k
        }
892
16.3k
      }
893
17.6k
      if (after_eop) {
894
4.20k
        (*rows)[row].AddStartLine(model);
895
13.4k
      } else {
896
13.4k
        (*rows)[row].AddBodyLine(model);
897
13.4k
      }
898
17.6k
    } else {
899
      // Do nothing. Stray row.
900
298
    }
901
28.8k
  }
902
2.16k
}
903
904
// GeometricClassifierState holds all of the information we'll use while
905
// trying to determine a paragraph model for the text lines in a block of
906
// text:
907
//   + the rows under consideration [row_start, row_end)
908
//   + the common left- and right-indent tab stops
909
//   + does the block start out left-to-right or right-to-left
910
// Further, this struct holds the data we amass for the (single) ParagraphModel
911
// we'll assign to the text lines (assuming we get that far).
912
struct GeometricClassifierState {
913
  GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
914
                           int r_end)
915
9.92k
      : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
916
9.92k
    tolerance = InterwordSpace(*r, r_start, r_end);
917
9.92k
    CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs);
918
9.92k
    if (debug_level >= 3) {
919
0
      tesserr << "Geometry: TabStop cluster tolerance = " << tolerance << "; "
920
0
              << left_tabs.size() << " left tabs; "
921
0
              << right_tabs.size() << " right tabs\n";
922
0
    }
923
9.92k
    ltr = (*r)[r_start].ri_->ltr;
924
9.92k
  }
925
926
3.00k
  void AssumeLeftJustification() {
927
3.00k
    just = tesseract::JUSTIFICATION_LEFT;
928
3.00k
    margin = (*rows)[row_start].lmargin_;
929
3.00k
  }
930
931
482
  void AssumeRightJustification() {
932
482
    just = tesseract::JUSTIFICATION_RIGHT;
933
482
    margin = (*rows)[row_start].rmargin_;
934
482
  }
935
936
  // Align tabs are the tab stops the text is aligned to.
937
17.2k
  const std::vector<Cluster> &AlignTabs() const {
938
17.2k
    if (just == tesseract::JUSTIFICATION_RIGHT) {
939
3.26k
      return right_tabs;
940
3.26k
    }
941
13.9k
    return left_tabs;
942
17.2k
  }
943
944
  // Offside tabs are the tab stops opposite the tabs used to align the text.
945
  //
946
  // Note that for a left-to-right text which is aligned to the right such as
947
  //     this function comment, the offside tabs are the horizontal tab stops
948
  //                 marking the beginning of ("Note", "this" and "marking").
949
7.36k
  const std::vector<Cluster> &OffsideTabs() const {
950
7.36k
    if (just == tesseract::JUSTIFICATION_RIGHT) {
951
1.85k
      return left_tabs;
952
1.85k
    }
953
5.50k
    return right_tabs;
954
7.36k
  }
955
956
  // Return whether the i'th row extends from the leftmost left tab stop
957
  // to the right most right tab stop.
958
44.6k
  bool IsFullRow(int i) const {
959
44.6k
    return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
960
44.6k
           ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
961
44.6k
  }
962
963
5.39k
  int AlignsideTabIndex(int row_idx) const {
964
5.39k
    return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
965
5.39k
  }
966
967
  // Given what we know about the paragraph justification (just), would the
968
  // first word of row_b have fit at the end of row_a?
969
33.2k
  bool FirstWordWouldHaveFit(int row_a, int row_b) {
970
33.2k
    return ::tesseract::FirstWordWouldHaveFit((*rows)[row_a], (*rows)[row_b], just);
971
33.2k
  }
972
973
0
  void PrintRows() const {
974
0
    PrintRowRange(*rows, row_start, row_end);
975
0
  }
976
977
6.43k
  void Fail(int min_debug_level, const char *why) const {
978
6.43k
    if (debug_level < min_debug_level) {
979
6.43k
      return;
980
6.43k
    }
981
0
    tprintf("# %s\n", why);
982
0
    PrintRows();
983
0
  }
984
985
2.16k
  ParagraphModel Model() const {
986
2.16k
    return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
987
2.16k
  }
988
989
  // We print out messages with a debug level at least as great as debug_level.
990
  int debug_level = 0;
991
992
  // The Geometric Classifier was asked to find a single paragraph model
993
  // to fit the text rows (*rows)[row_start, row_end)
994
  std::vector<RowScratchRegisters> *rows;
995
  int row_start = 0;
996
  int row_end = 0;
997
998
  // The amount by which we expect the text edge can vary and still be aligned.
999
  int tolerance = 0;
1000
1001
  // Is the script in this text block left-to-right?
1002
  // HORRIBLE ROUGH APPROXIMATION.  TODO(eger): Improve
1003
  bool ltr = false;
1004
1005
  // These left and right tab stops were determined to be the common tab
1006
  // stops for the given text.
1007
  std::vector<Cluster> left_tabs;
1008
  std::vector<Cluster> right_tabs;
1009
1010
  // These are parameters we must determine to create a ParagraphModel.
1011
  tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
1012
  int margin = 0;
1013
  int first_indent = 0;
1014
  int body_indent = 0;
1015
1016
  // eop_threshold > 0 if the text is fully justified.  See MarkRowsWithModel()
1017
  int eop_threshold = 0;
1018
};
1019
1020
// Given a section of text where strong textual clues did not help identifying
1021
// paragraph breaks, and for which the left and right indents have exactly
1022
// three tab stops between them, attempt to find the paragraph breaks based
1023
// solely on the outline of the text and whether the script is left-to-right.
1024
//
1025
// Algorithm Detail:
1026
//   The selected rows are in the form of a rectangle except
1027
//   for some number of "short lines" of the same length:
1028
//
1029
//   (A1)  xxxxxxxxxxxxx  (B1) xxxxxxxxxxxx
1030
//           xxxxxxxxxxx       xxxxxxxxxx    # A "short" line.
1031
//         xxxxxxxxxxxxx       xxxxxxxxxxxx
1032
//         xxxxxxxxxxxxx       xxxxxxxxxxxx
1033
//
1034
//   We have a slightly different situation if the only short
1035
//   line is at the end of the excerpt.
1036
//
1037
//   (A2) xxxxxxxxxxxxx  (B2) xxxxxxxxxxxx
1038
//        xxxxxxxxxxxxx       xxxxxxxxxxxx
1039
//        xxxxxxxxxxxxx       xxxxxxxxxxxx
1040
//          xxxxxxxxxxx       xxxxxxxxxx     # A "short" line.
1041
//
1042
//   We'll interpret these as follows based on the reasoning in the comment for
1043
//   GeometricClassify():
1044
//       [script direction: first indent, body indent]
1045
//   (A1) LtR: 2,0  RtL: 0,0   (B1) LtR: 0,0  RtL: 2,0
1046
//   (A2) LtR: 2,0  RtL: CrR   (B2) LtR: CrL  RtL: 2,0
1047
static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s,
1048
3.41k
                                                   ParagraphTheory *theory) {
1049
3.41k
  int num_rows = s.row_end - s.row_start;
1050
3.41k
  int num_full_rows = 0;
1051
3.41k
  int last_row_full = 0;
1052
48.0k
  for (int i = s.row_start; i < s.row_end; i++) {
1053
44.6k
    if (s.IsFullRow(i)) {
1054
12.0k
      num_full_rows++;
1055
12.0k
      if (i == s.row_end - 1) {
1056
671
        last_row_full++;
1057
671
      }
1058
12.0k
    }
1059
44.6k
  }
1060
1061
3.41k
  if (num_full_rows < 0.7 * num_rows) {
1062
2.57k
    s.Fail(1, "Not enough full lines to know which lines start paras.");
1063
2.57k
    return;
1064
2.57k
  }
1065
1066
  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1067
837
  s.eop_threshold = 0;
1068
1069
837
  if (s.ltr) {
1070
837
    s.AssumeLeftJustification();
1071
837
  } else {
1072
0
    s.AssumeRightJustification();
1073
0
  }
1074
1075
837
  if (debug_level > 0) {
1076
0
    tprintf(
1077
0
        "# Not enough variety for clear outline classification. "
1078
0
        "Guessing these are %s aligned based on script.\n",
1079
0
        s.ltr ? "left" : "right");
1080
0
    s.PrintRows();
1081
0
  }
1082
1083
837
  if (s.AlignTabs().size() == 2) { // case A1 or A2
1084
387
    s.first_indent = s.AlignTabs()[1].center;
1085
387
    s.body_indent = s.AlignTabs()[0].center;
1086
450
  } else { // case B1 or B2
1087
450
    if (num_rows - 1 == num_full_rows - last_row_full) {
1088
      // case B2
1089
82
      const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1090
82
      (*s.rows)[s.row_start].AddStartLine(model);
1091
732
      for (int i = s.row_start + 1; i < s.row_end; i++) {
1092
650
        (*s.rows)[i].AddBodyLine(model);
1093
650
      }
1094
82
      return;
1095
368
    } else {
1096
      // case B1
1097
368
      s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1098
368
      s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1099
368
    }
1100
450
  }
1101
755
  const ParagraphModel *model = theory->AddModel(s.Model());
1102
755
  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold);
1103
755
  return;
1104
837
}
1105
1106
// This function is called if strong textual clues were not available, but
1107
// the caller hopes that the paragraph breaks will be super obvious just
1108
// by the outline of the text.
1109
//
1110
// The particularly difficult case is figuring out what's going on if you
1111
// don't have enough short paragraph end lines to tell us what's going on.
1112
//
1113
// For instance, let's say you have the following outline:
1114
//
1115
//   (A1)  xxxxxxxxxxxxxxxxxxxxxx
1116
//           xxxxxxxxxxxxxxxxxxxx
1117
//         xxxxxxxxxxxxxxxxxxxxxx
1118
//         xxxxxxxxxxxxxxxxxxxxxx
1119
//
1120
// Even if we know that the text is left-to-right and so will probably be
1121
// left-aligned, both of the following are possible texts:
1122
//
1123
//  (A1a)  1. Here our list item
1124
//           with two full lines.
1125
//         2. Here a second item.
1126
//         3. Here our third one.
1127
//
1128
//  (A1b)  so ends paragraph one.
1129
//           Here  starts another
1130
//         paragraph  we want  to
1131
//         read.  This  continues
1132
//
1133
// These examples are obvious from the text and should have been caught
1134
// by the StrongEvidenceClassify pass.  However, for languages where we don't
1135
// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1136
// it's worth guessing that (A1b) is the correct interpretation if there are
1137
// far more "full" lines than "short" lines.
1138
static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
1139
14.0k
                              int row_start, int row_end, ParagraphTheory *theory) {
1140
14.0k
  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {
1141
4.11k
    return;
1142
4.11k
  }
1143
9.92k
  if (debug_level > 1) {
1144
0
    tprintf("###############################################\n");
1145
0
    tprintf("##### GeometricClassify( rows[%d:%d) )   ####\n", row_start, row_end);
1146
0
    tprintf("###############################################\n");
1147
0
  }
1148
9.92k
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1149
1150
9.92k
  GeometricClassifierState s(debug_level, rows, row_start, row_end);
1151
9.92k
  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1152
610
    s.Fail(2, "Too much variety for simple outline classification.");
1153
610
    return;
1154
610
  }
1155
9.31k
  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1156
3.25k
    s.Fail(1, "Not enough variety for simple outline classification.");
1157
3.25k
    return;
1158
3.25k
  }
1159
6.06k
  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1160
3.41k
    GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1161
3.41k
    return;
1162
3.41k
  }
1163
1164
  // At this point, we know that one side has at least two tab stops, and the
1165
  // other side has one or two tab stops.
1166
  // Left to determine:
1167
  //   (1) Which is the body indent and which is the first line indent?
1168
  //   (2) Is the text fully justified?
1169
1170
  // If one side happens to have three or more tab stops, assume that side
1171
  // is opposite of the aligned side.
1172
2.65k
  if (s.right_tabs.size() > 2) {
1173
1.42k
    s.AssumeLeftJustification();
1174
1.42k
  } else if (s.left_tabs.size() > 2) {
1175
482
    s.AssumeRightJustification();
1176
747
  } else if (s.ltr) { // guess based on script direction
1177
747
    s.AssumeLeftJustification();
1178
747
  } else {
1179
0
    s.AssumeRightJustification();
1180
0
  }
1181
1182
2.65k
  if (s.AlignTabs().size() == 2) {
1183
    // For each tab stop on the aligned side, how many of them appear
1184
    // to be paragraph start lines?  [first lines]
1185
1.88k
    int firsts[2] = {0, 0};
1186
    // Count the first line as a likely paragraph start line.
1187
1.88k
    firsts[s.AlignsideTabIndex(s.row_start)]++;
1188
    // For each line, if the first word would have fit on the previous
1189
    // line count it as a likely paragraph start line.
1190
1.88k
    bool jam_packed = true;
1191
29.8k
    for (int i = s.row_start + 1; i < s.row_end; i++) {
1192
27.9k
      if (s.FirstWordWouldHaveFit(i - 1, i)) {
1193
3.42k
        firsts[s.AlignsideTabIndex(i)]++;
1194
3.42k
        jam_packed = false;
1195
3.42k
      }
1196
27.9k
    }
1197
    // Make an extra accounting for the last line of the paragraph just
1198
    // in case it's the only short line in the block.  That is, take its
1199
    // first word as typical and see if this looks like the *last* line
1200
    // of a paragraph.  If so, mark the *other* indent as probably a first.
1201
1.88k
    if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1202
85
      firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1203
85
    }
1204
1205
1.88k
    int percent0firsts, percent1firsts;
1206
1.88k
    percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1207
1.88k
    percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1208
1209
    // TODO(eger): Tune these constants if necessary.
1210
1.88k
    if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {
1211
385
      s.first_indent = s.AlignTabs()[1].center;
1212
385
      s.body_indent = s.AlignTabs()[0].center;
1213
1.49k
    } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1214
1.49k
               percent1firsts + 30 < percent0firsts) {
1215
253
      s.first_indent = s.AlignTabs()[0].center;
1216
253
      s.body_indent = s.AlignTabs()[1].center;
1217
1.24k
    } else {
1218
      // Ambiguous! Probably lineated (poetry)
1219
1.24k
      if (debug_level > 1) {
1220
0
        tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1221
0
                s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1222
0
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1223
0
                s.AlignTabs()[0].center, percent0firsts);
1224
0
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1225
0
                s.AlignTabs()[1].center, percent1firsts);
1226
0
        s.PrintRows();
1227
0
      }
1228
1.24k
      return;
1229
1.24k
    }
1230
1.88k
  } else {
1231
    // There's only one tab stop for the "aligned to" side.
1232
768
    s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1233
768
  }
1234
1235
  // At this point, we have our model.
1236
1.40k
  const ParagraphModel *model = theory->AddModel(s.Model());
1237
1238
  // Now all we have to do is figure out if the text is fully justified or not.
1239
  // eop_threshold: default to fully justified unless we see evidence below.
1240
  //    See description on MarkRowsWithModel()
1241
1.40k
  s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1242
  // If the text is not fully justified, re-set the eop_threshold to 0.
1243
1.40k
  if (s.AlignTabs().size() == 2) {
1244
    // Paragraphs with a paragraph-start indent.
1245
5.62k
    for (int i = s.row_start; i < s.row_end - 1; i++) {
1246
5.37k
      if (ValidFirstLine(s.rows, i + 1, model) &&
1247
5.37k
          !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1248
1.20k
                       s.tolerance)) {
1249
        // We found a non-end-of-paragraph short line: not fully justified.
1250
386
        s.eop_threshold = 0;
1251
386
        break;
1252
386
      }
1253
5.37k
    }
1254
768
  } else {
1255
    // Paragraphs with no paragraph-start indent.
1256
4.28k
    for (int i = s.row_start; i < s.row_end - 1; i++) {
1257
4.05k
      if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1258
4.05k
          !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1259
2.61k
                       s.tolerance)) {
1260
        // We found a non-end-of-paragraph short line: not fully justified.
1261
540
        s.eop_threshold = 0;
1262
540
        break;
1263
540
      }
1264
4.05k
    }
1265
768
  }
1266
1.40k
  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1267
1.40k
}
1268
1269
// =============== Implementation of ParagraphTheory =====================
1270
1271
3.32k
const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
1272
3.32k
  for (const auto &m : *models_) {
1273
384
    if (m->Comparable(model)) {
1274
40
      return m;
1275
40
    }
1276
384
  }
1277
3.28k
  auto *m = new ParagraphModel(model);
1278
3.28k
  models_->push_back(m);
1279
3.28k
  push_back_new(models_we_added_, m);
1280
3.28k
  return m;
1281
3.32k
}
1282
1283
13.9k
void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
1284
13.9k
  size_t w = 0;
1285
16.5k
  for (size_t r = 0; r < models_->size(); r++) {
1286
2.58k
    ParagraphModel *m = (*models_)[r];
1287
2.58k
    if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {
1288
490
      delete m;
1289
2.09k
    } else {
1290
2.09k
      if (r > w) {
1291
2
        (*models_)[w] = m;
1292
2
      }
1293
2.09k
      w++;
1294
2.09k
    }
1295
2.58k
  }
1296
13.9k
  models_->resize(w);
1297
13.9k
}
1298
1299
// Examine rows[start, end) and try to determine if an existing non-centered
1300
// paragraph model would fit them perfectly.  If so, return a pointer to it.
1301
// If not, return nullptr.
1302
const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,
1303
0
                                            int start, int end) const {
1304
0
  for (const auto *model : *models_) {
1305
0
    if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) {
1306
0
      return model;
1307
0
    }
1308
0
  }
1309
0
  return nullptr;
1310
0
}
1311
1312
111k
void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
1313
111k
  for (const auto *model : *models_) {
1314
4.62k
    if (model->justification() != JUSTIFICATION_CENTER) {
1315
3.98k
      push_back_new(*models, model);
1316
3.98k
    }
1317
4.62k
  }
1318
111k
}
1319
1320
0
int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
1321
0
  int i = 0;
1322
0
  for (const auto *m : *models_) {
1323
0
    if (m == model) {
1324
0
      return i;
1325
0
    }
1326
0
    i++;
1327
0
  }
1328
0
  return -1;
1329
0
}
1330
1331
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
1332
79.5k
                    const ParagraphModel *model) {
1333
79.5k
  if (!StrongModel(model)) {
1334
0
    tprintf("ValidFirstLine() should only be called with strong models!\n");
1335
0
  }
1336
79.5k
  return StrongModel(model) && model->ValidFirstLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1337
79.5k
                                                     (*rows)[row].rindent_, (*rows)[row].rmargin_);
1338
79.5k
}
1339
1340
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
1341
52.0k
                   const ParagraphModel *model) {
1342
52.0k
  if (!StrongModel(model)) {
1343
0
    tprintf("ValidBodyLine() should only be called with strong models!\n");
1344
0
  }
1345
52.0k
  return StrongModel(model) && model->ValidBodyLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1346
52.0k
                                                    (*rows)[row].rindent_, (*rows)[row].rmargin_);
1347
52.0k
}
1348
1349
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
1350
127
                     const ParagraphModel *model) {
1351
127
  if (model != kCrownRight && model != kCrownLeft) {
1352
0
    tprintf("CrownCompatible() should only be called with crown models!\n");
1353
0
    return false;
1354
0
  }
1355
127
  auto &row_a = (*rows)[a];
1356
127
  auto &row_b = (*rows)[b];
1357
127
  if (model == kCrownRight) {
1358
32
    return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_,
1359
32
                       Epsilon(row_a.ri_->average_interword_space));
1360
32
  }
1361
95
  return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_,
1362
95
                     Epsilon(row_a.ri_->average_interword_space));
1363
127
}
1364
1365
// =============== Implementation of ParagraphModelSmearer ====================
1366
1367
ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
1368
                                             int row_start, int row_end, ParagraphTheory *theory)
1369
12.5k
    : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
1370
12.5k
  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1371
0
    row_start_ = 0;
1372
0
    row_end_ = 0;
1373
0
    return;
1374
0
  }
1375
12.5k
  open_models_.resize(open_models_.size() + row_end - row_start + 2);
1376
12.5k
}
1377
1378
// see paragraphs_internal.h
1379
54.4k
void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1380
54.4k
  SetOfModels no_models;
1381
54.4k
  if (row_start < row_start_) {
1382
0
    row_start = row_start_;
1383
0
  }
1384
54.4k
  if (row_end > row_end_) {
1385
0
    row_end = row_end_;
1386
0
  }
1387
1388
626k
  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {
1389
571k
    if ((*rows_)[row].ri_->num_words == 0) {
1390
0
      OpenModels(row + 1) = no_models;
1391
571k
    } else {
1392
571k
      SetOfModels &opened = OpenModels(row);
1393
571k
      (*rows_)[row].StartHypotheses(&opened);
1394
1395
      // Which models survive the transition from row to row + 1?
1396
571k
      SetOfModels still_open;
1397
571k
      for (auto &m : opened) {
1398
37.5k
        if (ValidFirstLine(rows_, row, m) || ValidBodyLine(rows_, row, m)) {
1399
          // This is basic filtering; we check likely paragraph starty-ness down
1400
          // below in Smear() -- you know, whether the first word would have fit
1401
          // and such.
1402
35.1k
          push_back_new(still_open, m);
1403
35.1k
        }
1404
37.5k
      }
1405
571k
      OpenModels(row + 1) = std::move(still_open);
1406
571k
    }
1407
571k
  }
1408
54.4k
}
1409
1410
// see paragraphs_internal.h
1411
12.5k
void ParagraphModelSmearer::Smear() {
1412
12.5k
  CalculateOpenModels(row_start_, row_end_);
1413
1414
  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1415
  // we have multiple LT_START hypotheses), see if there's a model that
1416
  // was recently used (an "open" model) which might model it well.
1417
155k
  for (int i = row_start_; i < row_end_; i++) {
1418
143k
    RowScratchRegisters &row = (*rows_)[i];
1419
143k
    if (row.ri_->num_words == 0) {
1420
0
      continue;
1421
0
    }
1422
1423
    // Step One:
1424
    //   Figure out if there are "open" models which are left-alined or
1425
    //   right-aligned.  This is important for determining whether the
1426
    //   "first" word in a row would fit at the "end" of the previous row.
1427
143k
    bool left_align_open = false;
1428
143k
    bool right_align_open = false;
1429
143k
    for (auto &m : OpenModels(i)) {
1430
5.14k
      switch (m->justification()) {
1431
4.72k
        case JUSTIFICATION_LEFT:
1432
4.72k
          left_align_open = true;
1433
4.72k
          break;
1434
0
        case JUSTIFICATION_RIGHT:
1435
0
          right_align_open = true;
1436
0
          break;
1437
423
        default:
1438
423
          left_align_open = right_align_open = true;
1439
5.14k
      }
1440
5.14k
    }
1441
    // Step Two:
1442
    //   Use that knowledge to figure out if this row is likely to
1443
    //   start a paragraph.
1444
143k
    bool likely_start;
1445
143k
    if (i == 0) {
1446
12.1k
      likely_start = true;
1447
131k
    } else {
1448
131k
      if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
1449
126k
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) ||
1450
126k
                       LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1451
126k
      } else if (left_align_open) {
1452
4.39k
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT);
1453
4.39k
      } else {
1454
0
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1455
0
      }
1456
131k
    }
1457
1458
    // Step Three:
1459
    //   If this text line seems like an obvious first line of an
1460
    //   open model, or an obvious continuation of an existing
1461
    //   modelled paragraph, mark it up.
1462
143k
    if (likely_start) {
1463
      // Add Start Hypotheses for all Open models that fit.
1464
16.6k
      for (unsigned m = 0; m < OpenModels(i).size(); m++) {
1465
459
        if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1466
391
          row.AddStartLine(OpenModels(i)[m]);
1467
391
        }
1468
459
      }
1469
127k
    } else {
1470
      // Add relevant body line hypotheses.
1471
127k
      SetOfModels last_line_models;
1472
127k
      if (i > 0) {
1473
127k
        (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1474
127k
      } else {
1475
0
        theory_->NonCenteredModels(&last_line_models);
1476
0
      }
1477
127k
      for (auto model : last_line_models) {
1478
4.54k
        if (ValidBodyLine(rows_, i, model)) {
1479
2.65k
          row.AddBodyLine(model);
1480
2.65k
        }
1481
4.54k
      }
1482
127k
    }
1483
1484
    // Step Four:
1485
    //   If we're still quite unsure about this line, go through all
1486
    //   models in our theory and see if this row could be the start
1487
    //   of any of our  models.
1488
143k
    if (row.GetLineType() == LT_UNKNOWN ||
1489
143k
        (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1490
111k
      SetOfModels all_models;
1491
111k
      theory_->NonCenteredModels(&all_models);
1492
111k
      for (auto &all_model : all_models) {
1493
3.98k
        if (ValidFirstLine(rows_, i, all_model)) {
1494
1.43k
          row.AddStartLine(all_model);
1495
1.43k
        }
1496
3.98k
      }
1497
111k
    }
1498
    // Step Five:
1499
    //   Since we may have updated the hypotheses about this row, we need
1500
    //   to recalculate the Open models for the rest of rows[i + 1, row_end)
1501
143k
    if (row.GetLineType() != LT_UNKNOWN) {
1502
41.8k
      CalculateOpenModels(i + 1, row_end_);
1503
41.8k
    }
1504
143k
  }
1505
12.5k
}
1506
1507
// ================ Main Paragraph Detection Algorithm =======================
1508
1509
// Find out what ParagraphModels are actually used, and discard any
1510
// that are not.
1511
static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,
1512
13.9k
                                ParagraphTheory *theory) {
1513
13.9k
  SetOfModels used_models;
1514
142k
  for (const auto &row : rows) {
1515
142k
    row.StrongHypotheses(&used_models);
1516
142k
  }
1517
13.9k
  theory->DiscardUnusedModels(used_models);
1518
13.9k
}
1519
1520
// DowngradeWeakestToCrowns:
1521
//   Forget any flush-{left, right} models unless we see two or more
1522
//   of them in sequence.
1523
//
1524
// In pass 3, we start to classify even flush-left paragraphs (paragraphs
1525
// where the first line and body indent are the same) as having proper Models.
1526
// This is generally dangerous, since if you start imagining that flush-left
1527
// is a typical paragraph model when it is not, it will lead you to chop normal
1528
// indented paragraphs in the middle whenever a sentence happens to start on a
1529
// new line (see "This" above).  What to do?
1530
//   What we do is to take any paragraph which is flush left and is not
1531
// preceded by another paragraph of the same model and convert it to a "Crown"
1532
// paragraph.  This is a weak pseudo-ParagraphModel which is a placeholder
1533
// for later.  It means that the paragraph is flush, but it would be desirable
1534
// to mark it as the same model as following text if it fits.  This downgrade
1535
// FlushLeft -> CrownLeft -> Model of following paragraph.  Means that we
1536
// avoid making flush left Paragraph Models whenever we see a top-of-the-page
1537
// half-of-a-paragraph. and instead we mark it the same as normal body text.
1538
//
1539
// Implementation:
1540
//
1541
//   Comb backwards through the row scratch registers, and turn any
1542
//   sequences of body lines of equivalent type abutted against the beginning
1543
//   or a body or start line of a different type into a crown paragraph.
1544
static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
1545
13.9k
                                     std::vector<RowScratchRegisters> *rows) {
1546
13.9k
  int start;
1547
18.4k
  for (int end = rows->size(); end > 0; end = start) {
1548
    // Search back for a body line of a unique type.
1549
16.7k
    const ParagraphModel *model = nullptr;
1550
131k
    while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
1551
115k
      end--;
1552
115k
    }
1553
16.7k
    if (end == 0) {
1554
12.2k
      break;
1555
12.2k
    }
1556
4.52k
    start = end - 1;
1557
29.8k
    while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1558
25.2k
      start--; // walk back to the first line that is not the same body type.
1559
25.2k
    }
1560
4.52k
    if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) &&
1561
4.52k
        NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
1562
2.15k
      start--;
1563
2.15k
    }
1564
4.52k
    start++;
1565
    // Now rows[start, end) is a sequence of unique body hypotheses of model.
1566
4.52k
    if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {
1567
92
      continue;
1568
92
    }
1569
4.43k
    if (!StrongModel(model)) {
1570
212
      while (start > 0 && CrownCompatible(rows, start - 1, start, model)) {
1571
119
        start--;
1572
119
      }
1573
93
    }
1574
4.43k
    if (start == 0 || (!StrongModel(model)) ||
1575
4.43k
        (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1576
      // crownify rows[start, end)
1577
1.96k
      const ParagraphModel *crown_model = model;
1578
1.96k
      if (StrongModel(model)) {
1579
1.87k
        if (model->justification() == JUSTIFICATION_LEFT) {
1580
1.62k
          crown_model = kCrownLeft;
1581
1.62k
        } else {
1582
249
          crown_model = kCrownRight;
1583
249
        }
1584
1.87k
      }
1585
1.96k
      (*rows)[start].SetUnknown();
1586
1.96k
      (*rows)[start].AddStartLine(crown_model);
1587
16.4k
      for (int row = start + 1; row < end; row++) {
1588
14.4k
        (*rows)[row].SetUnknown();
1589
14.4k
        (*rows)[row].AddBodyLine(crown_model);
1590
14.4k
      }
1591
1.96k
    }
1592
4.43k
  }
1593
13.9k
  DiscardUnusedModels(*rows, theory);
1594
13.9k
}
1595
1596
// Clear all hypotheses about lines [start, end) and reset margins.
1597
//
1598
// The empty space between the left of a row and the block boundary (and
1599
// similarly for the right) is split into two pieces: margin and indent.
1600
// In initial processing, we assume the block is tight and the margin for
1601
// all lines is set to zero.   However, if our first pass does not yield
1602
// models for  everything,  it may be  due to an  inset paragraph like a
1603
// block-quote.   In that case, we make a second pass over that unmarked
1604
// section of the page and reset the "margin" portion of the empty space
1605
// to the common amount of space at  the ends of the lines under consid-
1606
// eration.    This would be equivalent to percentile set to 0. However,
1607
// sometimes we have a single character sticking out in the right margin
1608
// of a text block  (like the 'r' in 'for' on line 3 above),  and we can
1609
// really  just ignore it as an outlier.   To express this, we allow the
1610
// user to specify  the percentile (0..100)  of indent values  to use as
1611
// the common margin for each row in the run of rows[start, end).
1612
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
1613
22.4k
                                        int end, int percentile) {
1614
22.4k
  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) {
1615
0
    return;
1616
0
  }
1617
1618
22.4k
  int lmin, lmax, rmin, rmax;
1619
22.4k
  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1620
22.4k
  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1621
297k
  for (int i = start; i < end; i++) {
1622
274k
    RowScratchRegisters &sr = (*rows)[i];
1623
274k
    sr.SetUnknown();
1624
274k
    if (sr.ri_->num_words == 0) {
1625
0
      continue;
1626
0
    }
1627
274k
    UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1628
274k
    UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1629
274k
  }
1630
22.4k
  STATS lefts(lmin, lmax);
1631
22.4k
  STATS rights(rmin, rmax);
1632
297k
  for (int i = start; i < end; i++) {
1633
274k
    RowScratchRegisters &sr = (*rows)[i];
1634
274k
    if (sr.ri_->num_words == 0) {
1635
0
      continue;
1636
0
    }
1637
274k
    lefts.add(sr.lmargin_ + sr.lindent_, 1);
1638
274k
    rights.add(sr.rmargin_ + sr.rindent_, 1);
1639
274k
  }
1640
22.4k
  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1641
22.4k
  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1642
297k
  for (int i = start; i < end; i++) {
1643
274k
    RowScratchRegisters &sr = (*rows)[i];
1644
274k
    int ldelta = ignorable_left - sr.lmargin_;
1645
274k
    sr.lmargin_ += ldelta;
1646
274k
    sr.lindent_ -= ldelta;
1647
274k
    int rdelta = ignorable_right - sr.rmargin_;
1648
274k
    sr.rmargin_ += rdelta;
1649
274k
    sr.rindent_ -= rdelta;
1650
274k
  }
1651
22.4k
}
1652
1653
// Return the median inter-word space in rows[row_start, row_end).
1654
19.5k
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {
1655
19.5k
  if (row_end < row_start + 1) {
1656
0
    return 1;
1657
0
  }
1658
19.5k
  int word_height =
1659
19.5k
      (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2;
1660
19.5k
  int word_width =
1661
19.5k
      (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2;
1662
19.5k
  STATS spacing_widths(0, 4 + word_width);
1663
179k
  for (int i = row_start; i < row_end; i++) {
1664
160k
    if (rows[i].ri_->num_words > 1) {
1665
18.7k
      spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1666
18.7k
    }
1667
160k
  }
1668
19.5k
  int minimum_reasonable_space = word_height / 3;
1669
19.5k
  if (minimum_reasonable_space < 2) {
1670
153
    minimum_reasonable_space = 2;
1671
153
  }
1672
19.5k
  int median = spacing_widths.median();
1673
19.5k
  return (median > minimum_reasonable_space) ? median : minimum_reasonable_space;
1674
19.5k
}
1675
1676
// Return whether the first word on the after line can fit in the space at
1677
// the end of the before line (knowing which way the text is aligned and read).
1678
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
1679
535k
                           tesseract::ParagraphJustification justification) {
1680
535k
  if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1681
0
    return true;
1682
0
  }
1683
1684
535k
  if (justification == JUSTIFICATION_UNKNOWN) {
1685
0
    tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1686
0
  }
1687
535k
  int available_space;
1688
535k
  if (justification == JUSTIFICATION_CENTER) {
1689
0
    available_space = before.lindent_ + before.rindent_;
1690
535k
  } else {
1691
535k
    available_space = before.OffsideIndent(justification);
1692
535k
  }
1693
535k
  available_space -= before.ri_->average_interword_space;
1694
1695
535k
  if (before.ri_->ltr) {
1696
535k
    return after.ri_->lword_box.width() < available_space;
1697
535k
  }
1698
0
  return after.ri_->rword_box.width() < available_space;
1699
535k
}
1700
1701
// Return whether the first word on the after line can fit in the space at
1702
// the end of the before line (not knowing which way the text goes) in a left
1703
// or right alignment.
1704
22.7k
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after) {
1705
22.7k
  if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1706
0
    return true;
1707
0
  }
1708
1709
22.7k
  int available_space = before.lindent_;
1710
22.7k
  if (before.rindent_ > available_space) {
1711
6.87k
    available_space = before.rindent_;
1712
6.87k
  }
1713
22.7k
  available_space -= before.ri_->average_interword_space;
1714
1715
22.7k
  if (before.ri_->ltr) {
1716
22.7k
    return after.ri_->lword_box.width() < available_space;
1717
22.7k
  }
1718
0
  return after.ri_->rword_box.width() < available_space;
1719
22.7k
}
1720
1721
20.5k
static bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) {
1722
20.5k
  if (before.ri_->ltr) {
1723
20.5k
    return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea;
1724
20.5k
  } else {
1725
0
    return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea;
1726
0
  }
1727
20.5k
}
1728
1729
static bool LikelyParagraphStart(const RowScratchRegisters &before,
1730
                                 const RowScratchRegisters &after,
1731
346k
                                 tesseract::ParagraphJustification j) {
1732
346k
  return before.ri_->num_words == 0 ||
1733
346k
         (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after));
1734
346k
}
1735
1736
// Examine rows[start, end) and try to determine what sort of ParagraphModel
1737
// would fit them as a single paragraph.
1738
// If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
1739
// If the rows given could be a consistent start to a paragraph, set *consistent
1740
// true.
1741
static ParagraphModel InternalParagraphModelByOutline(
1742
    const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
1743
35.5k
    bool *consistent) {
1744
35.5k
  int ltr_line_count = 0;
1745
188k
  for (int i = start; i < end; i++) {
1746
153k
    ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1747
153k
  }
1748
35.5k
  bool ltr = (ltr_line_count >= (end - start) / 2);
1749
1750
35.5k
  *consistent = true;
1751
35.5k
  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) {
1752
0
    return ParagraphModel();
1753
0
  }
1754
1755
  // Ensure the caller only passed us a region with a common rmargin and
1756
  // lmargin.
1757
35.5k
  int lmargin = (*rows)[start].lmargin_;
1758
35.5k
  int rmargin = (*rows)[start].rmargin_;
1759
35.5k
  int lmin, lmax, rmin, rmax, cmin, cmax;
1760
35.5k
  lmin = lmax = (*rows)[start + 1].lindent_;
1761
35.5k
  rmin = rmax = (*rows)[start + 1].rindent_;
1762
35.5k
  cmin = cmax = 0;
1763
153k
  for (int i = start + 1; i < end; i++) {
1764
117k
    if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1765
0
      tprintf("Margins don't match! Software error.\n");
1766
0
      *consistent = false;
1767
0
      return ParagraphModel();
1768
0
    }
1769
117k
    UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1770
117k
    UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1771
117k
    UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1772
117k
  }
1773
35.5k
  int ldiff = lmax - lmin;
1774
35.5k
  int rdiff = rmax - rmin;
1775
35.5k
  int cdiff = cmax - cmin;
1776
35.5k
  if (rdiff > tolerance && ldiff > tolerance) {
1777
997
    if (cdiff < tolerance * 2) {
1778
345
      if (end - start < 3) {
1779
0
        return ParagraphModel();
1780
0
      }
1781
345
      return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1782
345
    }
1783
652
    *consistent = false;
1784
652
    return ParagraphModel();
1785
997
  }
1786
34.5k
  if (end - start < 3) { // Don't return a model for two line paras.
1787
16.9k
    return ParagraphModel();
1788
16.9k
  }
1789
1790
  // These booleans keep us from saying something is aligned left when the body
1791
  // left variance is too large.
1792
17.5k
  bool body_admits_left_alignment = ldiff < tolerance;
1793
17.5k
  bool body_admits_right_alignment = rdiff < tolerance;
1794
1795
17.5k
  ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1796
17.5k
                                             (lmin + lmax) / 2, tolerance);
1797
17.5k
  ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1798
17.5k
                                              (rmin + rmax) / 2, tolerance);
1799
1800
  // These booleans keep us from having an indent on the "wrong side" for the
1801
  // first line.
1802
17.5k
  bool text_admits_left_alignment = ltr || left_model.is_flush();
1803
17.5k
  bool text_admits_right_alignment = !ltr || right_model.is_flush();
1804
1805
  // At least one of the edges is less than tolerance in variance.
1806
  // If the other is obviously ragged, it can't be the one aligned to.
1807
  // [Note the last line is included in this raggedness.]
1808
17.5k
  if (tolerance < rdiff) {
1809
6.00k
    if (body_admits_left_alignment && text_admits_left_alignment) {
1810
5.76k
      return left_model;
1811
5.76k
    }
1812
237
    *consistent = false;
1813
237
    return ParagraphModel();
1814
6.00k
  }
1815
11.5k
  if (tolerance < ldiff) {
1816
1.51k
    if (body_admits_right_alignment && text_admits_right_alignment) {
1817
1.23k
      return right_model;
1818
1.23k
    }
1819
281
    *consistent = false;
1820
281
    return ParagraphModel();
1821
1.51k
  }
1822
1823
  // At this point, we know the body text doesn't vary much on either side.
1824
1825
  // If the first line juts out oddly in one direction or the other,
1826
  // that likely indicates the side aligned to.
1827
10.0k
  int first_left = (*rows)[start].lindent_;
1828
10.0k
  int first_right = (*rows)[start].rindent_;
1829
1830
10.0k
  if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) {
1831
4.42k
    return left_model;
1832
4.42k
  }
1833
5.57k
  if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) {
1834
0
    return right_model;
1835
0
  }
1836
1837
5.57k
  *consistent = false;
1838
5.57k
  return ParagraphModel();
1839
5.57k
}
1840
1841
// Examine rows[start, end) and try to determine what sort of ParagraphModel
1842
// would fit them as a single paragraph.   If nothing fits,
1843
// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
1844
// output if we're debugging.
1845
static ParagraphModel ParagraphModelByOutline(int debug_level,
1846
                                              const std::vector<RowScratchRegisters> *rows,
1847
9.61k
                                              int start, int end, int tolerance) {
1848
9.61k
  bool unused_consistent;
1849
9.61k
  ParagraphModel retval =
1850
9.61k
      InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent);
1851
9.61k
  if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1852
0
    tprintf("Could not determine a model for this paragraph:\n");
1853
0
    PrintRowRange(*rows, start, end);
1854
0
  }
1855
9.61k
  return retval;
1856
9.61k
}
1857
1858
// Do rows[start, end) form a single instance of the given paragraph model?
1859
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
1860
0
                  const ParagraphModel *model) {
1861
0
  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) {
1862
0
    return false;
1863
0
  }
1864
0
  if (!ValidFirstLine(rows, start, model)) {
1865
0
    return false;
1866
0
  }
1867
0
  for (int i = start + 1; i < end; i++) {
1868
0
    if (!ValidBodyLine(rows, i, model)) {
1869
0
      return false;
1870
0
    }
1871
0
  }
1872
0
  return true;
1873
0
}
1874
1875
// Examine rows[row_start, row_end) as an independent section of text,
1876
// and mark rows that are exceptionally clear as start-of-paragraph
1877
// and paragraph-body lines.
1878
//
1879
// We presume that any lines surrounding rows[row_start, row_end) may
1880
// have wildly different paragraph models, so we don't key any data off
1881
// of those lines.
1882
//
1883
// We only take the very strongest signals, as we don't want to get
1884
// confused and marking up centered text, poetry, or source code as
1885
// clearly part of a typical paragraph.
1886
static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
1887
12.5k
                               int row_end) {
1888
  // Record patently obvious body text.
1889
143k
  for (int i = row_start + 1; i < row_end; i++) {
1890
130k
    const RowScratchRegisters &prev = (*rows)[i - 1];
1891
130k
    RowScratchRegisters &curr = (*rows)[i];
1892
130k
    tesseract::ParagraphJustification typical_justification =
1893
130k
        prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1894
130k
    if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
1895
130k
        !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1896
22.4k
      curr.SetBodyLine();
1897
22.4k
    }
1898
130k
  }
1899
1900
  // Record patently obvious start paragraph lines.
1901
  //
1902
  // It's an extremely good signal of the start of a paragraph that
1903
  // the first word would have fit on the end of the previous line.
1904
  // However, applying just that signal would have us mark random
1905
  // start lines of lineated text (poetry and source code) and some
1906
  // centered headings as paragraph start lines.  Therefore, we use
1907
  // a second qualification for a paragraph start: Not only should
1908
  // the first word of this line have fit on the previous line,
1909
  // but also, this line should go full to the right of the block,
1910
  // disallowing a subsequent word from having fit on this line.
1911
1912
  // First row:
1913
12.5k
  {
1914
12.5k
    RowScratchRegisters &curr = (*rows)[row_start];
1915
12.5k
    RowScratchRegisters &next = (*rows)[row_start + 1];
1916
12.5k
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1917
12.5k
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1918
12.5k
        (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
1919
10.1k
      curr.SetStartLine();
1920
10.1k
    }
1921
12.5k
  }
1922
  // Middle rows
1923
130k
  for (int i = row_start + 1; i < row_end - 1; i++) {
1924
118k
    RowScratchRegisters &prev = (*rows)[i - 1];
1925
118k
    RowScratchRegisters &curr = (*rows)[i];
1926
118k
    RowScratchRegisters &next = (*rows)[i + 1];
1927
118k
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1928
118k
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1929
118k
        LikelyParagraphStart(prev, curr, j)) {
1930
473
      curr.SetStartLine();
1931
473
    }
1932
118k
  }
1933
  // Last row
1934
12.5k
  { // the short circuit at the top means we have at least two lines.
1935
12.5k
    RowScratchRegisters &prev = (*rows)[row_end - 2];
1936
12.5k
    RowScratchRegisters &curr = (*rows)[row_end - 1];
1937
12.5k
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1938
12.5k
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) &&
1939
12.5k
        LikelyParagraphStart(prev, curr, j)) {
1940
13
      curr.SetStartLine();
1941
13
    }
1942
12.5k
  }
1943
12.5k
}
1944
1945
// Look for sequences of a start line followed by some body lines in
1946
// rows[row_start, row_end) and create ParagraphModels for them if
1947
// they seem coherent.
1948
static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
1949
                                int row_start, int row_end, bool allow_flush_models,
1950
12.5k
                                ParagraphTheory *theory) {
1951
12.5k
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
1952
0
    return;
1953
0
  }
1954
1955
12.5k
  int start = row_start;
1956
23.1k
  while (start < row_end) {
1957
136k
    while (start < row_end && (*rows)[start].GetLineType() != LT_START) {
1958
113k
      start++;
1959
113k
    }
1960
23.1k
    if (start >= row_end - 1) {
1961
12.5k
      break;
1962
12.5k
    }
1963
1964
10.6k
    int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1965
10.6k
    int end = start;
1966
10.6k
    ParagraphModel last_model;
1967
10.6k
    bool next_consistent;
1968
30.2k
    do {
1969
30.2k
      ++end;
1970
      // rows[row, end) was consistent.
1971
      // If rows[row, end + 1) is not consistent,
1972
      //   just model rows[row, end)
1973
30.2k
      if (end < row_end - 1) {
1974
27.7k
        RowScratchRegisters &next = (*rows)[end];
1975
27.7k
        LineType lt = next.GetLineType();
1976
27.7k
        next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN &&
1977
22.8k
                                            !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1978
27.7k
      } else {
1979
2.43k
        next_consistent = false;
1980
2.43k
      }
1981
30.2k
      if (next_consistent) {
1982
25.8k
        ParagraphModel next_model =
1983
25.8k
            InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
1984
25.8k
        if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&
1985
25.8k
             next_model.justification() != JUSTIFICATION_LEFT) ||
1986
25.8k
            (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&
1987
24.7k
             next_model.justification() != JUSTIFICATION_RIGHT)) {
1988
1.13k
          next_consistent = false;
1989
1.13k
        }
1990
25.8k
        last_model = next_model;
1991
25.8k
      } else {
1992
4.30k
        next_consistent = false;
1993
4.30k
      }
1994
30.2k
    } while (next_consistent && end < row_end);
1995
    // At this point, rows[start, end) looked like it could have been a
1996
    // single paragraph.  If we can make a good ParagraphModel for it,
1997
    // do so and mark this sequence with that model.
1998
10.6k
    if (end > start + 1) {
1999
      // emit a new paragraph if we have more than one line.
2000
9.61k
      const ParagraphModel *model = nullptr;
2001
9.61k
      ParagraphModel new_model = ParagraphModelByOutline(
2002
9.61k
          debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end)));
2003
9.61k
      if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
2004
        // couldn't create a good model, oh well.
2005
7.93k
      } else if (new_model.is_flush()) {
2006
1.23k
        if (end == start + 2) {
2007
          // It's very likely we just got two paragraph starts in a row.
2008
0
          end = start + 1;
2009
1.23k
        } else if (start == row_start) {
2010
          // Mark this as a Crown.
2011
1.15k
          if (new_model.justification() == JUSTIFICATION_LEFT) {
2012
1.03k
            model = kCrownLeft;
2013
1.03k
          } else {
2014
118
            model = kCrownRight;
2015
118
          }
2016
1.15k
        } else if (allow_flush_models) {
2017
0
          model = theory->AddModel(new_model);
2018
0
        }
2019
1.23k
      } else {
2020
445
        model = theory->AddModel(new_model);
2021
445
      }
2022
9.61k
      if (model) {
2023
1.59k
        (*rows)[start].AddStartLine(model);
2024
9.64k
        for (int i = start + 1; i < end; i++) {
2025
8.04k
          (*rows)[i].AddBodyLine(model);
2026
8.04k
        }
2027
1.59k
      }
2028
9.61k
    }
2029
10.6k
    start = end;
2030
10.6k
  }
2031
12.5k
}
2032
2033
// We examine rows[row_start, row_end) and do the following:
2034
//   (1) Clear all existing hypotheses for the rows being considered.
2035
//   (2) Mark up any rows as exceptionally likely to be paragraph starts
2036
//       or paragraph body lines as such using both geometric and textual
2037
//       clues.
2038
//   (3) Form models for any sequence of start + continuation lines.
2039
//   (4) Smear the paragraph models to cover surrounding text.
2040
static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
2041
14.4k
                                   int row_start, int row_end, ParagraphTheory *theory) {
2042
14.4k
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
2043
1.94k
    return;
2044
1.94k
  }
2045
2046
12.5k
  if (debug_level > 1) {
2047
0
    tprintf("#############################################\n");
2048
0
    tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2049
0
    tprintf("#############################################\n");
2050
0
  }
2051
2052
12.5k
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2053
12.5k
  MarkStrongEvidence(rows, row_start, row_end);
2054
2055
12.5k
  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2056
2057
  // Create paragraph models.
2058
12.5k
  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2059
2060
12.5k
  DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
2061
2062
  // At this point, some rows are marked up as paragraphs with model numbers,
2063
  // and some rows are marked up as either LT_START or LT_BODY.  Now let's
2064
  // smear any good paragraph hypotheses forward and backward.
2065
12.5k
  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2066
12.5k
  smearer.Smear();
2067
12.5k
}
2068
2069
static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,
2070
13.9k
                                      int row_end, ParagraphTheory *theory) {
2071
130k
  for (int i = row_start + 1; i < row_end - 1; i++) {
2072
116k
    if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
2073
116k
        (*rows)[i + 1].ri_->has_leaders) {
2074
0
      const ParagraphModel *model =
2075
0
          theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2076
0
      (*rows)[i].AddStartLine(model);
2077
0
    }
2078
116k
  }
2079
13.9k
}
2080
2081
// Collect sequences of unique hypotheses in row registers and create proper
2082
// paragraphs for them, referencing the paragraphs in row_owners.
2083
static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,
2084
                                                     std::vector<RowScratchRegisters> &rows,
2085
                                                     std::vector<PARA *> *row_owners,
2086
13.9k
                                                     ParagraphTheory *theory) {
2087
13.9k
  int end = rows.size();
2088
13.9k
  int start;
2089
132k
  for (; end > 0; end = start) {
2090
118k
    start = end - 1;
2091
118k
    const ParagraphModel *model = nullptr;
2092
    // TODO(eger): Be smarter about dealing with multiple hypotheses.
2093
118k
    bool single_line_paragraph = false;
2094
118k
    SetOfModels models;
2095
118k
    rows[start].NonNullHypotheses(&models);
2096
118k
    if (!models.empty()) {
2097
8.76k
      model = models[0];
2098
8.76k
      if (rows[start].GetLineType(model) != LT_BODY) {
2099
4.42k
        single_line_paragraph = true;
2100
4.42k
      }
2101
8.76k
    }
2102
118k
    if (model && !single_line_paragraph) {
2103
      // walk back looking for more body lines and then a start line.
2104
23.9k
      while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2105
        // do nothing
2106
19.6k
      }
2107
4.34k
      if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2108
6
        model = nullptr;
2109
6
      }
2110
4.34k
    }
2111
118k
    if (model == nullptr) {
2112
109k
      continue;
2113
109k
    }
2114
    // rows[start, end) should be a paragraph.
2115
8.75k
    PARA *p = new PARA();
2116
8.75k
    if (model == kCrownLeft || model == kCrownRight) {
2117
1.87k
      p->is_very_first_or_continuation = true;
2118
      // Crown paragraph.
2119
      //   If we can find an existing ParagraphModel that fits, use it,
2120
      //   else create a new one.
2121
2.75k
      for (unsigned row = end; row < rows.size(); row++) {
2122
2.03k
        if ((*row_owners)[row] &&
2123
2.03k
            (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2124
1.96k
             (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2125
1.16k
          model = (*row_owners)[row]->model;
2126
1.16k
          break;
2127
1.16k
        }
2128
2.03k
      }
2129
1.87k
      if (model == kCrownLeft) {
2130
        // No subsequent model fits, so cons one up.
2131
595
        model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT,
2132
595
                                                rows[start].lmargin_ + rows[start].lindent_, 0, 0,
2133
595
                                                Epsilon(rows[start].ri_->average_interword_space)));
2134
1.28k
      } else if (model == kCrownRight) {
2135
        // No subsequent model fits, so cons one up.
2136
123
        model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT,
2137
123
                                                rows[start].rmargin_ + rows[start].rmargin_, 0, 0,
2138
123
                                                Epsilon(rows[start].ri_->average_interword_space)));
2139
123
      }
2140
1.87k
    }
2141
8.75k
    rows[start].SetUnknown();
2142
8.75k
    rows[start].AddStartLine(model);
2143
32.7k
    for (int i = start + 1; i < end; i++) {
2144
23.9k
      rows[i].SetUnknown();
2145
23.9k
      rows[i].AddBodyLine(model);
2146
23.9k
    }
2147
8.75k
    p->model = model;
2148
8.75k
    p->has_drop_cap = rows[start].ri_->has_drop_cap;
2149
8.75k
    p->is_list_item = model->justification() == JUSTIFICATION_RIGHT
2150
8.75k
                          ? rows[start].ri_->rword_indicates_list_item
2151
8.75k
                          : rows[start].ri_->lword_indicates_list_item;
2152
41.4k
    for (int row = start; row < end; row++) {
2153
32.7k
      if ((*row_owners)[row] != nullptr) {
2154
0
        tprintf(
2155
0
            "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2156
0
            "more than once!\n");
2157
0
        delete (*row_owners)[row];
2158
0
      }
2159
32.7k
      (*row_owners)[row] = p;
2160
32.7k
    }
2161
8.75k
  }
2162
13.9k
}
2163
2164
struct Interval {
2165
0
  Interval() : begin(0), end(0) {}
2166
54.0k
  Interval(int b, int e) : begin(b), end(e) {}
2167
2168
  int begin;
2169
  int end;
2170
};
2171
2172
// Return whether rows[row] appears to be stranded, meaning that the evidence
2173
// for this row is very weak due to context.  For instance, two lines of source
2174
// code may happen to be indented at the same tab vector as body text starts,
2175
// leading us to think they are two start-of-paragraph lines.  This is not
2176
// optimal.  However, we also don't want to mark a sequence of short dialog
2177
// as "weak," so our heuristic is:
2178
//   (1) If a line is surrounded by lines of unknown type, it's weak.
2179
//   (2) If two lines in a row are start lines for a given paragraph type, but
2180
//       after that the same paragraph type does not continue, they're weak.
2181
25.3k
static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
2182
25.3k
  SetOfModels row_models;
2183
25.3k
  rows[row].StrongHypotheses(&row_models);
2184
2185
25.3k
  for (auto &row_model : row_models) {
2186
25.3k
    bool all_starts = rows[row].GetLineType();
2187
25.3k
    int run_length = 1;
2188
25.3k
    bool continues = true;
2189
224k
    for (int i = row - 1; i >= 0 && continues; i--) {
2190
198k
      SetOfModels models;
2191
198k
      rows[i].NonNullHypotheses(&models);
2192
198k
      switch (rows[i].GetLineType(row_model)) {
2193
56.8k
        case LT_START:
2194
56.8k
          run_length++;
2195
56.8k
          break;
2196
38
        case LT_MULTIPLE: // explicit fall-through
2197
137k
        case LT_BODY:
2198
137k
          run_length++;
2199
137k
          all_starts = false;
2200
137k
          break;
2201
4.04k
        case LT_UNKNOWN: // explicit fall-through
2202
4.04k
        default:
2203
4.04k
          continues = false;
2204
198k
      }
2205
198k
    }
2206
25.3k
    continues = true;
2207
190k
    for (unsigned i = row + 1; i < rows.size() && continues; i++) {
2208
165k
      SetOfModels models;
2209
165k
      rows[i].NonNullHypotheses(&models);
2210
165k
      switch (rows[i].GetLineType(row_model)) {
2211
45.3k
        case LT_START:
2212
45.3k
          run_length++;
2213
45.3k
          break;
2214
22
        case LT_MULTIPLE: // explicit fall-through
2215
114k
        case LT_BODY:
2216
114k
          run_length++;
2217
114k
          all_starts = false;
2218
114k
          break;
2219
5.47k
        case LT_UNKNOWN: // explicit fall-through
2220
5.47k
        default:
2221
5.47k
          continues = false;
2222
165k
      }
2223
165k
    }
2224
25.3k
    if (run_length > 2 || (!all_starts && run_length > 1)) {
2225
25.0k
      return false;
2226
25.0k
    }
2227
25.3k
  }
2228
328
  return true;
2229
25.3k
}
2230
2231
// Go through rows[row_start, row_end) and gather up sequences that need better
2232
// classification.
2233
// + Sequences of non-empty rows without hypotheses.
2234
// + Crown paragraphs not immediately followed by a strongly modeled line.
2235
// + Single line paragraphs surrounded by text that doesn't match the
2236
//   model.
2237
static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
2238
55.7k
                             std::vector<Interval> *to_fix, int row_start, int row_end) {
2239
55.7k
  to_fix->clear();
2240
626k
  for (int i = row_start; i < row_end; i++) {
2241
570k
    bool needs_fixing = false;
2242
2243
570k
    SetOfModels models;
2244
570k
    SetOfModels models_w_crowns;
2245
570k
    rows[i].StrongHypotheses(&models);
2246
570k
    rows[i].NonNullHypotheses(&models_w_crowns);
2247
570k
    if (models.empty() && !models_w_crowns.empty()) {
2248
      // Crown paragraph.  Is it followed by a modeled line?
2249
235k
      for (unsigned end = i + 1; end < rows.size(); end++) {
2250
227k
        SetOfModels end_models;
2251
227k
        SetOfModels strong_end_models;
2252
227k
        rows[end].NonNullHypotheses(&end_models);
2253
227k
        rows[end].StrongHypotheses(&strong_end_models);
2254
227k
        if (end_models.empty()) {
2255
15.9k
          needs_fixing = true;
2256
15.9k
          break;
2257
211k
        } else if (!strong_end_models.empty()) {
2258
7.11k
          needs_fixing = false;
2259
7.11k
          break;
2260
7.11k
        }
2261
227k
      }
2262
539k
    } else if (models.empty() && rows[i].ri_->num_words > 0) {
2263
      // No models at all.
2264
513k
      needs_fixing = true;
2265
513k
    }
2266
2267
570k
    if (!needs_fixing && !models.empty()) {
2268
25.3k
      needs_fixing = RowIsStranded(rows, i);
2269
25.3k
    }
2270
2271
570k
    if (needs_fixing) {
2272
530k
      if (!to_fix->empty() && to_fix->back().end == i - 1) {
2273
476k
        to_fix->back().end = i;
2274
476k
      } else {
2275
54.0k
        to_fix->push_back(Interval(i, i));
2276
54.0k
      }
2277
530k
    }
2278
570k
  }
2279
  // Convert inclusive intervals to half-open intervals.
2280
55.7k
  for (auto &i : *to_fix) {
2281
54.0k
    i.end = i.end + 1;
2282
54.0k
  }
2283
55.7k
}
2284
2285
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
2286
// normalize each row_owner to point to an actual PARA, and output the
2287
// paragraphs in order onto paragraphs.
2288
13.9k
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {
2289
13.9k
  std::vector<PARA *> &rows = *row_owners;
2290
13.9k
  paragraphs->clear();
2291
13.9k
  PARA_IT out(paragraphs);
2292
13.9k
  PARA *formerly_null = nullptr;
2293
156k
  for (unsigned i = 0; i < rows.size(); i++) {
2294
142k
    if (rows[i] == nullptr) {
2295
110k
      if (i == 0 || rows[i - 1] != formerly_null) {
2296
11.9k
        rows[i] = formerly_null = new PARA();
2297
98.0k
      } else {
2298
98.0k
        rows[i] = formerly_null;
2299
98.0k
        continue;
2300
98.0k
      }
2301
110k
    } else if (i > 0 && rows[i - 1] == rows[i]) {
2302
23.9k
      continue;
2303
23.9k
    }
2304
20.7k
    out.add_after_then_move(rows[i]);
2305
20.7k
  }
2306
13.9k
}
2307
2308
// Main entry point for Paragraph Detection Algorithm.
2309
//
2310
// Given a set of equally spaced textlines (described by row_infos),
2311
// Split them into paragraphs.
2312
//
2313
// Output:
2314
//   row_owners - one pointer for each row, to the paragraph it belongs to.
2315
//   paragraphs - this is the actual list of PARA objects.
2316
//   models - the list of paragraph models referenced by the PARA objects.
2317
//            caller is responsible for deleting the models.
2318
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
2319
                      std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
2320
13.9k
                      std::vector<ParagraphModel *> *models) {
2321
13.9k
  ParagraphTheory theory(models);
2322
2323
  // Initialize row_owners to be a bunch of nullptr pointers.
2324
13.9k
  row_owners->clear();
2325
13.9k
  row_owners->resize(row_infos->size());
2326
2327
  // Set up row scratch registers for the main algorithm.
2328
13.9k
  std::vector<RowScratchRegisters> rows(row_infos->size());
2329
156k
  for (unsigned i = 0; i < row_infos->size(); i++) {
2330
142k
    rows[i].Init((*row_infos)[i]);
2331
142k
  }
2332
2333
  // Pass 1:
2334
  //   Detect sequences of lines that all contain leader dots (.....)
2335
  //   These are likely Tables of Contents.  If there are three text lines in
2336
  //   a row with leader dots, it's pretty safe to say the middle one should
2337
  //   be a paragraph of its own.
2338
13.9k
  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2339
2340
13.9k
  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2341
2342
13.9k
  std::vector<Interval> leftovers;
2343
13.9k
  LeftoverSegments(rows, &leftovers, 0, rows.size());
2344
13.9k
  for (auto &leftover : leftovers) {
2345
    // Pass 2a:
2346
    //   Find any strongly evidenced start-of-paragraph lines.  If they're
2347
    //   followed by two lines that look like body lines, make a paragraph
2348
    //   model for that and see if that model applies throughout the text
2349
    //   (that is, "smear" it).
2350
13.9k
    StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2351
2352
    // Pass 2b:
2353
    //   If we had any luck in pass 2a, we got part of the page and didn't
2354
    //   know how to classify a few runs of rows. Take the segments that
2355
    //   didn't find a model and reprocess them individually.
2356
13.9k
    std::vector<Interval> leftovers2;
2357
13.9k
    LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
2358
13.9k
    bool pass2a_was_useful =
2359
13.9k
        leftovers2.size() > 1 ||
2360
13.9k
        (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
2361
13.9k
    if (pass2a_was_useful) {
2362
556
      for (auto &leftover2 : leftovers2) {
2363
556
        StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
2364
556
      }
2365
308
    }
2366
13.9k
  }
2367
2368
13.9k
  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2369
2370
  // Pass 3:
2371
  //   These are the dregs for which we didn't have enough strong textual
2372
  //   and geometric clues to form matching models for.  Let's see if
2373
  //   the geometric clues are simple enough that we could just use those.
2374
13.9k
  LeftoverSegments(rows, &leftovers, 0, rows.size());
2375
14.0k
  for (auto &leftover : leftovers) {
2376
14.0k
    GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2377
14.0k
  }
2378
2379
  // Undo any flush models for which there's little evidence.
2380
13.9k
  DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2381
2382
13.9k
  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2383
2384
  // Pass 4:
2385
  //   Take everything that's still not marked up well and clear all markings.
2386
13.9k
  LeftoverSegments(rows, &leftovers, 0, rows.size());
2387
13.9k
  for (auto &leftover : leftovers) {
2388
121k
    for (int j = leftover.begin; j < leftover.end; j++) {
2389
109k
      rows[j].SetUnknown();
2390
109k
    }
2391
11.9k
  }
2392
2393
13.9k
  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2394
2395
  // Convert all of the unique hypothesis runs to PARAs.
2396
13.9k
  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);
2397
2398
13.9k
  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2399
2400
  // Finally, clean up any dangling nullptr row paragraph parents.
2401
13.9k
  CanonicalizeDetectionResults(row_owners, paragraphs);
2402
13.9k
}
2403
2404
// ============ Code interfacing with the rest of Tesseract ==================
2405
2406
0
static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
2407
  // Set up text, lword_text, and rword_text (mostly for debug printing).
2408
0
  std::string fake_text;
2409
0
  PageIterator pit(static_cast<const PageIterator &>(it));
2410
0
  if (!pit.Empty(RIL_WORD)) {
2411
0
    bool first_word = true;
2412
0
    do {
2413
0
      fake_text += "x";
2414
0
      if (first_word) {
2415
0
        info->lword_text += "x";
2416
0
      }
2417
0
      info->rword_text += "x";
2418
0
      if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2419
0
          !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2420
0
        fake_text += " ";
2421
0
        info->rword_text = "";
2422
0
        first_word = false;
2423
0
      }
2424
0
    } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL));
2425
0
  }
2426
0
  if (fake_text.empty()) {
2427
0
    return;
2428
0
  }
2429
2430
0
  int lspaces = info->pix_ldistance / info->average_interword_space;
2431
0
  for (int i = 0; i < lspaces; i++) {
2432
0
    info->text += ' ';
2433
0
  }
2434
0
  info->text += fake_text;
2435
2436
  // Set up lword_box, rword_box, and num_words.
2437
0
  PAGE_RES_IT page_res_it = *it.PageResIt();
2438
0
  WERD_RES *word_res = page_res_it.restart_row();
2439
0
  ROW_RES *this_row = page_res_it.row();
2440
2441
0
  WERD_RES *lword = nullptr;
2442
0
  WERD_RES *rword = nullptr;
2443
0
  info->num_words = 0;
2444
0
  do {
2445
0
    if (word_res) {
2446
0
      if (!lword) {
2447
0
        lword = word_res;
2448
0
      }
2449
0
      if (rword != word_res) {
2450
0
        info->num_words++;
2451
0
      }
2452
0
      rword = word_res;
2453
0
    }
2454
0
    word_res = page_res_it.forward();
2455
0
  } while (page_res_it.row() == this_row);
2456
2457
0
  if (lword) {
2458
0
    info->lword_box = lword->word->bounding_box();
2459
0
  }
2460
0
  if (rword) {
2461
0
    info->rword_box = rword->word->bounding_box();
2462
0
  }
2463
0
}
2464
2465
// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2466
// detector RowInfo with all relevant information from the row.
2467
142k
static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) {
2468
142k
  if (it.PageResIt()->row() != nullptr) {
2469
142k
    ROW *row = it.PageResIt()->row()->row;
2470
142k
    info->pix_ldistance = row->lmargin();
2471
142k
    info->pix_rdistance = row->rmargin();
2472
142k
    info->average_interword_space =
2473
142k
        row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
2474
142k
    info->pix_xheight = row->x_height();
2475
142k
    info->has_leaders = false;
2476
142k
    info->has_drop_cap = row->has_drop_cap();
2477
142k
    info->ltr = true; // set below depending on word scripts
2478
142k
  } else {
2479
0
    info->pix_ldistance = info->pix_rdistance = 0;
2480
0
    info->average_interword_space = 1;
2481
0
    info->pix_xheight = 1.0;
2482
0
    info->has_leaders = false;
2483
0
    info->has_drop_cap = false;
2484
0
    info->ltr = true;
2485
0
  }
2486
2487
142k
  info->num_words = 0;
2488
142k
  info->lword_indicates_list_item = false;
2489
142k
  info->lword_likely_starts_idea = false;
2490
142k
  info->lword_likely_ends_idea = false;
2491
142k
  info->rword_indicates_list_item = false;
2492
142k
  info->rword_likely_starts_idea = false;
2493
142k
  info->rword_likely_ends_idea = false;
2494
142k
  info->has_leaders = false;
2495
142k
  info->ltr = true;
2496
2497
142k
  if (!after_recognition) {
2498
0
    InitializeTextAndBoxesPreRecognition(it, info);
2499
0
    return;
2500
0
  }
2501
142k
  info->text = "";
2502
142k
  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
2503
142k
  int trailing_ws_idx = strlen(text.get()); // strip trailing space
2504
285k
  while (trailing_ws_idx > 0 &&
2505
         // isspace() only takes ASCII
2506
285k
         isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
2507
142k
    trailing_ws_idx--;
2508
142k
  }
2509
142k
  if (trailing_ws_idx > 0) {
2510
142k
    int lspaces = info->pix_ldistance / info->average_interword_space;
2511
218k
    for (int i = 0; i < lspaces; i++) {
2512
75.7k
      info->text += ' ';
2513
75.7k
    }
2514
600k
    for (int i = 0; i < trailing_ws_idx; i++) {
2515
457k
      info->text += text[i];
2516
457k
    }
2517
142k
  }
2518
2519
142k
  if (info->text.empty()) {
2520
0
    return;
2521
0
  }
2522
2523
142k
  PAGE_RES_IT page_res_it = *it.PageResIt();
2524
142k
  std::vector<WERD_RES *> werds;
2525
142k
  WERD_RES *word_res = page_res_it.restart_row();
2526
142k
  ROW_RES *this_row = page_res_it.row();
2527
142k
  int num_leaders = 0;
2528
142k
  int ltr = 0;
2529
142k
  int rtl = 0;
2530
175k
  do {
2531
175k
    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2532
175k
      werds.push_back(word_res);
2533
175k
      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2534
175k
      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2535
175k
      if (word_res->word->flag(W_REP_CHAR)) {
2536
0
        num_leaders++;
2537
0
      }
2538
175k
    }
2539
175k
    word_res = page_res_it.forward();
2540
175k
  } while (page_res_it.row() == this_row);
2541
142k
  info->ltr = ltr >= rtl;
2542
142k
  info->has_leaders = num_leaders > 3;
2543
142k
  info->num_words = werds.size();
2544
142k
  if (!werds.empty()) {
2545
142k
    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2546
142k
    info->lword_text = lword->best_choice->unichar_string().c_str();
2547
142k
    info->rword_text = rword->best_choice->unichar_string().c_str();
2548
142k
    info->lword_box = lword->word->bounding_box();
2549
142k
    info->rword_box = rword->word->bounding_box();
2550
142k
    LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text,
2551
142k
                       &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
2552
142k
                       &info->lword_likely_ends_idea);
2553
142k
    RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text,
2554
142k
                        &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
2555
142k
                        &info->rword_likely_ends_idea);
2556
142k
  }
2557
142k
}
2558
2559
// This is called after rows have been identified and words are recognized.
2560
// Much of this could be implemented before word recognition, but text helps
2561
// to identify bulleted lists and gives good signals for sentence boundaries.
2562
void DetectParagraphs(int debug_level, bool after_text_recognition,
2563
13.9k
                      const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {
2564
  // Clear out any preconceived notions.
2565
13.9k
  if (block_start->Empty(RIL_TEXTLINE)) {
2566
7
    return;
2567
7
  }
2568
13.9k
  BLOCK *block = block_start->PageResIt()->block()->block;
2569
13.9k
  block->para_list()->clear();
2570
13.9k
  bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
2571
2572
  // Convert the Tesseract structures to RowInfos
2573
  // for the paragraph detection algorithm.
2574
13.9k
  MutableIterator row(*block_start);
2575
13.9k
  if (row.Empty(RIL_TEXTLINE)) {
2576
0
    return; // end of input already.
2577
0
  }
2578
2579
13.9k
  std::vector<RowInfo> row_infos;
2580
142k
  do {
2581
142k
    if (!row.PageResIt()->row()) {
2582
0
      continue; // empty row.
2583
0
    }
2584
142k
    row.PageResIt()->row()->row->set_para(nullptr);
2585
142k
    row_infos.emplace_back();
2586
142k
    RowInfo &ri = row_infos.back();
2587
142k
    InitializeRowInfo(after_text_recognition, row, &ri);
2588
142k
  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE));
2589
2590
  // If we're called before text recognition, we might not have
2591
  // tight block bounding boxes, so trim by the minimum on each side.
2592
13.9k
  if (!row_infos.empty()) {
2593
13.9k
    int min_lmargin = row_infos[0].pix_ldistance;
2594
13.9k
    int min_rmargin = row_infos[0].pix_rdistance;
2595
142k
    for (unsigned i = 1; i < row_infos.size(); i++) {
2596
128k
      if (row_infos[i].pix_ldistance < min_lmargin) {
2597
1.94k
        min_lmargin = row_infos[i].pix_ldistance;
2598
1.94k
      }
2599
128k
      if (row_infos[i].pix_rdistance < min_rmargin) {
2600
4.08k
        min_rmargin = row_infos[i].pix_rdistance;
2601
4.08k
      }
2602
128k
    }
2603
13.9k
    if (min_lmargin > 0 || min_rmargin > 0) {
2604
66.2k
      for (auto &row_info : row_infos) {
2605
66.2k
        row_info.pix_ldistance -= min_lmargin;
2606
66.2k
        row_info.pix_rdistance -= min_rmargin;
2607
66.2k
      }
2608
6.76k
    }
2609
13.9k
  }
2610
2611
  // Run the paragraph detection algorithm.
2612
13.9k
  std::vector<PARA *> row_owners;
2613
13.9k
  if (!is_image_block) {
2614
13.9k
    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
2615
13.9k
  } else {
2616
0
    row_owners.resize(row_infos.size());
2617
0
    CanonicalizeDetectionResults(&row_owners, block->para_list());
2618
0
  }
2619
2620
  // Now stitch in the row_owners into the rows.
2621
13.9k
  row = *block_start;
2622
142k
  for (auto &row_owner : row_owners) {
2623
142k
    while (!row.PageResIt()->row()) {
2624
0
      row.Next(RIL_TEXTLINE);
2625
0
    }
2626
142k
    row.PageResIt()->row()->row->set_para(row_owner);
2627
142k
    row.Next(RIL_TEXTLINE);
2628
142k
  }
2629
13.9k
}
2630
2631
} // namespace tesseract