/src/tesseract/src/ccmain/paragraphs.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /**********************************************************************  | 
2  |  |  * File:        paragraphs.cpp  | 
3  |  |  * Description: Paragraph detection for tesseract.  | 
4  |  |  * Author:      David Eger  | 
5  |  |  *  | 
6  |  |  * (C) Copyright 2011, Google Inc.  | 
7  |  |  ** Licensed under the Apache License, Version 2.0 (the "License");  | 
8  |  |  ** you may not use this file except in compliance with the License.  | 
9  |  |  ** You may obtain a copy of the License at  | 
10  |  |  ** http://www.apache.org/licenses/LICENSE-2.0  | 
11  |  |  ** Unless required by applicable law or agreed to in writing, software  | 
12  |  |  ** distributed under the License is distributed on an "AS IS" BASIS,  | 
13  |  |  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
14  |  |  ** See the License for the specific language governing permissions and  | 
15  |  |  ** limitations under the License.  | 
16  |  |  *  | 
17  |  |  **********************************************************************/  | 
18  |  |  | 
19  |  | #include "paragraphs.h"  | 
20  |  |  | 
21  |  | #include "helpers.h"             // for UpdateRange, ClipToRange  | 
22  |  | #include "host.h"                // for NearlyEqual  | 
23  |  | #include "mutableiterator.h"     // for MutableIterator  | 
24  |  | #include "ocrblock.h"            // for BLOCK  | 
25  |  | #include "ocrpara.h"             // for ParagraphModel, PARA, PARA_IT, PARA...  | 
26  |  | #include "ocrrow.h"              // for ROW  | 
27  |  | #include "pageres.h"             // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...  | 
28  |  | #include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels  | 
29  |  | #include "pdblock.h"             // for PDBLK  | 
30  |  | #include "polyblk.h"             // for POLY_BLOCK  | 
31  |  | #include "ratngs.h"              // for WERD_CHOICE  | 
32  |  | #include "rect.h"                // for TBOX  | 
33  |  | #include "statistc.h"            // for STATS  | 
34  |  | #include "tesserrstream.h"       // for tesserr  | 
35  |  | #include "tprintf.h"             // for tprintf  | 
36  |  | #include "unicharset.h"          // for UNICHARSET  | 
37  |  | #include "werd.h"                // for WERD, W_REP_CHAR  | 
38  |  |  | 
39  |  | #include <tesseract/pageiterator.h> // for PageIterator  | 
40  |  | #include <tesseract/publictypes.h>  // for JUSTIFICATION_LEFT, JUSTIFICATION_R...  | 
41  |  | #include <tesseract/unichar.h>      // for UNICHAR, UNICHAR_ID  | 
42  |  |  | 
43  |  | #include <algorithm> // for max  | 
44  |  | #include <cctype>    // for isspace  | 
45  |  | #include <cmath>     // for abs  | 
46  |  | #include <cstdio>    // for snprintf  | 
47  |  | #include <cstdlib>   // for abs  | 
48  |  | #include <cstring>   // for strchr, strlen  | 
49  |  | #include <memory>    // for unique_ptr  | 
50  |  |  | 
// Directional-formatting control characters used by RtlEmbed() when
// rendering debug text.
// NOTE(review): U+202A is LEFT-TO-RIGHT EMBEDDING per the Unicode Standard;
// the name and original comment say "Right-to-Left Embedding" (U+202B) —
// confirm which codepoint is actually intended before changing either.
static const char *const kRLE = "\u202A"; // Right-to-Left Embedding
static const char *const kPDF = "\u202C"; // Pop Directional Formatting (U+202C)
53  |  |  | 
54  |  | namespace tesseract { | 
55  |  |  | 
// Special "weak" ParagraphModels.
// These are sentinel tag values, compared by address only (see the
// kCrownLeft/kCrownRight comparisons in AppendDebugInfo and the StrongModel
// checks elsewhere in this file); the code never dereferences them.  They
// mark rows explained by a left/right "crown" hypothesis rather than a real,
// fully-specified ParagraphModel.
const ParagraphModel *kCrownLeft =
    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
const ParagraphModel *kCrownRight =
    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
61  |  |  | 
62  |  | // Do the text and geometry of two rows support a paragraph break between them?  | 
63  |  | static bool LikelyParagraphStart(const RowScratchRegisters &before,  | 
64  |  |                                  const RowScratchRegisters &after,  | 
65  |  |                                  tesseract::ParagraphJustification j);  | 
66  |  |  | 
// Given the width of a typical space between words, return the threshold by
// which we think left and right alignments for paragraphs can vary and still
// be considered aligned (4/5 of a space, integer-truncated).
static int Epsilon(int space_pix) {
  const int scaled = 4 * space_pix;
  return scaled / 5;
}
73  |  |  | 
74  |  | static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,  | 
75  |  |                               const std::vector<RowScratchRegisters> *rows, int row_start,  | 
76  | 123k  |                               int row_end) { | 
77  | 123k  |   if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) { | 
78  | 0  |     tesserr << "Invalid arguments rows[" << row_start << ", " << row_end  | 
79  | 0  |             << ") while rows is of size " << rows->size() << ".\n";  | 
80  | 0  |     return false;  | 
81  | 0  |   }  | 
82  | 123k  |   if (row_end - row_start < min_num_rows) { | 
83  | 6.06k  |     if (debug_level > 1) { | 
84  | 0  |       tprintf("# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name); | 
85  | 0  |     }  | 
86  | 6.06k  |     return false;  | 
87  | 6.06k  |   }  | 
88  | 117k  |   return true;  | 
89  | 123k  | }  | 
90  |  |  | 
91  |  | // =============================== Debug Code ================================  | 
92  |  |  | 
93  |  | // Given a row-major matrix of unicode text and a column separator, print  | 
94  |  | // a formatted table.  For ASCII, we get good column alignment.  | 
95  | 0  | static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) { | 
96  | 0  |   std::vector<int> max_col_widths;  | 
97  | 0  |   for (const auto &row : rows) { | 
98  | 0  |     auto num_columns = row.size();  | 
99  | 0  |     for (size_t c = 0; c < num_columns; c++) { | 
100  | 0  |       int num_unicodes = 0;  | 
101  | 0  |       for (char i : row[c]) { | 
102  | 0  |         if ((i & 0xC0) != 0x80) { | 
103  | 0  |           num_unicodes++;  | 
104  | 0  |         }  | 
105  | 0  |       }  | 
106  | 0  |       if (c >= max_col_widths.size()) { | 
107  | 0  |         max_col_widths.push_back(num_unicodes);  | 
108  | 0  |       } else { | 
109  | 0  |         if (num_unicodes > max_col_widths[c]) { | 
110  | 0  |           max_col_widths[c] = num_unicodes;  | 
111  | 0  |         }  | 
112  | 0  |       }  | 
113  | 0  |     }  | 
114  | 0  |   }  | 
115  |  | 
  | 
116  | 0  |   std::vector<std::string> col_width_patterns;  | 
117  | 0  |   col_width_patterns.reserve(max_col_widths.size());  | 
118  | 0  |   for (int max_col_width : max_col_widths) { | 
119  | 0  |     col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_width) + "s"); | 
120  | 0  |   }  | 
121  |  | 
  | 
122  | 0  |   for (const auto &row : rows) { | 
123  | 0  |     for (unsigned c = 0; c < row.size(); c++) { | 
124  | 0  |       if (c > 0) { | 
125  | 0  |         tprintf("%s", colsep); | 
126  | 0  |       }  | 
127  | 0  |       tprintf(col_width_patterns[c].c_str(), row[c].c_str());  | 
128  | 0  |     }  | 
129  | 0  |     tprintf("\n"); | 
130  | 0  |   }  | 
131  | 0  | }  | 
132  |  |  | 
133  | 0  | static std::string RtlEmbed(const std::string &word, bool rtlify) { | 
134  | 0  |   if (rtlify) { | 
135  | 0  |     return std::string(kRLE) + word + std::string(kPDF);  | 
136  | 0  |   }  | 
137  | 0  |   return word;  | 
138  | 0  | }  | 
139  |  |  | 
140  |  | // Print the current thoughts of the paragraph detector.  | 
141  |  | static void PrintDetectorState(const ParagraphTheory &theory,  | 
142  | 0  |                                const std::vector<RowScratchRegisters> &rows) { | 
143  | 0  |   std::vector<std::vector<std::string>> output;  | 
144  | 0  |   output.emplace_back();  | 
145  | 0  |   output.back().push_back("#row"); | 
146  | 0  |   output.back().push_back("space"); | 
147  | 0  |   output.back().push_back(".."); | 
148  | 0  |   output.back().push_back("lword[widthSEL]"); | 
149  | 0  |   output.back().push_back("rword[widthSEL]"); | 
150  | 0  |   RowScratchRegisters::AppendDebugHeaderFields(output.back());  | 
151  | 0  |   output.back().push_back("text"); | 
152  |  | 
  | 
153  | 0  |   for (unsigned i = 0; i < rows.size(); i++) { | 
154  | 0  |     output.emplace_back();  | 
155  | 0  |     std::vector<std::string> &row = output.back();  | 
156  | 0  |     const RowInfo &ri = *rows[i].ri_;  | 
157  | 0  |     row.push_back(std::to_string(i));  | 
158  | 0  |     row.push_back(std::to_string(ri.average_interword_space));  | 
159  | 0  |     row.emplace_back(ri.has_leaders ? ".." : " ");  | 
160  | 0  |     row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +  | 
161  | 0  |                   (ri.lword_likely_starts_idea ? "S" : "s") +  | 
162  | 0  |                   (ri.lword_likely_ends_idea ? "E" : "e") +  | 
163  | 0  |                   (ri.lword_indicates_list_item ? "L" : "l") + "]");  | 
164  | 0  |     row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +  | 
165  | 0  |                   (ri.rword_likely_starts_idea ? "S" : "s") +  | 
166  | 0  |                   (ri.rword_likely_ends_idea ? "E" : "e") +  | 
167  | 0  |                   (ri.rword_indicates_list_item ? "L" : "l") + "]");  | 
168  | 0  |     rows[i].AppendDebugInfo(theory, row);  | 
169  | 0  |     row.push_back(RtlEmbed(ri.text, !ri.ltr));  | 
170  | 0  |   }  | 
171  | 0  |   PrintTable(output, " ");  | 
172  |  | 
  | 
173  | 0  |   tprintf("Active Paragraph Models:\n"); | 
174  | 0  |   unsigned m = 0;  | 
175  | 0  |   for (const auto &model : theory.models()) { | 
176  | 0  |     tprintf(" %d: %s\n", ++m, model->ToString().c_str()); | 
177  | 0  |   }  | 
178  | 0  | }  | 
179  |  |  | 
180  |  | static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,  | 
181  | 94.7k  |                       const std::vector<RowScratchRegisters> &rows) { | 
182  | 94.7k  |   if (!should_print) { | 
183  | 94.7k  |     return;  | 
184  | 94.7k  |   }  | 
185  | 0  |   tprintf("# %s\n", phase); | 
186  | 0  |   PrintDetectorState(theory, rows);  | 
187  | 0  | }  | 
188  |  |  | 
189  |  | // Print out the text for rows[row_start, row_end)  | 
190  |  | static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,  | 
191  | 0  |                           int row_end) { | 
192  | 0  |   tprintf("======================================\n"); | 
193  | 0  |   for (int row = row_start; row < row_end; row++) { | 
194  | 0  |     tprintf("%s\n", rows[row].ri_->text.c_str()); | 
195  | 0  |   }  | 
196  | 0  |   tprintf("======================================\n"); | 
197  | 0  | }  | 
198  |  |  | 
199  |  | // ============= Brain Dead Language Model (ASCII Version) ===================  | 
200  |  |  | 
// ASCII-only letter test (deliberately locale-independent, unlike isalpha).
static bool IsLatinLetter(int ch) {
  const bool lower = ch >= 'a' && ch <= 'z';
  const bool upper = ch >= 'A' && ch <= 'Z';
  return lower || upper;
}
204  |  |  | 
// Is ch a letter that OCR commonly produces in place of a digit
// (o/O for 0, l/I for 1)?  Note real digits are NOT digit-like.
static bool IsDigitLike(int ch) {
  switch (ch) {
    case 'o':
    case 'O':
    case 'l':
    case 'I':
      return true;
    default:
      return false;
  }
}
208  |  |  | 
// Is ch punctuation that typically opens a quotation or bracket?
static bool IsOpeningPunct(int ch) {
  static const char kOpeners[] = "'\"({[";
  return strchr(kOpeners, ch) != nullptr;
}
212  |  |  | 
// Is ch punctuation that typically ends a sentence or closes a bracket?
static bool IsTerminalPunct(int ch) {
  static const char kTerminators[] = ":'\".?!]})";
  return strchr(kTerminators, ch) != nullptr;
}
216  |  |  | 
// Advance over the leading characters of 'str' that appear in 'toskip';
// return a pointer to the first character not skipped.
static const char *SkipChars(const char *str, const char *toskip) {
  for (; *str != '\0' && strchr(toskip, *str) != nullptr; str++) {
  }
  return str;
}
224  |  |  | 
// Advance over the leading characters of 'str' for which the predicate
// 'skip' returns true; return a pointer to the first rejected character.
static const char *SkipChars(const char *str, bool (*skip)(int)) {
  for (; *str != '\0' && skip(*str); str++) {
  }
  return str;
}
231  |  |  | 
// Advance past at most ONE character of 'str' if it appears in 'toskip'.
static const char *SkipOne(const char *str, const char *toskip) {
  const bool matched = *str != '\0' && strchr(toskip, *str) != nullptr;
  return matched ? str + 1 : str;
}
238  |  |  | 
239  |  | // Return whether it is very likely that this is a numeral marker that could  | 
240  |  | // start a list item.  Some examples include:  | 
241  |  | //   A   I   iii.   VI   (2)   3.5.   [C-4]  | 
242  | 0  | static bool LikelyListNumeral(const std::string &word) { | 
243  | 0  |   const char *kRomans = "ivxlmdIVXLMD";  | 
244  | 0  |   const char *kDigits = "012345789";  | 
245  | 0  |   const char *kOpen = "[{("; | 
246  | 0  |   const char *kSep = ":;-.,";  | 
247  | 0  |   const char *kClose = "]})";  | 
248  |  | 
  | 
249  | 0  |   int num_segments = 0;  | 
250  | 0  |   const char *pos = word.c_str();  | 
251  | 0  |   while (*pos != '\0' && num_segments < 3) { | 
252  |  |     // skip up to two open parens.  | 
253  | 0  |     const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);  | 
254  | 0  |     const char *numeral_end = SkipChars(numeral_start, kRomans);  | 
255  | 0  |     if (numeral_end != numeral_start) { | 
256  |  |       // Got Roman Numeral. Great.  | 
257  | 0  |     } else { | 
258  | 0  |       numeral_end = SkipChars(numeral_start, kDigits);  | 
259  | 0  |       if (numeral_end == numeral_start) { | 
260  |  |         // If there's a single latin letter, we can use that.  | 
261  | 0  |         numeral_end = SkipChars(numeral_start, IsLatinLetter);  | 
262  | 0  |         if (numeral_end - numeral_start != 1) { | 
263  | 0  |           break;  | 
264  | 0  |         }  | 
265  | 0  |       }  | 
266  | 0  |     }  | 
267  |  |     // We got some sort of numeral.  | 
268  | 0  |     num_segments++;  | 
269  |  |     // Skip any trailing parens or punctuation.  | 
270  | 0  |     pos = SkipChars(SkipChars(numeral_end, kClose), kSep);  | 
271  | 0  |     if (pos == numeral_end) { | 
272  | 0  |       break;  | 
273  | 0  |     }  | 
274  | 0  |   }  | 
275  | 0  |   return *pos == '\0';  | 
276  | 0  | }  | 
277  |  |  | 
// Is 'word' a single character that commonly serves as a list bullet
// (or an OCR misreading of one)?
static bool LikelyListMark(const std::string &word) {
  static const char kListMarks[] = "0Oo*.,+.";
  if (word.size() != 1) {
    return false;
  }
  return strchr(kListMarks, word[0]) != nullptr;
}
282  |  |  | 
283  | 0  | bool AsciiLikelyListItem(const std::string &word) { | 
284  | 0  |   return LikelyListMark(word) || LikelyListNumeral(word);  | 
285  | 0  | }  | 
286  |  |  | 
287  |  | // ========== Brain Dead Language Model (Tesseract Version) ================  | 
288  |  |  | 
289  |  | // Return the first Unicode Codepoint from werd[pos].  | 
290  | 651k  | static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) { | 
291  | 651k  |   if (!u || !werd || pos > werd->length()) { | 
292  | 0  |     return 0;  | 
293  | 0  |   }  | 
294  | 651k  |   return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();  | 
295  | 651k  | }  | 
296  |  |  | 
// A useful helper class for finding the first j >= i so that word[j]
// does not have given character type.
class UnicodeSpanSkipper {
public:
  // Neither pointer is owned; both must outlive this object.
  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
      : u_(unicharset), word_(word), wordlen_(word->length()) {
  }

  // Given an input position, return the first position >= pos not punc.
  unsigned SkipPunc(unsigned pos);
  // Given an input position, return the first position >= pos not digit.
  unsigned SkipDigits(unsigned pos);
  // Given an input position, return the first position >= pos not roman.
  unsigned SkipRomans(unsigned pos);
  // Given an input position, return the first position >= pos not alpha.
  unsigned SkipAlpha(unsigned pos);

private:
  const UNICHARSET *u_;     // unowned: used to classify unichar ids
  const WERD_CHOICE *word_; // unowned: the word being scanned
  unsigned wordlen_;        // cached word_->length()
};
319  |  |  | 
320  | 459k  | unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) { | 
321  | 572k  |   while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) { | 
322  | 112k  |     pos++;  | 
323  | 112k  |   }  | 
324  | 459k  |   return pos;  | 
325  | 459k  | }  | 
326  |  |  | 
327  | 163k  | unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) { | 
328  | 193k  |   while (pos < wordlen_ &&  | 
329  | 193k  |          (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) { | 
330  | 29.6k  |     pos++;  | 
331  | 29.6k  |   }  | 
332  | 163k  |   return pos;  | 
333  | 163k  | }  | 
334  |  |  | 
335  | 278k  | unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) { | 
336  | 278k  |   const char *kRomans = "ivxlmdIVXLMD";  | 
337  | 472k  |   while (pos < wordlen_) { | 
338  | 336k  |     int ch = UnicodeFor(u_, word_, pos);  | 
339  | 336k  |     if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) { | 
340  | 142k  |       break;  | 
341  | 142k  |     }  | 
342  | 193k  |     pos++;  | 
343  | 193k  |   }  | 
344  | 278k  |   return pos;  | 
345  | 278k  | }  | 
346  |  |  | 
347  | 140k  | unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) { | 
348  | 240k  |   while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) { | 
349  | 99.6k  |     pos++;  | 
350  | 99.6k  |   }  | 
351  | 140k  |   return pos;  | 
352  | 140k  | }  | 
353  |  |  | 
354  | 214k  | static bool LikelyListMarkUnicode(int ch) { | 
355  | 214k  |   if (ch < 0x80) { | 
356  | 209k  |     std::string single_ch;  | 
357  | 209k  |     single_ch += ch;  | 
358  | 209k  |     return LikelyListMark(single_ch);  | 
359  | 209k  |   }  | 
360  | 5.82k  |   switch (ch) { | 
361  |  |     // TODO(eger) expand this list of unicodes as needed.  | 
362  | 6  |     case 0x00B0: // degree sign  | 
363  | 6  |     case 0x2022: // bullet  | 
364  | 6  |     case 0x25E6: // white bullet  | 
365  | 6  |     case 0x00B7: // middle dot  | 
366  | 6  |     case 0x25A1: // white square  | 
367  | 6  |     case 0x25A0: // black square  | 
368  | 6  |     case 0x25AA: // black small square  | 
369  | 6  |     case 0x2B1D: // black very small square  | 
370  | 6  |     case 0x25BA: // black right-pointing pointer  | 
371  | 6  |     case 0x25CF: // black circle  | 
372  | 6  |     case 0x25CB: // white circle  | 
373  | 6  |       return true;  | 
374  | 5.82k  |     default:  | 
375  | 5.82k  |       break; // fall through  | 
376  | 5.82k  |   }  | 
377  | 5.82k  |   return false;  | 
378  | 5.82k  | }  | 
379  |  |  | 
// Return whether it is very likely that this is a numeral marker that could
// start a list item.  Some examples include:
//   A   I   iii.   VI   (2)   3.5.   [C-4]
static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
  // A single bullet-like character qualifies all by itself.
  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
    return true;
  }

  // Otherwise accept up to three "numeral segments", each optionally led by
  // at most one punctuation character and followed by trailing punctuation
  // (covers forms like "3.5." and "[C-4]").
  UnicodeSpanSkipper m(u, werd);
  int num_segments = 0;
  unsigned pos = 0;
  while (pos < werd->length() && num_segments < 3) {
    auto numeral_start = m.SkipPunc(pos);
    // More than one leading punctuation character: not a list marker.
    if (numeral_start > pos + 1) {
      break;
    }
    // Prefer roman numerals, then digits, then a single latin letter.
    auto numeral_end = m.SkipRomans(numeral_start);
    if (numeral_end == numeral_start) {
      numeral_end = m.SkipDigits(numeral_start);
      if (numeral_end == numeral_start) {
        // If there's a single latin letter, we can use that.
        numeral_end = m.SkipAlpha(numeral_start);
        if (numeral_end - numeral_start != 1) {
          break;
        }
      }
    }
    // We got some sort of numeral.
    num_segments++;
    // Skip any trailing punctuation; if none was present, the word must end
    // exactly here to qualify.
    pos = m.SkipPunc(numeral_end);
    if (pos == numeral_end) {
      break;
    }
  }
  // Likely a list item only if the entire word was consumed.
  return pos == werd->length();
}
417  |  |  | 
// Append 'data' to 'vector' unless an equal element is already present.
// Preserves insertion order; membership test is a linear scan.
template <class T>
void push_back_new(std::vector<T> &vector, const T &data) {
  auto pos = std::find(vector.begin(), vector.end(), data);
  if (pos == vector.end()) {
    vector.push_back(data);
  }
}
424  |  |  | 
425  |  | // ========= Brain Dead Language Model (combined entry points) ================  | 
426  |  |  | 
427  |  | // Given the leftmost word of a line either as a Tesseract unicharset + werd  | 
428  |  | // or a utf8 string, set the following attributes for it:  | 
429  |  | //   is_list -      this word might be a list number or bullet.  | 
430  |  | //   starts_idea -  this word is likely to start a sentence.  | 
431  |  | //   ends_idea -    this word is likely to end a sentence.  | 
432  |  | void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,  | 
433  | 142k  |                         bool *is_list, bool *starts_idea, bool *ends_idea) { | 
434  | 142k  |   *is_list = false;  | 
435  | 142k  |   *starts_idea = false;  | 
436  | 142k  |   *ends_idea = false;  | 
437  | 142k  |   if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty | 
438  | 0  |     *ends_idea = true;  | 
439  | 0  |     return;  | 
440  | 0  |   }  | 
441  |  |  | 
442  | 142k  |   if (unicharset && werd) { // We have a proper werd and unicharset so use it. | 
443  | 142k  |     if (UniLikelyListItem(unicharset, werd)) { | 
444  | 79.9k  |       *is_list = true;  | 
445  | 79.9k  |       *starts_idea = true;  | 
446  | 79.9k  |       *ends_idea = true;  | 
447  | 79.9k  |     }  | 
448  | 142k  |     if (unicharset->get_isupper(werd->unichar_id(0))) { | 
449  | 24.9k  |       *starts_idea = true;  | 
450  | 24.9k  |     }  | 
451  | 142k  |     if (unicharset->get_ispunctuation(werd->unichar_id(0))) { | 
452  | 32.8k  |       *starts_idea = true;  | 
453  | 32.8k  |       *ends_idea = true;  | 
454  | 32.8k  |     }  | 
455  | 142k  |   } else { // Assume utf8 is mostly ASCII | 
456  | 0  |     if (AsciiLikelyListItem(utf8)) { | 
457  | 0  |       *is_list = true;  | 
458  | 0  |       *starts_idea = true;  | 
459  | 0  |     }  | 
460  | 0  |     int start_letter = utf8[0];  | 
461  | 0  |     if (IsOpeningPunct(start_letter)) { | 
462  | 0  |       *starts_idea = true;  | 
463  | 0  |     }  | 
464  | 0  |     if (IsTerminalPunct(start_letter)) { | 
465  | 0  |       *ends_idea = true;  | 
466  | 0  |     }  | 
467  | 0  |     if (start_letter >= 'A' && start_letter <= 'Z') { | 
468  | 0  |       *starts_idea = true;  | 
469  | 0  |     }  | 
470  | 0  |   }  | 
471  | 142k  | }  | 
472  |  |  | 
473  |  | // Given the rightmost word of a line either as a Tesseract unicharset + werd  | 
474  |  | // or a utf8 string, set the following attributes for it:  | 
475  |  | //   is_list -      this word might be a list number or bullet.  | 
476  |  | //   starts_idea -  this word is likely to start a sentence.  | 
477  |  | //   ends_idea -    this word is likely to end a sentence.  | 
478  |  | void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,  | 
479  | 142k  |                          bool *is_list, bool *starts_idea, bool *ends_idea) { | 
480  | 142k  |   *is_list = false;  | 
481  | 142k  |   *starts_idea = false;  | 
482  | 142k  |   *ends_idea = false;  | 
483  | 142k  |   if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty | 
484  | 0  |     *ends_idea = true;  | 
485  | 0  |     return;  | 
486  | 0  |   }  | 
487  |  |  | 
488  | 142k  |   if (unicharset && werd) { // We have a proper werd and unicharset so use it. | 
489  | 142k  |     if (UniLikelyListItem(unicharset, werd)) { | 
490  | 80.0k  |       *is_list = true;  | 
491  | 80.0k  |       *starts_idea = true;  | 
492  | 80.0k  |     }  | 
493  | 142k  |     UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);  | 
494  | 142k  |     if (unicharset->get_ispunctuation(last_letter)) { | 
495  | 33.3k  |       *ends_idea = true;  | 
496  | 33.3k  |     }  | 
497  | 142k  |   } else { // Assume utf8 is mostly ASCII | 
498  | 0  |     if (AsciiLikelyListItem(utf8)) { | 
499  | 0  |       *is_list = true;  | 
500  | 0  |       *starts_idea = true;  | 
501  | 0  |     }  | 
502  | 0  |     int last_letter = utf8[utf8.size() - 1];  | 
503  | 0  |     if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) { | 
504  | 0  |       *ends_idea = true;  | 
505  | 0  |     }  | 
506  | 0  |   }  | 
507  | 142k  | }  | 
508  |  |  | 
509  |  | // =============== Implementation of RowScratchRegisters =====================  | 
510  |  | /* static */  | 
511  | 0  | void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) { | 
512  | 0  |   header.emplace_back("[lmarg,lind;rind,rmarg]"); | 
513  | 0  |   header.emplace_back("model"); | 
514  | 0  | }  | 
515  |  |  | 
516  |  | void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,  | 
517  | 0  |                                           std::vector<std::string> &dbg) const { | 
518  | 0  |   char s[60];  | 
519  |  |   // The largest (positive and negative) numbers are reported for lindent & rindent.  | 
520  |  |   // While the column header has widths 5,4,4,5, it is therefore opportune to slightly  | 
521  |  |   // offset the widths in the format string here to allow ample space for lindent & rindent  | 
522  |  |   // while keeping the final table output nicely readable: 4,5,5,4.  | 
523  | 0  |   snprintf(s, sizeof(s), "[%4d,%5d;%5d,%4d]", lmargin_, lindent_, rindent_, rmargin_);  | 
524  | 0  |   dbg.emplace_back(s);  | 
525  | 0  |   std::string model_string;  | 
526  | 0  |   model_string += static_cast<char>(GetLineType());  | 
527  | 0  |   model_string += ":";  | 
528  |  | 
  | 
529  | 0  |   int model_numbers = 0;  | 
530  | 0  |   for (const auto &hypothese : hypotheses_) { | 
531  | 0  |     if (hypothese.model == nullptr) { | 
532  | 0  |       continue;  | 
533  | 0  |     }  | 
534  | 0  |     if (model_numbers > 0) { | 
535  | 0  |       model_string += ",";  | 
536  | 0  |     }  | 
537  | 0  |     if (StrongModel(hypothese.model)) { | 
538  | 0  |       model_string += std::to_string(1 + theory.IndexOf(hypothese.model));  | 
539  | 0  |     } else if (hypothese.model == kCrownLeft) { | 
540  | 0  |       model_string += "CrL";  | 
541  | 0  |     } else if (hypothese.model == kCrownRight) { | 
542  | 0  |       model_string += "CrR";  | 
543  | 0  |     }  | 
544  | 0  |     model_numbers++;  | 
545  | 0  |   }  | 
546  | 0  |   if (model_numbers == 0) { | 
547  | 0  |     model_string += "0";  | 
548  | 0  |   }  | 
549  |  | 
  | 
550  | 0  |   dbg.push_back(model_string);  | 
551  | 0  | }  | 
552  |  |  | 
553  | 142k  | void RowScratchRegisters::Init(const RowInfo &row) { | 
554  | 142k  |   ri_ = &row;  | 
555  | 142k  |   lmargin_ = 0;  | 
556  | 142k  |   lindent_ = row.pix_ldistance;  | 
557  | 142k  |   rmargin_ = 0;  | 
558  | 142k  |   rindent_ = row.pix_rdistance;  | 
559  | 142k  | }  | 
560  |  |  | 
561  | 680k  | LineType RowScratchRegisters::GetLineType() const { | 
562  | 680k  |   if (hypotheses_.empty()) { | 
563  | 475k  |     return LT_UNKNOWN;  | 
564  | 475k  |   }  | 
565  | 205k  |   bool has_start = false;  | 
566  | 205k  |   bool has_body = false;  | 
567  | 205k  |   for (const auto &hypothese : hypotheses_) { | 
568  | 205k  |     switch (hypothese.ty) { | 
569  | 54.4k  |       case LT_START:  | 
570  | 54.4k  |         has_start = true;  | 
571  | 54.4k  |         break;  | 
572  | 151k  |       case LT_BODY:  | 
573  | 151k  |         has_body = true;  | 
574  | 151k  |         break;  | 
575  | 0  |       default:  | 
576  | 0  |         tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty); | 
577  | 0  |         break;  | 
578  | 205k  |     }  | 
579  | 205k  |   }  | 
580  | 205k  |   if (has_start && has_body) { | 
581  | 271  |     return LT_MULTIPLE;  | 
582  | 271  |   }  | 
583  | 204k  |   return has_start ? LT_START : LT_BODY;  | 
584  | 205k  | }  | 
585  |  |  | 
586  | 399k  | LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const { | 
587  | 399k  |   if (hypotheses_.empty()) { | 
588  | 9.52k  |     return LT_UNKNOWN;  | 
589  | 9.52k  |   }  | 
590  | 389k  |   bool has_start = false;  | 
591  | 389k  |   bool has_body = false;  | 
592  | 391k  |   for (const auto &hypothese : hypotheses_) { | 
593  | 391k  |     if (hypothese.model != model) { | 
594  | 68.1k  |       continue;  | 
595  | 68.1k  |     }  | 
596  | 322k  |     switch (hypothese.ty) { | 
597  | 113k  |       case LT_START:  | 
598  | 113k  |         has_start = true;  | 
599  | 113k  |         break;  | 
600  | 209k  |       case LT_BODY:  | 
601  | 209k  |         has_body = true;  | 
602  | 209k  |         break;  | 
603  | 0  |       default:  | 
604  | 0  |         tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty); | 
605  | 0  |         break;  | 
606  | 322k  |     }  | 
607  | 322k  |   }  | 
608  | 389k  |   if (has_start && has_body) { | 
609  | 66  |     return LT_MULTIPLE;  | 
610  | 66  |   }  | 
611  | 389k  |   return has_start ? LT_START : LT_BODY;  | 
612  | 389k  | }  | 
613  |  |  | 
614  | 10.6k  | void RowScratchRegisters::SetStartLine() { | 
615  | 10.6k  |   LineType current_lt = GetLineType();  | 
616  | 10.6k  |   if (current_lt != LT_UNKNOWN && current_lt != LT_START) { | 
617  | 0  |     tprintf("Trying to set a line to be START when it's already BODY.\n"); | 
618  | 0  |   }  | 
619  | 10.6k  |   if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { | 
620  | 10.6k  |     push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));  | 
621  | 10.6k  |   }  | 
622  | 10.6k  | }  | 
623  |  |  | 
624  | 22.4k  | void RowScratchRegisters::SetBodyLine() { | 
625  | 22.4k  |   LineType current_lt = GetLineType();  | 
626  | 22.4k  |   if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) { | 
627  | 0  |     tprintf("Trying to set a line to be BODY when it's already START.\n"); | 
628  | 0  |   }  | 
629  | 22.4k  |   if (current_lt == LT_UNKNOWN || current_lt == LT_START) { | 
630  | 22.4k  |     push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));  | 
631  | 22.4k  |   }  | 
632  | 22.4k  | }  | 
633  |  |  | 
634  | 20.6k  | void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { | 
635  | 20.6k  |   push_back_new(hypotheses_, LineHypothesis(LT_START, model));  | 
636  | 20.6k  |   auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));  | 
637  | 20.6k  |   if (found != hypotheses_.end()) { | 
638  | 1.61k  |     hypotheses_.erase(found);  | 
639  | 1.61k  |   }  | 
640  | 20.6k  | }  | 
641  |  |  | 
642  | 71.8k  | void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { | 
643  | 71.8k  |   push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));  | 
644  | 71.8k  |   auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));  | 
645  | 71.8k  |   if (found != hypotheses_.end()) { | 
646  | 1.80k  |     hypotheses_.erase(found);  | 
647  | 1.80k  |   }  | 
648  | 71.8k  | }  | 
649  |  |  | 
650  | 571k  | void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { | 
651  | 571k  |   for (const auto &hypothese : hypotheses_) { | 
652  | 224k  |     if (hypothese.ty == LT_START && StrongModel(hypothese.model)) { | 
653  | 2.71k  |       push_back_new(*models, hypothese.model);  | 
654  | 2.71k  |     }  | 
655  | 224k  |   }  | 
656  | 571k  | }  | 
657  |  |  | 
658  | 1.09M  | void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { | 
659  | 1.09M  |   for (const auto &hypothese : hypotheses_) { | 
660  | 433k  |     if (StrongModel(hypothese.model)) { | 
661  | 79.6k  |       push_back_new(*models, hypothese.model);  | 
662  | 79.6k  |     }  | 
663  | 433k  |   }  | 
664  | 1.09M  | }  | 
665  |  |  | 
666  | 1.28M  | void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { | 
667  | 1.28M  |   for (const auto &hypothese : hypotheses_) { | 
668  | 696k  |     if (hypothese.model != nullptr) { | 
669  | 622k  |       push_back_new(*models, hypothese.model);  | 
670  | 622k  |     }  | 
671  | 696k  |   }  | 
672  | 1.28M  | }  | 
673  |  |  | 
674  | 14.6k  | const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const { | 
675  | 14.6k  |   if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) { | 
676  | 173  |     return nullptr;  | 
677  | 173  |   }  | 
678  | 14.4k  |   return hypotheses_[0].model;  | 
679  | 14.6k  | }  | 
680  |  |  | 
681  | 149k  | const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const { | 
682  | 149k  |   if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) { | 
683  | 118k  |     return nullptr;  | 
684  | 118k  |   }  | 
685  | 30.6k  |   return hypotheses_[0].model;  | 
686  | 149k  | }  | 
687  |  |  | 
688  |  | // Discard any hypotheses whose model is not in the given list.  | 
689  | 0  | void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models) { | 
690  | 0  |   if (models.empty()) { | 
691  | 0  |     return;  | 
692  | 0  |   }  | 
693  | 0  |   for (int h = hypotheses_.size() - 1; h >= 0; h--) { | 
694  | 0  |     if (!contains(models, hypotheses_[h].model)) { | 
695  | 0  |       hypotheses_.erase(hypotheses_.begin() + h);  | 
696  | 0  |     }  | 
697  | 0  |   }  | 
698  | 0  | }  | 
699  |  |  | 
700  |  | // ============ Geometry based Paragraph Detection Algorithm =================  | 
701  |  |  | 
// A run of nearby integer values, summarized by its midpoint and size.
struct Cluster {
  Cluster() = default;
  Cluster(int cen, int num) : center(cen), count(num) {}

  int center = 0; // The center of the cluster.
  int count = 0;  // The number of entries within the cluster.
};
709  |  |  | 
// Collects integer samples and greedily groups them into Clusters whose
// members lie within max_cluster_width of the group's smallest value
// (see GetClusters()).
class SimpleClusterer {
public:
  explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {}
  // Record one sample value for later clustering.
  void Add(int value) {
    values_.push_back(value);
  }
  // Number of samples recorded so far.
  size_t size() const {
    return values_.size();
  }
  // Sorts the samples and fills *clusters with one Cluster per greedy run
  // of values within max_cluster_width_ of the run's first (smallest) value.
  void GetClusters(std::vector<Cluster> *clusters);

private:
  int max_cluster_width_;   // Maximum spread allowed within one cluster.
  std::vector<int> values_; // Raw samples accumulated via Add().
};
725  |  |  | 
726  |  | // Return the index of the cluster closest to value.  | 
727  | 363k  | static int ClosestCluster(const std::vector<Cluster> &clusters, int value) { | 
728  | 363k  |   unsigned best_index = 0;  | 
729  | 1.11M  |   for (unsigned i = 0; i < clusters.size(); i++) { | 
730  | 752k  |     if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) { | 
731  | 162k  |       best_index = i;  | 
732  | 162k  |     }  | 
733  | 752k  |   }  | 
734  | 363k  |   return best_index;  | 
735  | 363k  | }  | 
736  |  |  | 
737  | 59.5k  | void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) { | 
738  | 59.5k  |   clusters->clear();  | 
739  | 59.5k  |   std::sort(values_.begin(), values_.end());  | 
740  | 168k  |   for (unsigned i = 0; i < values_.size();) { | 
741  | 109k  |     int orig_i = i;  | 
742  | 109k  |     int lo = values_[i];  | 
743  | 109k  |     int hi = lo;  | 
744  | 784k  |     while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) { | 
745  | 675k  |       hi = values_[i];  | 
746  | 675k  |     }  | 
747  | 109k  |     clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));  | 
748  | 109k  |   }  | 
749  | 59.5k  | }  | 
750  |  |  | 
// Calculate left- and right-indent tab stop values seen in
// rows[row_start, row_end) given a tolerance of tolerance.
// Results are returned through left_tabs and right_tabs as clusters of
// indent values, each carrying its center and how many rows use it.
static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
                              int tolerance, std::vector<Cluster> *left_tabs,
                              std::vector<Cluster> *right_tabs) {
  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
    return;
  }
  // First pass: toss all left and right indents into clusterers.
  SimpleClusterer initial_lefts(tolerance);
  SimpleClusterer initial_rights(tolerance);
  std::vector<Cluster> initial_left_tabs;
  std::vector<Cluster> initial_right_tabs;
  for (int i = row_start; i < row_end; i++) {
    initial_lefts.Add((*rows)[i].lindent_);
    initial_rights.Add((*rows)[i].rindent_);
  }
  initial_lefts.GetClusters(&initial_left_tabs);
  initial_rights.GetClusters(&initial_right_tabs);

  // Second pass: cluster only lines that are not "stray"
  //   An example of a stray line is a page number -- a line whose start
  //   and end tab-stops are far outside the typical start and end tab-stops
  //   for the block.
  //   Put another way, we only cluster data from lines whose start or end
  //   tab stop is frequent.
  SimpleClusterer lefts(tolerance);
  SimpleClusterer rights(tolerance);

  // Outlier elimination.  We might want to switch this to test outlier-ness
  // based on how strange a position an outlier is in instead of or in addition
  // to how rare it is.  These outliers get re-added if we end up having too
  // few tab stops to work with, however.
  // The threshold scales with the number of rows: more rows, more evidence
  // required for a tab stop to count as "frequent".
  int infrequent_enough_to_ignore = 0;
  if (row_end - row_start >= 8) {
    infrequent_enough_to_ignore = 1;
  }
  if (row_end - row_start >= 20) {
    infrequent_enough_to_ignore = 2;
  }

  // Keep a row only if its left OR right indent snaps to a frequent cluster.
  for (int i = row_start; i < row_end; i++) {
    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
      lefts.Add((*rows)[i].lindent_);
      rights.Add((*rows)[i].rindent_);
    }
  }
  lefts.GetClusters(left_tabs);
  rights.GetClusters(right_tabs);

  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
      (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
    // One side is really ragged, and the other only has one tab stop,
    // so those "insignificant outliers" are probably important, actually.
    // This often happens on a page of an index.  Add back in the ones
    // we omitted in the first pass.
    for (int i = row_start; i < row_end; i++) {
      int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
      int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
      if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
            initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
        lefts.Add((*rows)[i].lindent_);
        rights.Add((*rows)[i].rindent_);
      }
    }
  }
  // Re-run the clustering (GetClusters clears its output first) now that
  // outliers may have been re-added above.
  lefts.GetClusters(left_tabs);
  rights.GetClusters(right_tabs);

  // If one side is almost a two-indent aligned side, and the other clearly
  // isn't, try to prune out the least frequent tab stop from that side.
  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
    // Find the least frequent left tab stop (latest index wins ties).
    int to_prune = -1;
    for (int i = left_tabs->size() - 1; i >= 0; i--) {
      if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
        to_prune = i;
      }
    }
    if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
      left_tabs->erase(left_tabs->begin() + to_prune);
    }
  }
  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
    // Find the least frequent right tab stop (latest index wins ties).
    int to_prune = -1;
    for (int i = right_tabs->size() - 1; i >= 0; i--) {
      if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
        to_prune = i;
      }
    }
    if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
      right_tabs->erase(right_tabs->begin() + to_prune);
    }
  }
}
848  |  |  | 
849  |  | // Given a paragraph model mark rows[row_start, row_end) as said model  | 
850  |  | // start or body lines.  | 
851  |  | //  | 
852  |  | // Case 1: model->first_indent_ != model->body_indent_  | 
853  |  | //   Differentiating the paragraph start lines from the paragraph body lines in  | 
854  |  | //   this case is easy, we just see how far each line is indented.  | 
855  |  | //  | 
856  |  | // Case 2: model->first_indent_ == model->body_indent_  | 
857  |  | //   Here, we find end-of-paragraph lines by looking for "short lines."  | 
858  |  | //   What constitutes a "short line" changes depending on whether the text  | 
859  |  | //   ragged-right[left] or fully justified (aligned left and right).  | 
860  |  | //  | 
861  |  | //   Case 2a: Ragged Right (or Left) text.  (eop_threshold == 0)  | 
//     We have a new paragraph if the first word would have fit at the end
863  |  | //     of the previous line.  | 
864  |  | //  | 
865  |  | //   Case 2b: Fully Justified.  (eop_threshold > 0)  | 
866  |  | //     We mark a line as short (end of paragraph) if the offside indent  | 
867  |  | //     is greater than eop_threshold.  | 
868  |  | static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,  | 
869  | 2.16k  |                               const ParagraphModel *model, bool ltr, int eop_threshold) { | 
870  | 2.16k  |   if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) { | 
871  | 0  |     return;  | 
872  | 0  |   }  | 
873  | 31.0k  |   for (int row = row_start; row < row_end; row++) { | 
874  | 28.8k  |     bool valid_first = ValidFirstLine(rows, row, model);  | 
875  | 28.8k  |     bool valid_body = ValidBodyLine(rows, row, model);  | 
876  | 28.8k  |     if (valid_first && !valid_body) { | 
877  | 2.27k  |       (*rows)[row].AddStartLine(model);  | 
878  | 26.5k  |     } else if (valid_body && !valid_first) { | 
879  | 8.65k  |       (*rows)[row].AddBodyLine(model);  | 
880  | 17.9k  |     } else if (valid_body && valid_first) { | 
881  | 17.6k  |       bool after_eop = (row == row_start);  | 
882  | 17.6k  |       if (row > row_start) { | 
883  | 16.3k  |         if (eop_threshold > 0) { | 
884  | 6.66k  |           if (model->justification() == JUSTIFICATION_LEFT) { | 
885  | 6.32k  |             after_eop = (*rows)[row - 1].rindent_ > eop_threshold;  | 
886  | 6.32k  |           } else { | 
887  | 336  |             after_eop = (*rows)[row - 1].lindent_ > eop_threshold;  | 
888  | 336  |           }  | 
889  | 9.73k  |         } else { | 
890  | 9.73k  |           after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification());  | 
891  | 9.73k  |         }  | 
892  | 16.3k  |       }  | 
893  | 17.6k  |       if (after_eop) { | 
894  | 4.20k  |         (*rows)[row].AddStartLine(model);  | 
895  | 13.4k  |       } else { | 
896  | 13.4k  |         (*rows)[row].AddBodyLine(model);  | 
897  | 13.4k  |       }  | 
898  | 17.6k  |     } else { | 
899  |  |       // Do nothing. Stray row.  | 
900  | 298  |     }  | 
901  | 28.8k  |   }  | 
902  | 2.16k  | }  | 
903  |  |  | 
904  |  | // GeometricClassifierState holds all of the information we'll use while  | 
905  |  | // trying to determine a paragraph model for the text lines in a block of  | 
906  |  | // text:  | 
907  |  | //   + the rows under consideration [row_start, row_end)  | 
908  |  | //   + the common left- and right-indent tab stops  | 
909  |  | //   + does the block start out left-to-right or right-to-left  | 
910  |  | // Further, this struct holds the data we amass for the (single) ParagraphModel  | 
911  |  | // we'll assign to the text lines (assuming we get that far).  | 
struct GeometricClassifierState {
  // Computes the clustering tolerance (via InterwordSpace) and the left and
  // right tab-stop clusters (via CalculateTabStops) for rows [r_start,
  // r_end), and takes the script direction from the first row.
  GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
                           int r_end)
      : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
    tolerance = InterwordSpace(*r, r_start, r_end);
    CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs);
    if (debug_level >= 3) {
      tesserr << "Geometry: TabStop cluster tolerance = " << tolerance << "; "
              << left_tabs.size() << " left tabs; "
              << right_tabs.size() << " right tabs\n";
    }
    ltr = (*r)[r_start].ri_->ltr;
  }

  // Commit to left justification; the margin is the first row's left margin.
  void AssumeLeftJustification() {
    just = tesseract::JUSTIFICATION_LEFT;
    margin = (*rows)[row_start].lmargin_;
  }

  // Commit to right justification; the margin is the first row's right
  // margin.
  void AssumeRightJustification() {
    just = tesseract::JUSTIFICATION_RIGHT;
    margin = (*rows)[row_start].rmargin_;
  }

  // Align tabs are the tab stops the text is aligned to.
  const std::vector<Cluster> &AlignTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) {
      return right_tabs;
    }
    return left_tabs;
  }

  // Offside tabs are the tab stops opposite the tabs used to align the text.
  //
  // Note that for a left-to-right text which is aligned to the right such as
  //     this function comment, the offside tabs are the horizontal tab stops
  //                 marking the beginning of ("Note", "this" and "marking").
  const std::vector<Cluster> &OffsideTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) {
      return left_tabs;
    }
    return right_tabs;
  }

  // Return whether the i'th row extends from the leftmost left tab stop
  // to the right most right tab stop.
  bool IsFullRow(int i) const {
    return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
           ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
  }

  // Index of the align-side tab stop cluster nearest to row_idx's indent.
  int AlignsideTabIndex(int row_idx) const {
    return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
  }

  // Given what we know about the paragraph justification (just), would the
  // first word of row_b have fit at the end of row_a?
  bool FirstWordWouldHaveFit(int row_a, int row_b) {
    return ::tesseract::FirstWordWouldHaveFit((*rows)[row_a], (*rows)[row_b], just);
  }

  // Debug dump of the rows under consideration.
  void PrintRows() const {
    PrintRowRange(*rows, row_start, row_end);
  }

  // Report a classification failure (and dump the rows) when the debug
  // level is at least min_debug_level; otherwise do nothing.
  void Fail(int min_debug_level, const char *why) const {
    if (debug_level < min_debug_level) {
      return;
    }
    tprintf("# %s\n", why);
    PrintRows();
  }

  // Package the parameters determined so far into a ParagraphModel.
  ParagraphModel Model() const {
    return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
  }

  // We print out messages with a debug level at least as great as debug_level.
  int debug_level = 0;

  // The Geometric Classifier was asked to find a single paragraph model
  // to fit the text rows (*rows)[row_start, row_end)
  std::vector<RowScratchRegisters> *rows;
  int row_start = 0;
  int row_end = 0;

  // The amount by which we expect the text edge can vary and still be aligned.
  int tolerance = 0;

  // Is the script in this text block left-to-right?
  // HORRIBLE ROUGH APPROXIMATION.  TODO(eger): Improve
  bool ltr = false;

  // These left and right tab stops were determined to be the common tab
  // stops for the given text.
  std::vector<Cluster> left_tabs;
  std::vector<Cluster> right_tabs;

  // These are parameters we must determine to create a ParagraphModel.
  tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
  int margin = 0;
  int first_indent = 0;
  int body_indent = 0;

  // eop_threshold > 0 if the text is fully justified.  See MarkRowsWithModel()
  int eop_threshold = 0;
};
1019  |  |  | 
1020  |  | // Given a section of text where strong textual clues did not help identifying  | 
1021  |  | // paragraph breaks, and for which the left and right indents have exactly  | 
1022  |  | // three tab stops between them, attempt to find the paragraph breaks based  | 
1023  |  | // solely on the outline of the text and whether the script is left-to-right.  | 
1024  |  | //  | 
1025  |  | // Algorithm Detail:  | 
1026  |  | //   The selected rows are in the form of a rectangle except  | 
1027  |  | //   for some number of "short lines" of the same length:  | 
1028  |  | //  | 
1029  |  | //   (A1)  xxxxxxxxxxxxx  (B1) xxxxxxxxxxxx  | 
1030  |  | //           xxxxxxxxxxx       xxxxxxxxxx    # A "short" line.  | 
1031  |  | //         xxxxxxxxxxxxx       xxxxxxxxxxxx  | 
1032  |  | //         xxxxxxxxxxxxx       xxxxxxxxxxxx  | 
1033  |  | //  | 
1034  |  | //   We have a slightly different situation if the only short  | 
1035  |  | //   line is at the end of the excerpt.  | 
1036  |  | //  | 
1037  |  | //   (A2) xxxxxxxxxxxxx  (B2) xxxxxxxxxxxx  | 
1038  |  | //        xxxxxxxxxxxxx       xxxxxxxxxxxx  | 
1039  |  | //        xxxxxxxxxxxxx       xxxxxxxxxxxx  | 
1040  |  | //          xxxxxxxxxxx       xxxxxxxxxx     # A "short" line.  | 
1041  |  | //  | 
1042  |  | //   We'll interpret these as follows based on the reasoning in the comment for  | 
1043  |  | //   GeometricClassify():  | 
1044  |  | //       [script direction: first indent, body indent]  | 
1045  |  | //   (A1) LtR: 2,0  RtL: 0,0   (B1) LtR: 0,0  RtL: 2,0  | 
1046  |  | //   (A2) LtR: 2,0  RtL: CrR   (B2) LtR: CrL  RtL: 2,0  | 
static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s,
                                                   ParagraphTheory *theory) {
  // Count how many lines span the block's full width, and whether the
  // last line is one of them (last_row_full is 0 or 1).
  int num_rows = s.row_end - s.row_start;
  int num_full_rows = 0;
  int last_row_full = 0;
  for (int i = s.row_start; i < s.row_end; i++) {
    if (s.IsFullRow(i)) {
      num_full_rows++;
      if (i == s.row_end - 1) {
        last_row_full++;
      }
    }
  }

  if (num_full_rows < 0.7 * num_rows) {
    s.Fail(1, "Not enough full lines to know which lines start paras.");
    return;
  }

  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
  s.eop_threshold = 0;

  // With only three tab stops total there isn't enough shape information to
  // infer the justification; fall back on the script direction.
  if (s.ltr) {
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (debug_level > 0) {
    tprintf(
        "# Not enough variety for clear outline classification. "
        "Guessing these are %s aligned based on script.\n",
        s.ltr ? "left" : "right");
    s.PrintRows();
  }

  if (s.AlignTabs().size() == 2) { // case A1 or A2
    // Two align-side tab stops: the non-dominant one marks first lines.
    s.first_indent = s.AlignTabs()[1].center;
    s.body_indent = s.AlignTabs()[0].center;
  } else { // case B1 or B2
    if (num_rows - 1 == num_full_rows - last_row_full) {
      // case B2
      // Every line except (at most) the last is full, so this is a single
      // crown paragraph: first line flush, no separate first indent.
      const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
      (*s.rows)[s.row_start].AddStartLine(model);
      for (int i = s.row_start + 1; i < s.row_end; i++) {
        (*s.rows)[i].AddBodyLine(model);
      }
      return;
    } else {
      // case B1
      // Fully justified text: paragraph ends are the "short" lines whose
      // offside indent falls past the midpoint of the two offside tabs.
      s.first_indent = s.body_indent = s.AlignTabs()[0].center;
      s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
    }
  }
  const ParagraphModel *model = theory->AddModel(s.Model());
  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold);
  return;
}
1105  |  |  | 
1106  |  | // This function is called if strong textual clues were not available, but  | 
1107  |  | // the caller hopes that the paragraph breaks will be super obvious just  | 
1108  |  | // by the outline of the text.  | 
1109  |  | //  | 
1110  |  | // The particularly difficult case is figuring out what's going on if you  | 
1111  |  | // don't have enough short paragraph end lines to tell us what's going on.  | 
1112  |  | //  | 
1113  |  | // For instance, let's say you have the following outline:  | 
1114  |  | //  | 
1115  |  | //   (A1)  xxxxxxxxxxxxxxxxxxxxxx  | 
1116  |  | //           xxxxxxxxxxxxxxxxxxxx  | 
1117  |  | //         xxxxxxxxxxxxxxxxxxxxxx  | 
1118  |  | //         xxxxxxxxxxxxxxxxxxxxxx  | 
1119  |  | //  | 
1120  |  | // Even if we know that the text is left-to-right and so will probably be  | 
1121  |  | // left-aligned, both of the following are possible texts:  | 
1122  |  | //  | 
1123  |  | //  (A1a)  1. Here our list item  | 
1124  |  | //           with two full lines.  | 
1125  |  | //         2. Here a second item.  | 
1126  |  | //         3. Here our third one.  | 
1127  |  | //  | 
1128  |  | //  (A1b)  so ends paragraph one.  | 
1129  |  | //           Here  starts another  | 
1130  |  | //         paragraph  we want  to  | 
1131  |  | //         read.  This  continues  | 
1132  |  | //  | 
1133  |  | // These examples are obvious from the text and should have been caught  | 
1134  |  | // by the StrongEvidenceClassify pass.  However, for languages where we don't  | 
1135  |  | // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),  | 
1136  |  | // it's worth guessing that (A1b) is the correct interpretation if there are  | 
1137  |  | // far more "full" lines than "short" lines.  | 
1138  |  | static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,  | 
1139  | 14.0k  |                               int row_start, int row_end, ParagraphTheory *theory) { | 
1140  | 14.0k  |   if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) { | 
1141  | 4.11k  |     return;  | 
1142  | 4.11k  |   }  | 
1143  | 9.92k  |   if (debug_level > 1) { | 
1144  | 0  |     tprintf("###############################################\n"); | 
1145  | 0  |     tprintf("##### GeometricClassify( rows[%d:%d) )   ####\n", row_start, row_end); | 
1146  | 0  |     tprintf("###############################################\n"); | 
1147  | 0  |   }  | 
1148  | 9.92k  |   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);  | 
1149  |  |  | 
1150  | 9.92k  |   GeometricClassifierState s(debug_level, rows, row_start, row_end);  | 
1151  | 9.92k  |   if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) { | 
1152  | 610  |     s.Fail(2, "Too much variety for simple outline classification.");  | 
1153  | 610  |     return;  | 
1154  | 610  |   }  | 
1155  | 9.31k  |   if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) { | 
1156  | 3.25k  |     s.Fail(1, "Not enough variety for simple outline classification.");  | 
1157  | 3.25k  |     return;  | 
1158  | 3.25k  |   }  | 
1159  | 6.06k  |   if (s.left_tabs.size() + s.right_tabs.size() == 3) { | 
1160  | 3.41k  |     GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);  | 
1161  | 3.41k  |     return;  | 
1162  | 3.41k  |   }  | 
1163  |  |  | 
1164  |  |   // At this point, we know that one side has at least two tab stops, and the  | 
1165  |  |   // other side has one or two tab stops.  | 
1166  |  |   // Left to determine:  | 
1167  |  |   //   (1) Which is the body indent and which is the first line indent?  | 
1168  |  |   //   (2) Is the text fully justified?  | 
1169  |  |  | 
1170  |  |   // If one side happens to have three or more tab stops, assume that side  | 
1171  |  |   // is opposite of the aligned side.  | 
1172  | 2.65k  |   if (s.right_tabs.size() > 2) { | 
1173  | 1.42k  |     s.AssumeLeftJustification();  | 
1174  | 1.42k  |   } else if (s.left_tabs.size() > 2) { | 
1175  | 482  |     s.AssumeRightJustification();  | 
1176  | 747  |   } else if (s.ltr) { // guess based on script direction | 
1177  | 747  |     s.AssumeLeftJustification();  | 
1178  | 747  |   } else { | 
1179  | 0  |     s.AssumeRightJustification();  | 
1180  | 0  |   }  | 
1181  |  |  | 
1182  | 2.65k  |   if (s.AlignTabs().size() == 2) { | 
1183  |  |     // For each tab stop on the aligned side, how many of them appear  | 
1184  |  |     // to be paragraph start lines?  [first lines]  | 
1185  | 1.88k  |     int firsts[2] = {0, 0}; | 
1186  |  |     // Count the first line as a likely paragraph start line.  | 
1187  | 1.88k  |     firsts[s.AlignsideTabIndex(s.row_start)]++;  | 
1188  |  |     // For each line, if the first word would have fit on the previous  | 
1189  |  |     // line count it as a likely paragraph start line.  | 
1190  | 1.88k  |     bool jam_packed = true;  | 
1191  | 29.8k  |     for (int i = s.row_start + 1; i < s.row_end; i++) { | 
1192  | 27.9k  |       if (s.FirstWordWouldHaveFit(i - 1, i)) { | 
1193  | 3.42k  |         firsts[s.AlignsideTabIndex(i)]++;  | 
1194  | 3.42k  |         jam_packed = false;  | 
1195  | 3.42k  |       }  | 
1196  | 27.9k  |     }  | 
1197  |  |     // Make an extra accounting for the last line of the paragraph just  | 
1198  |  |     // in case it's the only short line in the block.  That is, take its  | 
1199  |  |     // first word as typical and see if this looks like the *last* line  | 
1200  |  |     // of a paragraph.  If so, mark the *other* indent as probably a first.  | 
1201  | 1.88k  |     if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) { | 
1202  | 85  |       firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;  | 
1203  | 85  |     }  | 
1204  |  |  | 
1205  | 1.88k  |     int percent0firsts, percent1firsts;  | 
1206  | 1.88k  |     percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;  | 
1207  | 1.88k  |     percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;  | 
1208  |  |  | 
1209  |  |     // TODO(eger): Tune these constants if necessary.  | 
1210  | 1.88k  |     if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) { | 
1211  | 385  |       s.first_indent = s.AlignTabs()[1].center;  | 
1212  | 385  |       s.body_indent = s.AlignTabs()[0].center;  | 
1213  | 1.49k  |     } else if ((percent1firsts < 20 && 30 < percent0firsts) ||  | 
1214  | 1.49k  |                percent1firsts + 30 < percent0firsts) { | 
1215  | 253  |       s.first_indent = s.AlignTabs()[0].center;  | 
1216  | 253  |       s.body_indent = s.AlignTabs()[1].center;  | 
1217  | 1.24k  |     } else { | 
1218  |  |       // Ambiguous! Probably lineated (poetry)  | 
1219  | 1.24k  |       if (debug_level > 1) { | 
1220  | 0  |         tprintf("# Cannot determine %s indent likely to start paragraphs.\n", | 
1221  | 0  |                 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");  | 
1222  | 0  |         tprintf("# Indent of %d looks like a first line %d%% of the time.\n", | 
1223  | 0  |                 s.AlignTabs()[0].center, percent0firsts);  | 
1224  | 0  |         tprintf("# Indent of %d looks like a first line %d%% of the time.\n", | 
1225  | 0  |                 s.AlignTabs()[1].center, percent1firsts);  | 
1226  | 0  |         s.PrintRows();  | 
1227  | 0  |       }  | 
1228  | 1.24k  |       return;  | 
1229  | 1.24k  |     }  | 
1230  | 1.88k  |   } else { | 
1231  |  |     // There's only one tab stop for the "aligned to" side.  | 
1232  | 768  |     s.first_indent = s.body_indent = s.AlignTabs()[0].center;  | 
1233  | 768  |   }  | 
1234  |  |  | 
1235  |  |   // At this point, we have our model.  | 
1236  | 1.40k  |   const ParagraphModel *model = theory->AddModel(s.Model());  | 
1237  |  |  | 
1238  |  |   // Now all we have to do is figure out if the text is fully justified or not.  | 
1239  |  |   // eop_threshold: default to fully justified unless we see evidence below.  | 
1240  |  |   //    See description on MarkRowsWithModel()  | 
1241  | 1.40k  |   s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;  | 
1242  |  |   // If the text is not fully justified, re-set the eop_threshold to 0.  | 
1243  | 1.40k  |   if (s.AlignTabs().size() == 2) { | 
1244  |  |     // Paragraphs with a paragraph-start indent.  | 
1245  | 5.62k  |     for (int i = s.row_start; i < s.row_end - 1; i++) { | 
1246  | 5.37k  |       if (ValidFirstLine(s.rows, i + 1, model) &&  | 
1247  | 5.37k  |           !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),  | 
1248  | 1.20k  |                        s.tolerance)) { | 
1249  |  |         // We found a non-end-of-paragraph short line: not fully justified.  | 
1250  | 386  |         s.eop_threshold = 0;  | 
1251  | 386  |         break;  | 
1252  | 386  |       }  | 
1253  | 5.37k  |     }  | 
1254  | 768  |   } else { | 
1255  |  |     // Paragraphs with no paragraph-start indent.  | 
1256  | 4.28k  |     for (int i = s.row_start; i < s.row_end - 1; i++) { | 
1257  | 4.05k  |       if (!s.FirstWordWouldHaveFit(i, i + 1) &&  | 
1258  | 4.05k  |           !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),  | 
1259  | 2.61k  |                        s.tolerance)) { | 
1260  |  |         // We found a non-end-of-paragraph short line: not fully justified.  | 
1261  | 540  |         s.eop_threshold = 0;  | 
1262  | 540  |         break;  | 
1263  | 540  |       }  | 
1264  | 4.05k  |     }  | 
1265  | 768  |   }  | 
1266  | 1.40k  |   MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);  | 
1267  | 1.40k  | }  | 
1268  |  |  | 
1269  |  | // =============== Implementation of ParagraphTheory =====================  | 
1270  |  |  | 
1271  | 3.32k  | const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) { | 
1272  | 3.32k  |   for (const auto &m : *models_) { | 
1273  | 384  |     if (m->Comparable(model)) { | 
1274  | 40  |       return m;  | 
1275  | 40  |     }  | 
1276  | 384  |   }  | 
1277  | 3.28k  |   auto *m = new ParagraphModel(model);  | 
1278  | 3.28k  |   models_->push_back(m);  | 
1279  | 3.28k  |   push_back_new(models_we_added_, m);  | 
1280  | 3.28k  |   return m;  | 
1281  | 3.32k  | }  | 
1282  |  |  | 
1283  | 13.9k  | void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { | 
1284  | 13.9k  |   size_t w = 0;  | 
1285  | 16.5k  |   for (size_t r = 0; r < models_->size(); r++) { | 
1286  | 2.58k  |     ParagraphModel *m = (*models_)[r];  | 
1287  | 2.58k  |     if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) { | 
1288  | 490  |       delete m;  | 
1289  | 2.09k  |     } else { | 
1290  | 2.09k  |       if (r > w) { | 
1291  | 2  |         (*models_)[w] = m;  | 
1292  | 2  |       }  | 
1293  | 2.09k  |       w++;  | 
1294  | 2.09k  |     }  | 
1295  | 2.58k  |   }  | 
1296  | 13.9k  |   models_->resize(w);  | 
1297  | 13.9k  | }  | 
1298  |  |  | 
1299  |  | // Examine rows[start, end) and try to determine if an existing non-centered  | 
1300  |  | // paragraph model would fit them perfectly.  If so, return a pointer to it.  | 
1301  |  | // If not, return nullptr.  | 
1302  |  | const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,  | 
1303  | 0  |                                             int start, int end) const { | 
1304  | 0  |   for (const auto *model : *models_) { | 
1305  | 0  |     if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) { | 
1306  | 0  |       return model;  | 
1307  | 0  |     }  | 
1308  | 0  |   }  | 
1309  | 0  |   return nullptr;  | 
1310  | 0  | }  | 
1311  |  |  | 
1312  | 111k  | void ParagraphTheory::NonCenteredModels(SetOfModels *models) { | 
1313  | 111k  |   for (const auto *model : *models_) { | 
1314  | 4.62k  |     if (model->justification() != JUSTIFICATION_CENTER) { | 
1315  | 3.98k  |       push_back_new(*models, model);  | 
1316  | 3.98k  |     }  | 
1317  | 4.62k  |   }  | 
1318  | 111k  | }  | 
1319  |  |  | 
1320  | 0  | int ParagraphTheory::IndexOf(const ParagraphModel *model) const { | 
1321  | 0  |   int i = 0;  | 
1322  | 0  |   for (const auto *m : *models_) { | 
1323  | 0  |     if (m == model) { | 
1324  | 0  |       return i;  | 
1325  | 0  |     }  | 
1326  | 0  |     i++;  | 
1327  | 0  |   }  | 
1328  | 0  |   return -1;  | 
1329  | 0  | }  | 
1330  |  |  | 
1331  |  | bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,  | 
1332  | 79.5k  |                     const ParagraphModel *model) { | 
1333  | 79.5k  |   if (!StrongModel(model)) { | 
1334  | 0  |     tprintf("ValidFirstLine() should only be called with strong models!\n"); | 
1335  | 0  |   }  | 
1336  | 79.5k  |   return StrongModel(model) && model->ValidFirstLine((*rows)[row].lmargin_, (*rows)[row].lindent_,  | 
1337  | 79.5k  |                                                      (*rows)[row].rindent_, (*rows)[row].rmargin_);  | 
1338  | 79.5k  | }  | 
1339  |  |  | 
1340  |  | bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,  | 
1341  | 52.0k  |                    const ParagraphModel *model) { | 
1342  | 52.0k  |   if (!StrongModel(model)) { | 
1343  | 0  |     tprintf("ValidBodyLine() should only be called with strong models!\n"); | 
1344  | 0  |   }  | 
1345  | 52.0k  |   return StrongModel(model) && model->ValidBodyLine((*rows)[row].lmargin_, (*rows)[row].lindent_,  | 
1346  | 52.0k  |                                                     (*rows)[row].rindent_, (*rows)[row].rmargin_);  | 
1347  | 52.0k  | }  | 
1348  |  |  | 
1349  |  | bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,  | 
1350  | 127  |                      const ParagraphModel *model) { | 
1351  | 127  |   if (model != kCrownRight && model != kCrownLeft) { | 
1352  | 0  |     tprintf("CrownCompatible() should only be called with crown models!\n"); | 
1353  | 0  |     return false;  | 
1354  | 0  |   }  | 
1355  | 127  |   auto &row_a = (*rows)[a];  | 
1356  | 127  |   auto &row_b = (*rows)[b];  | 
1357  | 127  |   if (model == kCrownRight) { | 
1358  | 32  |     return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_,  | 
1359  | 32  |                        Epsilon(row_a.ri_->average_interword_space));  | 
1360  | 32  |   }  | 
1361  | 95  |   return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_,  | 
1362  | 95  |                      Epsilon(row_a.ri_->average_interword_space));  | 
1363  | 127  | }  | 
1364  |  |  | 
1365  |  | // =============== Implementation of ParagraphModelSmearer ====================  | 
1366  |  |  | 
1367  |  | ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,  | 
1368  |  |                                              int row_start, int row_end, ParagraphTheory *theory)  | 
1369  | 12.5k  |     : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) { | 
1370  | 12.5k  |   if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) { | 
1371  | 0  |     row_start_ = 0;  | 
1372  | 0  |     row_end_ = 0;  | 
1373  | 0  |     return;  | 
1374  | 0  |   }  | 
1375  | 12.5k  |   open_models_.resize(open_models_.size() + row_end - row_start + 2);  | 
1376  | 12.5k  | }  | 
1377  |  |  | 
// see paragraphs_internal.h
// Recompute OpenModels(row + 1) for each row in [row_start, row_end): the set
// of models still "open" after a row is the subset of that row's start/body
// hypotheses whose geometry the row is consistent with.
void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
  SetOfModels no_models;
  // Clamp the requested range to the range this smearer was constructed over.
  if (row_start < row_start_) {
    row_start = row_start_;
  }
  if (row_end > row_end_) {
    row_end = row_end_;
  }

  // Start one row early (when possible) so OpenModels(row_start) itself gets
  // refreshed before the main range is processed.
  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {
    if ((*rows_)[row].ri_->num_words == 0) {
      // An empty line closes all open models.
      OpenModels(row + 1) = no_models;
    } else {
      SetOfModels &opened = OpenModels(row);
      (*rows_)[row].StartHypotheses(&opened);

      // Which models survive the transition from row to row + 1?
      SetOfModels still_open;
      for (auto &m : opened) {
        if (ValidFirstLine(rows_, row, m) || ValidBodyLine(rows_, row, m)) {
          // This is basic filtering; we check likely paragraph starty-ness down
          // below in Smear() -- you know, whether the first word would have fit
          // and such.
          push_back_new(still_open, m);
        }
      }
      OpenModels(row + 1) = std::move(still_open);
    }
  }
}
1409  |  |  | 
// see paragraphs_internal.h
// Propagate recently-used ("open") paragraph models onto rows whose line type
// is still unknown or ambiguous, updating the open-model bookkeeping as rows
// get resolved.
void ParagraphModelSmearer::Smear() {
  CalculateOpenModels(row_start_, row_end_);

  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
  // we have multiple LT_START hypotheses), see if there's a model that
  // was recently used (an "open" model) which might model it well.
  for (int i = row_start_; i < row_end_; i++) {
    RowScratchRegisters &row = (*rows_)[i];
    if (row.ri_->num_words == 0) {
      continue; // Nothing to classify on an empty line.
    }

    // Step One:
    //   Figure out if there are "open" models which are left-aligned or
    //   right-aligned.  This is important for determining whether the
    //   "first" word in a row would fit at the "end" of the previous row.
    bool left_align_open = false;
    bool right_align_open = false;
    for (auto &m : OpenModels(i)) {
      switch (m->justification()) {
        case JUSTIFICATION_LEFT:
          left_align_open = true;
          break;
        case JUSTIFICATION_RIGHT:
          right_align_open = true;
          break;
        default: // Centered/other models count as both alignments.
          left_align_open = right_align_open = true;
      }
    }
    // Step Two:
    //   Use that knowledge to figure out if this row is likely to
    //   start a paragraph.
    bool likely_start;
    if (i == 0) {
      // The first row of the range is always treated as a likely start.
      likely_start = true;
    } else {
      // With both (or neither) alignment open, accept a start under either
      // justification; otherwise test only the open one.
      if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) ||
                       LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
      } else if (left_align_open) {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT);
      } else {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
      }
    }

    // Step Three:
    //   If this text line seems like an obvious first line of an
    //   open model, or an obvious continuation of an existing
    //   modelled paragraph, mark it up.
    if (likely_start) {
      // Add Start Hypotheses for all Open models that fit.
      for (unsigned m = 0; m < OpenModels(i).size(); m++) {
        if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
          row.AddStartLine(OpenModels(i)[m]);
        }
      }
    } else {
      // Add relevant body line hypotheses.
      // Candidate models come from the previous row's strong hypotheses
      // (or, for the page's first row, all non-centered models).
      SetOfModels last_line_models;
      if (i > 0) {
        (*rows_)[i - 1].StrongHypotheses(&last_line_models);
      } else {
        theory_->NonCenteredModels(&last_line_models);
      }
      for (auto model : last_line_models) {
        if (ValidBodyLine(rows_, i, model)) {
          row.AddBodyLine(model);
        }
      }
    }

    // Step Four:
    //   If we're still quite unsure about this line, go through all
    //   models in our theory and see if this row could be the start
    //   of any of our models.
    if (row.GetLineType() == LT_UNKNOWN ||
        (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
      SetOfModels all_models;
      theory_->NonCenteredModels(&all_models);
      for (auto &all_model : all_models) {
        if (ValidFirstLine(rows_, i, all_model)) {
          row.AddStartLine(all_model);
        }
      }
    }
    // Step Five:
    //   Since we may have updated the hypotheses about this row, we need
    //   to recalculate the Open models for the rest of rows[i + 1, row_end)
    if (row.GetLineType() != LT_UNKNOWN) {
      CalculateOpenModels(i + 1, row_end_);
    }
  }
}
1506  |  |  | 
1507  |  | // ================ Main Paragraph Detection Algorithm =======================  | 
1508  |  |  | 
1509  |  | // Find out what ParagraphModels are actually used, and discard any  | 
1510  |  | // that are not.  | 
1511  |  | static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,  | 
1512  | 13.9k  |                                 ParagraphTheory *theory) { | 
1513  | 13.9k  |   SetOfModels used_models;  | 
1514  | 142k  |   for (const auto &row : rows) { | 
1515  | 142k  |     row.StrongHypotheses(&used_models);  | 
1516  | 142k  |   }  | 
1517  | 13.9k  |   theory->DiscardUnusedModels(used_models);  | 
1518  | 13.9k  | }  | 
1519  |  |  | 
1520  |  | // DowngradeWeakestToCrowns:  | 
1521  |  | //   Forget any flush-{left, right} models unless we see two or more | 
1522  |  | //   of them in sequence.  | 
1523  |  | //  | 
1524  |  | // In pass 3, we start to classify even flush-left paragraphs (paragraphs  | 
1525  |  | // where the first line and body indent are the same) as having proper Models.  | 
1526  |  | // This is generally dangerous, since if you start imagining that flush-left  | 
1527  |  | // is a typical paragraph model when it is not, it will lead you to chop normal  | 
1528  |  | // indented paragraphs in the middle whenever a sentence happens to start on a  | 
1529  |  | // new line (see "This" above).  What to do?  | 
1530  |  | //   What we do is to take any paragraph which is flush left and is not  | 
1531  |  | // preceded by another paragraph of the same model and convert it to a "Crown"  | 
1532  |  | // paragraph.  This is a weak pseudo-ParagraphModel which is a placeholder  | 
1533  |  | // for later.  It means that the paragraph is flush, but it would be desirable  | 
// to mark it as the same model as the following text if it fits.  The
// downgrade path is FlushLeft -> CrownLeft -> Model of following paragraph.
// This means that we avoid making flush-left Paragraph Models whenever we see
// a top-of-the-page half-of-a-paragraph, and instead we mark it the same as
// normal body text.
1538  |  | //  | 
1539  |  | // Implementation:  | 
1540  |  | //  | 
1541  |  | //   Comb backwards through the row scratch registers, and turn any  | 
1542  |  | //   sequences of body lines of equivalent type abutted against the beginning  | 
1543  |  | //   or a body or start line of a different type into a crown paragraph.  | 
// Walk the rows backwards, turning isolated flush paragraph runs into crown
// paragraphs (see the comment block above).  debug_level is currently unused
// in this function.
static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
                                     std::vector<RowScratchRegisters> *rows) {
  int start;
  // Each iteration handles one run of body lines; `start` is set to the
  // beginning of the run just processed, and the next iteration resumes there.
  for (int end = rows->size(); end > 0; end = start) {
    // Search back for a body line of a unique type.
    const ParagraphModel *model = nullptr;
    while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
      end--;
    }
    if (end == 0) {
      break;
    }
    start = end - 1;
    while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
      start--; // walk back to the first line that is not the same body type.
    }
    // A start line of the same model also belongs to the run when the model
    // is flush (first indent ~ body indent).
    if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) &&
        NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
      start--;
    }
    start++;
    // Now rows[start, end) is a sequence of unique body hypotheses of model.
    if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {
      continue; // Centered paragraphs are left untouched.
    }
    if (!StrongModel(model)) {
      // Already a crown model: extend the run upward over lines whose
      // aligned edge is crown-compatible.
      while (start > 0 && CrownCompatible(rows, start - 1, start, model)) {
        start--;
      }
    }
    // Crownify only when the run is not preceded by a valid first line of
    // the same strong model -- i.e. it looks like a headless paragraph.
    if (start == 0 || (!StrongModel(model)) ||
        (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
      // crownify rows[start, end)
      const ParagraphModel *crown_model = model;
      if (StrongModel(model)) {
        if (model->justification() == JUSTIFICATION_LEFT) {
          crown_model = kCrownLeft;
        } else {
          crown_model = kCrownRight;
        }
      }
      (*rows)[start].SetUnknown();
      (*rows)[start].AddStartLine(crown_model);
      for (int row = start + 1; row < end; row++) {
        (*rows)[row].SetUnknown();
        (*rows)[row].AddBodyLine(crown_model);
      }
    }
  }
  // Downgrading may have dropped the last strong reference to some models.
  DiscardUnusedModels(*rows, theory);
}
1595  |  |  | 
1596  |  | // Clear all hypotheses about lines [start, end) and reset margins.  | 
1597  |  | //  | 
1598  |  | // The empty space between the left of a row and the block boundary (and  | 
1599  |  | // similarly for the right) is split into two pieces: margin and indent.  | 
1600  |  | // In initial processing, we assume the block is tight and the margin for  | 
1601  |  | // all lines is set to zero.   However, if our first pass does not yield  | 
1602  |  | // models for  everything,  it may be  due to an  inset paragraph like a  | 
1603  |  | // block-quote.   In that case, we make a second pass over that unmarked  | 
1604  |  | // section of the page and reset the "margin" portion of the empty space  | 
1605  |  | // to the common amount of space at  the ends of the lines under consid-  | 
1606  |  | // eration.    This would be equivalent to percentile set to 0. However,  | 
1607  |  | // sometimes we have a single character sticking out in the right margin  | 
1608  |  | // of a text block  (like the 'r' in 'for' on line 3 above),  and we can  | 
1609  |  | // really  just ignore it as an outlier.   To express this, we allow the  | 
1610  |  | // user to specify  the percentile (0..100)  of indent values  to use as  | 
1611  |  | // the common margin for each row in the run of rows[start, end).  | 
1612  |  | void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,  | 
1613  | 22.4k  |                                         int end, int percentile) { | 
1614  | 22.4k  |   if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) { | 
1615  | 0  |     return;  | 
1616  | 0  |   }  | 
1617  |  |  | 
1618  | 22.4k  |   int lmin, lmax, rmin, rmax;  | 
1619  | 22.4k  |   lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;  | 
1620  | 22.4k  |   rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;  | 
1621  | 297k  |   for (int i = start; i < end; i++) { | 
1622  | 274k  |     RowScratchRegisters &sr = (*rows)[i];  | 
1623  | 274k  |     sr.SetUnknown();  | 
1624  | 274k  |     if (sr.ri_->num_words == 0) { | 
1625  | 0  |       continue;  | 
1626  | 0  |     }  | 
1627  | 274k  |     UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);  | 
1628  | 274k  |     UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);  | 
1629  | 274k  |   }  | 
1630  | 22.4k  |   STATS lefts(lmin, lmax);  | 
1631  | 22.4k  |   STATS rights(rmin, rmax);  | 
1632  | 297k  |   for (int i = start; i < end; i++) { | 
1633  | 274k  |     RowScratchRegisters &sr = (*rows)[i];  | 
1634  | 274k  |     if (sr.ri_->num_words == 0) { | 
1635  | 0  |       continue;  | 
1636  | 0  |     }  | 
1637  | 274k  |     lefts.add(sr.lmargin_ + sr.lindent_, 1);  | 
1638  | 274k  |     rights.add(sr.rmargin_ + sr.rindent_, 1);  | 
1639  | 274k  |   }  | 
1640  | 22.4k  |   int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);  | 
1641  | 22.4k  |   int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);  | 
1642  | 297k  |   for (int i = start; i < end; i++) { | 
1643  | 274k  |     RowScratchRegisters &sr = (*rows)[i];  | 
1644  | 274k  |     int ldelta = ignorable_left - sr.lmargin_;  | 
1645  | 274k  |     sr.lmargin_ += ldelta;  | 
1646  | 274k  |     sr.lindent_ -= ldelta;  | 
1647  | 274k  |     int rdelta = ignorable_right - sr.rmargin_;  | 
1648  | 274k  |     sr.rmargin_ += rdelta;  | 
1649  | 274k  |     sr.rindent_ -= rdelta;  | 
1650  | 274k  |   }  | 
1651  | 22.4k  | }  | 
1652  |  |  | 
1653  |  | // Return the median inter-word space in rows[row_start, row_end).  | 
1654  | 19.5k  | int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) { | 
1655  | 19.5k  |   if (row_end < row_start + 1) { | 
1656  | 0  |     return 1;  | 
1657  | 0  |   }  | 
1658  | 19.5k  |   int word_height =  | 
1659  | 19.5k  |       (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2;  | 
1660  | 19.5k  |   int word_width =  | 
1661  | 19.5k  |       (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2;  | 
1662  | 19.5k  |   STATS spacing_widths(0, 4 + word_width);  | 
1663  | 179k  |   for (int i = row_start; i < row_end; i++) { | 
1664  | 160k  |     if (rows[i].ri_->num_words > 1) { | 
1665  | 18.7k  |       spacing_widths.add(rows[i].ri_->average_interword_space, 1);  | 
1666  | 18.7k  |     }  | 
1667  | 160k  |   }  | 
1668  | 19.5k  |   int minimum_reasonable_space = word_height / 3;  | 
1669  | 19.5k  |   if (minimum_reasonable_space < 2) { | 
1670  | 153  |     minimum_reasonable_space = 2;  | 
1671  | 153  |   }  | 
1672  | 19.5k  |   int median = spacing_widths.median();  | 
1673  | 19.5k  |   return (median > minimum_reasonable_space) ? median : minimum_reasonable_space;  | 
1674  | 19.5k  | }  | 
1675  |  |  | 
1676  |  | // Return whether the first word on the after line can fit in the space at  | 
1677  |  | // the end of the before line (knowing which way the text is aligned and read).  | 
1678  |  | bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,  | 
1679  | 535k  |                            tesseract::ParagraphJustification justification) { | 
1680  | 535k  |   if (before.ri_->num_words == 0 || after.ri_->num_words == 0) { | 
1681  | 0  |     return true;  | 
1682  | 0  |   }  | 
1683  |  |  | 
1684  | 535k  |   if (justification == JUSTIFICATION_UNKNOWN) { | 
1685  | 0  |     tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n"); | 
1686  | 0  |   }  | 
1687  | 535k  |   int available_space;  | 
1688  | 535k  |   if (justification == JUSTIFICATION_CENTER) { | 
1689  | 0  |     available_space = before.lindent_ + before.rindent_;  | 
1690  | 535k  |   } else { | 
1691  | 535k  |     available_space = before.OffsideIndent(justification);  | 
1692  | 535k  |   }  | 
1693  | 535k  |   available_space -= before.ri_->average_interword_space;  | 
1694  |  |  | 
1695  | 535k  |   if (before.ri_->ltr) { | 
1696  | 535k  |     return after.ri_->lword_box.width() < available_space;  | 
1697  | 535k  |   }  | 
1698  | 0  |   return after.ri_->rword_box.width() < available_space;  | 
1699  | 535k  | }  | 
1700  |  |  | 
1701  |  | // Return whether the first word on the after line can fit in the space at  | 
1702  |  | // the end of the before line (not knowing which way the text goes) in a left  | 
1703  |  | // or right alignment.  | 
1704  | 22.7k  | bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after) { | 
1705  | 22.7k  |   if (before.ri_->num_words == 0 || after.ri_->num_words == 0) { | 
1706  | 0  |     return true;  | 
1707  | 0  |   }  | 
1708  |  |  | 
1709  | 22.7k  |   int available_space = before.lindent_;  | 
1710  | 22.7k  |   if (before.rindent_ > available_space) { | 
1711  | 6.87k  |     available_space = before.rindent_;  | 
1712  | 6.87k  |   }  | 
1713  | 22.7k  |   available_space -= before.ri_->average_interword_space;  | 
1714  |  |  | 
1715  | 22.7k  |   if (before.ri_->ltr) { | 
1716  | 22.7k  |     return after.ri_->lword_box.width() < available_space;  | 
1717  | 22.7k  |   }  | 
1718  | 0  |   return after.ri_->rword_box.width() < available_space;  | 
1719  | 22.7k  | }  | 
1720  |  |  | 
1721  | 20.5k  | static bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) { | 
1722  | 20.5k  |   if (before.ri_->ltr) { | 
1723  | 20.5k  |     return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea;  | 
1724  | 20.5k  |   } else { | 
1725  | 0  |     return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea;  | 
1726  | 0  |   }  | 
1727  | 20.5k  | }  | 
1728  |  |  | 
1729  |  | static bool LikelyParagraphStart(const RowScratchRegisters &before,  | 
1730  |  |                                  const RowScratchRegisters &after,  | 
1731  | 346k  |                                  tesseract::ParagraphJustification j) { | 
1732  | 346k  |   return before.ri_->num_words == 0 ||  | 
1733  | 346k  |          (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after));  | 
1734  | 346k  | }  | 
1735  |  |  | 
1736  |  | // Examine rows[start, end) and try to determine what sort of ParagraphModel  | 
1737  |  | // would fit them as a single paragraph.  | 
1738  |  | // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.  | 
1739  |  | // If the rows given could be a consistent start to a paragraph, set *consistent  | 
1740  |  | // true.  | 
1741  |  | static ParagraphModel InternalParagraphModelByOutline(  | 
1742  |  |     const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,  | 
1743  | 35.5k  |     bool *consistent) { | 
1744  | 35.5k  |   int ltr_line_count = 0;  | 
1745  | 188k  |   for (int i = start; i < end; i++) { | 
1746  | 153k  |     ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);  | 
1747  | 153k  |   }  | 
1748  | 35.5k  |   bool ltr = (ltr_line_count >= (end - start) / 2);  | 
1749  |  |  | 
1750  | 35.5k  |   *consistent = true;  | 
1751  | 35.5k  |   if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) { | 
1752  | 0  |     return ParagraphModel();  | 
1753  | 0  |   }  | 
1754  |  |  | 
1755  |  |   // Ensure the caller only passed us a region with a common rmargin and  | 
1756  |  |   // lmargin.  | 
1757  | 35.5k  |   int lmargin = (*rows)[start].lmargin_;  | 
1758  | 35.5k  |   int rmargin = (*rows)[start].rmargin_;  | 
1759  | 35.5k  |   int lmin, lmax, rmin, rmax, cmin, cmax;  | 
1760  | 35.5k  |   lmin = lmax = (*rows)[start + 1].lindent_;  | 
1761  | 35.5k  |   rmin = rmax = (*rows)[start + 1].rindent_;  | 
1762  | 35.5k  |   cmin = cmax = 0;  | 
1763  | 153k  |   for (int i = start + 1; i < end; i++) { | 
1764  | 117k  |     if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) { | 
1765  | 0  |       tprintf("Margins don't match! Software error.\n"); | 
1766  | 0  |       *consistent = false;  | 
1767  | 0  |       return ParagraphModel();  | 
1768  | 0  |     }  | 
1769  | 117k  |     UpdateRange((*rows)[i].lindent_, &lmin, &lmax);  | 
1770  | 117k  |     UpdateRange((*rows)[i].rindent_, &rmin, &rmax);  | 
1771  | 117k  |     UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);  | 
1772  | 117k  |   }  | 
1773  | 35.5k  |   int ldiff = lmax - lmin;  | 
1774  | 35.5k  |   int rdiff = rmax - rmin;  | 
1775  | 35.5k  |   int cdiff = cmax - cmin;  | 
1776  | 35.5k  |   if (rdiff > tolerance && ldiff > tolerance) { | 
1777  | 997  |     if (cdiff < tolerance * 2) { | 
1778  | 345  |       if (end - start < 3) { | 
1779  | 0  |         return ParagraphModel();  | 
1780  | 0  |       }  | 
1781  | 345  |       return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);  | 
1782  | 345  |     }  | 
1783  | 652  |     *consistent = false;  | 
1784  | 652  |     return ParagraphModel();  | 
1785  | 997  |   }  | 
1786  | 34.5k  |   if (end - start < 3) { // Don't return a model for two line paras. | 
1787  | 16.9k  |     return ParagraphModel();  | 
1788  | 16.9k  |   }  | 
1789  |  |  | 
1790  |  |   // These booleans keep us from saying something is aligned left when the body  | 
1791  |  |   // left variance is too large.  | 
1792  | 17.5k  |   bool body_admits_left_alignment = ldiff < tolerance;  | 
1793  | 17.5k  |   bool body_admits_right_alignment = rdiff < tolerance;  | 
1794  |  |  | 
1795  | 17.5k  |   ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,  | 
1796  | 17.5k  |                                              (lmin + lmax) / 2, tolerance);  | 
1797  | 17.5k  |   ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,  | 
1798  | 17.5k  |                                               (rmin + rmax) / 2, tolerance);  | 
1799  |  |  | 
1800  |  |   // These booleans keep us from having an indent on the "wrong side" for the  | 
1801  |  |   // first line.  | 
1802  | 17.5k  |   bool text_admits_left_alignment = ltr || left_model.is_flush();  | 
1803  | 17.5k  |   bool text_admits_right_alignment = !ltr || right_model.is_flush();  | 
1804  |  |  | 
1805  |  |   // At least one of the edges is less than tolerance in variance.  | 
1806  |  |   // If the other is obviously ragged, it can't be the one aligned to.  | 
1807  |  |   // [Note the last line is included in this raggedness.]  | 
1808  | 17.5k  |   if (tolerance < rdiff) { | 
1809  | 6.00k  |     if (body_admits_left_alignment && text_admits_left_alignment) { | 
1810  | 5.76k  |       return left_model;  | 
1811  | 5.76k  |     }  | 
1812  | 237  |     *consistent = false;  | 
1813  | 237  |     return ParagraphModel();  | 
1814  | 6.00k  |   }  | 
1815  | 11.5k  |   if (tolerance < ldiff) { | 
1816  | 1.51k  |     if (body_admits_right_alignment && text_admits_right_alignment) { | 
1817  | 1.23k  |       return right_model;  | 
1818  | 1.23k  |     }  | 
1819  | 281  |     *consistent = false;  | 
1820  | 281  |     return ParagraphModel();  | 
1821  | 1.51k  |   }  | 
1822  |  |  | 
1823  |  |   // At this point, we know the body text doesn't vary much on either side.  | 
1824  |  |  | 
1825  |  |   // If the first line juts out oddly in one direction or the other,  | 
1826  |  |   // that likely indicates the side aligned to.  | 
1827  | 10.0k  |   int first_left = (*rows)[start].lindent_;  | 
1828  | 10.0k  |   int first_right = (*rows)[start].rindent_;  | 
1829  |  |  | 
1830  | 10.0k  |   if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) { | 
1831  | 4.42k  |     return left_model;  | 
1832  | 4.42k  |   }  | 
1833  | 5.57k  |   if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) { | 
1834  | 0  |     return right_model;  | 
1835  | 0  |   }  | 
1836  |  |  | 
1837  | 5.57k  |   *consistent = false;  | 
1838  | 5.57k  |   return ParagraphModel();  | 
1839  | 5.57k  | }  | 
1840  |  |  | 
1841  |  | // Examine rows[start, end) and try to determine what sort of ParagraphModel  | 
1842  |  | // would fit them as a single paragraph.   If nothing fits,  | 
1843  |  | // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug  | 
1844  |  | // output if we're debugging.  | 
1845  |  | static ParagraphModel ParagraphModelByOutline(int debug_level,  | 
1846  |  |                                               const std::vector<RowScratchRegisters> *rows,  | 
1847  | 9.61k  |                                               int start, int end, int tolerance) { | 
1848  | 9.61k  |   bool unused_consistent;  | 
1849  | 9.61k  |   ParagraphModel retval =  | 
1850  | 9.61k  |       InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent);  | 
1851  | 9.61k  |   if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) { | 
1852  | 0  |     tprintf("Could not determine a model for this paragraph:\n"); | 
1853  | 0  |     PrintRowRange(*rows, start, end);  | 
1854  | 0  |   }  | 
1855  | 9.61k  |   return retval;  | 
1856  | 9.61k  | }  | 
1857  |  |  | 
1858  |  | // Do rows[start, end) form a single instance of the given paragraph model?  | 
1859  |  | bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,  | 
1860  | 0  |                   const ParagraphModel *model) { | 
1861  | 0  |   if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) { | 
1862  | 0  |     return false;  | 
1863  | 0  |   }  | 
1864  | 0  |   if (!ValidFirstLine(rows, start, model)) { | 
1865  | 0  |     return false;  | 
1866  | 0  |   }  | 
1867  | 0  |   for (int i = start + 1; i < end; i++) { | 
1868  | 0  |     if (!ValidBodyLine(rows, i, model)) { | 
1869  | 0  |       return false;  | 
1870  | 0  |     }  | 
1871  | 0  |   }  | 
1872  | 0  |   return true;  | 
1873  | 0  | }  | 
1874  |  |  | 
1875  |  | // Examine rows[row_start, row_end) as an independent section of text,  | 
1876  |  | // and mark rows that are exceptionally clear as start-of-paragraph  | 
1877  |  | // and paragraph-body lines.  | 
1878  |  | //  | 
1879  |  | // We presume that any lines surrounding rows[row_start, row_end) may  | 
1880  |  | // have wildly different paragraph models, so we don't key any data off  | 
1881  |  | // of those lines.  | 
1882  |  | //  | 
1883  |  | // We only take the very strongest signals, as we don't want to get  | 
1884  |  | // confused and marking up centered text, poetry, or source code as  | 
1885  |  | // clearly part of a typical paragraph.  | 
static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
                               int row_end) {
  // Record patently obvious body text.
  // A line is marked as body text if neither of its edge words reads like the
  // start of an idea AND its first word could not have fit at the end of the
  // previous line (i.e. the line break looks forced, not a paragraph break).
  for (int i = row_start + 1; i < row_end; i++) {
    const RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    // Judge "would the first word have fit?" against the side implied by the
    // previous line's text direction.
    tesseract::ParagraphJustification typical_justification =
        prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
        !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
      curr.SetBodyLine();
    }
  }

  // Record patently obvious start paragraph lines.
  //
  // It's an extremely good signal of the start of a paragraph that
  // the first word would have fit on the end of the previous line.
  // However, applying just that signal would have us mark random
  // start lines of lineated text (poetry and source code) and some
  // centered headings as paragraph start lines.  Therefore, we use
  // a second qualification for a paragraph start: Not only should
  // the first word of this line have fit on the previous line,
  // but also, this line should go full to the right of the block,
  // disallowing a subsequent word from having fit on this line.

  // First row:
  // There is no previous line, so instead require that one of this line's
  // edge words reads like the start of an idea.
  {
    RowScratchRegisters &curr = (*rows)[row_start];
    RowScratchRegisters &next = (*rows)[row_start + 1];
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
        (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
      curr.SetStartLine();
    }
  }
  // Middle rows
  // Full test: the line goes full to the edge (next line's first word would
  // not have fit) and textual/geometric clues say it starts a paragraph.
  for (int i = row_start + 1; i < row_end - 1; i++) {
    RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    RowScratchRegisters &next = (*rows)[i + 1];
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
  // Last row
  // There is no next line, so the "goes full to the edge" test is applied to
  // the line itself (curr passed for both arguments).
  { // the short circuit at the top means we have at least two lines.
    RowScratchRegisters &prev = (*rows)[row_end - 2];
    RowScratchRegisters &curr = (*rows)[row_end - 1];
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
}
1944  |  |  | 
1945  |  | // Look for sequences of a start line followed by some body lines in  | 
1946  |  | // rows[row_start, row_end) and create ParagraphModels for them if  | 
1947  |  | // they seem coherent.  | 
static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
                                int row_start, int row_end, bool allow_flush_models,
                                ParagraphTheory *theory) {
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
    return;
  }

  int start = row_start;
  while (start < row_end) {
    // Skip forward to the next line marked as a paragraph start.
    while (start < row_end && (*rows)[start].GetLineType() != LT_START) {
      start++;
    }
    // A start line needs at least one following line to form a paragraph.
    if (start >= row_end - 1) {
      break;
    }

    int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
    int end = start;
    ParagraphModel last_model;
    bool next_consistent;
    // Grow [start, end) one line at a time while the lines keep looking like
    // a single coherent paragraph.
    do {
      ++end;
      // rows[row, end) was consistent.
      // If rows[row, end + 1) is not consistent,
      //   just model rows[row, end)
      if (end < row_end - 1) {
        RowScratchRegisters &next = (*rows)[end];
        LineType lt = next.GetLineType();
        // The next line may be absorbed if it is already marked as body text,
        // or is unmarked and its first word could not have fit on the line
        // above it (suggesting a forced line break, not a new paragraph).
        next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN &&
                                            !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
      } else {
        next_consistent = false;
      }
      if (next_consistent) {
        // Re-derive a model for the extended range; the call also updates
        // next_consistent via its out-parameter.
        ParagraphModel next_model =
            InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
        // Stop growing if adding the line flips the deduced justification
        // away from the side the text's direction (ltr/rtl) favors.
        if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&
             next_model.justification() != JUSTIFICATION_LEFT) ||
            (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&
             next_model.justification() != JUSTIFICATION_RIGHT)) {
          next_consistent = false;
        }
        last_model = next_model;
      } else {
        next_consistent = false;
      }
    } while (next_consistent && end < row_end);
    // At this point, rows[start, end) looked like it could have been a
    // single paragraph.  If we can make a good ParagraphModel for it,
    // do so and mark this sequence with that model.
    if (end > start + 1) {
      // emit a new paragraph if we have more than one line.
      const ParagraphModel *model = nullptr;
      ParagraphModel new_model = ParagraphModelByOutline(
          debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end)));
      if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
        // couldn't create a good model, oh well.
      } else if (new_model.is_flush()) {
        if (end == start + 2) {
          // It's very likely we just got two paragraph starts in a row.
          end = start + 1;
        } else if (start == row_start) {
          // Mark this as a Crown.
          if (new_model.justification() == JUSTIFICATION_LEFT) {
            model = kCrownLeft;
          } else {
            model = kCrownRight;
          }
        } else if (allow_flush_models) {
          model = theory->AddModel(new_model);
        }
      } else {
        model = theory->AddModel(new_model);
      }
      // Mark the run: the first line is the paragraph start, the rest are
      // body lines of the same model.
      if (model) {
        (*rows)[start].AddStartLine(model);
        for (int i = start + 1; i < end; i++) {
          (*rows)[i].AddBodyLine(model);
        }
      }
    }
    start = end;
  }
}
2032  |  |  | 
2033  |  | // We examine rows[row_start, row_end) and do the following:  | 
2034  |  | //   (1) Clear all existing hypotheses for the rows being considered.  | 
2035  |  | //   (2) Mark up any rows as exceptionally likely to be paragraph starts  | 
2036  |  | //       or paragraph body lines as such using both geometric and textual  | 
2037  |  | //       clues.  | 
2038  |  | //   (3) Form models for any sequence of start + continuation lines.  | 
2039  |  | //   (4) Smear the paragraph models to cover surrounding text.  | 
2040  |  | static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,  | 
2041  | 14.4k  |                                    int row_start, int row_end, ParagraphTheory *theory) { | 
2042  | 14.4k  |   if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) { | 
2043  | 1.94k  |     return;  | 
2044  | 1.94k  |   }  | 
2045  |  |  | 
2046  | 12.5k  |   if (debug_level > 1) { | 
2047  | 0  |     tprintf("#############################################\n"); | 
2048  | 0  |     tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end); | 
2049  | 0  |     tprintf("#############################################\n"); | 
2050  | 0  |   }  | 
2051  |  |  | 
2052  | 12.5k  |   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);  | 
2053  | 12.5k  |   MarkStrongEvidence(rows, row_start, row_end);  | 
2054  |  |  | 
2055  | 12.5k  |   DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);  | 
2056  |  |  | 
2057  |  |   // Create paragraph models.  | 
2058  | 12.5k  |   ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);  | 
2059  |  |  | 
2060  | 12.5k  |   DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);  | 
2061  |  |  | 
2062  |  |   // At this point, some rows are marked up as paragraphs with model numbers,  | 
2063  |  |   // and some rows are marked up as either LT_START or LT_BODY.  Now let's  | 
2064  |  |   // smear any good paragraph hypotheses forward and backward.  | 
2065  | 12.5k  |   ParagraphModelSmearer smearer(rows, row_start, row_end, theory);  | 
2066  | 12.5k  |   smearer.Smear();  | 
2067  | 12.5k  | }  | 
2068  |  |  | 
2069  |  | static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,  | 
2070  | 13.9k  |                                       int row_end, ParagraphTheory *theory) { | 
2071  | 130k  |   for (int i = row_start + 1; i < row_end - 1; i++) { | 
2072  | 116k  |     if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&  | 
2073  | 116k  |         (*rows)[i + 1].ri_->has_leaders) { | 
2074  | 0  |       const ParagraphModel *model =  | 
2075  | 0  |           theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));  | 
2076  | 0  |       (*rows)[i].AddStartLine(model);  | 
2077  | 0  |     }  | 
2078  | 116k  |   }  | 
2079  | 13.9k  | }  | 
2080  |  |  | 
2081  |  | // Collect sequences of unique hypotheses in row registers and create proper  | 
2082  |  | // paragraphs for them, referencing the paragraphs in row_owners.  | 
2083  |  | static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,  | 
2084  |  |                                                      std::vector<RowScratchRegisters> &rows,  | 
2085  |  |                                                      std::vector<PARA *> *row_owners,  | 
2086  | 13.9k  |                                                      ParagraphTheory *theory) { | 
2087  | 13.9k  |   int end = rows.size();  | 
2088  | 13.9k  |   int start;  | 
2089  | 132k  |   for (; end > 0; end = start) { | 
2090  | 118k  |     start = end - 1;  | 
2091  | 118k  |     const ParagraphModel *model = nullptr;  | 
2092  |  |     // TODO(eger): Be smarter about dealing with multiple hypotheses.  | 
2093  | 118k  |     bool single_line_paragraph = false;  | 
2094  | 118k  |     SetOfModels models;  | 
2095  | 118k  |     rows[start].NonNullHypotheses(&models);  | 
2096  | 118k  |     if (!models.empty()) { | 
2097  | 8.76k  |       model = models[0];  | 
2098  | 8.76k  |       if (rows[start].GetLineType(model) != LT_BODY) { | 
2099  | 4.42k  |         single_line_paragraph = true;  | 
2100  | 4.42k  |       }  | 
2101  | 8.76k  |     }  | 
2102  | 118k  |     if (model && !single_line_paragraph) { | 
2103  |  |       // walk back looking for more body lines and then a start line.  | 
2104  | 23.9k  |       while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) { | 
2105  |  |         // do nothing  | 
2106  | 19.6k  |       }  | 
2107  | 4.34k  |       if (start < 0 || rows[start].GetLineType(model) != LT_START) { | 
2108  | 6  |         model = nullptr;  | 
2109  | 6  |       }  | 
2110  | 4.34k  |     }  | 
2111  | 118k  |     if (model == nullptr) { | 
2112  | 109k  |       continue;  | 
2113  | 109k  |     }  | 
2114  |  |     // rows[start, end) should be a paragraph.  | 
2115  | 8.75k  |     PARA *p = new PARA();  | 
2116  | 8.75k  |     if (model == kCrownLeft || model == kCrownRight) { | 
2117  | 1.87k  |       p->is_very_first_or_continuation = true;  | 
2118  |  |       // Crown paragraph.  | 
2119  |  |       //   If we can find an existing ParagraphModel that fits, use it,  | 
2120  |  |       //   else create a new one.  | 
2121  | 2.75k  |       for (unsigned row = end; row < rows.size(); row++) { | 
2122  | 2.03k  |         if ((*row_owners)[row] &&  | 
2123  | 2.03k  |             (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&  | 
2124  | 1.96k  |              (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) { | 
2125  | 1.16k  |           model = (*row_owners)[row]->model;  | 
2126  | 1.16k  |           break;  | 
2127  | 1.16k  |         }  | 
2128  | 2.03k  |       }  | 
2129  | 1.87k  |       if (model == kCrownLeft) { | 
2130  |  |         // No subsequent model fits, so cons one up.  | 
2131  | 595  |         model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT,  | 
2132  | 595  |                                                 rows[start].lmargin_ + rows[start].lindent_, 0, 0,  | 
2133  | 595  |                                                 Epsilon(rows[start].ri_->average_interword_space)));  | 
2134  | 1.28k  |       } else if (model == kCrownRight) { | 
2135  |  |         // No subsequent model fits, so cons one up.  | 
2136  | 123  |         model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT,  | 
2137  | 123  |                                                 rows[start].rmargin_ + rows[start].rmargin_, 0, 0,  | 
2138  | 123  |                                                 Epsilon(rows[start].ri_->average_interword_space)));  | 
2139  | 123  |       }  | 
2140  | 1.87k  |     }  | 
2141  | 8.75k  |     rows[start].SetUnknown();  | 
2142  | 8.75k  |     rows[start].AddStartLine(model);  | 
2143  | 32.7k  |     for (int i = start + 1; i < end; i++) { | 
2144  | 23.9k  |       rows[i].SetUnknown();  | 
2145  | 23.9k  |       rows[i].AddBodyLine(model);  | 
2146  | 23.9k  |     }  | 
2147  | 8.75k  |     p->model = model;  | 
2148  | 8.75k  |     p->has_drop_cap = rows[start].ri_->has_drop_cap;  | 
2149  | 8.75k  |     p->is_list_item = model->justification() == JUSTIFICATION_RIGHT  | 
2150  | 8.75k  |                           ? rows[start].ri_->rword_indicates_list_item  | 
2151  | 8.75k  |                           : rows[start].ri_->lword_indicates_list_item;  | 
2152  | 41.4k  |     for (int row = start; row < end; row++) { | 
2153  | 32.7k  |       if ((*row_owners)[row] != nullptr) { | 
2154  | 0  |         tprintf(  | 
2155  | 0  |             "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "  | 
2156  | 0  |             "more than once!\n");  | 
2157  | 0  |         delete (*row_owners)[row];  | 
2158  | 0  |       }  | 
2159  | 32.7k  |       (*row_owners)[row] = p;  | 
2160  | 32.7k  |     }  | 
2161  | 8.75k  |   }  | 
2162  | 13.9k  | }  | 
2163  |  |  | 
// A simple pair of row indices describing a range of rows.
// (LeftoverSegments builds these inclusively and then converts them to
// half-open form before returning.)
struct Interval {
  Interval() = default;
  Interval(int b, int e) : begin(b), end(e) {}

  int begin = 0;
  int end = 0;
};
2171  |  |  | 
2172  |  | // Return whether rows[row] appears to be stranded, meaning that the evidence  | 
2173  |  | // for this row is very weak due to context.  For instance, two lines of source  | 
2174  |  | // code may happen to be indented at the same tab vector as body text starts,  | 
2175  |  | // leading us to think they are two start-of-paragraph lines.  This is not  | 
2176  |  | // optimal.  However, we also don't want to mark a sequence of short dialog  | 
2177  |  | // as "weak," so our heuristic is:  | 
2178  |  | //   (1) If a line is surrounded by lines of unknown type, it's weak.  | 
2179  |  | //   (2) If two lines in a row are start lines for a given paragraph type, but  | 
2180  |  | //       after that the same paragraph type does not continue, they're weak.  | 
2181  | 25.3k  | static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) { | 
2182  | 25.3k  |   SetOfModels row_models;  | 
2183  | 25.3k  |   rows[row].StrongHypotheses(&row_models);  | 
2184  |  |  | 
2185  | 25.3k  |   for (auto &row_model : row_models) { | 
2186  | 25.3k  |     bool all_starts = rows[row].GetLineType();  | 
2187  | 25.3k  |     int run_length = 1;  | 
2188  | 25.3k  |     bool continues = true;  | 
2189  | 224k  |     for (int i = row - 1; i >= 0 && continues; i--) { | 
2190  | 198k  |       SetOfModels models;  | 
2191  | 198k  |       rows[i].NonNullHypotheses(&models);  | 
2192  | 198k  |       switch (rows[i].GetLineType(row_model)) { | 
2193  | 56.8k  |         case LT_START:  | 
2194  | 56.8k  |           run_length++;  | 
2195  | 56.8k  |           break;  | 
2196  | 38  |         case LT_MULTIPLE: // explicit fall-through  | 
2197  | 137k  |         case LT_BODY:  | 
2198  | 137k  |           run_length++;  | 
2199  | 137k  |           all_starts = false;  | 
2200  | 137k  |           break;  | 
2201  | 4.04k  |         case LT_UNKNOWN: // explicit fall-through  | 
2202  | 4.04k  |         default:  | 
2203  | 4.04k  |           continues = false;  | 
2204  | 198k  |       }  | 
2205  | 198k  |     }  | 
2206  | 25.3k  |     continues = true;  | 
2207  | 190k  |     for (unsigned i = row + 1; i < rows.size() && continues; i++) { | 
2208  | 165k  |       SetOfModels models;  | 
2209  | 165k  |       rows[i].NonNullHypotheses(&models);  | 
2210  | 165k  |       switch (rows[i].GetLineType(row_model)) { | 
2211  | 45.3k  |         case LT_START:  | 
2212  | 45.3k  |           run_length++;  | 
2213  | 45.3k  |           break;  | 
2214  | 22  |         case LT_MULTIPLE: // explicit fall-through  | 
2215  | 114k  |         case LT_BODY:  | 
2216  | 114k  |           run_length++;  | 
2217  | 114k  |           all_starts = false;  | 
2218  | 114k  |           break;  | 
2219  | 5.47k  |         case LT_UNKNOWN: // explicit fall-through  | 
2220  | 5.47k  |         default:  | 
2221  | 5.47k  |           continues = false;  | 
2222  | 165k  |       }  | 
2223  | 165k  |     }  | 
2224  | 25.3k  |     if (run_length > 2 || (!all_starts && run_length > 1)) { | 
2225  | 25.0k  |       return false;  | 
2226  | 25.0k  |     }  | 
2227  | 25.3k  |   }  | 
2228  | 328  |   return true;  | 
2229  | 25.3k  | }  | 
2230  |  |  | 
2231  |  | // Go through rows[row_start, row_end) and gather up sequences that need better  | 
2232  |  | // classification.  | 
2233  |  | // + Sequences of non-empty rows without hypotheses.  | 
2234  |  | // + Crown paragraphs not immediately followed by a strongly modeled line.  | 
2235  |  | // + Single line paragraphs surrounded by text that doesn't match the  | 
2236  |  | //   model.  | 
static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
                             std::vector<Interval> *to_fix, int row_start, int row_end) {
  to_fix->clear();
  for (int i = row_start; i < row_end; i++) {
    bool needs_fixing = false;

    SetOfModels models;
    SetOfModels models_w_crowns;
    rows[i].StrongHypotheses(&models);
    rows[i].NonNullHypotheses(&models_w_crowns);
    if (models.empty() && !models_w_crowns.empty()) {
      // Crown paragraph.  Is it followed by a modeled line?
      // Scan ahead: hitting a strongly modeled line validates the crown;
      // hitting an unmodeled line first means the crown needs reprocessing.
      for (unsigned end = i + 1; end < rows.size(); end++) {
        SetOfModels end_models;
        SetOfModels strong_end_models;
        rows[end].NonNullHypotheses(&end_models);
        rows[end].StrongHypotheses(&strong_end_models);
        if (end_models.empty()) {
          needs_fixing = true;
          break;
        } else if (!strong_end_models.empty()) {
          needs_fixing = false;
          break;
        }
      }
    } else if (models.empty() && rows[i].ri_->num_words > 0) {
      // No models at all.
      needs_fixing = true;
    }

    // Even a strongly modeled row is suspect when its evidence is stranded
    // by its context (see RowIsStranded).
    if (!needs_fixing && !models.empty()) {
      needs_fixing = RowIsStranded(rows, i);
    }

    // Accumulate consecutive rows needing fixes into one interval; the
    // intervals are built inclusive here and converted below.
    if (needs_fixing) {
      if (!to_fix->empty() && to_fix->back().end == i - 1) {
        to_fix->back().end = i;
      } else {
        to_fix->push_back(Interval(i, i));
      }
    }
  }
  // Convert inclusive intervals to half-open intervals.
  for (auto &i : *to_fix) {
    i.end = i.end + 1;
  }
}
2284  |  |  | 
2285  |  | // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),  | 
2286  |  | // normalize each row_owner to point to an actual PARA, and output the  | 
2287  |  | // paragraphs in order onto paragraphs.  | 
2288  | 13.9k  | void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) { | 
2289  | 13.9k  |   std::vector<PARA *> &rows = *row_owners;  | 
2290  | 13.9k  |   paragraphs->clear();  | 
2291  | 13.9k  |   PARA_IT out(paragraphs);  | 
2292  | 13.9k  |   PARA *formerly_null = nullptr;  | 
2293  | 156k  |   for (unsigned i = 0; i < rows.size(); i++) { | 
2294  | 142k  |     if (rows[i] == nullptr) { | 
2295  | 110k  |       if (i == 0 || rows[i - 1] != formerly_null) { | 
2296  | 11.9k  |         rows[i] = formerly_null = new PARA();  | 
2297  | 98.0k  |       } else { | 
2298  | 98.0k  |         rows[i] = formerly_null;  | 
2299  | 98.0k  |         continue;  | 
2300  | 98.0k  |       }  | 
2301  | 110k  |     } else if (i > 0 && rows[i - 1] == rows[i]) { | 
2302  | 23.9k  |       continue;  | 
2303  | 23.9k  |     }  | 
2304  | 20.7k  |     out.add_after_then_move(rows[i]);  | 
2305  | 20.7k  |   }  | 
2306  | 13.9k  | }  | 
2307  |  |  | 
// Main entry point for Paragraph Detection Algorithm.
//
// Given a set of equally spaced textlines (described by row_infos),
// Split them into paragraphs.
//
// Output:
//   row_owners - one pointer for each row, to the paragraph it belongs to.
//   paragraphs - this is the actual list of PARA objects.
//   models - the list of paragraph models referenced by the PARA objects.
//            caller is responsible for deleting the models.
//
// The passes below run in a fixed order; each pass consumes only the row
// intervals the previous passes failed to classify (via LeftoverSegments),
// so reordering them would change the result.
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
                      std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
                      std::vector<ParagraphModel *> *models) {
  // theory accumulates every hypothesized ParagraphModel; it writes the
  // models it keeps into *models, which the caller owns and must delete.
  ParagraphTheory theory(models);

  // Initialize row_owners to be a bunch of nullptr pointers.
  row_owners->clear();
  row_owners->resize(row_infos->size());

  // Set up row scratch registers for the main algorithm.
  std::vector<RowScratchRegisters> rows(row_infos->size());
  for (unsigned i = 0; i < row_infos->size(); i++) {
    rows[i].Init((*row_infos)[i]);
  }

  // Pass 1:
  //   Detect sequences of lines that all contain leader dots (.....)
  //   These are likely Tables of Contents.  If there are three text lines in
  //   a row with leader dots, it's pretty safe to say the middle one should
  //   be a paragraph of its own.
  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);

  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);

  std::vector<Interval> leftovers;
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (auto &leftover : leftovers) {
    // Pass 2a:
    //   Find any strongly evidenced start-of-paragraph lines.  If they're
    //   followed by two lines that look like body lines, make a paragraph
    //   model for that and see if that model applies throughout the text
    //   (that is, "smear" it).
    StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);

    // Pass 2b:
    //   If we had any luck in pass 2a, we got part of the page and didn't
    //   know how to classify a few runs of rows. Take the segments that
    //   didn't find a model and reprocess them individually.
    std::vector<Interval> leftovers2;
    LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
    // Pass 2a "was useful" if it split the segment at all: either it left
    // more than one unclassified interval, or the single remaining interval
    // no longer spans the entire page (begin/end differ from [0, rows.size())).
    bool pass2a_was_useful =
        leftovers2.size() > 1 ||
        (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
    if (pass2a_was_useful) {
      for (auto &leftover2 : leftovers2) {
        StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
      }
    }
  }

  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);

  // Pass 3:
  //   These are the dregs for which we didn't have enough strong textual
  //   and geometric clues to form matching models for.  Let's see if
  //   the geometric clues are simple enough that we could just use those.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (auto &leftover : leftovers) {
    GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
  }

  // Undo any flush models for which there's little evidence.
  DowngradeWeakestToCrowns(debug_level, &theory, &rows);

  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);

  // Pass 4:
  //   Take everything that's still not marked up well and clear all markings.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (auto &leftover : leftovers) {
    for (int j = leftover.begin; j < leftover.end; j++) {
      rows[j].SetUnknown();
    }
  }

  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);

  // Convert all of the unique hypothesis runs to PARAs.
  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);

  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);

  // Finally, clean up any dangling nullptr row paragraph parents.
  CanonicalizeDetectionResults(row_owners, paragraphs);
}
2403  |  |  | 
2404  |  | // ============ Code interfacing with the rest of Tesseract ==================  | 
2405  |  |  | 
2406  | 0  | static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) { | 
2407  |  |   // Set up text, lword_text, and rword_text (mostly for debug printing).  | 
2408  | 0  |   std::string fake_text;  | 
2409  | 0  |   PageIterator pit(static_cast<const PageIterator &>(it));  | 
2410  | 0  |   if (!pit.Empty(RIL_WORD)) { | 
2411  | 0  |     bool first_word = true;  | 
2412  | 0  |     do { | 
2413  | 0  |       fake_text += "x";  | 
2414  | 0  |       if (first_word) { | 
2415  | 0  |         info->lword_text += "x";  | 
2416  | 0  |       }  | 
2417  | 0  |       info->rword_text += "x";  | 
2418  | 0  |       if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&  | 
2419  | 0  |           !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) { | 
2420  | 0  |         fake_text += " ";  | 
2421  | 0  |         info->rword_text = "";  | 
2422  | 0  |         first_word = false;  | 
2423  | 0  |       }  | 
2424  | 0  |     } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL));  | 
2425  | 0  |   }  | 
2426  | 0  |   if (fake_text.empty()) { | 
2427  | 0  |     return;  | 
2428  | 0  |   }  | 
2429  |  |  | 
2430  | 0  |   int lspaces = info->pix_ldistance / info->average_interword_space;  | 
2431  | 0  |   for (int i = 0; i < lspaces; i++) { | 
2432  | 0  |     info->text += ' ';  | 
2433  | 0  |   }  | 
2434  | 0  |   info->text += fake_text;  | 
2435  |  |  | 
2436  |  |   // Set up lword_box, rword_box, and num_words.  | 
2437  | 0  |   PAGE_RES_IT page_res_it = *it.PageResIt();  | 
2438  | 0  |   WERD_RES *word_res = page_res_it.restart_row();  | 
2439  | 0  |   ROW_RES *this_row = page_res_it.row();  | 
2440  |  | 
  | 
2441  | 0  |   WERD_RES *lword = nullptr;  | 
2442  | 0  |   WERD_RES *rword = nullptr;  | 
2443  | 0  |   info->num_words = 0;  | 
2444  | 0  |   do { | 
2445  | 0  |     if (word_res) { | 
2446  | 0  |       if (!lword) { | 
2447  | 0  |         lword = word_res;  | 
2448  | 0  |       }  | 
2449  | 0  |       if (rword != word_res) { | 
2450  | 0  |         info->num_words++;  | 
2451  | 0  |       }  | 
2452  | 0  |       rword = word_res;  | 
2453  | 0  |     }  | 
2454  | 0  |     word_res = page_res_it.forward();  | 
2455  | 0  |   } while (page_res_it.row() == this_row);  | 
2456  |  | 
  | 
2457  | 0  |   if (lword) { | 
2458  | 0  |     info->lword_box = lword->word->bounding_box();  | 
2459  | 0  |   }  | 
2460  | 0  |   if (rword) { | 
2461  | 0  |     info->rword_box = rword->word->bounding_box();  | 
2462  | 0  |   }  | 
2463  | 0  | }  | 
2464  |  |  | 
// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
// detector RowInfo with all relevant information from the row.
//
// after_recognition selects between two text sources: real recognized text
// (below) or placeholder text built pre-recognition
// (InitializeTextAndBoxesPreRecognition).
static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) {
  // Geometric attributes come straight from the ROW when one exists;
  // otherwise fall back to harmless defaults (1 avoids divide-by-zero in
  // the spacing computations below).
  if (it.PageResIt()->row() != nullptr) {
    ROW *row = it.PageResIt()->row()->row;
    info->pix_ldistance = row->lmargin();
    info->pix_rdistance = row->rmargin();
    info->average_interword_space =
        row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
    info->pix_xheight = row->x_height();
    info->has_leaders = false;
    info->has_drop_cap = row->has_drop_cap();
    info->ltr = true; // set below depending on word scripts
  } else {
    info->pix_ldistance = info->pix_rdistance = 0;
    info->average_interword_space = 1;
    info->pix_xheight = 1.0;
    info->has_leaders = false;
    info->has_drop_cap = false;
    info->ltr = true;
  }

  // Word-derived attributes start pessimistic and are filled in below.
  info->num_words = 0;
  info->lword_indicates_list_item = false;
  info->lword_likely_starts_idea = false;
  info->lword_likely_ends_idea = false;
  info->rword_indicates_list_item = false;
  info->rword_likely_starts_idea = false;
  info->rword_likely_ends_idea = false;
  // NOTE(review): has_leaders and ltr were already set in both branches
  // above; these re-assignments look redundant — confirm before removing.
  info->has_leaders = false;
  info->ltr = true;

  if (!after_recognition) {
    InitializeTextAndBoxesPreRecognition(it, info);
    return;
  }
  // Build info->text: left-indent spaces followed by the recognized line
  // text with trailing whitespace stripped.
  info->text = "";
  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
  int trailing_ws_idx = strlen(text.get()); // strip trailing space
  while (trailing_ws_idx > 0 &&
         // isspace() only takes ASCII
         isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
    trailing_ws_idx--;
  }
  if (trailing_ws_idx > 0) {
    // One leading space per interword-space-width of left indent, so that
    // indentation survives into the text representation.
    int lspaces = info->pix_ldistance / info->average_interword_space;
    for (int i = 0; i < lspaces; i++) {
      info->text += ' ';
    }
    for (int i = 0; i < trailing_ws_idx; i++) {
      info->text += text[i];
    }
  }

  if (info->text.empty()) {
    return;
  }

  // Walk the row's recognized words to collect script direction votes,
  // leader (W_REP_CHAR) counts, and the first/last words of the line.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  std::vector<WERD_RES *> werds;
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();
  int num_leaders = 0;
  int ltr = 0;
  int rtl = 0;
  do {
    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
      werds.push_back(word_res);
      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
      if (word_res->word->flag(W_REP_CHAR)) {
        num_leaders++;
      }
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);
  info->ltr = ltr >= rtl;           // majority vote; ties count as LTR
  info->has_leaders = num_leaders > 3;
  info->num_words = werds.size();
  if (!werds.empty()) {
    // Classify the leftmost and rightmost words (list-item markers,
    // sentence starts/ends) for the textual paragraph clues.
    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
    info->lword_text = lword->best_choice->unichar_string().c_str();
    info->rword_text = rword->best_choice->unichar_string().c_str();
    info->lword_box = lword->word->bounding_box();
    info->rword_box = rword->word->bounding_box();
    LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text,
                       &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
                       &info->lword_likely_ends_idea);
    RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text,
                        &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
                        &info->rword_likely_ends_idea);
  }
}
2558  |  |  | 
2559  |  | // This is called after rows have been identified and words are recognized.  | 
2560  |  | // Much of this could be implemented before word recognition, but text helps  | 
2561  |  | // to identify bulleted lists and gives good signals for sentence boundaries.  | 
2562  |  | void DetectParagraphs(int debug_level, bool after_text_recognition,  | 
2563  | 13.9k  |                       const MutableIterator *block_start, std::vector<ParagraphModel *> *models) { | 
2564  |  |   // Clear out any preconceived notions.  | 
2565  | 13.9k  |   if (block_start->Empty(RIL_TEXTLINE)) { | 
2566  | 7  |     return;  | 
2567  | 7  |   }  | 
2568  | 13.9k  |   BLOCK *block = block_start->PageResIt()->block()->block;  | 
2569  | 13.9k  |   block->para_list()->clear();  | 
2570  | 13.9k  |   bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();  | 
2571  |  |  | 
2572  |  |   // Convert the Tesseract structures to RowInfos  | 
2573  |  |   // for the paragraph detection algorithm.  | 
2574  | 13.9k  |   MutableIterator row(*block_start);  | 
2575  | 13.9k  |   if (row.Empty(RIL_TEXTLINE)) { | 
2576  | 0  |     return; // end of input already.  | 
2577  | 0  |   }  | 
2578  |  |  | 
2579  | 13.9k  |   std::vector<RowInfo> row_infos;  | 
2580  | 142k  |   do { | 
2581  | 142k  |     if (!row.PageResIt()->row()) { | 
2582  | 0  |       continue; // empty row.  | 
2583  | 0  |     }  | 
2584  | 142k  |     row.PageResIt()->row()->row->set_para(nullptr);  | 
2585  | 142k  |     row_infos.emplace_back();  | 
2586  | 142k  |     RowInfo &ri = row_infos.back();  | 
2587  | 142k  |     InitializeRowInfo(after_text_recognition, row, &ri);  | 
2588  | 142k  |   } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE));  | 
2589  |  |  | 
2590  |  |   // If we're called before text recognition, we might not have  | 
2591  |  |   // tight block bounding boxes, so trim by the minimum on each side.  | 
2592  | 13.9k  |   if (!row_infos.empty()) { | 
2593  | 13.9k  |     int min_lmargin = row_infos[0].pix_ldistance;  | 
2594  | 13.9k  |     int min_rmargin = row_infos[0].pix_rdistance;  | 
2595  | 142k  |     for (unsigned i = 1; i < row_infos.size(); i++) { | 
2596  | 128k  |       if (row_infos[i].pix_ldistance < min_lmargin) { | 
2597  | 1.94k  |         min_lmargin = row_infos[i].pix_ldistance;  | 
2598  | 1.94k  |       }  | 
2599  | 128k  |       if (row_infos[i].pix_rdistance < min_rmargin) { | 
2600  | 4.08k  |         min_rmargin = row_infos[i].pix_rdistance;  | 
2601  | 4.08k  |       }  | 
2602  | 128k  |     }  | 
2603  | 13.9k  |     if (min_lmargin > 0 || min_rmargin > 0) { | 
2604  | 66.2k  |       for (auto &row_info : row_infos) { | 
2605  | 66.2k  |         row_info.pix_ldistance -= min_lmargin;  | 
2606  | 66.2k  |         row_info.pix_rdistance -= min_rmargin;  | 
2607  | 66.2k  |       }  | 
2608  | 6.76k  |     }  | 
2609  | 13.9k  |   }  | 
2610  |  |  | 
2611  |  |   // Run the paragraph detection algorithm.  | 
2612  | 13.9k  |   std::vector<PARA *> row_owners;  | 
2613  | 13.9k  |   if (!is_image_block) { | 
2614  | 13.9k  |     DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);  | 
2615  | 13.9k  |   } else { | 
2616  | 0  |     row_owners.resize(row_infos.size());  | 
2617  | 0  |     CanonicalizeDetectionResults(&row_owners, block->para_list());  | 
2618  | 0  |   }  | 
2619  |  |  | 
2620  |  |   // Now stitch in the row_owners into the rows.  | 
2621  | 13.9k  |   row = *block_start;  | 
2622  | 142k  |   for (auto &row_owner : row_owners) { | 
2623  | 142k  |     while (!row.PageResIt()->row()) { | 
2624  | 0  |       row.Next(RIL_TEXTLINE);  | 
2625  | 0  |     }  | 
2626  | 142k  |     row.PageResIt()->row()->row->set_para(row_owner);  | 
2627  | 142k  |     row.Next(RIL_TEXTLINE);  | 
2628  | 142k  |   }  | 
2629  | 13.9k  | }  | 
2630  |  |  | 
2631  |  | } // namespace tesseract  |