/src/tesseract/src/ccmain/paragraphs.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: paragraphs.cpp |
3 | | * Description: Paragraph detection for tesseract. |
4 | | * Author: David Eger |
5 | | * |
6 | | * (C) Copyright 2011, Google Inc. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #include "paragraphs.h" |
20 | | |
21 | | #include "helpers.h" // for UpdateRange, ClipToRange |
22 | | #include "host.h" // for NearlyEqual |
23 | | #include "mutableiterator.h" // for MutableIterator |
24 | | #include "ocrblock.h" // for BLOCK |
25 | | #include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA... |
26 | | #include "ocrrow.h" // for ROW |
27 | | #include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO... |
28 | | #include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels |
29 | | #include "pdblock.h" // for PDBLK |
30 | | #include "polyblk.h" // for POLY_BLOCK |
31 | | #include "ratngs.h" // for WERD_CHOICE |
32 | | #include "rect.h" // for TBOX |
33 | | #include "statistc.h" // for STATS |
34 | | #include "tesserrstream.h" // for tesserr |
35 | | #include "tprintf.h" // for tprintf |
36 | | #include "unicharset.h" // for UNICHARSET |
37 | | #include "werd.h" // for WERD, W_REP_CHAR |
38 | | |
39 | | #include <tesseract/pageiterator.h> // for PageIterator |
40 | | #include <tesseract/publictypes.h> // for JUSTIFICATION_LEFT, JUSTIFICATION_R... |
41 | | #include <tesseract/unichar.h> // for UNICHAR, UNICHAR_ID |
42 | | |
43 | | #include <algorithm> // for max |
44 | | #include <cctype> // for isspace |
45 | | #include <cmath> // for abs |
46 | | #include <cstdio> // for snprintf |
47 | | #include <cstdlib> // for abs |
48 | | #include <cstring> // for strchr, strlen |
49 | | #include <memory> // for unique_ptr |
50 | | |
// Unicode bidi control characters used to wrap RTL text for debug output.
// Fix: kRLE was "\u202A" (U+202A LEFT-TO-RIGHT EMBEDDING), which contradicts
// both the constant's name and its use in RtlEmbed() to wrap right-to-left
// words; U+202B is the actual RIGHT-TO-LEFT EMBEDDING character.
static const char *const kRLE = "\u202B"; // Right-to-Left Embedding
static const char *const kPDF = "\u202C"; // Pop Directional Formatting
53 | | |
54 | | namespace tesseract { |
55 | | |
// Special "weak" ParagraphModels.
// These are sentinel pointer values compared only by identity and never
// dereferenced (note the bogus 0xDEAD... addresses).  They tag row
// hypotheses that look paragraph-initial before any concrete model has been
// fit; StrongModel() elsewhere treats them as non-strong.
// NOTE(review): presumably "left"/"right" reflects the flush alignment of
// the marked paragraph -- confirm against the sites that assign them.
const ParagraphModel *kCrownLeft =
    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
const ParagraphModel *kCrownRight =
    reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
61 | | |
62 | | // Do the text and geometry of two rows support a paragraph break between them? |
63 | | static bool LikelyParagraphStart(const RowScratchRegisters &before, |
64 | | const RowScratchRegisters &after, |
65 | | tesseract::ParagraphJustification j); |
66 | | |
// Given the width of a typical space between words, return the threshold
// by which left and right alignments for paragraphs may vary and still be
// considered aligned (80% of a space width).
static int Epsilon(int space_pix) {
  const int tolerance = space_pix * 4 / 5;
  return tolerance;
}
73 | | |
74 | | static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name, |
75 | | const std::vector<RowScratchRegisters> *rows, int row_start, |
76 | 123k | int row_end) { |
77 | 123k | if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) { |
78 | 0 | tesserr << "Invalid arguments rows[" << row_start << ", " << row_end |
79 | 0 | << ") while rows is of size " << rows->size() << ".\n"; |
80 | 0 | return false; |
81 | 0 | } |
82 | 123k | if (row_end - row_start < min_num_rows) { |
83 | 6.06k | if (debug_level > 1) { |
84 | 0 | tprintf("# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name); |
85 | 0 | } |
86 | 6.06k | return false; |
87 | 6.06k | } |
88 | 117k | return true; |
89 | 123k | } |
90 | | |
91 | | // =============================== Debug Code ================================ |
92 | | |
// Given a row-major matrix of unicode text and a column separator, print
// a formatted table. For ASCII, we get good column alignment.
static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
  // First pass: find the widest cell per column, measured in UTF-8 code
  // points rather than bytes (continuation bytes match the 10xxxxxx pattern
  // and are not counted).
  std::vector<int> max_col_widths;
  for (const auto &row : rows) {
    auto num_columns = row.size();
    for (size_t c = 0; c < num_columns; c++) {
      int num_unicodes = 0;
      for (char i : row[c]) {
        if ((i & 0xC0) != 0x80) {
          num_unicodes++;
        }
      }
      if (c >= max_col_widths.size()) {
        // First row that has this many columns: start tracking it.
        max_col_widths.push_back(num_unicodes);
      } else {
        if (num_unicodes > max_col_widths[c]) {
          max_col_widths[c] = num_unicodes;
        }
      }
    }
  }

  // Build a left-justified "%-<width>s" printf pattern for each column.
  std::vector<std::string> col_width_patterns;
  col_width_patterns.reserve(max_col_widths.size());
  for (int max_col_width : max_col_widths) {
    col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_width) + "s");
  }

  // Second pass: print every row, separating cells with |colsep|.
  // NOTE(review): the pattern is a non-literal format string; safe here only
  // because it is constructed above exclusively from "%-<n>s".
  for (const auto &row : rows) {
    for (unsigned c = 0; c < row.size(); c++) {
      if (c > 0) {
        tprintf("%s", colsep);
      }
      tprintf(col_width_patterns[c].c_str(), row[c].c_str());
    }
    tprintf("\n");
  }
}
132 | | |
133 | 0 | static std::string RtlEmbed(const std::string &word, bool rtlify) { |
134 | 0 | if (rtlify) { |
135 | 0 | return std::string(kRLE) + word + std::string(kPDF); |
136 | 0 | } |
137 | 0 | return word; |
138 | 0 | } |
139 | | |
// Print the current thoughts of the paragraph detector.
static void PrintDetectorState(const ParagraphTheory &theory,
                               const std::vector<RowScratchRegisters> &rows) {
  std::vector<std::vector<std::string>> output;
  // Header row for the table printed below.
  output.emplace_back();
  output.back().push_back("#row");
  output.back().push_back("space");
  output.back().push_back("..");
  output.back().push_back("lword[widthSEL]");
  output.back().push_back("rword[widthSEL]");
  RowScratchRegisters::AppendDebugHeaderFields(output.back());
  output.back().push_back("text");

  // One table row per text row: index, interword space, leader marker,
  // left/right word summaries, scratch-register state, then the row text.
  // In the word summaries, upper case flags the attribute as present:
  // S/s = starts idea, E/e = ends idea, L/l = list item.
  for (unsigned i = 0; i < rows.size(); i++) {
    output.emplace_back();
    std::vector<std::string> &row = output.back();
    const RowInfo &ri = *rows[i].ri_;
    row.push_back(std::to_string(i));
    row.push_back(std::to_string(ri.average_interword_space));
    row.emplace_back(ri.has_leaders ? ".." : " ");
    row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +
                  (ri.lword_likely_starts_idea ? "S" : "s") +
                  (ri.lword_likely_ends_idea ? "E" : "e") +
                  (ri.lword_indicates_list_item ? "L" : "l") + "]");
    row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +
                  (ri.rword_likely_starts_idea ? "S" : "s") +
                  (ri.rword_likely_ends_idea ? "E" : "e") +
                  (ri.rword_indicates_list_item ? "L" : "l") + "]");
    rows[i].AppendDebugInfo(theory, row);
    row.push_back(RtlEmbed(ri.text, !ri.ltr));
  }
  PrintTable(output, " ");

  tprintf("Active Paragraph Models:\n");
  unsigned m = 0;
  // NOTE(review): %d paired with an unsigned argument; harmless for small
  // model counts but %u would match the type exactly.
  for (const auto &model : theory.models()) {
    tprintf(" %d: %s\n", ++m, model->ToString().c_str());
  }
}
179 | | |
180 | | static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory, |
181 | 94.7k | const std::vector<RowScratchRegisters> &rows) { |
182 | 94.7k | if (!should_print) { |
183 | 94.7k | return; |
184 | 94.7k | } |
185 | 0 | tprintf("# %s\n", phase); |
186 | 0 | PrintDetectorState(theory, rows); |
187 | 0 | } |
188 | | |
189 | | // Print out the text for rows[row_start, row_end) |
190 | | static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start, |
191 | 0 | int row_end) { |
192 | 0 | tprintf("======================================\n"); |
193 | 0 | for (int row = row_start; row < row_end; row++) { |
194 | 0 | tprintf("%s\n", rows[row].ri_->text.c_str()); |
195 | 0 | } |
196 | 0 | tprintf("======================================\n"); |
197 | 0 | } |
198 | | |
199 | | // ============= Brain Dead Language Model (ASCII Version) =================== |
200 | | |
// True for an unaccented ASCII Latin letter, either case.
static bool IsLatinLetter(int ch) {
  const bool lower = 'a' <= ch && ch <= 'z';
  const bool upper = 'A' <= ch && ch <= 'Z';
  return lower || upper;
}
204 | | |
// True for letters commonly OCR-confused with digits (o/O for 0, l/I for 1).
static bool IsDigitLike(int ch) {
  switch (ch) {
    case 'o':
    case 'O':
    case 'l':
    case 'I':
      return true;
    default:
      return false;
  }
}
208 | | |
// True for punctuation that typically opens a quotation or bracketed span.
static bool IsOpeningPunct(int ch) {
  static const char kOpeners[] = "'\"({[";
  return strchr(kOpeners, ch) != nullptr;
}
212 | | |
// True for punctuation that typically closes a sentence or bracketed span.
static bool IsTerminalPunct(int ch) {
  static const char kClosers[] = ":'\".?!]})";
  return strchr(kClosers, ch) != nullptr;
}
216 | | |
// Return a pointer past the longest prefix of |str| made solely of
// characters drawn from |toskip|.
static const char *SkipChars(const char *str, const char *toskip) {
  for (; *str != '\0' && strchr(toskip, *str) != nullptr; ++str) {
  }
  return str;
}
224 | | |
// Return a pointer past the longest prefix of |str| whose characters all
// satisfy the predicate |skip|.
static const char *SkipChars(const char *str, bool (*skip)(int)) {
  for (; *str != '\0' && skip(*str); ++str) {
  }
  return str;
}
231 | | |
// Advance past at most one leading character of |str| contained in |toskip|.
static const char *SkipOne(const char *str, const char *toskip) {
  const bool matches = *str != '\0' && strchr(toskip, *str) != nullptr;
  return matches ? str + 1 : str;
}
238 | | |
239 | | // Return whether it is very likely that this is a numeral marker that could |
240 | | // start a list item. Some examples include: |
241 | | // A I iii. VI (2) 3.5. [C-4] |
242 | 0 | static bool LikelyListNumeral(const std::string &word) { |
243 | 0 | const char *kRomans = "ivxlmdIVXLMD"; |
244 | 0 | const char *kDigits = "012345789"; |
245 | 0 | const char *kOpen = "[{("; |
246 | 0 | const char *kSep = ":;-.,"; |
247 | 0 | const char *kClose = "]})"; |
248 | |
|
249 | 0 | int num_segments = 0; |
250 | 0 | const char *pos = word.c_str(); |
251 | 0 | while (*pos != '\0' && num_segments < 3) { |
252 | | // skip up to two open parens. |
253 | 0 | const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen); |
254 | 0 | const char *numeral_end = SkipChars(numeral_start, kRomans); |
255 | 0 | if (numeral_end != numeral_start) { |
256 | | // Got Roman Numeral. Great. |
257 | 0 | } else { |
258 | 0 | numeral_end = SkipChars(numeral_start, kDigits); |
259 | 0 | if (numeral_end == numeral_start) { |
260 | | // If there's a single latin letter, we can use that. |
261 | 0 | numeral_end = SkipChars(numeral_start, IsLatinLetter); |
262 | 0 | if (numeral_end - numeral_start != 1) { |
263 | 0 | break; |
264 | 0 | } |
265 | 0 | } |
266 | 0 | } |
267 | | // We got some sort of numeral. |
268 | 0 | num_segments++; |
269 | | // Skip any trailing parens or punctuation. |
270 | 0 | pos = SkipChars(SkipChars(numeral_end, kClose), kSep); |
271 | 0 | if (pos == numeral_end) { |
272 | 0 | break; |
273 | 0 | } |
274 | 0 | } |
275 | 0 | return *pos == '\0'; |
276 | 0 | } |
277 | | |
// True if |word| is a single character commonly used as a list bullet.
static bool LikelyListMark(const std::string &word) {
  // NOTE: '.' appears twice in this set; the duplicate is harmless.
  const char *kListMarks = "0Oo*.,+.";
  if (word.size() != 1) {
    return false;
  }
  return strchr(kListMarks, word[0]) != nullptr;
}
282 | | |
283 | 0 | bool AsciiLikelyListItem(const std::string &word) { |
284 | 0 | return LikelyListMark(word) || LikelyListNumeral(word); |
285 | 0 | } |
286 | | |
287 | | // ========== Brain Dead Language Model (Tesseract Version) ================ |
288 | | |
289 | | // Return the first Unicode Codepoint from werd[pos]. |
290 | 651k | static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) { |
291 | 651k | if (!u || !werd || pos > werd->length()) { |
292 | 0 | return 0; |
293 | 0 | } |
294 | 651k | return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni(); |
295 | 651k | } |
296 | | |
// A useful helper class for finding the first j >= i so that word[j]
// does not have given character type.
class UnicodeSpanSkipper {
public:
  // Neither |unicharset| nor |word| is owned; both must outlive this object.
  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
      : u_(unicharset), word_(word), wordlen_(word->length()) {
  }

  // Given an input position, return the first position >= pos not punc.
  unsigned SkipPunc(unsigned pos);
  // Given an input position, return the first position >= pos not digit.
  unsigned SkipDigits(unsigned pos);
  // Given an input position, return the first position >= pos not roman.
  unsigned SkipRomans(unsigned pos);
  // Given an input position, return the first position >= pos not alpha.
  unsigned SkipAlpha(unsigned pos);

private:
  const UNICHARSET *u_;     // Character-class properties (not owned).
  const WERD_CHOICE *word_; // Word being scanned (not owned).
  unsigned wordlen_;        // Cached word_->length().
};
319 | | |
320 | 459k | unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) { |
321 | 572k | while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) { |
322 | 112k | pos++; |
323 | 112k | } |
324 | 459k | return pos; |
325 | 459k | } |
326 | | |
327 | 163k | unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) { |
328 | 193k | while (pos < wordlen_ && |
329 | 193k | (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) { |
330 | 29.6k | pos++; |
331 | 29.6k | } |
332 | 163k | return pos; |
333 | 163k | } |
334 | | |
335 | 278k | unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) { |
336 | 278k | const char *kRomans = "ivxlmdIVXLMD"; |
337 | 472k | while (pos < wordlen_) { |
338 | 336k | int ch = UnicodeFor(u_, word_, pos); |
339 | 336k | if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) { |
340 | 142k | break; |
341 | 142k | } |
342 | 193k | pos++; |
343 | 193k | } |
344 | 278k | return pos; |
345 | 278k | } |
346 | | |
347 | 140k | unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) { |
348 | 240k | while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) { |
349 | 99.6k | pos++; |
350 | 99.6k | } |
351 | 140k | return pos; |
352 | 140k | } |
353 | | |
354 | 214k | static bool LikelyListMarkUnicode(int ch) { |
355 | 214k | if (ch < 0x80) { |
356 | 209k | std::string single_ch; |
357 | 209k | single_ch += ch; |
358 | 209k | return LikelyListMark(single_ch); |
359 | 209k | } |
360 | 5.82k | switch (ch) { |
361 | | // TODO(eger) expand this list of unicodes as needed. |
362 | 6 | case 0x00B0: // degree sign |
363 | 6 | case 0x2022: // bullet |
364 | 6 | case 0x25E6: // white bullet |
365 | 6 | case 0x00B7: // middle dot |
366 | 6 | case 0x25A1: // white square |
367 | 6 | case 0x25A0: // black square |
368 | 6 | case 0x25AA: // black small square |
369 | 6 | case 0x2B1D: // black very small square |
370 | 6 | case 0x25BA: // black right-pointing pointer |
371 | 6 | case 0x25CF: // black circle |
372 | 6 | case 0x25CB: // white circle |
373 | 6 | return true; |
374 | 5.82k | default: |
375 | 5.82k | break; // fall through |
376 | 5.82k | } |
377 | 5.82k | return false; |
378 | 5.82k | } |
379 | | |
// Return whether it is very likely that this is a numeral marker that could
// start a list item. Some examples include:
// A I iii. VI (2) 3.5. [C-4]
static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
  // A one-character word that is itself a bullet mark qualifies outright.
  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
    return true;
  }

  UnicodeSpanSkipper m(u, werd);
  int num_segments = 0;
  unsigned pos = 0;
  // Consume up to three numeral segments (roman, digit, or single letter),
  // each allowed at most one leading punctuation character and required to
  // have trailing punctuation unless it ends the word.
  while (pos < werd->length() && num_segments < 3) {
    auto numeral_start = m.SkipPunc(pos);
    if (numeral_start > pos + 1) {
      // More than one leading punctuation char: not a list marker.
      break;
    }
    auto numeral_end = m.SkipRomans(numeral_start);
    if (numeral_end == numeral_start) {
      numeral_end = m.SkipDigits(numeral_start);
      if (numeral_end == numeral_start) {
        // If there's a single latin letter, we can use that.
        numeral_end = m.SkipAlpha(numeral_start);
        if (numeral_end - numeral_start != 1) {
          break;
        }
      }
    }
    // We got some sort of numeral.
    num_segments++;
    // Skip any trailing punctuation.
    pos = m.SkipPunc(numeral_end);
    if (pos == numeral_end) {
      // Numeral not followed by punctuation: it must end the word.
      break;
    }
  }
  // Qualifies only if the whole word was consumed.
  return pos == werd->length();
}
417 | | |
// Append |data| to |vector| unless an equal element is already present,
// keeping the vector duplicate-free in insertion order.
template <class T>
void push_back_new(std::vector<T> &vector, const T &data) {
  const bool already_present =
      std::find(vector.begin(), vector.end(), data) != vector.end();
  if (!already_present) {
    vector.push_back(data);
  }
}
424 | | |
425 | | // ========= Brain Dead Language Model (combined entry points) ================ |
426 | | |
427 | | // Given the leftmost word of a line either as a Tesseract unicharset + werd |
428 | | // or a utf8 string, set the following attributes for it: |
429 | | // is_list - this word might be a list number or bullet. |
430 | | // starts_idea - this word is likely to start a sentence. |
431 | | // ends_idea - this word is likely to end a sentence. |
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
                        bool *is_list, bool *starts_idea, bool *ends_idea) {
  *is_list = false;
  *starts_idea = false;
  *ends_idea = false;
  if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
    *ends_idea = true;
    return;
  }

  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
    // List markers both begin and end an "idea".
    if (UniLikelyListItem(unicharset, werd)) {
      *is_list = true;
      *starts_idea = true;
      *ends_idea = true;
    }
    // A leading capital letter suggests a sentence start.
    if (unicharset->get_isupper(werd->unichar_id(0))) {
      *starts_idea = true;
    }
    // Leading punctuation (quotes, brackets, ...) may open or close an idea.
    if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
      *starts_idea = true;
      *ends_idea = true;
    }
  } else { // Assume utf8 is mostly ASCII
    if (AsciiLikelyListItem(utf8)) {
      *is_list = true;
      *starts_idea = true;
    }
    // Only the first byte of the word is examined in the ASCII fallback.
    int start_letter = utf8[0];
    if (IsOpeningPunct(start_letter)) {
      *starts_idea = true;
    }
    if (IsTerminalPunct(start_letter)) {
      *ends_idea = true;
    }
    if (start_letter >= 'A' && start_letter <= 'Z') {
      *starts_idea = true;
    }
  }
}
472 | | |
473 | | // Given the rightmost word of a line either as a Tesseract unicharset + werd |
474 | | // or a utf8 string, set the following attributes for it: |
475 | | // is_list - this word might be a list number or bullet. |
476 | | // starts_idea - this word is likely to start a sentence. |
477 | | // ends_idea - this word is likely to end a sentence. |
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
                         bool *is_list, bool *starts_idea, bool *ends_idea) {
  *is_list = false;
  *starts_idea = false;
  *ends_idea = false;
  if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
    *ends_idea = true;
    return;
  }

  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
    // A trailing list marker suggests the start of an idea (e.g. "1)" at
    // the right edge of an RTL line); unlike the left-word case it does not
    // imply ends_idea.
    if (UniLikelyListItem(unicharset, werd)) {
      *is_list = true;
      *starts_idea = true;
    }
    // Trailing punctuation (period, quote, bracket, ...) closes an idea.
    UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
    if (unicharset->get_ispunctuation(last_letter)) {
      *ends_idea = true;
    }
  } else { // Assume utf8 is mostly ASCII
    if (AsciiLikelyListItem(utf8)) {
      *is_list = true;
      *starts_idea = true;
    }
    // Only the final byte of the word is examined in the ASCII fallback.
    int last_letter = utf8[utf8.size() - 1];
    if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
      *ends_idea = true;
    }
  }
}
508 | | |
509 | | // =============== Implementation of RowScratchRegisters ===================== |
510 | | /* static */ |
511 | 0 | void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) { |
512 | 0 | header.emplace_back("[lmarg,lind;rind,rmarg]"); |
513 | 0 | header.emplace_back("model"); |
514 | 0 | } |
515 | | |
// Append two debug fields for this row: a fixed-width margin/indent summary
// and a "type:models" string describing the current hypotheses.
void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
                                          std::vector<std::string> &dbg) const {
  char s[60];
  // The largest (positive and negative) numbers are reported for lindent & rindent.
  // While the column header has widths 5,4,4,5, it is therefore opportune to slightly
  // offset the widths in the format string here to allow ample space for lindent & rindent
  // while keeping the final table output nicely readable: 4,5,5,4.
  snprintf(s, sizeof(s), "[%4d,%5d;%5d,%4d]", lmargin_, lindent_, rindent_, rmargin_);
  dbg.emplace_back(s);
  // Second field: line-type letter, then a comma-separated model list.
  std::string model_string;
  model_string += static_cast<char>(GetLineType());
  model_string += ":";

  int model_numbers = 0;
  for (const auto &hypothese : hypotheses_) {
    if (hypothese.model == nullptr) {
      // Unmodeled hypotheses contribute nothing to the list.
      continue;
    }
    if (model_numbers > 0) {
      model_string += ",";
    }
    if (StrongModel(hypothese.model)) {
      // Strong models print as their 1-based index within the theory.
      model_string += std::to_string(1 + theory.IndexOf(hypothese.model));
    } else if (hypothese.model == kCrownLeft) {
      model_string += "CrL";
    } else if (hypothese.model == kCrownRight) {
      model_string += "CrR";
    }
    model_numbers++;
  }
  if (model_numbers == 0) {
    // No modeled hypotheses at all.
    model_string += "0";
  }

  dbg.push_back(model_string);
}
552 | | |
553 | 142k | void RowScratchRegisters::Init(const RowInfo &row) { |
554 | 142k | ri_ = &row; |
555 | 142k | lmargin_ = 0; |
556 | 142k | lindent_ = row.pix_ldistance; |
557 | 142k | rmargin_ = 0; |
558 | 142k | rindent_ = row.pix_rdistance; |
559 | 142k | } |
560 | | |
561 | 680k | LineType RowScratchRegisters::GetLineType() const { |
562 | 680k | if (hypotheses_.empty()) { |
563 | 475k | return LT_UNKNOWN; |
564 | 475k | } |
565 | 205k | bool has_start = false; |
566 | 205k | bool has_body = false; |
567 | 205k | for (const auto &hypothese : hypotheses_) { |
568 | 205k | switch (hypothese.ty) { |
569 | 54.4k | case LT_START: |
570 | 54.4k | has_start = true; |
571 | 54.4k | break; |
572 | 151k | case LT_BODY: |
573 | 151k | has_body = true; |
574 | 151k | break; |
575 | 0 | default: |
576 | 0 | tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty); |
577 | 0 | break; |
578 | 205k | } |
579 | 205k | } |
580 | 205k | if (has_start && has_body) { |
581 | 271 | return LT_MULTIPLE; |
582 | 271 | } |
583 | 204k | return has_start ? LT_START : LT_BODY; |
584 | 205k | } |
585 | | |
586 | 399k | LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const { |
587 | 399k | if (hypotheses_.empty()) { |
588 | 9.52k | return LT_UNKNOWN; |
589 | 9.52k | } |
590 | 389k | bool has_start = false; |
591 | 389k | bool has_body = false; |
592 | 391k | for (const auto &hypothese : hypotheses_) { |
593 | 391k | if (hypothese.model != model) { |
594 | 68.1k | continue; |
595 | 68.1k | } |
596 | 322k | switch (hypothese.ty) { |
597 | 113k | case LT_START: |
598 | 113k | has_start = true; |
599 | 113k | break; |
600 | 209k | case LT_BODY: |
601 | 209k | has_body = true; |
602 | 209k | break; |
603 | 0 | default: |
604 | 0 | tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty); |
605 | 0 | break; |
606 | 322k | } |
607 | 322k | } |
608 | 389k | if (has_start && has_body) { |
609 | 66 | return LT_MULTIPLE; |
610 | 66 | } |
611 | 389k | return has_start ? LT_START : LT_BODY; |
612 | 389k | } |
613 | | |
614 | 10.6k | void RowScratchRegisters::SetStartLine() { |
615 | 10.6k | LineType current_lt = GetLineType(); |
616 | 10.6k | if (current_lt != LT_UNKNOWN && current_lt != LT_START) { |
617 | 0 | tprintf("Trying to set a line to be START when it's already BODY.\n"); |
618 | 0 | } |
619 | 10.6k | if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { |
620 | 10.6k | push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr)); |
621 | 10.6k | } |
622 | 10.6k | } |
623 | | |
624 | 22.4k | void RowScratchRegisters::SetBodyLine() { |
625 | 22.4k | LineType current_lt = GetLineType(); |
626 | 22.4k | if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) { |
627 | 0 | tprintf("Trying to set a line to be BODY when it's already START.\n"); |
628 | 0 | } |
629 | 22.4k | if (current_lt == LT_UNKNOWN || current_lt == LT_START) { |
630 | 22.4k | push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr)); |
631 | 22.4k | } |
632 | 22.4k | } |
633 | | |
634 | 20.6k | void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { |
635 | 20.6k | push_back_new(hypotheses_, LineHypothesis(LT_START, model)); |
636 | 20.6k | auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr)); |
637 | 20.6k | if (found != hypotheses_.end()) { |
638 | 1.61k | hypotheses_.erase(found); |
639 | 1.61k | } |
640 | 20.6k | } |
641 | | |
642 | 71.8k | void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { |
643 | 71.8k | push_back_new(hypotheses_, LineHypothesis(LT_BODY, model)); |
644 | 71.8k | auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr)); |
645 | 71.8k | if (found != hypotheses_.end()) { |
646 | 1.80k | hypotheses_.erase(found); |
647 | 1.80k | } |
648 | 71.8k | } |
649 | | |
650 | 571k | void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { |
651 | 571k | for (const auto &hypothese : hypotheses_) { |
652 | 224k | if (hypothese.ty == LT_START && StrongModel(hypothese.model)) { |
653 | 2.71k | push_back_new(*models, hypothese.model); |
654 | 2.71k | } |
655 | 224k | } |
656 | 571k | } |
657 | | |
658 | 1.09M | void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { |
659 | 1.09M | for (const auto &hypothese : hypotheses_) { |
660 | 433k | if (StrongModel(hypothese.model)) { |
661 | 79.6k | push_back_new(*models, hypothese.model); |
662 | 79.6k | } |
663 | 433k | } |
664 | 1.09M | } |
665 | | |
666 | 1.28M | void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { |
667 | 1.28M | for (const auto &hypothese : hypotheses_) { |
668 | 696k | if (hypothese.model != nullptr) { |
669 | 622k | push_back_new(*models, hypothese.model); |
670 | 622k | } |
671 | 696k | } |
672 | 1.28M | } |
673 | | |
674 | 14.6k | const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const { |
675 | 14.6k | if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) { |
676 | 173 | return nullptr; |
677 | 173 | } |
678 | 14.4k | return hypotheses_[0].model; |
679 | 14.6k | } |
680 | | |
681 | 149k | const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const { |
682 | 149k | if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) { |
683 | 118k | return nullptr; |
684 | 118k | } |
685 | 30.6k | return hypotheses_[0].model; |
686 | 149k | } |
687 | | |
688 | | // Discard any hypotheses whose model is not in the given list. |
689 | 0 | void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models) { |
690 | 0 | if (models.empty()) { |
691 | 0 | return; |
692 | 0 | } |
693 | 0 | for (int h = hypotheses_.size() - 1; h >= 0; h--) { |
694 | 0 | if (!contains(models, hypotheses_[h].model)) { |
695 | 0 | hypotheses_.erase(hypotheses_.begin() + h); |
696 | 0 | } |
697 | 0 | } |
698 | 0 | } |
699 | | |
700 | | // ============ Geometry based Paragraph Detection Algorithm ================= |
701 | | |
// A 1-D cluster of integer values: its center and how many values fell in.
struct Cluster {
  Cluster() = default;
  Cluster(int cen, int num) : center(cen), count(num) {}

  int center = 0; // The center of the cluster.
  int count = 0;  // The number of entries within the cluster.
};
709 | | |
710 | | class SimpleClusterer { |
711 | | public: |
712 | 39.7k | explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {} |
713 | 523k | void Add(int value) { |
714 | 523k | values_.push_back(value); |
715 | 523k | } |
716 | 0 | size_t size() const { |
717 | 0 | return values_.size(); |
718 | 0 | } |
719 | | void GetClusters(std::vector<Cluster> *clusters); |
720 | | |
721 | | private: |
722 | | int max_cluster_width_; |
723 | | std::vector<int> values_; |
724 | | }; |
725 | | |
726 | | // Return the index of the cluster closest to value. |
727 | 363k | static int ClosestCluster(const std::vector<Cluster> &clusters, int value) { |
728 | 363k | unsigned best_index = 0; |
729 | 1.11M | for (unsigned i = 0; i < clusters.size(); i++) { |
730 | 752k | if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) { |
731 | 162k | best_index = i; |
732 | 162k | } |
733 | 752k | } |
734 | 363k | return best_index; |
735 | 363k | } |
736 | | |
737 | 59.5k | void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) { |
738 | 59.5k | clusters->clear(); |
739 | 59.5k | std::sort(values_.begin(), values_.end()); |
740 | 168k | for (unsigned i = 0; i < values_.size();) { |
741 | 109k | int orig_i = i; |
742 | 109k | int lo = values_[i]; |
743 | 109k | int hi = lo; |
744 | 784k | while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) { |
745 | 675k | hi = values_[i]; |
746 | 675k | } |
747 | 109k | clusters->push_back(Cluster((hi + lo) / 2, i - orig_i)); |
748 | 109k | } |
749 | 59.5k | } |
750 | | |
751 | | // Calculate left- and right-indent tab stop values seen in |
752 | | // rows[row_start, row_end) given a tolerance of tolerance. |
static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
                              int tolerance, std::vector<Cluster> *left_tabs,
                              std::vector<Cluster> *right_tabs) {
  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
    return;
  }
  // First pass: toss all left and right indents into clusterers.
  SimpleClusterer initial_lefts(tolerance);
  SimpleClusterer initial_rights(tolerance);
  std::vector<Cluster> initial_left_tabs;
  std::vector<Cluster> initial_right_tabs;
  for (int i = row_start; i < row_end; i++) {
    initial_lefts.Add((*rows)[i].lindent_);
    initial_rights.Add((*rows)[i].rindent_);
  }
  initial_lefts.GetClusters(&initial_left_tabs);
  initial_rights.GetClusters(&initial_right_tabs);

  // Second pass: cluster only lines that are not "stray"
  // An example of a stray line is a page number -- a line whose start
  // and end tab-stops are far outside the typical start and end tab-stops
  // for the block.
  // Put another way, we only cluster data from lines whose start or end
  // tab stop is frequent.
  SimpleClusterer lefts(tolerance);
  SimpleClusterer rights(tolerance);

  // Outlier elimination. We might want to switch this to test outlier-ness
  // based on how strange a position an outlier is in instead of or in addition
  // to how rare it is. These outliers get re-added if we end up having too
  // few tab stops, to work with, however.
  // Threshold scales with how many rows we have to judge frequency from.
  int infrequent_enough_to_ignore = 0;
  if (row_end - row_start >= 8) {
    infrequent_enough_to_ignore = 1;
  }
  if (row_end - row_start >= 20) {
    infrequent_enough_to_ignore = 2;
  }

  // Keep only rows whose left OR right indent lands in a frequent cluster.
  for (int i = row_start; i < row_end; i++) {
    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
      lefts.Add((*rows)[i].lindent_);
      rights.Add((*rows)[i].rindent_);
    }
  }
  // Preliminary clustering -- needed to evaluate the "really ragged side"
  // condition below before any outliers are re-added.
  lefts.GetClusters(left_tabs);
  rights.GetClusters(right_tabs);

  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
      (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
    // One side is really ragged, and the other only has one tab stop,
    // so those "insignificant outliers" are probably important, actually.
    // This often happens on a page of an index. Add back in the ones
    // we omitted in the first pass.
    for (int i = row_start; i < row_end; i++) {
      int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
      int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
      if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
            initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
        lefts.Add((*rows)[i].lindent_);
        rights.Add((*rows)[i].rindent_);
      }
    }
  }
  // Re-cluster now that any omitted outliers may have been added back.
  lefts.GetClusters(left_tabs);
  rights.GetClusters(right_tabs);

  // If one side is almost a two-indent aligned side, and the other clearly
  // isn't, try to prune out the least frequent tab stop from that side.
  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
    // Find the least frequent left tab stop ...
    int to_prune = -1;
    for (int i = left_tabs->size() - 1; i >= 0; i--) {
      if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
        to_prune = i;
      }
    }
    // ... and drop it only if it is infrequent enough to be noise.
    if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
      left_tabs->erase(left_tabs->begin() + to_prune);
    }
  }
  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
    // Mirror image of the pruning above, for the right side.
    int to_prune = -1;
    for (int i = right_tabs->size() - 1; i >= 0; i--) {
      if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
        to_prune = i;
      }
    }
    if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
      right_tabs->erase(right_tabs->begin() + to_prune);
    }
  }
}
848 | | |
849 | | // Given a paragraph model mark rows[row_start, row_end) as said model |
850 | | // start or body lines. |
851 | | // |
852 | | // Case 1: model->first_indent_ != model->body_indent_ |
853 | | // Differentiating the paragraph start lines from the paragraph body lines in |
854 | | // this case is easy, we just see how far each line is indented. |
855 | | // |
856 | | // Case 2: model->first_indent_ == model->body_indent_ |
857 | | // Here, we find end-of-paragraph lines by looking for "short lines." |
// What constitutes a "short line" changes depending on whether the text is
// ragged-right[left] or fully justified (aligned left and right).
860 | | // |
861 | | // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0) |
// We have a new paragraph if the first word of the line would have fit
// at the end of the previous line.
864 | | // |
865 | | // Case 2b: Fully Justified. (eop_threshold > 0) |
866 | | // We mark a line as short (end of paragraph) if the offside indent |
867 | | // is greater than eop_threshold. |
868 | | static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end, |
869 | 2.16k | const ParagraphModel *model, bool ltr, int eop_threshold) { |
870 | 2.16k | if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) { |
871 | 0 | return; |
872 | 0 | } |
873 | 31.0k | for (int row = row_start; row < row_end; row++) { |
874 | 28.8k | bool valid_first = ValidFirstLine(rows, row, model); |
875 | 28.8k | bool valid_body = ValidBodyLine(rows, row, model); |
876 | 28.8k | if (valid_first && !valid_body) { |
877 | 2.27k | (*rows)[row].AddStartLine(model); |
878 | 26.5k | } else if (valid_body && !valid_first) { |
879 | 8.65k | (*rows)[row].AddBodyLine(model); |
880 | 17.9k | } else if (valid_body && valid_first) { |
881 | 17.6k | bool after_eop = (row == row_start); |
882 | 17.6k | if (row > row_start) { |
883 | 16.3k | if (eop_threshold > 0) { |
884 | 6.66k | if (model->justification() == JUSTIFICATION_LEFT) { |
885 | 6.32k | after_eop = (*rows)[row - 1].rindent_ > eop_threshold; |
886 | 6.32k | } else { |
887 | 336 | after_eop = (*rows)[row - 1].lindent_ > eop_threshold; |
888 | 336 | } |
889 | 9.73k | } else { |
890 | 9.73k | after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification()); |
891 | 9.73k | } |
892 | 16.3k | } |
893 | 17.6k | if (after_eop) { |
894 | 4.20k | (*rows)[row].AddStartLine(model); |
895 | 13.4k | } else { |
896 | 13.4k | (*rows)[row].AddBodyLine(model); |
897 | 13.4k | } |
898 | 17.6k | } else { |
899 | | // Do nothing. Stray row. |
900 | 298 | } |
901 | 28.8k | } |
902 | 2.16k | } |
903 | | |
904 | | // GeometricClassifierState holds all of the information we'll use while |
905 | | // trying to determine a paragraph model for the text lines in a block of |
906 | | // text: |
907 | | // + the rows under consideration [row_start, row_end) |
908 | | // + the common left- and right-indent tab stops |
909 | | // + does the block start out left-to-right or right-to-left |
910 | | // Further, this struct holds the data we amass for the (single) ParagraphModel |
911 | | // we'll assign to the text lines (assuming we get that far). |
struct GeometricClassifierState {
  // Computes the left/right tab-stop clusters for rows[r_start, r_end)
  // (using the inter-word space as the clustering tolerance) and records
  // the block's script direction from the first row.
  GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
                           int r_end)
      : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
    tolerance = InterwordSpace(*r, r_start, r_end);
    CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs);
    if (debug_level >= 3) {
      tesserr << "Geometry: TabStop cluster tolerance = " << tolerance << "; "
              << left_tabs.size() << " left tabs; "
              << right_tabs.size() << " right tabs\n";
    }
    ltr = (*r)[r_start].ri_->ltr;
  }

  // Commit to modeling the text as left-justified; the first row's left
  // margin becomes the model margin.
  void AssumeLeftJustification() {
    just = tesseract::JUSTIFICATION_LEFT;
    margin = (*rows)[row_start].lmargin_;
  }

  // Commit to modeling the text as right-justified; the first row's right
  // margin becomes the model margin.
  void AssumeRightJustification() {
    just = tesseract::JUSTIFICATION_RIGHT;
    margin = (*rows)[row_start].rmargin_;
  }

  // Align tabs are the tab stops the text is aligned to.
  // (Defaults to the left tabs while just is still JUSTIFICATION_UNKNOWN.)
  const std::vector<Cluster> &AlignTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) {
      return right_tabs;
    }
    return left_tabs;
  }

  // Offside tabs are the tab stops opposite the tabs used to align the text.
  //
  // Note that for a left-to-right text which is aligned to the right such as
  // this function comment, the offside tabs are the horizontal tab stops
  // marking the beginning of ("Note", "this" and "marking").
  const std::vector<Cluster> &OffsideTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) {
      return left_tabs;
    }
    return right_tabs;
  }

  // Return whether the i'th row extends from the leftmost left tab stop
  // to the right most right tab stop.
  bool IsFullRow(int i) const {
    return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
           ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
  }

  // Index within AlignTabs() of the tab stop closest to row_idx's
  // align-side indent.
  int AlignsideTabIndex(int row_idx) const {
    return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
  }

  // Given what we know about the paragraph justification (just), would the
  // first word of row_b have fit at the end of row_a?
  bool FirstWordWouldHaveFit(int row_a, int row_b) {
    return ::tesseract::FirstWordWouldHaveFit((*rows)[row_a], (*rows)[row_b], just);
  }

  // Debug dump of the row range under consideration.
  void PrintRows() const {
    PrintRowRange(*rows, row_start, row_end);
  }

  // Report (only at sufficient debug verbosity) why classification gave up.
  void Fail(int min_debug_level, const char *why) const {
    if (debug_level < min_debug_level) {
      return;
    }
    tprintf("# %s\n", why);
    PrintRows();
  }

  // Package the accumulated parameters into a ParagraphModel.
  ParagraphModel Model() const {
    return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
  }

  // We print out messages with a debug level at least as great as debug_level.
  int debug_level = 0;

  // The Geometric Classifier was asked to find a single paragraph model
  // to fit the text rows (*rows)[row_start, row_end)
  std::vector<RowScratchRegisters> *rows;
  int row_start = 0;
  int row_end = 0;

  // The amount by which we expect the text edge can vary and still be aligned.
  int tolerance = 0;

  // Is the script in this text block left-to-right?
  // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
  bool ltr = false;

  // These left and right tab stops were determined to be the common tab
  // stops for the given text.
  std::vector<Cluster> left_tabs;
  std::vector<Cluster> right_tabs;

  // These are parameters we must determine to create a ParagraphModel.
  tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
  int margin = 0;
  int first_indent = 0;
  int body_indent = 0;

  // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
  int eop_threshold = 0;
};
1019 | | |
1020 | | // Given a section of text where strong textual clues did not help identifying |
1021 | | // paragraph breaks, and for which the left and right indents have exactly |
1022 | | // three tab stops between them, attempt to find the paragraph breaks based |
1023 | | // solely on the outline of the text and whether the script is left-to-right. |
1024 | | // |
1025 | | // Algorithm Detail: |
1026 | | // The selected rows are in the form of a rectangle except |
1027 | | // for some number of "short lines" of the same length: |
1028 | | // |
1029 | | // (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx |
1030 | | // xxxxxxxxxxx xxxxxxxxxx # A "short" line. |
1031 | | // xxxxxxxxxxxxx xxxxxxxxxxxx |
1032 | | // xxxxxxxxxxxxx xxxxxxxxxxxx |
1033 | | // |
1034 | | // We have a slightly different situation if the only short |
1035 | | // line is at the end of the excerpt. |
1036 | | // |
1037 | | // (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx |
1038 | | // xxxxxxxxxxxxx xxxxxxxxxxxx |
1039 | | // xxxxxxxxxxxxx xxxxxxxxxxxx |
1040 | | // xxxxxxxxxxx xxxxxxxxxx # A "short" line. |
1041 | | // |
1042 | | // We'll interpret these as follows based on the reasoning in the comment for |
1043 | | // GeometricClassify(): |
1044 | | // [script direction: first indent, body indent] |
1045 | | // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0 |
1046 | | // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0 |
static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s,
                                                   ParagraphTheory *theory) {
  // Count rows that span the full width of the block; last_row_full is
  // 1 iff the final row is one of them (used to tell case B1 from B2).
  int num_rows = s.row_end - s.row_start;
  int num_full_rows = 0;
  int last_row_full = 0;
  for (int i = s.row_start; i < s.row_end; i++) {
    if (s.IsFullRow(i)) {
      num_full_rows++;
      if (i == s.row_end - 1) {
        last_row_full++;
      }
    }
  }

  // Too many short lines: the "rectangle with a few short lines" premise
  // of this classifier does not hold.
  if (num_full_rows < 0.7 * num_rows) {
    s.Fail(1, "Not enough full lines to know which lines start paras.");
    return;
  }

  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
  s.eop_threshold = 0;

  // With no other evidence, assume the alignment side from script direction.
  if (s.ltr) {
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (debug_level > 0) {
    tprintf(
        "# Not enough variety for clear outline classification. "
        "Guessing these are %s aligned based on script.\n",
        s.ltr ? "left" : "right");
    s.PrintRows();
  }

  if (s.AlignTabs().size() == 2) { // case A1 or A2
    // Two align-side tab stops: the farther one is the first-line indent.
    s.first_indent = s.AlignTabs()[1].center;
    s.body_indent = s.AlignTabs()[0].center;
  } else { // case B1 or B2
    if (num_rows - 1 == num_full_rows - last_row_full) {
      // case B2
      // Every row but the last is full: a single crown paragraph.
      const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
      (*s.rows)[s.row_start].AddStartLine(model);
      for (int i = s.row_start + 1; i < s.row_end; i++) {
        (*s.rows)[i].AddBodyLine(model);
      }
      return;
    } else {
      // case B1
      // No start indent; treat as fully justified with the end-of-paragraph
      // threshold halfway between the two offside tab stops.
      s.first_indent = s.body_indent = s.AlignTabs()[0].center;
      s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
    }
  }
  const ParagraphModel *model = theory->AddModel(s.Model());
  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold);
  return;
}
1105 | | |
1106 | | // This function is called if strong textual clues were not available, but |
1107 | | // the caller hopes that the paragraph breaks will be super obvious just |
1108 | | // by the outline of the text. |
1109 | | // |
1110 | | // The particularly difficult case is figuring out what's going on if you |
1111 | | // don't have enough short paragraph end lines to tell us what's going on. |
1112 | | // |
1113 | | // For instance, let's say you have the following outline: |
1114 | | // |
1115 | | // (A1) xxxxxxxxxxxxxxxxxxxxxx |
1116 | | // xxxxxxxxxxxxxxxxxxxx |
1117 | | // xxxxxxxxxxxxxxxxxxxxxx |
1118 | | // xxxxxxxxxxxxxxxxxxxxxx |
1119 | | // |
1120 | | // Even if we know that the text is left-to-right and so will probably be |
1121 | | // left-aligned, both of the following are possible texts: |
1122 | | // |
1123 | | // (A1a) 1. Here our list item |
1124 | | // with two full lines. |
1125 | | // 2. Here a second item. |
1126 | | // 3. Here our third one. |
1127 | | // |
1128 | | // (A1b) so ends paragraph one. |
1129 | | // Here starts another |
1130 | | // paragraph we want to |
1131 | | // read. This continues |
1132 | | // |
1133 | | // These examples are obvious from the text and should have been caught |
1134 | | // by the StrongEvidenceClassify pass. However, for languages where we don't |
1135 | | // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese), |
1136 | | // it's worth guessing that (A1b) is the correct interpretation if there are |
1137 | | // far more "full" lines than "short" lines. |
static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
                              int row_start, int row_end, ParagraphTheory *theory) {
  // Needs at least 4 rows to have any confidence in an outline-only guess.
  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {
    return;
  }
  if (debug_level > 1) {
    tprintf("###############################################\n");
    tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", row_start, row_end);
    tprintf("###############################################\n");
  }
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);

  // Constructing s computes the left/right tab-stop clusters for the block.
  GeometricClassifierState s(debug_level, rows, row_start, row_end);
  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
    s.Fail(2, "Too much variety for simple outline classification.");
    return;
  }
  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
    s.Fail(1, "Not enough variety for simple outline classification.");
    return;
  }
  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
    // Exactly three tab stops total: handled by the specialized routine.
    GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
    return;
  }

  // At this point, we know that one side has at least two tab stops, and the
  // other side has one or two tab stops.
  // Left to determine:
  //   (1) Which is the body indent and which is the first line indent?
  //   (2) Is the text fully justified?

  // If one side happens to have three or more tab stops, assume that side
  // is opposite of the aligned side.
  if (s.right_tabs.size() > 2) {
    s.AssumeLeftJustification();
  } else if (s.left_tabs.size() > 2) {
    s.AssumeRightJustification();
  } else if (s.ltr) { // guess based on script direction
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (s.AlignTabs().size() == 2) {
    // For each tab stop on the aligned side, how many of them appear
    // to be paragraph start lines? [first lines]
    int firsts[2] = {0, 0};
    // Count the first line as a likely paragraph start line.
    firsts[s.AlignsideTabIndex(s.row_start)]++;
    // For each line, if the first word would have fit on the previous
    // line count it as a likely paragraph start line.
    bool jam_packed = true;
    for (int i = s.row_start + 1; i < s.row_end; i++) {
      if (s.FirstWordWouldHaveFit(i - 1, i)) {
        firsts[s.AlignsideTabIndex(i)]++;
        jam_packed = false;
      }
    }
    // Make an extra accounting for the last line of the paragraph just
    // in case it's the only short line in the block. That is, take its
    // first word as typical and see if this looks like the *last* line
    // of a paragraph. If so, mark the *other* indent as probably a first.
    if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
      firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
    }

    // Normalize the start-line counts by each tab stop's population so the
    // two indents can be compared fairly.
    int percent0firsts, percent1firsts;
    percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
    percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;

    // TODO(eger): Tune these constants if necessary.
    if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {
      s.first_indent = s.AlignTabs()[1].center;
      s.body_indent = s.AlignTabs()[0].center;
    } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
               percent1firsts + 30 < percent0firsts) {
      s.first_indent = s.AlignTabs()[0].center;
      s.body_indent = s.AlignTabs()[1].center;
    } else {
      // Ambiguous! Probably lineated (poetry)
      if (debug_level > 1) {
        tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
                s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[0].center, percent0firsts);
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[1].center, percent1firsts);
        s.PrintRows();
      }
      return;
    }
  } else {
    // There's only one tab stop for the "aligned to" side.
    s.first_indent = s.body_indent = s.AlignTabs()[0].center;
  }

  // At this point, we have our model.
  const ParagraphModel *model = theory->AddModel(s.Model());

  // Now all we have to do is figure out if the text is fully justified or not.
  // eop_threshold: default to fully justified unless we see evidence below.
  // See description on MarkRowsWithModel()
  s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
  // If the text is not fully justified, re-set the eop_threshold to 0.
  if (s.AlignTabs().size() == 2) {
    // Paragraphs with a paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (ValidFirstLine(s.rows, i + 1, model) &&
          !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
                       s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  } else {
    // Paragraphs with no paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (!s.FirstWordWouldHaveFit(i, i + 1) &&
          !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
                       s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  }
  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
}
1268 | | |
1269 | | // =============== Implementation of ParagraphTheory ===================== |
1270 | | |
1271 | 3.32k | const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) { |
1272 | 3.32k | for (const auto &m : *models_) { |
1273 | 384 | if (m->Comparable(model)) { |
1274 | 40 | return m; |
1275 | 40 | } |
1276 | 384 | } |
1277 | 3.28k | auto *m = new ParagraphModel(model); |
1278 | 3.28k | models_->push_back(m); |
1279 | 3.28k | push_back_new(models_we_added_, m); |
1280 | 3.28k | return m; |
1281 | 3.32k | } |
1282 | | |
1283 | 13.9k | void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { |
1284 | 13.9k | size_t w = 0; |
1285 | 16.5k | for (size_t r = 0; r < models_->size(); r++) { |
1286 | 2.58k | ParagraphModel *m = (*models_)[r]; |
1287 | 2.58k | if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) { |
1288 | 490 | delete m; |
1289 | 2.09k | } else { |
1290 | 2.09k | if (r > w) { |
1291 | 2 | (*models_)[w] = m; |
1292 | 2 | } |
1293 | 2.09k | w++; |
1294 | 2.09k | } |
1295 | 2.58k | } |
1296 | 13.9k | models_->resize(w); |
1297 | 13.9k | } |
1298 | | |
1299 | | // Examine rows[start, end) and try to determine if an existing non-centered |
1300 | | // paragraph model would fit them perfectly. If so, return a pointer to it. |
1301 | | // If not, return nullptr. |
1302 | | const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows, |
1303 | 0 | int start, int end) const { |
1304 | 0 | for (const auto *model : *models_) { |
1305 | 0 | if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) { |
1306 | 0 | return model; |
1307 | 0 | } |
1308 | 0 | } |
1309 | 0 | return nullptr; |
1310 | 0 | } |
1311 | | |
1312 | 111k | void ParagraphTheory::NonCenteredModels(SetOfModels *models) { |
1313 | 111k | for (const auto *model : *models_) { |
1314 | 4.62k | if (model->justification() != JUSTIFICATION_CENTER) { |
1315 | 3.98k | push_back_new(*models, model); |
1316 | 3.98k | } |
1317 | 4.62k | } |
1318 | 111k | } |
1319 | | |
1320 | 0 | int ParagraphTheory::IndexOf(const ParagraphModel *model) const { |
1321 | 0 | int i = 0; |
1322 | 0 | for (const auto *m : *models_) { |
1323 | 0 | if (m == model) { |
1324 | 0 | return i; |
1325 | 0 | } |
1326 | 0 | i++; |
1327 | 0 | } |
1328 | 0 | return -1; |
1329 | 0 | } |
1330 | | |
1331 | | bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row, |
1332 | 79.5k | const ParagraphModel *model) { |
1333 | 79.5k | if (!StrongModel(model)) { |
1334 | 0 | tprintf("ValidFirstLine() should only be called with strong models!\n"); |
1335 | 0 | } |
1336 | 79.5k | return StrongModel(model) && model->ValidFirstLine((*rows)[row].lmargin_, (*rows)[row].lindent_, |
1337 | 79.5k | (*rows)[row].rindent_, (*rows)[row].rmargin_); |
1338 | 79.5k | } |
1339 | | |
1340 | | bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row, |
1341 | 52.0k | const ParagraphModel *model) { |
1342 | 52.0k | if (!StrongModel(model)) { |
1343 | 0 | tprintf("ValidBodyLine() should only be called with strong models!\n"); |
1344 | 0 | } |
1345 | 52.0k | return StrongModel(model) && model->ValidBodyLine((*rows)[row].lmargin_, (*rows)[row].lindent_, |
1346 | 52.0k | (*rows)[row].rindent_, (*rows)[row].rmargin_); |
1347 | 52.0k | } |
1348 | | |
1349 | | bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b, |
1350 | 127 | const ParagraphModel *model) { |
1351 | 127 | if (model != kCrownRight && model != kCrownLeft) { |
1352 | 0 | tprintf("CrownCompatible() should only be called with crown models!\n"); |
1353 | 0 | return false; |
1354 | 0 | } |
1355 | 127 | auto &row_a = (*rows)[a]; |
1356 | 127 | auto &row_b = (*rows)[b]; |
1357 | 127 | if (model == kCrownRight) { |
1358 | 32 | return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_, |
1359 | 32 | Epsilon(row_a.ri_->average_interword_space)); |
1360 | 32 | } |
1361 | 95 | return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_, |
1362 | 95 | Epsilon(row_a.ri_->average_interword_space)); |
1363 | 127 | } |
1364 | | |
1365 | | // =============== Implementation of ParagraphModelSmearer ==================== |
1366 | | |
ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
                                             int row_start, int row_end, ParagraphTheory *theory)
    : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
  // Reject invalid row ranges up front and degrade to an empty span so that
  // later passes over [row_start_, row_end_) become no-ops.
  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
    row_start_ = 0;
    row_end_ = 0;
    return;
  }
  // Reserve one slot per row plus two extra boundary slots (presumably for
  // the positions before the first and after the last row, as accessed via
  // OpenModels() -- see paragraphs_internal.h; TODO confirm indexing).
  open_models_.resize(open_models_.size() + row_end - row_start + 2);
}
1377 | | |
1378 | | // see paragraphs_internal.h |
1379 | 54.4k | void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) { |
1380 | 54.4k | SetOfModels no_models; |
1381 | 54.4k | if (row_start < row_start_) { |
1382 | 0 | row_start = row_start_; |
1383 | 0 | } |
1384 | 54.4k | if (row_end > row_end_) { |
1385 | 0 | row_end = row_end_; |
1386 | 0 | } |
1387 | | |
1388 | 626k | for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) { |
1389 | 571k | if ((*rows_)[row].ri_->num_words == 0) { |
1390 | 0 | OpenModels(row + 1) = no_models; |
1391 | 571k | } else { |
1392 | 571k | SetOfModels &opened = OpenModels(row); |
1393 | 571k | (*rows_)[row].StartHypotheses(&opened); |
1394 | | |
1395 | | // Which models survive the transition from row to row + 1? |
1396 | 571k | SetOfModels still_open; |
1397 | 571k | for (auto &m : opened) { |
1398 | 37.5k | if (ValidFirstLine(rows_, row, m) || ValidBodyLine(rows_, row, m)) { |
1399 | | // This is basic filtering; we check likely paragraph starty-ness down |
1400 | | // below in Smear() -- you know, whether the first word would have fit |
1401 | | // and such. |
1402 | 35.1k | push_back_new(still_open, m); |
1403 | 35.1k | } |
1404 | 37.5k | } |
1405 | 571k | OpenModels(row + 1) = std::move(still_open); |
1406 | 571k | } |
1407 | 571k | } |
1408 | 54.4k | } |
1409 | | |
1410 | | // see paragraphs_internal.h |
// see paragraphs_internal.h
// Walk the rows top to bottom and spread ("smear") paragraph model
// hypotheses from confidently-marked rows onto neighboring unsure rows.
void ParagraphModelSmearer::Smear() {
  CalculateOpenModels(row_start_, row_end_);

  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
  // we have multiple LT_START hypotheses), see if there's a model that
  // was recently used (an "open" model) which might model it well.
  for (int i = row_start_; i < row_end_; i++) {
    RowScratchRegisters &row = (*rows_)[i];
    if (row.ri_->num_words == 0) {
      // Blank rows carry no evidence either way.
      continue;
    }

    // Step One:
    //   Figure out if there are "open" models which are left-aligned or
    //   right-aligned. This is important for determining whether the
    //   "first" word in a row would fit at the "end" of the previous row.
    bool left_align_open = false;
    bool right_align_open = false;
    for (auto &m : OpenModels(i)) {
      switch (m->justification()) {
        case JUSTIFICATION_LEFT:
          left_align_open = true;
          break;
        case JUSTIFICATION_RIGHT:
          right_align_open = true;
          break;
        default:
          // Centered/unknown justification could behave as either alignment.
          left_align_open = right_align_open = true;
      }
    }
    // Step Two:
    //   Use that knowledge to figure out if this row is likely to
    //   start a paragraph.
    bool likely_start;
    if (i == 0) {
      // The very first row of the range is always treated as a likely start.
      likely_start = true;
    } else {
      if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
        // Alignment is ambiguous (both or neither): accept either direction.
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) ||
                       LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
      } else if (left_align_open) {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT);
      } else {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
      }
    }

    // Step Three:
    //   If this text line seems like an obvious first line of an
    //   open model, or an obvious continuation of an existing
    //   modelled paragraph, mark it up.
    if (likely_start) {
      // Add Start Hypotheses for all Open models that fit.
      for (unsigned m = 0; m < OpenModels(i).size(); m++) {
        if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
          row.AddStartLine(OpenModels(i)[m]);
        }
      }
    } else {
      // Add relevant body line hypotheses.
      SetOfModels last_line_models;
      if (i > 0) {
        // Consider only models the previous row strongly believes in.
        (*rows_)[i - 1].StrongHypotheses(&last_line_models);
      } else {
        theory_->NonCenteredModels(&last_line_models);
      }
      for (auto model : last_line_models) {
        if (ValidBodyLine(rows_, i, model)) {
          row.AddBodyLine(model);
        }
      }
    }

    // Step Four:
    //   If we're still quite unsure about this line, go through all
    //   models in our theory and see if this row could be the start
    //   of any of our models.
    if (row.GetLineType() == LT_UNKNOWN ||
        (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
      SetOfModels all_models;
      theory_->NonCenteredModels(&all_models);
      for (auto &all_model : all_models) {
        if (ValidFirstLine(rows_, i, all_model)) {
          row.AddStartLine(all_model);
        }
      }
    }
    // Step Five:
    //   Since we may have updated the hypotheses about this row, we need
    //   to recalculate the Open models for the rest of rows[i + 1, row_end)
    if (row.GetLineType() != LT_UNKNOWN) {
      CalculateOpenModels(i + 1, row_end_);
    }
  }
}
1506 | | |
1507 | | // ================ Main Paragraph Detection Algorithm ======================= |
1508 | | |
1509 | | // Find out what ParagraphModels are actually used, and discard any |
1510 | | // that are not. |
1511 | | static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows, |
1512 | 13.9k | ParagraphTheory *theory) { |
1513 | 13.9k | SetOfModels used_models; |
1514 | 142k | for (const auto &row : rows) { |
1515 | 142k | row.StrongHypotheses(&used_models); |
1516 | 142k | } |
1517 | 13.9k | theory->DiscardUnusedModels(used_models); |
1518 | 13.9k | } |
1519 | | |
1520 | | // DowngradeWeakestToCrowns: |
1521 | | // Forget any flush-{left, right} models unless we see two or more |
1522 | | // of them in sequence. |
1523 | | // |
1524 | | // In pass 3, we start to classify even flush-left paragraphs (paragraphs |
1525 | | // where the first line and body indent are the same) as having proper Models. |
1526 | | // This is generally dangerous, since if you start imagining that flush-left |
1527 | | // is a typical paragraph model when it is not, it will lead you to chop normal |
1528 | | // indented paragraphs in the middle whenever a sentence happens to start on a |
1529 | | // new line (see "This" above). What to do? |
1530 | | // What we do is to take any paragraph which is flush left and is not |
1531 | | // preceded by another paragraph of the same model and convert it to a "Crown" |
1532 | | // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder |
1533 | | // for later. It means that the paragraph is flush, but it would be desirable |
// to mark it as the same model as following text if it fits.  This downgrade
// (FlushLeft -> CrownLeft -> Model of following paragraph) means that we
// avoid making flush left Paragraph Models whenever we see a top-of-the-page
// half-of-a-paragraph, and instead we mark it the same as normal body text.
1538 | | // |
1539 | | // Implementation: |
1540 | | // |
1541 | | // Comb backwards through the row scratch registers, and turn any |
1542 | | // sequences of body lines of equivalent type abutted against the beginning |
1543 | | // or a body or start line of a different type into a crown paragraph. |
static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
                                     std::vector<RowScratchRegisters> *rows) {
  int start;
  // Comb backwards through the rows, one run of same-model body lines
  // per iteration; "start" is set inside the loop body before it is read
  // by the update expression.
  for (int end = rows->size(); end > 0; end = start) {
    // Search back for a body line of a unique type.
    const ParagraphModel *model = nullptr;
    while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
      end--;
    }
    if (end == 0) {
      break;  // No remaining run of unique body hypotheses; all done.
    }
    start = end - 1;
    while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
      start--; // walk back to the first line that is not the same body type.
    }
    // For a flush model (first indent == body indent) the start line looks
    // just like a body line, so absorb a preceding matching start line.
    if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) &&
        NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
      start--;
    }
    start++;
    // Now rows[start, end) is a sequence of unique body hypotheses of model.
    if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {
      continue;  // Centered paragraphs are never downgraded to crowns.
    }
    if (!StrongModel(model)) {
      // For a weak (crown) model, extend the run upward over any rows that
      // remain crown-compatible with it.
      while (start > 0 && CrownCompatible(rows, start - 1, start, model)) {
        start--;
      }
    }
    // Downgrade the run unless it is preceded by a valid first line of the
    // same strong model (i.e. the paragraph has a genuine start line).
    if (start == 0 || (!StrongModel(model)) ||
        (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
      // crownify rows[start, end)
      const ParagraphModel *crown_model = model;
      if (StrongModel(model)) {
        if (model->justification() == JUSTIFICATION_LEFT) {
          crown_model = kCrownLeft;
        } else {
          crown_model = kCrownRight;
        }
      }
      (*rows)[start].SetUnknown();
      (*rows)[start].AddStartLine(crown_model);
      for (int row = start + 1; row < end; row++) {
        (*rows)[row].SetUnknown();
        (*rows)[row].AddBodyLine(crown_model);
      }
    }
  }
  // Crownification may have orphaned some models; drop them.
  DiscardUnusedModels(*rows, theory);
}
1595 | | |
1596 | | // Clear all hypotheses about lines [start, end) and reset margins. |
1597 | | // |
1598 | | // The empty space between the left of a row and the block boundary (and |
1599 | | // similarly for the right) is split into two pieces: margin and indent. |
1600 | | // In initial processing, we assume the block is tight and the margin for |
1601 | | // all lines is set to zero. However, if our first pass does not yield |
1602 | | // models for everything, it may be due to an inset paragraph like a |
1603 | | // block-quote. In that case, we make a second pass over that unmarked |
1604 | | // section of the page and reset the "margin" portion of the empty space |
1605 | | // to the common amount of space at the ends of the lines under consid- |
1606 | | // eration. This would be equivalent to percentile set to 0. However, |
1607 | | // sometimes we have a single character sticking out in the right margin |
1608 | | // of a text block (like the 'r' in 'for' on line 3 above), and we can |
1609 | | // really just ignore it as an outlier. To express this, we allow the |
1610 | | // user to specify the percentile (0..100) of indent values to use as |
1611 | | // the common margin for each row in the run of rows[start, end). |
1612 | | void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start, |
1613 | 22.4k | int end, int percentile) { |
1614 | 22.4k | if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) { |
1615 | 0 | return; |
1616 | 0 | } |
1617 | | |
1618 | 22.4k | int lmin, lmax, rmin, rmax; |
1619 | 22.4k | lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_; |
1620 | 22.4k | rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_; |
1621 | 297k | for (int i = start; i < end; i++) { |
1622 | 274k | RowScratchRegisters &sr = (*rows)[i]; |
1623 | 274k | sr.SetUnknown(); |
1624 | 274k | if (sr.ri_->num_words == 0) { |
1625 | 0 | continue; |
1626 | 0 | } |
1627 | 274k | UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax); |
1628 | 274k | UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax); |
1629 | 274k | } |
1630 | 22.4k | STATS lefts(lmin, lmax); |
1631 | 22.4k | STATS rights(rmin, rmax); |
1632 | 297k | for (int i = start; i < end; i++) { |
1633 | 274k | RowScratchRegisters &sr = (*rows)[i]; |
1634 | 274k | if (sr.ri_->num_words == 0) { |
1635 | 0 | continue; |
1636 | 0 | } |
1637 | 274k | lefts.add(sr.lmargin_ + sr.lindent_, 1); |
1638 | 274k | rights.add(sr.rmargin_ + sr.rindent_, 1); |
1639 | 274k | } |
1640 | 22.4k | int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0); |
1641 | 22.4k | int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0); |
1642 | 297k | for (int i = start; i < end; i++) { |
1643 | 274k | RowScratchRegisters &sr = (*rows)[i]; |
1644 | 274k | int ldelta = ignorable_left - sr.lmargin_; |
1645 | 274k | sr.lmargin_ += ldelta; |
1646 | 274k | sr.lindent_ -= ldelta; |
1647 | 274k | int rdelta = ignorable_right - sr.rmargin_; |
1648 | 274k | sr.rmargin_ += rdelta; |
1649 | 274k | sr.rindent_ -= rdelta; |
1650 | 274k | } |
1651 | 22.4k | } |
1652 | | |
1653 | | // Return the median inter-word space in rows[row_start, row_end). |
1654 | 19.5k | int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) { |
1655 | 19.5k | if (row_end < row_start + 1) { |
1656 | 0 | return 1; |
1657 | 0 | } |
1658 | 19.5k | int word_height = |
1659 | 19.5k | (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2; |
1660 | 19.5k | int word_width = |
1661 | 19.5k | (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2; |
1662 | 19.5k | STATS spacing_widths(0, 4 + word_width); |
1663 | 179k | for (int i = row_start; i < row_end; i++) { |
1664 | 160k | if (rows[i].ri_->num_words > 1) { |
1665 | 18.7k | spacing_widths.add(rows[i].ri_->average_interword_space, 1); |
1666 | 18.7k | } |
1667 | 160k | } |
1668 | 19.5k | int minimum_reasonable_space = word_height / 3; |
1669 | 19.5k | if (minimum_reasonable_space < 2) { |
1670 | 153 | minimum_reasonable_space = 2; |
1671 | 153 | } |
1672 | 19.5k | int median = spacing_widths.median(); |
1673 | 19.5k | return (median > minimum_reasonable_space) ? median : minimum_reasonable_space; |
1674 | 19.5k | } |
1675 | | |
1676 | | // Return whether the first word on the after line can fit in the space at |
1677 | | // the end of the before line (knowing which way the text is aligned and read). |
1678 | | bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, |
1679 | 535k | tesseract::ParagraphJustification justification) { |
1680 | 535k | if (before.ri_->num_words == 0 || after.ri_->num_words == 0) { |
1681 | 0 | return true; |
1682 | 0 | } |
1683 | | |
1684 | 535k | if (justification == JUSTIFICATION_UNKNOWN) { |
1685 | 0 | tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n"); |
1686 | 0 | } |
1687 | 535k | int available_space; |
1688 | 535k | if (justification == JUSTIFICATION_CENTER) { |
1689 | 0 | available_space = before.lindent_ + before.rindent_; |
1690 | 535k | } else { |
1691 | 535k | available_space = before.OffsideIndent(justification); |
1692 | 535k | } |
1693 | 535k | available_space -= before.ri_->average_interword_space; |
1694 | | |
1695 | 535k | if (before.ri_->ltr) { |
1696 | 535k | return after.ri_->lword_box.width() < available_space; |
1697 | 535k | } |
1698 | 0 | return after.ri_->rword_box.width() < available_space; |
1699 | 535k | } |
1700 | | |
1701 | | // Return whether the first word on the after line can fit in the space at |
1702 | | // the end of the before line (not knowing which way the text goes) in a left |
1703 | | // or right alignment. |
1704 | 22.7k | bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after) { |
1705 | 22.7k | if (before.ri_->num_words == 0 || after.ri_->num_words == 0) { |
1706 | 0 | return true; |
1707 | 0 | } |
1708 | | |
1709 | 22.7k | int available_space = before.lindent_; |
1710 | 22.7k | if (before.rindent_ > available_space) { |
1711 | 6.87k | available_space = before.rindent_; |
1712 | 6.87k | } |
1713 | 22.7k | available_space -= before.ri_->average_interword_space; |
1714 | | |
1715 | 22.7k | if (before.ri_->ltr) { |
1716 | 22.7k | return after.ri_->lword_box.width() < available_space; |
1717 | 22.7k | } |
1718 | 0 | return after.ri_->rword_box.width() < available_space; |
1719 | 22.7k | } |
1720 | | |
1721 | 20.5k | static bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) { |
1722 | 20.5k | if (before.ri_->ltr) { |
1723 | 20.5k | return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea; |
1724 | 20.5k | } else { |
1725 | 0 | return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea; |
1726 | 0 | } |
1727 | 20.5k | } |
1728 | | |
1729 | | static bool LikelyParagraphStart(const RowScratchRegisters &before, |
1730 | | const RowScratchRegisters &after, |
1731 | 346k | tesseract::ParagraphJustification j) { |
1732 | 346k | return before.ri_->num_words == 0 || |
1733 | 346k | (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after)); |
1734 | 346k | } |
1735 | | |
1736 | | // Examine rows[start, end) and try to determine what sort of ParagraphModel |
1737 | | // would fit them as a single paragraph. |
1738 | | // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN. |
1739 | | // If the rows given could be a consistent start to a paragraph, set *consistent |
1740 | | // true. |
static ParagraphModel InternalParagraphModelByOutline(
    const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
    bool *consistent) {
  // Determine the dominant reading direction by majority vote over the rows.
  int ltr_line_count = 0;
  for (int i = start; i < end; i++) {
    ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
  }
  bool ltr = (ltr_line_count >= (end - start) / 2);

  *consistent = true;
  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) {
    return ParagraphModel();
  }

  // Ensure the caller only passed us a region with a common rmargin and
  // lmargin.
  int lmargin = (*rows)[start].lmargin_;
  int rmargin = (*rows)[start].rmargin_;
  // Indent extents over the body lines (all rows after the first), plus the
  // extent of right-minus-left indent (cdiff detects centered text).
  int lmin, lmax, rmin, rmax, cmin, cmax;
  lmin = lmax = (*rows)[start + 1].lindent_;
  rmin = rmax = (*rows)[start + 1].rindent_;
  cmin = cmax = 0;
  for (int i = start + 1; i < end; i++) {
    if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
      tprintf("Margins don't match! Software error.\n");
      *consistent = false;
      return ParagraphModel();
    }
    UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
    UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
    UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
  }
  int ldiff = lmax - lmin;
  int rdiff = rmax - rmin;
  int cdiff = cmax - cmin;
  if (rdiff > tolerance && ldiff > tolerance) {
    // Both edges are ragged.  If left and right indents track each other,
    // the text is centered; otherwise no single model fits.
    if (cdiff < tolerance * 2) {
      if (end - start < 3) {
        return ParagraphModel();
      }
      return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
    }
    *consistent = false;
    return ParagraphModel();
  }
  if (end - start < 3) { // Don't return a model for two line paras.
    return ParagraphModel();
  }

  // These booleans keep us from saying something is aligned left when the body
  // left variance is too large.
  bool body_admits_left_alignment = ldiff < tolerance;
  bool body_admits_right_alignment = rdiff < tolerance;

  // Candidate models: first-line indent from row[start], body indent from the
  // midpoint of the observed body-indent range.
  ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
                                             (lmin + lmax) / 2, tolerance);
  ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
                                              (rmin + rmax) / 2, tolerance);

  // These booleans keep us from having an indent on the "wrong side" for the
  // first line.
  bool text_admits_left_alignment = ltr || left_model.is_flush();
  bool text_admits_right_alignment = !ltr || right_model.is_flush();

  // At least one of the edges is less than tolerance in variance.
  // If the other is obviously ragged, it can't be the one aligned to.
  // [Note the last line is included in this raggedness.]
  if (tolerance < rdiff) {
    if (body_admits_left_alignment && text_admits_left_alignment) {
      return left_model;
    }
    *consistent = false;
    return ParagraphModel();
  }
  if (tolerance < ldiff) {
    if (body_admits_right_alignment && text_admits_right_alignment) {
      return right_model;
    }
    *consistent = false;
    return ParagraphModel();
  }

  // At this point, we know the body text doesn't vary much on either side.

  // If the first line juts out oddly in one direction or the other,
  // that likely indicates the side aligned to.
  int first_left = (*rows)[start].lindent_;
  int first_right = (*rows)[start].rindent_;

  if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) {
    return left_model;
  }
  if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) {
    return right_model;
  }

  // No jutting first line and both edges tight: can't pick a side.
  *consistent = false;
  return ParagraphModel();
}
1840 | | |
1841 | | // Examine rows[start, end) and try to determine what sort of ParagraphModel |
1842 | | // would fit them as a single paragraph. If nothing fits, |
1843 | | // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug |
1844 | | // output if we're debugging. |
1845 | | static ParagraphModel ParagraphModelByOutline(int debug_level, |
1846 | | const std::vector<RowScratchRegisters> *rows, |
1847 | 9.61k | int start, int end, int tolerance) { |
1848 | 9.61k | bool unused_consistent; |
1849 | 9.61k | ParagraphModel retval = |
1850 | 9.61k | InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent); |
1851 | 9.61k | if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) { |
1852 | 0 | tprintf("Could not determine a model for this paragraph:\n"); |
1853 | 0 | PrintRowRange(*rows, start, end); |
1854 | 0 | } |
1855 | 9.61k | return retval; |
1856 | 9.61k | } |
1857 | | |
1858 | | // Do rows[start, end) form a single instance of the given paragraph model? |
1859 | | bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end, |
1860 | 0 | const ParagraphModel *model) { |
1861 | 0 | if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) { |
1862 | 0 | return false; |
1863 | 0 | } |
1864 | 0 | if (!ValidFirstLine(rows, start, model)) { |
1865 | 0 | return false; |
1866 | 0 | } |
1867 | 0 | for (int i = start + 1; i < end; i++) { |
1868 | 0 | if (!ValidBodyLine(rows, i, model)) { |
1869 | 0 | return false; |
1870 | 0 | } |
1871 | 0 | } |
1872 | 0 | return true; |
1873 | 0 | } |
1874 | | |
1875 | | // Examine rows[row_start, row_end) as an independent section of text, |
1876 | | // and mark rows that are exceptionally clear as start-of-paragraph |
1877 | | // and paragraph-body lines. |
1878 | | // |
1879 | | // We presume that any lines surrounding rows[row_start, row_end) may |
1880 | | // have wildly different paragraph models, so we don't key any data off |
1881 | | // of those lines. |
1882 | | // |
1883 | | // We only take the very strongest signals, as we don't want to get |
1884 | | // confused and marking up centered text, poetry, or source code as |
1885 | | // clearly part of a typical paragraph. |
static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
                               int row_end) {
  // Record patently obvious body text.
  for (int i = row_start + 1; i < row_end; i++) {
    const RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    tesseract::ParagraphJustification typical_justification =
        prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    // A line whose first word neither looks like the start of an idea nor
    // could have fit at the end of the previous line is surely body text.
    if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
        !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
      curr.SetBodyLine();
    }
  }

  // Record patently obvious start paragraph lines.
  //
  // It's an extremely good signal of the start of a paragraph that
  // the first word would have fit on the end of the previous line.
  // However, applying just that signal would have us mark random
  // start lines of lineated text (poetry and source code) and some
  // centered headings as paragraph start lines.  Therefore, we use
  // a second qualification for a paragraph start: Not only should
  // the first word of this line have fit on the previous line,
  // but also, this line should go full to the right of the block,
  // disallowing a subsequent word from having fit on this line.

  // First row:
  {
    RowScratchRegisters &curr = (*rows)[row_start];
    RowScratchRegisters &next = (*rows)[row_start + 1];
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    // No previous row exists, so rely on the line running full and its
    // first word looking like the start of an idea.
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
        (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
      curr.SetStartLine();
    }
  }
  // Middle rows
  for (int i = row_start + 1; i < row_end - 1; i++) {
    RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    RowScratchRegisters &next = (*rows)[i + 1];
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
  // Last row
  { // the short circuit at the top means we have at least two lines.
    RowScratchRegisters &prev = (*rows)[row_end - 2];
    RowScratchRegisters &curr = (*rows)[row_end - 1];
    tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    // There is no next row; FirstWordWouldHaveFit(curr, curr, j) tests
    // whether this line leaves room for another of its own words, i.e.
    // whether it runs full to the block edge.
    if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
}
1944 | | |
1945 | | // Look for sequences of a start line followed by some body lines in |
1946 | | // rows[row_start, row_end) and create ParagraphModels for them if |
1947 | | // they seem coherent. |
static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
                                int row_start, int row_end, bool allow_flush_models,
                                ParagraphTheory *theory) {
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
    return;
  }

  int start = row_start;
  while (start < row_end) {
    // Skip forward to the next row marked as a paragraph start.
    while (start < row_end && (*rows)[start].GetLineType() != LT_START) {
      start++;
    }
    if (start >= row_end - 1) {
      break;  // A start line needs at least one following line to model.
    }

    // Tolerance derives from the interword spacing of the second line.
    int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
    int end = start;
    ParagraphModel last_model;
    bool next_consistent;
    // Greedily extend end while rows[start, end) stays consistent as one
    // paragraph.
    do {
      ++end;
      // rows[row, end) was consistent.
      // If rows[row, end + 1) is not consistent,
      // just model rows[row, end)
      if (end < row_end - 1) {
        RowScratchRegisters &next = (*rows)[end];
        LineType lt = next.GetLineType();
        // The next row continues the paragraph if it's a marked body line,
        // or is unknown and its first word couldn't have fit above.
        next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN &&
                                            !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
      } else {
        next_consistent = false;
      }
      if (next_consistent) {
        ParagraphModel next_model =
            InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
        // Don't let the model's alignment flip away from the reading
        // direction once an appropriately-aligned model was found.
        if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&
             next_model.justification() != JUSTIFICATION_LEFT) ||
            (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&
             next_model.justification() != JUSTIFICATION_RIGHT)) {
          next_consistent = false;
        }
        last_model = next_model;
      } else {
        next_consistent = false;
      }
    } while (next_consistent && end < row_end);
    // At this point, rows[start, end) looked like it could have been a
    // single paragraph. If we can make a good ParagraphModel for it,
    // do so and mark this sequence with that model.
    if (end > start + 1) {
      // emit a new paragraph if we have more than one line.
      const ParagraphModel *model = nullptr;
      ParagraphModel new_model = ParagraphModelByOutline(
          debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end)));
      if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
        // couldn't create a good model, oh well.
      } else if (new_model.is_flush()) {
        if (end == start + 2) {
          // It's very likely we just got two paragraph starts in a row.
          end = start + 1;
        } else if (start == row_start) {
          // Mark this as a Crown.
          if (new_model.justification() == JUSTIFICATION_LEFT) {
            model = kCrownLeft;
          } else {
            model = kCrownRight;
          }
        } else if (allow_flush_models) {
          model = theory->AddModel(new_model);
        }
      } else {
        model = theory->AddModel(new_model);
      }
      if (model) {
        // Apply the model: one start line followed by body lines.
        (*rows)[start].AddStartLine(model);
        for (int i = start + 1; i < end; i++) {
          (*rows)[i].AddBodyLine(model);
        }
      }
    }
    start = end;
  }
}
2032 | | |
2033 | | // We examine rows[row_start, row_end) and do the following: |
2034 | | // (1) Clear all existing hypotheses for the rows being considered. |
2035 | | // (2) Mark up any rows as exceptionally likely to be paragraph starts |
2036 | | // or paragraph body lines as such using both geometric and textual |
2037 | | // clues. |
2038 | | // (3) Form models for any sequence of start + continuation lines. |
2039 | | // (4) Smear the paragraph models to cover surrounding text. |
2040 | | static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows, |
2041 | 14.4k | int row_start, int row_end, ParagraphTheory *theory) { |
2042 | 14.4k | if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) { |
2043 | 1.94k | return; |
2044 | 1.94k | } |
2045 | | |
2046 | 12.5k | if (debug_level > 1) { |
2047 | 0 | tprintf("#############################################\n"); |
2048 | 0 | tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end); |
2049 | 0 | tprintf("#############################################\n"); |
2050 | 0 | } |
2051 | | |
2052 | 12.5k | RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); |
2053 | 12.5k | MarkStrongEvidence(rows, row_start, row_end); |
2054 | | |
2055 | 12.5k | DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows); |
2056 | | |
2057 | | // Create paragraph models. |
2058 | 12.5k | ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory); |
2059 | | |
2060 | 12.5k | DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows); |
2061 | | |
2062 | | // At this point, some rows are marked up as paragraphs with model numbers, |
2063 | | // and some rows are marked up as either LT_START or LT_BODY. Now let's |
2064 | | // smear any good paragraph hypotheses forward and backward. |
2065 | 12.5k | ParagraphModelSmearer smearer(rows, row_start, row_end, theory); |
2066 | 12.5k | smearer.Smear(); |
2067 | 12.5k | } |
2068 | | |
2069 | | static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start, |
2070 | 13.9k | int row_end, ParagraphTheory *theory) { |
2071 | 130k | for (int i = row_start + 1; i < row_end - 1; i++) { |
2072 | 116k | if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders && |
2073 | 116k | (*rows)[i + 1].ri_->has_leaders) { |
2074 | 0 | const ParagraphModel *model = |
2075 | 0 | theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0)); |
2076 | 0 | (*rows)[i].AddStartLine(model); |
2077 | 0 | } |
2078 | 116k | } |
2079 | 13.9k | } |
2080 | | |
2081 | | // Collect sequences of unique hypotheses in row registers and create proper |
2082 | | // paragraphs for them, referencing the paragraphs in row_owners. |
2083 | | static void ConvertHypothesizedModelRunsToParagraphs(int debug_level, |
2084 | | std::vector<RowScratchRegisters> &rows, |
2085 | | std::vector<PARA *> *row_owners, |
2086 | 13.9k | ParagraphTheory *theory) { |
2087 | 13.9k | int end = rows.size(); |
2088 | 13.9k | int start; |
2089 | 132k | for (; end > 0; end = start) { |
2090 | 118k | start = end - 1; |
2091 | 118k | const ParagraphModel *model = nullptr; |
2092 | | // TODO(eger): Be smarter about dealing with multiple hypotheses. |
2093 | 118k | bool single_line_paragraph = false; |
2094 | 118k | SetOfModels models; |
2095 | 118k | rows[start].NonNullHypotheses(&models); |
2096 | 118k | if (!models.empty()) { |
2097 | 8.76k | model = models[0]; |
2098 | 8.76k | if (rows[start].GetLineType(model) != LT_BODY) { |
2099 | 4.42k | single_line_paragraph = true; |
2100 | 4.42k | } |
2101 | 8.76k | } |
2102 | 118k | if (model && !single_line_paragraph) { |
2103 | | // walk back looking for more body lines and then a start line. |
2104 | 23.9k | while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) { |
2105 | | // do nothing |
2106 | 19.6k | } |
2107 | 4.34k | if (start < 0 || rows[start].GetLineType(model) != LT_START) { |
2108 | 6 | model = nullptr; |
2109 | 6 | } |
2110 | 4.34k | } |
2111 | 118k | if (model == nullptr) { |
2112 | 109k | continue; |
2113 | 109k | } |
2114 | | // rows[start, end) should be a paragraph. |
2115 | 8.75k | PARA *p = new PARA(); |
2116 | 8.75k | if (model == kCrownLeft || model == kCrownRight) { |
2117 | 1.87k | p->is_very_first_or_continuation = true; |
2118 | | // Crown paragraph. |
2119 | | // If we can find an existing ParagraphModel that fits, use it, |
2120 | | // else create a new one. |
2121 | 2.75k | for (unsigned row = end; row < rows.size(); row++) { |
2122 | 2.03k | if ((*row_owners)[row] && |
2123 | 2.03k | (ValidBodyLine(&rows, start, (*row_owners)[row]->model) && |
2124 | 1.96k | (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) { |
2125 | 1.16k | model = (*row_owners)[row]->model; |
2126 | 1.16k | break; |
2127 | 1.16k | } |
2128 | 2.03k | } |
2129 | 1.87k | if (model == kCrownLeft) { |
2130 | | // No subsequent model fits, so cons one up. |
2131 | 595 | model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT, |
2132 | 595 | rows[start].lmargin_ + rows[start].lindent_, 0, 0, |
2133 | 595 | Epsilon(rows[start].ri_->average_interword_space))); |
2134 | 1.28k | } else if (model == kCrownRight) { |
2135 | | // No subsequent model fits, so cons one up. |
2136 | 123 | model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT, |
2137 | 123 | rows[start].rmargin_ + rows[start].rmargin_, 0, 0, |
2138 | 123 | Epsilon(rows[start].ri_->average_interword_space))); |
2139 | 123 | } |
2140 | 1.87k | } |
2141 | 8.75k | rows[start].SetUnknown(); |
2142 | 8.75k | rows[start].AddStartLine(model); |
2143 | 32.7k | for (int i = start + 1; i < end; i++) { |
2144 | 23.9k | rows[i].SetUnknown(); |
2145 | 23.9k | rows[i].AddBodyLine(model); |
2146 | 23.9k | } |
2147 | 8.75k | p->model = model; |
2148 | 8.75k | p->has_drop_cap = rows[start].ri_->has_drop_cap; |
2149 | 8.75k | p->is_list_item = model->justification() == JUSTIFICATION_RIGHT |
2150 | 8.75k | ? rows[start].ri_->rword_indicates_list_item |
2151 | 8.75k | : rows[start].ri_->lword_indicates_list_item; |
2152 | 41.4k | for (int row = start; row < end; row++) { |
2153 | 32.7k | if ((*row_owners)[row] != nullptr) { |
2154 | 0 | tprintf( |
2155 | 0 | "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " |
2156 | 0 | "more than once!\n"); |
2157 | 0 | delete (*row_owners)[row]; |
2158 | 0 | } |
2159 | 32.7k | (*row_owners)[row] = p; |
2160 | 32.7k | } |
2161 | 8.75k | } |
2162 | 13.9k | } |
2163 | | |
// A span of row indices. LeftoverSegments() builds these inclusive and then
// converts them to half-open [begin, end) before handing them out.
struct Interval {
  int begin = 0;
  int end = 0;

  Interval() = default;
  Interval(int b, int e) : begin(b), end(e) {}
};
2171 | | |
2172 | | // Return whether rows[row] appears to be stranded, meaning that the evidence |
2173 | | // for this row is very weak due to context. For instance, two lines of source |
2174 | | // code may happen to be indented at the same tab vector as body text starts, |
2175 | | // leading us to think they are two start-of-paragraph lines. This is not |
2176 | | // optimal. However, we also don't want to mark a sequence of short dialog |
2177 | | // as "weak," so our heuristic is: |
2178 | | // (1) If a line is surrounded by lines of unknown type, it's weak. |
2179 | | // (2) If two lines in a row are start lines for a given paragraph type, but |
2180 | | // after that the same paragraph type does not continue, they're weak. |
2181 | 25.3k | static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) { |
2182 | 25.3k | SetOfModels row_models; |
2183 | 25.3k | rows[row].StrongHypotheses(&row_models); |
2184 | | |
2185 | 25.3k | for (auto &row_model : row_models) { |
2186 | 25.3k | bool all_starts = rows[row].GetLineType(); |
2187 | 25.3k | int run_length = 1; |
2188 | 25.3k | bool continues = true; |
2189 | 224k | for (int i = row - 1; i >= 0 && continues; i--) { |
2190 | 198k | SetOfModels models; |
2191 | 198k | rows[i].NonNullHypotheses(&models); |
2192 | 198k | switch (rows[i].GetLineType(row_model)) { |
2193 | 56.8k | case LT_START: |
2194 | 56.8k | run_length++; |
2195 | 56.8k | break; |
2196 | 38 | case LT_MULTIPLE: // explicit fall-through |
2197 | 137k | case LT_BODY: |
2198 | 137k | run_length++; |
2199 | 137k | all_starts = false; |
2200 | 137k | break; |
2201 | 4.04k | case LT_UNKNOWN: // explicit fall-through |
2202 | 4.04k | default: |
2203 | 4.04k | continues = false; |
2204 | 198k | } |
2205 | 198k | } |
2206 | 25.3k | continues = true; |
2207 | 190k | for (unsigned i = row + 1; i < rows.size() && continues; i++) { |
2208 | 165k | SetOfModels models; |
2209 | 165k | rows[i].NonNullHypotheses(&models); |
2210 | 165k | switch (rows[i].GetLineType(row_model)) { |
2211 | 45.3k | case LT_START: |
2212 | 45.3k | run_length++; |
2213 | 45.3k | break; |
2214 | 22 | case LT_MULTIPLE: // explicit fall-through |
2215 | 114k | case LT_BODY: |
2216 | 114k | run_length++; |
2217 | 114k | all_starts = false; |
2218 | 114k | break; |
2219 | 5.47k | case LT_UNKNOWN: // explicit fall-through |
2220 | 5.47k | default: |
2221 | 5.47k | continues = false; |
2222 | 165k | } |
2223 | 165k | } |
2224 | 25.3k | if (run_length > 2 || (!all_starts && run_length > 1)) { |
2225 | 25.0k | return false; |
2226 | 25.0k | } |
2227 | 25.3k | } |
2228 | 328 | return true; |
2229 | 25.3k | } |
2230 | | |
2231 | | // Go through rows[row_start, row_end) and gather up sequences that need better |
2232 | | // classification. |
2233 | | // + Sequences of non-empty rows without hypotheses. |
2234 | | // + Crown paragraphs not immediately followed by a strongly modeled line. |
2235 | | // + Single line paragraphs surrounded by text that doesn't match the |
2236 | | // model. |
2237 | | static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows, |
2238 | 55.7k | std::vector<Interval> *to_fix, int row_start, int row_end) { |
2239 | 55.7k | to_fix->clear(); |
2240 | 626k | for (int i = row_start; i < row_end; i++) { |
2241 | 570k | bool needs_fixing = false; |
2242 | | |
2243 | 570k | SetOfModels models; |
2244 | 570k | SetOfModels models_w_crowns; |
2245 | 570k | rows[i].StrongHypotheses(&models); |
2246 | 570k | rows[i].NonNullHypotheses(&models_w_crowns); |
2247 | 570k | if (models.empty() && !models_w_crowns.empty()) { |
2248 | | // Crown paragraph. Is it followed by a modeled line? |
2249 | 235k | for (unsigned end = i + 1; end < rows.size(); end++) { |
2250 | 227k | SetOfModels end_models; |
2251 | 227k | SetOfModels strong_end_models; |
2252 | 227k | rows[end].NonNullHypotheses(&end_models); |
2253 | 227k | rows[end].StrongHypotheses(&strong_end_models); |
2254 | 227k | if (end_models.empty()) { |
2255 | 15.9k | needs_fixing = true; |
2256 | 15.9k | break; |
2257 | 211k | } else if (!strong_end_models.empty()) { |
2258 | 7.11k | needs_fixing = false; |
2259 | 7.11k | break; |
2260 | 7.11k | } |
2261 | 227k | } |
2262 | 539k | } else if (models.empty() && rows[i].ri_->num_words > 0) { |
2263 | | // No models at all. |
2264 | 513k | needs_fixing = true; |
2265 | 513k | } |
2266 | | |
2267 | 570k | if (!needs_fixing && !models.empty()) { |
2268 | 25.3k | needs_fixing = RowIsStranded(rows, i); |
2269 | 25.3k | } |
2270 | | |
2271 | 570k | if (needs_fixing) { |
2272 | 530k | if (!to_fix->empty() && to_fix->back().end == i - 1) { |
2273 | 476k | to_fix->back().end = i; |
2274 | 476k | } else { |
2275 | 54.0k | to_fix->push_back(Interval(i, i)); |
2276 | 54.0k | } |
2277 | 530k | } |
2278 | 570k | } |
2279 | | // Convert inclusive intervals to half-open intervals. |
2280 | 55.7k | for (auto &i : *to_fix) { |
2281 | 54.0k | i.end = i.end + 1; |
2282 | 54.0k | } |
2283 | 55.7k | } |
2284 | | |
2285 | | // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), |
2286 | | // normalize each row_owner to point to an actual PARA, and output the |
2287 | | // paragraphs in order onto paragraphs. |
2288 | 13.9k | void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) { |
2289 | 13.9k | std::vector<PARA *> &rows = *row_owners; |
2290 | 13.9k | paragraphs->clear(); |
2291 | 13.9k | PARA_IT out(paragraphs); |
2292 | 13.9k | PARA *formerly_null = nullptr; |
2293 | 156k | for (unsigned i = 0; i < rows.size(); i++) { |
2294 | 142k | if (rows[i] == nullptr) { |
2295 | 110k | if (i == 0 || rows[i - 1] != formerly_null) { |
2296 | 11.9k | rows[i] = formerly_null = new PARA(); |
2297 | 98.0k | } else { |
2298 | 98.0k | rows[i] = formerly_null; |
2299 | 98.0k | continue; |
2300 | 98.0k | } |
2301 | 110k | } else if (i > 0 && rows[i - 1] == rows[i]) { |
2302 | 23.9k | continue; |
2303 | 23.9k | } |
2304 | 20.7k | out.add_after_then_move(rows[i]); |
2305 | 20.7k | } |
2306 | 13.9k | } |
2307 | | |
// Main entry point for Paragraph Detection Algorithm.
//
// Given a set of equally spaced textlines (described by row_infos),
// Split them into paragraphs.
//
// Output:
//   row_owners - one pointer for each row, to the paragraph it belongs to.
//   paragraphs - this is the actual list of PARA objects.
//   models - the list of paragraph models referenced by the PARA objects.
//            caller is responsible for deleting the models.
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
                      std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
                      std::vector<ParagraphModel *> *models) {
  ParagraphTheory theory(models);

  // Initialize row_owners to be a bunch of nullptr pointers.
  row_owners->clear();
  row_owners->resize(row_infos->size());

  // Set up row scratch registers for the main algorithm.
  std::vector<RowScratchRegisters> rows(row_infos->size());
  for (unsigned i = 0; i < row_infos->size(); i++) {
    rows[i].Init((*row_infos)[i]);
  }

  // Pass 1:
  //   Detect sequences of lines that all contain leader dots (.....)
  //   These are likely Tables of Contents.  If there are three text lines in
  //   a row with leader dots, it's pretty safe to say the middle one should
  //   be a paragraph of its own.
  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);

  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);

  std::vector<Interval> leftovers;
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (auto &leftover : leftovers) {
    // Pass 2a:
    //   Find any strongly evidenced start-of-paragraph lines.  If they're
    //   followed by two lines that look like body lines, make a paragraph
    //   model for that and see if that model applies throughout the text
    //   (that is, "smear" it).
    StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);

    // Pass 2b:
    //   If we had any luck in pass 2a, we got part of the page and didn't
    //   know how to classify a few runs of rows. Take the segments that
    //   didn't find a model and reprocess them individually.
    std::vector<Interval> leftovers2;
    LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
    // Pass 2a "was useful" if it shrank the unclassified region: either it
    // split into several leftover runs, or the single remaining run no
    // longer spans the whole page.
    bool pass2a_was_useful =
        leftovers2.size() > 1 ||
        (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
    if (pass2a_was_useful) {
      for (auto &leftover2 : leftovers2) {
        StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
      }
    }
  }

  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);

  // Pass 3:
  //   These are the dregs for which we didn't have enough strong textual
  //   and geometric clues to form matching models for.  Let's see if
  //   the geometric clues are simple enough that we could just use those.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (auto &leftover : leftovers) {
    GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
  }

  // Undo any flush models for which there's little evidence.
  DowngradeWeakestToCrowns(debug_level, &theory, &rows);

  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);

  // Pass 4:
  //   Take everything that's still not marked up well and clear all markings.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (auto &leftover : leftovers) {
    for (int j = leftover.begin; j < leftover.end; j++) {
      rows[j].SetUnknown();
    }
  }

  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);

  // Convert all of the unique hypothesis runs to PARAs.
  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);

  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);

  // Finally, clean up any dangling nullptr row paragraph parents.
  CanonicalizeDetectionResults(row_owners, paragraphs);
}
2403 | | |
2404 | | // ============ Code interfacing with the rest of Tesseract ================== |
2405 | | |
// Fill in the text-related RowInfo fields when recognition has NOT run yet:
// we only have layout, so fabricate placeholder text ("x" per symbol, a
// space at each word boundary) for the textual heuristics and debug prints,
// then compute lword_box/rword_box/num_words from the word layout.
static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
  // Set up text, lword_text, and rword_text (mostly for debug printing).
  std::string fake_text;
  PageIterator pit(static_cast<const PageIterator &>(it));
  if (!pit.Empty(RIL_WORD)) {
    bool first_word = true;
    do {
      // One 'x' per symbol; the first word's x's also accumulate into
      // lword_text, and rword_text is reset at each word boundary so it
      // ends up holding only the last word's x's.
      fake_text += "x";
      if (first_word) {
        info->lword_text += "x";
      }
      info->rword_text += "x";
      if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
          !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
        fake_text += " ";
        info->rword_text = "";
        first_word = false;
      }
    } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL));
  }
  if (fake_text.empty()) {
    return;
  }

  // Encode the geometric left indent as leading spaces (one per average
  // interword space that fits in the left margin).
  int lspaces = info->pix_ldistance / info->average_interword_space;
  for (int i = 0; i < lspaces; i++) {
    info->text += ' ';
  }
  info->text += fake_text;

  // Set up lword_box, rword_box, and num_words.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();

  WERD_RES *lword = nullptr;
  WERD_RES *rword = nullptr;
  info->num_words = 0;
  do {
    if (word_res) {
      if (!lword) {
        lword = word_res;  // first word on the row
      }
      if (rword != word_res) {
        info->num_words++;
      }
      rword = word_res;  // last word seen so far
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);

  if (lword) {
    info->lword_box = lword->word->bounding_box();
  }
  if (rword) {
    info->rword_box = rword->word->bounding_box();
  }
}
2464 | | |
// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
// detector RowInfo with all relevant information from the row.
// Geometry (margins, x-height, interword space) comes from the ROW; text,
// word boxes, direction and leader detection come from the recognized words
// (or from InitializeTextAndBoxesPreRecognition if recognition hasn't run).
static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) {
  if (it.PageResIt()->row() != nullptr) {
    ROW *row = it.PageResIt()->row()->row;
    info->pix_ldistance = row->lmargin();
    info->pix_rdistance = row->rmargin();
    // Fall back to x-height (at least 1) when the row has no space estimate.
    info->average_interword_space =
        row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
    info->pix_xheight = row->x_height();
    info->has_leaders = false;
    info->has_drop_cap = row->has_drop_cap();
    info->ltr = true; // set below depending on word scripts
  } else {
    // No physical row available: use neutral defaults.
    info->pix_ldistance = info->pix_rdistance = 0;
    info->average_interword_space = 1;
    info->pix_xheight = 1.0;
    info->has_leaders = false;
    info->has_drop_cap = false;
    info->ltr = true;
  }

  info->num_words = 0;
  info->lword_indicates_list_item = false;
  info->lword_likely_starts_idea = false;
  info->lword_likely_ends_idea = false;
  info->rword_indicates_list_item = false;
  info->rword_likely_starts_idea = false;
  info->rword_likely_ends_idea = false;
  info->has_leaders = false;
  info->ltr = true;

  if (!after_recognition) {
    InitializeTextAndBoxesPreRecognition(it, info);
    return;
  }
  info->text = "";
  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
  int trailing_ws_idx = strlen(text.get()); // strip trailing space
  while (trailing_ws_idx > 0 &&
         // isspace() only takes ASCII
         // (the isascii() guard also protects isspace() from negative chars)
         isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
    trailing_ws_idx--;
  }
  if (trailing_ws_idx > 0) {
    // Encode the geometric left indent as leading spaces, then copy the
    // line's text minus the trailing whitespace.
    int lspaces = info->pix_ldistance / info->average_interword_space;
    for (int i = 0; i < lspaces; i++) {
      info->text += ' ';
    }
    for (int i = 0; i < trailing_ws_idx; i++) {
      info->text += text[i];
    }
  }

  if (info->text.empty()) {
    return;
  }

  // Walk the recognized words of this row to collect direction votes,
  // leader (W_REP_CHAR) counts, and the first/last word.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  std::vector<WERD_RES *> werds;
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();
  int num_leaders = 0;
  int ltr = 0;
  int rtl = 0;
  do {
    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
      werds.push_back(word_res);
      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
      if (word_res->word->flag(W_REP_CHAR)) {
        num_leaders++;
      }
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);
  info->ltr = ltr >= rtl;           // majority vote; ties go to LTR
  info->has_leaders = num_leaders > 3;
  info->num_words = werds.size();
  if (!werds.empty()) {
    // Extract attributes of the leftmost and rightmost words.
    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
    info->lword_text = lword->best_choice->unichar_string().c_str();
    info->rword_text = rword->best_choice->unichar_string().c_str();
    info->lword_box = lword->word->bounding_box();
    info->rword_box = rword->word->bounding_box();
    LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text,
                       &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
                       &info->lword_likely_ends_idea);
    RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text,
                        &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
                        &info->rword_likely_ends_idea);
  }
}
2558 | | |
// This is called after rows have been identified and words are recognized.
// Much of this could be implemented before word recognition, but text helps
// to identify bulleted lists and gives good signals for sentence boundaries.
// Converts the block's text lines to RowInfos, runs the paragraph detection
// algorithm (unless the block is an image block), and writes the resulting
// PARA pointers back onto the ROWs.
void DetectParagraphs(int debug_level, bool after_text_recognition,
                      const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {
  // Clear out any preconceived notions.
  if (block_start->Empty(RIL_TEXTLINE)) {
    return;
  }
  BLOCK *block = block_start->PageResIt()->block()->block;
  block->para_list()->clear();
  bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();

  // Convert the Tesseract structures to RowInfos
  // for the paragraph detection algorithm.
  MutableIterator row(*block_start);
  if (row.Empty(RIL_TEXTLINE)) {
    return; // end of input already.
  }

  std::vector<RowInfo> row_infos;
  do {
    if (!row.PageResIt()->row()) {
      continue; // empty row.
    }
    // Detach any previously assigned paragraph before re-detection.
    row.PageResIt()->row()->row->set_para(nullptr);
    row_infos.emplace_back();
    RowInfo &ri = row_infos.back();
    InitializeRowInfo(after_text_recognition, row, &ri);
  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE));

  // If we're called before text recognition, we might not have
  // tight block bounding boxes, so trim by the minimum on each side.
  if (!row_infos.empty()) {
    int min_lmargin = row_infos[0].pix_ldistance;
    int min_rmargin = row_infos[0].pix_rdistance;
    for (unsigned i = 1; i < row_infos.size(); i++) {
      if (row_infos[i].pix_ldistance < min_lmargin) {
        min_lmargin = row_infos[i].pix_ldistance;
      }
      if (row_infos[i].pix_rdistance < min_rmargin) {
        min_rmargin = row_infos[i].pix_rdistance;
      }
    }
    // Subtract the common margin so indents are measured relative to the
    // tightest line of the block.
    if (min_lmargin > 0 || min_rmargin > 0) {
      for (auto &row_info : row_infos) {
        row_info.pix_ldistance -= min_lmargin;
        row_info.pix_rdistance -= min_rmargin;
      }
    }
  }

  // Run the paragraph detection algorithm.
  std::vector<PARA *> row_owners;
  if (!is_image_block) {
    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
  } else {
    // Image blocks get no real detection: just canonicalize the all-null
    // owners into placeholder paragraphs.
    row_owners.resize(row_infos.size());
    CanonicalizeDetectionResults(&row_owners, block->para_list());
  }

  // Now stitch in the row_owners into the rows.
  row = *block_start;
  for (auto &row_owner : row_owners) {
    while (!row.PageResIt()->row()) {
      row.Next(RIL_TEXTLINE); // skip empty rows, mirroring the loop above
    }
    row.PageResIt()->row()->row->set_para(row_owner);
    row.Next(RIL_TEXTLINE);
  }
}
2630 | | |
2631 | | } // namespace tesseract |