Coverage Report

Created: 2024-02-28 06:46

/src/tesseract/src/textord/colpartition.cpp
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        colpartition.cpp
3
// Description: Class to hold partitions of the page that correspond
4
//              roughly to text lines.
5
// Author:      Ray Smith
6
//
7
// (C) Copyright 2008, Google Inc.
8
// Licensed under the Apache License, Version 2.0 (the "License");
9
// you may not use this file except in compliance with the License.
10
// You may obtain a copy of the License at
11
// http://www.apache.org/licenses/LICENSE-2.0
12
// Unless required by applicable law or agreed to in writing, software
13
// distributed under the License is distributed on an "AS IS" BASIS,
14
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
// See the License for the specific language governing permissions and
16
// limitations under the License.
17
//
18
///////////////////////////////////////////////////////////////////////
19
20
#ifdef HAVE_CONFIG_H
21
#  include "config_auto.h"
22
#endif
23
24
#include "colpartition.h"
25
#include "colpartitiongrid.h"
26
#include "colpartitionset.h"
27
#include "detlinefit.h"
28
#include "dppoint.h"
29
#include "helpers.h" // for UpdateRange
30
#include "host.h"    // for NearlyEqual
31
#include "imagefind.h"
32
#include "workingpartset.h"
33
34
#include <algorithm>
35
36
namespace tesseract {
37
38
//////////////// ColPartition Implementation ////////////////
39
40
// enum to refer to the entries in a neighbourhood of lines.
41
// Used by SmoothSpacings to test for blips with OKSpacingBlip.
42
enum SpacingNeighbourhood {
43
  PN_ABOVE2,
44
  PN_ABOVE1,
45
  PN_UPPER,
46
  PN_LOWER,
47
  PN_BELOW1,
48
  PN_BELOW2,
49
  PN_COUNT
50
};
51
52
// Maximum change in spacing (in inches) to ignore.
53
const double kMaxSpacingDrift = 1.0 / 72; // 1/72 is one point.
54
// Maximum fraction of line height used as an additional allowance
55
// for top spacing.
56
const double kMaxTopSpacingFraction = 0.25;
57
// What multiple of the largest line height should be used as an upper bound
58
// for whether lines are in the same text block?
59
const double kMaxSameBlockLineSpacing = 3;
60
// Maximum ratio of sizes for lines to be considered the same size.
61
const double kMaxSizeRatio = 1.5;
62
// Fraction of max of leader width and gap for max IQR of gaps.
63
const double kMaxLeaderGapFractionOfMax = 0.25;
64
// Fraction of min of leader width and gap for max IQR of gaps.
65
const double kMaxLeaderGapFractionOfMin = 0.5;
66
// Minimum number of blobs to be considered a leader.
67
const int kMinLeaderCount = 5;
68
// Minimum score for a STRONG_CHAIN textline.
69
const int kMinStrongTextValue = 6;
70
// Minimum score for a CHAIN textline.
71
const int kMinChainTextValue = 3;
72
// Minimum number of blobs for strong horizontal text lines.
73
const int kHorzStrongTextlineCount = 8;
74
// Minimum height (in image pixels) for strong horizontal text lines.
75
const int kHorzStrongTextlineHeight = 10;
76
// Minimum aspect ratio for strong horizontal text lines.
77
const int kHorzStrongTextlineAspect = 5;
78
// Maximum upper quartile error allowed on a baseline fit as a fraction
79
// of height.
80
const double kMaxBaselineError = 0.4375;
81
// Min coverage for a good baseline between vectors
82
const double kMinBaselineCoverage = 0.5;
83
// Max RMS color noise to compare colors.
84
const int kMaxRMSColorNoise = 128;
85
// Maximum distance to allow a partition color to be to use that partition
86
// in smoothing neighbouring types. This is a squared distance.
87
const int kMaxColorDistance = 900;
88
89
// blob_type is the blob_region_type_ of the blobs in this partition.
90
// Vertical is the direction of logical vertical on the possibly skewed image.
91
ColPartition::ColPartition(BlobRegionType blob_type, const ICOORD &vertical)
92
    : left_margin_(-INT32_MAX),
93
      right_margin_(INT32_MAX),
94
      median_bottom_(INT32_MAX),
95
      median_top_(-INT32_MAX),
96
      median_left_(INT32_MAX),
97
      median_right_(-INT32_MAX),
98
      blob_type_(blob_type),
99
0
      vertical_(vertical) {
100
0
  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
101
0
}
102
103
// Constructs a fake ColPartition with a single fake BLOBNBOX, all made
104
// from a single TBOX.
105
// WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
106
// the ColPartition owns the BLOBNBOX!!!
107
// Call DeleteBoxes before deleting the ColPartition.
108
ColPartition *ColPartition::FakePartition(const TBOX &box,
109
                                          PolyBlockType block_type,
110
                                          BlobRegionType blob_type,
111
0
                                          BlobTextFlowType flow) {
112
0
  auto *part = new ColPartition(blob_type, ICOORD(0, 1));
113
0
  part->set_type(block_type);
114
0
  part->set_flow(flow);
115
0
  part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
116
0
  part->set_left_margin(box.left());
117
0
  part->set_right_margin(box.right());
118
0
  part->SetBlobTypes();
119
0
  part->ComputeLimits();
120
0
  part->ClaimBoxes();
121
0
  return part;
122
0
}
123
124
// Constructs and returns a ColPartition with the given real BLOBNBOX,
125
// and sets it up to be a "big" partition (single-blob partition bigger
126
// than the surrounding text that may be a dropcap, two or more vertically
127
// touching characters, or some graphic element.
128
// If the given list is not nullptr, the partition is also added to the list.
129
ColPartition *ColPartition::MakeBigPartition(BLOBNBOX *box,
130
0
                                             ColPartition_LIST *big_part_list) {
131
0
  box->set_owner(nullptr);
132
0
  auto *single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
133
0
  single->set_flow(BTFT_NONE);
134
0
  single->AddBox(box);
135
0
  single->ComputeLimits();
136
0
  single->ClaimBoxes();
137
0
  single->SetBlobTypes();
138
0
  single->set_block_owned(true);
139
0
  if (big_part_list != nullptr) {
140
0
    ColPartition_IT part_it(big_part_list);
141
0
    part_it.add_to_end(single);
142
0
  }
143
0
  return single;
144
0
}
145
146
0
ColPartition::~ColPartition() {
147
  // Remove this as a partner of all partners, as we don't want them
148
  // referring to a deleted object.
149
0
  ColPartition_C_IT it(&upper_partners_);
150
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
151
0
    it.data()->RemovePartner(false, this);
152
0
  }
153
0
  it.set_to_list(&lower_partners_);
154
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
155
0
    it.data()->RemovePartner(true, this);
156
0
  }
157
0
}
158
159
// Constructs a fake ColPartition with no BLOBNBOXes to represent a
160
// horizontal or vertical line, given a type and a bounding box.
161
ColPartition *ColPartition::MakeLinePartition(BlobRegionType blob_type,
162
                                              const ICOORD &vertical, int left,
163
0
                                              int bottom, int right, int top) {
164
0
  auto *part = new ColPartition(blob_type, vertical);
165
0
  part->bounding_box_ = TBOX(left, bottom, right, top);
166
0
  part->median_bottom_ = bottom;
167
0
  part->median_top_ = top;
168
0
  part->median_height_ = top - bottom;
169
0
  part->median_left_ = left;
170
0
  part->median_right_ = right;
171
0
  part->median_width_ = right - left;
172
0
  part->left_key_ = part->BoxLeftKey();
173
0
  part->right_key_ = part->BoxRightKey();
174
0
  return part;
175
0
}
176
177
// Adds the given box to the partition, updating the partition bounds.
178
// The list of boxes in the partition is updated, ensuring that no box is
179
// recorded twice, and the boxes are kept in increasing left position.
180
0
void ColPartition::AddBox(BLOBNBOX *bbox) {
181
0
  TBOX box = bbox->bounding_box();
182
  // Update the partition limits.
183
0
  if (boxes_.empty()) {
184
0
    bounding_box_ = box;
185
0
  } else {
186
0
    bounding_box_ += box;
187
0
  }
188
189
0
  if (IsVerticalType()) {
190
0
    if (!last_add_was_vertical_) {
191
0
      boxes_.sort(SortByBoxBottom<BLOBNBOX>);
192
0
      last_add_was_vertical_ = true;
193
0
    }
194
0
    boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);
195
0
  } else {
196
0
    if (last_add_was_vertical_) {
197
0
      boxes_.sort(SortByBoxLeft<BLOBNBOX>);
198
0
      last_add_was_vertical_ = false;
199
0
    }
200
0
    boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
201
0
  }
202
0
  if (!left_key_tab_) {
203
0
    left_key_ = BoxLeftKey();
204
0
  }
205
0
  if (!right_key_tab_) {
206
0
    right_key_ = BoxRightKey();
207
0
  }
208
0
  if (TabFind::WithinTestRegion(2, box.left(), box.bottom())) {
209
0
    tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
210
0
            box.left(), box.bottom(), box.right(), box.top(),
211
0
            bounding_box_.left(), bounding_box_.right());
212
0
  }
213
0
}
214
215
// Removes the given box from the partition, updating the bounds.
216
0
void ColPartition::RemoveBox(BLOBNBOX *box) {
217
0
  BLOBNBOX_C_IT bb_it(&boxes_);
218
0
  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
219
0
    if (box == bb_it.data()) {
220
0
      bb_it.extract();
221
0
      ComputeLimits();
222
0
      return;
223
0
    }
224
0
  }
225
0
}
226
227
// Returns the tallest box in the partition, as measured perpendicular to the
228
// presumed flow of text.
229
0
BLOBNBOX *ColPartition::BiggestBox() {
230
0
  BLOBNBOX *biggest = nullptr;
231
0
  BLOBNBOX_C_IT bb_it(&boxes_);
232
0
  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
233
0
    BLOBNBOX *bbox = bb_it.data();
234
0
    if (IsVerticalType()) {
235
0
      if (biggest == nullptr ||
236
0
          bbox->bounding_box().width() > biggest->bounding_box().width()) {
237
0
        biggest = bbox;
238
0
      }
239
0
    } else {
240
0
      if (biggest == nullptr ||
241
0
          bbox->bounding_box().height() > biggest->bounding_box().height()) {
242
0
        biggest = bbox;
243
0
      }
244
0
    }
245
0
  }
246
0
  return biggest;
247
0
}
248
249
// Returns the bounding box excluding the given box.
250
0
TBOX ColPartition::BoundsWithoutBox(BLOBNBOX *box) {
251
0
  TBOX result;
252
0
  BLOBNBOX_C_IT bb_it(&boxes_);
253
0
  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
254
0
    if (box != bb_it.data()) {
255
0
      result += bb_it.data()->bounding_box();
256
0
    }
257
0
  }
258
0
  return result;
259
0
}
260
261
// Claims the boxes in the boxes_list by marking them with a this owner
262
// pointer. If a box is already owned, then it must be owned by this.
263
0
void ColPartition::ClaimBoxes() {
264
0
  BLOBNBOX_C_IT bb_it(&boxes_);
265
0
  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
266
0
    BLOBNBOX *bblob = bb_it.data();
267
0
    ColPartition *other = bblob->owner();
268
0
    if (other == nullptr) {
269
      // Normal case: ownership is available.
270
0
      bblob->set_owner(this);
271
0
    } else {
272
0
      ASSERT_HOST(other == this);
273
0
    }
274
0
  }
275
0
}
276
277
// nullptr the owner of the blobs in this partition, so they can be deleted
278
// independently of the ColPartition.
279
0
void ColPartition::DisownBoxes() {
280
0
  BLOBNBOX_C_IT bb_it(&boxes_);
281
0
  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
282
0
    BLOBNBOX *bblob = bb_it.data();
283
0
    ASSERT_HOST(bblob->owner() == this || bblob->owner() == nullptr);
284
0
    bblob->set_owner(nullptr);
285
0
  }
286
0
}
287
288
// nullptr the owner of the blobs in this partition that are owned by this
289
// partition, so they can be deleted independently of the ColPartition.
290
// Any blobs that are not owned by this partition get to keep their owner
291
// without an assert failure.
292
0
void ColPartition::DisownBoxesNoAssert() {
293
0
  BLOBNBOX_C_IT bb_it(&boxes_);
294
0
  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
295
0
    BLOBNBOX *bblob = bb_it.data();
296
0
    if (bblob->owner() == this) {
297
0
      bblob->set_owner(nullptr);
298
0
    }
299
0
  }
300
0
}
301
302
// Nulls the owner of the blobs in this partition that are owned by this
303
// partition and not leader blobs, removing them from the boxes_ list, thus
304
// turning this partition back to a leader partition if it contains a leader,
305
// or otherwise leaving it empty. Returns true if any boxes remain.
306
0
bool ColPartition::ReleaseNonLeaderBoxes() {
307
0
  BLOBNBOX_C_IT bb_it(&boxes_);
308
0
  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
309
0
    BLOBNBOX *bblob = bb_it.data();
310
0
    if (bblob->flow() != BTFT_LEADER) {
311
0
      if (bblob->owner() == this) {
312
0
        bblob->set_owner(nullptr);
313
0
      }
314
0
      bb_it.extract();
315
0
    }
316
0
  }
317
0
  if (bb_it.empty()) {
318
0
    return false;
319
0
  }
320
0
  flow_ = BTFT_LEADER;
321
0
  ComputeLimits();
322
0
  return true;
323
0
}
324
325
// Delete the boxes that this partition owns.
326
0
void ColPartition::DeleteBoxes() {
327
  // Although the boxes_ list is a C_LIST, in some cases it owns the
328
  // BLOBNBOXes, as the ColPartition takes ownership from the grid,
329
  // and the BLOBNBOXes own the underlying C_BLOBs.
330
0
  for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
331
0
    BLOBNBOX *bblob = bb_it.extract();
332
    // TODO: remove next line, currently still needed for resultiterator_test.
333
0
    delete bblob->remove_cblob();
334
0
    delete bblob;
335
0
  }
336
0
}
337
338
// Reflects the partition in the y-axis, assuming that its blobs have
339
// already been done. Corrects only a limited part of the members, since
340
// this function is assumed to be used shortly after initial creation, which
341
// is before a lot of the members are used.
342
0
void ColPartition::ReflectInYAxis() {
343
0
  BLOBNBOX_CLIST reversed_boxes;
344
0
  BLOBNBOX_C_IT reversed_it(&reversed_boxes);
345
  // Reverse the order of the boxes_.
346
0
  BLOBNBOX_C_IT bb_it(&boxes_);
347
0
  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
348
0
    reversed_it.add_before_then_move(bb_it.extract());
349
0
  }
350
0
  bb_it.add_list_after(&reversed_boxes);
351
0
  ASSERT_HOST(!left_key_tab_ && !right_key_tab_);
352
0
  int tmp = left_margin_;
353
0
  left_margin_ = -right_margin_;
354
0
  right_margin_ = -tmp;
355
0
  ComputeLimits();
356
0
}
357
358
// Returns true if this is a legal partition - meaning that the conditions
359
// left_margin <= bounding_box left
360
// left_key <= bounding box left key
361
// bounding box left <= bounding box right
362
// and likewise for right margin and key
363
// are all met.
364
0
bool ColPartition::IsLegal() {
365
0
  if (bounding_box_.left() > bounding_box_.right()) {
366
0
    if (textord_debug_bugs) {
367
0
      tprintf("Bounding box invalid\n");
368
0
      Print();
369
0
    }
370
0
    return false; // Bounding box invalid.
371
0
  }
372
0
  if (left_margin_ > bounding_box_.left() ||
373
0
      right_margin_ < bounding_box_.right()) {
374
0
    if (textord_debug_bugs) {
375
0
      tprintf("Margins invalid\n");
376
0
      Print();
377
0
    }
378
0
    return false; // Margins invalid.
379
0
  }
380
0
  if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
381
0
    if (textord_debug_bugs) {
382
0
      tprintf("Key inside box: %d v %d or %d v %d\n", left_key_, BoxLeftKey(),
383
0
              right_key_, BoxRightKey());
384
0
      Print();
385
0
    }
386
0
    return false; // Keys inside the box.
387
0
  }
388
0
  return true;
389
0
}
390
391
// Returns true if the left and right edges are approximately equal.
392
0
bool ColPartition::MatchingColumns(const ColPartition &other) const {
393
0
  int y = (MidY() + other.MidY()) / 2;
394
0
  if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
395
0
                   LeftAtY(y) / kColumnWidthFactor, 1)) {
396
0
    return false;
397
0
  }
398
0
  if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
399
0
                   RightAtY(y) / kColumnWidthFactor, 1)) {
400
0
    return false;
401
0
  }
402
0
  return true;
403
0
}
404
405
// Returns true if the colors match for two text partitions.
406
0
bool ColPartition::MatchingTextColor(const ColPartition &other) const {
407
0
  if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&
408
0
      other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise) {
409
0
    return false; // Too noisy.
410
0
  }
411
412
  // Colors must match for other to count.
413
0
  double d_this1_o =
414
0
      ImageFind::ColorDistanceFromLine(other.color1_, other.color2_, color1_);
415
0
  double d_this2_o =
416
0
      ImageFind::ColorDistanceFromLine(other.color1_, other.color2_, color2_);
417
0
  double d_o1_this =
418
0
      ImageFind::ColorDistanceFromLine(color1_, color2_, other.color1_);
419
0
  double d_o2_this =
420
0
      ImageFind::ColorDistanceFromLine(color1_, color2_, other.color2_);
421
  // All 4 distances must be small enough.
422
0
  return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&
423
0
         d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;
424
0
}
425
426
// Returns true if the sizes match for two text partitions,
427
// taking orientation into account. See also SizesSimilar.
428
0
bool ColPartition::MatchingSizes(const ColPartition &other) const {
429
0
  if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT) {
430
0
    return !TabFind::DifferentSizes(median_width_, other.median_width_);
431
0
  } else {
432
0
    return !TabFind::DifferentSizes(median_height_, other.median_height_);
433
0
  }
434
0
}
435
436
// Returns true if there is no tabstop violation in merging this and other.
437
0
bool ColPartition::ConfirmNoTabViolation(const ColPartition &other) const {
438
0
  if (bounding_box_.right() < other.bounding_box_.left() &&
439
0
      bounding_box_.right() < other.LeftBlobRule()) {
440
0
    return false;
441
0
  }
442
0
  if (other.bounding_box_.right() < bounding_box_.left() &&
443
0
      other.bounding_box_.right() < LeftBlobRule()) {
444
0
    return false;
445
0
  }
446
0
  if (bounding_box_.left() > other.bounding_box_.right() &&
447
0
      bounding_box_.left() > other.RightBlobRule()) {
448
0
    return false;
449
0
  }
450
0
  if (other.bounding_box_.left() > bounding_box_.right() &&
451
0
      other.bounding_box_.left() > RightBlobRule()) {
452
0
    return false;
453
0
  }
454
0
  return true;
455
0
}
456
457
// Returns true if other has a similar stroke width to this.
458
bool ColPartition::MatchingStrokeWidth(const ColPartition &other,
459
                                       double fractional_tolerance,
460
0
                                       double constant_tolerance) const {
461
0
  int match_count = 0;
462
0
  int nonmatch_count = 0;
463
0
  BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST *>(&boxes_));
464
0
  BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST *>(&other.boxes_));
465
0
  box_it.mark_cycle_pt();
466
0
  other_it.mark_cycle_pt();
467
0
  while (!box_it.cycled_list() && !other_it.cycled_list()) {
468
0
    if (box_it.data()->MatchingStrokeWidth(
469
0
            *other_it.data(), fractional_tolerance, constant_tolerance)) {
470
0
      ++match_count;
471
0
    } else {
472
0
      ++nonmatch_count;
473
0
    }
474
0
    box_it.forward();
475
0
    other_it.forward();
476
0
  }
477
0
  return match_count > nonmatch_count;
478
0
}
479
480
// Returns true if base is an acceptable diacritic base char merge
481
// with this as the diacritic.
482
// Returns true if:
483
// (1) this is a ColPartition containing only diacritics, and
484
// (2) the base characters indicated on the diacritics all believably lie
485
// within the text line of the candidate ColPartition.
486
bool ColPartition::OKDiacriticMerge(const ColPartition &candidate,
487
0
                                    bool debug) const {
488
0
  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST *>(&boxes_));
489
0
  int min_top = INT32_MAX;
490
0
  int max_bottom = -INT32_MAX;
491
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
492
0
    BLOBNBOX *blob = it.data();
493
0
    if (!blob->IsDiacritic()) {
494
0
      if (debug) {
495
0
        tprintf("Blob is not a diacritic:");
496
0
        blob->bounding_box().print();
497
0
      }
498
0
      return false; // All blobs must have diacritic bases.
499
0
    }
500
0
    if (blob->base_char_top() < min_top) {
501
0
      min_top = blob->base_char_top();
502
0
    }
503
0
    if (blob->base_char_bottom() > max_bottom) {
504
0
      max_bottom = blob->base_char_bottom();
505
0
    }
506
0
  }
507
  // If the intersection of all vertical ranges of all base characters
508
  // overlaps the median range of this, then it is OK.
509
0
  bool result =
510
0
      min_top > candidate.median_bottom_ && max_bottom < candidate.median_top_;
511
0
  if (debug) {
512
0
    if (result) {
513
0
      tprintf("OKDiacritic!\n");
514
0
    } else {
515
0
      tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n", max_bottom, min_top,
516
0
              median_bottom_, median_top_);
517
0
    }
518
0
  }
519
0
  return result;
520
0
}
521
522
// Sets the sort key using either the tab vector, or the bounding box if
523
// the tab vector is nullptr. If the tab_vector lies inside the bounding_box,
524
// use the edge of the box as a key any way.
525
0
void ColPartition::SetLeftTab(const TabVector *tab_vector) {
526
0
  if (tab_vector != nullptr) {
527
0
    left_key_ = tab_vector->sort_key();
528
0
    left_key_tab_ = left_key_ <= BoxLeftKey();
529
0
  } else {
530
0
    left_key_tab_ = false;
531
0
  }
532
0
  if (!left_key_tab_) {
533
0
    left_key_ = BoxLeftKey();
534
0
  }
535
0
}
536
537
// As SetLeftTab, but with the right.
538
0
void ColPartition::SetRightTab(const TabVector *tab_vector) {
539
0
  if (tab_vector != nullptr) {
540
0
    right_key_ = tab_vector->sort_key();
541
0
    right_key_tab_ = right_key_ >= BoxRightKey();
542
0
  } else {
543
0
    right_key_tab_ = false;
544
0
  }
545
0
  if (!right_key_tab_) {
546
0
    right_key_ = BoxRightKey();
547
0
  }
548
0
}
549
550
// Copies the left/right tab from the src partition, but if take_box is
551
// true, copies the box instead and uses that as a key.
552
0
void ColPartition::CopyLeftTab(const ColPartition &src, bool take_box) {
553
0
  left_key_tab_ = take_box ? false : src.left_key_tab_;
554
0
  if (left_key_tab_) {
555
0
    left_key_ = src.left_key_;
556
0
  } else {
557
0
    bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
558
0
    left_key_ = BoxLeftKey();
559
0
  }
560
0
  if (left_margin_ > bounding_box_.left()) {
561
0
    left_margin_ = src.left_margin_;
562
0
  }
563
0
}
564
565
// As CopyLeftTab, but with the right.
566
0
void ColPartition::CopyRightTab(const ColPartition &src, bool take_box) {
567
0
  right_key_tab_ = take_box ? false : src.right_key_tab_;
568
0
  if (right_key_tab_) {
569
0
    right_key_ = src.right_key_;
570
0
  } else {
571
0
    bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
572
0
    right_key_ = BoxRightKey();
573
0
  }
574
0
  if (right_margin_ < bounding_box_.right()) {
575
0
    right_margin_ = src.right_margin_;
576
0
  }
577
0
}
578
579
// Returns the left rule line x coord of the leftmost blob.
580
0
int ColPartition::LeftBlobRule() const {
581
0
  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST *>(&boxes_));
582
0
  return it.data()->left_rule();
583
0
}
584
// Returns the right rule line x coord of the rightmost blob.
585
0
int ColPartition::RightBlobRule() const {
586
0
  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST *>(&boxes_));
587
0
  it.move_to_last();
588
0
  return it.data()->right_rule();
589
0
}
590
591
0
float ColPartition::SpecialBlobsDensity(const BlobSpecialTextType type) const {
592
0
  ASSERT_HOST(type < BSTT_COUNT);
593
0
  return special_blobs_densities_[type];
594
0
}
595
596
0
int ColPartition::SpecialBlobsCount(const BlobSpecialTextType type) {
597
0
  ASSERT_HOST(type < BSTT_COUNT);
598
0
  BLOBNBOX_C_IT blob_it(&boxes_);
599
0
  int count = 0;
600
0
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
601
0
    BLOBNBOX *blob = blob_it.data();
602
0
    BlobSpecialTextType blob_type = blob->special_text_type();
603
0
    if (blob_type == type) {
604
0
      count++;
605
0
    }
606
0
  }
607
608
0
  return count;
609
0
}
610
611
void ColPartition::SetSpecialBlobsDensity(const BlobSpecialTextType type,
612
0
                                          const float density) {
613
0
  ASSERT_HOST(type < BSTT_COUNT);
614
0
  special_blobs_densities_[type] = density;
615
0
}
616
617
0
void ColPartition::ComputeSpecialBlobsDensity() {
618
0
  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
619
0
  if (boxes_.empty()) {
620
0
    return;
621
0
  }
622
623
0
  BLOBNBOX_C_IT blob_it(&boxes_);
624
0
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
625
0
    BLOBNBOX *blob = blob_it.data();
626
0
    BlobSpecialTextType type = blob->special_text_type();
627
0
    special_blobs_densities_[type]++;
628
0
  }
629
630
0
  for (float &special_blobs_density : special_blobs_densities_) {
631
0
    special_blobs_density /= boxes_.length();
632
0
  }
633
0
}
634
635
// Add a partner above if upper, otherwise below.
636
// Add them uniquely and keep the list sorted by box left.
637
// Partnerships are added symmetrically to partner and this.
638
0
void ColPartition::AddPartner(bool upper, ColPartition *partner) {
639
0
  if (upper) {
640
0
    partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true,
641
0
                                        this);
642
0
    upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
643
0
  } else {
644
0
    partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true,
645
0
                                        this);
646
0
    lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
647
0
  }
648
0
}
649
650
// Removes the partner from this, but does not remove this from partner.
651
// This asymmetric removal is so as not to mess up the iterator that is
652
// working on partner's partner list.
653
0
void ColPartition::RemovePartner(bool upper, ColPartition *partner) {
654
0
  ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
655
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
656
0
    if (it.data() == partner) {
657
0
      it.extract();
658
0
      break;
659
0
    }
660
0
  }
661
0
}
662
663
// Returns the partner if the given partner is a singleton, otherwise nullptr.
664
0
ColPartition *ColPartition::SingletonPartner(bool upper) {
665
0
  ColPartition_CLIST *partners = upper ? &upper_partners_ : &lower_partners_;
666
0
  if (!partners->singleton()) {
667
0
    return nullptr;
668
0
  }
669
0
  ColPartition_C_IT it(partners);
670
0
  return it.data();
671
0
}
672
673
// Merge with the other partition and delete it.
674
0
void ColPartition::Absorb(ColPartition *other, const WidthCallback &cb) {
675
  // The result has to either own all of the blobs or none of them.
676
  // Verify the flag is consistent.
677
0
  ASSERT_HOST(owns_blobs() == other->owns_blobs());
678
  // TODO(nbeato): check owns_blobs better. Right now owns_blobs
679
  // should always be true when this is called. So there is no issues.
680
0
  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
681
0
                                bounding_box_.bottom()) ||
682
0
      TabFind::WithinTestRegion(2, other->bounding_box_.left(),
683
0
                                other->bounding_box_.bottom())) {
684
0
    tprintf("Merging:");
685
0
    Print();
686
0
    other->Print();
687
0
  }
688
689
  // Update the special_blobs_densities_.
690
0
  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
691
0
  for (int type = 0; type < BSTT_COUNT; ++type) {
692
0
    unsigned w1 = boxes_.length();
693
0
    unsigned w2 = other->boxes_.length();
694
0
    float new_val = special_blobs_densities_[type] * w1 +
695
0
                    other->special_blobs_densities_[type] * w2;
696
0
    if (!w1 || !w2) {
697
0
      ASSERT_HOST((w1 + w2) > 0);
698
0
      special_blobs_densities_[type] = new_val / (w1 + w2);
699
0
    }
700
0
  }
701
702
  // Merge the two sorted lists.
703
0
  BLOBNBOX_C_IT it(&boxes_);
704
0
  BLOBNBOX_C_IT it2(&other->boxes_);
705
0
  for (; !it2.empty(); it2.forward()) {
706
0
    BLOBNBOX *bbox2 = it2.extract();
707
0
    ColPartition *prev_owner = bbox2->owner();
708
0
    if (prev_owner != other && prev_owner != nullptr) {
709
      // A blob on other's list is owned by someone else; let them have it.
710
0
      continue;
711
0
    }
712
0
    ASSERT_HOST(prev_owner == other || prev_owner == nullptr);
713
0
    if (prev_owner == other) {
714
0
      bbox2->set_owner(this);
715
0
    }
716
0
    it.add_to_end(bbox2);
717
0
  }
718
0
  left_margin_ = std::min(left_margin_, other->left_margin_);
719
0
  right_margin_ = std::max(right_margin_, other->right_margin_);
720
0
  if (other->left_key_ < left_key_) {
721
0
    left_key_ = other->left_key_;
722
0
    left_key_tab_ = other->left_key_tab_;
723
0
  }
724
0
  if (other->right_key_ > right_key_) {
725
0
    right_key_ = other->right_key_;
726
0
    right_key_tab_ = other->right_key_tab_;
727
0
  }
728
  // Combine the flow and blob_type in a sensible way.
729
  // Dominant flows stay.
730
0
  if (!DominatesInMerge(flow_, other->flow_)) {
731
0
    flow_ = other->flow_;
732
0
    blob_type_ = other->blob_type_;
733
0
  }
734
0
  SetBlobTypes();
735
0
  if (IsVerticalType()) {
736
0
    boxes_.sort(SortByBoxBottom<BLOBNBOX>);
737
0
    last_add_was_vertical_ = true;
738
0
  } else {
739
0
    boxes_.sort(SortByBoxLeft<BLOBNBOX>);
740
0
    last_add_was_vertical_ = false;
741
0
  }
742
0
  ComputeLimits();
743
  // Fix partner lists. other is going away, so remove it as a
744
  // partner of all its partners and add this in its place.
745
0
  for (int upper = 0; upper < 2; ++upper) {
746
0
    ColPartition_CLIST partners;
747
0
    ColPartition_C_IT part_it(&partners);
748
0
    part_it.add_list_after(upper ? &other->upper_partners_
749
0
                                 : &other->lower_partners_);
750
0
    for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
751
0
      ColPartition *partner = part_it.extract();
752
0
      partner->RemovePartner(!upper, other);
753
0
      partner->RemovePartner(!upper, this);
754
0
      partner->AddPartner(!upper, this);
755
0
    }
756
0
  }
757
0
  delete other;
758
0
  if (cb != nullptr) {
759
0
    SetColumnGoodness(cb);
760
0
  }
761
0
}
762
763
// Merge1 and merge2 are candidates to be merged, yet their combined box
764
// overlaps this. Is that allowed?
765
// Returns true if the overlap between this and the merged pair of
766
// merge candidates is sufficiently trivial to be allowed.
767
// The merged box can graze the edge of this by the ok_box_overlap
768
// if that exceeds the margin to the median top and bottom.
769
// ok_box_overlap should be set by the caller appropriate to the sizes of
770
// the text involved, and is usually a fraction of the median size of merge1
771
// and/or merge2, or this.
772
// TODO(rays) Determine whether vertical text needs to be considered.
773
bool ColPartition::OKMergeOverlap(const ColPartition &merge1,
774
                                  const ColPartition &merge2,
775
0
                                  int ok_box_overlap, bool debug) {
776
  // Vertical partitions are not allowed to be involved.
777
0
  if (IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType()) {
778
0
    if (debug) {
779
0
      tprintf("Vertical partition\n");
780
0
    }
781
0
    return false;
782
0
  }
783
  // The merging partitions must strongly overlap each other.
784
0
  if (!merge1.VSignificantCoreOverlap(merge2)) {
785
0
    if (debug) {
786
0
      tprintf("Voverlap %d (%d)\n", merge1.VCoreOverlap(merge2),
787
0
              merge1.VSignificantCoreOverlap(merge2));
788
0
    }
789
0
    return false;
790
0
  }
791
  // The merged box must not overlap the median bounds of this.
792
0
  TBOX merged_box(merge1.bounding_box());
793
0
  merged_box += merge2.bounding_box();
794
0
  if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
795
0
      merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&
796
0
      merged_box.top() > bounding_box_.bottom() + ok_box_overlap) {
797
0
    if (debug) {
798
0
      tprintf("Excessive box overlap\n");
799
0
    }
800
0
    return false;
801
0
  }
802
  // Looks OK!
803
0
  return true;
804
0
}
805
806
// Find the blob at which to split this to minimize the overlap with the
807
// given box. Returns the first blob to go in the second partition.
808
0
BLOBNBOX *ColPartition::OverlapSplitBlob(const TBOX &box) {
809
0
  if (boxes_.empty() || boxes_.singleton()) {
810
0
    return nullptr;
811
0
  }
812
0
  BLOBNBOX_C_IT it(&boxes_);
813
0
  TBOX left_box(it.data()->bounding_box());
814
0
  for (it.forward(); !it.at_first(); it.forward()) {
815
0
    BLOBNBOX *bbox = it.data();
816
0
    left_box += bbox->bounding_box();
817
0
    if (left_box.overlap(box)) {
818
0
      return bbox;
819
0
    }
820
0
  }
821
0
  return nullptr;
822
0
}
823
824
// Split this partition keeping the first half in this and returning
825
// the second half.
826
// Splits by putting the split_blob and the blobs that follow
827
// in the second half, and the rest in the first half.
828
0
ColPartition *ColPartition::SplitAtBlob(BLOBNBOX *split_blob) {
829
0
  ColPartition *split_part = ShallowCopy();
830
0
  split_part->set_owns_blobs(owns_blobs());
831
0
  BLOBNBOX_C_IT it(&boxes_);
832
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
833
0
    BLOBNBOX *bbox = it.data();
834
0
    ColPartition *prev_owner = bbox->owner();
835
0
    ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
836
0
    if (bbox == split_blob || !split_part->boxes_.empty()) {
837
0
      split_part->AddBox(it.extract());
838
0
      if (owns_blobs() && prev_owner != nullptr) {
839
0
        bbox->set_owner(split_part);
840
0
      }
841
0
    }
842
0
  }
843
0
  ASSERT_HOST(!it.empty());
844
0
  if (split_part->IsEmpty()) {
845
    // Split part ended up with nothing. Possible if split_blob is not
846
    // in the list of blobs.
847
0
    delete split_part;
848
0
    return nullptr;
849
0
  }
850
0
  right_key_tab_ = false;
851
0
  split_part->left_key_tab_ = false;
852
0
  ComputeLimits();
853
  // TODO(nbeato) Merge Ray's CL like this:
854
  // if (owns_blobs())
855
  //  SetBlobTextlineGoodness();
856
0
  split_part->ComputeLimits();
857
  // TODO(nbeato) Merge Ray's CL like this:
858
  // if (split_part->owns_blobs())
859
  //   split_part->SetBlobTextlineGoodness();
860
0
  return split_part;
861
0
}
862
863
// Split this partition at the given x coordinate, returning the right
864
// half and keeping the left half in this.
865
0
ColPartition *ColPartition::SplitAt(int split_x) {
866
0
  if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right()) {
867
0
    return nullptr; // There will be no change.
868
0
  }
869
0
  ColPartition *split_part = ShallowCopy();
870
0
  split_part->set_owns_blobs(owns_blobs());
871
0
  BLOBNBOX_C_IT it(&boxes_);
872
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
873
0
    BLOBNBOX *bbox = it.data();
874
0
    ColPartition *prev_owner = bbox->owner();
875
0
    ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
876
0
    const TBOX &box = bbox->bounding_box();
877
0
    if (box.left() >= split_x) {
878
0
      split_part->AddBox(it.extract());
879
0
      if (owns_blobs() && prev_owner != nullptr) {
880
0
        bbox->set_owner(split_part);
881
0
      }
882
0
    }
883
0
  }
884
0
  if (it.empty()) {
885
    // Possible if split-x passes through the first blob.
886
0
    it.add_list_after(&split_part->boxes_);
887
0
  }
888
0
  ASSERT_HOST(!it.empty());
889
0
  if (split_part->IsEmpty()) {
890
    // Split part ended up with nothing. Possible if split_x passes
891
    // through the last blob.
892
0
    delete split_part;
893
0
    return nullptr;
894
0
  }
895
0
  right_key_tab_ = false;
896
0
  split_part->left_key_tab_ = false;
897
0
  right_margin_ = split_x;
898
0
  split_part->left_margin_ = split_x;
899
0
  ComputeLimits();
900
0
  split_part->ComputeLimits();
901
0
  return split_part;
902
0
}
903
904
// Recalculates all the coordinate limits of the partition.
905
0
void ColPartition::ComputeLimits() {
906
0
  bounding_box_ = TBOX(); // Clear it
907
0
  BLOBNBOX_C_IT it(&boxes_);
908
0
  BLOBNBOX *bbox = nullptr;
909
0
  int non_leader_count = 0;
910
0
  if (it.empty()) {
911
0
    bounding_box_.set_left(left_margin_);
912
0
    bounding_box_.set_right(right_margin_);
913
0
    bounding_box_.set_bottom(0);
914
0
    bounding_box_.set_top(0);
915
0
  } else {
916
0
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
917
0
      bbox = it.data();
918
0
      bounding_box_ += bbox->bounding_box();
919
0
      if (bbox->flow() != BTFT_LEADER) {
920
0
        ++non_leader_count;
921
0
      }
922
0
    }
923
0
  }
924
0
  if (!left_key_tab_) {
925
0
    left_key_ = BoxLeftKey();
926
0
  }
927
0
  if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
928
    // TODO(rays) investigate the causes of these error messages, to find
929
    // out if they are genuinely harmful, or just indicative of junk input.
930
0
    tprintf("Computed left-illegal partition\n");
931
0
    Print();
932
0
  }
933
0
  if (!right_key_tab_) {
934
0
    right_key_ = BoxRightKey();
935
0
  }
936
0
  if (right_key_ < BoxRightKey() && textord_debug_bugs) {
937
0
    tprintf("Computed right-illegal partition\n");
938
0
    Print();
939
0
  }
940
0
  if (it.empty()) {
941
0
    return;
942
0
  }
943
0
  if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
944
0
      blob_type() == BRT_POLYIMAGE) {
945
0
    median_top_ = bounding_box_.top();
946
0
    median_bottom_ = bounding_box_.bottom();
947
0
    median_height_ = bounding_box_.height();
948
0
    median_left_ = bounding_box_.left();
949
0
    median_right_ = bounding_box_.right();
950
0
    median_width_ = bounding_box_.width();
951
0
  } else {
952
0
    STATS top_stats(bounding_box_.bottom(), bounding_box_.top());
953
0
    STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top());
954
0
    STATS height_stats(0, bounding_box_.height());
955
0
    STATS left_stats(bounding_box_.left(), bounding_box_.right());
956
0
    STATS right_stats(bounding_box_.left(), bounding_box_.right());
957
0
    STATS width_stats(0, bounding_box_.width());
958
0
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
959
0
      bbox = it.data();
960
0
      if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {
961
0
        const TBOX &box = bbox->bounding_box();
962
0
        int area = box.area();
963
0
        top_stats.add(box.top(), area);
964
0
        bottom_stats.add(box.bottom(), area);
965
0
        height_stats.add(box.height(), area);
966
0
        left_stats.add(box.left(), area);
967
0
        right_stats.add(box.right(), area);
968
0
        width_stats.add(box.width(), area);
969
0
      }
970
0
    }
971
0
    median_top_ = static_cast<int>(top_stats.median() + 0.5);
972
0
    median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
973
0
    median_height_ = static_cast<int>(height_stats.median() + 0.5);
974
0
    median_left_ = static_cast<int>(left_stats.median() + 0.5);
975
0
    median_right_ = static_cast<int>(right_stats.median() + 0.5);
976
0
    median_width_ = static_cast<int>(width_stats.median() + 0.5);
977
0
  }
978
979
0
  if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
980
0
    tprintf("Made partition with bad right coords, %d < %d\n", right_margin_,
981
0
            bounding_box_.right());
982
0
    Print();
983
0
  }
984
0
  if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
985
0
    tprintf("Made partition with bad left coords, %d > %d\n", left_margin_,
986
0
            bounding_box_.left());
987
0
    Print();
988
0
  }
989
  // Fix partner lists. The bounding box has changed and partners are stored
990
  // in bounding box order, so remove and reinsert this as a partner
991
  // of all its partners.
992
0
  for (int upper = 0; upper < 2; ++upper) {
993
0
    ColPartition_CLIST partners;
994
0
    ColPartition_C_IT part_it(&partners);
995
0
    part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
996
0
    for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
997
0
      ColPartition *partner = part_it.extract();
998
0
      partner->RemovePartner(!upper, this);
999
0
      partner->AddPartner(!upper, this);
1000
0
    }
1001
0
  }
1002
0
  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1003
0
                                bounding_box_.bottom())) {
1004
0
    tprintf("Recomputed box for partition %p\n", static_cast<void *>(this));
1005
0
    Print();
1006
0
  }
1007
0
}
1008
1009
// Returns the number of boxes that overlap the given box.
1010
0
int ColPartition::CountOverlappingBoxes(const TBOX &box) {
1011
0
  BLOBNBOX_C_IT it(&boxes_);
1012
0
  int overlap_count = 0;
1013
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1014
0
    BLOBNBOX *bbox = it.data();
1015
0
    if (box.overlap(bbox->bounding_box())) {
1016
0
      ++overlap_count;
1017
0
    }
1018
0
  }
1019
0
  return overlap_count;
1020
0
}
1021
1022
// Computes and sets the type_ and first_column_, last_column_ and column_set_.
1023
// resolution refers to the ppi resolution of the image.
1024
0
void ColPartition::SetPartitionType(int resolution, ColPartitionSet *columns) {
1025
0
  int first_spanned_col = -1;
1026
0
  ColumnSpanningType span_type = columns->SpanningType(
1027
0
      resolution, bounding_box_.left(), bounding_box_.right(),
1028
0
      std::min(bounding_box_.height(), bounding_box_.width()), MidY(),
1029
0
      left_margin_, right_margin_, &first_column_, &last_column_,
1030
0
      &first_spanned_col);
1031
0
  column_set_ = columns;
1032
0
  if (first_column_ < last_column_ && span_type == CST_PULLOUT &&
1033
0
      !IsLineType()) {
1034
    // Unequal columns may indicate that the pullout spans one of the columns
1035
    // it lies in, so force it to be allocated to just that column.
1036
0
    if (first_spanned_col >= 0) {
1037
0
      first_column_ = first_spanned_col;
1038
0
      last_column_ = first_spanned_col;
1039
0
    } else {
1040
0
      if ((first_column_ & 1) == 0) {
1041
0
        last_column_ = first_column_;
1042
0
      } else if ((last_column_ & 1) == 0) {
1043
0
        first_column_ = last_column_;
1044
0
      } else {
1045
0
        first_column_ = last_column_ = (first_column_ + last_column_) / 2;
1046
0
      }
1047
0
    }
1048
0
  }
1049
0
  type_ = PartitionType(span_type);
1050
0
}
1051
1052
// Returns the PartitionType from the current BlobRegionType and a column
1053
// flow spanning type ColumnSpanningType, generated by
1054
// ColPartitionSet::SpanningType, that indicates how the partition sits
1055
// in the columns.
1056
0
PolyBlockType ColPartition::PartitionType(ColumnSpanningType flow) const {
1057
0
  if (flow == CST_NOISE) {
1058
0
    if (blob_type_ != BRT_HLINE && blob_type_ != BRT_VLINE &&
1059
0
        blob_type_ != BRT_RECTIMAGE && blob_type_ != BRT_VERT_TEXT) {
1060
0
      return PT_NOISE;
1061
0
    }
1062
0
    flow = CST_FLOWING;
1063
0
  }
1064
1065
0
  switch (blob_type_) {
1066
0
    case BRT_NOISE:
1067
0
      return PT_NOISE;
1068
0
    case BRT_HLINE:
1069
0
      return PT_HORZ_LINE;
1070
0
    case BRT_VLINE:
1071
0
      return PT_VERT_LINE;
1072
0
    case BRT_RECTIMAGE:
1073
0
    case BRT_POLYIMAGE:
1074
0
      switch (flow) {
1075
0
        case CST_FLOWING:
1076
0
          return PT_FLOWING_IMAGE;
1077
0
        case CST_HEADING:
1078
0
          return PT_HEADING_IMAGE;
1079
0
        case CST_PULLOUT:
1080
0
          return PT_PULLOUT_IMAGE;
1081
0
        default:
1082
0
          ASSERT_HOST(!"Undefined flow type for image!");
1083
0
      }
1084
0
      break;
1085
0
    case BRT_VERT_TEXT:
1086
0
      return PT_VERTICAL_TEXT;
1087
0
    case BRT_TEXT:
1088
0
    case BRT_UNKNOWN:
1089
0
    default:
1090
0
      switch (flow) {
1091
0
        case CST_FLOWING:
1092
0
          return PT_FLOWING_TEXT;
1093
0
        case CST_HEADING:
1094
0
          return PT_HEADING_TEXT;
1095
0
        case CST_PULLOUT:
1096
0
          return PT_PULLOUT_TEXT;
1097
0
        default:
1098
0
          ASSERT_HOST(!"Undefined flow type for text!");
1099
0
      }
1100
0
  }
1101
0
  ASSERT_HOST(!"Should never get here!");
1102
0
  return PT_NOISE;
1103
0
}
1104
1105
// Returns the first and last column touched by this partition.
1106
// resolution refers to the ppi resolution of the image.
1107
void ColPartition::ColumnRange(int resolution, ColPartitionSet *columns,
1108
0
                               int *first_col, int *last_col) {
1109
0
  int first_spanned_col = -1;
1110
0
  ColumnSpanningType span_type = columns->SpanningType(
1111
0
      resolution, bounding_box_.left(), bounding_box_.right(),
1112
0
      std::min(bounding_box_.height(), bounding_box_.width()), MidY(),
1113
0
      left_margin_, right_margin_, first_col, last_col, &first_spanned_col);
1114
0
  type_ = PartitionType(span_type);
1115
0
}
1116
1117
// Sets the internal flags good_width_ and good_column_.
1118
0
void ColPartition::SetColumnGoodness(const WidthCallback &cb) {
1119
0
  int y = MidY();
1120
0
  int width = RightAtY(y) - LeftAtY(y);
1121
0
  good_width_ = cb(width);
1122
0
  good_column_ = blob_type_ == BRT_TEXT && left_key_tab_ && right_key_tab_;
1123
0
}
1124
1125
// Determines whether the blobs in this partition mostly represent
1126
// a leader (fixed pitch sequence) and sets the member blobs accordingly.
1127
// Note that height is assumed to have been tested elsewhere, and that this
1128
// function will find most fixed-pitch text as leader without a height filter.
1129
// Leader detection is limited to sequences of identical width objects,
1130
// such as .... or ----, so patterns, such as .-.-.-.-. will not be found.
1131
0
bool ColPartition::MarkAsLeaderIfMonospaced() {
1132
0
  bool result = false;
1133
  // Gather statistics on the gaps between blobs and the widths of the blobs.
1134
0
  int part_width = bounding_box_.width();
1135
0
  STATS gap_stats(0, part_width - 1);
1136
0
  STATS width_stats(0, part_width - 1);
1137
0
  BLOBNBOX_C_IT it(&boxes_);
1138
0
  BLOBNBOX *prev_blob = it.data();
1139
0
  prev_blob->set_flow(BTFT_NEIGHBOURS);
1140
0
  width_stats.add(prev_blob->bounding_box().width(), 1);
1141
0
  int blob_count = 1;
1142
0
  for (it.forward(); !it.at_first(); it.forward()) {
1143
0
    BLOBNBOX *blob = it.data();
1144
0
    int left = blob->bounding_box().left();
1145
0
    int right = blob->bounding_box().right();
1146
0
    gap_stats.add(left - prev_blob->bounding_box().right(), 1);
1147
0
    width_stats.add(right - left, 1);
1148
0
    blob->set_flow(BTFT_NEIGHBOURS);
1149
0
    prev_blob = blob;
1150
0
    ++blob_count;
1151
0
  }
1152
0
  double median_gap = gap_stats.median();
1153
0
  double median_width = width_stats.median();
1154
0
  double max_width = std::max(median_gap, median_width);
1155
0
  double min_width = std::min(median_gap, median_width);
1156
0
  double gap_iqr = gap_stats.ile(0.75f) - gap_stats.ile(0.25f);
1157
0
  if (textord_debug_tabfind >= 4) {
1158
0
    tprintf("gap iqr = %g, blob_count=%d, limits=%g,%g\n", gap_iqr, blob_count,
1159
0
            max_width * kMaxLeaderGapFractionOfMax,
1160
0
            min_width * kMaxLeaderGapFractionOfMin);
1161
0
  }
1162
0
  if (gap_iqr < max_width * kMaxLeaderGapFractionOfMax &&
1163
0
      gap_iqr < min_width * kMaxLeaderGapFractionOfMin &&
1164
0
      blob_count >= kMinLeaderCount) {
1165
    // This is stable enough to be called a leader, so check the widths.
1166
    // Since leader dashes can join, run a dp cutting algorithm and go
1167
    // on the cost.
1168
0
    int offset = static_cast<int>(ceil(gap_iqr * 2));
1169
0
    int min_step = static_cast<int>(median_gap + median_width + 0.5);
1170
0
    int max_step = min_step + offset;
1171
0
    min_step -= offset;
1172
    // Pad the buffer with min_step/2 on each end.
1173
0
    int part_left = bounding_box_.left() - min_step / 2;
1174
0
    part_width += min_step;
1175
0
    auto *projection = new DPPoint[part_width];
1176
0
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1177
0
      BLOBNBOX *blob = it.data();
1178
0
      int left = blob->bounding_box().left();
1179
0
      int right = blob->bounding_box().right();
1180
0
      int height = blob->bounding_box().height();
1181
0
      for (int x = left; x < right; ++x) {
1182
0
        projection[left - part_left].AddLocalCost(height);
1183
0
      }
1184
0
    }
1185
0
    DPPoint *best_end =
1186
0
        DPPoint::Solve(min_step, max_step, false, &DPPoint::CostWithVariance,
1187
0
                       part_width, projection);
1188
0
    if (best_end != nullptr && best_end->total_cost() < blob_count) {
1189
      // Good enough. Call it a leader.
1190
0
      result = true;
1191
0
      bool modified_blob_list = false;
1192
0
      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1193
0
        BLOBNBOX *blob = it.data();
1194
        // If the first or last blob is spaced too much, don't mark it.
1195
0
        if (it.at_first()) {
1196
0
          int gap = it.data_relative(1)->bounding_box().left() -
1197
0
                    blob->bounding_box().right();
1198
0
          if (blob->bounding_box().width() + gap > max_step) {
1199
0
            it.extract();
1200
0
            modified_blob_list = true;
1201
0
            continue;
1202
0
          }
1203
0
        }
1204
0
        if (it.at_last()) {
1205
0
          int gap = blob->bounding_box().left() -
1206
0
                    it.data_relative(-1)->bounding_box().right();
1207
0
          if (blob->bounding_box().width() + gap > max_step) {
1208
0
            it.extract();
1209
0
            modified_blob_list = true;
1210
0
            break;
1211
0
          }
1212
0
        }
1213
0
        blob->set_region_type(BRT_TEXT);
1214
0
        blob->set_flow(BTFT_LEADER);
1215
0
      }
1216
0
      if (modified_blob_list) {
1217
0
        ComputeLimits();
1218
0
      }
1219
0
      blob_type_ = BRT_TEXT;
1220
0
      flow_ = BTFT_LEADER;
1221
0
    } else if (textord_debug_tabfind) {
1222
0
      if (best_end == nullptr) {
1223
0
        tprintf("No path\n");
1224
0
      } else {
1225
0
        tprintf("Total cost = %d vs allowed %d\n", best_end->total_cost(),
1226
0
                blob_count);
1227
0
      }
1228
0
    }
1229
0
    delete[] projection;
1230
0
  }
1231
0
  return result;
1232
0
}
1233
1234
// Given the result of TextlineProjection::EvaluateColPartition, (positive for
1235
// horizontal text, negative for vertical text, and near zero for non-text),
1236
// sets the blob_type_ and flow_ for this partition to indicate whether it
1237
// is strongly or weakly vertical or horizontal text, or non-text.
1238
// The function assumes that the blob neighbours are valid (from
1239
// StrokeWidth::SetNeighbours) and that those neighbours have their
1240
// region_type() set.
1241
0
void ColPartition::SetRegionAndFlowTypesFromProjectionValue(int value) {
1242
0
  int blob_count = 0;       // Total # blobs.
1243
0
  int good_blob_score_ = 0; // Total # good strokewidth neighbours.
1244
0
  int noisy_count = 0;      // Total # neighbours marked as noise.
1245
0
  int hline_count = 0;
1246
0
  int vline_count = 0;
1247
0
  BLOBNBOX_C_IT it(&boxes_);
1248
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1249
0
    BLOBNBOX *blob = it.data();
1250
0
    ++blob_count;
1251
0
    noisy_count += blob->NoisyNeighbours();
1252
0
    good_blob_score_ += blob->GoodTextBlob();
1253
0
    if (blob->region_type() == BRT_HLINE) {
1254
0
      ++hline_count;
1255
0
    }
1256
0
    if (blob->region_type() == BRT_VLINE) {
1257
0
      ++vline_count;
1258
0
    }
1259
0
  }
1260
0
  flow_ = BTFT_NEIGHBOURS;
1261
0
  blob_type_ = BRT_UNKNOWN;
1262
0
  if (hline_count > vline_count) {
1263
0
    flow_ = BTFT_NONE;
1264
0
    blob_type_ = BRT_HLINE;
1265
0
  } else if (vline_count > hline_count) {
1266
0
    flow_ = BTFT_NONE;
1267
0
    blob_type_ = BRT_VLINE;
1268
0
  } else if (value < -1 || 1 < value) {
1269
0
    int long_side;
1270
0
    int short_side;
1271
0
    if (value > 0) {
1272
0
      long_side = bounding_box_.width();
1273
0
      short_side = bounding_box_.height();
1274
0
      blob_type_ = BRT_TEXT;
1275
0
    } else {
1276
0
      long_side = bounding_box_.height();
1277
0
      short_side = bounding_box_.width();
1278
0
      blob_type_ = BRT_VERT_TEXT;
1279
0
    }
1280
    // We will combine the old metrics using aspect ratio and blob counts
1281
    // with the input value by allowing a strong indication to flip the
1282
    // STRONG_CHAIN/CHAIN flow values.
1283
0
    int strong_score = blob_count >= kHorzStrongTextlineCount ? 1 : 0;
1284
0
    if (short_side > kHorzStrongTextlineHeight) {
1285
0
      ++strong_score;
1286
0
    }
1287
0
    if (short_side * kHorzStrongTextlineAspect < long_side) {
1288
0
      ++strong_score;
1289
0
    }
1290
0
    if (abs(value) >= kMinStrongTextValue) {
1291
0
      flow_ = BTFT_STRONG_CHAIN;
1292
0
    } else if (abs(value) >= kMinChainTextValue) {
1293
0
      flow_ = BTFT_CHAIN;
1294
0
    } else {
1295
0
      flow_ = BTFT_NEIGHBOURS;
1296
0
    }
1297
    // Upgrade chain to strong chain if the other indicators are good
1298
0
    if (flow_ == BTFT_CHAIN && strong_score == 3) {
1299
0
      flow_ = BTFT_STRONG_CHAIN;
1300
0
    }
1301
    // Downgrade strong vertical text to chain if the indicators are bad.
1302
0
    if (flow_ == BTFT_STRONG_CHAIN && value < 0 && strong_score < 2) {
1303
0
      flow_ = BTFT_CHAIN;
1304
0
    }
1305
0
  }
1306
0
  if (flow_ == BTFT_NEIGHBOURS) {
1307
    // Check for noisy neighbours.
1308
0
    if (noisy_count >= blob_count) {
1309
0
      flow_ = BTFT_NONTEXT;
1310
0
      blob_type_ = BRT_NOISE;
1311
0
    }
1312
0
  }
1313
0
  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1314
0
                                bounding_box_.bottom())) {
1315
0
    tprintf("RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
1316
0
            blob_count, noisy_count, good_blob_score_);
1317
0
    tprintf(" Projection value=%d, flow=%d, blob_type=%d\n", value, flow_,
1318
0
            blob_type_);
1319
0
    Print();
1320
0
  }
1321
0
  SetBlobTypes();
1322
0
}
1323
1324
// Sets all blobs with the partition blob type and flow, but never overwrite
1325
// leader blobs, as we need to be able to identify them later.
1326
0
void ColPartition::SetBlobTypes() {
1327
0
  if (!owns_blobs()) {
1328
0
    return;
1329
0
  }
1330
0
  BLOBNBOX_C_IT it(&boxes_);
1331
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1332
0
    BLOBNBOX *blob = it.data();
1333
0
    if (blob->flow() != BTFT_LEADER) {
1334
0
      blob->set_flow(flow_);
1335
0
    }
1336
0
    blob->set_region_type(blob_type_);
1337
0
    ASSERT_HOST(blob->owner() == nullptr || blob->owner() == this);
1338
0
  }
1339
0
}
1340
1341
// Returns true if a decent baseline can be fitted through the blobs.
1342
// Works for both horizontal and vertical text.
1343
0
bool ColPartition::HasGoodBaseline() {
1344
  // Approximation of the baseline.
1345
0
  DetLineFit linepoints;
1346
  // Calculation of the mean height on this line segment. Note that these
1347
  // variable names apply to the context of a horizontal line, and work
1348
  // analogously, rather than literally in the case of a vertical line.
1349
0
  int total_height = 0;
1350
0
  int coverage = 0;
1351
0
  int height_count = 0;
1352
0
  int width = 0;
1353
0
  BLOBNBOX_C_IT it(&boxes_);
1354
0
  TBOX box(it.data()->bounding_box());
1355
  // Accumulate points representing the baseline at the middle of each blob,
1356
  // but add an additional point for each end of the line. This makes it
1357
  // harder to fit a severe skew angle, as it is most likely not right.
1358
0
  if (IsVerticalType()) {
1359
    // For a vertical line, use the right side as the baseline.
1360
0
    ICOORD first_pt(box.right(), box.bottom());
1361
    // Use the bottom-right of the first (bottom) box, the top-right of the
1362
    // last, and the middle-right of all others.
1363
0
    linepoints.Add(first_pt);
1364
0
    for (it.forward(); !it.at_last(); it.forward()) {
1365
0
      BLOBNBOX *blob = it.data();
1366
0
      box = blob->bounding_box();
1367
0
      ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
1368
0
      linepoints.Add(box_pt);
1369
0
      total_height += box.width();
1370
0
      coverage += box.height();
1371
0
      ++height_count;
1372
0
    }
1373
0
    box = it.data()->bounding_box();
1374
0
    ICOORD last_pt(box.right(), box.top());
1375
0
    linepoints.Add(last_pt);
1376
0
    width = last_pt.y() - first_pt.y();
1377
1378
0
  } else {
1379
    // Horizontal lines use the bottom as the baseline.
1380
0
    TBOX box(it.data()->bounding_box());
1381
    // Use the bottom-left of the first box, the bottom-right of the last,
1382
    // and the middle of all others.
1383
0
    ICOORD first_pt(box.left(), box.bottom());
1384
0
    linepoints.Add(first_pt);
1385
0
    for (it.forward(); !it.at_last(); it.forward()) {
1386
0
      BLOBNBOX *blob = it.data();
1387
0
      box = blob->bounding_box();
1388
0
      ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
1389
0
      linepoints.Add(box_pt);
1390
0
      total_height += box.height();
1391
0
      coverage += box.width();
1392
0
      ++height_count;
1393
0
    }
1394
0
    box = it.data()->bounding_box();
1395
0
    ICOORD last_pt(box.right(), box.bottom());
1396
0
    linepoints.Add(last_pt);
1397
0
    width = last_pt.x() - first_pt.x();
1398
0
  }
1399
  // Maximum median error allowed to be a good text line.
1400
0
  if (height_count == 0) {
1401
0
    return false;
1402
0
  }
1403
0
  double max_error = kMaxBaselineError * total_height / height_count;
1404
0
  ICOORD start_pt, end_pt;
1405
0
  double error = linepoints.Fit(&start_pt, &end_pt);
1406
0
  return error < max_error && coverage >= kMinBaselineCoverage * width;
1407
0
}
1408
1409
// Adds this ColPartition to a matching WorkingPartSet if one can be found,
1410
// otherwise starts a new one in the appropriate column, ending the previous.
1411
void ColPartition::AddToWorkingSet(const ICOORD &bleft, const ICOORD &tright,
1412
                                   int resolution,
1413
                                   ColPartition_LIST *used_parts,
1414
0
                                   WorkingPartSet_LIST *working_sets) {
1415
0
  if (block_owned_) {
1416
0
    return; // Done it already.
1417
0
  }
1418
0
  block_owned_ = true;
1419
0
  WorkingPartSet_IT it(working_sets);
1420
  // If there is an upper partner use its working_set_ directly.
1421
0
  ColPartition *partner = SingletonPartner(true);
1422
0
  if (partner != nullptr && partner->working_set_ != nullptr) {
1423
0
    working_set_ = partner->working_set_;
1424
0
    working_set_->AddPartition(this);
1425
0
    return;
1426
0
  }
1427
0
  if (partner != nullptr && textord_debug_bugs) {
1428
0
    tprintf("Partition with partner has no working set!:");
1429
0
    Print();
1430
0
    partner->Print();
1431
0
  }
1432
  // Search for the column that the left edge fits in.
1433
0
  WorkingPartSet *work_set = nullptr;
1434
0
  it.move_to_first();
1435
0
  int col_index = 0;
1436
0
  for (it.mark_cycle_pt(); !it.cycled_list() && col_index != first_column_;
1437
0
       it.forward(), ++col_index) {
1438
0
    ;
1439
0
  }
1440
0
  if (textord_debug_tabfind >= 2) {
1441
0
    tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");
1442
0
    Print();
1443
0
  }
1444
0
  if (it.cycled_list() && textord_debug_bugs) {
1445
0
    tprintf("Target column=%d, only had %d\n", first_column_, col_index);
1446
0
  }
1447
0
  ASSERT_HOST(!it.cycled_list());
1448
0
  work_set = it.data();
1449
  // If last_column_ != first_column, then we need to scoop up all blocks
1450
  // between here and the last_column_ and put back in work_set.
1451
0
  if (!it.cycled_list() && last_column_ != first_column_ && !IsPulloutType()) {
1452
    // Find the column that the right edge falls in.
1453
0
    BLOCK_LIST completed_blocks;
1454
0
    TO_BLOCK_LIST to_blocks;
1455
0
    for (; !it.cycled_list() && col_index <= last_column_;
1456
0
         it.forward(), ++col_index) {
1457
0
      WorkingPartSet *end_set = it.data();
1458
0
      end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
1459
0
                                      &completed_blocks, &to_blocks);
1460
0
    }
1461
0
    work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
1462
0
  }
1463
0
  working_set_ = work_set;
1464
0
  work_set->AddPartition(this);
1465
0
}
1466
1467
// From the given block_parts list, builds one or more BLOCKs and
1468
// corresponding TO_BLOCKs, such that the line spacing is uniform in each.
1469
// Created blocks are appended to the end of completed_blocks and to_blocks.
1470
// The used partitions are put onto used_parts, as they may still be referred
1471
// to in the partition grid. bleft, tright and resolution are the bounds
1472
// and resolution of the original image.
1473
void ColPartition::LineSpacingBlocks(const ICOORD &bleft, const ICOORD &tright,
1474
                                     int resolution,
1475
                                     ColPartition_LIST *block_parts,
1476
                                     ColPartition_LIST *used_parts,
1477
                                     BLOCK_LIST *completed_blocks,
1478
0
                                     TO_BLOCK_LIST *to_blocks) {
1479
0
  int page_height = tright.y() - bleft.y();
1480
  // Compute the initial spacing stats.
1481
0
  ColPartition_IT it(block_parts);
1482
0
  int part_count = 0;
1483
0
  int max_line_height = 0;
1484
1485
  // TODO(joeliu): We should add some special logic for PT_INLINE_EQUATION type
1486
  // because their line spacing with their neighbors maybe smaller and their
1487
  // height may be slightly larger.
1488
1489
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1490
0
    ColPartition *part = it.data();
1491
0
    ASSERT_HOST(!part->boxes()->empty());
1492
0
    STATS side_steps(0, part->bounding_box().height() - 1);
1493
0
    if (part->bounding_box().height() > max_line_height) {
1494
0
      max_line_height = part->bounding_box().height();
1495
0
    }
1496
0
    BLOBNBOX_C_IT blob_it(part->boxes());
1497
0
    int prev_bottom = blob_it.data()->bounding_box().bottom();
1498
0
    for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1499
0
      BLOBNBOX *blob = blob_it.data();
1500
0
      int bottom = blob->bounding_box().bottom();
1501
0
      int step = bottom - prev_bottom;
1502
0
      if (step < 0) {
1503
0
        step = -step;
1504
0
      }
1505
0
      side_steps.add(step, 1);
1506
0
      prev_bottom = bottom;
1507
0
    }
1508
0
    part->set_side_step(static_cast<int>(side_steps.median() + 0.5));
1509
0
    if (!it.at_last()) {
1510
0
      ColPartition *next_part = it.data_relative(1);
1511
0
      part->set_bottom_spacing(part->median_bottom() -
1512
0
                               next_part->median_bottom());
1513
0
      part->set_top_spacing(part->median_top() - next_part->median_top());
1514
0
    } else {
1515
0
      part->set_bottom_spacing(page_height);
1516
0
      part->set_top_spacing(page_height);
1517
0
    }
1518
0
    if (textord_debug_tabfind) {
1519
0
      part->Print();
1520
0
      tprintf("side step = %.2f, top spacing = %d, bottom spacing=%d\n",
1521
0
              side_steps.median(), part->top_spacing(), part->bottom_spacing());
1522
0
    }
1523
0
    ++part_count;
1524
0
  }
1525
0
  if (part_count == 0) {
1526
0
    return;
1527
0
  }
1528
1529
0
  SmoothSpacings(resolution, page_height, block_parts);
1530
1531
  // Move the partitions into individual block lists and make the blocks.
1532
0
  BLOCK_IT block_it(completed_blocks);
1533
0
  TO_BLOCK_IT to_block_it(to_blocks);
1534
0
  ColPartition_LIST spacing_parts;
1535
0
  ColPartition_IT sp_block_it(&spacing_parts);
1536
0
  int same_block_threshold = max_line_height * kMaxSameBlockLineSpacing;
1537
0
  for (it.mark_cycle_pt(); !it.empty();) {
1538
0
    ColPartition *part = it.extract();
1539
0
    sp_block_it.add_to_end(part);
1540
0
    it.forward();
1541
0
    if (it.empty() || part->bottom_spacing() > same_block_threshold ||
1542
0
        !part->SpacingsEqual(*it.data(), resolution)) {
1543
      // There is a spacing boundary. Check to see if it.data() belongs
1544
      // better in the current block or the next one.
1545
0
      if (!it.empty() && part->bottom_spacing() <= same_block_threshold) {
1546
0
        ColPartition *next_part = it.data();
1547
        // If there is a size match one-way, then the middle line goes with
1548
        // its matched size, otherwise it goes with the smallest spacing.
1549
0
        ColPartition *third_part = it.at_last() ? nullptr : it.data_relative(1);
1550
0
        if (textord_debug_tabfind) {
1551
0
          tprintf(
1552
0
              "Spacings unequal: upper:%d/%d, lower:%d/%d,"
1553
0
              " sizes %d %d %d\n",
1554
0
              part->top_spacing(), part->bottom_spacing(),
1555
0
              next_part->top_spacing(), next_part->bottom_spacing(),
1556
0
              part->median_height(), next_part->median_height(),
1557
0
              third_part != nullptr ? third_part->median_height() : 0);
1558
0
        }
1559
        // We can only consider adding the next line to the block if the sizes
1560
        // match and the lines are close enough for their size.
1561
0
        if (part->SizesSimilar(*next_part) &&
1562
0
            next_part->median_height() * kMaxSameBlockLineSpacing >
1563
0
                part->bottom_spacing() &&
1564
0
            part->median_height() * kMaxSameBlockLineSpacing >
1565
0
                part->top_spacing()) {
1566
          // Even now, we can only add it as long as the third line doesn't
1567
          // match in the same way and have a smaller bottom spacing.
1568
0
          if (third_part == nullptr || !next_part->SizesSimilar(*third_part) ||
1569
0
              third_part->median_height() * kMaxSameBlockLineSpacing <=
1570
0
                  next_part->bottom_spacing() ||
1571
0
              next_part->median_height() * kMaxSameBlockLineSpacing <=
1572
0
                  next_part->top_spacing() ||
1573
0
              next_part->bottom_spacing() > part->bottom_spacing()) {
1574
            // Add to the current block.
1575
0
            sp_block_it.add_to_end(it.extract());
1576
0
            it.forward();
1577
0
            if (textord_debug_tabfind) {
1578
0
              tprintf("Added line to current block.\n");
1579
0
            }
1580
0
          }
1581
0
        }
1582
0
      }
1583
0
      TO_BLOCK *to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);
1584
0
      if (to_block != nullptr) {
1585
0
        to_block_it.add_to_end(to_block);
1586
0
        block_it.add_to_end(to_block->block);
1587
0
      }
1588
0
      sp_block_it.set_to_list(&spacing_parts);
1589
0
    } else {
1590
0
      if (textord_debug_tabfind && !it.empty()) {
1591
0
        ColPartition *next_part = it.data();
1592
0
        tprintf("Spacings equal: upper:%d/%d, lower:%d/%d, median:%d/%d\n",
1593
0
                part->top_spacing(), part->bottom_spacing(),
1594
0
                next_part->top_spacing(), next_part->bottom_spacing(),
1595
0
                part->median_height(), next_part->median_height());
1596
0
      }
1597
0
    }
1598
0
  }
1599
0
}
1600
1601
// Helper function to clip the input pos to the given bleft, tright bounds.
1602
0
static void ClipCoord(const ICOORD &bleft, const ICOORD &tright, ICOORD *pos) {
1603
0
  if (pos->x() < bleft.x()) {
1604
0
    pos->set_x(bleft.x());
1605
0
  }
1606
0
  if (pos->x() > tright.x()) {
1607
0
    pos->set_x(tright.x());
1608
0
  }
1609
0
  if (pos->y() < bleft.y()) {
1610
0
    pos->set_y(bleft.y());
1611
0
  }
1612
0
  if (pos->y() > tright.y()) {
1613
0
    pos->set_y(tright.y());
1614
0
  }
1615
0
}
1616
1617
// Helper moves the blobs from the given list of block_parts into the block
1618
// itself. Sets up the block for (old) textline formation correctly for
1619
// vertical and horizontal text. The partitions are moved to used_parts
1620
// afterwards, as they cannot be deleted yet.
1621
static TO_BLOCK *MoveBlobsToBlock(bool vertical_text, int line_spacing,
1622
                                  BLOCK *block, ColPartition_LIST *block_parts,
1623
0
                                  ColPartition_LIST *used_parts) {
1624
  // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.
1625
  // Move all the parts to a done list as they are no longer needed, except
1626
  // that have to continue to exist until the part grid is deleted.
1627
  // Compute the median blob size as we go, as the block needs to know.
1628
0
  TBOX block_box(block->pdblk.bounding_box());
1629
0
  STATS sizes(0, std::max(block_box.width(), block_box.height()) - 1);
1630
0
  bool text_type = block->pdblk.poly_block()->IsText();
1631
0
  ColPartition_IT it(block_parts);
1632
0
  auto *to_block = new TO_BLOCK(block);
1633
0
  BLOBNBOX_IT blob_it(&to_block->blobs);
1634
0
  ColPartition_IT used_it(used_parts);
1635
0
  for (it.move_to_first(); !it.empty(); it.forward()) {
1636
0
    ColPartition *part = it.extract();
1637
    // Transfer blobs from all regions to the output blocks.
1638
    // Blobs for non-text regions will be used to define the polygonal
1639
    // bounds of the region.
1640
0
    for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty(); bb_it.forward()) {
1641
0
      BLOBNBOX *bblob = bb_it.extract();
1642
0
      if (bblob->owner() != part) {
1643
0
        tprintf("Ownership incorrect for blob:");
1644
0
        bblob->bounding_box().print();
1645
0
        tprintf("Part=");
1646
0
        part->Print();
1647
0
        if (bblob->owner() == nullptr) {
1648
0
          tprintf("Not owned\n");
1649
0
        } else {
1650
0
          tprintf("Owner part:");
1651
0
          bblob->owner()->Print();
1652
0
        }
1653
0
      }
1654
0
      ASSERT_HOST(bblob->owner() == part);
1655
      // Assert failure here is caused by arbitrarily changing the partition
1656
      // type without also changing the blob type, such as in
1657
      // InsertSmallBlobsAsUnknowns.
1658
0
      ASSERT_HOST(!text_type || bblob->region_type() >= BRT_UNKNOWN);
1659
0
      C_OUTLINE_LIST *outlines = bblob->cblob()->out_list();
1660
0
      C_OUTLINE_IT ol_it(outlines);
1661
0
      ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
1662
0
      if (vertical_text) {
1663
0
        sizes.add(bblob->bounding_box().width(), 1);
1664
0
      } else {
1665
0
        sizes.add(bblob->bounding_box().height(), 1);
1666
0
      }
1667
0
      blob_it.add_after_then_move(bblob);
1668
0
    }
1669
0
    used_it.add_to_end(part);
1670
0
  }
1671
0
  if (text_type && blob_it.empty()) {
1672
0
    delete block;
1673
0
    delete to_block;
1674
0
    return nullptr;
1675
0
  }
1676
0
  to_block->line_size = sizes.median();
1677
0
  if (vertical_text) {
1678
0
    int block_width = block->pdblk.bounding_box().width();
1679
0
    if (block_width < line_spacing) {
1680
0
      line_spacing = block_width;
1681
0
    }
1682
0
    to_block->line_spacing = static_cast<float>(line_spacing);
1683
0
    to_block->max_blob_size = static_cast<float>(block_width + 1);
1684
0
  } else {
1685
0
    int block_height = block->pdblk.bounding_box().height();
1686
0
    if (block_height < line_spacing) {
1687
0
      line_spacing = block_height;
1688
0
    }
1689
0
    to_block->line_spacing = static_cast<float>(line_spacing);
1690
0
    to_block->max_blob_size = static_cast<float>(block_height + 1);
1691
0
  }
1692
0
  return to_block;
1693
0
}
1694
1695
// Constructs a block from the given list of partitions.
1696
// Arguments are as LineSpacingBlocks above.
1697
TO_BLOCK *ColPartition::MakeBlock(const ICOORD &bleft, const ICOORD &tright,
1698
                                  ColPartition_LIST *block_parts,
1699
0
                                  ColPartition_LIST *used_parts) {
1700
0
  if (block_parts->empty()) {
1701
0
    return nullptr; // Nothing to do.
1702
0
  }
1703
  // If the block_parts are not in reading order, then it will make an invalid
1704
  // block polygon and bounding_box, so sort by bounding box now just to make
1705
  // sure.
1706
0
  block_parts->sort(&ColPartition::SortByBBox);
1707
0
  ColPartition_IT it(block_parts);
1708
0
  ColPartition *part = it.data();
1709
0
  PolyBlockType type = part->type();
1710
0
  if (type == PT_VERTICAL_TEXT) {
1711
0
    return MakeVerticalTextBlock(bleft, tright, block_parts, used_parts);
1712
0
  }
1713
  // LineSpacingBlocks has handed us a collection of evenly spaced lines and
1714
  // put the average spacing in each partition, so we can just take the
1715
  // linespacing from the first partition.
1716
0
  int line_spacing = part->bottom_spacing();
1717
0
  if (line_spacing < part->median_height()) {
1718
0
    line_spacing = part->bounding_box().height();
1719
0
  }
1720
0
  ICOORDELT_LIST vertices;
1721
0
  ICOORDELT_IT vert_it(&vertices);
1722
0
  ICOORD start, end;
1723
0
  int min_x = INT32_MAX;
1724
0
  int max_x = -INT32_MAX;
1725
0
  int min_y = INT32_MAX;
1726
0
  int max_y = -INT32_MAX;
1727
0
  int iteration = 0;
1728
0
  do {
1729
0
    if (iteration == 0) {
1730
0
      ColPartition::LeftEdgeRun(&it, &start, &end);
1731
0
    } else {
1732
0
      ColPartition::RightEdgeRun(&it, &start, &end);
1733
0
    }
1734
0
    ClipCoord(bleft, tright, &start);
1735
0
    ClipCoord(bleft, tright, &end);
1736
0
    vert_it.add_after_then_move(new ICOORDELT(start));
1737
0
    vert_it.add_after_then_move(new ICOORDELT(end));
1738
0
    UpdateRange(start.x(), &min_x, &max_x);
1739
0
    UpdateRange(end.x(), &min_x, &max_x);
1740
0
    UpdateRange(start.y(), &min_y, &max_y);
1741
0
    UpdateRange(end.y(), &min_y, &max_y);
1742
0
    if ((iteration == 0 && it.at_first()) || (iteration == 1 && it.at_last())) {
1743
0
      ++iteration;
1744
0
      it.move_to_last();
1745
0
    }
1746
0
  } while (iteration < 2);
1747
0
  if (textord_debug_tabfind) {
1748
0
    tprintf("Making block at (%d,%d)->(%d,%d)\n", min_x, min_y, max_x, max_y);
1749
0
  }
1750
0
  auto *block = new BLOCK("", true, 0, 0, min_x, min_y, max_x, max_y);
1751
0
  block->pdblk.set_poly_block(new POLY_BLOCK(&vertices, type));
1752
0
  return MoveBlobsToBlock(false, line_spacing, block, block_parts, used_parts);
1753
0
}
1754
1755
// Constructs a block from the given list of vertical text partitions.
1756
// Currently only creates rectangular blocks.
1757
TO_BLOCK *ColPartition::MakeVerticalTextBlock(const ICOORD &bleft,
1758
                                              const ICOORD &tright,
1759
                                              ColPartition_LIST *block_parts,
1760
0
                                              ColPartition_LIST *used_parts) {
1761
0
  if (block_parts->empty()) {
1762
0
    return nullptr; // Nothing to do.
1763
0
  }
1764
0
  ColPartition_IT it(block_parts);
1765
0
  ColPartition *part = it.data();
1766
0
  TBOX block_box = part->bounding_box();
1767
0
  int line_spacing = block_box.width();
1768
0
  PolyBlockType type = it.data()->type();
1769
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1770
0
    block_box += it.data()->bounding_box();
1771
0
  }
1772
0
  if (textord_debug_tabfind) {
1773
0
    tprintf("Making block at:");
1774
0
    block_box.print();
1775
0
  }
1776
0
  auto *block = new BLOCK("", true, 0, 0, block_box.left(), block_box.bottom(),
1777
0
                          block_box.right(), block_box.top());
1778
0
  block->pdblk.set_poly_block(new POLY_BLOCK(block_box, type));
1779
0
  return MoveBlobsToBlock(true, line_spacing, block, block_parts, used_parts);
1780
0
}
1781
1782
// Makes a TO_ROW matching this and moves all the blobs to it, transferring
1783
// ownership to returned TO_ROW.
1784
0
TO_ROW *ColPartition::MakeToRow() {
1785
0
  BLOBNBOX_C_IT blob_it(&boxes_);
1786
0
  TO_ROW *row = nullptr;
1787
0
  int line_size = IsVerticalType() ? median_width_ : median_height_;
1788
  // Add all the blobs to a single TO_ROW.
1789
0
  for (; !blob_it.empty(); blob_it.forward()) {
1790
0
    BLOBNBOX *blob = blob_it.extract();
1791
    //    blob->compute_bounding_box();
1792
0
    int top = blob->bounding_box().top();
1793
0
    int bottom = blob->bounding_box().bottom();
1794
0
    if (row == nullptr) {
1795
0
      row =
1796
0
          new TO_ROW(blob, static_cast<float>(top), static_cast<float>(bottom),
1797
0
                     static_cast<float>(line_size));
1798
0
    } else {
1799
0
      row->add_blob(blob, static_cast<float>(top), static_cast<float>(bottom),
1800
0
                    static_cast<float>(line_size));
1801
0
    }
1802
0
  }
1803
0
  return row;
1804
0
}
1805
1806
// Returns a copy of everything except the list of boxes. The resulting
1807
// ColPartition is only suitable for keeping in a column candidate list.
1808
0
ColPartition *ColPartition::ShallowCopy() const {
1809
0
  auto *part = new ColPartition(blob_type_, vertical_);
1810
0
  part->left_margin_ = left_margin_;
1811
0
  part->right_margin_ = right_margin_;
1812
0
  part->bounding_box_ = bounding_box_;
1813
0
  memcpy(part->special_blobs_densities_, special_blobs_densities_,
1814
0
         sizeof(special_blobs_densities_));
1815
0
  part->median_bottom_ = median_bottom_;
1816
0
  part->median_top_ = median_top_;
1817
0
  part->median_height_ = median_height_;
1818
0
  part->median_left_ = median_left_;
1819
0
  part->median_right_ = median_right_;
1820
0
  part->median_width_ = median_width_;
1821
0
  part->good_width_ = good_width_;
1822
0
  part->good_column_ = good_column_;
1823
0
  part->left_key_tab_ = left_key_tab_;
1824
0
  part->right_key_tab_ = right_key_tab_;
1825
0
  part->type_ = type_;
1826
0
  part->flow_ = flow_;
1827
0
  part->left_key_ = left_key_;
1828
0
  part->right_key_ = right_key_;
1829
0
  part->first_column_ = first_column_;
1830
0
  part->last_column_ = last_column_;
1831
0
  part->owns_blobs_ = false;
1832
0
  return part;
1833
0
}
1834
1835
0
ColPartition *ColPartition::CopyButDontOwnBlobs() {
1836
0
  ColPartition *copy = ShallowCopy();
1837
0
  copy->set_owns_blobs(false);
1838
0
  BLOBNBOX_C_IT inserter(copy->boxes());
1839
0
  BLOBNBOX_C_IT traverser(boxes());
1840
0
  for (traverser.mark_cycle_pt(); !traverser.cycled_list();
1841
0
       traverser.forward()) {
1842
0
    inserter.add_after_then_move(traverser.data());
1843
0
  }
1844
0
  return copy;
1845
0
}
1846
1847
#ifndef GRAPHICS_DISABLED
1848
// Provides a color for BBGrid to draw the rectangle.
1849
// Must be kept in sync with PolyBlockType.
1850
ScrollView::Color ColPartition::BoxColor() const {
1851
  if (type_ == PT_UNKNOWN) {
1852
    return BLOBNBOX::TextlineColor(blob_type_, flow_);
1853
  }
1854
  return POLY_BLOCK::ColorForPolyBlockType(type_);
1855
}
1856
#endif // !GRAPHICS_DISABLED
1857
1858
// Keep in sync with BlobRegionType.
1859
static char kBlobTypes[BRT_COUNT + 1] = "NHSRIUVT";
1860
1861
// Prints debug information on this.
1862
0
void ColPartition::Print() const {
1863
0
  int y = MidY();
1864
0
  tprintf(
1865
0
      "ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
1866
0
      " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
1867
0
      " ts=%d bs=%d ls=%d rs=%d\n",
1868
0
      boxes_.empty() ? 'E' : ' ', left_margin_, left_key_tab_ ? 'T' : 'B',
1869
0
      LeftAtY(y), bounding_box_.left(), median_left_, bounding_box_.bottom(),
1870
0
      median_bottom_, bounding_box_.right(), RightAtY(y),
1871
0
      right_key_tab_ ? 'T' : 'B', right_margin_, median_right_,
1872
0
      bounding_box_.top(), median_top_, good_width_, good_column_, type_,
1873
0
      kBlobTypes[blob_type_], flow_, first_column_, last_column_,
1874
0
      boxes_.length(), space_above_, space_below_, space_to_left_,
1875
0
      space_to_right_);
1876
0
}
1877
1878
// Prints debug information on the colors.
1879
0
void ColPartition::PrintColors() {
1880
0
  tprintf("Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n", color1_[COLOR_RED],
1881
0
          color1_[COLOR_GREEN], color1_[COLOR_BLUE], color1_[L_ALPHA_CHANNEL],
1882
0
          color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
1883
0
}
1884
1885
// Sets the types of all partitions in the run to be the max of the types.
1886
0
void ColPartition::SmoothPartnerRun(int working_set_count) {
1887
0
  STATS left_stats(0, working_set_count - 1);
1888
0
  STATS right_stats(0, working_set_count - 1);
1889
0
  PolyBlockType max_type = type_;
1890
0
  ColPartition *partner;
1891
0
  for (partner = SingletonPartner(false); partner != nullptr;
1892
0
       partner = partner->SingletonPartner(false)) {
1893
0
    if (partner->type_ > max_type) {
1894
0
      max_type = partner->type_;
1895
0
    }
1896
0
    if (column_set_ == partner->column_set_) {
1897
0
      left_stats.add(partner->first_column_, 1);
1898
0
      right_stats.add(partner->last_column_, 1);
1899
0
    }
1900
0
  }
1901
0
  type_ = max_type;
1902
  // TODO(rays) Either establish that it isn't necessary to set the columns,
1903
  // or find a way to do it that does not cause an assert failure in
1904
  // AddToWorkingSet.
1905
#if 0
1906
  first_column_ = left_stats.mode();
1907
  last_column_ = right_stats.mode();
1908
  if (last_column_ < first_column_)
1909
    last_column_ = first_column_;
1910
#endif
1911
1912
0
  for (partner = SingletonPartner(false); partner != nullptr;
1913
0
       partner = partner->SingletonPartner(false)) {
1914
0
    partner->type_ = max_type;
1915
#if 0 // See TODO above
1916
    if (column_set_ == partner->column_set_) {
1917
      partner->first_column_ = first_column_;
1918
      partner->last_column_ = last_column_;
1919
    }
1920
#endif
1921
0
  }
1922
0
}
1923
1924
// ======= Scenario common to all Refine*Partners* functions =======
1925
// ColPartitions are aiming to represent textlines, or horizontal slices
1926
// of images, and we are trying to form bi-directional (upper/lower) chains
1927
// of UNIQUE partner ColPartitions that can be made into blocks.
1928
// The ColPartitions have previously been typed (see SetPartitionType)
1929
// according to a combination of the content type and
1930
// how they lie on the columns. We want to chain text into
1931
// groups of a single type, but image ColPartitions may have been typed
1932
// differently in different parts of the image, due to being non-rectangular.
1933
//
1934
// We previously ran a search for upper and lower partners, but there may
1935
// be more than one, and they may be of mixed types, so now we wish to
1936
// refine the partners down to at most one.
1937
// A heading may have multiple partners:
1938
// ===============================
1939
// ========  ==========  =========
1940
// ========  ==========  =========
1941
// but it should be a different type.
1942
// A regular flowing text line may have multiple partners:
1943
// ==================   ===================
1944
// =======   =================  ===========
1945
// This could be the start of a pull-out, or it might all be in a single
1946
// column and might be caused by tightly spaced text, bold words, bullets,
1947
// funny punctuation etc, all of which can cause textlines to be split into
1948
// multiple ColPartitions. Pullouts and figure captions should now be different
1949
// types so we can more aggressively merge groups of partners that all sit
1950
// in a single column.
1951
//
1952
// Cleans up the partners of the given type so that there is at most
1953
// one partner. This makes block creation simpler.
1954
// If get_desperate is true, goes to more desperate merge methods
1955
// to merge flowing text before breaking partnerships.
1956
void ColPartition::RefinePartners(PolyBlockType type, bool get_desperate,
1957
0
                                  ColPartitionGrid *grid) {
1958
0
  if (TypesSimilar(type_, type)) {
1959
0
    RefinePartnersInternal(true, get_desperate, grid);
1960
0
    RefinePartnersInternal(false, get_desperate, grid);
1961
0
  } else if (type == PT_COUNT) {
1962
    // This is the final pass. Make sure only the correctly typed
1963
    // partners surivive, however many there are.
1964
0
    RefinePartnersByType(true, &upper_partners_);
1965
0
    RefinePartnersByType(false, &lower_partners_);
1966
    // It is possible for a merge to have given a partition multiple
1967
    // partners again, so the last resort is to use overlap which is
1968
    // guaranteed to leave at most one partner left.
1969
0
    if (!upper_partners_.empty() && !upper_partners_.singleton()) {
1970
0
      RefinePartnersByOverlap(true, &upper_partners_);
1971
0
    }
1972
0
    if (!lower_partners_.empty() && !lower_partners_.singleton()) {
1973
0
      RefinePartnersByOverlap(false, &lower_partners_);
1974
0
    }
1975
0
  }
1976
0
}
1977
1978
////////////////// PRIVATE CODE /////////////////////////////
1979
1980
// Cleans up the partners above if upper is true, else below.
1981
// If get_desperate is true, goes to more desperate merge methods
1982
// to merge flowing text before breaking partnerships.
1983
void ColPartition::RefinePartnersInternal(bool upper, bool get_desperate,
1984
0
                                          ColPartitionGrid *grid) {
1985
0
  ColPartition_CLIST *partners = upper ? &upper_partners_ : &lower_partners_;
1986
0
  if (!partners->empty() && !partners->singleton()) {
1987
0
    RefinePartnersByType(upper, partners);
1988
0
    if (!partners->empty() && !partners->singleton()) {
1989
      // Check for transitive partnerships and break the cycle.
1990
0
      RefinePartnerShortcuts(upper, partners);
1991
0
      if (!partners->empty() && !partners->singleton()) {
1992
        // Types didn't fix it. Flowing text keeps the one with the longest
1993
        // sequence of singleton matching partners. All others max overlap.
1994
0
        if (TypesSimilar(type_, PT_FLOWING_TEXT) && get_desperate) {
1995
0
          RefineTextPartnersByMerge(upper, false, partners, grid);
1996
0
          if (!partners->empty() && !partners->singleton()) {
1997
0
            RefineTextPartnersByMerge(upper, true, partners, grid);
1998
0
          }
1999
0
        }
2000
        // The last resort is to use overlap.
2001
0
        if (!partners->empty() && !partners->singleton()) {
2002
0
          RefinePartnersByOverlap(upper, partners);
2003
0
        }
2004
0
      }
2005
0
    }
2006
0
  }
2007
0
}
2008
2009
// Cleans up the partners above if upper is true, else below.
2010
// Restricts the partners to only desirable types. For text and BRT_HLINE this
2011
// means the same type_ , and for image types it means any image type.
2012
void ColPartition::RefinePartnersByType(bool upper,
2013
0
                                        ColPartition_CLIST *partners) {
2014
0
  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2015
0
                                         bounding_box_.bottom());
2016
0
  if (debug) {
2017
0
    tprintf("Refining %d %s partners by type for:\n", partners->length(),
2018
0
            upper ? "Upper" : "Lower");
2019
0
    Print();
2020
0
  }
2021
0
  ColPartition_C_IT it(partners);
2022
  // Purify text by type.
2023
0
  if (!IsImageType() && !IsLineType() && type() != PT_TABLE) {
2024
    // Keep only partners matching type_.
2025
    // Exception: PT_VERTICAL_TEXT is allowed to stay with the other
2026
    // text types if it is the only partner.
2027
0
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2028
0
      ColPartition *partner = it.data();
2029
0
      if (!TypesSimilar(type_, partner->type_)) {
2030
0
        if (debug) {
2031
0
          tprintf("Removing partner:");
2032
0
          partner->Print();
2033
0
        }
2034
0
        partner->RemovePartner(!upper, this);
2035
0
        it.extract();
2036
0
      } else if (debug) {
2037
0
        tprintf("Keeping partner:");
2038
0
        partner->Print();
2039
0
      }
2040
0
    }
2041
0
  } else {
2042
    // Only polyimages are allowed to have partners of any kind!
2043
0
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2044
0
      ColPartition *partner = it.data();
2045
0
      if (partner->blob_type() != BRT_POLYIMAGE ||
2046
0
          blob_type() != BRT_POLYIMAGE) {
2047
0
        if (debug) {
2048
0
          tprintf("Removing partner:");
2049
0
          partner->Print();
2050
0
        }
2051
0
        partner->RemovePartner(!upper, this);
2052
0
        it.extract();
2053
0
      } else if (debug) {
2054
0
        tprintf("Keeping partner:");
2055
0
        partner->Print();
2056
0
      }
2057
0
    }
2058
0
  }
2059
0
}
2060
2061
// Cleans up the partners above if upper is true, else below.
2062
// Remove transitive partnerships: this<->a, and a<->b and this<->b.
2063
// Gets rid of this<->b, leaving a clean chain.
2064
// Also if we have this<->a and a<->this, then gets rid of this<->a, as
2065
// this has multiple partners.
2066
void ColPartition::RefinePartnerShortcuts(bool upper,
2067
0
                                          ColPartition_CLIST *partners) {
2068
0
  bool done_any = false;
2069
0
  do {
2070
0
    done_any = false;
2071
0
    ColPartition_C_IT it(partners);
2072
0
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2073
0
      ColPartition *a = it.data();
2074
      // Check for a match between all of a's partners (it1/b1) and all
2075
      // of this's partners (it2/b2).
2076
0
      ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
2077
0
      for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
2078
0
        ColPartition *b1 = it1.data();
2079
0
        if (b1 == this) {
2080
0
          done_any = true;
2081
0
          it.extract();
2082
0
          a->RemovePartner(!upper, this);
2083
0
          break;
2084
0
        }
2085
0
        ColPartition_C_IT it2(partners);
2086
0
        for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
2087
0
          ColPartition *b2 = it2.data();
2088
0
          if (b1 == b2) {
2089
            // Jackpot! b2 should not be a partner of this.
2090
0
            it2.extract();
2091
0
            b2->RemovePartner(!upper, this);
2092
0
            done_any = true;
2093
            // That potentially invalidated all the iterators, so break out
2094
            // and start again.
2095
0
            break;
2096
0
          }
2097
0
        }
2098
0
        if (done_any) {
2099
0
          break;
2100
0
        }
2101
0
      }
2102
0
      if (done_any) {
2103
0
        break;
2104
0
      }
2105
0
    }
2106
0
  } while (done_any && !partners->empty() && !partners->singleton());
2107
0
}
2108
2109
// Cleans up the partners above if upper is true, else below.
2110
// If multiple text partners can be merged, (with each other, NOT with this),
2111
// then do so.
2112
// If desperate is true, then an increase in overlap with the merge is
2113
// allowed. If the overlap increases, then the desperately_merged_ flag
2114
// is set, indicating that the textlines probably need to be regenerated
2115
// by aggressive line fitting/splitting, as there are probably vertically
2116
// joined blobs that cross textlines.
2117
void ColPartition::RefineTextPartnersByMerge(bool upper, bool desperate,
2118
                                             ColPartition_CLIST *partners,
2119
0
                                             ColPartitionGrid *grid) {
2120
0
  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2121
0
                                         bounding_box_.bottom());
2122
0
  if (debug) {
2123
0
    tprintf("Refining %d %s partners by merge for:\n", partners->length(),
2124
0
            upper ? "Upper" : "Lower");
2125
0
    Print();
2126
0
  }
2127
0
  while (!partners->empty() && !partners->singleton()) {
2128
    // Absorb will mess up the iterators, so we have to merge one partition
2129
    // at a time and rebuild the iterators each time.
2130
0
    ColPartition_C_IT it(partners);
2131
0
    ColPartition *part = it.data();
2132
    // Gather a list of merge candidates, from the list of partners, that
2133
    // are all in the same single column. See general scenario comment above.
2134
0
    ColPartition_CLIST candidates;
2135
0
    ColPartition_C_IT cand_it(&candidates);
2136
0
    for (it.forward(); !it.at_first(); it.forward()) {
2137
0
      ColPartition *candidate = it.data();
2138
0
      if (part->first_column_ == candidate->last_column_ &&
2139
0
          part->last_column_ == candidate->first_column_) {
2140
0
        cand_it.add_after_then_move(it.data());
2141
0
      }
2142
0
    }
2143
0
    int overlap_increase;
2144
0
    ColPartition *candidate = grid->BestMergeCandidate(
2145
0
        part, &candidates, debug, nullptr, &overlap_increase);
2146
0
    if (candidate != nullptr && (overlap_increase <= 0 || desperate)) {
2147
0
      if (debug) {
2148
0
        tprintf("Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
2149
0
                part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
2150
0
                overlap_increase);
2151
0
      }
2152
      // Remove before merge and re-insert to keep the integrity of the grid.
2153
0
      grid->RemoveBBox(candidate);
2154
0
      grid->RemoveBBox(part);
2155
0
      part->Absorb(candidate, nullptr);
2156
      // We modified the box of part, so re-insert it into the grid.
2157
0
      grid->InsertBBox(true, true, part);
2158
0
      if (overlap_increase > 0) {
2159
0
        part->desperately_merged_ = true;
2160
0
      }
2161
0
    } else {
2162
0
      break; // Can't merge.
2163
0
    }
2164
0
  }
2165
0
}
2166
2167
// Cleans up the partners above if upper is true, else below.
2168
// Keep the partner with the biggest overlap.
2169
void ColPartition::RefinePartnersByOverlap(bool upper,
2170
0
                                           ColPartition_CLIST *partners) {
2171
0
  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2172
0
                                         bounding_box_.bottom());
2173
0
  if (debug) {
2174
0
    tprintf("Refining %d %s partners by overlap for:\n", partners->length(),
2175
0
            upper ? "Upper" : "Lower");
2176
0
    Print();
2177
0
  }
2178
0
  ColPartition_C_IT it(partners);
2179
0
  ColPartition *best_partner = it.data();
2180
  // Find the partner with the best overlap.
2181
0
  int best_overlap = 0;
2182
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2183
0
    ColPartition *partner = it.data();
2184
0
    int overlap =
2185
0
        std::min(bounding_box_.right(), partner->bounding_box_.right()) -
2186
0
        std::max(bounding_box_.left(), partner->bounding_box_.left());
2187
0
    if (overlap > best_overlap) {
2188
0
      best_overlap = overlap;
2189
0
      best_partner = partner;
2190
0
    }
2191
0
  }
2192
  // Keep only the best partner.
2193
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2194
0
    ColPartition *partner = it.data();
2195
0
    if (partner != best_partner) {
2196
0
      if (debug) {
2197
0
        tprintf("Removing partner:");
2198
0
        partner->Print();
2199
0
      }
2200
0
      partner->RemovePartner(!upper, this);
2201
0
      it.extract();
2202
0
    }
2203
0
  }
2204
0
}
2205
2206
// Return true if bbox belongs better in this than other.
2207
bool ColPartition::ThisPartitionBetter(BLOBNBOX *bbox,
2208
0
                                       const ColPartition &other) {
2209
0
  const TBOX &box = bbox->bounding_box();
2210
  // Margins take priority.
2211
0
  int left = box.left();
2212
0
  int right = box.right();
2213
0
  if (left < left_margin_ || right > right_margin_) {
2214
0
    return false;
2215
0
  }
2216
0
  if (left < other.left_margin_ || right > other.right_margin_) {
2217
0
    return true;
2218
0
  }
2219
0
  int top = box.top();
2220
0
  int bottom = box.bottom();
2221
0
  int this_overlap =
2222
0
      std::min(top, median_top_) - std::max(bottom, median_bottom_);
2223
0
  int other_overlap =
2224
0
      std::min(top, other.median_top_) - std::max(bottom, other.median_bottom_);
2225
0
  int this_miss = median_top_ - median_bottom_ - this_overlap;
2226
0
  int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
2227
0
  if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {
2228
0
    tprintf("Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
2229
0
            box.left(), box.bottom(), box.right(), box.top(), this_overlap,
2230
0
            other_overlap, this_miss, other_miss, median_top_,
2231
0
            other.median_top_);
2232
0
  }
2233
0
  if (this_miss < other_miss) {
2234
0
    return true;
2235
0
  }
2236
0
  if (this_miss > other_miss) {
2237
0
    return false;
2238
0
  }
2239
0
  if (this_overlap > other_overlap) {
2240
0
    return true;
2241
0
  }
2242
0
  if (this_overlap < other_overlap) {
2243
0
    return false;
2244
0
  }
2245
0
  return median_top_ >= other.median_top_;
2246
0
}
2247
2248
// Returns the median line-spacing between the current position and the end
2249
// of the list.
2250
// The iterator is passed by value so the iteration does not modify the
2251
// caller's iterator.
2252
0
static int MedianSpacing(int page_height, ColPartition_IT it) {
2253
0
  STATS stats(0, page_height - 1);
2254
0
  while (!it.cycled_list()) {
2255
0
    ColPartition *part = it.data();
2256
0
    it.forward();
2257
0
    stats.add(part->bottom_spacing(), 1);
2258
0
    stats.add(part->top_spacing(), 1);
2259
0
  }
2260
0
  return static_cast<int>(stats.median() + 0.5);
2261
0
}
2262
2263
// Returns true if this column partition is in the same column as
2264
// part. This function will only work after the SetPartitionType function
2265
// has been called on both column partitions. This is useful for
2266
// doing a SideSearch when you want things in the same page column.
2267
//
2268
// Currently called by the table detection code to identify if potential table
2269
// partitions exist in the same column.
2270
0
bool ColPartition::IsInSameColumnAs(const ColPartition &part) const {
2271
  // Overlap does not occur when last < part.first or first > part.last.
2272
  // In other words, one is completely to the side of the other.
2273
  // This is just DeMorgan's law applied to that so the function returns true.
2274
0
  return (last_column_ >= part.first_column_) &&
2275
0
         (first_column_ <= part.last_column_);
2276
0
}
2277
2278
// Smoothes the spacings in the list into groups of equal linespacing.
2279
// resolution is the resolution of the original image, used as a basis
2280
// for thresholds in change of spacing. page_height is in pixels.
2281
void ColPartition::SmoothSpacings(int resolution, int page_height,
2282
0
                                  ColPartition_LIST *parts) {
2283
  // The task would be trivial if we didn't have to allow for blips -
2284
  // occasional offsets in spacing caused by anomalous text, such as all
2285
  // caps, groups of descenders, joined words, Arabic etc.
2286
  // The neighbourhood stores a consecutive group of partitions so that
2287
  // blips can be detected correctly, yet conservatively enough to not
2288
  // mistake genuine spacing changes for blips. See example below.
2289
0
  ColPartition *neighbourhood[PN_COUNT];
2290
0
  ColPartition_IT it(parts);
2291
0
  it.mark_cycle_pt();
2292
  // Although we know nothing about the spacings is this list, the median is
2293
  // used as an approximation to allow blips.
2294
  // If parts of this block aren't spaced to the median, then we can't
2295
  // accept blips in those parts, but we'll recalculate it each time we
2296
  // split the block, so the median becomes more likely to match all the text.
2297
0
  int median_space = MedianSpacing(page_height, it);
2298
0
  ColPartition_IT start_it(it);
2299
0
  ColPartition_IT end_it(it);
2300
0
  for (int i = 0; i < PN_COUNT; ++i) {
2301
0
    if (i < PN_UPPER || it.cycled_list()) {
2302
0
      neighbourhood[i] = nullptr;
2303
0
    } else {
2304
0
      if (i == PN_LOWER) {
2305
0
        end_it = it;
2306
0
      }
2307
0
      neighbourhood[i] = it.data();
2308
0
      it.forward();
2309
0
    }
2310
0
  }
2311
0
  while (neighbourhood[PN_UPPER] != nullptr) {
2312
    // Test for end of a group. Normally SpacingsEqual is true within a group,
2313
    // but in the case of a blip, it will be false. Here is an example:
2314
    // Line enum   Spacing below (spacing between tops of lines)
2315
    //  1   ABOVE2    20
2316
    //  2   ABOVE1    20
2317
    //  3   UPPER     15
2318
    //  4   LOWER     25
2319
    //  5   BELOW1    20
2320
    //  6   BELOW2    20
2321
    // Line 4 is all in caps (regular caps), so the spacing between line 3
2322
    // and line 4 (looking at the tops) is smaller than normal, and the
2323
    // spacing between line 4 and line 5 is larger than normal, but the
2324
    // two of them add to twice the normal spacing.
2325
    // The following if has to accept unequal spacings 3 times to pass the
2326
    // blip (20/15, 15/25 and 25/20)
2327
    // When the blip is in the middle, OKSpacingBlip tests that one of
2328
    // ABOVE1 and BELOW1 matches the median.
2329
    // The first time, everything is shifted down 1, so we present
2330
    // OKSpacingBlip with neighbourhood+1 and check that PN_UPPER is median.
2331
    // The last time, everything is shifted up 1, so we present OKSpacingBlip
2332
    // with neighbourhood-1 and check that PN_LOWER matches the median.
2333
0
    if (neighbourhood[PN_LOWER] == nullptr ||
2334
0
        (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
2335
0
                                                 resolution) &&
2336
0
         (neighbourhood[PN_UPPER] == nullptr ||
2337
0
          neighbourhood[PN_LOWER] == nullptr ||
2338
0
          !OKSpacingBlip(resolution, median_space, neighbourhood, 0)) &&
2339
0
         (neighbourhood[PN_UPPER - 1] == nullptr ||
2340
0
          neighbourhood[PN_LOWER - 1] == nullptr ||
2341
0
          !OKSpacingBlip(resolution, median_space, neighbourhood, -1) ||
2342
0
          !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
2343
0
         (neighbourhood[PN_UPPER + 1] == nullptr ||
2344
0
          neighbourhood[PN_LOWER + 1] == nullptr ||
2345
0
          !OKSpacingBlip(resolution, median_space, neighbourhood, 1) ||
2346
0
          !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
2347
      // The group has ended. PN_UPPER is the last member.
2348
      // Compute the mean spacing over the group.
2349
0
      ColPartition_IT sum_it(start_it);
2350
0
      ColPartition *last_part = neighbourhood[PN_UPPER];
2351
0
      double total_bottom = 0.0;
2352
0
      double total_top = 0.0;
2353
0
      int total_count = 0;
2354
0
      ColPartition *upper = sum_it.data();
2355
      // We do not process last_part, as its spacing is different.
2356
0
      while (upper != last_part) {
2357
0
        total_bottom += upper->bottom_spacing();
2358
0
        total_top += upper->top_spacing();
2359
0
        ++total_count;
2360
0
        sum_it.forward();
2361
0
        upper = sum_it.data();
2362
0
      }
2363
0
      if (total_count > 0) {
2364
        // There were at least 2 lines, so set them all to the mean.
2365
0
        int top_spacing = static_cast<int>(total_top / total_count + 0.5);
2366
0
        int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
2367
0
        if (textord_debug_tabfind) {
2368
0
          tprintf("Spacing run ended. Cause:");
2369
0
          if (neighbourhood[PN_LOWER] == nullptr) {
2370
0
            tprintf("No more lines\n");
2371
0
          } else {
2372
0
            tprintf("Spacing change. Spacings:\n");
2373
0
            for (int i = 0; i < PN_COUNT; ++i) {
2374
0
              if (neighbourhood[i] == nullptr) {
2375
0
                tprintf("NULL");
2376
0
                if (i > 0 && neighbourhood[i - 1] != nullptr) {
2377
0
                  if (neighbourhood[i - 1]->SingletonPartner(false) !=
2378
0
                      nullptr) {
2379
0
                    tprintf(" Lower partner:");
2380
0
                    neighbourhood[i - 1]->SingletonPartner(false)->Print();
2381
0
                  } else {
2382
0
                    tprintf(" nullptr lower partner:\n");
2383
0
                  }
2384
0
                } else {
2385
0
                  tprintf("\n");
2386
0
                }
2387
0
              } else {
2388
0
                tprintf("Top = %d, bottom = %d\n",
2389
0
                        neighbourhood[i]->top_spacing(),
2390
0
                        neighbourhood[i]->bottom_spacing());
2391
0
              }
2392
0
            }
2393
0
          }
2394
0
          tprintf("Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
2395
0
        }
2396
0
        sum_it = start_it;
2397
0
        upper = sum_it.data();
2398
0
        while (upper != last_part) {
2399
0
          upper->set_top_spacing(top_spacing);
2400
0
          upper->set_bottom_spacing(bottom_spacing);
2401
0
          if (textord_debug_tabfind) {
2402
0
            tprintf("Setting mean on:");
2403
0
            upper->Print();
2404
0
          }
2405
0
          sum_it.forward();
2406
0
          upper = sum_it.data();
2407
0
        }
2408
0
      }
2409
      // PN_LOWER starts the next group and end_it is the next start_it.
2410
0
      start_it = end_it;
2411
      // Recalculate the median spacing to maximize the chances of detecting
2412
      // spacing blips.
2413
0
      median_space = MedianSpacing(page_height, end_it);
2414
0
    }
2415
    // Shuffle pointers.
2416
0
    for (int j = 1; j < PN_COUNT; ++j) {
2417
0
      neighbourhood[j - 1] = neighbourhood[j];
2418
0
    }
2419
0
    if (it.cycled_list()) {
2420
0
      neighbourhood[PN_COUNT - 1] = nullptr;
2421
0
    } else {
2422
0
      neighbourhood[PN_COUNT - 1] = it.data();
2423
0
      it.forward();
2424
0
    }
2425
0
    end_it.forward();
2426
0
  }
2427
0
}
2428
2429
// Returns true if the parts array of pointers to partitions matches the
2430
// condition for a spacing blip. See SmoothSpacings for what this means
2431
// and how it is used.
2432
bool ColPartition::OKSpacingBlip(int resolution, int median_spacing,
2433
0
                                 ColPartition **parts, int offset) {
2434
  // The blip is OK if upper and lower sum to an OK value and at least
2435
  // one of above1 and below1 is equal to the median.
2436
0
  parts += offset;
2437
0
  return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER], median_spacing,
2438
0
                                          resolution) &&
2439
0
         ((parts[PN_ABOVE1] != nullptr &&
2440
0
           parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
2441
0
          (parts[PN_BELOW1] != nullptr &&
2442
0
           parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
2443
0
}
2444
2445
// Returns true if both the top and bottom spacings of this match the given
2446
// spacing to within suitable margins dictated by the image resolution.
2447
0
bool ColPartition::SpacingEqual(int spacing, int resolution) const {
2448
0
  int bottom_error = BottomSpacingMargin(resolution);
2449
0
  int top_error = TopSpacingMargin(resolution);
2450
0
  return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
2451
0
         NearlyEqual(top_spacing_, spacing, top_error);
2452
0
}
2453
2454
// Returns true if both the top and bottom spacings of this and other
2455
// match to within suitable margins dictated by the image resolution.
2456
bool ColPartition::SpacingsEqual(const ColPartition &other,
2457
0
                                 int resolution) const {
2458
0
  int bottom_error = std::max(BottomSpacingMargin(resolution),
2459
0
                              other.BottomSpacingMargin(resolution));
2460
0
  int top_error = std::max(TopSpacingMargin(resolution),
2461
0
                           other.TopSpacingMargin(resolution));
2462
0
  return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
2463
0
         (NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
2464
0
          NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
2465
0
                      bottom_error));
2466
0
}
2467
2468
// Returns true if the sum spacing of this and other match the given
2469
// spacing (or twice the given spacing) to within a suitable margin dictated
2470
// by the image resolution.
2471
bool ColPartition::SummedSpacingOK(const ColPartition &other, int spacing,
2472
0
                                   int resolution) const {
2473
0
  int bottom_error = std::max(BottomSpacingMargin(resolution),
2474
0
                              other.BottomSpacingMargin(resolution));
2475
0
  int top_error = std::max(TopSpacingMargin(resolution),
2476
0
                           other.TopSpacingMargin(resolution));
2477
0
  int bottom_total = bottom_spacing_ + other.bottom_spacing_;
2478
0
  int top_total = top_spacing_ + other.top_spacing_;
2479
0
  return (NearlyEqual(spacing, bottom_total, bottom_error) &&
2480
0
          NearlyEqual(spacing, top_total, top_error)) ||
2481
0
         (NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
2482
0
          NearlyEqual(spacing * 2, top_total, top_error));
2483
0
}
2484
2485
// Returns a suitable spacing margin that can be applied to bottoms of
2486
// text lines, based on the resolution and the stored side_step_.
2487
0
int ColPartition::BottomSpacingMargin(int resolution) const {
2488
0
  return static_cast<int>(kMaxSpacingDrift * resolution + 0.5) + side_step_;
2489
0
}
2490
2491
// Returns a suitable spacing margin that can be applied to tops of
2492
// text lines, based on the resolution and the stored side_step_.
2493
0
int ColPartition::TopSpacingMargin(int resolution) const {
2494
0
  return static_cast<int>(kMaxTopSpacingFraction * median_height_ + 0.5) +
2495
0
         BottomSpacingMargin(resolution);
2496
0
}
2497
2498
// Returns true if the median text sizes of this and other agree to within
2499
// a reasonable multiplicative factor.
2500
0
bool ColPartition::SizesSimilar(const ColPartition &other) const {
2501
0
  return median_height_ <= other.median_height_ * kMaxSizeRatio &&
2502
0
         other.median_height_ <= median_height_ * kMaxSizeRatio;
2503
0
}
2504
2505
// Helper updates margin_left and margin_right, being the bounds of the left
2506
// margin of part of a block. Returns false and does not update the bounds if
2507
// this partition has a disjoint margin with the established margin.
2508
static bool UpdateLeftMargin(const ColPartition &part, int *margin_left,
2509
0
                             int *margin_right) {
2510
0
  const TBOX &part_box = part.bounding_box();
2511
0
  int top = part_box.top();
2512
0
  int bottom = part_box.bottom();
2513
0
  int tl_key = part.SortKey(part.left_margin(), top);
2514
0
  int tr_key = part.SortKey(part_box.left(), top);
2515
0
  int bl_key = part.SortKey(part.left_margin(), bottom);
2516
0
  int br_key = part.SortKey(part_box.left(), bottom);
2517
0
  int left_key = std::max(tl_key, bl_key);
2518
0
  int right_key = std::min(tr_key, br_key);
2519
0
  if (left_key <= *margin_right && right_key >= *margin_left) {
2520
    // This part is good - let's keep it.
2521
0
    *margin_right = std::min(*margin_right, right_key);
2522
0
    *margin_left = std::max(*margin_left, left_key);
2523
0
    return true;
2524
0
  }
2525
0
  return false;
2526
0
}
2527
2528
// Computes and returns in start, end a line segment formed from a
2529
// forwards-iterated group of left edges of partitions that satisfy the
2530
// condition that the intersection of the left margins is non-empty, ie the
2531
// rightmost left margin is to the left of the leftmost left bounding box edge.
2532
// On return the iterator is set to the start of the next run.
2533
void ColPartition::LeftEdgeRun(ColPartition_IT *part_it, ICOORD *start,
2534
0
                               ICOORD *end) {
2535
0
  ColPartition *part = part_it->data();
2536
0
  ColPartition *start_part = part;
2537
0
  int start_y = part->bounding_box_.top();
2538
0
  if (!part_it->at_first()) {
2539
0
    int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
2540
0
    if (prev_bottom < start_y) {
2541
0
      start_y = prev_bottom;
2542
0
    } else if (prev_bottom > start_y) {
2543
0
      start_y = (start_y + prev_bottom) / 2;
2544
0
    }
2545
0
  }
2546
0
  int end_y = part->bounding_box_.bottom();
2547
0
  int margin_right = INT32_MAX;
2548
0
  int margin_left = -INT32_MAX;
2549
0
  UpdateLeftMargin(*part, &margin_left, &margin_right);
2550
0
  do {
2551
0
    part_it->forward();
2552
0
    part = part_it->data();
2553
0
  } while (!part_it->at_first() &&
2554
0
           UpdateLeftMargin(*part, &margin_left, &margin_right));
2555
  // The run ended. If we were pushed inwards, compute the next run and
2556
  // extend it backwards into the run we just calculated to find the end of
2557
  // this run that provides a tight box.
2558
0
  int next_margin_right = INT32_MAX;
2559
0
  int next_margin_left = -INT32_MAX;
2560
0
  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);
2561
0
  if (next_margin_left > margin_right) {
2562
0
    ColPartition_IT next_it(*part_it);
2563
0
    do {
2564
0
      next_it.forward();
2565
0
      part = next_it.data();
2566
0
    } while (!next_it.at_first() &&
2567
0
             UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2568
    // Now extend the next run backwards into the original run to get the
2569
    // tightest fit.
2570
0
    do {
2571
0
      part_it->backward();
2572
0
      part = part_it->data();
2573
0
    } while (part != start_part &&
2574
0
             UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2575
0
    part_it->forward();
2576
0
  }
2577
  // Now calculate the end_y.
2578
0
  part = part_it->data_relative(-1);
2579
0
  end_y = part->bounding_box_.bottom();
2580
0
  if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y) {
2581
0
    end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
2582
0
  }
2583
0
  start->set_y(start_y);
2584
0
  start->set_x(part->XAtY(margin_right, start_y));
2585
0
  end->set_y(end_y);
2586
0
  end->set_x(part->XAtY(margin_right, end_y));
2587
0
  if (textord_debug_tabfind && !part_it->at_first()) {
2588
0
    tprintf("Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2589
0
            start_y, end_y, part->XAtY(margin_left, end_y), end->x(),
2590
0
            part->left_margin_, part->bounding_box_.left());
2591
0
  }
2592
0
}
2593
2594
// Helper updates margin_left and margin_right, being the bounds of the right
2595
// margin of part of a block. Returns false and does not update the bounds if
2596
// this partition has a disjoint margin with the established margin.
2597
static bool UpdateRightMargin(const ColPartition &part, int *margin_left,
2598
0
                              int *margin_right) {
2599
0
  const TBOX &part_box = part.bounding_box();
2600
0
  int top = part_box.top();
2601
0
  int bottom = part_box.bottom();
2602
0
  int tl_key = part.SortKey(part_box.right(), top);
2603
0
  int tr_key = part.SortKey(part.right_margin(), top);
2604
0
  int bl_key = part.SortKey(part_box.right(), bottom);
2605
0
  int br_key = part.SortKey(part.right_margin(), bottom);
2606
0
  int left_key = std::max(tl_key, bl_key);
2607
0
  int right_key = std::min(tr_key, br_key);
2608
0
  if (left_key <= *margin_right && right_key >= *margin_left) {
2609
    // This part is good - let's keep it.
2610
0
    *margin_right = std::min(*margin_right, right_key);
2611
0
    *margin_left = std::max(*margin_left, left_key);
2612
0
    return true;
2613
0
  }
2614
0
  return false;
2615
0
}
2616
2617
// Computes and returns in start, end a line segment formed from a
2618
// backwards-iterated group of right edges of partitions that satisfy the
2619
// condition that the intersection of the right margins is non-empty, ie the
2620
// leftmost right margin is to the right of the rightmost right bounding box
2621
// edge.
2622
// On return the iterator is set to the start of the next run.
2623
void ColPartition::RightEdgeRun(ColPartition_IT *part_it, ICOORD *start,
2624
0
                                ICOORD *end) {
2625
0
  ColPartition *part = part_it->data();
2626
0
  ColPartition *start_part = part;
2627
0
  int start_y = part->bounding_box_.bottom();
2628
0
  if (!part_it->at_last()) {
2629
0
    int next_y = part_it->data_relative(1)->bounding_box_.top();
2630
0
    if (next_y > start_y) {
2631
0
      start_y = next_y;
2632
0
    } else if (next_y < start_y) {
2633
0
      start_y = (start_y + next_y) / 2;
2634
0
    }
2635
0
  }
2636
0
  int end_y = part->bounding_box_.top();
2637
0
  int margin_right = INT32_MAX;
2638
0
  int margin_left = -INT32_MAX;
2639
0
  UpdateRightMargin(*part, &margin_left, &margin_right);
2640
0
  do {
2641
0
    part_it->backward();
2642
0
    part = part_it->data();
2643
0
  } while (!part_it->at_last() &&
2644
0
           UpdateRightMargin(*part, &margin_left, &margin_right));
2645
  // The run ended. If we were pushed inwards, compute the next run and
2646
  // extend it backwards to find the end of this run for a tight box.
2647
0
  int next_margin_right = INT32_MAX;
2648
0
  int next_margin_left = -INT32_MAX;
2649
0
  UpdateRightMargin(*part, &next_margin_left, &next_margin_right);
2650
0
  if (next_margin_right < margin_left) {
2651
0
    ColPartition_IT next_it(*part_it);
2652
0
    do {
2653
0
      next_it.backward();
2654
0
      part = next_it.data();
2655
0
    } while (!next_it.at_last() &&
2656
0
             UpdateRightMargin(*part, &next_margin_left, &next_margin_right));
2657
    // Now extend the next run forwards into the original run to get the
2658
    // tightest fit.
2659
0
    do {
2660
0
      part_it->forward();
2661
0
      part = part_it->data();
2662
0
    } while (part != start_part &&
2663
0
             UpdateRightMargin(*part, &next_margin_left, &next_margin_right));
2664
0
    part_it->backward();
2665
0
  }
2666
  // Now calculate the end_y.
2667
0
  part = part_it->data_relative(1);
2668
0
  end_y = part->bounding_box().top();
2669
0
  if (!part_it->at_last() && part_it->data()->bounding_box_.bottom() > end_y) {
2670
0
    end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
2671
0
  }
2672
0
  start->set_y(start_y);
2673
0
  start->set_x(part->XAtY(margin_left, start_y));
2674
0
  end->set_y(end_y);
2675
0
  end->set_x(part->XAtY(margin_left, end_y));
2676
0
  if (textord_debug_tabfind && !part_it->at_last()) {
2677
0
    tprintf("Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2678
0
            start_y, end_y, end->x(), part->XAtY(margin_right, end_y),
2679
0
            part->bounding_box_.right(), part->right_margin_);
2680
0
  }
2681
0
}
2682
2683
} // namespace tesseract.