Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/classify/shapetable.cpp
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2010 Google Inc. All Rights Reserved.
2
// Author: rays@google.com (Ray Smith)
3
///////////////////////////////////////////////////////////////////////
4
// File:        shapetable.cpp
5
// Description: Class to map a classifier shape index to unicharset
6
//              indices and font indices.
7
// Author:      Ray Smith
8
//
9
// (C) Copyright 2010, Google Inc.
10
// Licensed under the Apache License, Version 2.0 (the "License");
11
// you may not use this file except in compliance with the License.
12
// You may obtain a copy of the License at
13
// http://www.apache.org/licenses/LICENSE-2.0
14
// Unless required by applicable law or agreed to in writing, software
15
// distributed under the License is distributed on an "AS IS" BASIS,
16
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
// See the License for the specific language governing permissions and
18
// limitations under the License.
19
//
20
///////////////////////////////////////////////////////////////////////
21
22
#include "shapetable.h"
23
24
#include "bitvector.h"
25
#include "fontinfo.h"
26
#include "intfeaturespace.h"
27
#include "unicharset.h"
28
#include "unicity_table.h"
29
30
#include <algorithm>
31
32
namespace tesseract {
33
34
// Helper function to get the index of the first result with the required
35
// unichar_id. If the results are sorted by rating, this will also be the
36
// best result with the required unichar_id.
37
// Returns -1 if the unichar_id is not found
38
int ShapeRating::FirstResultWithUnichar(const std::vector<ShapeRating> &results,
39
0
                                        const ShapeTable &shape_table, UNICHAR_ID unichar_id) {
40
0
  for (unsigned r = 0; r < results.size(); ++r) {
41
0
    const auto shape_id = results[r].shape_id;
42
0
    const Shape &shape = shape_table.GetShape(shape_id);
43
0
    if (shape.ContainsUnichar(unichar_id)) {
44
0
      return r;
45
0
    }
46
0
  }
47
0
  return -1;
48
0
}
49
50
// Helper function to get the index of the first result with the required
51
// unichar_id. If the results are sorted by rating, this will also be the
52
// best result with the required unichar_id.
53
// Returns -1 if the unichar_id is not found
54
int UnicharRating::FirstResultWithUnichar(const std::vector<UnicharRating> &results,
55
0
                                          UNICHAR_ID unichar_id) {
56
0
  for (unsigned r = 0; r < results.size(); ++r) {
57
0
    if (results[r].unichar_id == unichar_id) {
58
0
      return r;
59
0
    }
60
0
  }
61
0
  return -1;
62
0
}
63
64
// Writes to the given file. Returns false in case of error.
65
0
bool UnicharAndFonts::Serialize(FILE *fp) const {
66
0
  return tesseract::Serialize(fp, &unichar_id) && tesseract::Serialize(fp, font_ids);
67
0
}
68
69
// Reads from the given file. Returns false in case of error.
70
14.0k
bool UnicharAndFonts::DeSerialize(TFile *fp) {
71
14.0k
  return fp->DeSerialize(&unichar_id) && fp->DeSerialize(font_ids);
72
14.0k
}
73
74
// Sort function to sort a pair of UnicharAndFonts by unichar_id.
75
0
int UnicharAndFonts::SortByUnicharId(const void *v1, const void *v2) {
76
0
  const auto *p1 = static_cast<const UnicharAndFonts *>(v1);
77
0
  const auto *p2 = static_cast<const UnicharAndFonts *>(v2);
78
0
  return p1->unichar_id - p2->unichar_id;
79
0
}
80
81
0
bool UnicharAndFonts::StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2) {
82
0
  return v1.unichar_id < v2.unichar_id;
83
0
}
84
85
// Writes to the given file. Returns false in case of error.
86
0
bool Shape::Serialize(FILE *fp) const {
87
0
  uint8_t sorted = unichars_sorted_;
88
0
  return tesseract::Serialize(fp, &sorted) && tesseract::Serialize(fp, unichars_);
89
0
}
90
// Reads from the given file. Returns false in case of error.
91
92
14.0k
bool Shape::DeSerialize(TFile *fp) {
93
14.0k
  uint8_t sorted;
94
14.0k
  if (!fp->DeSerialize(&sorted)) {
95
0
    return false;
96
0
  }
97
14.0k
  unichars_sorted_ = sorted != 0;
98
14.0k
  return fp->DeSerialize(unichars_);
99
14.0k
}
100
101
// Adds a font_id for the given unichar_id. If the unichar_id is not
102
// in the shape, it is added.
103
0
void Shape::AddToShape(int unichar_id, int font_id) {
104
0
  for (auto &unichar : unichars_) {
105
0
    if (unichar.unichar_id == unichar_id) {
106
      // Found the unichar in the shape table.
107
0
      std::vector<int> &font_list = unichar.font_ids;
108
0
      for (int f : font_list) {
109
0
        if (f == font_id) {
110
0
          return; // Font is already there.
111
0
        }
112
0
      }
113
0
      font_list.push_back(font_id);
114
0
      return;
115
0
    }
116
0
  }
117
  // Unichar_id is not in shape, so add it to shape.
118
0
  unichars_.emplace_back(unichar_id, font_id);
119
0
  unichars_sorted_ = unichars_.size() <= 1;
120
0
}
121
122
// Adds everything in other to this.
123
0
void Shape::AddShape(const Shape &other) {
124
0
  for (const auto &unichar : other.unichars_) {
125
0
    for (unsigned f = 0; f < unichar.font_ids.size(); ++f) {
126
0
      AddToShape(unichar.unichar_id, unichar.font_ids[f]);
127
0
    }
128
0
  }
129
0
  unichars_sorted_ = unichars_.size() <= 1;
130
0
}
131
132
// Returns true if the shape contains the given unichar_id, font_id pair.
133
0
bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
134
0
  for (const auto &unichar : unichars_) {
135
0
    if (unichar.unichar_id == unichar_id) {
136
      // Found the unichar, so look for the font.
137
0
      auto &font_list = unichar.font_ids;
138
0
      for (int f : font_list) {
139
0
        if (f == font_id) {
140
0
          return true;
141
0
        }
142
0
      }
143
0
      return false;
144
0
    }
145
0
  }
146
0
  return false;
147
0
}
148
149
// Returns true if the shape contains the given unichar_id, ignoring font.
150
0
bool Shape::ContainsUnichar(int unichar_id) const {
151
0
  for (const auto &unichar : unichars_) {
152
0
    if (unichar.unichar_id == unichar_id) {
153
0
      return true;
154
0
    }
155
0
  }
156
0
  return false;
157
0
}
158
159
// Returns true if the shape contains the given font, ignoring unichar_id.
160
0
bool Shape::ContainsFont(int font_id) const {
161
0
  for (const auto &unichar : unichars_) {
162
0
    auto &font_list = unichar.font_ids;
163
0
    for (int f : font_list) {
164
0
      if (f == font_id) {
165
0
        return true;
166
0
      }
167
0
    }
168
0
  }
169
0
  return false;
170
0
}
171
// Returns true if the shape contains the given font properties, ignoring
172
// unichar_id.
173
0
bool Shape::ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const {
174
0
  for (const auto &unichar : unichars_) {
175
0
    auto &font_list = unichar.font_ids;
176
0
    for (int f : font_list) {
177
0
      if (font_table.at(f).properties == properties) {
178
0
        return true;
179
0
      }
180
0
    }
181
0
  }
182
0
  return false;
183
0
}
184
// Returns true if the shape contains multiple different font properties,
185
// ignoring unichar_id.
186
0
bool Shape::ContainsMultipleFontProperties(const FontInfoTable &font_table) const {
187
0
  uint32_t properties = font_table.at(unichars_[0].font_ids[0]).properties;
188
0
  for (const auto &unichar : unichars_) {
189
0
    auto &font_list = unichar.font_ids;
190
0
    for (int f : font_list) {
191
0
      if (font_table.at(f).properties != properties) {
192
0
        return true;
193
0
      }
194
0
    }
195
0
  }
196
0
  return false;
197
0
}
198
199
// Returns true if this shape is equal to other (ignoring order of unichars
200
// and fonts).
201
0
bool Shape::operator==(const Shape &other) const {
202
0
  return IsSubsetOf(other) && other.IsSubsetOf(*this);
203
0
}
204
205
// Returns true if this is a subset (including equal) of other.
206
0
bool Shape::IsSubsetOf(const Shape &other) const {
207
0
  for (const auto &unichar : unichars_) {
208
0
    int unichar_id = unichar.unichar_id;
209
0
    const std::vector<int> &font_list = unichar.font_ids;
210
0
    for (int f : font_list) {
211
0
      if (!other.ContainsUnicharAndFont(unichar_id, f)) {
212
0
        return false;
213
0
      }
214
0
    }
215
0
  }
216
0
  return true;
217
0
}
218
219
// Returns true if the lists of unichar ids are the same in this and other,
220
// ignoring fonts.
221
// NOT const, as it will sort the unichars on demand.
222
0
bool Shape::IsEqualUnichars(Shape *other) {
223
0
  if (unichars_.size() != other->unichars_.size()) {
224
0
    return false;
225
0
  }
226
0
  if (!unichars_sorted_) {
227
0
    SortUnichars();
228
0
  }
229
0
  if (!other->unichars_sorted_) {
230
0
    other->SortUnichars();
231
0
  }
232
0
  for (unsigned c = 0; c < unichars_.size(); ++c) {
233
0
    if (unichars_[c].unichar_id != other->unichars_[c].unichar_id) {
234
0
      return false;
235
0
    }
236
0
  }
237
0
  return true;
238
0
}
239
240
// Sorts the unichars_ vector by unichar.
241
0
void Shape::SortUnichars() {
242
0
  std::sort(unichars_.begin(), unichars_.end(), UnicharAndFonts::StdSortByUnicharId);
243
0
  unichars_sorted_ = true;
244
0
}
245
246
0
ShapeTable::ShapeTable() : unicharset_(nullptr), num_fonts_(0) {}
247
4
ShapeTable::ShapeTable(const UNICHARSET &unicharset) : unicharset_(&unicharset), num_fonts_(0) {}
248
249
// Writes to the given file. Returns false in case of error.
250
0
bool ShapeTable::Serialize(FILE *fp) const {
251
0
  return tesseract::Serialize(fp, shape_table_);
252
0
}
253
// Reads from the given file. Returns false in case of error.
254
255
4
bool ShapeTable::DeSerialize(TFile *fp) {
256
4
  if (!fp->DeSerialize(shape_table_)) {
257
0
    return false;
258
0
  }
259
4
  num_fonts_ = 0;
260
4
  return true;
261
4
}
262
263
// Returns the number of fonts used in this ShapeTable, computing it if
264
// necessary.
265
0
int ShapeTable::NumFonts() const {
266
0
  if (num_fonts_ <= 0) {
267
0
    for (auto shape_id : shape_table_) {
268
0
      const Shape &shape = *shape_id;
269
0
      for (int c = 0; c < shape.size(); ++c) {
270
0
        for (int font_id : shape[c].font_ids) {
271
0
          if (font_id >= num_fonts_) {
272
0
            num_fonts_ = font_id + 1;
273
0
          }
274
0
        }
275
0
      }
276
0
    }
277
0
  }
278
0
  return num_fonts_;
279
0
}
280
281
// Re-indexes the class_ids in the shapetable according to the given map.
282
// Useful in conjunction with set_unicharset.
283
0
void ShapeTable::ReMapClassIds(const std::vector<int> &unicharset_map) {
284
0
  for (auto shape : shape_table_) {
285
0
    for (int c = 0; c < shape->size(); ++c) {
286
0
      shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]);
287
0
    }
288
0
  }
289
0
}
290
291
// Returns a string listing the classes/fonts in a shape.
292
0
std::string ShapeTable::DebugStr(unsigned shape_id) const {
293
0
  if (shape_id >= shape_table_.size()) {
294
0
    return "INVALID_UNICHAR_ID";
295
0
  }
296
0
  const Shape &shape = GetShape(shape_id);
297
0
  std::string result;
298
0
  result += "Shape" + std::to_string(shape_id);
299
0
  if (shape.size() > 100) {
300
0
    result += " Num unichars=" + std::to_string(shape.size());
301
0
    return result;
302
0
  }
303
0
  for (int c = 0; c < shape.size(); ++c) {
304
0
    result += " c_id=" + std::to_string(shape[c].unichar_id);
305
0
    result += "=";
306
0
    result += unicharset_->id_to_unichar(shape[c].unichar_id);
307
0
    if (shape.size() < 10) {
308
0
      result += ", " + std::to_string(shape[c].font_ids.size());
309
0
      result += " fonts =";
310
0
      int num_fonts = shape[c].font_ids.size();
311
0
      if (num_fonts > 10) {
312
0
        result += " " + std::to_string(shape[c].font_ids[0]);
313
0
        result += " ... " + std::to_string(shape[c].font_ids[num_fonts - 1]);
314
0
      } else {
315
0
        for (int f = 0; f < num_fonts; ++f) {
316
0
          result += " " + std::to_string(shape[c].font_ids[f]);
317
0
        }
318
0
      }
319
0
    }
320
0
  }
321
0
  return result;
322
0
}
323
324
// Returns a debug string summarizing the table.
325
0
std::string ShapeTable::SummaryStr() const {
326
0
  int max_unichars = 0;
327
0
  int num_multi_shapes = 0;
328
0
  int num_master_shapes = 0;
329
0
  for (unsigned s = 0; s < shape_table_.size(); ++s) {
330
0
    if (MasterDestinationIndex(s) != s) {
331
0
      continue;
332
0
    }
333
0
    ++num_master_shapes;
334
0
    int shape_size = GetShape(s).size();
335
0
    if (shape_size > 1) {
336
0
      ++num_multi_shapes;
337
0
    }
338
0
    if (shape_size > max_unichars) {
339
0
      max_unichars = shape_size;
340
0
    }
341
0
  }
342
0
  std::string result;
343
0
  result += "Number of shapes = " + std::to_string(num_master_shapes);
344
0
  result += " max unichars = " + std::to_string(max_unichars);
345
0
  result += " number with multiple unichars = " + std::to_string(num_multi_shapes);
346
0
  return result;
347
0
}
348
349
// Adds a new shape starting with the given unichar_id and font_id.
350
// Returns the assigned index.
351
0
unsigned ShapeTable::AddShape(int unichar_id, int font_id) {
352
0
  auto index = shape_table_.size();
353
0
  auto *shape = new Shape;
354
0
  shape->AddToShape(unichar_id, font_id);
355
0
  shape_table_.push_back(shape);
356
0
  num_fonts_ = std::max(num_fonts_, font_id + 1);
357
0
  return index;
358
0
}
359
360
// Adds a copy of the given shape unless it is already present.
361
// Returns the assigned index or index of existing shape if already present.
362
0
unsigned ShapeTable::AddShape(const Shape &other) {
363
0
  unsigned index;
364
0
  for (index = 0; index < shape_table_.size() && !(other == *shape_table_[index]); ++index) {
365
0
    continue;
366
0
  }
367
0
  if (index == shape_table_.size()) {
368
0
    auto *shape = new Shape(other);
369
0
    shape_table_.push_back(shape);
370
0
  }
371
0
  num_fonts_ = 0;
372
0
  return index;
373
0
}
374
375
// Removes the shape given by the shape index.
376
0
void ShapeTable::DeleteShape(unsigned shape_id) {
377
0
  delete shape_table_[shape_id];
378
0
  shape_table_.erase(shape_table_.begin() + shape_id);
379
0
}
380
381
// Adds a font_id to the given existing shape index for the given
382
// unichar_id. If the unichar_id is not in the shape, it is added.
383
0
void ShapeTable::AddToShape(unsigned shape_id, int unichar_id, int font_id) {
384
0
  Shape &shape = *shape_table_[shape_id];
385
0
  shape.AddToShape(unichar_id, font_id);
386
0
  num_fonts_ = std::max(num_fonts_, font_id + 1);
387
0
}
388
389
// Adds the given shape to the existing shape with the given index.
390
0
void ShapeTable::AddShapeToShape(unsigned shape_id, const Shape &other) {
391
0
  Shape &shape = *shape_table_[shape_id];
392
0
  shape.AddShape(other);
393
0
  num_fonts_ = 0;
394
0
}
395
396
// Returns the id of the shape that contains the given unichar and font.
397
// If not found, returns -1.
398
// If font_id < 0, the font_id is ignored and the first shape that matches
399
// the unichar_id is returned.
400
0
int ShapeTable::FindShape(int unichar_id, int font_id) const {
401
0
  for (unsigned s = 0; s < shape_table_.size(); ++s) {
402
0
    const Shape &shape = GetShape(s);
403
0
    for (int c = 0; c < shape.size(); ++c) {
404
0
      if (shape[c].unichar_id == unichar_id) {
405
0
        if (font_id < 0) {
406
0
          return s; // We don't care about the font.
407
0
        }
408
0
        for (int f : shape[c].font_ids) {
409
0
          if (f == font_id) {
410
0
            return s;
411
0
          }
412
0
        }
413
0
      }
414
0
    }
415
0
  }
416
0
  return -1;
417
0
}
418
419
// Returns the first unichar_id and font_id in the given shape.
420
0
void ShapeTable::GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const {
421
0
  const UnicharAndFonts &unichar_and_fonts = (*shape_table_[shape_id])[0];
422
0
  *unichar_id = unichar_and_fonts.unichar_id;
423
0
  *font_id = unichar_and_fonts.font_ids[0];
424
0
}
425
426
// Expands all the classes/fonts in the shape individually to build
427
// a ShapeTable.
428
0
int ShapeTable::BuildFromShape(const Shape &shape, const ShapeTable &master_shapes) {
429
0
  BitVector shape_map(master_shapes.NumShapes());
430
0
  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
431
0
    for (unsigned f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
432
0
      int c = shape[u_ind].unichar_id;
433
0
      int f = shape[u_ind].font_ids[f_ind];
434
0
      int master_id = master_shapes.FindShape(c, f);
435
0
      if (master_id >= 0) {
436
0
        shape_map.SetBit(master_id);
437
0
      } else if (FindShape(c, f) < 0) {
438
0
        AddShape(c, f);
439
0
      }
440
0
    }
441
0
  }
442
0
  int num_masters = 0;
443
0
  for (unsigned s = 0; s < master_shapes.NumShapes(); ++s) {
444
0
    if (shape_map[s]) {
445
0
      AddShape(master_shapes.GetShape(s));
446
0
      ++num_masters;
447
0
    }
448
0
  }
449
0
  return num_masters;
450
0
}
451
452
// Returns true if the shapes are already merged.
453
0
bool ShapeTable::AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const {
454
0
  return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
455
0
}
456
457
// Returns true if any shape contains multiple unichars.
458
0
bool ShapeTable::AnyMultipleUnichars() const {
459
0
  auto num_shapes = NumShapes();
460
0
  for (unsigned s1 = 0; s1 < num_shapes; ++s1) {
461
0
    if (MasterDestinationIndex(s1) != s1) {
462
0
      continue;
463
0
    }
464
0
    if (GetShape(s1).size() > 1) {
465
0
      return true;
466
0
    }
467
0
  }
468
0
  return false;
469
0
}
470
471
// Returns the maximum number of unichars over all shapes.
472
1.96M
int ShapeTable::MaxNumUnichars() const {
473
1.96M
  int max_num_unichars = 0;
474
1.96M
  int num_shapes = NumShapes();
475
6.91G
  for (int s = 0; s < num_shapes; ++s) {
476
6.90G
    if (GetShape(s).size() > max_num_unichars) {
477
1.96M
      max_num_unichars = GetShape(s).size();
478
1.96M
    }
479
6.90G
  }
480
1.96M
  return max_num_unichars;
481
1.96M
}
482
483
// Merges shapes with a common unichar over the [start, end) interval.
484
// Assumes single unichar per shape.
485
0
void ShapeTable::ForceFontMerges(unsigned start, unsigned end) {
486
0
  for (unsigned s1 = start; s1 < end; ++s1) {
487
0
    if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
488
0
      int unichar_id = GetShape(s1)[0].unichar_id;
489
0
      for (auto s2 = s1 + 1; s2 < end; ++s2) {
490
0
        if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
491
0
            unichar_id == GetShape(s2)[0].unichar_id) {
492
0
          MergeShapes(s1, s2);
493
0
        }
494
0
      }
495
0
    }
496
0
  }
497
0
  ShapeTable compacted(*unicharset_);
498
0
  compacted.AppendMasterShapes(*this, nullptr);
499
0
  *this = compacted;
500
0
}
501
502
// Returns the number of unichars in the master shape.
503
0
unsigned ShapeTable::MasterUnicharCount(unsigned shape_id) const {
504
0
  int master_id = MasterDestinationIndex(shape_id);
505
0
  return GetShape(master_id).size();
506
0
}
507
508
// Returns the sum of the font counts in the master shape.
509
0
int ShapeTable::MasterFontCount(unsigned shape_id) const {
510
0
  int master_id = MasterDestinationIndex(shape_id);
511
0
  const Shape &shape = GetShape(master_id);
512
0
  int font_count = 0;
513
0
  for (int c = 0; c < shape.size(); ++c) {
514
0
    font_count += shape[c].font_ids.size();
515
0
  }
516
0
  return font_count;
517
0
}
518
519
// Returns the number of unichars that would result from merging the shapes.
520
0
int ShapeTable::MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const {
521
  // Do it the easy way for now.
522
0
  int master_id1 = MasterDestinationIndex(shape_id1);
523
0
  int master_id2 = MasterDestinationIndex(shape_id2);
524
0
  Shape combined_shape(*shape_table_[master_id1]);
525
0
  combined_shape.AddShape(*shape_table_[master_id2]);
526
0
  return combined_shape.size();
527
0
}
528
529
// Merges two shape_ids, leaving shape_id2 marked as merged.
530
0
void ShapeTable::MergeShapes(unsigned shape_id1, unsigned shape_id2) {
531
0
  auto master_id1 = MasterDestinationIndex(shape_id1);
532
0
  auto master_id2 = MasterDestinationIndex(shape_id2);
533
  // Point master_id2 (and all merged shapes) to master_id1.
534
0
  shape_table_[master_id2]->set_destination_index(master_id1);
535
  // Add all the shapes of master_id2 to master_id1.
536
0
  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
537
0
}
538
539
// Swaps two shape_ids.
540
0
void ShapeTable::SwapShapes(unsigned shape_id1, unsigned shape_id2) {
541
0
  Shape *tmp = shape_table_[shape_id1];
542
0
  shape_table_[shape_id1] = shape_table_[shape_id2];
543
0
  shape_table_[shape_id2] = tmp;
544
0
}
545
546
// Returns the destination of this shape, (if merged), taking into account
547
// the fact that the destination may itself have been merged.
548
0
unsigned ShapeTable::MasterDestinationIndex(unsigned shape_id) const {
549
0
  auto dest_id = shape_table_[shape_id]->destination_index();
550
0
  if (static_cast<unsigned>(dest_id) == shape_id || dest_id < 0) {
551
0
    return shape_id; // Is master already.
552
0
  }
553
0
  auto master_id = shape_table_[dest_id]->destination_index();
554
0
  if (master_id == dest_id || master_id < 0) {
555
0
    return dest_id; // Dest is the master and shape_id points to it.
556
0
  }
557
0
  master_id = MasterDestinationIndex(master_id);
558
0
  return master_id;
559
0
}
560
561
// Returns false if the unichars in neither shape is a subset of the other.
562
0
bool ShapeTable::SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const {
563
0
  const Shape &shape1 = GetShape(shape_id1);
564
0
  const Shape &shape2 = GetShape(shape_id2);
565
0
  int c1, c2;
566
0
  for (c1 = 0; c1 < shape1.size(); ++c1) {
567
0
    int unichar_id1 = shape1[c1].unichar_id;
568
0
    if (!shape2.ContainsUnichar(unichar_id1)) {
569
0
      break;
570
0
    }
571
0
  }
572
0
  for (c2 = 0; c2 < shape2.size(); ++c2) {
573
0
    int unichar_id2 = shape2[c2].unichar_id;
574
0
    if (!shape1.ContainsUnichar(unichar_id2)) {
575
0
      break;
576
0
    }
577
0
  }
578
0
  return c1 == shape1.size() || c2 == shape2.size();
579
0
}
580
581
// Returns false if the unichars in neither shape is a subset of the other.
582
0
bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const {
583
0
  const Shape &merge1 = GetShape(merge_id1);
584
0
  const Shape &merge2 = GetShape(merge_id2);
585
0
  const Shape &shape = GetShape(shape_id);
586
0
  int cm1, cm2, cs;
587
0
  for (cs = 0; cs < shape.size(); ++cs) {
588
0
    int unichar_id = shape[cs].unichar_id;
589
0
    if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) {
590
0
      break; // Shape is not a subset of the merge.
591
0
    }
592
0
  }
593
0
  for (cm1 = 0; cm1 < merge1.size(); ++cm1) {
594
0
    int unichar_id1 = merge1[cm1].unichar_id;
595
0
    if (!shape.ContainsUnichar(unichar_id1)) {
596
0
      break; // Merge is not a subset of shape
597
0
    }
598
0
  }
599
0
  for (cm2 = 0; cm2 < merge2.size(); ++cm2) {
600
0
    int unichar_id2 = merge2[cm2].unichar_id;
601
0
    if (!shape.ContainsUnichar(unichar_id2)) {
602
0
      break; // Merge is not a subset of shape
603
0
    }
604
0
  }
605
0
  return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size());
606
0
}
607
608
// Returns true if the unichar sets are equal between the shapes.
609
0
bool ShapeTable::EqualUnichars(unsigned shape_id1, unsigned shape_id2) const {
610
0
  const Shape &shape1 = GetShape(shape_id1);
611
0
  const Shape &shape2 = GetShape(shape_id2);
612
0
  for (int c1 = 0; c1 < shape1.size(); ++c1) {
613
0
    int unichar_id1 = shape1[c1].unichar_id;
614
0
    if (!shape2.ContainsUnichar(unichar_id1)) {
615
0
      return false;
616
0
    }
617
0
  }
618
0
  for (int c2 = 0; c2 < shape2.size(); ++c2) {
619
0
    int unichar_id2 = shape2[c2].unichar_id;
620
0
    if (!shape1.ContainsUnichar(unichar_id2)) {
621
0
      return false;
622
0
    }
623
0
  }
624
0
  return true;
625
0
}
626
627
// Returns true if the unichar sets are equal between the shapes.
628
0
bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const {
629
0
  const Shape &merge1 = GetShape(merge_id1);
630
0
  const Shape &merge2 = GetShape(merge_id2);
631
0
  const Shape &shape = GetShape(shape_id);
632
0
  for (int cs = 0; cs < shape.size(); ++cs) {
633
0
    int unichar_id = shape[cs].unichar_id;
634
0
    if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) {
635
0
      return false; // Shape has a unichar that appears in neither merge.
636
0
    }
637
0
  }
638
0
  for (int cm1 = 0; cm1 < merge1.size(); ++cm1) {
639
0
    int unichar_id1 = merge1[cm1].unichar_id;
640
0
    if (!shape.ContainsUnichar(unichar_id1)) {
641
0
      return false; // Merge has a unichar that is not in shape.
642
0
    }
643
0
  }
644
0
  for (int cm2 = 0; cm2 < merge2.size(); ++cm2) {
645
0
    int unichar_id2 = merge2[cm2].unichar_id;
646
0
    if (!shape.ContainsUnichar(unichar_id2)) {
647
0
      return false; // Merge has a unichar that is not in shape.
648
0
    }
649
0
  }
650
0
  return true;
651
0
}
652
653
// Returns true if there is a common unichar between the shapes.
654
0
bool ShapeTable::CommonUnichars(unsigned shape_id1, unsigned shape_id2) const {
655
0
  const Shape &shape1 = GetShape(shape_id1);
656
0
  const Shape &shape2 = GetShape(shape_id2);
657
0
  for (int c1 = 0; c1 < shape1.size(); ++c1) {
658
0
    int unichar_id1 = shape1[c1].unichar_id;
659
0
    if (shape2.ContainsUnichar(unichar_id1)) {
660
0
      return true;
661
0
    }
662
0
  }
663
0
  return false;
664
0
}
665
666
// Returns true if there is a common font id between the shapes.
667
0
bool ShapeTable::CommonFont(unsigned shape_id1, unsigned shape_id2) const {
668
0
  const Shape &shape1 = GetShape(shape_id1);
669
0
  const Shape &shape2 = GetShape(shape_id2);
670
0
  for (int c1 = 0; c1 < shape1.size(); ++c1) {
671
0
    const std::vector<int> &font_list1 = shape1[c1].font_ids;
672
0
    for (int f : font_list1) {
673
0
      if (shape2.ContainsFont(f)) {
674
0
        return true;
675
0
      }
676
0
    }
677
0
  }
678
0
  return false;
679
0
}
680
681
// Appends the master shapes from other to this.
682
// If not nullptr, shape_map is set to map other shape_ids to this's shape_ids.
683
0
void ShapeTable::AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map) {
684
0
  if (shape_map != nullptr) {
685
0
    shape_map->clear();
686
0
    shape_map->resize(other.NumShapes(), -1);
687
0
  }
688
0
  for (unsigned s = 0; s < other.shape_table_.size(); ++s) {
689
0
    if (other.shape_table_[s]->destination_index() < 0) {
690
0
      int index = AddShape(*other.shape_table_[s]);
691
0
      if (shape_map != nullptr) {
692
0
        (*shape_map)[s] = index;
693
0
      }
694
0
    }
695
0
  }
696
0
}
697
698
// Returns the number of master shapes remaining after merging.
699
0
int ShapeTable::NumMasterShapes() const {
700
0
  int num_shapes = 0;
701
0
  for (auto s : shape_table_) {
702
0
    if (s->destination_index() < 0) {
703
0
      ++num_shapes;
704
0
    }
705
0
  }
706
0
  return num_shapes;
707
0
}
708
709
// Adds the unichars of the given shape_id to the vector of results. Any
710
// unichar_id that is already present just has the fonts added to the
711
// font set for that result without adding a new entry in the vector.
712
// NOTE: it is assumed that the results are given to this function in order
713
// of decreasing rating.
714
// The unichar_map vector indicates the index of the results entry containing
715
// each unichar, or -1 if the unichar is not yet included in results.
716
void ShapeTable::AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map,
717
0
                                   std::vector<UnicharRating> *results) const {
718
0
  if (shape_rating.joined) {
719
0
    AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map, results);
720
0
  }
721
0
  if (shape_rating.broken) {
722
0
    AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map, results);
723
0
  }
724
0
  const Shape &shape = GetShape(shape_rating.shape_id);
725
0
  for (int u = 0; u < shape.size(); ++u) {
726
0
    int result_index =
727
0
        AddUnicharToResults(shape[u].unichar_id, shape_rating.rating, unichar_map, results);
728
0
    for (int font_id : shape[u].font_ids) {
729
0
      (*results)[result_index].fonts.emplace_back(font_id,
730
0
                                                  IntCastRounded(shape_rating.rating * INT16_MAX));
731
0
    }
732
0
  }
733
0
}
734
735
// Adds the given unichar_id to the results if needed, updating unichar_map
736
// and returning the index of unichar in results.
737
int ShapeTable::AddUnicharToResults(int unichar_id, float rating, std::vector<int> *unichar_map,
738
0
                                    std::vector<UnicharRating> *results) const {
739
0
  int result_index = unichar_map->at(unichar_id);
740
0
  if (result_index < 0) {
741
0
    UnicharRating result(unichar_id, rating);
742
0
    result_index = results->size();
743
0
    results->push_back(result);
744
0
    (*unichar_map)[unichar_id] = result_index;
745
0
  }
746
0
  return result_index;
747
0
}
748
749
} // namespace tesseract