Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/lstm/networkio.cpp
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        networkio.cpp
3
// Description: Network input/output data, allowing float/int implementations.
4
// Author:      Ray Smith
5
//
6
// (C) Copyright 2014, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
///////////////////////////////////////////////////////////////////////
17
18
#include "networkio.h"
19
#include <cfloat> // for FLT_MAX
20
#include <cmath>
21
22
#include <allheaders.h>
23
#include "functions.h"
24
#include "statistc.h"
25
#include "tprintf.h"
26
27
namespace tesseract {
28
29
// Minimum value to output for certainty.
30
const float kMinCertainty = -20.0f;
31
// Probability corresponding to kMinCertainty.
32
const float kMinProb = std::exp(kMinCertainty);
33
34
// Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim.
35
0
void NetworkIO::Resize2d(bool int_mode, int width, int num_features) {
36
0
  stride_map_ = StrideMap();
37
0
  int_mode_ = int_mode;
38
0
  if (int_mode_) {
39
0
    i_.ResizeNoInit(width, num_features, GetPadding(num_features));
40
0
  } else {
41
0
    f_.ResizeNoInit(width, num_features);
42
0
  }
43
0
}
44
45
// Resizes to a specific stride_map.
46
3.19M
void NetworkIO::ResizeToMap(bool int_mode, const StrideMap &stride_map, int num_features) {
47
  // If this method crashes with this == nullptr,
48
  // it most likely got here through an uninitialized scratch element,
49
  // ie call NetworkScratch::IO::Resizexxx() not NetworkIO::Resizexxx()!!
50
3.19M
  stride_map_ = stride_map;
51
3.19M
  int_mode_ = int_mode;
52
3.19M
  if (int_mode_) {
53
3.00M
    i_.ResizeNoInit(stride_map.Width(), num_features, GetPadding(num_features));
54
3.00M
  } else {
55
187k
    f_.ResizeNoInit(stride_map.Width(), num_features);
56
187k
  }
57
3.19M
  ZeroInvalidElements();
58
3.19M
}
59
60
// Shrinks image size by x_scale,y_scale, and use given number of features.
61
187k
void NetworkIO::ResizeScaled(const NetworkIO &src, int x_scale, int y_scale, int num_features) {
62
187k
  StrideMap stride_map = src.stride_map_;
63
187k
  stride_map.ScaleXY(x_scale, y_scale);
64
187k
  ResizeToMap(src.int_mode_, stride_map, num_features);
65
187k
}
66
67
// Resizes to just 1 x-coord, whatever the input.
68
187k
void NetworkIO::ResizeXTo1(const NetworkIO &src, int num_features) {
69
187k
  StrideMap stride_map = src.stride_map_;
70
187k
  stride_map.ReduceWidthTo1();
71
187k
  ResizeToMap(src.int_mode_, stride_map, num_features);
72
187k
}
73
74
// Initialize all the array to zero.
75
0
void NetworkIO::Zero() {
76
0
  int width = Width();
77
  // Zero out the everything. Column-by-column in case it is aligned.
78
0
  for (int t = 0; t < width; ++t) {
79
0
    ZeroTimeStep(t);
80
0
  }
81
0
}
82
83
// Initializes to zero all elements of the array that do not correspond to
84
// valid image positions. (If a batch of different-sized images are packed
85
// together, then there will be padding pixels.)
86
3.56M
void NetworkIO::ZeroInvalidElements() {
  int num_features = NumFeatures();
  int full_width = stride_map_.Size(FD_WIDTH);
  int full_height = stride_map_.Size(FD_HEIGHT);
  // b_index iterates over batch elements only, staying at (x=0, y=0) of each.
  StrideMap::Index b_index(stride_map_);
  do {
    int end_x = b_index.MaxIndexOfDim(FD_WIDTH) + 1;
    if (end_x < full_width) {
      // This image is narrower than the stride map: zero the padding columns
      // [end_x, full_width) separately for each valid y row.
      StrideMap::Index y_index(b_index);
      int fill_size = num_features * (full_width - end_x);
      do {
        StrideMap::Index z_index(y_index);
        z_index.AddOffset(end_x, FD_WIDTH);
        if (int_mode_) {
          ZeroVector(fill_size, i_[z_index.t()]);
        } else {
          ZeroVector(fill_size, f_[z_index.t()]);
        }
      } while (y_index.AddOffset(1, FD_HEIGHT));
    }
    int end_y = b_index.MaxIndexOfDim(FD_HEIGHT) + 1;
    if (end_y < full_height) {
      // This image is shorter than the stride map: the padding rows
      // [end_y, full_height) are contiguous, so zero them in one call.
      StrideMap::Index y_index(b_index);
      y_index.AddOffset(end_y, FD_HEIGHT);
      int fill_size = num_features * full_width * (full_height - end_y);
      if (int_mode_) {
        ZeroVector(fill_size, i_[y_index.t()]);
      } else {
        ZeroVector(fill_size, f_[y_index.t()]);
      }
    }
  } while (b_index.AddOffset(1, FD_BATCH));
}
121
122
// Helper computes a black point and white point to contrast-enhance an image.
123
// The computation is based on the assumption that the image is of a single line
124
// of text, so a horizontal line through the middle of the image passes through
125
// at least some of it, so local minima and maxima are a good proxy for black
126
// and white pixel samples.
127
187k
static void ComputeBlackWhite(Image pix, float *black, float *white) {
  int width = pixGetWidth(pix);
  int height = pixGetHeight(pix);
  // Histograms of local minima (black samples) and maxima (white samples).
  STATS mins(0, 255), maxes(0, 255);
  if (width >= 3) {
    // Scan the single horizontal line through the middle of the image.
    int y = height / 2;
    l_uint32 *line = pixGetData(pix) + pixGetWpl(pix) * y;
    int prev = GET_DATA_BYTE(line, 0);
    int curr = GET_DATA_BYTE(line, 1);
    for (int x = 1; x + 1 < width; ++x) {
      int next = GET_DATA_BYTE(line, x + 1);
      // The asymmetric <=/< tests ensure a flat run of equal values counts as
      // at most one extremum.
      if ((curr < prev && curr <= next) || (curr <= prev && curr < next)) {
        // Local minimum.
        mins.add(curr, 1);
      }
      if ((curr > prev && curr >= next) || (curr >= prev && curr > next)) {
        // Local maximum.
        maxes.add(curr, 1);
      }
      prev = curr;
      curr = next;
    }
  }
  // Fall back to the full dynamic range if no extrema were found.
  if (mins.get_total() == 0) {
    mins.add(0, 1);
  }
  if (maxes.get_total() == 0) {
    maxes.add(255, 1);
  }
  // Use the lower quartile of the minima and the upper quartile of the maxima
  // as robust black/white estimates.
  *black = mins.ile(0.25);
  *white = maxes.ile(0.75);
}
159
160
// Sets up the array from the given image, using the currently set int_mode_.
161
// If the image width doesn't match the shape, the image is truncated or padded
162
// with noise to match.
163
187k
void NetworkIO::FromPix(const StaticShape &shape, const Image pix, TRand *randomizer) {
164
187k
  std::vector<Image> pixes(1, pix);
165
187k
  FromPixes(shape, pixes, randomizer);
166
187k
}
167
168
// Sets up the array from the given set of images, using the currently set
169
// int_mode_. If the image width doesn't match the shape, the images are
170
// truncated or padded with noise to match.
171
void NetworkIO::FromPixes(const StaticShape &shape, const std::vector<Image> &pixes,
                          TRand *randomizer) {
  int target_height = shape.height();
  int target_width = shape.width();
  // First pass: collect each image's (height, width), overridden by the fixed
  // shape dimensions where those are non-zero.
  std::vector<std::pair<int, int>> h_w_pairs;
  for (auto &&pix : pixes) {
    Image var_pix = pix;
    int width = pixGetWidth(var_pix);
    if (target_width != 0) {
      width = target_width;
    }
    int height = pixGetHeight(var_pix);
    if (target_height != 0) {
      height = target_height;
    }
    h_w_pairs.emplace_back(height, width);
  }
  stride_map_.SetStride(h_w_pairs);
  ResizeToMap(int_mode(), stride_map_, shape.depth());
  // Iterate over the images again to copy the data.
  for (size_t b = 0; b < pixes.size(); ++b) {
    Image pix = pixes[b];
    float black = 0.0f, white = 255.0f;
    // Color (depth 3) input keeps the full [0, 255] range; greyscale input is
    // contrast-enhanced using estimated black/white points.
    if (shape.depth() != 3) {
      ComputeBlackWhite(pix, &black, &white);
    }
    float contrast = (white - black) / 2.0f;
    if (contrast <= 0.0f) {
      // Degenerate (flat) image: avoid dividing by zero in SetPixel.
      contrast = 1.0f;
    }
    if (shape.height() == 1) {
      // Height-1 output treats the image as 1-d vertical pixel strips.
      Copy1DGreyImage(b, pix, black, contrast, randomizer);
    } else {
      Copy2DImage(b, pix, black, contrast, randomizer);
    }
  }
}
208
209
// Copies the given pix to *this at the given batch index, stretching and
210
// clipping the pixel values so that [black, black + 2*contrast] maps to the
211
// dynamic range of *this, ie [-1,1] for a float and (-127,127) for int.
212
// This is a 2-d operation in the sense that the output depth is the number
213
// of input channels, the height is the height of the image, and the width
214
// is the width of the image, or truncated/padded with noise if the width
215
// is a fixed size.
216
187k
void NetworkIO::Copy2DImage(int batch, Image pix, float black, float contrast, TRand *randomizer) {
  int width = pixGetWidth(pix);
  int height = pixGetHeight(pix);
  int wpl = pixGetWpl(pix);
  // Position the running timestep cursor t at the start of this batch element.
  StrideMap::Index index(stride_map_);
  index.AddOffset(batch, FD_BATCH);
  int t = index.t();
  int target_height = stride_map_.Size(FD_HEIGHT);
  int target_width = stride_map_.Size(FD_WIDTH);
  int num_features = NumFeatures();
  bool color = num_features == 3;
  if (width > target_width) {
    // Image is wider than the target: truncate on the right.
    width = target_width;
  }
  uint32_t *line = pixGetData(pix);
  for (int y = 0; y < target_height; ++y, line += wpl) {
    int x = 0;
    if (y < height) {
      for (x = 0; x < width; ++x, ++t) {
        if (color) {
          // One feature per RGB channel of the 32-bit pixel.
          int f = 0;
          for (int c = COLOR_RED; c <= COLOR_BLUE; ++c) {
            int pixel = GET_DATA_BYTE(line + x, c);
            SetPixel(t, f++, pixel, black, contrast);
          }
        } else {
          int pixel = GET_DATA_BYTE(line, x);
          SetPixel(t, 0, pixel, black, contrast);
        }
      }
    }
    // Pad the remaining width (the whole row for y >= height) with noise.
    for (; x < target_width; ++x) {
      Randomize(t++, 0, num_features, randomizer);
    }
  }
}
252
253
// Copies the given pix to *this at the given batch index, as Copy2DImage
254
// above, except that the output depth is the height of the input image, the
255
// output height is 1, and the output width as for Copy2DImage.
256
// The image is thus treated as a 1-d set of vertical pixel strips.
257
void NetworkIO::Copy1DGreyImage(int batch, Image pix, float black, float contrast,
                                TRand *randomizer) {
  int width = pixGetWidth(pix);
  int height = pixGetHeight(pix);
  // Each image column becomes one timestep with height features.
  ASSERT_HOST(height == NumFeatures());
  int wpl = pixGetWpl(pix);
  // Position the running timestep cursor t at the start of this batch element.
  StrideMap::Index index(stride_map_);
  index.AddOffset(batch, FD_BATCH);
  int t = index.t();
  int target_width = stride_map_.Size(FD_WIDTH);
  if (width > target_width) {
    // Image is wider than the target: truncate on the right.
    width = target_width;
  }
  int x;
  for (x = 0; x < width; ++x, ++t) {
    for (int y = 0; y < height; ++y) {
      uint32_t *line = pixGetData(pix) + wpl * y;
      int pixel = GET_DATA_BYTE(line, x);
      SetPixel(t, y, pixel, black, contrast);
    }
  }
  // Pad any remaining width with noise.
  for (; x < target_width; ++x) {
    Randomize(t++, 0, height, randomizer);
  }
}
282
283
// Helper stores the pixel value in i_ or f_ according to int_mode_.
284
// t: is the index from the StrideMap corresponding to the current
285
//   [batch,y,x] position
286
// f: is the index into the depth/channel
287
// pixel: the value of the pixel from the image (in one channel)
288
// black: the pixel value to map to the lowest of the range of *this
289
// contrast: the range of pixel values to stretch to half the range of *this.
290
284M
void NetworkIO::SetPixel(int t, int f, int pixel, float black, float contrast) {
291
284M
  float float_pixel = (pixel - black) / contrast - 1.0f;
292
284M
  if (int_mode_) {
293
284M
    i_[t][f] = ClipToRange<int>(IntCastRounded((INT8_MAX + 1) * float_pixel), -INT8_MAX, INT8_MAX);
294
284M
  } else {
295
0
    f_[t][f] = float_pixel;
296
0
  }
297
284M
}
298
299
// Converts the array to a Pix. Must be pixDestroyed after use.
300
0
Image NetworkIO::ToPix() const {
  // Count the width of the image, and find the max multiplication factor.
  int im_width = stride_map_.Size(FD_WIDTH);
  int im_height = stride_map_.Size(FD_HEIGHT);
  int num_features = NumFeatures();
  int feature_factor = 1;
  if (num_features == 3) {
    // Special hack for color: 3 features form one RGB pixel rather than 3
    // separate greyscale bands.
    num_features = 1;
    feature_factor = 3;
  }
  // Features are stacked vertically: one im_height band per output feature.
  Image pix = pixCreate(im_width, im_height * num_features, 32);
  StrideMap::Index index(stride_map_);
  do {
    int im_x = index.index(FD_WIDTH);
    int top_im_y = index.index(FD_HEIGHT);
    int im_y = top_im_y;
    int t = index.t();
    if (int_mode_) {
      const int8_t *features = i_[t];
      for (int y = 0; y < num_features; ++y, im_y += im_height) {
        int pixel = features[y * feature_factor];
        // 1 or 2 features use greyscale: shift signed int8 into [1, 255].
        int red = ClipToRange<int>(pixel + 128, 0, 255);
        int green = red, blue = red;
        if (feature_factor == 3) {
          // With 3 features assume RGB color.
          green = ClipToRange<int>(features[y * feature_factor + 1] + 128, 0, 255);
          blue = ClipToRange<int>(features[y * feature_factor + 2] + 128, 0, 255);
        } else if (num_features > 3) {
          // More than 3 features use false yellow/blue color, assuming a signed
          // input in the range [-1,1].
          red = abs(pixel) * 2;
          if (pixel >= 0) {
            green = red;
            blue = 0;
          } else {
            blue = red;
            green = red = 0;
          }
        }
        pixSetPixel(pix, im_x, im_y,
                    (red << L_RED_SHIFT) | (green << L_GREEN_SHIFT) | (blue << L_BLUE_SHIFT));
      }
    } else {
      const float *features = f_[t];
      for (int y = 0; y < num_features; ++y, im_y += im_height) {
        float pixel = features[y * feature_factor];
        // 1 or 2 features use greyscale: map [-1, 1] onto [0, 255].
        int red = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255);
        int green = red, blue = red;
        if (feature_factor == 3) {
          // With 3 features assume RGB color.
          pixel = features[y * feature_factor + 1];
          green = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255);
          pixel = features[y * feature_factor + 2];
          blue = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255);
        } else if (num_features > 3) {
          // More than 3 features use false yellow/blue color, assuming a signed
          // input in the range [-1,1].
          red = ClipToRange<int>(IntCastRounded(std::fabs(pixel) * 255), 0, 255);
          if (pixel >= 0) {
            green = red;
            blue = 0;
          } else {
            blue = red;
            green = red = 0;
          }
        }
        pixSetPixel(pix, im_x, im_y,
                    (red << L_RED_SHIFT) | (green << L_GREEN_SHIFT) | (blue << L_BLUE_SHIFT));
      }
    }
  } while (index.Increment());
  return pix;
}
376
377
// Prints the first and last num timesteps of the array for each feature.
378
0
void NetworkIO::Print(int num) const {
379
0
  int num_features = NumFeatures();
380
0
  for (int y = 0; y < num_features; ++y) {
381
0
    for (int t = 0; t < Width(); ++t) {
382
0
      if (num == 0 || t < num || t + num >= Width()) {
383
0
        if (int_mode_) {
384
0
          tprintf(" %g", static_cast<float>(i_[t][y]) / INT8_MAX);
385
0
        } else {
386
0
          tprintf(" %g", f_[t][y]);
387
0
        }
388
0
      }
389
0
    }
390
0
    tprintf("\n");
391
0
  }
392
0
}
393
394
// Copies a single time step from src.
395
69.5M
void NetworkIO::CopyTimeStepFrom(int dest_t, const NetworkIO &src, int src_t) {
396
69.5M
  ASSERT_HOST(int_mode_ == src.int_mode_);
397
69.5M
  if (int_mode_) {
398
69.5M
    memcpy(i_[dest_t], src.i_[src_t], i_.dim2() * sizeof(i_[0][0]));
399
69.5M
  } else {
400
0
    memcpy(f_[dest_t], src.f_[src_t], f_.dim2() * sizeof(f_[0][0]));
401
0
  }
402
69.5M
}
403
404
// Copies a part of single time step from src.
405
void NetworkIO::CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features,
406
2.51G
                                    const NetworkIO &src, int src_t, int src_offset) {
407
2.51G
  ASSERT_HOST(int_mode_ == src.int_mode_);
408
2.51G
  if (int_mode_) {
409
2.51G
    memcpy(i_[dest_t] + dest_offset, src.i_[src_t] + src_offset, num_features * sizeof(i_[0][0]));
410
2.51G
  } else {
411
0
    memcpy(f_[dest_t] + dest_offset, src.f_[src_t] + src_offset, num_features * sizeof(f_[0][0]));
412
0
  }
413
2.51G
}
414
415
// Sets the given range to random values.
416
60.2M
void NetworkIO::Randomize(int t, int offset, int num_features, TRand *randomizer) {
417
60.2M
  if (int_mode_) {
418
60.2M
    int8_t *line = i_[t] + offset;
419
147M
    for (int i = 0; i < num_features; ++i) {
420
87.2M
      line[i] = IntCastRounded(randomizer->SignedRand(INT8_MAX));
421
87.2M
    }
422
60.2M
  } else {
423
    // float mode.
424
0
    float *line = f_[t] + offset;
425
0
    for (int i = 0; i < num_features; ++i) {
426
0
      line[i] = randomizer->SignedRand(1.0);
427
0
    }
428
0
  }
429
60.2M
}
430
431
// Helper returns the label and score of the best choice over a range.
432
int NetworkIO::BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float *rating,
433
0
                                   float *certainty) const {
434
0
  if (t_end <= t_start) {
435
0
    return -1;
436
0
  }
437
0
  int max_char = -1;
438
0
  float min_score = 0.0f;
439
0
  for (int c = 0; c < NumFeatures(); ++c) {
440
0
    if (c == not_this || c == null_ch) {
441
0
      continue;
442
0
    }
443
0
    ScoresOverRange(t_start, t_end, c, null_ch, rating, certainty);
444
0
    if (max_char < 0 || *rating < min_score) {
445
0
      min_score = *rating;
446
0
      max_char = c;
447
0
    }
448
0
  }
449
0
  ScoresOverRange(t_start, t_end, max_char, null_ch, rating, certainty);
450
0
  return max_char;
451
0
}
452
453
// Helper returns the rating and certainty of the choice over a range in output.
454
void NetworkIO::ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float *rating,
                                float *certainty) const {
  ASSERT_HOST(!int_mode_);
  *rating = 0.0f;
  *certainty = 0.0f;
  if (t_end <= t_start || t_end <= 0) {
    return;
  }
  // Dynamic program over 3 path states through [t_start, t_end):
  //   [0] nulls only so far, [1] nulls then choice, [2] nulls, choice, nulls.
  // ratings accumulate negated log-probabilities (lower is better); certs
  // track the minimum (worst) log-probability on each path.
  // NOTE(review): state meanings inferred from the transitions below —
  // confirm against the CTC-style decoding in the caller.
  float ratings[3] = {0.0f, 0.0f, 0.0f};
  float certs[3] = {0.0f, 0.0f, 0.0f};
  for (int t = t_start; t < t_end; ++t) {
    const float *line = f_[t];
    float score = ProbToCertainty(line[choice]);
    float zero = ProbToCertainty(line[null_ch]);
    if (t == t_start) {
      // First step: state 1 must start with the choice; state 2 is not yet
      // reachable, so give it a sentinel-bad rating.
      ratings[2] = FLT_MAX;
      ratings[1] = -score;
      certs[1] = score;
    } else {
      // Allow each state to take over from the previous state if that path is
      // better. Iterate downwards so this step's transition uses last step's
      // values.
      for (int i = 2; i >= 1; --i) {
        if (ratings[i] > ratings[i - 1]) {
          ratings[i] = ratings[i - 1];
          certs[i] = certs[i - 1];
        }
      }
      // State 2 consumes a null at this step; state 1 consumes the choice.
      ratings[2] -= zero;
      if (zero < certs[2]) {
        certs[2] = zero;
      }
      ratings[1] -= score;
      if (score < certs[1]) {
        certs[1] = score;
      }
    }
    // State 0 always consumes a null.
    ratings[0] -= zero;
    if (zero < certs[0]) {
      certs[0] = zero;
    }
  }
  // The result must actually contain the choice, so only states 1 and 2
  // qualify; the rating is offset by the length of the range.
  int best_i = ratings[2] < ratings[1] ? 2 : 1;
  *rating = ratings[best_i] + t_end - t_start;
  *certainty = certs[best_i];
}
497
498
// Returns the index (label) of the best value at the given timestep,
499
// excluding not_this and not_that, and if not null, sets the score to the
500
// log of the corresponding value.
501
2.57M
int NetworkIO::BestLabel(int t, int not_this, int not_that, float *score) const {
502
2.57M
  ASSERT_HOST(!int_mode_);
503
2.57M
  int best_index = -1;
504
2.57M
  float best_score = -FLT_MAX;
505
2.57M
  const float *line = f_[t];
506
288M
  for (int i = 0; i < f_.dim2(); ++i) {
507
285M
    if (line[i] > best_score && i != not_this && i != not_that) {
508
12.0M
      best_score = line[i];
509
12.0M
      best_index = i;
510
12.0M
    }
511
285M
  }
512
2.57M
  if (score != nullptr) {
513
0
    *score = ProbToCertainty(best_score);
514
0
  }
515
2.57M
  return best_index;
516
2.57M
}
517
518
// Returns the best start position out of [start, end) (into which all labels
519
// must fit) to obtain the highest cumulative score for the given labels.
520
0
int NetworkIO::PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const {
521
0
  int length = labels.size();
522
0
  int last_start = end - length;
523
0
  int best_start = -1;
524
0
  TFloat best_score = 0;
525
0
  for (int s = start; s <= last_start; ++s) {
526
0
    TFloat score = ScoreOfLabels(labels, s);
527
0
    if (score > best_score || best_start < 0) {
528
0
      best_score = score;
529
0
      best_start = s;
530
0
    }
531
0
  }
532
0
  return best_start;
533
0
}
534
535
// Returns the cumulative score of the given labels starting at start, and
536
// using one label per time-step.
537
0
TFloat NetworkIO::ScoreOfLabels(const std::vector<int> &labels, int start) const {
  // Sum the output for each label, one label per consecutive timestep.
  TFloat total = 0;
  int t = start;
  for (int label : labels) {
    total += f_(t++, label);
  }
  return total;
}
545
546
// Helper function sets all the outputs for a single timestep, such that
547
// label has value ok_score, and the other labels share 1 - ok_score.
548
0
void NetworkIO::SetActivations(int t, int label, float ok_score) {
549
0
  ASSERT_HOST(!int_mode_);
550
0
  int num_classes = NumFeatures();
551
0
  float bad_score = (1.0f - ok_score) / (num_classes - 1);
552
0
  float *targets = f_[t];
553
0
  for (int i = 0; i < num_classes; ++i) {
554
0
    targets[i] = bad_score;
555
0
  }
556
0
  targets[label] = ok_score;
557
0
}
558
559
// Modifies the values, only if needed, so that the given label is
560
// the winner at the given time step t.
561
0
void NetworkIO::EnsureBestLabel(int t, int label) {
562
0
  ASSERT_HOST(!int_mode_);
563
0
  if (BestLabel(t, nullptr) != label) {
564
    // Output value needs enhancing. Third all the other elements and add the
565
    // remainder to best_label.
566
0
    int num_classes = NumFeatures();
567
0
    float *targets = f_[t];
568
0
    for (int c = 0; c < num_classes; ++c) {
569
0
      if (c == label) {
570
0
        targets[c] += (1.0 - targets[c]) * (2 / 3.0);
571
0
      } else {
572
0
        targets[c] /= 3.0;
573
0
      }
574
0
    }
575
0
  }
576
0
}
577
578
// Helper function converts prob to certainty taking the minimum into account.
579
/* static */
580
36.3M
float NetworkIO::ProbToCertainty(float prob) {
581
36.3M
  return prob > kMinProb ? std::log(prob) : kMinCertainty;
582
36.3M
}
583
584
// Returns true if there is any bad value that is suspiciously like a GT
585
// error. Assuming that *this is the difference(gradient) between target
586
// and forward output, returns true if there is a large negative value
587
// (correcting a very confident output) for which there is no corresponding
588
// positive value in an adjacent timestep for the same feature index. This
589
// allows the box-truthed samples to make fine adjustments to position while
590
// stopping other disagreements of confident output with ground truth.
591
0
bool NetworkIO::AnySuspiciousTruth(float confidence_thr) const {
592
0
  int num_features = NumFeatures();
593
0
  for (int t = 0; t < Width(); ++t) {
594
0
    const float *features = f_[t];
595
0
    for (int y = 0; y < num_features; ++y) {
596
0
      float grad = features[y];
597
0
      if (grad < -confidence_thr) {
598
        // Correcting strong output. Check for movement.
599
0
        if ((t == 0 || f_[t - 1][y] < confidence_thr / 2) &&
600
0
            (t + 1 == Width() || f_[t + 1][y] < confidence_thr / 2)) {
601
0
          return true; // No strong positive on either side.
602
0
        }
603
0
      }
604
0
    }
605
0
  }
606
0
  return false;
607
0
}
608
609
// Reads a single timestep to floats in the range [-1, 1].
610
0
void NetworkIO::ReadTimeStep(int t, TFloat *output) const {
611
0
  if (int_mode_) {
612
0
    const int8_t *line = i_[t];
613
0
    for (int i = 0; i < i_.dim2(); ++i) {
614
0
      output[i] = static_cast<TFloat>(line[i]) / INT8_MAX;
615
0
    }
616
0
  } else {
617
0
    const float *line = f_[t];
618
0
    for (int i = 0; i < f_.dim2(); ++i) {
619
0
      output[i] = static_cast<TFloat>(line[i]);
620
0
    }
621
0
  }
622
0
}
623
624
// Adds a single timestep to floats.
625
0
void NetworkIO::AddTimeStep(int t, TFloat *inout) const {
626
0
  int num_features = NumFeatures();
627
0
  if (int_mode_) {
628
0
    const int8_t *line = i_[t];
629
0
    for (int i = 0; i < num_features; ++i) {
630
0
      inout[i] += static_cast<TFloat>(line[i]) / INT8_MAX;
631
0
    }
632
0
  } else {
633
0
    const float *line = f_[t];
634
0
    for (int i = 0; i < num_features; ++i) {
635
0
      inout[i] += line[i];
636
0
    }
637
0
  }
638
0
}
639
640
// Adds part of a single timestep to floats.
641
0
void NetworkIO::AddTimeStepPart(int t, int offset, int num_features, float *inout) const {
642
0
  if (int_mode_) {
643
0
    const int8_t *line = i_[t] + offset;
644
0
    for (int i = 0; i < num_features; ++i) {
645
0
      inout[i] += static_cast<float>(line[i]) / INT8_MAX;
646
0
    }
647
0
  } else {
648
0
    const float *line = f_[t] + offset;
649
0
    for (int i = 0; i < num_features; ++i) {
650
0
      inout[i] += line[i];
651
0
    }
652
0
  }
653
0
}
654
655
// Writes a single timestep from floats in the range [-1, 1].
656
297M
void NetworkIO::WriteTimeStep(int t, const TFloat *input) {
657
297M
  WriteTimeStepPart(t, 0, NumFeatures(), input);
658
297M
}
659
660
// Writes a single timestep from floats in the range [-1, 1] writing only
661
// num_features elements of input to (*this)[t], starting at offset.
662
336M
void NetworkIO::WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input) {
663
336M
  if (int_mode_) {
664
333M
    int8_t *line = i_[t] + offset;
665
10.6G
    for (int i = 0; i < num_features; ++i) {
666
10.3G
      line[i] = ClipToRange<int>(IntCastRounded(input[i] * INT8_MAX), -INT8_MAX, INT8_MAX);
667
10.3G
    }
668
333M
  } else {
669
2.57M
    float *line = f_[t] + offset;
670
288M
    for (int i = 0; i < num_features; ++i) {
671
285M
      line[i] = static_cast<float>(input[i]);
672
285M
    }
673
2.57M
  }
674
336M
}
675
676
// Maxpools a single time step from src.
677
278M
void NetworkIO::MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line) {
678
278M
  ASSERT_HOST(int_mode_ == src.int_mode_);
679
278M
  if (int_mode_) {
680
278M
    int dim = i_.dim2();
681
278M
    int8_t *dest_line = i_[dest_t];
682
278M
    const int8_t *src_line = src.i_[src_t];
683
4.72G
    for (int i = 0; i < dim; ++i) {
684
4.45G
      if (dest_line[i] < src_line[i]) {
685
811M
        dest_line[i] = src_line[i];
686
811M
        max_line[i] = src_t;
687
811M
      }
688
4.45G
    }
689
278M
  } else {
690
0
    int dim = f_.dim2();
691
0
    float *dest_line = f_[dest_t];
692
0
    const float *src_line = src.f_[src_t];
693
0
    for (int i = 0; i < dim; ++i) {
694
0
      if (dest_line[i] < src_line[i]) {
695
0
        dest_line[i] = src_line[i];
696
0
        max_line[i] = src_t;
697
0
      }
698
0
    }
699
0
  }
700
278M
}
701
702
// Runs maxpool backward, using maxes to index timesteps in *this.
703
0
void NetworkIO::MaxpoolBackward(const NetworkIO &fwd, const GENERIC_2D_ARRAY<int> &maxes) {
704
0
  ASSERT_HOST(!int_mode_);
705
0
  Zero();
706
0
  StrideMap::Index index(fwd.stride_map_);
707
0
  do {
708
0
    int t = index.t();
709
0
    const int *max_line = maxes[t];
710
0
    const float *fwd_line = fwd.f_[t];
711
0
    int num_features = fwd.f_.dim2();
712
0
    for (int i = 0; i < num_features; ++i) {
713
0
      f_[max_line[i]][i] = fwd_line[i];
714
0
    }
715
0
  } while (index.Increment());
716
0
}
717
718
// Returns the min over time of the maxes over features of the outputs.
719
0
float NetworkIO::MinOfMaxes() const {
720
0
  float min_max = 0.0f;
721
0
  int width = Width();
722
0
  int num_features = NumFeatures();
723
0
  for (int t = 0; t < width; ++t) {
724
0
    float max_value = -FLT_MAX;
725
0
    if (int_mode_) {
726
0
      const int8_t *column = i_[t];
727
0
      for (int i = 0; i < num_features; ++i) {
728
0
        if (column[i] > max_value) {
729
0
          max_value = column[i];
730
0
        }
731
0
      }
732
0
    } else {
733
0
      const float *column = f_[t];
734
0
      for (int i = 0; i < num_features; ++i) {
735
0
        if (column[i] > max_value) {
736
0
          max_value = column[i];
737
0
        }
738
0
      }
739
0
    }
740
0
    if (t == 0 || max_value < min_max) {
741
0
      min_max = max_value;
742
0
    }
743
0
  }
744
0
  return min_max;
745
0
}
746
747
// Computes combined results for a combiner that chooses between an existing
748
// input and itself, with an additional output to indicate the choice.
749
0
void NetworkIO::CombineOutputs(const NetworkIO &base_output, const NetworkIO &combiner_output) {
  int no = base_output.NumFeatures();
  // The combiner produces the same outputs plus one extra: the weight to give
  // the base network's output.
  ASSERT_HOST(combiner_output.NumFeatures() == no + 1);
  Resize(base_output, no);
  int width = Width();
  if (int_mode_) {
    // Number of outputs from base and final result.
    for (int t = 0; t < width; ++t) {
      int8_t *out_line = i_[t];
      const int8_t *base_line = base_output.i_[t];
      const int8_t *comb_line = combiner_output.i_[t];
      // comb_line[no] is the base weight, stored scaled to int8.
      float base_weight = static_cast<float>(comb_line[no]) / INT8_MAX;
      float boost_weight = 1.0f - base_weight;
      for (int i = 0; i < no; ++i) {
        // Blend: base_weight * base + (1 - base_weight) * combiner.
        out_line[i] = IntCastRounded(base_line[i] * base_weight + comb_line[i] * boost_weight);
      }
    }
  } else {
    for (int t = 0; t < width; ++t) {
      float *out_line = f_[t];
      const float *base_line = base_output.f_[t];
      const float *comb_line = combiner_output.f_[t];
      float base_weight = comb_line[no];
      float boost_weight = 1.0f - base_weight;
      for (int i = 0; i < no; ++i) {
        out_line[i] = base_line[i] * base_weight + comb_line[i] * boost_weight;
      }
    }
  }
}
779
780
// Computes deltas for a combiner that chooses between 2 sets of inputs.
781
0
void NetworkIO::ComputeCombinerDeltas(const NetworkIO &fwd_deltas, const NetworkIO &base_output) {
782
0
  ASSERT_HOST(!int_mode_);
783
  // Compute the deltas for the combiner.
784
0
  int width = Width();
785
0
  int no = NumFeatures() - 1;
786
0
  ASSERT_HOST(fwd_deltas.NumFeatures() == no);
787
0
  ASSERT_HOST(base_output.NumFeatures() == no);
788
  // Number of outputs from base and final result.
789
0
  for (int t = 0; t < width; ++t) {
790
0
    const float *delta_line = fwd_deltas.f_[t];
791
0
    const float *base_line = base_output.f_[t];
792
0
    float *comb_line = f_[t];
793
0
    float base_weight = comb_line[no];
794
0
    float boost_weight = 1.0f - base_weight;
795
0
    float max_base_delta = 0.0;
796
0
    for (int i = 0; i < no; ++i) {
797
      // What did the combiner actually produce?
798
0
      float output = base_line[i] * base_weight + comb_line[i] * boost_weight;
799
      // Reconstruct the target from the delta.
800
0
      float comb_target = delta_line[i] + output;
801
0
      comb_line[i] = comb_target - comb_line[i];
802
0
      float base_delta = std::fabs(comb_target - base_line[i]);
803
0
      if (base_delta > max_base_delta) {
804
0
        max_base_delta = base_delta;
805
0
      }
806
0
    }
807
0
    if (max_base_delta >= 0.5) {
808
      // The base network got it wrong. The combiner should output the right
809
      // answer and 0 for the base network.
810
0
      comb_line[no] = 0.0 - base_weight;
811
0
    } else {
812
      // The base network was right. The combiner should flag that.
813
0
      for (int i = 0; i < no; ++i) {
814
        // All other targets are 0.
815
0
        if (comb_line[i] > 0.0) {
816
0
          comb_line[i] -= 1.0;
817
0
        }
818
0
      }
819
0
      comb_line[no] = 1.0 - base_weight;
820
0
    }
821
0
  }
822
0
}
823
824
// Copies the array checking that the types match.
825
0
void NetworkIO::CopyAll(const NetworkIO &src) {
826
0
  ASSERT_HOST(src.int_mode_ == int_mode_);
827
0
  f_ = src.f_;
828
0
}
829
830
// Checks that both are floats and adds the src array to *this.
831
0
void NetworkIO::AddAllToFloat(const NetworkIO &src) {
832
0
  ASSERT_HOST(!int_mode_);
833
0
  ASSERT_HOST(!src.int_mode_);
834
0
  f_ += src.f_;
835
0
}
836
837
// Subtracts the array from a float array. src must also be float.
838
0
void NetworkIO::SubtractAllFromFloat(const NetworkIO &src) {
839
0
  ASSERT_HOST(!int_mode_);
840
0
  ASSERT_HOST(!src.int_mode_);
841
0
  f_ -= src.f_;
842
0
}
843
844
// Copies src to *this, with maxabs normalization to match scale.
845
0
void NetworkIO::CopyWithNormalization(const NetworkIO &src, const NetworkIO &scale) {
846
0
  ASSERT_HOST(!int_mode_);
847
0
  ASSERT_HOST(!src.int_mode_);
848
0
  ASSERT_HOST(!scale.int_mode_);
849
0
  float src_max = src.f_.MaxAbs();
850
0
  ASSERT_HOST(std::isfinite(src_max));
851
0
  float scale_max = scale.f_.MaxAbs();
852
0
  ASSERT_HOST(std::isfinite(scale_max));
853
0
  if (src_max > 0.0f) {
854
0
    float factor = scale_max / src_max;
855
0
    for (int t = 0; t < src.Width(); ++t) {
856
0
      const float *src_ptr = src.f_[t];
857
0
      float *dest_ptr = f_[t];
858
0
      for (int i = 0; i < src.f_.dim2(); ++i) {
859
0
        dest_ptr[i] = src_ptr[i] * factor;
860
0
      }
861
0
    }
862
0
  } else {
863
0
    f_.Clear();
864
0
  }
865
0
}
866
867
// Copies src to *this with independent reversal of the y dimension.
868
0
void NetworkIO::CopyWithYReversal(const NetworkIO &src) {
869
0
  int num_features = src.NumFeatures();
870
0
  Resize(src, num_features);
871
0
  StrideMap::Index b_index(src.stride_map_);
872
0
  do {
873
0
    int width = b_index.MaxIndexOfDim(FD_WIDTH) + 1;
874
0
    StrideMap::Index fwd_index(b_index);
875
0
    StrideMap::Index rev_index(b_index);
876
0
    rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_HEIGHT), FD_HEIGHT);
877
0
    do {
878
0
      int fwd_t = fwd_index.t();
879
0
      int rev_t = rev_index.t();
880
0
      for (int x = 0; x < width; ++x) {
881
0
        CopyTimeStepFrom(rev_t++, src, fwd_t++);
882
0
      }
883
0
    } while (fwd_index.AddOffset(1, FD_HEIGHT) && rev_index.AddOffset(-1, FD_HEIGHT));
884
0
  } while (b_index.AddOffset(1, FD_BATCH));
885
0
}
886
887
// Copies src to *this with independent reversal of the x dimension.
888
375k
void NetworkIO::CopyWithXReversal(const NetworkIO &src) {
889
375k
  int num_features = src.NumFeatures();
890
375k
  Resize(src, num_features);
891
375k
  StrideMap::Index b_index(src.stride_map_);
892
375k
  do {
893
375k
    StrideMap::Index y_index(b_index);
894
375k
    do {
895
375k
      StrideMap::Index fwd_index(y_index);
896
375k
      StrideMap::Index rev_index(y_index);
897
375k
      rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_WIDTH), FD_WIDTH);
898
5.15M
      do {
899
5.15M
        CopyTimeStepFrom(rev_index.t(), src, fwd_index.t());
900
5.15M
      } while (fwd_index.AddOffset(1, FD_WIDTH) && rev_index.AddOffset(-1, FD_WIDTH));
901
375k
    } while (y_index.AddOffset(1, FD_HEIGHT));
902
375k
  } while (b_index.AddOffset(1, FD_BATCH));
903
375k
}
904
905
// Copies src to *this with independent transpose of the x and y dimensions.
906
375k
void NetworkIO::CopyWithXYTranspose(const NetworkIO &src) {
907
375k
  int num_features = src.NumFeatures();
908
375k
  stride_map_ = src.stride_map_;
909
375k
  stride_map_.TransposeXY();
910
375k
  ResizeToMap(src.int_mode(), stride_map_, num_features);
911
375k
  StrideMap::Index src_b_index(src.stride_map_);
912
375k
  StrideMap::Index dest_b_index(stride_map_);
913
375k
  do {
914
375k
    StrideMap::Index src_y_index(src_b_index);
915
375k
    StrideMap::Index dest_x_index(dest_b_index);
916
4.82M
    do {
917
4.82M
      StrideMap::Index src_x_index(src_y_index);
918
4.82M
      StrideMap::Index dest_y_index(dest_x_index);
919
33.4M
      do {
920
33.4M
        CopyTimeStepFrom(dest_y_index.t(), src, src_x_index.t());
921
33.4M
      } while (src_x_index.AddOffset(1, FD_WIDTH) && dest_y_index.AddOffset(1, FD_HEIGHT));
922
4.82M
    } while (src_y_index.AddOffset(1, FD_HEIGHT) && dest_x_index.AddOffset(1, FD_WIDTH));
923
375k
  } while (src_b_index.AddOffset(1, FD_BATCH) && dest_b_index.AddOffset(1, FD_BATCH));
924
375k
}
925
926
// Copies src to *this, at the given feature_offset, returning the total
927
// feature offset after the copy. Multiple calls will stack outputs from
928
// multiple sources in feature space.
929
0
int NetworkIO::CopyPacking(const NetworkIO &src, int feature_offset) {
930
0
  ASSERT_HOST(int_mode_ == src.int_mode_);
931
0
  int width = src.Width();
932
0
  ASSERT_HOST(width <= Width());
933
0
  int num_features = src.NumFeatures();
934
0
  ASSERT_HOST(num_features + feature_offset <= NumFeatures());
935
0
  if (int_mode_) {
936
0
    for (int t = 0; t < width; ++t) {
937
0
      memcpy(i_[t] + feature_offset, src.i_[t], num_features * sizeof(i_[t][0]));
938
0
    }
939
0
    for (int t = width; t < i_.dim1(); ++t) {
940
0
      memset(i_[t], 0, num_features * sizeof(i_[t][0]));
941
0
    }
942
0
  } else {
943
0
    for (int t = 0; t < width; ++t) {
944
0
      memcpy(f_[t] + feature_offset, src.f_[t], num_features * sizeof(f_[t][0]));
945
0
    }
946
0
    for (int t = width; t < f_.dim1(); ++t) {
947
0
      memset(f_[t], 0, num_features * sizeof(f_[t][0]));
948
0
    }
949
0
  }
950
0
  return num_features + feature_offset;
951
0
}
952
953
// Opposite of CopyPacking, fills *this with a part of src, starting at
954
// feature_offset, and picking num_features.
955
0
void NetworkIO::CopyUnpacking(const NetworkIO &src, int feature_offset, int num_features) {
956
0
  Resize(src, num_features);
957
0
  int width = src.Width();
958
0
  ASSERT_HOST(num_features + feature_offset <= src.NumFeatures());
959
0
  if (int_mode_) {
960
0
    for (int t = 0; t < width; ++t) {
961
0
      memcpy(i_[t], src.i_[t] + feature_offset, num_features * sizeof(i_[t][0]));
962
0
    }
963
0
  } else {
964
0
    for (int t = 0; t < width; ++t) {
965
0
      memcpy(f_[t], src.f_[t] + feature_offset, num_features * sizeof(f_[t][0]));
966
0
    }
967
0
  }
968
0
}
969
970
// Transposes the float part of *this into dest.
971
0
void NetworkIO::Transpose(TransposedArray *dest) const {
972
0
  int width = Width();
973
0
  dest->ResizeNoInit(NumFeatures(), width);
974
0
  for (int t = 0; t < width; ++t) {
975
0
    dest->WriteStrided(t, f_[t]);
976
0
  }
977
0
}
978
979
// Clips the content of a single time-step to +/-range.
980
0
void NetworkIO::ClipVector(int t, float range) {
981
0
  ASSERT_HOST(!int_mode_);
982
0
  float *v = f_[t];
983
0
  int dim = f_.dim2();
984
0
  for (int i = 0; i < dim; ++i) {
985
0
    v[i] = ClipToRange<float>(v[i], -range, range);
986
0
  }
987
0
}
988
989
// Returns the padding required for the given number of features in order
990
// for the SIMD operations to be safe.
991
/* static */
992
3.00M
int NetworkIO::GetPadding(int num_features) {
993
3.00M
  int padding = 0;
994
3.00M
  if (IntSimdMatrix::intSimdMatrix) {
995
3.00M
    padding = IntSimdMatrix::intSimdMatrix->RoundInputs(num_features) - num_features;
996
3.00M
  }
997
3.00M
  return padding;
998
3.00M
}
999
1000
} // namespace tesseract.