/src/tesseract/src/lstm/networkio.h
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: networkio.h |
3 | | // Description: Network input/output data, allowing float/int implementations. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2014, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | /////////////////////////////////////////////////////////////////////// |
17 | | |
18 | | #ifndef TESSERACT_LSTM_NETWORKIO_H_ |
19 | | #define TESSERACT_LSTM_NETWORKIO_H_ |
20 | | |
21 | | #include "helpers.h" |
22 | | #include "image.h" |
23 | | #include "static_shape.h" |
24 | | #include "stridemap.h" |
25 | | #include "weightmatrix.h" |
26 | | |
27 | | #include <cmath> |
28 | | #include <cstdio> |
29 | | #include <vector> |
30 | | |
31 | | struct Pix; |
32 | | |
33 | | namespace tesseract { |
34 | | |
35 | | // Class to contain all the input/output of a network, allowing for fixed or |
36 | | // variable-strided 2d to 1d mapping, and float or int8_t values. Provides |
37 | | // enough calculating functions to hide the detail of the implementation. |
38 | | class TESS_API NetworkIO { |
39 | | public: |
40 | 570k | NetworkIO() : int_mode_(false) {} |
41 | | // Resizes the array (and stride), avoiding realloc if possible, to the given |
42 | | // size from various size specs: |
43 | | // Same int_mode and stride map as src, but the given number of features. |
44 | 3.10M | void Resize(const NetworkIO &src, int num_features) { |
45 | 3.10M | ResizeToMap(src.int_mode(), src.stride_map(), num_features); |
46 | 3.10M | } |
47 | | // Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim. |
48 | | void Resize2d(bool int_mode, int width, int num_features); |
49 | | // Resizes forcing a float representation with the stridemap of src and the |
50 | | // given number of features. |
51 | 282k | void ResizeFloat(const NetworkIO &src, int num_features) { |
52 | 282k | ResizeToMap(false, src.stride_map(), num_features); |
53 | 282k | } |
54 | | // Resizes to a specific stride_map. |
55 | | void ResizeToMap(bool int_mode, const StrideMap &stride_map, int num_features); |
56 | | // Shrinks image size by x_scale,y_scale, and use given number of features. |
57 | | void ResizeScaled(const NetworkIO &src, int x_scale, int y_scale, int num_features); |
58 | | // Resizes to just 1 x-coord, whatever the input. |
59 | | void ResizeXTo1(const NetworkIO &src, int num_features); |
60 | | // Initializes the whole array to zero. |
61 | | void Zero(); |
62 | | // Initializes to zero all elements of the array that do not correspond to |
63 | | // valid image positions. (If a batch of different-sized images are packed |
64 | | // together, then there will be padding pixels.) |
65 | | void ZeroInvalidElements(); |
66 | | // Sets up the array from the given image, using the currently set int_mode_. |
67 | | // If the image width doesn't match the shape, the image is truncated or |
68 | | // padded with noise to match. |
69 | | void FromPix(const StaticShape &shape, const Image pix, TRand *randomizer); |
70 | | // Sets up the array from the given set of images, using the currently set |
71 | | // int_mode_. If the image width doesn't match the shape, the images are |
72 | | // truncated or padded with noise to match. |
73 | | void FromPixes(const StaticShape &shape, const std::vector<Image> &pixes, |
74 | | TRand *randomizer); |
75 | | // Copies the given pix to *this at the given batch index, stretching and |
76 | | // clipping the pixel values so that [black, black + 2*contrast] maps to the |
77 | | // dynamic range of *this, ie [-1,1] for a float and (-127,127) for int. |
78 | | // This is a 2-d operation in the sense that the output depth is the number |
79 | | // of input channels, the height is the height of the image, and the width |
80 | | // is the width of the image, or truncated/padded with noise if the width |
81 | | // is a fixed size. |
82 | | void Copy2DImage(int batch, Image pix, float black, float contrast, TRand *randomizer); |
83 | | // Copies the given pix to *this at the given batch index, as Copy2DImage |
84 | | // above, except that the output depth is the height of the input image, the |
85 | | // output height is 1, and the output width as for Copy2DImage. |
86 | | // The image is thus treated as a 1-d set of vertical pixel strips. |
87 | | void Copy1DGreyImage(int batch, Image pix, float black, float contrast, TRand *randomizer); |
88 | | // Helper stores the pixel value in i_ or f_ according to int_mode_. |
89 | | // t: is the index from the StrideMap corresponding to the current |
90 | | // [batch,y,x] position |
91 | | // f: is the index into the depth/channel |
92 | | // pixel: the value of the pixel from the image (in one channel) |
93 | | // black: the pixel value to map to the lowest of the range of *this |
94 | | // contrast: the range of pixel values to stretch to half the range of *this. |
95 | | void SetPixel(int t, int f, int pixel, float black, float contrast); |
96 | | // Converts the array to a Pix. Must be pixDestroyed after use. |
97 | | Image ToPix() const; |
98 | | // Prints the first and last num timesteps of the array for each feature. |
99 | | void Print(int num) const; |
100 | | |
101 | | // Returns the timestep width (dim1 of the array selected by int_mode_). |
102 | 6.87M | int Width() const { |
103 | 6.87M | return int_mode_ ? i_.dim1() : f_.dim1(); |
104 | 6.87M | } |
105 | | // Returns the number of features (dim2 of the array selected by int_mode_). |
106 | 395M | int NumFeatures() const { |
107 | 395M | return int_mode_ ? i_.dim2() : f_.dim2(); |
108 | 395M | } |
109 | | // Accessors to timestep t of the float (f) or int8_t (i) matrix. Each asserts that int_mode_ matches the array requested. |
110 | 0 | float *f(int t) { |
111 | 0 | ASSERT_HOST(!int_mode_); |
112 | 0 | return f_[t]; |
113 | 0 | } |
114 | 3.83M | const float *f(int t) const { |
115 | 3.83M | ASSERT_HOST(!int_mode_); |
116 | 3.83M | return f_[t]; |
117 | 3.83M | } |
118 | 574M | const int8_t *i(int t) const { |
119 | 574M | ASSERT_HOST(int_mode_); |
120 | 574M | return i_[t]; |
121 | 574M | } |
122 | 629M | bool int_mode() const { |
123 | 629M | return int_mode_; |
124 | 629M | } |
125 | 282k | void set_int_mode(bool is_quantized) { |
126 | 282k | int_mode_ = is_quantized; |
127 | 282k | } |
128 | 46.5M | const StrideMap &stride_map() const { |
129 | 46.5M | return stride_map_; |
130 | 46.5M | } |
131 | 0 | void set_stride_map(const StrideMap &map) { |
132 | 0 | stride_map_ = map; |
133 | 0 | } |
134 | 0 | const GENERIC_2D_ARRAY<float> &float_array() const { |
135 | 0 | return f_; |
136 | 0 | } |
137 | 0 | GENERIC_2D_ARRAY<float> *mutable_float_array() { |
138 | 0 | return &f_; |
139 | 0 | } |
140 | | |
141 | | // Copies a single time step from src. |
142 | | void CopyTimeStepFrom(int dest_t, const NetworkIO &src, int src_t); |
143 | | // Copies a part of single time step from src. |
144 | | void CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features, const NetworkIO &src, |
145 | | int src_t, int src_offset); |
146 | | // Zeroes the NumFeatures() values of timestep t in the array selected by int_mode_. |
147 | 0 | void ZeroTimeStep(int t) { |
148 | 0 | if (int_mode_) { |
149 | 0 | memset(i_[t], 0, sizeof(*i_[t]) * NumFeatures()); |
150 | 0 | } else { |
151 | 0 | memset(f_[t], 0, sizeof(*f_[t]) * NumFeatures()); |
152 | 0 | } |
153 | 0 | } |
154 | | // Sets the given range to random values. |
155 | | void Randomize(int t, int offset, int num_features, TRand *randomizer); |
156 | | |
157 | | // Helper returns the label and score of the best choice over a range. |
158 | | int BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float *rating, |
159 | | float *certainty) const; |
160 | | // Helper returns the rating and certainty of the choice over a range in t. |
161 | | void ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float *rating, |
162 | | float *certainty) const; |
163 | | // Returns the index (label) of the best value at the given timestep, |
164 | | // and if not null, sets the score to the log of the corresponding value. Forwards to the overload below with not_this = not_that = -1. |
165 | 3.33M | int BestLabel(int t, float *score) const { |
166 | 3.33M | return BestLabel(t, -1, -1, score); |
167 | 3.33M | } |
168 | | // Returns the index (label) of the best value at the given timestep, |
169 | | // excluding not_this and not_that, and if not null, sets the score to the |
170 | | // log of the corresponding value. |
171 | | int BestLabel(int t, int not_this, int not_that, float *score) const; |
172 | | // Returns the best start position out of range (into which both start and end |
173 | | // must fit) to obtain the highest cumulative score for the given labels. |
174 | | int PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const; |
175 | | // Returns the cumulative score of the given labels starting at start, and |
176 | | // using one label per time-step. |
177 | | TFloat ScoreOfLabels(const std::vector<int> &labels, int start) const; |
178 | | // Helper function sets all the outputs for a single timestep, such that |
179 | | // label has value ok_score, and the other labels share 1 - ok_score. |
180 | | // Assumes float mode. |
181 | | void SetActivations(int t, int label, float ok_score); |
182 | | // Modifies the values, only if needed, so that the given label is |
183 | | // the winner at the given time step t. |
184 | | // Assumes float mode. |
185 | | void EnsureBestLabel(int t, int label); |
186 | | // Helper function converts prob to certainty taking the minimum into account. |
187 | | static float ProbToCertainty(float prob); |
188 | | // Returns true if there is any bad value that is suspiciously like a GT |
189 | | // error. Assuming that *this is the difference(gradient) between target |
190 | | // and forward output, returns true if there is a large negative value |
191 | | // (correcting a very confident output) for which there is no corresponding |
192 | | // positive value in an adjacent timestep for the same feature index. This |
193 | | // allows the box-truthed samples to make fine adjustments to position while |
194 | | // stopping other disagreements of confident output with ground truth. |
195 | | bool AnySuspiciousTruth(float confidence_thr) const; |
196 | | |
197 | | // Reads a single timestep to floats in the range [-1, 1]. |
198 | | void ReadTimeStep(int t, TFloat *output) const; |
199 | | // Adds a single timestep to floats. |
200 | | void AddTimeStep(int t, TFloat *inout) const; |
201 | | // Adds part of a single timestep to floats. |
202 | | void AddTimeStepPart(int t, int offset, int num_features, float *inout) const; |
203 | | // Writes a single timestep from floats in the range [-1, 1]. |
204 | | void WriteTimeStep(int t, const TFloat *input); |
205 | | // Writes a single timestep from floats in the range [-1, 1] writing only |
206 | | // num_features elements of input to (*this)[t], starting at offset. |
207 | | void WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input); |
208 | | // Maxpools a single time step from src. |
209 | | void MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line); |
210 | | // Runs maxpool backward, using maxes to index timesteps in *this. |
211 | | void MaxpoolBackward(const NetworkIO &fwd, const GENERIC_2D_ARRAY<int> &maxes); |
212 | | // Returns the min over time of the maxes over features of the outputs. |
213 | | float MinOfMaxes() const; |
214 | | // Returns the maximum value in the array selected by int_mode_. |
215 | 0 | float Max() const { |
216 | 0 | return int_mode_ ? i_.Max() : f_.Max(); |
217 | 0 | } |
218 | | // Computes combined results for a combiner that chooses between an existing |
219 | | // input and itself, with an additional output to indicate the choice. |
220 | | void CombineOutputs(const NetworkIO &base_output, const NetworkIO &combiner_output); |
221 | | // Computes deltas for a combiner that chooses between 2 sets of inputs. |
222 | | void ComputeCombinerDeltas(const NetworkIO &fwd_deltas, const NetworkIO &base_output); |
223 | | |
224 | | // Copies the array checking that the types match. |
225 | | void CopyAll(const NetworkIO &src); |
226 | | // Adds the array to a float array, with scaling to [-1, 1] if the src is int. |
227 | | void AddAllToFloat(const NetworkIO &src); |
228 | | // Subtracts the array from a float array. src must also be float. |
229 | | void SubtractAllFromFloat(const NetworkIO &src); |
230 | | |
231 | | // Copies src to *this, with maxabs normalization to match scale. |
232 | | void CopyWithNormalization(const NetworkIO &src, const NetworkIO &scale); |
233 | | // Multiplies the float data by the given factor; the int8_t data is not touched. |
234 | 0 | void ScaleFloatBy(float factor) { |
235 | 0 | f_ *= factor; |
236 | 0 | } |
237 | | // Copies src to *this with independent reversal of the y dimension. |
238 | | void CopyWithYReversal(const NetworkIO &src); |
239 | | // Copies src to *this with independent reversal of the x dimension. |
240 | | void CopyWithXReversal(const NetworkIO &src); |
241 | | // Copies src to *this with independent transpose of the x and y dimensions. |
242 | | void CopyWithXYTranspose(const NetworkIO &src); |
243 | | // Copies src to *this, at the given feature_offset, returning the total |
244 | | // feature offset after the copy. Multiple calls will stack outputs from |
245 | | // multiple sources in feature space. |
246 | | int CopyPacking(const NetworkIO &src, int feature_offset); |
247 | | // Opposite of CopyPacking, fills *this with a part of src, starting at |
248 | | // feature_offset, and picking num_features. Resizes *this to match. |
249 | | void CopyUnpacking(const NetworkIO &src, int feature_offset, int num_features); |
250 | | // Transposes the float part of *this into dest. |
251 | | void Transpose(TransposedArray *dest) const; |
252 | | |
253 | | // Clips the content of a single time-step to +/-range. |
254 | | void ClipVector(int t, float range); |
255 | | |
256 | | // Applies Func to timestep t of *this (u) and multiplies the result by v |
257 | | // component-wise, putting the product in *product. |
258 | | // NOTE(review): both *this and v are asserted to be float, so the int8_t branch below is unreachable as written. The outputs are TFloat. |
259 | | template <class Func> |
260 | 0 | void FuncMultiply(const NetworkIO &v_io, int t, TFloat *product) { |
261 | 0 | Func f; |
262 | 0 | ASSERT_HOST(!int_mode_); |
263 | 0 | ASSERT_HOST(!v_io.int_mode_); |
264 | 0 | int dim = f_.dim2(); |
265 | 0 | if (int_mode_) { |
266 | 0 | const int8_t *u = i_[t]; |
267 | 0 | const int8_t *v = v_io.i_[t]; |
268 | 0 | for (int i = 0; i < dim; ++i) { |
269 | 0 | product[i] = f(u[i] / static_cast<TFloat>(INT8_MAX)) * v[i] / INT8_MAX; |
270 | 0 | } |
271 | 0 | } else { |
272 | 0 | const float *u = f_[t]; |
273 | 0 | const float *v = v_io.f_[t]; |
274 | 0 | for (int i = 0; i < dim; ++i) { |
275 | 0 | product[i] = f(u[i]) * v[i]; |
276 | 0 | } |
277 | 0 | } |
278 | 0 | } Unexecuted instantiation: void tesseract::NetworkIO::FuncMultiply<tesseract::GPrime>(tesseract::NetworkIO const&, int, float*) Unexecuted instantiation: void tesseract::NetworkIO::FuncMultiply<tesseract::FPrime>(tesseract::NetworkIO const&, int, float*) Unexecuted instantiation: void tesseract::NetworkIO::FuncMultiply<tesseract::ClipFPrime>(tesseract::NetworkIO const&, int, float*) Unexecuted instantiation: void tesseract::NetworkIO::FuncMultiply<tesseract::ClipGPrime>(tesseract::NetworkIO const&, int, float*) Unexecuted instantiation: void tesseract::NetworkIO::FuncMultiply<tesseract::ReluPrime>(tesseract::NetworkIO const&, int, float*) |
279 | | // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w, |
280 | | // component-wise, putting the product in *product. |
281 | | // All NetworkIOs must be float (asserted). |
282 | | template <class Func> |
283 | | void FuncMultiply3(int u_t, const NetworkIO &v_io, int v_t, const TFloat *w, |
284 | 0 | TFloat *product) const { |
285 | 0 | ASSERT_HOST(!int_mode_); |
286 | 0 | ASSERT_HOST(!v_io.int_mode_); |
287 | 0 | Func f; |
288 | 0 | const float *u = f_[u_t]; |
289 | 0 | const float *v = v_io.f_[v_t]; |
290 | 0 | int dim = f_.dim2(); |
291 | 0 | for (int i = 0; i < dim; ++i) { |
292 | 0 | product[i] = f(u[i]) * v[i] * w[i]; |
293 | 0 | } |
294 | 0 | } Unexecuted instantiation: void tesseract::NetworkIO::FuncMultiply3<tesseract::GPrime>(int, tesseract::NetworkIO const&, int, float const*, float*) const Unexecuted instantiation: void tesseract::NetworkIO::FuncMultiply3<tesseract::FPrime>(int, tesseract::NetworkIO const&, int, float const*, float*) const |
295 | | // Applies Func to *this (u) at t, and multiplies the result by v[t] * w, |
296 | | // component-wise, adding the product to *product. |
297 | | // All NetworkIOs must be float (asserted). |
298 | | template <class Func> |
299 | 0 | void FuncMultiply3Add(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const { |
300 | 0 | ASSERT_HOST(!int_mode_); |
301 | 0 | ASSERT_HOST(!v_io.int_mode_); |
302 | 0 | Func f; |
303 | 0 | const float *u = f_[t]; |
304 | 0 | const float *v = v_io.f_[t]; |
305 | 0 | int dim = f_.dim2(); |
306 | 0 | for (int i = 0; i < dim; ++i) { |
307 | 0 | product[i] += f(u[i]) * v[i] * w[i]; |
308 | 0 | } |
309 | 0 | } |
310 | | // Applies Func1 to *this (u), Func2 to v, and multiplies the result by w, |
311 | | // component-wise, putting the product in product, all at timestep t, except |
312 | | // w, which is a simple array. All NetworkIOs must be float (asserted). |
313 | | template <class Func1, class Func2> |
314 | 0 | void Func2Multiply3(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const { |
315 | 0 | ASSERT_HOST(!int_mode_); |
316 | 0 | ASSERT_HOST(!v_io.int_mode_); |
317 | 0 | Func1 f; |
318 | 0 | Func2 g; |
319 | 0 | const float *u = f_[t]; |
320 | 0 | const float *v = v_io.f_[t]; |
321 | 0 | int dim = f_.dim2(); |
322 | 0 | for (int i = 0; i < dim; ++i) { |
323 | 0 | product[i] = f(u[i]) * g(v[i]) * w[i]; |
324 | 0 | } |
325 | 0 | } |
326 | | |
327 | | private: |
328 | | // Returns the padding required for the given number of features in order |
329 | | // for the SIMD operations to be safe. |
330 | | static int GetPadding(int num_features); |
331 | | |
332 | | // Choice of float vs 8 bit int for data. |
333 | | GENERIC_2D_ARRAY<float> f_; |
334 | | GENERIC_2D_ARRAY<int8_t> i_; |
335 | | // Which of f_ and i_ are we actually using. |
336 | | bool int_mode_; |
337 | | // Stride for 2d input data. |
338 | | StrideMap stride_map_; |
339 | | }; |
340 | | |
341 | | } // namespace tesseract. |
342 | | |
343 | | #endif // TESSERACT_LSTM_NETWORKIO_H_ |