/src/tesseract/src/ccmain/thresholder.cpp
Line | Count | Source |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: thresholder.cpp |
3 | | // Description: Base API for thresholding images in tesseract. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2008, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | // Include automatically generated configuration file |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" |
22 | | #endif |
23 | | |
24 | | #include "otsuthr.h" |
25 | | #include "thresholder.h" |
26 | | #include "tprintf.h" // for tprintf |
27 | | |
28 | | #include <allheaders.h> |
29 | | #include <tesseract/baseapi.h> // for api->GetIntVariable() |
30 | | |
31 | | #include <algorithm> // for std::max, std::min |
32 | | #include <cstdint> // for uint32_t |
33 | | #include <cstring> |
34 | | #include <tuple> |
35 | | |
36 | | namespace tesseract { |
37 | | |
38 | | ImageThresholder::ImageThresholder() |
39 | 2 | : pix_(nullptr) |
40 | 2 | , image_width_(0) |
41 | 2 | , image_height_(0) |
42 | 2 | , pix_channels_(0) |
43 | 2 | , pix_wpl_(0) |
44 | 2 | , scale_(1) |
45 | 2 | , yres_(300) |
46 | 2 | , estimated_res_(300) { |
47 | 2 | SetRectangle(0, 0, 0, 0); |
48 | 2 | } |
49 | | |
50 | 0 | ImageThresholder::~ImageThresholder() { |
51 | 0 | Clear(); |
52 | 0 | } |
53 | | |
54 | | // Destroy the Pix if there is one, freeing memory. |
55 | 0 | void ImageThresholder::Clear() { |
56 | 0 | pix_.destroy(); |
57 | 0 | } |
58 | | |
59 | | // Return true if no image has been set. |
60 | 16.2k | bool ImageThresholder::IsEmpty() const { |
61 | 16.2k | return pix_ == nullptr; |
62 | 16.2k | } |
63 | | |
64 | | // SetImage makes a copy of all the image data, so it may be deleted |
65 | | // immediately after this call. |
66 | | // Greyscale of 8 and color of 24 or 32 bits per pixel may be given. |
67 | | // Palette color images will not work properly and must be converted to |
68 | | // 24 bit. |
69 | | // Binary images of 1 bit per pixel may also be given but they must be |
70 | | // byte packed with the MSB of the first byte being the first pixel, and a |
71 | | // one pixel is WHITE. For binary images set bytes_per_pixel=0. |
72 | | void ImageThresholder::SetImage(const unsigned char *imagedata, int width, int height, |
73 | 0 | int bytes_per_pixel, int bytes_per_line) { |
74 | 0 | int bpp = bytes_per_pixel * 8; |
75 | 0 | if (bpp == 0) { |
76 | 0 | bpp = 1; |
77 | 0 | } |
78 | 0 | Image pix = pixCreate(width, height, bpp == 24 ? 32 : bpp); |
79 | 0 | l_uint32 *data = pixGetData(pix); |
80 | 0 | int wpl = pixGetWpl(pix); |
81 | 0 | switch (bpp) { |
82 | 0 | case 1: |
83 | 0 | for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) { |
84 | 0 | for (int x = 0; x < width; ++x) { |
85 | 0 | if (imagedata[x / 8] & (0x80 >> (x % 8))) { |
86 | 0 | CLEAR_DATA_BIT(data, x); |
87 | 0 | } else { |
88 | 0 | SET_DATA_BIT(data, x); |
89 | 0 | } |
90 | 0 | } |
91 | 0 | } |
92 | 0 | break; |
93 | | |
94 | 0 | case 8: |
95 | | // Greyscale just copies the bytes in the right order. |
96 | 0 | for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) { |
97 | 0 | for (int x = 0; x < width; ++x) { |
98 | 0 | SET_DATA_BYTE(data, x, imagedata[x]); |
99 | 0 | } |
100 | 0 | } |
101 | 0 | break; |
102 | | |
103 | 0 | case 24: |
104 | | // Put the colors in the correct places in the line buffer. |
105 | 0 | for (int y = 0; y < height; ++y, imagedata += bytes_per_line) { |
106 | 0 | for (int x = 0; x < width; ++x, ++data) { |
107 | 0 | SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]); |
108 | 0 | SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]); |
109 | 0 | SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]); |
110 | 0 | } |
111 | 0 | } |
112 | 0 | break; |
113 | | |
114 | 0 | case 32: |
115 | | // Maintain byte order consistency across different endianness. |
116 | 0 | for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) { |
117 | 0 | for (int x = 0; x < width; ++x) { |
118 | 0 | data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) | |
119 | 0 | (imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3]; |
120 | 0 | } |
121 | 0 | } |
122 | 0 | break; |
123 | | |
124 | 0 | default: |
125 | 0 | tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp); |
126 | 0 | } |
127 | 0 | SetImage(pix); |
128 | 0 | pix.destroy(); |
129 | 0 | } |
130 | | |
131 | | // Store the coordinates of the rectangle to process for later use. |
132 | | // Doesn't actually do any thresholding. |
133 | 16.2k | void ImageThresholder::SetRectangle(int left, int top, int width, int height) { |
134 | 16.2k | rect_left_ = left; |
135 | 16.2k | rect_top_ = top; |
136 | 16.2k | rect_width_ = width; |
137 | 16.2k | rect_height_ = height; |
138 | 16.2k | } |
139 | | |
140 | | // Get enough parameters to be able to rebuild bounding boxes in the |
141 | | // original image (not just within the rectangle). |
142 | | // Left and top are enough with top-down coordinates, but |
143 | | // the height of the rectangle and the image are needed for bottom-up. |
144 | | void ImageThresholder::GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth, |
145 | 16.2k | int *imageheight) { |
146 | 16.2k | *left = rect_left_; |
147 | 16.2k | *top = rect_top_; |
148 | 16.2k | *width = rect_width_; |
149 | 16.2k | *height = rect_height_; |
150 | 16.2k | *imagewidth = image_width_; |
151 | 16.2k | *imageheight = image_height_; |
152 | 16.2k | } |
153 | | |
154 | | // Pix vs raw, which to use? Pix is the preferred input for efficiency, |
155 | | // since raw buffers are copied. |
156 | | // SetImage for Pix clones its input, so the source pix may be pixDestroyed |
157 | | // immediately after, but may not go away until after the Thresholder has |
158 | | // finished with it. |
159 | 16.2k | void ImageThresholder::SetImage(const Image pix) { |
160 | 16.2k | if (pix_ != nullptr) { |
161 | 16.2k | pix_.destroy(); |
162 | 16.2k | } |
163 | 16.2k | Image src = pix; |
164 | 16.2k | int depth; |
165 | 16.2k | pixGetDimensions(src, &image_width_, &image_height_, &depth); |
166 | | // Convert the image as necessary so it is one of binary, plain RGB, or |
167 | | // 8 bit with no colormap. Guarantee that we always end up with our own copy, |
168 | | // not just a clone of the input. |
169 | 16.2k | if (depth > 1 && depth < 8) { |
170 | 0 | pix_ = pixConvertTo8(src, false); |
171 | 16.2k | } else { |
172 | 16.2k | pix_ = src.copy(); |
173 | 16.2k | } |
174 | 16.2k | depth = pixGetDepth(pix_); |
175 | 16.2k | pix_channels_ = depth / 8; |
176 | 16.2k | pix_wpl_ = pixGetWpl(pix_); |
177 | 16.2k | scale_ = 1; |
178 | 16.2k | estimated_res_ = yres_ = pixGetYRes(pix_); |
179 | 16.2k | Init(); |
180 | 16.2k | } |
181 | | |
182 | | std::tuple<bool, Image, Image, Image> ImageThresholder::Threshold( |
183 | | TessBaseAPI *api, |
184 | 0 | ThresholdMethod method) { |
185 | 0 | Image pix_binary = nullptr; |
186 | 0 | Image pix_thresholds = nullptr; |
187 | |
|
188 | 0 | if (pix_channels_ == 0) { |
189 | | // We have a binary image, but it still has to be copied, as this API |
190 | | // allows the caller to modify the output. |
191 | 0 | Image original = GetPixRect(); |
192 | 0 | pix_binary = original.copy(); |
193 | 0 | original.destroy(); |
194 | 0 | return std::make_tuple(true, nullptr, pix_binary, nullptr); |
195 | 0 | } |
196 | | |
197 | 0 | auto pix_grey = GetPixRectGrey(); |
198 | |
|
199 | 0 | int r; |
200 | |
|
201 | 0 | l_int32 pix_w, pix_h; |
202 | 0 | pixGetDimensions(pix_grey, &pix_w, &pix_h, nullptr); |
203 | |
|
204 | 0 | bool thresholding_debug; |
205 | 0 | api->GetBoolVariable("thresholding_debug", &thresholding_debug); |
206 | 0 | if (thresholding_debug) { |
207 | 0 | tprintf("\nimage width: %d height: %d ppi: %d\n", pix_w, pix_h, yres_); |
208 | 0 | } |
209 | |
|
210 | 0 | if (method == ThresholdMethod::Sauvola && pix_w > 6 && pix_h > 6) { |
211 | | // pixSauvolaBinarizeTiled requires half_window_size >= 2. |
212 | | // Therefore window_size must be at least 4 which requires |
213 | | // pix_w and pix_h to be at least 7. |
214 | 0 | int window_size; |
215 | 0 | double window_size_factor; |
216 | 0 | api->GetDoubleVariable("thresholding_window_size", &window_size_factor); |
217 | 0 | window_size = window_size_factor * yres_; |
218 | 0 | window_size = std::max(7, window_size); |
219 | 0 | window_size = std::min(pix_w < pix_h ? pix_w - 3 : pix_h - 3, window_size); |
220 | 0 | int half_window_size = window_size / 2; |
221 | | |
222 | | // factor for image division into tiles; >= 1 |
223 | 0 | l_int32 nx, ny; |
224 | | // tiles size will be approx. 250 x 250 pixels |
225 | 0 | nx = std::max(1, (pix_w + 125) / 250); |
226 | 0 | ny = std::max(1, (pix_h + 125) / 250); |
227 | 0 | auto xrat = pix_w / nx; |
228 | 0 | auto yrat = pix_h / ny; |
229 | 0 | if (xrat < half_window_size + 2) { |
230 | 0 | nx = pix_w / (half_window_size + 2); |
231 | 0 | } |
232 | 0 | if (yrat < half_window_size + 2) { |
233 | 0 | ny = pix_h / (half_window_size + 2); |
234 | 0 | } |
235 | |
|
236 | 0 | double kfactor; |
237 | 0 | api->GetDoubleVariable("thresholding_kfactor", &kfactor); |
238 | 0 | kfactor = std::max(0.0, kfactor); |
239 | |
|
240 | 0 | if (thresholding_debug) { |
241 | 0 | tprintf("window size: %d kfactor: %.3f nx:%d ny: %d\n", window_size, kfactor, nx, ny); |
242 | 0 | } |
243 | |
|
244 | 0 | r = pixSauvolaBinarizeTiled(pix_grey, half_window_size, kfactor, nx, ny, |
245 | 0 | (PIX**)pix_thresholds, |
246 | 0 | (PIX**)pix_binary); |
247 | 0 | } else { // if (method == ThresholdMethod::LeptonicaOtsu) |
248 | 0 | int tile_size; |
249 | 0 | double tile_size_factor; |
250 | 0 | api->GetDoubleVariable("thresholding_tile_size", &tile_size_factor); |
251 | 0 | tile_size = tile_size_factor * yres_; |
252 | 0 | tile_size = std::max(16, tile_size); |
253 | |
|
254 | 0 | int smooth_size; |
255 | 0 | double smooth_size_factor; |
256 | 0 | api->GetDoubleVariable("thresholding_smooth_kernel_size", |
257 | 0 | &smooth_size_factor); |
258 | 0 | smooth_size_factor = std::max(0.0, smooth_size_factor); |
259 | 0 | smooth_size = smooth_size_factor * yres_; |
260 | 0 | int half_smooth_size = smooth_size / 2; |
261 | |
|
262 | 0 | double score_fraction; |
263 | 0 | api->GetDoubleVariable("thresholding_score_fraction", &score_fraction); |
264 | |
|
265 | 0 | if (thresholding_debug) { |
266 | 0 | tprintf("tile size: %d smooth_size: %d score_fraction: %.2f\n", tile_size, smooth_size, score_fraction); |
267 | 0 | } |
268 | |
|
269 | 0 | r = pixOtsuAdaptiveThreshold(pix_grey, tile_size, tile_size, |
270 | 0 | half_smooth_size, half_smooth_size, |
271 | 0 | score_fraction, |
272 | 0 | (PIX**)pix_thresholds, |
273 | 0 | (PIX**)pix_binary); |
274 | 0 | } |
275 | |
|
276 | 0 | bool ok = (r == 0); |
277 | 0 | return std::make_tuple(ok, pix_grey, pix_binary, pix_thresholds); |
278 | 0 | } |
279 | | |
280 | | // Threshold the source image as efficiently as possible to the output Pix. |
281 | | // Creates a Pix and sets pix to point to the resulting pointer. |
282 | | // Caller must use pixDestroy to free the created Pix. |
283 | | /// Returns false on error. |
284 | 16.2k | bool ImageThresholder::ThresholdToPix(Image *pix) { |
285 | 16.2k | if (image_width_ > INT16_MAX || image_height_ > INT16_MAX) { |
286 | 0 | tprintf("Image too large: (%d, %d)\n", image_width_, image_height_); |
287 | 0 | return false; |
288 | 0 | } |
289 | | // Handle binary image |
290 | 16.2k | if (pix_channels_ == 0) { |
291 | | // We have a binary image, but it still has to be copied, as this API |
292 | | // allows the caller to modify the output. |
293 | 16.2k | Image original = GetPixRect(); |
294 | 16.2k | *pix = original.copy(); |
295 | 16.2k | original.destroy(); |
296 | 16.2k | return true; |
297 | 16.2k | } |
298 | | // Handle colormaps |
299 | 0 | Image src = pix_; |
300 | 0 | if (pixGetColormap(src)) { |
301 | 0 | src = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC); |
302 | 0 | } |
303 | 0 | OtsuThresholdRectToPix(src, pix); |
304 | 0 | if (src != pix_) { |
305 | 0 | src.destroy(); |
306 | 0 | } |
307 | 0 | return true; |
308 | 16.2k | } |
309 | | |
310 | | // Gets a pix that contains an 8 bit threshold value at each pixel. The |
311 | | // returned pix may be an integer reduction of the binary image such that |
312 | | // the scale factor may be inferred from the ratio of the sizes, even down |
313 | | // to the extreme of a 1x1 pixel thresholds image. |
314 | | // Ideally the 8 bit threshold should be the exact threshold used to generate |
315 | | // the binary image in ThresholdToPix, but this is not a hard constraint. |
316 | | // Returns nullptr if the input is binary. PixDestroy after use. |
317 | 0 | Image ImageThresholder::GetPixRectThresholds() { |
318 | 0 | if (IsBinary()) { |
319 | 0 | return nullptr; |
320 | 0 | } |
321 | 0 | Image pix_grey = GetPixRectGrey(); |
322 | 0 | int width = pixGetWidth(pix_grey); |
323 | 0 | int height = pixGetHeight(pix_grey); |
324 | 0 | std::vector<int> thresholds; |
325 | 0 | std::vector<int> hi_values; |
326 | 0 | OtsuThreshold(pix_grey, 0, 0, width, height, thresholds, hi_values); |
327 | 0 | pix_grey.destroy(); |
328 | 0 | Image pix_thresholds = pixCreate(width, height, 8); |
329 | 0 | int threshold = thresholds[0] > 0 ? thresholds[0] : 128; |
330 | 0 | pixSetAllArbitrary(pix_thresholds, threshold); |
331 | 0 | return pix_thresholds; |
332 | 0 | } |
333 | | |
334 | | // Common initialization shared between SetImage methods. |
335 | 16.2k | void ImageThresholder::Init() { |
336 | 16.2k | SetRectangle(0, 0, image_width_, image_height_); |
337 | 16.2k | } |
338 | | |
339 | | // Get a clone/copy of the source image rectangle. |
340 | | // The returned Pix must be pixDestroyed. |
341 | | // This function will be used in the future by the page layout analysis, and |
342 | | // the layout analysis that uses it will only be available with Leptonica, |
343 | | // so there is no raw equivalent. |
344 | 32.4k | Image ImageThresholder::GetPixRect() { |
345 | 32.4k | if (IsFullImage()) { |
346 | | // Just clone the whole thing. |
347 | 32.4k | return pix_.clone(); |
348 | 32.4k | } else { |
349 | | // Crop to the given rectangle. |
350 | 0 | Box *box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_); |
351 | 0 | Image cropped = pixClipRectangle(pix_, box, nullptr); |
352 | 0 | boxDestroy(&box); |
353 | 0 | return cropped; |
354 | 0 | } |
355 | 32.4k | } |
356 | | |
357 | | // Get a clone/copy of the source image rectangle, reduced to greyscale, |
358 | | // and at the same resolution as the output binary. |
359 | | // The returned Pix must be pixDestroyed. |
360 | | // Provided to the classifier to extract features from the greyscale image. |
361 | 0 | Image ImageThresholder::GetPixRectGrey() { |
362 | 0 | auto pix = GetPixRect(); // May have to be reduced to grey. |
363 | 0 | int depth = pixGetDepth(pix); |
364 | 0 | if (depth != 8 || pixGetColormap(pix)) { |
365 | 0 | if (depth == 24) { |
366 | 0 | auto tmp = pixConvert24To32(pix); |
367 | 0 | pix.destroy(); |
368 | 0 | pix = tmp; |
369 | 0 | } |
370 | 0 | auto result = pixConvertTo8(pix, false); |
371 | 0 | pix.destroy(); |
372 | 0 | return result; |
373 | 0 | } |
374 | 0 | return pix; |
375 | 0 | } |
376 | | |
377 | | // Otsu thresholds the rectangle, taking the rectangle from *this. |
378 | 0 | void ImageThresholder::OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const { |
379 | 0 | std::vector<int> thresholds; |
380 | 0 | std::vector<int> hi_values; |
381 | |
|
382 | 0 | int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_, rect_height_, |
383 | 0 | thresholds, hi_values); |
384 | 0 | ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix); |
385 | 0 | } |
386 | | |
387 | | /// Threshold the rectangle, taking everything except the src_pix |
388 | | /// from the class, using thresholds/hi_values to the output pix. |
389 | | /// NOTE that num_channels is the size of the thresholds and hi_values |
390 | | // arrays and also the bytes per pixel in src_pix. |
391 | | void ImageThresholder::ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds, |
392 | 0 | const std::vector<int> &hi_values, Image *pix) const { |
393 | 0 | *pix = pixCreate(rect_width_, rect_height_, 1); |
394 | 0 | uint32_t *pixdata = pixGetData(*pix); |
395 | 0 | int wpl = pixGetWpl(*pix); |
396 | 0 | int src_wpl = pixGetWpl(src_pix); |
397 | 0 | uint32_t *srcdata = pixGetData(src_pix); |
398 | 0 | pixSetXRes(*pix, pixGetXRes(src_pix)); |
399 | 0 | pixSetYRes(*pix, pixGetYRes(src_pix)); |
400 | 0 | for (int y = 0; y < rect_height_; ++y) { |
401 | 0 | const uint32_t *linedata = srcdata + (y + rect_top_) * src_wpl; |
402 | 0 | uint32_t *pixline = pixdata + y * wpl; |
403 | 0 | for (int x = 0; x < rect_width_; ++x) { |
404 | 0 | bool white_result = true; |
405 | 0 | for (int ch = 0; ch < num_channels; ++ch) { |
406 | 0 | int pixel = GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch); |
407 | 0 | if (hi_values[ch] >= 0 && (pixel > thresholds[ch]) == (hi_values[ch] == 0)) { |
408 | 0 | white_result = false; |
409 | 0 | break; |
410 | 0 | } |
411 | 0 | } |
412 | 0 | if (white_result) { |
413 | 0 | CLEAR_DATA_BIT(pixline, x); |
414 | 0 | } else { |
415 | 0 | SET_DATA_BIT(pixline, x); |
416 | 0 | } |
417 | 0 | } |
418 | 0 | } |
419 | 0 | } |
420 | | |
421 | | } // namespace tesseract. |