/src/tesseract/src/dict/context.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * File: context.cpp (Formerly context.c) |
4 | | * Description: Context checking functions |
5 | | * Author: Mark Seaman, OCR Technology |
6 | | * |
7 | | * (c) Copyright 1990, Hewlett-Packard Company. |
8 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | ** you may not use this file except in compliance with the License. |
10 | | ** You may obtain a copy of the License at |
11 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
12 | | ** Unless required by applicable law or agreed to in writing, software |
13 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
14 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | ** See the License for the specific language governing permissions and |
16 | | ** limitations under the License. |
17 | | * |
18 | | *****************************************************************************/ |
19 | | |
20 | | #include "dict.h" |
21 | | #include "unicharset.h" |
22 | | |
23 | | namespace tesseract { |
24 | | |
// Words shorter than this many unichars are never reported as garbage by
// Dict::absolute_garbage().
static constexpr int kMinAbsoluteGarbageWordLength = 10;
// A sufficiently long word is reported as garbage by Dict::absolute_garbage()
// when its fraction of alphabetic-or-digit unichars is below this value.
static constexpr float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
27 | | |
// Transition table of the capitalization state machine used by
// Dict::case_ok(): case_state_table[state][column] is the next state.
//
// Columns (as consumed by Dict::case_ok()):
//   0 = P: neither upper case, lower case nor digit
//   1 = U: upper case
//   2 = L: lower case
//   3 = D: digit
//
// A next state of -1 means "error on case".
const int case_state_table[6][4] = {
    //  P   U   L   D
    {0, 1, 5, 4},   // 0. Beginning of word
    {0, 3, 2, 4},   // 1. After initial capital
    {0, -1, 2, -1}, // 2. After lower case
    {0, 3, -1, 4},  // 3. After upper case
    {0, -1, -1, 4}, // 4. After a digit
    {5, -1, 2, -1}, // 5. After initial lower case
};
44 | | |
45 | 1.88M | int Dict::case_ok(const WERD_CHOICE &word) const { |
46 | 1.88M | int state = 0; |
47 | 1.88M | const UNICHARSET *unicharset = word.unicharset(); |
48 | 13.6M | for (unsigned x = 0; x < word.length(); ++x) { |
49 | 12.0M | UNICHAR_ID ch_id = word.unichar_id(x); |
50 | 12.0M | if (unicharset->get_isupper(ch_id)) { |
51 | 3.65M | state = case_state_table[state][1]; |
52 | 8.41M | } else if (unicharset->get_islower(ch_id)) { |
53 | 3.50M | state = case_state_table[state][2]; |
54 | 4.91M | } else if (unicharset->get_isdigit(ch_id)) { |
55 | 192k | state = case_state_table[state][3]; |
56 | 4.71M | } else { |
57 | 4.71M | state = case_state_table[state][0]; |
58 | 4.71M | } |
59 | 12.0M | if (state == -1) { |
60 | 354k | return false; |
61 | 354k | } |
62 | 12.0M | } |
63 | 1.53M | return state != 5; // single lower is bad |
64 | 1.88M | } |
65 | | |
66 | 0 | bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) { |
67 | 0 | if (word.length() < kMinAbsoluteGarbageWordLength) { |
68 | 0 | return false; |
69 | 0 | } |
70 | 0 | int num_alphanum = 0; |
71 | 0 | for (unsigned x = 0; x < word.length(); ++x) { |
72 | 0 | num_alphanum += |
73 | 0 | (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x))); |
74 | 0 | } |
75 | 0 | return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) < |
76 | 0 | kMinAbsoluteGarbageAlphanumFrac); |
77 | 0 | } |
78 | | |
79 | | } // namespace tesseract |