Coverage Report

Created: 2024-02-28 06:46

/src/tesseract/src/dict/context.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * File:         context.cpp  (Formerly context.c)
4
 * Description:  Context checking functions
5
 * Author:       Mark Seaman, OCR Technology
6
 *
7
 * (c) Copyright 1990, Hewlett-Packard Company.
8
 ** Licensed under the Apache License, Version 2.0 (the "License");
9
 ** you may not use this file except in compliance with the License.
10
 ** You may obtain a copy of the License at
11
 ** http://www.apache.org/licenses/LICENSE-2.0
12
 ** Unless required by applicable law or agreed to in writing, software
13
 ** distributed under the License is distributed on an "AS IS" BASIS,
14
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 ** See the License for the specific language governing permissions and
16
 ** limitations under the License.
17
 *
18
 *****************************************************************************/
19
20
#include "dict.h"
21
#include "unicharset.h"
22
23
namespace tesseract {
24
25
static const int kMinAbsoluteGarbageWordLength = 10;
26
static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
27
28
const int case_state_table[6][4] = {
29
    {/*  0. Beginning of word       */
30
     /*    P   U   L   D                                          */
31
     /* -1. Error on case           */
32
     0, 1, 5, 4},
33
    {/*  1. After initial capital    */
34
     0, 3, 2, 4},
35
    {/*  2. After lower case         */
36
     0, -1, 2, -1},
37
    {/*  3. After upper case         */
38
     0, 3, -1, 4},
39
    {/*  4. After a digit            */
40
     0, -1, -1, 4},
41
    {/*  5. After initial lower case */
42
     5, -1, 2, -1},
43
};
44
45
1.88M
int Dict::case_ok(const WERD_CHOICE &word) const {
46
1.88M
  int state = 0;
47
1.88M
  const UNICHARSET *unicharset = word.unicharset();
48
13.6M
  for (unsigned x = 0; x < word.length(); ++x) {
49
12.0M
    UNICHAR_ID ch_id = word.unichar_id(x);
50
12.0M
    if (unicharset->get_isupper(ch_id)) {
51
3.65M
      state = case_state_table[state][1];
52
8.41M
    } else if (unicharset->get_islower(ch_id)) {
53
3.50M
      state = case_state_table[state][2];
54
4.91M
    } else if (unicharset->get_isdigit(ch_id)) {
55
192k
      state = case_state_table[state][3];
56
4.71M
    } else {
57
4.71M
      state = case_state_table[state][0];
58
4.71M
    }
59
12.0M
    if (state == -1) {
60
354k
      return false;
61
354k
    }
62
12.0M
  }
63
1.53M
  return state != 5; // single lower is bad
64
1.88M
}
65
66
0
bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
67
0
  if (word.length() < kMinAbsoluteGarbageWordLength) {
68
0
    return false;
69
0
  }
70
0
  int num_alphanum = 0;
71
0
  for (unsigned x = 0; x < word.length(); ++x) {
72
0
    num_alphanum +=
73
0
        (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));
74
0
  }
75
0
  return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) <
76
0
          kMinAbsoluteGarbageAlphanumFrac);
77
0
}
78
79
} // namespace tesseract