/src/tesseract/src/dict/context.cpp

Source (jump to first uncovered line)
/******************************************************************************
 *
 * File:         context.cpp  (Formerly context.c)
 * Description:  Context checking functions
 * Author:       Mark Seaman, OCR Technology
 *
 * (c) Copyright 1990, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *****************************************************************************/

#include "dict.h"
#include "unicharset.h"

namespace tesseract {

// Thresholds used by Dict::absolute_garbage().
// Words with fewer unichars than this are never reported as garbage.
static constexpr int kMinAbsoluteGarbageWordLength = 10;
// A sufficiently long word is reported as garbage when its fraction of
// alphabetic-or-digit unichars is strictly below this value.
static constexpr float kMinAbsoluteGarbageAlphanumFrac = 0.5f;

// State-transition table consulted by Dict::case_ok().
// The row is the current state; the column is the class of the next character:
//   P = neither upper case, lower case nor digit (column 0),
//   U = upper case (1), L = lower case (2), D = digit (3).
// An entry of -1 marks an error on case.
const int case_state_table[6][4] = {
    // P   U   L   D
    {0, 1, 5, 4},    // 0. Beginning of word
    {0, 3, 2, 4},    // 1. After initial capital
    {0, -1, 2, -1},  // 2. After lower case
    {0, 3, -1, 4},   // 3. After upper case
    {0, -1, -1, 4},  // 4. After a digit
    {5, -1, 2, -1},  // 5. After initial lower case
};

int Dict::case_ok(const WERD_CHOICE &word) const {
  int state = 0;
  const UNICHARSET *unicharset = word.unicharset();
  for (unsigned x = 0; x < word.length(); ++x) {
    UNICHAR_ID ch_id = word.unichar_id(x);
    if (unicharset->get_isupper(ch_id)) {
      state = case_state_table[state][1];
    } else if (unicharset->get_islower(ch_id)) {
      state = case_state_table[state][2];
    } else if (unicharset->get_isdigit(ch_id)) {
      state = case_state_table[state][3];
    } else {
      state = case_state_table[state][0];
    }
    if (state == -1) {
      return false;
    }
  }
  return state != 5; // single lower is bad
}

// Reports whether `word` is long yet made up mostly of characters that
// `unicharset` classifies as neither alphabetic nor digit.
//
// Returns false for any word shorter than kMinAbsoluteGarbageWordLength.
// Otherwise returns true exactly when the fraction of alphabetic-or-digit
// unichars is strictly less than kMinAbsoluteGarbageAlphanumFrac.
bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
  const auto total = word.length();
  if (total < kMinAbsoluteGarbageWordLength) {
    return false; // too short to judge
  }
  int alphanum_count = 0;
  for (unsigned pos = 0; pos < total; ++pos) {
    const auto id = word.unichar_id(pos);
    if (unicharset.get_isalpha(id) || unicharset.get_isdigit(id)) {
      ++alphanum_count;
    }
  }
  // The length guard above guarantees a non-zero divisor.
  const float alphanum_frac = static_cast<float>(alphanum_count) / static_cast<float>(total);
  return alphanum_frac < kMinAbsoluteGarbageAlphanumFrac;
}

} // namespace tesseract

Line	Count	Source (jump to first uncovered line)
1		/******************************************************************************
2		*
3		* File: context.cpp (Formerly context.c)
4		* Description: Context checking functions
5		* Author: Mark Seaman, OCR Technology
6		*
7		* (c) Copyright 1990, Hewlett-Packard Company.
8		** Licensed under the Apache License, Version 2.0 (the "License");
9		** you may not use this file except in compliance with the License.
10		** You may obtain a copy of the License at
11		** http://www.apache.org/licenses/LICENSE-2.0
12		** Unless required by applicable law or agreed to in writing, software
13		** distributed under the License is distributed on an "AS IS" BASIS,
14		** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15		** See the License for the specific language governing permissions and
16		** limitations under the License.
17		*
18		*****************************************************************************/
19
20		#include "dict.h"
21		#include "unicharset.h"
22
23		namespace tesseract {
24
25		static const int kMinAbsoluteGarbageWordLength = 10;
26		static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
27
28		const int case_state_table[6][4] = {
29		{/* 0. Beginning of word */
30		/* P U L D */
31		/* -1. Error on case */
32		0, 1, 5, 4},
33		{/* 1. After initial capital */
34		0, 3, 2, 4},
35		{/* 2. After lower case */
36		0, -1, 2, -1},
37		{/* 3. After upper case */
38		0, 3, -1, 4},
39		{/* 4. After a digit */
40		0, -1, -1, 4},
41		{/* 5. After initial lower case */
42		5, -1, 2, -1},
43		};
44
45	1.88M	int Dict::case_ok(const WERD_CHOICE &word) const {
46	1.88M	int state = 0;
47	1.88M	const UNICHARSET *unicharset = word.unicharset();
48	13.6M	for (unsigned x = 0; x < word.length(); ++x) {
49	12.0M	UNICHAR_ID ch_id = word.unichar_id(x);
50	12.0M	if (unicharset->get_isupper(ch_id)) {
51	3.65M	state = case_state_table[state][1];
52	8.41M	} else if (unicharset->get_islower(ch_id)) {
53	3.50M	state = case_state_table[state][2];
54	4.91M	} else if (unicharset->get_isdigit(ch_id)) {
55	192k	state = case_state_table[state][3];
56	4.71M	} else {
57	4.71M	state = case_state_table[state][0];
58	4.71M	}
59	12.0M	if (state == -1) {
60	354k	return false;
61	354k	}
62	12.0M	}
63	1.53M	return state != 5; // single lower is bad
64	1.88M	}
65
66	0	bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
67	0	if (word.length() < kMinAbsoluteGarbageWordLength) {
68	0	return false;
69	0	}
70	0	int num_alphanum = 0;
71	0	for (unsigned x = 0; x < word.length(); ++x) {
72	0	num_alphanum +=
73	0	(unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));
74	0	}
75	0	return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) <
76	0	kMinAbsoluteGarbageAlphanumFrac);
77	0	}
78
79		} // namespace tesseract

Coverage Report

Created: 2024-02-28 06:46