Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/include/tesseract/unichar.h
Line
Count
Source (jump to first uncovered line)
1
// SPDX-License-Identifier: Apache-2.0
2
// File:        unichar.h
3
// Description: Unicode character/ligature class.
4
// Author:      Ray Smith
5
//
6
// (C) Copyright 2006, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
17
#ifndef TESSERACT_CCUTIL_UNICHAR_H_
18
#define TESSERACT_CCUTIL_UNICHAR_H_
19
20
#include "export.h"
21
22
#include <memory.h>
23
#include <cstring>
24
#include <string>
25
#include <vector>
26
27
namespace tesseract {
28
29
// Capacity, in chars, of the buffer held by a UNICHAR. The value must be
// at least 4, and must not exceed 31 unless the coding of the length is
// changed as well.
#define UNICHAR_LEN 30

// Every unichar is referred to by its own unique UNICHAR_ID.
using UNICHAR_ID = int;

// Sentinel id marking an invalid or uninitialized unichar.
static const UNICHAR_ID INVALID_UNICHAR_ID = -1;
// Special unichar text that corresponds to INVALID_UNICHAR_ID.
static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
40
41
// Classifies a piece of text by the directionality of the characters it
// contains.
enum StrongScriptDirection {
  // Text contains only neutral characters.
  DIR_NEUTRAL = 0,
  // Text contains no Right-to-Left characters.
  DIR_LEFT_TO_RIGHT = 1,
  // Text contains no Left-to-Right characters.
  DIR_RIGHT_TO_LEFT = 2,
  // Text contains a mixture of left-to-right and right-to-left characters.
  DIR_MIX = 3,
};
48
49
// Integer type holding one Unicode value; UNICHAR::UTF8ToUTF32 and
// UNICHAR::UTF32ToUTF8 exchange vectors of it.
using char32 = int;
50
51
// The UNICHAR class holds a single classification result. This may be
52
// a single Unicode character (stored as between 1 and 4 utf8 bytes) or
53
// multiple Unicode characters representing the NFKC expansion of a ligature
54
// such as fi, ffl etc. These are also stored as utf8.
55
class TESS_API UNICHAR {
56
public:
57
0
  UNICHAR() {
58
0
    memset(chars, 0, UNICHAR_LEN);
59
0
  }
60
61
  // Construct from a utf8 string. If len<0 then the string is null terminated.
62
  // If the string is too long to fit in the UNICHAR then it takes only what
63
  // will fit.
64
  UNICHAR(const char *utf8_str, int len);
65
66
  // Construct from a single UCS4 character.
67
  explicit UNICHAR(int unicode);
68
69
  // Default copy constructor and operator= are OK.
70
71
  // Get the first character as UCS-4.
72
  int first_uni() const;
73
74
  // Get the length of the UTF8 string.
75
0
  int utf8_len() const {
76
0
    int len = chars[UNICHAR_LEN - 1];
77
0
    return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
78
0
  }
79
80
  // Get a UTF8 string, but NOT nullptr terminated.
81
0
  const char *utf8() const {
82
0
    return chars;
83
0
  }
84
85
  // Get a terminated UTF8 string: Must delete[] it after use.
86
  char *utf8_str() const;
87
88
  // Get the number of bytes in the first character of the given utf8 string.
89
  static int utf8_step(const char *utf8_str);
90
91
  // A class to simplify iterating over and accessing elements of a UTF8
92
  // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
93
  // take ownership of the underlying byte array. It also does not permit
94
  // modification of the array (as the name suggests).
95
  //
96
  // Example:
97
  //   for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
98
  //        it != UNICHAR::end(str, len);
99
  //        ++it) {
100
  //     printf("UCS-4 symbol code = %d\n", *it);
101
  //     char buf[5];
102
  //     int char_len = it.get_utf8(buf); buf[char_len] = '\0';
103
  //     printf("Char = %s\n", buf);
104
  //   }
105
  class TESS_API const_iterator {
106
    using CI = const_iterator;
107
108
  public:
109
    // Step to the next UTF8 character.
110
    // If the current position is at an illegal UTF8 character, then print an
111
    // error message and step by one byte. If the current position is at a
112
    // nullptr value, don't step past it.
113
    const_iterator &operator++();
114
115
    // Return the UCS-4 value at the current position.
116
    // If the current position is at an illegal UTF8 value, return a single
117
    // space character.
118
    int operator*() const;
119
120
    // Store the UTF-8 encoding of the current codepoint into buf, which must be
121
    // at least 4 bytes long. Return the number of bytes written.
122
    // If the current position is at an illegal UTF8 value, writes a single
123
    // space character and returns 1.
124
    // Note that this method does not null-terminate the buffer.
125
    int get_utf8(char *buf) const;
126
    // Returns the number of bytes of the current codepoint. Returns 1 if the
127
    // current position is at an illegal UTF8 value.
128
    int utf8_len() const;
129
    // Returns true if the UTF-8 encoding at the current position is legal.
130
    bool is_legal() const;
131
132
    // Return the pointer into the string at the current position.
133
0
    const char *utf8_data() const {
134
0
      return it_;
135
0
    }
136
137
    // Iterator equality operators.
138
0
    friend bool operator==(const CI &lhs, const CI &rhs) {
139
0
      return lhs.it_ == rhs.it_;
140
0
    }
141
0
    friend bool operator!=(const CI &lhs, const CI &rhs) {
142
0
      return !(lhs == rhs);
143
0
    }
144
145
  private:
146
    friend class UNICHAR;
147
0
    explicit const_iterator(const char *it) : it_(it) {}
148
149
    const char *it_; // Pointer into the string.
150
  };
151
152
  // Create a start/end iterator pointing to a string. Note that these methods
153
  // are static and do NOT create a copy or take ownership of the underlying
154
  // array.
155
  static const_iterator begin(const char *utf8_str, int byte_length);
156
  static const_iterator end(const char *utf8_str, int byte_length);
157
158
  // Converts a utf-8 string to a vector of unicodes.
159
  // Returns an empty vector if the input contains invalid UTF-8.
160
  static std::vector<char32> UTF8ToUTF32(const char *utf8_str);
161
  // Converts a vector of unicodes to a utf8 string.
162
  // Returns an empty string if the input contains an invalid unicode.
163
  static std::string UTF32ToUTF8(const std::vector<char32> &str32);
164
165
private:
166
  // A UTF-8 representation of 1 or more Unicode characters.
167
  // The last element (chars[UNICHAR_LEN - 1]) is a length if
168
  // its value < UNICHAR_LEN, otherwise it is a genuine character.
169
  char chars[UNICHAR_LEN]{};
170
};
171
172
} // namespace tesseract
173
174
#endif // TESSERACT_CCUTIL_UNICHAR_H_