Coverage Report

Created: 2025-11-11 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/myanmar-tools/clients/cpp/zawgyi_detector.cpp
Line
Count
Source
1
// Copyright 2017 Google LLC
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
// https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
#include <cmath>
16
#include <cstddef>
17
#include <cstdint>
18
#include <cstring>
19
#include <limits>
20
#include <glog/logging.h>
21
#include <unicode/utf8.h>
22
23
#include "public/myanmartools.h"
24
#include "zawgyi_detector-impl.h"
25
26
namespace {

// Bytes of the detection model, pulled in at compile time from
// zawgyi_model_data.inc.
const uint8_t kModelData[] = {
#include "zawgyi_model_data.inc"
};

// Size of the embedded model in bytes.
constexpr size_t kModelSize = sizeof(kModelData);

}  // namespace
32
33
using namespace google_myanmar_tools;
34
35
// BSWAP(dest, bits) converts a `bits`-wide unsigned integer between big-endian
// and host byte order. It expands to a plain expression (no trailing
// semicolon), so it can be used on the right-hand side of an assignment or
// inside a larger expression.
//
// Endianness is taken from the compiler-predefined __BYTE_ORDER__ macro rather
// than from __BYTE_ORDER, which is only defined when <endian.h> happens to be
// included. When the compiler does not report a big-endian target, the host is
// assumed to be little-endian and the bytes are swapped.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#    define BSWAP(dest, bits) (dest)
#else
#    define BSWAP(dest, bits) __builtin_bswap##bits(dest)
#endif
40
41
/**
 * Loads a big-endian value of `bits` bits from ptr into dest. Static-asserts
 * that the size of dest matches the requested width.
 *
 * The expansion is wrapped in do { ... } while (0) so that it behaves as a
 * single statement and its scratch variable does not leak into, or collide
 * with names in, the calling scope. Bytes are moved with memcpy, so ptr does
 * not need to be aligned.
 */
#define BIG_ENDIAN_LOAD(ptr, dest, bits) \
  do { \
    static_assert(sizeof(dest) == (bits) / 8, \
      "Expected type to be " #bits " bits"); \
    uint##bits##_t big_endian_load_tmp_; \
    memcpy(&big_endian_load_tmp_, (ptr), (bits) / 8); \
    big_endian_load_tmp_ = BSWAP(big_endian_load_tmp_, bits); \
    memcpy(&(dest), &big_endian_load_tmp_, (bits) / 8); \
  } while (0)
52
53
866
int64_t BigEndian::loadInt64(const void* ptr) {
54
866
  int64_t dest;
55
866
  BIG_ENDIAN_LOAD(ptr, dest, 64);
56
866
  return dest;
57
866
}
58
59
1.29k
int32_t BigEndian::loadInt32(const void* ptr) {
60
1.29k
  int32_t dest;
61
1.29k
  BIG_ENDIAN_LOAD(ptr, dest, 32);
62
1.29k
  return dest;
63
1.29k
}
64
65
2.04M
int16_t BigEndian::loadInt16(const void* ptr) {
66
2.04M
  int16_t dest;
67
2.04M
  BIG_ENDIAN_LOAD(ptr, dest, 16);
68
2.04M
  return dest;
69
2.04M
}
70
71
2.00M
float BigEndian::loadFloat(const void* ptr) {
72
2.00M
  float dest;
73
2.00M
  BIG_ENDIAN_LOAD(ptr, dest, 32);
74
2.00M
  return dest;
75
2.00M
}
76
77
// Implement Markov Chain processing.
78
433
// Builds the dense N x N transition table from a serialized model.
//
// Binary layout, as read below (every field is big-endian):
//   int64  magic number, must equal BINARY_TAG
//   int32  version, must equal VERSION
//   int16  size of model N
//   N rows, each of the form:
//     int16  entry_count
//     float  default log value for the row (absent when entry_count is zero,
//            in which case the default is 0.0)
//     entry_count items of:
//       int16  column index
//       float  log value
//
// binary_ptr must point at the magic number. The length of the buffer is not
// available here, so reads are not bounds-checked against it; the values that
// are used as array sizes or indices are validated with CHECKs so that a
// corrupt model aborts instead of writing outside the table.
BinaryMarkovClassifier::BinaryMarkovClassifier(const uint8_t* binary_ptr) {
  const uint8_t* data_ptr = binary_ptr;

  model_size_ = 0;
  model_array_ = nullptr;

  const int64_t magic_number = BigEndian::loadInt64(data_ptr);
  data_ptr += sizeof(magic_number);
  CHECK_EQ(BINARY_TAG, magic_number);

  const int32_t version = BigEndian::loadInt32(data_ptr);
  data_ptr += sizeof(version);
  CHECK_EQ(VERSION, version);

  model_size_ = BigEndian::loadInt16(data_ptr);
  data_ptr += sizeof(model_size_);
  VLOG(2) << "BinaryMarkovClassifier size = " << model_size_;

  // A negative size would still yield a positive element count once squared,
  // leaving a table whose dimensions disagree with model_size_.
  CHECK_GE(model_size_, 0);

  const int size = model_size_;
  model_array_ = new float[static_cast<size_t>(size) * size];

  // Read each "row".
  for (int row = 0; row < size; ++row) {
    const int16_t row_entry_count = BigEndian::loadInt16(data_ptr);
    data_ptr += sizeof(row_entry_count);

    // The default value is only present in the stream for non-empty rows.
    float row_default_value = 0.0f;
    if (row_entry_count != 0) {
      row_default_value = BigEndian::loadFloat(data_ptr);
      data_ptr += sizeof(row_default_value);
    }

    // Set all the entries in the row to default.
    float* const row_values = model_array_ + static_cast<size_t>(row) * size;
    for (int col = 0; col < size; ++col) {
      row_values[col] = row_default_value;
    }

    // Set non-default values.
    for (int entry = 0; entry < row_entry_count; ++entry) {
      const int16_t column = BigEndian::loadInt16(data_ptr);
      data_ptr += sizeof(column);

      // Reject indices that would land outside this row.
      CHECK_GE(column, 0);
      CHECK_LT(column, size);

      row_values[column] = BigEndian::loadFloat(data_ptr);
      data_ptr += sizeof(float);
    }
  }
}
148
149
433
// Frees the table allocated with new[] in the constructor.
BinaryMarkovClassifier::~BinaryMarkovClassifier() { delete[] model_array_; }
152
153
132k
float BinaryMarkovClassifier::GetLogProbabilityDifference(int i1, int i2) {
154
132k
  return model_array_[i1 * model_size_ + i2];
155
132k
}
156
157
//----------------------------------------------------------------------------
158
159
// Initialize ZawgyiUnicode models from the stream
160
433
// Parses the model header (magic number, version and, for version 2, the SSV
// field) and hands the remaining bytes to a BinaryMarkovClassifier.
ZawgyiUnicodeMarkovModel::ZawgyiUnicodeMarkovModel(const uint8_t* data_models) {
  const uint8_t* cursor = data_models;

  const int64_t magic_number = BigEndian::loadInt64(cursor);
  cursor += sizeof(magic_number);
  CHECK_EQ(BINARY_TAG, magic_number);

  const int32_t version = BigEndian::loadInt32(cursor);
  cursor += sizeof(version);

  if (version != 1) {
    // Anything other than version 1 must be version 2, which carries an SSV
    // value that has to fall in [0, SSV_COUNT).
    CHECK_EQ(2, version);
    ssv_ = BigEndian::loadInt32(cursor);
    cursor += sizeof(ssv_);
    CHECK_GE(ssv_, 0);
    CHECK_LT(ssv_, SSV_COUNT);
  } else {
    // No SSV field
    ssv_ = 0;
  }

  classifier_ = new BinaryMarkovClassifier(cursor);
}
185
186
433
// Destroys the classifier created in the constructor.
ZawgyiUnicodeMarkovModel::~ZawgyiUnicodeMarkovModel() { delete classifier_; }
189
190
double
191
ZawgyiUnicodeMarkovModel::Predict(const char* input_utf8,
192
433
                                  int32_t length) const {
193
433
  if (length < 0) {
194
0
    size_t length_size = strlen(input_utf8);
195
0
    if (length_size > __INT32_MAX__) {
196
0
      return -std::numeric_limits<double>::infinity();
197
0
    }
198
0
    length = static_cast<int32_t>(length_size);
199
0
  }
200
201
  // Start at the base state
202
433
  int prevState = 0;
203
204
433
  double totalDelta = 0.0;
205
433
  bool seenTransition = false;
206
14.0M
  for (int32_t i = 0; i <= length;) {
207
14.0M
    int currState;
208
14.0M
    if (i >= length) {
209
433
      currState = 0;
210
433
      i++;
211
14.0M
    } else {
212
14.0M
      char32_t cp;
213
14.0M
      U8_NEXT(input_utf8, i, length, cp);
214
14.0M
      currState = GetIndexForCodePoint(cp);
215
14.0M
    }
216
    // Ignore 0-to-0 transitions
217
14.0M
    if (prevState != 0 || currState != 0) {
218
132k
      float delta =
219
132k
          classifier_->GetLogProbabilityDifference(prevState, currState);
220
132k
      totalDelta += delta;
221
132k
      seenTransition = true;
222
132k
    }
223
14.0M
    prevState = currState;
224
14.0M
  }
225
226
  // Special case: if there is no signal (both log probabilities are zero),
227
  // return -Infinity, which will get interpreted by users as strong Unicode.
228
  // This happens when the input string contains no Myanmar-range code points.
229
433
  if (!seenTransition) {
230
300
    return -std::numeric_limits<double>::infinity();
231
300
  }
232
233
  // result = Pz/(Pu+Pz)
234
  //        = exp(logPz)/(exp(logPu)+exp(logPz))
235
  //        = 1/(1+exp(logPu-logPz))
236
133
  return 1.0 / (1.0 + exp(totalDelta));
237
433
}
238
239
14.0M
int16_t ZawgyiUnicodeMarkovModel::GetIndexForCodePoint(char32_t cp) const {
240
14.0M
  if (STD_CP0 <= cp && cp <= STD_CP1) {
241
41.6k
    return cp - STD_CP0 + STD_OFFSET;
242
41.6k
  }
243
13.9M
  if (AFT_CP0 <= cp && cp <= AFT_CP1) {
244
8.89k
    return cp - AFT_CP0 + AFT_OFFSET;
245
8.89k
  }
246
13.9M
  if (EXA_CP0 <= cp && cp <= EXA_CP1) {
247
22.3k
    return cp - EXA_CP0 + EXA_OFFSET;
248
22.3k
  }
249
13.9M
  if (EXB_CP0 <= cp && cp <= EXB_CP1) {
250
4.48k
    return cp - EXB_CP0 + EXB_OFFSET;
251
4.48k
  }
252
13.9M
  if (ssv_ == SSV_STD_EXA_EXB_SPC && SPC_CP0 <= cp && cp <= SPC_CP1) {
253
3.15k
    return cp - SPC_CP0 + SPC_OFFSET;
254
3.15k
  }
255
13.9M
  return 0;
256
13.9M
}
257
258
259
//----------------------------------------------------------------------------
260
261
// Reads standard detection modes from embedded data.
262
433
// Creates the detector's model from the data embedded in this file, after
// checking that the embedded data is non-empty.
ZawgyiDetector::ZawgyiDetector() {
  CHECK(kModelData) << " null model_data loaded";
  CHECK(kModelSize > 0) << " model size = " << kModelSize;
  VLOG(2) << "model_data size = " << kModelSize;

  // TODO: Check kModelSize when reading the model?
  const uint8_t* const model_bytes = kModelData;
  model_ = new ZawgyiUnicodeMarkovModel(model_bytes);
}
269
270
433
// Destroys the model created in the constructor.
ZawgyiDetector::~ZawgyiDetector() { delete model_; }
273
274
double ZawgyiDetector::GetZawgyiProbability(const char* input_utf8,
275
433
                                            int32_t length) const {
276
433
  return model_->Predict(input_utf8, length);
277
433
}
278
279
// C bindings (declared with extern "C").
280
0
// Allocates a ZawgyiDetector and returns it as a C handle. The handle should
// be released with GMTCloseZawgyiDetector.
GMTZawgyiDetector* GMTOpenZawgyiDetector(void) {
  ZawgyiDetector* cppDetector = new ZawgyiDetector();
  return reinterpret_cast<GMTZawgyiDetector*>(cppDetector);
}
283
284
0
// Deletes the ZawgyiDetector behind a C handle.
void GMTCloseZawgyiDetector(GMTZawgyiDetector* detector) {
  delete reinterpret_cast<ZawgyiDetector*>(detector);
}
288
289
0
// Convenience wrapper: calls GMTGetZawgyiProbabilityWithLength with a length
// of -1.
double GMTGetZawgyiProbability(GMTZawgyiDetector* detector, const char* input_utf8) {
  constexpr int32_t kUnspecifiedLength = -1;
  return GMTGetZawgyiProbabilityWithLength(detector, input_utf8,
                                           kUnspecifiedLength);
}
292
293
0
// Converts the C handle back to a ZawgyiDetector and returns the result of
// its GetZawgyiProbability() for the given input and length.
double GMTGetZawgyiProbabilityWithLength(GMTZawgyiDetector* detector, const char* input_utf8, int32_t length) {
  return reinterpret_cast<ZawgyiDetector*>(detector)
      ->GetZawgyiProbability(input_utf8, length);
}