Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/unicharutil/util/IrishCasing.cpp
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* This Source Code Form is subject to the terms of the Mozilla Public
3
 * License, v. 2.0. If a copy of the MPL was not distributed with this
4
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6
/******************************************************************************
7
8
This file provides a finite state machine to support Irish Gaelic uppercasing
9
rules.
10
11
The caller will need to iterate through a string, passing a State variable
12
along with the current character to each UpperCase call and checking the flags
13
that are returned:
14
15
  If aMarkPos is true, caller must remember the current index in the string as
16
  a possible target for a future action.
17
18
  If aAction is non-zero, then one or more characters from the marked index are
19
  to be modified:
20
    1  lowercase the marked letter
21
    2  lowercase the marked letter and its successor
22
    3  lowercase the marked letter, and delete its successor
23
24
25
### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639,
26
### comments 1 and 4:
27
28
v = [a,á,e,é,i,í,o,ó,u,ú]
29
V = [A,Á,E,É,I,Í,O,Ó,U,Ú]
30
31
bhf -> bhF
32
bhF -> bhF
33
bp  -> bP
34
bP  -> bP
35
dt  -> dT
36
dT  -> dT
37
gc  -> gC
38
gC  -> gC
39
h{V}  -> h{V}
40
mb  -> mB
41
mB  -> mB
42
n-{v} -> n{V}
43
n{V} -> n{V}
44
nd  -> nD
45
nD  -> nD
46
ng  -> nG
47
nG  -> nG
48
t-{v} -> t{V}
49
t{V} -> t{V}
50
ts{v} -> tS{V}
51
tS{v} -> tS{V}
52
tS{V} -> tS{V}
53
tsl  -> tSL
54
tSl  -> tSL
55
tSL  -> tSL
56
tsn  -> tSN
57
tSn  -> tSN
58
tSN  -> tSN
59
tsr  -> tSR
60
tSr  -> tSR
61
tSR  -> tSR
62
63
### Create table of states and actions for each input class.
64
65
Start (non-word) state is #; generic in-word state is _, once we know there's
66
no special action to do in this word.
67
68
         #   _   b   bh  d   g   h   m   n   n-  t   t-  ts
69
input\state
70
b        b'  _   _   _   _   _   _   1   _   _   _   _   _
71
B        _   _   _   _   _   _   _   1   _   _   _   _   _
72
c        _   _   _   _   _   1   _   _   _   _   _   _   _
73
C        _   _   _   _   _   1   _   _   _   _   _   _   _
74
d        d'  _   _   _   _   _   _   _   1   _   _   _   _
75
D        _   _   _   _   _   _   _   _   1   _   _   _   _
76
f        _   _   _   2   _   _   _   _   _   _   _   _   _
77
F        _   _   _   2   _   _   _   _   _   _   _   _   _
78
g        g'  _   _   _   _   _   _   _   1   _   _   _   _
79
G        _   _   _   _   _   _   _   _   1   _   _   _   _
80
h        h'  _   bh  _   _   _   _   _   _   _   _   _   _
81
l        _   _   _   _   _   _   _   _   _   _   _   _   1
82
L        _   _   _   _   _   _   _   _   _   _   _   _   1
83
m        m'  _   _   _   _   _   _   _   _   _   _   _   _
84
n        n'  _   _   _   _   _   _   _   _   _   _   _   1
85
N        _   _   _   _   _   _   _   _   _   _   _   _   1
86
p        _   _   1   _   _   _   _   _   _   _   _   _   _
87
P        _   _   1   _   _   _   _   _   _   _   _   _   _
88
r        _   _   _   _   _   _   _   _   _   _   _   _   1
89
R        _   _   _   _   _   _   _   _   _   _   _   _   1
90
s        _   _   _   _   _   _   _   _   _   _   ts  _   _
91
S        _   _   _   _   _   _   _   _   _   _   ts  _   _
92
t        t'  _   _   _   1   _   _   _   _   _   _   _   _
93
T        _   _   _   _   1   _   _   _   _   _   _   _   _
94
vowel    _   _   _   _   _   _   _   _   _   1d  _   1d  1
95
Vowel    _   _   _   _   _   _   1   _   1   _   1   _   1
96
hyph     _   _   _   _   _   _   _   _   n-  _   t-  _   _
97
letter   _   _   _   _   _   _   _   _   _   _   _   _   _
98
other    #   #   #   #   #   #   #   #   #   #   #   #   #
99
100
Actions:
101
  1            lowercase one letter at start of word
102
  2            lowercase two letters at start of word
103
  1d           lowercase one letter at start of word, and delete next
104
               (and then go to state _, nothing further to do in this word)
105
106
else just go to the given state; suffix ' indicates mark start-of-word.
107
108
### Consolidate identical states and classes:
109
110
         0   1   2   3   4   5   6   7   8   9   A   B
111
         #   _   b   bh  d   g   h   m   n [nt]- t   ts
112
input\state
113
b        b'  _   _   _   _   _   _   1   _   _   _   _
114
B        _   _   _   _   _   _   _   1   _   _   _   _
115
[cC]     _   _   _   _   _   1   _   _   _   _   _   _
116
d        d'  _   _   _   _   _   _   _   1   _   _   _
117
[DG]     _   _   _   _   _   _   _   _   1   _   _   _
118
[fF]     _   _   _   2   _   _   _   _   _   _   _   _
119
g        g'  _   _   _   _   _   _   _   1   _   _   _
120
h        h'  _   bh  _   _   _   _   _   _   _   _   _
121
[lLNrR]  _   _   _   _   _   _   _   _   _   _   _   1
122
m        m'  _   _   _   _   _   _   _   _   _   _   _
123
n        n'  _   _   _   _   _   _   _   _   _   _   1
124
[pP]     _   _   1   _   _   _   _   _   _   _   _   _
125
[sS]     _   _   _   _   _   _   _   _   _   _   ts  _
126
t        t'  _   _   _   1   _   _   _   _   _   _   _
127
T        _   _   _   _   1   _   _   _   _   _   _   _
128
vowel    _   _   _   _   _   _   _   _   _   1d  _   1
129
Vowel    _   _   _   _   _   _   1   _   1   _   1   1
130
hyph     _   _   _   _   _   _   _   _ [nt]- _ [nt]- _
131
letter   _   _   _   _   _   _   _   _   _   _   _   _
132
other    #   #   #   #   #   #   #   #   #   #   #   #
133
134
So we have 20 input classes, and 12 states.
135
136
State table array will contain bytes that encode action and new state:
137
138
  0x80  -  bit flag: mark start-of-word position
139
  0x40  -  currently unused
140
  0x30  -  action mask: 4 values
141
           0x00  -  do nothing
142
           0x10  -  lowercase one letter
143
           0x20  -  lowercase two letters
144
           0x30  -  lowercase one, delete one
145
  0x0F  -  next-state mask
146
******************************************************************************/
147
148
#include "IrishCasing.h"
149
150
#include "nsUnicodeProperties.h"
151
#include "nsUnicharUtils.h"
152
153
namespace mozilla {
154
155
const uint8_t
156
IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = {
157
//  #     _     b     bh    d     g     h     m     n     [nt]- t     ts
158
  { 0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // b
159
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // B
160
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [cC]
161
  { 0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // d
162
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // [DG]
163
  { 0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [fF]
164
  { 0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // g
165
  { 0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // h
166
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // [lLNrR]
167
  { 0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // m
168
  { 0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // n
169
  { 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [pP]
170
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, 0x01 }, // [sS]
171
  { 0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // t
172
  { 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // T
173
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01, 0x11 }, // vowel
174
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11, 0x11 }, // Vowel
175
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09, 0x01 }, // hyph
176
  { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // letter
177
  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }  // other
178
};
179
180
0
// Non-ASCII code points that GetClass treats as hyphens.
#define HYPHEN          0x2010  // U+2010
#define NO_BREAK_HYPHEN 0x2011  // U+2011

// Lowercase vowels with acute accent.
#define a_ACUTE         0x00E1  // U+00E1
#define e_ACUTE         0x00E9  // U+00E9
#define i_ACUTE         0x00ED  // U+00ED
#define o_ACUTE         0x00F3  // U+00F3
#define u_ACUTE         0x00FA  // U+00FA

// Uppercase vowels with acute accent.
#define A_ACUTE         0x00C1  // U+00C1
#define E_ACUTE         0x00C9  // U+00C9
#define I_ACUTE         0x00CD  // U+00CD
#define O_ACUTE         0x00D3  // U+00D3
#define U_ACUTE         0x00DA  // U+00DA
192
193
const uint8_t IrishCasing::sLcClasses[26] = {
194
  kClass_vowel, kClass_b, kClass_cC, kClass_d, kClass_vowel,
195
  kClass_fF, kClass_g, kClass_h, kClass_vowel, kClass_letter,
196
  kClass_letter, kClass_lLNrR, kClass_m, kClass_n, kClass_vowel,
197
  kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_t,
198
  kClass_vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
199
  kClass_letter
200
};
201
202
const uint8_t IrishCasing::sUcClasses[26] = {
203
  kClass_Vowel, kClass_B, kClass_cC, kClass_DG, kClass_Vowel,
204
  kClass_fF, kClass_DG, kClass_letter, kClass_Vowel, kClass_letter,
205
  kClass_letter, kClass_lLNrR, kClass_letter, kClass_lLNrR, kClass_Vowel,
206
  kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_T,
207
  kClass_Vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
208
  kClass_letter
209
};
210
211
uint8_t
212
IrishCasing::GetClass(uint32_t aCh)
213
0
{
214
0
  using mozilla::unicode::GetGenCategory;
215
0
  if (aCh >= 'a' && aCh <= 'z') {
216
0
    return sLcClasses[aCh - 'a'];
217
0
  } else if (aCh >= 'A' && aCh <= 'Z') {
218
0
    return sUcClasses[aCh - 'A'];
219
0
  } else if (GetGenCategory(aCh) == nsUGenCategory::kLetter) {
220
0
    if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE ||
221
0
        aCh == o_ACUTE || aCh == u_ACUTE) {
222
0
      return kClass_vowel;
223
0
    } else if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE ||
224
0
               aCh == O_ACUTE || aCh == U_ACUTE) {
225
0
      return kClass_Vowel;
226
0
    } else {
227
0
      return kClass_letter;
228
0
    }
229
0
  } else if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) {
230
0
    return kClass_hyph;
231
0
  } else {
232
0
    return kClass_other;
233
0
  }
234
0
}
235
236
uint32_t
237
IrishCasing::UpperCase(uint32_t aCh, State& aState,
238
                       bool& aMarkPos, uint8_t& aAction)
239
0
{
240
0
  uint8_t cls = GetClass(aCh);
241
0
  uint8_t stateEntry = sUppercaseStateTable[cls][aState];
242
0
  aMarkPos = !!(stateEntry & kMarkPositionFlag);
243
0
  aAction = (stateEntry & kActionMask) >> kActionShift;
244
0
  aState = State(stateEntry & kNextStateMask);
245
0
246
0
  return ToUpperCase(aCh);
247
0
}
248
249
} // namespace mozilla