/src/mozilla-central/intl/unicharutil/util/IrishCasing.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
6 | | /****************************************************************************** |
7 | | |
8 | | This file provides a finite state machine to support Irish Gaelic uppercasing |
9 | | rules. |
10 | | |
11 | | The caller will need to iterate through a string, passing a State variable |
12 | | along with the current character to each UpperCase call and checking the flags |
13 | | that are returned: |
14 | | |
15 | | If aMarkPos is true, caller must remember the current index in the string as |
16 | | a possible target for a future action. |
17 | | |
18 | | If aAction is non-zero, then one or more characters from the marked index are |
19 | | to be modified: |
20 | | 1 lowercase the marked letter |
21 | | 2 lowercase the marked letter and its successor |
22 | | 3 lowercase the marked letter, and delete its successor |
23 | | |
24 | | |
25 | | ### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639, |
26 | | ### comments 1 and 4: |
27 | | |
28 | | v = [a,á,e,é,i,í,o,ó,u,ú] |
29 | | V = [A,Á,E,É,I,Í,O,Ó,U,Ú] |
30 | | |
31 | | bhf -> bhF |
32 | | bhF -> bhF |
33 | | bp -> bP |
34 | | bP -> bP |
35 | | dt -> dT |
36 | | dT -> dT |
37 | | gc -> gC |
38 | | gC -> gC |
39 | | h{V} -> h{V} |
40 | | mb -> mB |
41 | | mB -> mB |
42 | | n-{v} -> n{V} |
43 | | n{V} -> n{V} |
44 | | nd -> nD |
45 | | nD -> nD |
46 | | ng -> nG |
47 | | nG -> nG |
48 | | t-{v} -> t{V} |
49 | | t{V} -> t{V} |
50 | | ts{v} -> tS{V} |
51 | | tS{v} -> tS{V} |
52 | | tS{V} -> tS{V} |
53 | | tsl -> tSL |
54 | | tSl -> tSL |
55 | | tSL -> tSL |
56 | | tsn -> tSN |
57 | | tSn -> tSN |
58 | | tSN -> tSN |
59 | | tsr -> tSR |
60 | | tSr -> tSR |
61 | | tSR -> tSR |
62 | | |
63 | | ### Create table of states and actions for each input class. |
64 | | |
65 | | Start (non-word) state is #; generic in-word state is _, once we know there's |
66 | | no special action to do in this word. |
67 | | |
68 | | # _ b bh d g h m n n- t t- ts |
69 | | input\state |
70 | | b b' _ _ _ _ _ _ 1 _ _ _ _ _ |
71 | | B _ _ _ _ _ _ _ 1 _ _ _ _ _ |
72 | | c _ _ _ _ _ 1 _ _ _ _ _ _ _ |
73 | | C _ _ _ _ _ 1 _ _ _ _ _ _ _ |
74 | | d d' _ _ _ _ _ _ _ 1 _ _ _ _ |
75 | | D _ _ _ _ _ _ _ _ 1 _ _ _ _ |
76 | | f _ _ _ 2 _ _ _ _ _ _ _ _ _ |
77 | | F _ _ _ 2 _ _ _ _ _ _ _ _ _ |
78 | | g g' _ _ _ _ _ _ _ 1 _ _ _ _ |
79 | | G _ _ _ _ _ _ _ _ 1 _ _ _ _ |
80 | | h h' _ bh _ _ _ _ _ _ _ _ _ _ |
81 | | l _ _ _ _ _ _ _ _ _ _ _ _ 1 |
82 | | L _ _ _ _ _ _ _ _ _ _ _ _ 1 |
83 | | m m' _ _ _ _ _ _ _ _ _ _ _ _ |
84 | | n n' _ _ _ _ _ _ _ _ _ _ _ 1 |
85 | | N _ _ _ _ _ _ _ _ _ _ _ _ 1 |
86 | | p _ _ 1 _ _ _ _ _ _ _ _ _ _ |
87 | | P _ _ 1 _ _ _ _ _ _ _ _ _ _ |
88 | | r _ _ _ _ _ _ _ _ _ _ _ _ 1 |
89 | | R _ _ _ _ _ _ _ _ _ _ _ _ 1 |
90 | | s _ _ _ _ _ _ _ _ _ _ ts _ _ |
91 | | S _ _ _ _ _ _ _ _ _ _ ts _ _ |
92 | | t t' _ _ _ 1 _ _ _ _ _ _ _ _ |
93 | | T _ _ _ _ 1 _ _ _ _ _ _ _ _ |
94 | | vowel _ _ _ _ _ _ _ _ _ 1d _ 1d 1 |
95 | | Vowel _ _ _ _ _ _ 1 _ 1 _ 1 _ 1 |
96 | | hyph _ _ _ _ _ _ _ _ n- _ t- _ _ |
97 | | letter _ _ _ _ _ _ _ _ _ _ _ _ _ |
98 | | other # # # # # # # # # # # # # |
99 | | |
100 | | Actions: |
101 | | 1 lowercase one letter at start of word |
102 | | 2 lowercase two letters at start of word |
103 | | 1d lowercase one letter at start of word, and delete next |
104 | | (and then go to state _, nothing further to do in this word) |
105 | | |
106 | | else just go to the given state; suffix ' indicates mark start-of-word. |
107 | | |
108 | | ### Consolidate identical states and classes: |
109 | | |
110 | | 0 1 2 3 4 5 6 7 8 9 A B |
111 | | # _ b bh d g h m n [nt]- t ts |
112 | | input\state |
113 | | b b' _ _ _ _ _ _ 1 _ _ _ _ |
114 | | B _ _ _ _ _ _ _ 1 _ _ _ _ |
115 | | [cC] _ _ _ _ _ 1 _ _ _ _ _ _ |
116 | | d d' _ _ _ _ _ _ _ 1 _ _ _ |
117 | | [DG] _ _ _ _ _ _ _ _ 1 _ _ _ |
118 | | [fF] _ _ _ 2 _ _ _ _ _ _ _ _ |
119 | | g g' _ _ _ _ _ _ _ 1 _ _ _ |
120 | | h h' _ bh _ _ _ _ _ _ _ _ _ |
121 | | [lLNrR] _ _ _ _ _ _ _ _ _ _ _ 1 |
122 | | m m' _ _ _ _ _ _ _ _ _ _ _ |
123 | | n n' _ _ _ _ _ _ _ _ _ _ 1 |
124 | | [pP] _ _ 1 _ _ _ _ _ _ _ _ _ |
125 | | [sS] _ _ _ _ _ _ _ _ _ _ ts _ |
126 | | t t' _ _ _ 1 _ _ _ _ _ _ _ |
127 | | T _ _ _ _ 1 _ _ _ _ _ _ _ |
128 | | vowel _ _ _ _ _ _ _ _ _ 1d _ 1 |
129 | | Vowel _ _ _ _ _ _ 1 _ 1 _ 1 1 |
130 | | hyph _ _ _ _ _ _ _ _ [nt]- _ [nt]- _ |
131 | | letter _ _ _ _ _ _ _ _ _ _ _ _ |
132 | | other # # # # # # # # # # # # |
133 | | |
134 | | So we have 20 input classes, and 12 states. |
135 | | |
136 | | State table array will contain bytes that encode action and new state: |
137 | | |
138 | | 0x80 - bit flag: mark start-of-word position |
139 | | 0x40 - currently unused |
140 | | 0x30 - action mask: 4 values |
141 | | 0x00 - do nothing |
142 | | 0x10 - lowercase one letter |
143 | | 0x20 - lowercase two letters |
144 | | 0x30 - lowercase one, delete one |
145 | | 0x0F - next-state mask |
146 | | ******************************************************************************/ |
147 | | |
148 | | #include "IrishCasing.h" |
149 | | |
150 | | #include "nsUnicodeProperties.h" |
151 | | #include "nsUnicharUtils.h" |
152 | | |
153 | | namespace mozilla { |
154 | | |
155 | | const uint8_t |
156 | | IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = { |
157 | | // # _ b bh d g h m n [nt]- t ts |
158 | | { 0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // b |
159 | | { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // B |
160 | | { 0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [cC] |
161 | | { 0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // d |
162 | | { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // [DG] |
163 | | { 0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [fF] |
164 | | { 0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // g |
165 | | { 0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // h |
166 | | { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // [lLNrR] |
167 | | { 0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // m |
168 | | { 0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // n |
169 | | { 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [pP] |
170 | | { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, 0x01 }, // [sS] |
171 | | { 0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // t |
172 | | { 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // T |
173 | | { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01, 0x11 }, // vowel |
174 | | { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11, 0x11 }, // Vowel |
175 | | { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09, 0x01 }, // hyph |
176 | | { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // letter |
177 | | { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } // other |
178 | | }; |
179 | | |
// Non-ASCII code points that GetClass() compares against explicitly.

// Lowercase vowels carrying an acute accent.
#define a_ACUTE         0x00E1
#define e_ACUTE         0x00E9
#define i_ACUTE         0x00ED
#define o_ACUTE         0x00F3
#define u_ACUTE         0x00FA

// Uppercase vowels carrying an acute accent.
#define A_ACUTE         0x00C1
#define E_ACUTE         0x00C9
#define I_ACUTE         0x00CD
#define O_ACUTE         0x00D3
#define U_ACUTE         0x00DA

// Hyphen characters treated like ASCII '-'.
#define HYPHEN          0x2010
#define NO_BREAK_HYPHEN 0x2011
192 | | |
193 | | const uint8_t IrishCasing::sLcClasses[26] = { |
194 | | kClass_vowel, kClass_b, kClass_cC, kClass_d, kClass_vowel, |
195 | | kClass_fF, kClass_g, kClass_h, kClass_vowel, kClass_letter, |
196 | | kClass_letter, kClass_lLNrR, kClass_m, kClass_n, kClass_vowel, |
197 | | kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_t, |
198 | | kClass_vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter, |
199 | | kClass_letter |
200 | | }; |
201 | | |
202 | | const uint8_t IrishCasing::sUcClasses[26] = { |
203 | | kClass_Vowel, kClass_B, kClass_cC, kClass_DG, kClass_Vowel, |
204 | | kClass_fF, kClass_DG, kClass_letter, kClass_Vowel, kClass_letter, |
205 | | kClass_letter, kClass_lLNrR, kClass_letter, kClass_lLNrR, kClass_Vowel, |
206 | | kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_T, |
207 | | kClass_Vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter, |
208 | | kClass_letter |
209 | | }; |
210 | | |
211 | | uint8_t |
212 | | IrishCasing::GetClass(uint32_t aCh) |
213 | 0 | { |
214 | 0 | using mozilla::unicode::GetGenCategory; |
215 | 0 | if (aCh >= 'a' && aCh <= 'z') { |
216 | 0 | return sLcClasses[aCh - 'a']; |
217 | 0 | } else if (aCh >= 'A' && aCh <= 'Z') { |
218 | 0 | return sUcClasses[aCh - 'A']; |
219 | 0 | } else if (GetGenCategory(aCh) == nsUGenCategory::kLetter) { |
220 | 0 | if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE || |
221 | 0 | aCh == o_ACUTE || aCh == u_ACUTE) { |
222 | 0 | return kClass_vowel; |
223 | 0 | } else if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE || |
224 | 0 | aCh == O_ACUTE || aCh == U_ACUTE) { |
225 | 0 | return kClass_Vowel; |
226 | 0 | } else { |
227 | 0 | return kClass_letter; |
228 | 0 | } |
229 | 0 | } else if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) { |
230 | 0 | return kClass_hyph; |
231 | 0 | } else { |
232 | 0 | return kClass_other; |
233 | 0 | } |
234 | 0 | } |
235 | | |
236 | | uint32_t |
237 | | IrishCasing::UpperCase(uint32_t aCh, State& aState, |
238 | | bool& aMarkPos, uint8_t& aAction) |
239 | 0 | { |
240 | 0 | uint8_t cls = GetClass(aCh); |
241 | 0 | uint8_t stateEntry = sUppercaseStateTable[cls][aState]; |
242 | 0 | aMarkPos = !!(stateEntry & kMarkPositionFlag); |
243 | 0 | aAction = (stateEntry & kActionMask) >> kActionShift; |
244 | 0 | aState = State(stateEntry & kNextStateMask); |
245 | 0 |
|
246 | 0 | return ToUpperCase(aCh); |
247 | 0 | } |
248 | | |
249 | | } // namespace mozilla |