/src/mozilla-central/intl/unicharutil/util/GreekCasing.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
6 | | #include "GreekCasing.h" |
7 | | #include "nsUnicharUtils.h" |
8 | | #include "nsUnicodeProperties.h" |
9 | | |
10 | | // Custom uppercase mapping for Greek; see bug 307039 for details |
// Code points consulted by the Greek-specific uppercase mapping below.
// These are typed, file-local constants (rather than preprocessor macros) so
// that they obey normal scoping rules and cannot leak into other sources;
// being constant expressions they remain usable as `case` labels.
namespace {

// Lowercase vowels: plain, with tonos, with oxia, and (for iota/upsilon)
// with dialytika and dialytika+accent.
constexpr uint32_t GREEK_LOWER_ALPHA = 0x03B1;
constexpr uint32_t GREEK_LOWER_ALPHA_TONOS = 0x03AC;
constexpr uint32_t GREEK_LOWER_ALPHA_OXIA = 0x1F71;
constexpr uint32_t GREEK_LOWER_EPSILON = 0x03B5;
constexpr uint32_t GREEK_LOWER_EPSILON_TONOS = 0x03AD;
constexpr uint32_t GREEK_LOWER_EPSILON_OXIA = 0x1F73;
constexpr uint32_t GREEK_LOWER_ETA = 0x03B7;
constexpr uint32_t GREEK_LOWER_ETA_TONOS = 0x03AE;
constexpr uint32_t GREEK_LOWER_ETA_OXIA = 0x1F75;
constexpr uint32_t GREEK_LOWER_IOTA = 0x03B9;
constexpr uint32_t GREEK_LOWER_IOTA_TONOS = 0x03AF;
constexpr uint32_t GREEK_LOWER_IOTA_OXIA = 0x1F77;
constexpr uint32_t GREEK_LOWER_IOTA_DIALYTIKA = 0x03CA;
constexpr uint32_t GREEK_LOWER_IOTA_DIALYTIKA_TONOS = 0x0390;
constexpr uint32_t GREEK_LOWER_IOTA_DIALYTIKA_OXIA = 0x1FD3;
constexpr uint32_t GREEK_LOWER_OMICRON = 0x03BF;
constexpr uint32_t GREEK_LOWER_OMICRON_TONOS = 0x03CC;
constexpr uint32_t GREEK_LOWER_OMICRON_OXIA = 0x1F79;
constexpr uint32_t GREEK_LOWER_UPSILON = 0x03C5;
constexpr uint32_t GREEK_LOWER_UPSILON_TONOS = 0x03CD;
constexpr uint32_t GREEK_LOWER_UPSILON_OXIA = 0x1F7B;
constexpr uint32_t GREEK_LOWER_UPSILON_DIALYTIKA = 0x03CB;
constexpr uint32_t GREEK_LOWER_UPSILON_DIALYTIKA_TONOS = 0x03B0;
constexpr uint32_t GREEK_LOWER_UPSILON_DIALYTIKA_OXIA = 0x1FE3;
constexpr uint32_t GREEK_LOWER_OMEGA = 0x03C9;
constexpr uint32_t GREEK_LOWER_OMEGA_TONOS = 0x03CE;
constexpr uint32_t GREEK_LOWER_OMEGA_OXIA = 0x1F7D;

// Uppercase vowels: plain and with dialytika.
constexpr uint32_t GREEK_UPPER_ALPHA = 0x0391;
constexpr uint32_t GREEK_UPPER_EPSILON = 0x0395;
constexpr uint32_t GREEK_UPPER_ETA = 0x0397;
constexpr uint32_t GREEK_UPPER_IOTA = 0x0399;
constexpr uint32_t GREEK_UPPER_IOTA_DIALYTIKA = 0x03AA;
constexpr uint32_t GREEK_UPPER_OMICRON = 0x039F;
constexpr uint32_t GREEK_UPPER_UPSILON = 0x03A5;
constexpr uint32_t GREEK_UPPER_UPSILON_DIALYTIKA = 0x03AB;
constexpr uint32_t GREEK_UPPER_OMEGA = 0x03A9;

// Uppercase vowels with tonos / oxia.
constexpr uint32_t GREEK_UPPER_ALPHA_TONOS = 0x0386;
constexpr uint32_t GREEK_UPPER_ALPHA_OXIA = 0x1FBB;
constexpr uint32_t GREEK_UPPER_EPSILON_TONOS = 0x0388;
constexpr uint32_t GREEK_UPPER_EPSILON_OXIA = 0x1FC9;
constexpr uint32_t GREEK_UPPER_ETA_TONOS = 0x0389;
constexpr uint32_t GREEK_UPPER_ETA_OXIA = 0x1FCB;
constexpr uint32_t GREEK_UPPER_IOTA_TONOS = 0x038A;
constexpr uint32_t GREEK_UPPER_IOTA_OXIA = 0x1FDB;
constexpr uint32_t GREEK_UPPER_OMICRON_TONOS = 0x038C;
constexpr uint32_t GREEK_UPPER_OMICRON_OXIA = 0x1FF9;
constexpr uint32_t GREEK_UPPER_UPSILON_TONOS = 0x038E;
constexpr uint32_t GREEK_UPPER_UPSILON_OXIA = 0x1FEB;
constexpr uint32_t GREEK_UPPER_OMEGA_TONOS = 0x038F;
constexpr uint32_t GREEK_UPPER_OMEGA_OXIA = 0x1FFB;

// Combining marks.
constexpr uint32_t COMBINING_ACUTE_ACCENT = 0x0301;
constexpr uint32_t COMBINING_DIAERESIS = 0x0308;
constexpr uint32_t COMBINING_ACUTE_TONE_MARK = 0x0341;
constexpr uint32_t COMBINING_GREEK_DIALYTIKA_TONOS = 0x0344;

}  // namespace
65 | | |
66 | | namespace mozilla { |
67 | | |
68 | | uint32_t |
69 | | GreekCasing::UpperCase(uint32_t aCh, GreekCasing::State& aState, |
70 | | bool& aMarkEtaPos, bool& aUpdateMarkedEta) |
71 | 0 | { |
72 | 0 | aMarkEtaPos = false; |
73 | 0 | aUpdateMarkedEta = false; |
74 | 0 |
|
75 | 0 | uint8_t category = unicode::GetGeneralCategory(aCh); |
76 | 0 |
|
77 | 0 | if (aState == kEtaAccMarked) { |
78 | 0 | switch (category) { |
79 | 0 | case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER: |
80 | 0 | case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER: |
81 | 0 | case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER: |
82 | 0 | case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER: |
83 | 0 | case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER: |
84 | 0 | case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK: |
85 | 0 | case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK: |
86 | 0 | case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK: |
87 | 0 | aUpdateMarkedEta = true; |
88 | 0 | break; |
89 | 0 | default: |
90 | 0 | break; |
91 | 0 | } |
92 | 0 | aState = kEtaAcc; |
93 | 0 | } |
94 | 0 |
|
95 | 0 | switch (aCh) { |
96 | 0 | case GREEK_UPPER_ALPHA: |
97 | 0 | case GREEK_LOWER_ALPHA: |
98 | 0 | aState = kAlpha; |
99 | 0 | return GREEK_UPPER_ALPHA; |
100 | 0 |
|
101 | 0 | case GREEK_UPPER_EPSILON: |
102 | 0 | case GREEK_LOWER_EPSILON: |
103 | 0 | aState = kEpsilon; |
104 | 0 | return GREEK_UPPER_EPSILON; |
105 | 0 |
|
106 | 0 | case GREEK_UPPER_ETA: |
107 | 0 | case GREEK_LOWER_ETA: |
108 | 0 | aState = kEta; |
109 | 0 | return GREEK_UPPER_ETA; |
110 | 0 |
|
111 | 0 | case GREEK_UPPER_IOTA: |
112 | 0 | aState = kIota; |
113 | 0 | return GREEK_UPPER_IOTA; |
114 | 0 |
|
115 | 0 | case GREEK_UPPER_OMICRON: |
116 | 0 | case GREEK_LOWER_OMICRON: |
117 | 0 | aState = kOmicron; |
118 | 0 | return GREEK_UPPER_OMICRON; |
119 | 0 |
|
120 | 0 | case GREEK_UPPER_UPSILON: |
121 | 0 | switch (aState) { |
122 | 0 | case kOmicron: |
123 | 0 | aState = kOmicronUpsilon; |
124 | 0 | break; |
125 | 0 | default: |
126 | 0 | aState = kUpsilon; |
127 | 0 | break; |
128 | 0 | } |
129 | 0 | return GREEK_UPPER_UPSILON; |
130 | 0 |
|
131 | 0 | case GREEK_UPPER_OMEGA: |
132 | 0 | case GREEK_LOWER_OMEGA: |
133 | 0 | aState = kOmega; |
134 | 0 | return GREEK_UPPER_OMEGA; |
135 | 0 |
|
136 | 0 | // iota and upsilon may be the second vowel of a diphthong |
137 | 0 | case GREEK_LOWER_IOTA: |
138 | 0 | switch (aState) { |
139 | 0 | case kAlphaAcc: |
140 | 0 | case kEpsilonAcc: |
141 | 0 | case kOmicronAcc: |
142 | 0 | case kUpsilonAcc: |
143 | 0 | aState = kInWord; |
144 | 0 | return GREEK_UPPER_IOTA_DIALYTIKA; |
145 | 0 | default: |
146 | 0 | break; |
147 | 0 | } |
148 | 0 | aState = kIota; |
149 | 0 | return GREEK_UPPER_IOTA; |
150 | 0 |
|
151 | 0 | case GREEK_LOWER_UPSILON: |
152 | 0 | switch (aState) { |
153 | 0 | case kAlphaAcc: |
154 | 0 | case kEpsilonAcc: |
155 | 0 | case kEtaAcc: |
156 | 0 | case kOmicronAcc: |
157 | 0 | aState = kInWord; |
158 | 0 | return GREEK_UPPER_UPSILON_DIALYTIKA; |
159 | 0 | case kOmicron: |
160 | 0 | aState = kOmicronUpsilon; |
161 | 0 | break; |
162 | 0 | default: |
163 | 0 | aState = kUpsilon; |
164 | 0 | break; |
165 | 0 | } |
166 | 0 | return GREEK_UPPER_UPSILON; |
167 | 0 |
|
168 | 0 | case GREEK_UPPER_IOTA_DIALYTIKA: |
169 | 0 | case GREEK_LOWER_IOTA_DIALYTIKA: |
170 | 0 | case GREEK_UPPER_UPSILON_DIALYTIKA: |
171 | 0 | case GREEK_LOWER_UPSILON_DIALYTIKA: |
172 | 0 | case COMBINING_DIAERESIS: |
173 | 0 | aState = kDiaeresis; |
174 | 0 | return ToUpperCase(aCh); |
175 | 0 |
|
176 | 0 | // remove accent if it follows a vowel or diaeresis, |
177 | 0 | // and set appropriate state for diphthong detection |
178 | 0 | case COMBINING_ACUTE_ACCENT: |
179 | 0 | case COMBINING_ACUTE_TONE_MARK: |
180 | 0 | switch (aState) { |
181 | 0 | case kAlpha: |
182 | 0 | aState = kAlphaAcc; |
183 | 0 | return uint32_t(-1); // omit this char from result string |
184 | 0 | case kEpsilon: |
185 | 0 | aState = kEpsilonAcc; |
186 | 0 | return uint32_t(-1); |
187 | 0 | case kEta: |
188 | 0 | aState = kEtaAcc; |
189 | 0 | return uint32_t(-1); |
190 | 0 | case kIota: |
191 | 0 | aState = kIotaAcc; |
192 | 0 | return uint32_t(-1); |
193 | 0 | case kOmicron: |
194 | 0 | aState = kOmicronAcc; |
195 | 0 | return uint32_t(-1); |
196 | 0 | case kUpsilon: |
197 | 0 | aState = kUpsilonAcc; |
198 | 0 | return uint32_t(-1); |
199 | 0 | case kOmicronUpsilon: |
200 | 0 | aState = kInWord; // this completed a diphthong |
201 | 0 | return uint32_t(-1); |
202 | 0 | case kOmega: |
203 | 0 | aState = kOmegaAcc; |
204 | 0 | return uint32_t(-1); |
205 | 0 | case kDiaeresis: |
206 | 0 | aState = kInWord; |
207 | 0 | return uint32_t(-1); |
208 | 0 | default: |
209 | 0 | break; |
210 | 0 | } |
211 | 0 | break; |
212 | 0 |
|
213 | 0 | // combinations with dieresis+accent just strip the accent, |
214 | 0 | // and reset to start state (don't form diphthong with following vowel) |
215 | 0 | case GREEK_LOWER_IOTA_DIALYTIKA_TONOS: |
216 | 0 | case GREEK_LOWER_IOTA_DIALYTIKA_OXIA: |
217 | 0 | aState = kInWord; |
218 | 0 | return GREEK_UPPER_IOTA_DIALYTIKA; |
219 | 0 |
|
220 | 0 | case GREEK_LOWER_UPSILON_DIALYTIKA_TONOS: |
221 | 0 | case GREEK_LOWER_UPSILON_DIALYTIKA_OXIA: |
222 | 0 | aState = kInWord; |
223 | 0 | return GREEK_UPPER_UPSILON_DIALYTIKA; |
224 | 0 |
|
225 | 0 | case COMBINING_GREEK_DIALYTIKA_TONOS: |
226 | 0 | aState = kInWord; |
227 | 0 | return COMBINING_DIAERESIS; |
228 | 0 |
|
229 | 0 | // strip accents from vowels, and note the vowel seen so that we can detect |
230 | 0 | // diphthongs where diaeresis needs to be added |
231 | 0 | case GREEK_LOWER_ALPHA_TONOS: |
232 | 0 | case GREEK_LOWER_ALPHA_OXIA: |
233 | 0 | case GREEK_UPPER_ALPHA_TONOS: |
234 | 0 | case GREEK_UPPER_ALPHA_OXIA: |
235 | 0 | aState = kAlphaAcc; |
236 | 0 | return GREEK_UPPER_ALPHA; |
237 | 0 |
|
238 | 0 | case GREEK_LOWER_EPSILON_TONOS: |
239 | 0 | case GREEK_LOWER_EPSILON_OXIA: |
240 | 0 | case GREEK_UPPER_EPSILON_TONOS: |
241 | 0 | case GREEK_UPPER_EPSILON_OXIA: |
242 | 0 | aState = kEpsilonAcc; |
243 | 0 | return GREEK_UPPER_EPSILON; |
244 | 0 |
|
245 | 0 | case GREEK_LOWER_ETA_TONOS: |
246 | 0 | case GREEK_UPPER_ETA_TONOS: |
247 | 0 | if (aState == kStart) { |
248 | 0 | aState = kEtaAccMarked; |
249 | 0 | aMarkEtaPos = true; // mark in case we need to remove the tonos later |
250 | 0 | return GREEK_UPPER_ETA_TONOS; // treat as disjunctive eta for now |
251 | 0 | } |
252 | 0 | // if not in initial state, fall through to strip the accent |
253 | 0 | MOZ_FALLTHROUGH; |
254 | 0 |
|
255 | 0 | case GREEK_LOWER_ETA_OXIA: |
256 | 0 | case GREEK_UPPER_ETA_OXIA: |
257 | 0 | aState = kEtaAcc; |
258 | 0 | return GREEK_UPPER_ETA; |
259 | 0 |
|
260 | 0 | case GREEK_LOWER_IOTA_TONOS: |
261 | 0 | case GREEK_LOWER_IOTA_OXIA: |
262 | 0 | case GREEK_UPPER_IOTA_TONOS: |
263 | 0 | case GREEK_UPPER_IOTA_OXIA: |
264 | 0 | aState = kIotaAcc; |
265 | 0 | return GREEK_UPPER_IOTA; |
266 | 0 |
|
267 | 0 | case GREEK_LOWER_OMICRON_TONOS: |
268 | 0 | case GREEK_LOWER_OMICRON_OXIA: |
269 | 0 | case GREEK_UPPER_OMICRON_TONOS: |
270 | 0 | case GREEK_UPPER_OMICRON_OXIA: |
271 | 0 | aState = kOmicronAcc; |
272 | 0 | return GREEK_UPPER_OMICRON; |
273 | 0 |
|
274 | 0 | case GREEK_LOWER_UPSILON_TONOS: |
275 | 0 | case GREEK_LOWER_UPSILON_OXIA: |
276 | 0 | case GREEK_UPPER_UPSILON_TONOS: |
277 | 0 | case GREEK_UPPER_UPSILON_OXIA: |
278 | 0 | switch (aState) { |
279 | 0 | case kOmicron: |
280 | 0 | aState = kInWord; // this completed a diphthong |
281 | 0 | break; |
282 | 0 | default: |
283 | 0 | aState = kUpsilonAcc; |
284 | 0 | break; |
285 | 0 | } |
286 | 0 | return GREEK_UPPER_UPSILON; |
287 | 0 |
|
288 | 0 | case GREEK_LOWER_OMEGA_TONOS: |
289 | 0 | case GREEK_LOWER_OMEGA_OXIA: |
290 | 0 | case GREEK_UPPER_OMEGA_TONOS: |
291 | 0 | case GREEK_UPPER_OMEGA_OXIA: |
292 | 0 | aState = kOmegaAcc; |
293 | 0 | return GREEK_UPPER_OMEGA; |
294 | 0 | } |
295 | 0 | |
296 | 0 | // all other characters just reset the state to either kStart or kInWord, |
297 | 0 | // and use standard mappings |
298 | 0 | switch (category) { |
299 | 0 | case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER: |
300 | 0 | case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER: |
301 | 0 | case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER: |
302 | 0 | case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER: |
303 | 0 | case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER: |
304 | 0 | case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK: |
305 | 0 | case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK: |
306 | 0 | case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK: |
307 | 0 | aState = kInWord; |
308 | 0 | break; |
309 | 0 | default: |
310 | 0 | aState = kStart; |
311 | 0 | break; |
312 | 0 | } |
313 | 0 | |
314 | 0 | return ToUpperCase(aCh); |
315 | 0 | } |
316 | | |
317 | | } // namespace mozilla |