/src/mozilla-central/intl/lwbrk/LineBreaker.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
6 | | #include "mozilla/intl/LineBreaker.h" |
7 | | |
8 | | #include "jisx4051class.h" |
9 | | #include "nsComplexBreaker.h" |
10 | | #include "nsTArray.h" |
11 | | #include "nsUnicodeProperties.h" |
12 | | #include "mozilla/ArrayUtils.h" |
13 | | |
14 | | using namespace mozilla::unicode; |
15 | | using namespace mozilla::intl; |
16 | | |
17 | | /*static*/ |
18 | | already_AddRefed<LineBreaker> |
19 | | LineBreaker::Create() |
20 | 3 | { |
21 | 3 | return RefPtr<LineBreaker>(new LineBreaker()).forget(); |
22 | 3 | } |
23 | | |
24 | | /* |
25 | | |
26 | | Simplification of Pair Table in JIS X 4051 |
27 | | |
28 | | 1. The Original Table - in 4.1.3 |
29 | | |
30 | | In JIS x 4051. The pair table is defined as below |
31 | | |
32 | | Class of |
33 | | Leading Class of Trailing Char Class |
34 | | Char |
35 | | |
36 | | 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 |
37 | | * # * # |
38 | | 1 X X X X X X X X X X X X X X X X X X X X X E |
39 | | 2 X X X X X X |
40 | | 3 X X X X X X |
41 | | 4 X X X X X X |
42 | | 5 X X X X X X |
43 | | 6 X X X X X X |
44 | | 7 X X X X X X X |
45 | | 8 X X X X X X E |
46 | | 9 X X X X X X |
47 | | 10 X X X X X X |
48 | | 11 X X X X X X |
49 | | 12 X X X X X X |
50 | | 13 X X X X X X X |
51 | | 14 X X X X X X X |
52 | | 15 X X X X X X X X X |
53 | | 16 X X X X X X X X |
54 | | 17 X X X X X E |
55 | | 18 X X X X X X X X X |
56 | | 19 X E E E E E X X X X X X X X X X X X E X E E |
57 | | 20 X X X X X E |
58 | | |
59 | | * Same Char |
60 | | # Other Char |
61 | | |
62 | | X Cannot Break |
63 | | |
64 | | The classes mean: |
65 | | 1: Open parenthesis |
66 | | 2: Close parenthesis |
67 | | 3: Prohibit a line break before |
68 | | 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?") |
69 | | 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) |
70 | | 6: Full stop |
71 | | 7: Non-breakable between same characters |
72 | | 8: Prefix (e.g., "$", "NO.") |
73 | | 9: Postfix (e.g., "%") |
74 | | 10: Ideographic space |
75 | | 11: Hiragana |
76 | | 12: Japanese characters (except class 11) |
77 | | 13: Subscript |
78 | | 14: Ruby |
79 | | 15: Numeric |
80 | | 16: Alphabet |
81 | | 17: Space for Western language |
82 | | 18: Western characters (except class 17) |
83 | | 19: Split line note (Warichu) begin quote |
84 | | 20: Split line note (Warichu) end quote |
85 | | |
86 | | 2. Simplify by removing the classes which we do not care about |
87 | | |
88 | | However, since we do not care about class 13(Subscript), 14(Ruby), |
89 | | 16 (Alphabet), 19(split line note begin quote), and 20(split line note end |
90 | | quote) we can simplify this pair table into the following |
91 | | |
92 | | Class of |
93 | | Leading Class of Trailing Char Class |
94 | | Char |
95 | | |
96 | | 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 |
97 | | |
98 | | 1 X X X X X X X X X X X X X X X |
99 | | 2 X X X X X |
100 | | 3 X X X X X |
101 | | 4 X X X X X |
102 | | 5 X X X X X |
103 | | 6 X X X X X |
104 | | 7 X X X X X X |
105 | | 8 X X X X X X |
106 | | 9 X X X X X |
107 | | 10 X X X X X |
108 | | 11 X X X X X |
109 | | 12 X X X X X |
110 | | 15 X X X X X X X X |
111 | | 17 X X X X X |
112 | | 18 X X X X X X X |
113 | | |
114 | | 3. Simplified by merged classes |
115 | | |
116 | | After simplification 2, the pair table has some duplication |
117 | | a. class 2, 3, 4, 5, 6, are the same- we can merged them |
118 | | b. class 10, 11, 12, 17 are the same- we can merged them |
119 | | |
120 | | |
121 | | Class of |
122 | | Leading Class of Trailing Char Class |
123 | | Char |
124 | | |
125 | | 1 [a] 7 8 9 [b]15 18 |
126 | | |
127 | | 1 X X X X X X X X |
128 | | [a] X |
129 | | 7 X X |
130 | | 8 X X |
131 | | 9 X |
132 | | [b] X |
133 | | 15 X X X X |
134 | | 18 X X X |
135 | | |
136 | | |
137 | | 4. We add COMPLEX characters and make them breakable with all other classes |
138 | | except after class 1 and before class [a] |
139 | | |
140 | | Class of |
141 | | Leading Class of Trailing Char Class |
142 | | Char |
143 | | |
144 | | 1 [a] 7 8 9 [b]15 18 COMPLEX |
145 | | |
146 | | 1 X X X X X X X X X |
147 | | [a] X |
148 | | 7 X X |
149 | | 8 X X |
150 | | 9 X |
151 | | [b] X |
152 | | 15 X X X X |
153 | | 18 X X X |
154 | | COMPLEX X T |
155 | | |
156 | | T : need special handling |
157 | | |
158 | | |
159 | | 5. However, we need two special classes for some punctuation/parentheses |
160 | | whose breaking rules are like those of character class (18), see bug 389056. |
161 | | We also need punctuation-like characters that behave the same as class 18 |
162 | | but are not letters in any language. (e.g., '_') |
163 | | [c]. Based on open parenthesis class (1), but it is not breakable after |
164 | | character class (18) or numeric class (15). |
165 | | [d]. Based on close parenthesis (or punctuation) class (2), but it is not |
166 | | breakable before character class (18) or numeric class (15). |
167 | | |
168 | | Class of |
169 | | Leading Class of Trailing Char Class |
170 | | Char |
171 | | |
172 | | 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] |
173 | | |
174 | | 1 X X X X X X X X X X X |
175 | | [a] X X X |
176 | | 7 X X |
177 | | 8 X X |
178 | | 9 X |
179 | | [b] X X |
180 | | 15 X X X X X X |
181 | | 18 X X X X X |
182 | | COMPLEX X T |
183 | | [c] X X X X X X X X X X X |
184 | | [d] X X X X |
185 | | |
186 | | |
187 | | 6. And Unicode has "NON-BREAK" characters. Lines should not be broken around |
188 | | them. But JIS X 4051 has no such class; therefore, we create [e]. |
189 | | |
190 | | Class of |
191 | | Leading Class of Trailing Char Class |
192 | | Char |
193 | | |
194 | | 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] |
195 | | |
196 | | 1 X X X X X X X X X X X X |
197 | | [a] X X X |
198 | | 7 X X X |
199 | | 8 X X X |
200 | | 9 X X |
201 | | [b] X X X |
202 | | 15 X X X X X X X |
203 | | 18 X X X X X X |
204 | | COMPLEX X T X |
205 | | [c] X X X X X X X X X X X X |
206 | | [d] X X X X X |
207 | | [e] X X X X X X X X X X X X |
208 | | |
209 | | |
210 | | 7. Now we use one bit to encode whether it is breakable, and use 2 bytes |
211 | | for one row, then the bit table will look like: |
212 | | |
213 | | 18 <- 1 |
214 | | |
215 | | 1 0000 1111 1111 1111 = 0x0FFF |
216 | | [a] 0000 1100 0000 0010 = 0x0C02 |
217 | | 7 0000 1000 0000 0110 = 0x0806 |
218 | | 8 0000 1000 0100 0010 = 0x0842 |
219 | | 9 0000 1000 0000 0010 = 0x0802 |
220 | | [b] 0000 1100 0000 0010 = 0x0C02 |
221 | | 15 0000 1110 1101 0010 = 0x0ED2 |
222 | | 18 0000 1110 1100 0010 = 0x0EC2 |
223 | | COMPLEX 0000 1001 0000 0010 = 0x0902 |
224 | | [c] 0000 1111 1111 1111 = 0x0FFF |
225 | | [d] 0000 1100 1100 0010 = 0x0CC2 |
226 | | [e] 0000 1111 1111 1111 = 0x0FFF |
227 | | */ |
228 | | |
// Number of simplified character classes; each pair table has one row per
// class and uses one bit per class within a row.
#define MAX_CLASSES 12

// Pair table for the normal breaking rules (step 7 of the comment above).
// The row index is the class of the leading character and bit N of a row
// belongs to trailing class N. A set bit means "cannot break" (the X marks in
// the tables above); GetPair() reports a break opportunity when the bit is
// clear.
static const uint16_t gPair[MAX_CLASSES] = {
  0x0FFF,  // 0: class 1 (open parenthesis)
  0x0C02,  // 1: [a] (classes 2, 3, 4, 5, 6)
  0x0806,  // 2: class 7
  0x0842,  // 3: class 8 (prefix)
  0x0802,  // 4: class 9 (postfix)
  0x0C02,  // 5: [b] (classes 10, 11, 12, 17)
  0x0ED2,  // 6: class 15 (numeric)
  0x0EC2,  // 7: class 18 (characters)
  0x0902,  // 8: COMPLEX
  0x0FFF,  // 9: [c] (open-parenthesis-like character)
  0x0CC2,  // A: [d] (close-parenthesis-like character)
  0x0FFF   // B: [e] (non-breakable)
};
245 | | |
246 | | |
247 | | /* |
248 | | |
249 | | 8. And if the character is not far enough from the word start, the word end, |
250 | | or another break point, we should not break in non-CJK languages. |
251 | | I.e., Don't break around 15, 18, [c] and [d], but don't change |
252 | | that if they are related to [b]. |
253 | | |
254 | | Class of |
255 | | Leading Class of Trailing Char Class |
256 | | Char |
257 | | |
258 | | 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] |
259 | | |
260 | | 1 X X X X X X X X X X X X |
261 | | [a] X X X X X X |
262 | | 7 X X X X X X X |
263 | | 8 X X X X X X |
264 | | 9 X X X X X X |
265 | | [b] X X X |
266 | | 15 X X X X X X X X X X X |
267 | | 18 X X X X X X X X X X X |
268 | | COMPLEX X X X T X X X |
269 | | [c] X X X X X X X X X X X X |
270 | | [d] X X X X X X X X X X X |
271 | | [e] X X X X X X X X X X X X |
272 | | |
273 | | 18 <- 1 |
274 | | |
275 | | 1 0000 1111 1111 1111 = 0x0FFF |
276 | | [a] 0000 1110 1100 0010 = 0x0EC2 |
277 | | 7 0000 1110 1100 0110 = 0x0EC6 |
278 | | 8 0000 1110 1100 0010 = 0x0EC2 |
279 | | 9 0000 1110 1100 0010 = 0x0EC2 |
280 | | [b] 0000 1100 0000 0010 = 0x0C02 |
281 | | 15 0000 1111 1101 1111 = 0x0FDF |
282 | | 18 0000 1111 1101 1111 = 0x0FDF |
283 | | COMPLEX 0000 1111 1100 0010 = 0x0FC2 |
284 | | [c] 0000 1111 1111 1111 = 0x0FFF |
285 | | [d] 0000 1111 1101 1111 = 0x0FDF |
286 | | [e] 0000 1111 1111 1111 = 0x0FFF |
287 | | */ |
288 | | |
// Pair table for the conservative breaking rules (step 8 of the comment
// above). Same layout as gPair: the row index is the class of the leading
// character, bit N belongs to trailing class N, and a set bit means "cannot
// break". GetPairConservative() reports a break opportunity when the bit is
// clear.
static const uint16_t gPairConservative[MAX_CLASSES] = {
  0x0FFF,  // 0: class 1 (open parenthesis)
  0x0EC2,  // 1: [a] (classes 2, 3, 4, 5, 6)
  0x0EC6,  // 2: class 7
  0x0EC2,  // 3: class 8 (prefix)
  0x0EC2,  // 4: class 9 (postfix)
  0x0C02,  // 5: [b] (classes 10, 11, 12, 17)
  0x0FDF,  // 6: class 15 (numeric)
  0x0FDF,  // 7: class 18 (characters)
  0x0FC2,  // 8: COMPLEX
  0x0FFF,  // 9: [c] (open-parenthesis-like character)
  0x0FDF,  // A: [d] (close-parenthesis-like character)
  0x0FFF   // B: [e] (non-breakable)
};
303 | | |
304 | | |
305 | | /* |
306 | | |
307 | | 9. Now we map the class to number |
308 | | |
309 | | 0: 1 |
310 | | 1: [a]- 2, 3, 4, 5, 6 |
311 | | 2: 7 |
312 | | 3: 8 |
313 | | 4: 9 |
314 | | 5: [b]- 10, 11, 12, 17 |
315 | | 6: 15 |
316 | | 7: 18 |
317 | | 8: COMPLEX |
318 | | 9: [c] |
319 | | A: [d] |
320 | | B: [e] |
321 | | |
322 | | and they mean: |
323 | | 0: Open parenthesis |
324 | | 1: Punctuation that prohibits break before |
325 | | 2: Non-breakable between same classes |
326 | | 3: Prefix |
327 | | 4: Postfix |
328 | | 5: Breakable character (Spaces and Most Japanese characters) |
329 | | 6: Numeric |
330 | | 7: Characters |
331 | | 8: Need special handling characters (E.g., Thai) |
332 | | 9: Open parentheses like Character (See bug 389056) |
333 | | A: Close parenthese (or punctuations) like Character (See bug 389056) |
334 | | B: Non breakable (See bug 390920) |
335 | | |
336 | | */ |
337 | | |
// Sentinel value (INT8_MAX); it is not a valid index into the pair tables.
// NOTE(review): its users are outside this excerpt -- presumably it marks
// "no class determined yet"; confirm against the callers.
#define CLASS_NONE INT8_MAX

// Simplified line-breaking classes. The values are the row/bit indices of the
// pair tables gPair and gPairConservative (see step 9 of the comment above).
#define CLASS_OPEN 0x00  // Open parenthesis
#define CLASS_CLOSE 0x01  // Punctuation that prohibits a break before it
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02  // Non-breakable between same classes
#define CLASS_PREFIX 0x03  // Prefix
// Postfix. The doubled "F" is part of the existing macro name.
#define CLASS_POSTFFIX 0x04
#define CLASS_BREAKABLE 0x05  // Spaces and most Japanese characters
#define CLASS_NUMERIC 0x06  // Numeric
#define CLASS_CHARACTER 0x07  // Characters
#define CLASS_COMPLEX 0x08  // Needs special handling (e.g., Thai)
#define CLASS_OPEN_LIKE_CHARACTER 0x09  // See bug 389056
#define CLASS_CLOSE_LIKE_CHARACTER 0x0A  // See bug 389056
#define CLASS_NON_BREAKABLE 0x0B  // See bug 390920
352 | | |
// UTF-16 code units for the characters that get special treatment here.
#define U_NULL char16_t(0x0000)
#define U_SLASH char16_t('/')
#define U_SPACE char16_t(' ')
#define U_HYPHEN char16_t('-')
#define U_EQUAL char16_t('=')
#define U_PERCENT char16_t('%')
#define U_AMPERSAND char16_t('&')
#define U_SEMICOLON char16_t(';')
#define U_BACKSLASH char16_t('\\')
#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
#define U_OPEN_GUILLEMET char16_t(0x00AB)

// True for the characters that ContextualAnalysis() has a dedicated branch
// for: hyphens, '/', '%', '&', ';', '\\' and the three opening quote marks.
// Note that the argument is evaluated several times, so it must not have
// side effects.
#define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \
                                     (c) == U_SLASH || \
                                     (c) == U_PERCENT || \
                                     (c) == U_AMPERSAND || \
                                     (c) == U_SEMICOLON || \
                                     (c) == U_BACKSLASH || \
                                     (c) == U_OPEN_SINGLE_QUOTE || \
                                     (c) == U_OPEN_DOUBLE_QUOTE || \
                                     (c) == U_OPEN_GUILLEMET)

// True for the ASCII digits '0' (0x30) through '9' (0x39). The argument is
// evaluated twice.
#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
377 | | |
// Reads the 4-bit class value for offset l out of a packed table: every
// 32-bit word of t holds eight entries, lowest nibble first.
static inline int
GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
{
  const uint32_t packedWord = t[l >> 3];      // eight entries per word
  const int nibbleShift = (l & 0x0007) << 2;  // four bits per entry
  return (packedWord >> nibbleShift) & 0x000f;
}
383 | | |
// Returns 1 when u lies in the inclusive range U+FF66..U+FF70, which
// GetClass() maps to CLASS_CLOSE (JIS X 4051 class 3); returns 0 otherwise.
static inline int
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
{
  if (u < 0xff66) {
    return 0;
  }
  return u <= 0xff70;
}
389 | | |
// Returns 1 when the code point u falls in one of the ranges this file treats
// as CJK (U+1100..11FF, U+2E80..D7FF, U+F900..FAFF, U+FF00..FFEF,
// U+20000..2FFFD), and 0 otherwise.
static inline int
IS_CJK_CHAR(char32_t u)
{
  // The ranges are in ascending order and do not overlap, so the first upper
  // bound that u does not exceed identifies the only range it can belong to.
  if (u <= 0x11ff) {
    return u >= 0x1100;
  }
  if (u <= 0xd7ff) {
    return u >= 0x2e80;
  }
  if (u <= 0xfaff) {
    return u >= 0xf900;
  }
  if (u <= 0xffef) {
    return u >= 0xff00;
  }
  if (u <= 0x2fffd) {
    return u >= 0x20000;
  }
  return 0;
}
399 | | |
// Returns true for U+00A0 NO-BREAK SPACE and U+2007 FIGURE SPACE.
//
// The parameter is a full code point (char32_t) rather than a UTF-16 code
// unit: callers also pass decoded supplementary-plane characters, and a
// char16_t parameter would silently truncate e.g. U+100A0 to 0x00A0 and
// report it as a no-break space. 8-bit and 16-bit arguments still convert
// losslessly.
static inline bool
IS_NONBREAKABLE_SPACE(char32_t u)
{
  return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE
}
405 | | |
406 | | static inline bool |
407 | | IS_HYPHEN(char16_t u) |
408 | 0 | { |
409 | 0 | return (u == U_HYPHEN || |
410 | 0 | u == 0x058A || // ARMENIAN HYPHEN |
411 | 0 | u == 0x2010 || // HYPHEN |
412 | 0 | u == 0x2012 || // FIGURE DASH |
413 | 0 | u == 0x2013); // EN DASH |
414 | 0 | } |
415 | | |
416 | | static int8_t |
417 | | GetClass(uint32_t u) |
418 | 0 | { |
419 | 0 | if (u < 0x10000) { |
420 | 0 | uint16_t h = u & 0xFF00; |
421 | 0 | uint16_t l = u & 0x00ff; |
422 | 0 |
|
423 | 0 | // Handle 3 range table first |
424 | 0 | if (0x0000 == h) { |
425 | 0 | return GETCLASSFROMTABLE(gLBClass00, l); |
426 | 0 | } |
427 | 0 | if (0x1700 == h) { |
428 | 0 | return GETCLASSFROMTABLE(gLBClass17, l); |
429 | 0 | } |
430 | 0 | if (NS_NeedsPlatformNativeHandling(u)) { |
431 | 0 | return CLASS_COMPLEX; |
432 | 0 | } |
433 | 0 | if (0x0E00 == h) { |
434 | 0 | return GETCLASSFROMTABLE(gLBClass0E, l); |
435 | 0 | } |
436 | 0 | if (0x2000 == h) { |
437 | 0 | return GETCLASSFROMTABLE(gLBClass20, l); |
438 | 0 | } |
439 | 0 | if (0x2100 == h) { |
440 | 0 | return GETCLASSFROMTABLE(gLBClass21, l); |
441 | 0 | } |
442 | 0 | if (0x3000 == h) { |
443 | 0 | return GETCLASSFROMTABLE(gLBClass30, l); |
444 | 0 | } |
445 | 0 | if (0xff00 == h) { |
446 | 0 | if (l < 0x0060) { // Fullwidth ASCII variant |
447 | 0 | return GETCLASSFROMTABLE(gLBClass00, (l+0x20)); |
448 | 0 | } |
449 | 0 | if (l < 0x00a0) { // Halfwidth Katakana variants |
450 | 0 | switch (l) { |
451 | 0 | case 0x61: return GetClass(0x3002); |
452 | 0 | case 0x62: return GetClass(0x300c); |
453 | 0 | case 0x63: return GetClass(0x300d); |
454 | 0 | case 0x64: return GetClass(0x3001); |
455 | 0 | case 0x65: return GetClass(0x30fb); |
456 | 0 | case 0x9e: return GetClass(0x309b); |
457 | 0 | case 0x9f: return GetClass(0x309c); |
458 | 0 | default: |
459 | 0 | if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) { |
460 | 0 | return CLASS_CLOSE; // jis x4051 class 3 |
461 | 0 | } |
462 | 0 | return CLASS_BREAKABLE; // jis x4051 class 11 |
463 | 0 | } |
464 | 0 | } |
465 | 0 | if (l < 0x00e0) { |
466 | 0 | return CLASS_CHARACTER; // Halfwidth Hangul variants |
467 | 0 | } |
468 | 0 | if (l < 0x00f0) { |
469 | 0 | static char16_t NarrowFFEx[16] = { |
470 | 0 | 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, |
471 | 0 | 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000 |
472 | 0 | }; |
473 | 0 | return GetClass(NarrowFFEx[l - 0x00e0]); |
474 | 0 | } |
475 | 0 | } else if (0x3100 == h) { |
476 | 0 | if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun |
477 | 0 | // XXX: This is per UAX #14, but UAX #14 may change |
478 | 0 | // the line breaking rules about Kanbun and Bopomofo. |
479 | 0 | return CLASS_BREAKABLE; |
480 | 0 | } |
481 | 0 | if (l >= 0xf0) { // Katakana small letters for Ainu |
482 | 0 | return CLASS_CLOSE; |
483 | 0 | } |
484 | 0 | } else if (0x0300 == h) { |
485 | 0 | if (0x4F == l || (0x5C <= l && l <= 0x62)) { |
486 | 0 | return CLASS_NON_BREAKABLE; |
487 | 0 | } |
488 | 0 | } else if (0x0500 == h) { |
489 | 0 | // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) |
490 | 0 | if (l == 0x8A) { |
491 | 0 | return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); |
492 | 0 | } |
493 | 0 | } else if (0x0F00 == h) { |
494 | 0 | if (0x08 == l || 0x0C == l || 0x12 == l) { |
495 | 0 | return CLASS_NON_BREAKABLE; |
496 | 0 | } |
497 | 0 | } else if (0x1800 == h) { |
498 | 0 | if (0x0E == l) { |
499 | 0 | return CLASS_NON_BREAKABLE; |
500 | 0 | } |
501 | 0 | } else if (0x1600 == h) { |
502 | 0 | if (0x80 == l) { // U+1680 OGHAM SPACE MARK |
503 | 0 | return CLASS_BREAKABLE; |
504 | 0 | } |
505 | 0 | } else if (u == 0xfeff) { |
506 | 0 | return CLASS_NON_BREAKABLE; |
507 | 0 | } |
508 | 0 | } |
509 | 0 |
|
510 | 0 | // Mapping for Unicode LineBreak.txt classes to the (simplified) set of |
511 | 0 | // character classes used here. |
512 | 0 | // XXX The mappings here were derived by comparing the Unicode LineBreak |
513 | 0 | // values of BMP characters to the classes our existing GetClass returns |
514 | 0 | // for the same codepoints; in cases where characters with the same |
515 | 0 | // LineBreak class mapped to various classes here, I picked what seemed |
516 | 0 | // the most prevalent equivalence. |
517 | 0 | // Some of these are unclear to me, but currently they are ONLY used |
518 | 0 | // for characters not handled by the old code above, so all the JISx405 |
519 | 0 | // special cases should already be accounted for. |
520 | 0 | static const int8_t sUnicodeLineBreakToClass[] = { |
521 | 0 | /* UNKNOWN = 0, [XX] */ CLASS_CHARACTER, |
522 | 0 | /* AMBIGUOUS = 1, [AI] */ CLASS_CHARACTER, |
523 | 0 | /* ALPHABETIC = 2, [AL] */ CLASS_CHARACTER, |
524 | 0 | /* BREAK_BOTH = 3, [B2] */ CLASS_CHARACTER, |
525 | 0 | /* BREAK_AFTER = 4, [BA] */ CLASS_CHARACTER, |
526 | 0 | /* BREAK_BEFORE = 5, [BB] */ CLASS_OPEN_LIKE_CHARACTER, |
527 | 0 | /* MANDATORY_BREAK = 6, [BK] */ CLASS_CHARACTER, |
528 | 0 | /* CONTINGENT_BREAK = 7, [CB] */ CLASS_CHARACTER, |
529 | 0 | /* CLOSE_PUNCTUATION = 8, [CL] */ CLASS_CHARACTER, |
530 | 0 | /* COMBINING_MARK = 9, [CM] */ CLASS_CHARACTER, |
531 | 0 | /* CARRIAGE_RETURN = 10, [CR] */ CLASS_BREAKABLE, |
532 | 0 | /* EXCLAMATION = 11, [EX] */ CLASS_CHARACTER, |
533 | 0 | /* GLUE = 12, [GL] */ CLASS_NON_BREAKABLE, |
534 | 0 | /* HYPHEN = 13, [HY] */ CLASS_CHARACTER, |
535 | 0 | /* IDEOGRAPHIC = 14, [ID] */ CLASS_BREAKABLE, |
536 | 0 | /* INSEPARABLE = 15, [IN] */ CLASS_CLOSE_LIKE_CHARACTER, |
537 | 0 | /* INFIX_NUMERIC = 16, [IS] */ CLASS_CHARACTER, |
538 | 0 | /* LINE_FEED = 17, [LF] */ CLASS_BREAKABLE, |
539 | 0 | /* NONSTARTER = 18, [NS] */ CLASS_CLOSE_LIKE_CHARACTER, |
540 | 0 | /* NUMERIC = 19, [NU] */ CLASS_CHARACTER, |
541 | 0 | /* OPEN_PUNCTUATION = 20, [OP] */ CLASS_CHARACTER, |
542 | 0 | /* POSTFIX_NUMERIC = 21, [PO] */ CLASS_CHARACTER, |
543 | 0 | /* PREFIX_NUMERIC = 22, [PR] */ CLASS_CHARACTER, |
544 | 0 | /* QUOTATION = 23, [QU] */ CLASS_CHARACTER, |
545 | 0 | /* COMPLEX_CONTEXT = 24, [SA] */ CLASS_CHARACTER, |
546 | 0 | /* SURROGATE = 25, [SG] */ CLASS_CHARACTER, |
547 | 0 | /* SPACE = 26, [SP] */ CLASS_BREAKABLE, |
548 | 0 | /* BREAK_SYMBOLS = 27, [SY] */ CLASS_CHARACTER, |
549 | 0 | /* ZWSPACE = 28, [ZW] */ CLASS_BREAKABLE, |
550 | 0 | /* NEXT_LINE = 29, [NL] */ CLASS_CHARACTER, |
551 | 0 | /* WORD_JOINER = 30, [WJ] */ CLASS_NON_BREAKABLE, |
552 | 0 | /* H2 = 31, [H2] */ CLASS_BREAKABLE, |
553 | 0 | /* H3 = 32, [H3] */ CLASS_BREAKABLE, |
554 | 0 | /* JL = 33, [JL] */ CLASS_CHARACTER, |
555 | 0 | /* JT = 34, [JT] */ CLASS_CHARACTER, |
556 | 0 | /* JV = 35, [JV] */ CLASS_CHARACTER, |
557 | 0 | /* CLOSE_PARENTHESIS = 36, [CP] */ CLASS_CLOSE_LIKE_CHARACTER, |
558 | 0 | /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE, |
559 | 0 | /* HEBREW_LETTER = 38, [HL] */ CLASS_CHARACTER, |
560 | 0 | /* REGIONAL_INDICATOR = 39, [RI] */ CLASS_CHARACTER, |
561 | 0 | /* E_BASE = 40, [EB] */ CLASS_BREAKABLE, |
562 | 0 | /* E_MODIFIER = 41, [EM] */ CLASS_CHARACTER, |
563 | 0 | /* ZWJ = 42, [ZWJ]*/ CLASS_CHARACTER |
564 | 0 | }; |
565 | 0 |
|
566 | 0 | static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass), |
567 | 0 | "Gecko vs ICU LineBreak class mismatch"); |
568 | 0 |
|
569 | 0 | auto cls = mozilla::unicode::GetLineBreakClass(u); |
570 | 0 | MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass)); |
571 | 0 | return sUnicodeLineBreakToClass[cls]; |
572 | 0 | } |
573 | | |
574 | | static bool |
575 | | GetPair(int8_t c1, int8_t c2) |
576 | 0 | { |
577 | 0 | NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); |
578 | 0 | NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); |
579 | 0 |
|
580 | 0 | return (0 == ((gPair[c1] >> c2) & 0x0001)); |
581 | 0 | } |
582 | | |
583 | | static bool |
584 | | GetPairConservative(int8_t c1, int8_t c2) |
585 | 0 | { |
586 | 0 | NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); |
587 | 0 | NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); |
588 | 0 |
|
589 | 0 | return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); |
590 | 0 | } |
591 | | |
592 | | class ContextState { |
593 | | public: |
594 | | ContextState(const char16_t* aText, uint32_t aLength) |
595 | | : mUniText(aText) |
596 | | , mText(nullptr) |
597 | | , mLength(aLength) |
598 | 0 | { |
599 | 0 | Init(); |
600 | 0 | } |
601 | | |
602 | | ContextState(const uint8_t* aText, uint32_t aLength) |
603 | | : mUniText(nullptr) |
604 | | , mText(aText) |
605 | | , mLength(aLength) |
606 | 0 | { |
607 | 0 | Init(); |
608 | 0 | } |
609 | | |
610 | 0 | uint32_t Length() const { return mLength; } |
611 | 0 | uint32_t Index() const { return mIndex; } |
612 | | |
613 | | // This gets a single code unit of the text, without checking for surrogates |
614 | | // (in the case of a 16-bit text buffer). That's OK if we're only checking for |
615 | | // specific characters that are known to be BMP values. |
616 | 0 | char16_t GetCodeUnitAt(uint32_t aIndex) const { |
617 | 0 | MOZ_ASSERT(aIndex < mLength, "Out of range!"); |
618 | 0 | return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]); |
619 | 0 | } |
620 | | |
621 | | // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs |
622 | | // as necessary. It must ONLY be called for 16-bit text, not 8-bit. |
623 | 0 | char32_t GetUnicodeCharAt(uint32_t aIndex) const { |
624 | 0 | MOZ_ASSERT(mUniText, "Only for 16-bit text!"); |
625 | 0 | MOZ_ASSERT(aIndex < mLength, "Out of range!"); |
626 | 0 | char32_t c = mUniText[aIndex]; |
627 | 0 | if (NS_IS_HIGH_SURROGATE(c) && aIndex + 1 < mLength && |
628 | 0 | NS_IS_LOW_SURROGATE(mUniText[aIndex + 1])) { |
629 | 0 | c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]); |
630 | 0 | } |
631 | 0 | return c; |
632 | 0 | } |
633 | | |
634 | 0 | void AdvanceIndex() { |
635 | 0 | ++mIndex; |
636 | 0 | } |
637 | | |
638 | 0 | void NotifyBreakBefore() { mLastBreakIndex = mIndex; } |
639 | | |
640 | | // A word of western language should not be broken. But even if the word has |
641 | | // only ASCII characters, non-natural context words should be broken, e.g., |
642 | | // URL and file path. For protecting the natural words, we should use |
643 | | // conservative breaking rules at following conditions: |
644 | | // 1. at near the start of word |
645 | | // 2. at near the end of word |
646 | | // 3. at near the latest broken point |
647 | | // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters, |
648 | | // which varies depending whether we are looking at a letter or a non-letter |
649 | | // character: for non-letters, we use an extended "conservative" range. |
650 | | |
651 | 0 | #define CONSERVATIVE_RANGE_LETTER 2 |
652 | 0 | #define CONSERVATIVE_RANGE_OTHER 6 |
653 | | |
654 | 0 | bool UseConservativeBreaking(uint32_t aOffset = 0) const { |
655 | 0 | if (mHasCJKChar) |
656 | 0 | return false; |
657 | 0 | uint32_t index = mIndex + aOffset; |
658 | 0 |
|
659 | 0 | // If the character at index is a letter (rather than various punctuation |
660 | 0 | // characters, etc) then we want a shorter "conservative" range |
661 | 0 | uint32_t conservativeRangeStart, conservativeRangeEnd; |
662 | 0 | if (index < mLength && |
663 | 0 | nsUGenCategory::kLetter == |
664 | 0 | (mText ? GetGenCategory(mText[index]) |
665 | 0 | : GetGenCategory(GetUnicodeCharAt(index)))) { |
666 | 0 | // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start |
667 | 0 | // to get more balanced behavior (if we break off a 2-letter prefix, |
668 | 0 | // that means the break will actually be three letters from start of |
669 | 0 | // word, to include the hyphen; whereas a 2-letter suffix will be |
670 | 0 | // broken only two letters from end of word). |
671 | 0 | conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER; |
672 | 0 | conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1; |
673 | 0 | } else { |
674 | 0 | conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER; |
675 | 0 | } |
676 | 0 |
|
677 | 0 | bool result = (index < conservativeRangeStart || |
678 | 0 | mLength - index < conservativeRangeEnd || |
679 | 0 | index - mLastBreakIndex < conservativeRangeStart); |
680 | 0 | if (result || !mHasNonbreakableSpace) |
681 | 0 | return result; |
682 | 0 | |
683 | 0 | // This text has no-breakable space, we need to check whether the index |
684 | 0 | // is near it. |
685 | 0 | |
686 | 0 | // Note that index is always larger than conservativeRange here. |
687 | 0 | for (uint32_t i = index; index - conservativeRangeStart < i; --i) { |
688 | 0 | if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1))) |
689 | 0 | return true; |
690 | 0 | } |
691 | 0 | // Note that index is always less than mLength - conservativeRange. |
692 | 0 | for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) { |
693 | 0 | if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i))) |
694 | 0 | return true; |
695 | 0 | } |
696 | 0 | return false; |
697 | 0 | } |
698 | | |
699 | 0 | bool HasPreviousEqualsSign() const { |
700 | 0 | return mHasPreviousEqualsSign; |
701 | 0 | } |
702 | 0 | void NotifySeenEqualsSign() { |
703 | 0 | mHasPreviousEqualsSign = true; |
704 | 0 | } |
705 | | |
706 | 0 | bool HasPreviousSlash() const { |
707 | 0 | return mHasPreviousSlash; |
708 | 0 | } |
709 | 0 | void NotifySeenSlash() { |
710 | 0 | mHasPreviousSlash = true; |
711 | 0 | } |
712 | | |
713 | 0 | bool HasPreviousBackslash() const { |
714 | 0 | return mHasPreviousBackslash; |
715 | 0 | } |
716 | 0 | void NotifySeenBackslash() { |
717 | 0 | mHasPreviousBackslash = true; |
718 | 0 | } |
719 | | |
720 | 0 | uint32_t GetPreviousNonHyphenCharacter() const { |
721 | 0 | return mPreviousNonHyphenCharacter; |
722 | 0 | } |
723 | 0 | void NotifyNonHyphenCharacter(uint32_t ch) { |
724 | 0 | mPreviousNonHyphenCharacter = ch; |
725 | 0 | } |
726 | | |
727 | | private: |
728 | 0 | void Init() { |
729 | 0 | mIndex = 0; |
730 | 0 | mLastBreakIndex = 0; |
731 | 0 | mPreviousNonHyphenCharacter = U_NULL; |
732 | 0 | mHasCJKChar = false; |
733 | 0 | mHasNonbreakableSpace = false; |
734 | 0 | mHasPreviousEqualsSign = false; |
735 | 0 | mHasPreviousSlash = false; |
736 | 0 | mHasPreviousBackslash = false; |
737 | 0 |
|
738 | 0 | if (mText) { |
739 | 0 | // 8-bit text: we only need to check for |
740 | 0 | for (uint32_t i = 0; i < mLength; ++i) { |
741 | 0 | if (IS_NONBREAKABLE_SPACE(mText[i])) { |
742 | 0 | mHasNonbreakableSpace = true; |
743 | 0 | break; |
744 | 0 | } |
745 | 0 | } |
746 | 0 | } else { |
747 | 0 | // 16-bit text: handle surrogates and check for CJK as well as |
748 | 0 | for (uint32_t i = 0; i < mLength; ++i) { |
749 | 0 | char32_t u = GetUnicodeCharAt(i); |
750 | 0 | if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) { |
751 | 0 | mHasNonbreakableSpace = true; |
752 | 0 | if (mHasCJKChar) { |
753 | 0 | break; |
754 | 0 | } |
755 | 0 | } else if (!mHasCJKChar && IS_CJK_CHAR(u)) { |
756 | 0 | mHasCJKChar = 1; |
757 | 0 | if (mHasNonbreakableSpace) { |
758 | 0 | break; |
759 | 0 | } |
760 | 0 | } |
761 | 0 | if (u > 0xFFFFu) { |
762 | 0 | ++i; // step over trailing low surrogate |
763 | 0 | } |
764 | 0 | } |
765 | 0 | } |
766 | 0 | } |
767 | | |
768 | | const char16_t* const mUniText; |
769 | | const uint8_t* const mText; |
770 | | |
771 | | uint32_t mIndex; |
772 | | const uint32_t mLength; // length of text |
773 | | uint32_t mLastBreakIndex; |
774 | | char32_t mPreviousNonHyphenCharacter; // The last character we have seen |
775 | | // which is not U_HYPHEN |
776 | | bool mHasCJKChar; // if the text has CJK character, this is true. |
777 | | bool mHasNonbreakableSpace; // if the text has no-breakable space, |
778 | | // this is true. |
779 | | bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL |
780 | | bool mHasPreviousSlash; // True if we have seen a U_SLASH |
781 | | bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH |
782 | | }; |
783 | | |
// Decides the class of |cur| for the characters whose breaking behaviour
// depends on their surroundings (hyphens, '/', '\\', '%', '&', ';' and the
// opening quote marks), given the neighbouring characters |prev| and |next|
// and the scanning state. When no contextual rule applies, the result is
// GetClass(cur).
//
// Side effects on aState: records |cur| as the previous non-hyphen character
// when it is not a hyphen, and records that a slash / backslash has been
// seen.
static int8_t
ContextualAnalysis(char32_t prev, char32_t cur, char32_t next,
                   ContextState &aState)
{
  // The contextual CLASS_OPEN/CLASS_CLOSE overrides below are only returned
  // when aState.UseConservativeBreaking() is false for the position involved.

  if (IS_HYPHEN(cur)) {
    // If next character is hyphen, we don't need to break between them.
    if (IS_HYPHEN(next))
      return CLASS_CHARACTER;
    // If prev and next characters are numeric, it may be in Math context.
    // So, we should not break here.
    bool prevIsNum = IS_ASCII_DIGIT(prev);
    bool nextIsNum = IS_ASCII_DIGIT(next);
    if (prevIsNum && nextIsNum)
      return CLASS_NUMERIC;
    // If one side is numeric and the other is a character, or if both sides are
    // characters, the hyphen should be breakable.
    if (!aState.UseConservativeBreaking(1)) {
      // Look at the last non-hyphen character rather than |prev|, so a run of
      // hyphens is judged by what precedes the whole run.
      char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
      if (prevOfHyphen && next) {
        int8_t prevClass = GetClass(prevOfHyphen);
        int8_t nextClass = GetClass(next);
        bool prevIsNumOrCharOrClose =
          prevIsNum ||
          (prevClass == CLASS_CHARACTER &&
            !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
          prevClass == CLASS_CLOSE ||
          prevClass == CLASS_CLOSE_LIKE_CHARACTER;
        bool nextIsNumOrCharOrOpen =
          nextIsNum ||
          (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
          nextClass == CLASS_OPEN ||
          nextClass == CLASS_OPEN_LIKE_CHARACTER ||
          next == U_OPEN_SINGLE_QUOTE ||
          next == U_OPEN_DOUBLE_QUOTE ||
          next == U_OPEN_GUILLEMET;
        if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
          return CLASS_CLOSE;
        }
      }
    }
  } else {
    aState.NotifyNonHyphenCharacter(cur);
    if (cur == U_SLASH || cur == U_BACKSLASH) {
      // If this is immediately after same char, we should not break here.
      if (prev == cur)
        return CLASS_CHARACTER;
      // If this text has two or more (BACK)SLASHs, this may be file path or URL.
      // Make sure to compute shouldReturn before we notify on this slash.
      bool shouldReturn = !aState.UseConservativeBreaking() &&
        (cur == U_SLASH ?
         aState.HasPreviousSlash() : aState.HasPreviousBackslash());

      if (cur == U_SLASH) {
        aState.NotifySeenSlash();
      } else {
        aState.NotifySeenBackslash();
      }

      if (shouldReturn)
        return CLASS_OPEN;
    } else if (cur == U_PERCENT) {
      // If this is a part of the param of URL, we should break before.
      if (!aState.UseConservativeBreaking()) {
        // Another '%' exactly three code units before or after the current
        // index is taken as a sign of a "%XX%XX" sequence.
        if (aState.Index() >= 3 &&
            aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT)
          return CLASS_OPEN;
        if (aState.Index() + 3 < aState.Length() &&
            aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT)
          return CLASS_OPEN;
      }
    } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
      // If this may be a separator of params of URL, we should break after.
      if (!aState.UseConservativeBreaking(1) &&
          aState.HasPreviousEqualsSign())
        return CLASS_CLOSE;
    } else if (cur == U_OPEN_SINGLE_QUOTE ||
               cur == U_OPEN_DOUBLE_QUOTE ||
               cur == U_OPEN_GUILLEMET) {
      // for CJK usage, we treat these as openers to allow a break before them,
      // but otherwise treat them as normal characters because quote mark usage
      // in various Western languages varies too much; see bug #450088 discussion.
      if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
        return CLASS_OPEN;
    } else {
      // Reached only for a character that none of the branches above handles.
      NS_ERROR("Forgot to handle the current character!");
    }
  }
  // No contextual rule applied: use the table-driven class.
  return GetClass(cur);
}
875 | | |
876 | | |
877 | | int32_t |
878 | | LineBreaker::WordMove(const char16_t* aText, uint32_t aLen, |
879 | | uint32_t aPos, int8_t aDirection) |
880 | 0 | { |
881 | 0 | bool textNeedsJISx4051 = false; |
882 | 0 | int32_t begin, end; |
883 | 0 |
|
884 | 0 | for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) { |
885 | 0 | if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) { |
886 | 0 | textNeedsJISx4051 = true; |
887 | 0 | } |
888 | 0 | } |
889 | 0 | for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) { |
890 | 0 | if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) { |
891 | 0 | textNeedsJISx4051 = true; |
892 | 0 | } |
893 | 0 | } |
894 | 0 |
|
895 | 0 | int32_t ret; |
896 | 0 | AutoTArray<uint8_t, 2000> breakState; |
897 | 0 | if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) { |
898 | 0 | // No complex text character, do not try to do complex line break. |
899 | 0 | // (This is required for serializers. See Bug #344816.) |
900 | 0 | // Also fall back to this when out of memory. |
901 | 0 | if (aDirection < 0) { |
902 | 0 | ret = (begin == int32_t(aPos)) ? begin - 1 : begin; |
903 | 0 | } else { |
904 | 0 | ret = end; |
905 | 0 | } |
906 | 0 | } else { |
907 | 0 | GetJISx4051Breaks(aText + begin, end - begin, LineBreaker::kWordBreak_Normal, |
908 | 0 | breakState.Elements()); |
909 | 0 |
|
910 | 0 | ret = aPos; |
911 | 0 | do { |
912 | 0 | ret += aDirection; |
913 | 0 | } while (begin < ret && ret < end && !breakState[ret - begin]); |
914 | 0 | } |
915 | 0 |
|
916 | 0 | return ret; |
917 | 0 | } |
918 | | |
919 | | int32_t |
920 | | LineBreaker::Next(const char16_t* aText, uint32_t aLen, |
921 | | uint32_t aPos) |
922 | 0 | { |
923 | 0 | NS_ASSERTION(aText, "aText shouldn't be null"); |
924 | 0 | NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next"); |
925 | 0 |
|
926 | 0 | int32_t nextPos = WordMove(aText, aLen, aPos, 1); |
927 | 0 | return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT; |
928 | 0 | } |
929 | | |
930 | | int32_t |
931 | | LineBreaker::Prev(const char16_t* aText, uint32_t aLen, |
932 | | uint32_t aPos) |
933 | 0 | { |
934 | 0 | NS_ASSERTION(aText, "aText shouldn't be null"); |
935 | 0 | NS_ASSERTION(aLen >= aPos && aPos > 0, |
936 | 0 | "Bad position passed to nsJISx4051LineBreaker::Prev"); |
937 | 0 |
|
938 | 0 | int32_t prevPos = WordMove(aText, aLen, aPos, -1); |
939 | 0 | return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT; |
940 | 0 | } |
941 | | |
942 | | void |
943 | | LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength, |
944 | | uint8_t aWordBreak, |
945 | | uint8_t* aBreakBefore) |
946 | 0 | { |
947 | 0 | uint32_t cur; |
948 | 0 | int8_t lastClass = CLASS_NONE; |
949 | 0 | ContextState state(aChars, aLength); |
950 | 0 |
|
951 | 0 | for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { |
952 | 0 | char32_t ch = state.GetUnicodeCharAt(cur); |
953 | 0 | uint32_t chLen = ch > 0xFFFFu ? 2 : 1; |
954 | 0 | int8_t cl; |
955 | 0 |
|
956 | 0 | if (NEED_CONTEXTUAL_ANALYSIS(ch)) { |
957 | 0 | char32_t prev, next; |
958 | 0 | if (cur > 0) { |
959 | 0 | // not using state.GetUnicodeCharAt() here because we're looking back |
960 | 0 | // rather than forward for possible surrogates |
961 | 0 | prev = aChars[cur - 1]; |
962 | 0 | if (NS_IS_LOW_SURROGATE(prev) && cur > 1 && |
963 | 0 | NS_IS_HIGH_SURROGATE(aChars[cur - 2])) { |
964 | 0 | prev = SURROGATE_TO_UCS4(aChars[cur - 2], prev); |
965 | 0 | } |
966 | 0 | } else { |
967 | 0 | prev = 0; |
968 | 0 | } |
969 | 0 | if (cur + chLen < aLength) { |
970 | 0 | next = state.GetUnicodeCharAt(cur + chLen); |
971 | 0 | } else { |
972 | 0 | next = 0; |
973 | 0 | } |
974 | 0 | cl = ContextualAnalysis(prev, ch, next, state); |
975 | 0 | } else { |
976 | 0 | if (ch == U_EQUAL) |
977 | 0 | state.NotifySeenEqualsSign(); |
978 | 0 | state.NotifyNonHyphenCharacter(ch); |
979 | 0 | cl = GetClass(ch); |
980 | 0 | } |
981 | 0 |
|
982 | 0 | bool allowBreak = false; |
983 | 0 | if (cur > 0) { |
984 | 0 | NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, |
985 | 0 | "Loop should have prevented adjacent complex chars here"); |
986 | 0 | if (aWordBreak == LineBreaker::kWordBreak_Normal) { |
987 | 0 | allowBreak = (state.UseConservativeBreaking()) ? |
988 | 0 | GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); |
989 | 0 | } else if (aWordBreak == LineBreaker::kWordBreak_BreakAll) { |
990 | 0 | allowBreak = true; |
991 | 0 | } |
992 | 0 | } |
993 | 0 | aBreakBefore[cur] = allowBreak; |
994 | 0 | if (allowBreak) |
995 | 0 | state.NotifyBreakBefore(); |
996 | 0 | lastClass = cl; |
997 | 0 | if (CLASS_COMPLEX == cl) { |
998 | 0 | uint32_t end = cur + chLen; |
999 | 0 |
|
1000 | 0 | while (end < aLength) { |
1001 | 0 | char32_t c = state.GetUnicodeCharAt(end); |
1002 | 0 | if (CLASS_COMPLEX != GetClass(c)) { |
1003 | 0 | break; |
1004 | 0 | } |
1005 | 0 | ++end; |
1006 | 0 | if (c > 0xFFFFU) { // it was a surrogate pair |
1007 | 0 | ++end; |
1008 | 0 | } |
1009 | 0 | } |
1010 | 0 |
|
1011 | 0 | NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur); |
1012 | 0 |
|
1013 | 0 | // We have to consider word-break value again for complex characters |
1014 | 0 | if (aWordBreak != LineBreaker::kWordBreak_Normal) { |
1015 | 0 | // Respect word-break property |
1016 | 0 | for (uint32_t i = cur; i < end; i++) |
1017 | 0 | aBreakBefore[i] = (aWordBreak == LineBreaker::kWordBreak_BreakAll); |
1018 | 0 | } |
1019 | 0 |
|
1020 | 0 | // restore breakability at chunk begin, which was always set to false |
1021 | 0 | // by the complex line breaker |
1022 | 0 | aBreakBefore[cur] = allowBreak; |
1023 | 0 |
|
1024 | 0 | cur = end - 1; |
1025 | 0 | } |
1026 | 0 |
|
1027 | 0 | if (chLen == 2) { |
1028 | 0 | // Supplementary-plane character: mark that we cannot break before the |
1029 | 0 | // trailing low surrogate, and advance past it. |
1030 | 0 | ++cur; |
1031 | 0 | aBreakBefore[cur] = false; |
1032 | 0 | state.AdvanceIndex(); |
1033 | 0 | } |
1034 | 0 | } |
1035 | 0 | } |
1036 | | |
1037 | | void |
1038 | | LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength, |
1039 | | uint8_t aWordBreak, |
1040 | | uint8_t* aBreakBefore) |
1041 | 0 | { |
1042 | 0 | uint32_t cur; |
1043 | 0 | int8_t lastClass = CLASS_NONE; |
1044 | 0 | ContextState state(aChars, aLength); |
1045 | 0 |
|
1046 | 0 | for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { |
1047 | 0 | char32_t ch = aChars[cur]; |
1048 | 0 | int8_t cl; |
1049 | 0 |
|
1050 | 0 | if (NEED_CONTEXTUAL_ANALYSIS(ch)) { |
1051 | 0 | cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, |
1052 | 0 | ch, |
1053 | 0 | cur + 1 < aLength ? aChars[cur + 1] : U_NULL, |
1054 | 0 | state); |
1055 | 0 | } else { |
1056 | 0 | if (ch == U_EQUAL) |
1057 | 0 | state.NotifySeenEqualsSign(); |
1058 | 0 | state.NotifyNonHyphenCharacter(ch); |
1059 | 0 | cl = GetClass(ch); |
1060 | 0 | } |
1061 | 0 |
|
1062 | 0 | bool allowBreak = false; |
1063 | 0 | if (cur > 0) { |
1064 | 0 | if (aWordBreak == LineBreaker::kWordBreak_Normal) { |
1065 | 0 | allowBreak = (state.UseConservativeBreaking()) ? |
1066 | 0 | GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); |
1067 | 0 | } else if (aWordBreak == LineBreaker::kWordBreak_BreakAll) { |
1068 | 0 | allowBreak = true; |
1069 | 0 | } |
1070 | 0 | } |
1071 | 0 | aBreakBefore[cur] = allowBreak; |
1072 | 0 | if (allowBreak) |
1073 | 0 | state.NotifyBreakBefore(); |
1074 | 0 | lastClass = cl; |
1075 | 0 | } |
1076 | 0 | } |