/src/mozilla-central/intl/lwbrk/WordBreaker.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
6 | | #include "mozilla/intl/WordBreaker.h" |
7 | | #include "mozilla/Preferences.h" |
8 | | |
9 | | using mozilla::intl::WordBreaker; |
10 | | using mozilla::intl::WordBreakClass; |
11 | | using mozilla::intl::WordRange; |
12 | | |
13 | | /*static*/ |
14 | | already_AddRefed<WordBreaker> |
15 | | WordBreaker::Create() |
16 | 3 | { |
17 | 3 | return RefPtr<WordBreaker>(new WordBreaker()).forget(); |
18 | 3 | } |
19 | | |
20 | | bool WordBreaker::BreakInBetween( |
21 | | const char16_t* aText1 , uint32_t aTextLen1, |
22 | | const char16_t* aText2 , uint32_t aTextLen2) |
23 | 0 | { |
24 | 0 | MOZ_ASSERT(nullptr != aText1, "null ptr"); |
25 | 0 | MOZ_ASSERT(nullptr != aText2, "null ptr"); |
26 | 0 |
|
27 | 0 | if(!aText1 || !aText2 || (0 == aTextLen1) || (0 == aTextLen2)) |
28 | 0 | return false; |
29 | 0 | |
30 | 0 | return GetClass(aText1[aTextLen1-1]) != GetClass(aText2[0]); |
31 | 0 | } |
32 | | |
// Character-range predicates used by WordBreaker::GetClass(). Each macro takes
// a UTF-16 code unit and expands to a single, fully parenthesized boolean
// expression, so it can be negated or combined with other operators without
// precedence surprises. Arguments may be evaluated more than once; do not pass
// expressions with side effects.

// True when none of the bits above the low seven are set (c < 0x80 for a
// 16-bit value).
#define IS_ASCII(c)                (0 == (0xFF80 & (c)))
#define ASCII_IS_ALPHA(c)          ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z')))
#define ASCII_IS_DIGIT(c)          (('0' <= (c)) && ((c) <= '9'))
#define ASCII_IS_SPACE(c)          ((' ' == (c)) || ('\t' == (c)) || ('\r' == (c)) || ('\n' == (c)))
// Every code unit below 0x2E80 is handled by the "alphabetical script" branch
// of GetClass(); 0x2E80 and above goes through the Han/Kana checks.
#define IS_ALPHABETICAL_SCRIPT(c)  ((c) < 0x2E80)

// we change the beginning of IS_HAN from 0x4e00 to 0x3400 to reflect Unicode 3.0
// The whole disjunction is wrapped in parentheses so that uses such as
// !IS_HAN(c) or IS_HAN(c) && x apply to both ranges, not just one of them.
#define IS_HAN(c)                  (((0x3400 <= (c)) && ((c) <= 0x9fff)) || ((0xf900 <= (c)) && ((c) <= 0xfaff)))
#define IS_KATAKANA(c)             ((0x30A0 <= (c)) && ((c) <= 0x30FF))
#define IS_HIRAGANA(c)             ((0x3040 <= (c)) && ((c) <= 0x309F))
#define IS_HALFWIDTHKATAKANA(c)    ((0xFF60 <= (c)) && ((c) <= 0xFF9F))
#define IS_THAI(c)                 (0x0E00 == (0xFF80 & (c))) // Look at the highest 9 bits
45 | | |
46 | | /* static */ WordBreakClass |
47 | | WordBreaker::GetClass(char16_t c) |
48 | 0 | { |
49 | 0 | // The pref is cached on first call; changes will require a browser restart. |
50 | 0 | static bool sStopAtUnderscore = |
51 | 0 | Preferences::GetBool("layout.word_select.stop_at_underscore", false); |
52 | 0 |
|
53 | 0 | // begin of the hack |
54 | 0 |
|
55 | 0 | if (IS_ALPHABETICAL_SCRIPT(c)) { |
56 | 0 | if(IS_ASCII(c)) { |
57 | 0 | if(ASCII_IS_SPACE(c)) { |
58 | 0 | return kWbClassSpace; |
59 | 0 | } else if(ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) || |
60 | 0 | (c == '_' && !sStopAtUnderscore)) { |
61 | 0 | return kWbClassAlphaLetter; |
62 | 0 | } else { |
63 | 0 | return kWbClassPunct; |
64 | 0 | } |
65 | 0 | } else if(IS_THAI(c)) { |
66 | 0 | return kWbClassThaiLetter; |
67 | 0 | } else if (c == 0x00A0/*NBSP*/) { |
68 | 0 | return kWbClassSpace; |
69 | 0 | } else { |
70 | 0 | return kWbClassAlphaLetter; |
71 | 0 | } |
72 | 0 | } else { |
73 | 0 | if(IS_HAN(c)) { |
74 | 0 | return kWbClassHanLetter; |
75 | 0 | } else if(IS_KATAKANA(c)) { |
76 | 0 | return kWbClassKatakanaLetter; |
77 | 0 | } else if(IS_HIRAGANA(c)) { |
78 | 0 | return kWbClassHiraganaLetter; |
79 | 0 | } else if(IS_HALFWIDTHKATAKANA(c)) { |
80 | 0 | return kWbClassHWKatakanaLetter; |
81 | 0 | } else { |
82 | 0 | return kWbClassAlphaLetter; |
83 | 0 | } |
84 | 0 | } |
85 | 0 | return static_cast<WordBreakClass>(0); |
86 | 0 | } |
87 | | |
88 | | WordRange WordBreaker::FindWord( |
89 | | const char16_t* aText , uint32_t aTextLen, |
90 | | uint32_t aOffset) |
91 | 0 | { |
92 | 0 | WordRange range; |
93 | 0 | MOZ_ASSERT(nullptr != aText, "null ptr"); |
94 | 0 | MOZ_ASSERT(0 != aTextLen, "len = 0"); |
95 | 0 | MOZ_ASSERT(aOffset <= aTextLen, "aOffset > aTextLen"); |
96 | 0 |
|
97 | 0 | range.mBegin = aTextLen + 1; |
98 | 0 | range.mEnd = aTextLen + 1; |
99 | 0 |
|
100 | 0 | if(!aText || aOffset > aTextLen) |
101 | 0 | return range; |
102 | 0 | |
103 | 0 | WordBreakClass c = GetClass(aText[aOffset]); |
104 | 0 | uint32_t i; |
105 | 0 | // Scan forward |
106 | 0 | range.mEnd--; |
107 | 0 | for(i = aOffset +1;i <= aTextLen; i++) |
108 | 0 | { |
109 | 0 | if( c != GetClass(aText[i])) |
110 | 0 | { |
111 | 0 | range.mEnd = i; |
112 | 0 | break; |
113 | 0 | } |
114 | 0 | } |
115 | 0 |
|
116 | 0 | // Scan backward |
117 | 0 | range.mBegin = 0; |
118 | 0 | for(i = aOffset ;i > 0; i--) |
119 | 0 | { |
120 | 0 | if( c != GetClass(aText[i-1])) |
121 | 0 | { |
122 | 0 | range.mBegin = i; |
123 | 0 | break; |
124 | 0 | } |
125 | 0 | } |
126 | 0 | if(kWbClassThaiLetter == c) |
127 | 0 | { |
128 | 0 | // need to call Thai word breaker from here |
129 | 0 | // we should pass the whole Thai segment to the thai word breaker to find a shorter answer |
130 | 0 | } |
131 | 0 | return range; |
132 | 0 | } |
133 | | |
134 | | int32_t WordBreaker::NextWord( |
135 | | const char16_t* aText, uint32_t aLen, uint32_t aPos) |
136 | 0 | { |
137 | 0 | WordBreakClass c1, c2; |
138 | 0 | uint32_t cur = aPos; |
139 | 0 | if (cur == aLen) |
140 | 0 | return NS_WORDBREAKER_NEED_MORE_TEXT; |
141 | 0 | c1 = GetClass(aText[cur]); |
142 | 0 |
|
143 | 0 | for(cur++; cur <aLen; cur++) |
144 | 0 | { |
145 | 0 | c2 = GetClass(aText[cur]); |
146 | 0 | if(c2 != c1) |
147 | 0 | break; |
148 | 0 | } |
149 | 0 | if(kWbClassThaiLetter == c1) |
150 | 0 | { |
151 | 0 | // need to call Thai word breaker from here |
152 | 0 | // we should pass the whole Thai segment to the thai word breaker to find a shorter answer |
153 | 0 | } |
154 | 0 | if (cur == aLen) |
155 | 0 | return NS_WORDBREAKER_NEED_MORE_TEXT; |
156 | 0 | return cur; |
157 | 0 | } |