/src/libreoffice/vcl/source/gdi/scrptrun.cxx
Line | Count | Source |
1 | | /* |
2 | | ******************************************************************************* |
3 | | * |
4 | | * Copyright (c) 1995-2013 International Business Machines Corporation and others |
5 | | * |
6 | | * All rights reserved. |
7 | | * |
8 | | * Permission is hereby granted, free of charge, to any person obtaining a copy of |
9 | | * this software and associated documentation files (the "Software"), to deal in |
10 | | * the Software without restriction, including without limitation the rights to |
11 | | * use, copy, modify, merge, publish, distribute, and/or sell copies of the |
12 | | * Software, and to permit persons to whom the Software is furnished to do so, |
13 | | * provided that the above copyright notice(s) and this permission notice appear |
14 | | * in all copies of the Software and that both the above copyright notice(s) and |
15 | | * this permission notice appear in supporting documentation. |
16 | | * |
17 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
18 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
19 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN |
20 | | * NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE |
21 | | * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY |
22 | | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
23 | | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
24 | | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
25 | | * |
26 | | * Except as contained in this notice, the name of a copyright holder shall not be |
27 | | * used in advertising or otherwise to promote the sale, use or other dealings in |
28 | | * this Software without prior written authorization of the copyright holder. |
29 | | * |
30 | | ******************************************************************************* |
31 | | * file name: scrptrun.cpp |
32 | | * |
33 | | * created on: 10/17/2001 |
34 | | * created by: Eric R. Mader |
35 | | */ |
36 | | /** |
37 | | * This file is largely copied from the ICU project, |
38 | | * under folder source/extra/scrptrun/scrptrun.cpp |
39 | | */ |
40 | | |
41 | | #include <sal/config.h> |
42 | | |
43 | | #include <rtl/character.hxx> |
44 | | #include <unicode/uchar.h> |
45 | | #include <unicode/utypes.h> |
46 | | #include <unicode/uscript.h> |
47 | | |
48 | | #include <scrptrun.h> |
49 | | #include <algorithm> |
50 | | |
51 | | namespace { |
52 | | |
/**
 * Lookup table that maps paired punctuation characters to a "pair index".
 *
 * Every opening character is assigned an even index and its matching closing
 * character the following odd index, so for a closing index i the index of
 * the matching opener is (i & ~1). Characters that are not in the table map
 * to -1.
 *
 * Three small directly indexed tables cover the code point ranges that hold
 * the supported pairs:
 *   - ma00: U+0000 .. U+00FE (ASCII brackets and guillemets)
 *   - ma20: U+2000 .. U+207E (general punctuation quotes)
 *   - ma30: U+3000 .. U+307E (CJK brackets)
 */
struct PairIndices
{
    // Entry i of a table describes the code point (table base + i); the
    // sizes are named once so declarations, fills and range checks agree.
    static constexpr int32_t nSize00 = 0xff;
    static constexpr int32_t nSize20 = 0x7f;
    static constexpr int32_t nSize30 = 0x7f;
    static constexpr int32_t nBase20 = 0x2000;
    static constexpr int32_t nBase30 = 0x3000;

    int8_t ma00[nSize00];
    int8_t ma20[nSize20];
    int8_t ma30[nSize30];

    /** Marks every entry as "not paired" (-1), then fills in the known pairs. */
    PairIndices()
    {
        std::fill_n(ma00, nSize00, -1);
        std::fill_n(ma20, nSize20, -1);
        std::fill_n(ma30, nSize30, -1);

        // characters in the range 0x0000 - 0x00fe (inclusive)
        // ascii paired punctuation
        ma00[0x28] = 0;
        ma00[0x29] = 1;
        ma00[0x3c] = 2;
        ma00[0x3e] = 3;
        ma00[0x5b] = 4;
        ma00[0x5d] = 5;
        ma00[0x7b] = 6;
        ma00[0x7d] = 7;
        // guillemets
        ma00[0xab] = 8;
        ma00[0xbb] = 9;

        // characters in the range 0x2000 - 0x207e (inclusive)
        // general punctuation
        ma20[0x18] = 10;
        ma20[0x19] = 11;
        ma20[0x1c] = 12;
        ma20[0x1d] = 13;
        ma20[0x39] = 14;
        ma20[0x3a] = 15;

        // characters in the range 0x3000 - 0x307e (inclusive)
        // chinese paired punctuation
        ma30[0x08] = 16;
        ma30[0x09] = 17;
        ma30[0x0a] = 18;
        ma30[0x0b] = 19;
        ma30[0x0c] = 20;
        ma30[0x0d] = 21;
        ma30[0x0e] = 22;
        ma30[0x0f] = 23;
        ma30[0x10] = 24;
        ma30[0x11] = 25;
        ma30[0x14] = 26;
        ma30[0x15] = 27;
        ma30[0x16] = 28;
        ma30[0x17] = 29;
        ma30[0x18] = 30;
        ma30[0x19] = 31;
        ma30[0x1a] = 32;
        ma30[0x1b] = 33;
    }

    /**
     * Returns the pair index of a code point.
     *
     * @param ch the code point to look up; any value is accepted
     * @return an even index for an opening character, the following odd
     *         index for its closing counterpart, or -1 if ch is not in any
     *         of the tables (including negative values)
     */
    int32_t getPairIndex(UChar32 ch) const
    {
        // UChar32 is signed: without the lower bound a negative value would
        // index in front of ma00.
        if (ch >= 0 && ch < nSize00)
            return ma00[ch];
        if (ch >= nBase20 && ch < nBase20 + nSize20)
            return ma20[ch - nBase20];
        if (ch >= nBase30 && ch < nBase30 + nSize30)
            return ma30[ch - nBase30];
        return -1;
    }

};
122 | | |
123 | | UScriptCode getScript(UChar32 ch, UErrorCode* status) |
124 | 280M | { |
125 | | // tdf#154549 |
126 | | // Make combining marks inherit the script of their bases, regardless of |
127 | | // their own script. |
128 | 280M | if (u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY) == U_NON_SPACING_MARK) |
129 | 979k | return USCRIPT_INHERITED; |
130 | | |
131 | 279M | UScriptCode script = uscript_getScript(ch, status); |
132 | 279M | if (U_FAILURE(*status)) |
133 | 0 | return script; |
134 | | |
135 | | // There are three Unicode script codes for Japanese text, but only one |
136 | | // OpenType script tag, so we want to keep them in one run as splitting is |
137 | | // pointless for the purpose of OpenType shaping. |
138 | 279M | if (script == USCRIPT_KATAKANA || script == USCRIPT_KATAKANA_OR_HIRAGANA) |
139 | 325k | return USCRIPT_HIRAGANA; |
140 | 278M | return script; |
141 | 279M | } |
142 | | |
143 | | } |
144 | | |
// Paired-punctuation lookup table; ScriptRun::next() queries it through
// getPairIndex() for every code point it scans.
const PairIndices gPairIndices;
146 | | |
147 | | |
148 | | namespace vcl { |
149 | | |
// Definition of the static member declared with the ScriptRun class.
// NOTE(review): judging by the name this follows ICU's class-ID convention,
// where the member's address rather than its value identifies the class -
// confirm against scrptrun.h.
const char ScriptRun::fgClassID=0;
151 | | |
152 | | static bool sameScript(int32_t scriptOne, int32_t scriptTwo) |
153 | 280M | { |
154 | 280M | return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo; |
155 | 280M | } |
156 | | |
157 | | UBool ScriptRun::next() |
158 | 10.4M | { |
159 | 10.4M | int32_t startSP = parenSP; // used to find the first new open character |
160 | 10.4M | UErrorCode error = U_ZERO_ERROR; |
161 | | |
162 | | // if we've fallen off the end of the text, we're done |
163 | 10.4M | if (scriptEnd >= charLimit) { |
164 | 1.87M | return false; |
165 | 1.87M | } |
166 | | |
167 | 8.53M | scriptCode = USCRIPT_COMMON; |
168 | | |
169 | 282M | for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) { |
170 | 280M | UChar high = charArray[scriptEnd]; |
171 | 280M | UChar32 ch = high; |
172 | | |
173 | | // if the character is a high surrogate and it's not the last one |
174 | | // in the text, see if it's followed by a low surrogate |
175 | 280M | if (rtl::isHighSurrogate(high) && scriptEnd < charLimit - 1) |
176 | 342k | { |
177 | 342k | UChar low = charArray[scriptEnd + 1]; |
178 | | |
179 | | // if it is followed by a low surrogate, |
180 | | // consume it and form the full character |
181 | 342k | if (rtl::isLowSurrogate(low)) { |
182 | 197k | ch = rtl::combineSurrogates(high, low); |
183 | 197k | scriptEnd += 1; |
184 | 197k | } |
185 | 342k | } |
186 | | |
187 | 280M | UScriptCode sc = getScript(ch, &error); |
188 | 280M | int32_t pairIndex = gPairIndices.getPairIndex(ch); |
189 | | |
190 | | // Paired character handling: |
191 | | |
192 | | // if it's an open character, push it onto the stack. |
193 | | // if it's a close character, find the matching open on the |
194 | | // stack, and use that script code. Any non-matching open |
195 | | // characters above it on the stack will be popped. |
196 | 280M | if (pairIndex >= 0) { |
197 | 5.75M | if ((pairIndex & 1) == 0) { |
198 | 3.04M | ++parenSP; |
199 | 3.04M | int32_t nVecSize = parenStack.size(); |
200 | 3.04M | if (parenSP == nVecSize) |
201 | 178k | parenStack.resize(nVecSize + 128); |
202 | 3.04M | parenStack[parenSP].pairIndex = pairIndex; |
203 | 3.04M | parenStack[parenSP].scriptCode = scriptCode; |
204 | 3.04M | } else if (parenSP >= 0) { |
205 | 1.16M | int32_t pi = pairIndex & ~1; |
206 | | |
207 | 2.87M | while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) { |
208 | 1.70M | parenSP -= 1; |
209 | 1.70M | } |
210 | | |
211 | 1.16M | if (parenSP < startSP) { |
212 | 67.4k | startSP = parenSP; |
213 | 67.4k | } |
214 | | |
215 | 1.16M | if (parenSP >= 0) { |
216 | 991k | sc = parenStack[parenSP].scriptCode; |
217 | 991k | } |
218 | 1.16M | } |
219 | 5.75M | } |
220 | | |
221 | 280M | if (sameScript(scriptCode, sc)) { |
222 | 273M | if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) { |
223 | 7.50M | scriptCode = sc; |
224 | | |
225 | | // now that we have a final script code, fix any open |
226 | | // characters we pushed before we knew the script code. |
227 | 7.54M | while (startSP < parenSP) { |
228 | 41.1k | parenStack[++startSP].scriptCode = scriptCode; |
229 | 41.1k | } |
230 | 7.50M | } |
231 | | |
232 | | // if this character is a close paired character, |
233 | | // pop it from the stack |
234 | 273M | if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { |
235 | 906k | parenSP -= 1; |
236 | | /* decrement startSP only if it is >= 0, |
237 | | decrementing it unnecessarily will lead to memory corruption |
238 | | while processing the above while block. |
239 | | e.g. startSP = -4 , parenSP = -1 |
240 | | */ |
241 | 906k | if (startSP >= 0) { |
242 | 294k | startSP -= 1; |
243 | 294k | } |
244 | 906k | } |
245 | 273M | } else { |
246 | | // if the run broke on a surrogate pair, |
247 | | // end it before the high surrogate |
248 | 6.66M | if (ch >= 0x10000) { |
249 | 67.9k | scriptEnd -= 1; |
250 | 67.9k | } |
251 | | |
252 | 6.66M | break; |
253 | 6.66M | } |
254 | 280M | } |
255 | | |
256 | 8.53M | return true; |
257 | 10.4M | } |
258 | | |
259 | | } |