/src/libreoffice/vcl/source/gdi/scrptrun.cxx

Source
/*
 *******************************************************************************
 *
 *   Copyright (c) 1995-2013 International Business Machines Corporation and others
 *
 *   All rights reserved.
 *
 *   Permission is hereby granted, free of charge, to any person obtaining a copy of
 *   this software and associated documentation files (the "Software"), to deal in
 *   the Software without restriction, including without limitation the rights to
 *   use, copy, modify, merge, publish, distribute, and/or sell copies of the
 *   Software, and to permit persons to whom the Software is furnished to do so,
 *   provided that the above copyright notice(s) and this permission notice appear
 *   in all copies of the Software and that both the above copyright notice(s) and
 *   this permission notice appear in supporting documentation.
 *
 *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
 *   NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
 *   LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY
 *   DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 *   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 *   CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 *   Except as contained in this notice, the name of a copyright holder shall not be
 *   used in advertising or otherwise to promote the sale, use or other dealings in
 *   this Software without prior written authorization of the copyright holder.
 *
 *******************************************************************************
 *   file name:  scrptrun.cpp
 *
 *   created on: 10/17/2001
 *   created by: Eric R. Mader
 */
/**
  * This file is largely copied from the ICU project,
  * under folder source/extra/scrptrun/scrptrun.cpp
  */

#include <sal/config.h>

#include <rtl/character.hxx>
#include <unicode/uchar.h>
#include <unicode/utypes.h>
#include <unicode/uscript.h>

#include <scrptrun.h>
#include <algorithm>

namespace {

/**
 * Lookup tables that map paired punctuation characters to a "pair index".
 *
 * Every opening character is given an even index and its matching closing
 * character the next odd index, so (index & 1) tells open from close and
 * (index & ~1) identifies the pair. Any character that is not listed maps
 * to -1.
 *
 * Three small tables are kept, one per code point range that contains
 * paired punctuation handled here:
 *   ma00: 0x0000 - 0x00fe
 *   ma20: 0x2000 - 0x207e
 *   ma30: 0x3000 - 0x307e
 */
struct PairIndices
{
    // Number of entries in each table and the first code point each covers.
    static constexpr int32_t kLatin1Size = 0xff;
    static constexpr int32_t kGeneralPunctBase = 0x2000;
    static constexpr int32_t kGeneralPunctSize = 0x7f;
    static constexpr int32_t kCjkPunctBase = 0x3000;
    static constexpr int32_t kCjkPunctSize = 0x7f;

    int8_t ma00[kLatin1Size];
    int8_t ma20[kGeneralPunctSize];
    int8_t ma30[kCjkPunctSize];

    /** Marks every slot as "not a paired character" (-1), then fills in the known pairs. */
    PairIndices()
    {
        std::fill_n(ma00, kLatin1Size, -1);
        std::fill_n(ma20, kGeneralPunctSize, -1);
        std::fill_n(ma30, kCjkPunctSize, -1);

        // characters in the range 0x0000 - 0x00fe (inclusive)
        // ascii paired punctuation
        ma00[0x28] =  0;
        ma00[0x29] =  1;
        ma00[0x3c] =  2;
        ma00[0x3e] =  3;
        ma00[0x5b] =  4;
        ma00[0x5d] =  5;
        ma00[0x7b] =  6;
        ma00[0x7d] =  7;
        // guillemets
        ma00[0xab] =  8;
        ma00[0xbb] =  9;

        // characters in the range 0x2000 - 0x207e (inclusive)
        // general punctuation
        ma20[0x18] = 10;
        ma20[0x19] = 11;
        ma20[0x1c] = 12;
        ma20[0x1d] = 13;
        ma20[0x39] = 14;
        ma20[0x3a] = 15;

        // characters in the range 0x3000 - 0x307e (inclusive)
        // chinese paired punctuation
        ma30[0x08] = 16;
        ma30[0x09] = 17;
        ma30[0x0a] = 18;
        ma30[0x0b] = 19;
        ma30[0x0c] = 20;
        ma30[0x0d] = 21;
        ma30[0x0e] = 22;
        ma30[0x0f] = 23;
        ma30[0x10] = 24;
        ma30[0x11] = 25;
        ma30[0x14] = 26;
        ma30[0x15] = 27;
        ma30[0x16] = 28;
        ma30[0x17] = 29;
        ma30[0x18] = 30;
        ma30[0x19] = 31;
        ma30[0x1a] = 32;
        ma30[0x1b] = 33;
    }

    /**
     * Looks up the pair index of a code point.
     *
     * @param ch code point to classify
     * @return the pair index (even = opening, odd = closing) or -1 when ch
     *         is not one of the paired characters in the tables
     */
    int32_t getPairIndex(UChar32 ch) const
    {
        // UChar32 is a signed type; without this guard a negative value
        // would pass the "< kLatin1Size" test and index before ma00.
        if (ch < 0)
            return -1;
        if (ch < kLatin1Size)
            return ma00[ch];
        if (ch >= kGeneralPunctBase && ch < kGeneralPunctBase + kGeneralPunctSize)
            return ma20[ch - kGeneralPunctBase];
        if (ch >= kCjkPunctBase && ch < kCjkPunctBase + kCjkPunctSize)
            return ma30[ch - kCjkPunctBase];
        return -1;
    }

};

/**
 * Returns the script used for run splitting for a single code point.
 *
 * @param ch     code point to classify
 * @param status ICU error code, passed through to uscript_getScript()
 * @return USCRIPT_INHERITED for non-spacing marks; USCRIPT_HIRAGANA for
 *         Katakana and Katakana-or-Hiragana characters; otherwise whatever
 *         uscript_getScript() reports (also returned unchanged on failure)
 */
UScriptCode getScript(UChar32 ch, UErrorCode* status)
{
    // tdf#154549
    // A combining mark must stay in the run of its base character, so its
    // own script is ignored and it is reported as inherited.
    const bool bNonSpacingMark
        = u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY) == U_NON_SPACING_MARK;
    if (bNonSpacingMark)
        return USCRIPT_INHERITED;

    const UScriptCode eScript = uscript_getScript(ch, status);
    if (U_FAILURE(*status))
        return eScript;

    // Unicode has three script codes for Japanese text but OpenType has a
    // single script tag for them; folding them together avoids splitting
    // runs that would be shaped identically anyway.
    const bool bFoldToHiragana
        = eScript == USCRIPT_KATAKANA || eScript == USCRIPT_KATAKANA_OR_HIRAGANA;
    return bFoldToHiragana ? USCRIPT_HIRAGANA : eScript;
}

}

// Immutable paired-punctuation lookup tables, populated by the PairIndices
// constructor and queried through getPairIndex().
const PairIndices gPairIndices;


namespace vcl {

// Out-of-class definition of the static member ScriptRun::fgClassID.
// NOTE(review): presumably only its address is used as a class identifier,
// as in the ICU original this file is derived from - confirm in scrptrun.h.
const char ScriptRun::fgClassID=0;

/**
 * Decides whether two script codes may share one run.
 *
 * Any code at or below USCRIPT_INHERITED (in ICU's UScriptCode enum these
 * are the invalid, common and inherited codes) is compatible with every
 * script; two other codes are compatible only when they are equal.
 *
 * @param scriptOne first script code
 * @param scriptTwo second script code
 * @return true when both codes can be part of the same run
 */
static bool sameScript(int32_t scriptOne, int32_t scriptTwo)
{
    if (scriptOne <= USCRIPT_INHERITED)
        return true;
    if (scriptTwo <= USCRIPT_INHERITED)
        return true;
    return scriptOne == scriptTwo;
}

/**
 * Advances to the next script run in the text.
 *
 * Starting at the current scriptEnd, the run is extended character by
 * character for as long as each character's script is compatible (see
 * sameScript()) with the script of the run found so far. On success
 * scriptStart is the index of the first code unit of the run, scriptEnd is
 * the index just past its last code unit, and scriptCode is the run's script.
 *
 * Paired punctuation is tracked on parenStack (parenSP is the index of the
 * top entry, negative meaning empty): a closing character takes the script
 * that was current when its matching opening character was pushed. The stack
 * is a member, so it carries over from one call to the next.
 *
 * @return false when scriptEnd has already reached charLimit (no more text),
 *         true when a run was produced
 */
UBool ScriptRun::next()
{
    int32_t startSP  = parenSP;  // used to find the first new open character
    // Handed to getScript() as its status argument; it is not inspected here.
    UErrorCode error = U_ZERO_ERROR;

    // if we've fallen off the end of the text, we're done
    if (scriptEnd >= charLimit) {
        return false;
    }

    // The run has no definite script until a character with a script above
    // USCRIPT_INHERITED is seen.
    scriptCode = USCRIPT_COMMON;

    for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) {
        UChar   high = charArray[scriptEnd];
        UChar32 ch   = high;

        // if the character is a high surrogate and it's not the last one
        // in the text, see if it's followed by a low surrogate
        if (rtl::isHighSurrogate(high) && scriptEnd < charLimit - 1)
        {
            UChar low = charArray[scriptEnd + 1];

            // if it is followed by a low surrogate,
            // consume it and form the full character
            if (rtl::isLowSurrogate(low)) {
                ch = rtl::combineSurrogates(high, low);
                scriptEnd += 1;
            }
        }

        UScriptCode sc = getScript(ch, &error);
        int32_t pairIndex = gPairIndices.getPairIndex(ch);

        // Paired character handling:

        // if it's an open character, push it onto the stack.
        // if it's a close character, find the matching open on the
        // stack, and use that script code. Any non-matching open
        // characters above it on the stack will be popped.
        if (pairIndex >= 0) {
            // even pair index: opening character
            if ((pairIndex & 1) == 0) {
                ++parenSP;
                int32_t nVecSize = parenStack.size();
                // the stack is full: grow it by another 128 entries
                if (parenSP == nVecSize)
                    parenStack.resize(nVecSize + 128);
                parenStack[parenSP].pairIndex = pairIndex;
                parenStack[parenSP].scriptCode  = scriptCode;
            } else if (parenSP >= 0) {
                // odd pair index: closing character; clearing the low bit
                // gives the index of the matching opening character
                int32_t pi = pairIndex & ~1;

                while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) {
                    parenSP -= 1;
                }

                // entries at or above the new top are gone, so startSP must
                // not point past it
                if (parenSP < startSP) {
                    startSP = parenSP;
                }

                // a matching open character was found: reuse its script
                if (parenSP >= 0) {
                    sc = parenStack[parenSP].scriptCode;
                }
            }
        }

        if (sameScript(scriptCode, sc)) {
            // first character with a definite script: it decides the
            // script of the whole run
            if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
                scriptCode = sc;

                // now that we have a final script code, fix any open
                // characters we pushed before we knew the script code.
                while (startSP < parenSP) {
                    parenStack[++startSP].scriptCode = scriptCode;
                }
            }

            // if this character is a close paired character,
            // pop it from the stack
            if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
                parenSP -= 1;
                /* decrement startSP only if it is >= 0,
                   decrementing it unnecessarily will lead to memory corruption
                   while processing the above while block.
                   e.g. startSP = -4 , parenSP = -1
                */
                if (startSP >= 0) {
                    startSP -= 1;
                }
            }
        } else {
            // if the run broke on a surrogate pair,
            // end it before the high surrogate
            if (ch >= 0x10000) {
                scriptEnd -= 1;
            }

            // scriptEnd is left on the first character of the next run
            break;
        }
    }

    return true;
}

}

Coverage Report

Created: 2026-02-14 09:37

Line	Count	Source
1		/*
2		*******************************************************************************
3		*
4		* Copyright (c) 1995-2013 International Business Machines Corporation and others
5		*
6		* All rights reserved.
7		*
8		* Permission is hereby granted, free of charge, to any person obtaining a copy of
9		* this software and associated documentation files (the "Software"), to deal in
10		* the Software without restriction, including without limitation the rights to
11		* use, copy, modify, merge, publish, distribute, and/or sell copies of the
12		* Software, and to permit persons to whom the Software is furnished to do so,
13		* provided that the above copyright notice(s) and this permission notice appear
14		* in all copies of the Software and that both the above copyright notice(s) and
15		* this permission notice appear in supporting documentation.
16		*
17		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
20		* NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
21		* LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY
22		* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
23		* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
24		* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
25		*
26		* Except as contained in this notice, the name of a copyright holder shall not be
27		* used in advertising or otherwise to promote the sale, use or other dealings in
28		* this Software without prior written authorization of the copyright holder.
29		*
30		*******************************************************************************
31		* file name: scrptrun.cpp
32		*
33		* created on: 10/17/2001
34		* created by: Eric R. Mader
35		*/
36		/**
37		* This file is largely copied from the ICU project,
38		* under folder source/extra/scrptrun/scrptrun.cpp
39		*/
40
41		#include <sal/config.h>
42
43		#include <rtl/character.hxx>
44		#include <unicode/uchar.h>
45		#include <unicode/utypes.h>
46		#include <unicode/uscript.h>
47
48		#include <scrptrun.h>
49		#include <algorithm>
50
51		namespace {
52
53		struct PairIndices
54		{
55		int8_t ma00[0xff];
56		int8_t ma20[0x7f];
57		int8_t ma30[0x7f];
58
59		PairIndices()
60	110	{
61	110	std::fill_n(ma00, 0xff, -1);
62	110	std::fill_n(ma20, 0x7f, -1);
63	110	std::fill_n(ma30, 0x7f, -1);
64
65		// characters in the range 0x0000 - 0x007e (inclusive)
66		// ascii paired punctuation
67	110	ma00[0x28] = 0;
68	110	ma00[0x29] = 1;
69	110	ma00[0x3c] = 2;
70	110	ma00[0x3e] = 3;
71	110	ma00[0x5b] = 4;
72	110	ma00[0x5d] = 5;
73	110	ma00[0x7b] = 6;
74	110	ma00[0x7d] = 7;
75		// guillemets
76	110	ma00[0xab] = 8;
77	110	ma00[0xbb] = 9;
78
79		// characters in the range 0x2000 - 0x207e (inclusive)
80		// general punctuation
81	110	ma20[0x18] = 10;
82	110	ma20[0x19] = 11;
83	110	ma20[0x1c] = 12;
84	110	ma20[0x1d] = 13;
85	110	ma20[0x39] = 14;
86	110	ma20[0x3a] = 15;
87
88		// characters in the range 0x3000 - 0x307e (inclusive)
89		// chinese paired punctuation
90	110	ma30[0x08] = 16;
91	110	ma30[0x09] = 17;
92	110	ma30[0x0a] = 18;
93	110	ma30[0x0b] = 19;
94	110	ma30[0x0c] = 20;
95	110	ma30[0x0d] = 21;
96	110	ma30[0x0e] = 22;
97	110	ma30[0x0f] = 23;
98	110	ma30[0x10] = 24;
99	110	ma30[0x11] = 25;
100	110	ma30[0x14] = 26;
101	110	ma30[0x15] = 27;
102	110	ma30[0x16] = 28;
103	110	ma30[0x17] = 29;
104	110	ma30[0x18] = 30;
105	110	ma30[0x19] = 31;
106	110	ma30[0x1a] = 32;
107	110	ma30[0x1b] = 33;
108	110	}
109
110		int32_t getPairIndex(UChar32 ch) const
111	280M	{
112	280M	if (ch < 0xff)
113	252M	return ma00[ch];
114	27.9M	if (ch >= 0x2000 && ch < 0x207f)
115	1.01M	return ma20[ch - 0x2000];
116	26.9M	if (ch >= 0x3000 && ch < 0x307f)
117	75.3k	return ma30[ch - 0x3000];
118	26.8M	return -1;
119	26.9M	}
120
121		};
122
123		UScriptCode getScript(UChar32 ch, UErrorCode* status)
124	280M	{
125		// tdf#154549
126		// Make combining marks inherit the script of their bases, regardless of
127		// their own script.
128	280M	if (u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY) == U_NON_SPACING_MARK)
129	979k	return USCRIPT_INHERITED;
130
131	279M	UScriptCode script = uscript_getScript(ch, status);
132	279M	if (U_FAILURE(*status))
133	0	return script;
134
135		// There are three Unicode script codes for Japanese text, but only one
136		// OpenType script tag, so we want to keep them in one run as splitting is
137		// pointless for the purpose of OpenType shaping.
138	279M	if (script == USCRIPT_KATAKANA || script == USCRIPT_KATAKANA_OR_HIRAGANA)
139	325k	return USCRIPT_HIRAGANA;
140	278M	return script;
141	279M	}
142
143		}
144
145		const PairIndices gPairIndices;
146
147
148		namespace vcl {
149
150		const char ScriptRun::fgClassID=0;
151
152		static bool sameScript(int32_t scriptOne, int32_t scriptTwo)
153	280M	{
154	280M	return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
155	280M	}
156
157		UBool ScriptRun::next()
158	10.4M	{
159	10.4M	int32_t startSP = parenSP; // used to find the first new open character
160	10.4M	UErrorCode error = U_ZERO_ERROR;
161
162		// if we've fallen off the end of the text, we're done
163	10.4M	if (scriptEnd >= charLimit) {
164	1.87M	return false;
165	1.87M	}
166
167	8.53M	scriptCode = USCRIPT_COMMON;
168
169	282M	for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) {
170	280M	UChar high = charArray[scriptEnd];
171	280M	UChar32 ch = high;
172
173		// if the character is a high surrogate and it's not the last one
174		// in the text, see if it's followed by a low surrogate
175	280M	if (rtl::isHighSurrogate(high) && scriptEnd < charLimit - 1)
176	342k	{
177	342k	UChar low = charArray[scriptEnd + 1];
178
179		// if it is followed by a low surrogate,
180		// consume it and form the full character
181	342k	if (rtl::isLowSurrogate(low)) {
182	197k	ch = rtl::combineSurrogates(high, low);
183	197k	scriptEnd += 1;
184	197k	}
185	342k	}
186
187	280M	UScriptCode sc = getScript(ch, &error);
188	280M	int32_t pairIndex = gPairIndices.getPairIndex(ch);
189
190		// Paired character handling:
191
192		// if it's an open character, push it onto the stack.
193		// if it's a close character, find the matching open on the
194		// stack, and use that script code. Any non-matching open
195		// characters above it on the stack will be popped.
196	280M	if (pairIndex >= 0) {
197	5.75M	if ((pairIndex & 1) == 0) {
198	3.04M	++parenSP;
199	3.04M	int32_t nVecSize = parenStack.size();
200	3.04M	if (parenSP == nVecSize)
201	178k	parenStack.resize(nVecSize + 128);
202	3.04M	parenStack[parenSP].pairIndex = pairIndex;
203	3.04M	parenStack[parenSP].scriptCode = scriptCode;
204	3.04M	} else if (parenSP >= 0) {
205	1.16M	int32_t pi = pairIndex & ~1;
206
207	2.87M	while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) {
208	1.70M	parenSP -= 1;
209	1.70M	}
210
211	1.16M	if (parenSP < startSP) {
212	67.4k	startSP = parenSP;
213	67.4k	}
214
215	1.16M	if (parenSP >= 0) {
216	991k	sc = parenStack[parenSP].scriptCode;
217	991k	}
218	1.16M	}
219	5.75M	}
220
221	280M	if (sameScript(scriptCode, sc)) {
222	273M	if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
223	7.50M	scriptCode = sc;
224
225		// now that we have a final script code, fix any open
226		// characters we pushed before we knew the script code.
227	7.54M	while (startSP < parenSP) {
228	41.1k	parenStack[++startSP].scriptCode = scriptCode;
229	41.1k	}
230	7.50M	}
231
232		// if this character is a close paired character,
233		// pop it from the stack
234	273M	if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
235	906k	parenSP -= 1;
236		/* decrement startSP only if it is >= 0,
237		decrementing it unnecessarily will lead to memory corruption
238		while processing the above while block.
239		e.g. startSP = -4 , parenSP = -1
240		*/
241	906k	if (startSP >= 0) {
242	294k	startSP -= 1;
243	294k	}
244	906k	}
245	273M	} else {
246		// if the run broke on a surrogate pair,
247		// end it before the high surrogate
248	6.66M	if (ch >= 0x10000) {
249	67.9k	scriptEnd -= 1;
250	67.9k	}
251
252	6.66M	break;
253	6.66M	}
254	280M	}
255
256	8.53M	return true;
257	10.4M	}
258
259		}