/src/mozilla-central/intl/hyphenation/glue/nsHyphenator.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
#include "nsHyphenator.h"

#include <stdlib.h>

#include "nsIFile.h"
#include "nsUTF8Utils.h"
#include "nsUnicodeProperties.h"
#include "nsIURI.h"

#include "hyphen.h"
13 | | |
14 | | nsHyphenator::nsHyphenator(nsIURI *aURI) |
15 | | : mDict(nullptr) |
16 | 0 | { |
17 | 0 | nsCString uriSpec; |
18 | 0 | nsresult rv = aURI->GetSpec(uriSpec); |
19 | 0 | if (NS_FAILED(rv)) { |
20 | 0 | return; |
21 | 0 | } |
22 | 0 | mDict = hnj_hyphen_load(uriSpec.get()); |
23 | | #ifdef DEBUG |
24 | | if (mDict) { |
25 | | printf("loaded hyphenation patterns from %s\n", uriSpec.get()); |
26 | | } |
27 | | #endif |
28 | | } |
29 | | |
30 | | nsHyphenator::~nsHyphenator() |
31 | 0 | { |
32 | 0 | if (mDict != nullptr) { |
33 | 0 | hnj_hyphen_free((HyphenDict*)mDict); |
34 | 0 | mDict = nullptr; |
35 | 0 | } |
36 | 0 | } |
37 | | |
38 | | bool |
39 | | nsHyphenator::IsValid() |
40 | 0 | { |
41 | 0 | return (mDict != nullptr); |
42 | 0 | } |
43 | | |
44 | | nsresult |
45 | | nsHyphenator::Hyphenate(const nsAString& aString, nsTArray<bool>& aHyphens) |
46 | 0 | { |
47 | 0 | if (!aHyphens.SetLength(aString.Length(), mozilla::fallible)) { |
48 | 0 | return NS_ERROR_OUT_OF_MEMORY; |
49 | 0 | } |
50 | 0 | memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool)); |
51 | 0 |
|
52 | 0 | bool inWord = false; |
53 | 0 | uint32_t wordStart = 0, wordLimit = 0; |
54 | 0 | uint32_t chLen; |
55 | 0 | for (uint32_t i = 0; i < aString.Length(); i += chLen) { |
56 | 0 | uint32_t ch = aString[i]; |
57 | 0 | chLen = 1; |
58 | 0 |
|
59 | 0 | if (NS_IS_HIGH_SURROGATE(ch)) { |
60 | 0 | if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) { |
61 | 0 | ch = SURROGATE_TO_UCS4(ch, aString[i+1]); |
62 | 0 | chLen = 2; |
63 | 0 | } else { |
64 | 0 | NS_WARNING("unpaired surrogate found during hyphenation"); |
65 | 0 | } |
66 | 0 | } |
67 | 0 |
|
68 | 0 | nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch); |
69 | 0 | if (cat == nsUGenCategory::kLetter || cat == nsUGenCategory::kMark) { |
70 | 0 | if (!inWord) { |
71 | 0 | inWord = true; |
72 | 0 | wordStart = i; |
73 | 0 | } |
74 | 0 | wordLimit = i + chLen; |
75 | 0 | if (i + chLen < aString.Length()) { |
76 | 0 | continue; |
77 | 0 | } |
78 | 0 | } |
79 | 0 | |
80 | 0 | if (inWord) { |
81 | 0 | // Convert the word to utf-8 for libhyphen, lowercasing it as we go |
82 | 0 | // so that it will match the (lowercased) patterns (bug 1105644). |
83 | 0 | nsAutoCString utf8; |
84 | 0 | const char16_t* const begin = aString.BeginReading(); |
85 | 0 | const char16_t *cur = begin + wordStart; |
86 | 0 | const char16_t *end = begin + wordLimit; |
87 | 0 | while (cur < end) { |
88 | 0 | uint32_t ch = *cur++; |
89 | 0 |
|
90 | 0 | if (NS_IS_HIGH_SURROGATE(ch)) { |
91 | 0 | if (cur < end && NS_IS_LOW_SURROGATE(*cur)) { |
92 | 0 | ch = SURROGATE_TO_UCS4(ch, *cur++); |
93 | 0 | } else { |
94 | 0 | ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR |
95 | 0 | } |
96 | 0 | } else if (NS_IS_LOW_SURROGATE(ch)) { |
97 | 0 | ch = 0xfffd; // unpaired surrogate |
98 | 0 | } |
99 | 0 |
|
100 | 0 | // XXX What about language-specific casing? Consider Turkish I/i... |
101 | 0 | // In practice, it looks like the current patterns will not be |
102 | 0 | // affected by this, as they treat dotted and undotted i similarly. |
103 | 0 | ch = ToLowerCase(ch); |
104 | 0 |
|
105 | 0 | if (ch < 0x80) { // U+0000 - U+007F |
106 | 0 | utf8.Append(ch); |
107 | 0 | } else if (ch < 0x0800) { // U+0100 - U+07FF |
108 | 0 | utf8.Append(0xC0 | (ch >> 6)); |
109 | 0 | utf8.Append(0x80 | (0x003F & ch)); |
110 | 0 | } else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF |
111 | 0 | utf8.Append(0xE0 | (ch >> 12)); |
112 | 0 | utf8.Append(0x80 | (0x003F & (ch >> 6))); |
113 | 0 | utf8.Append(0x80 | (0x003F & ch)); |
114 | 0 | } else { |
115 | 0 | utf8.Append(0xF0 | (ch >> 18)); |
116 | 0 | utf8.Append(0x80 | (0x003F & (ch >> 12))); |
117 | 0 | utf8.Append(0x80 | (0x003F & (ch >> 6))); |
118 | 0 | utf8.Append(0x80 | (0x003F & ch)); |
119 | 0 | } |
120 | 0 | } |
121 | 0 |
|
122 | 0 | AutoTArray<char,200> utf8hyphens; |
123 | 0 | utf8hyphens.SetLength(utf8.Length() + 5); |
124 | 0 | char **rep = nullptr; |
125 | 0 | int *pos = nullptr; |
126 | 0 | int *cut = nullptr; |
127 | 0 | int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, |
128 | 0 | utf8.BeginReading(), utf8.Length(), |
129 | 0 | utf8hyphens.Elements(), nullptr, |
130 | 0 | &rep, &pos, &cut); |
131 | 0 | if (!err) { |
132 | 0 | // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer |
133 | 0 | // from utf8 code unit indexing (which would match the utf8 input |
134 | 0 | // string directly) to Unicode character indexing. |
135 | 0 | // We then need to convert this to utf16 code unit offsets for Gecko. |
136 | 0 | const char *hyphPtr = utf8hyphens.Elements(); |
137 | 0 | const char16_t *cur = begin + wordStart; |
138 | 0 | const char16_t *end = begin + wordLimit; |
139 | 0 | while (cur < end) { |
140 | 0 | if (*hyphPtr & 0x01) { |
141 | 0 | aHyphens[cur - begin] = true; |
142 | 0 | } |
143 | 0 | cur++; |
144 | 0 | if (cur < end && NS_IS_LOW_SURROGATE(*cur) && |
145 | 0 | NS_IS_HIGH_SURROGATE(*(cur-1))) |
146 | 0 | { |
147 | 0 | cur++; |
148 | 0 | } |
149 | 0 | hyphPtr++; |
150 | 0 | } |
151 | 0 | } |
152 | 0 | } |
153 | 0 |
|
154 | 0 | inWord = false; |
155 | 0 | } |
156 | 0 |
|
157 | 0 | return NS_OK; |
158 | 0 | } |