/src/libreoffice/lingucomponent/source/lingutil/lingutil.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | #if defined(_WIN32) |
21 | | #if !defined WIN32_LEAN_AND_MEAN |
22 | | # define WIN32_LEAN_AND_MEAN |
23 | | #endif |
24 | | #include <windows.h> |
25 | | #endif |
26 | | |
27 | | #include <osl/diagnose.h> |
28 | | #include <osl/thread.h> |
29 | | #include <osl/file.hxx> |
30 | | #include <osl/process.h> |
31 | | #include <tools/debug.hxx> |
32 | | #include <tools/urlobj.hxx> |
33 | | #include <i18nlangtag/languagetag.hxx> |
34 | | #include <i18nlangtag/mslangid.hxx> |
35 | | #include <unotools/bootstrap.hxx> |
36 | | #include <unotools/lingucfg.hxx> |
37 | | #include <unotools/pathoptions.hxx> |
38 | | #include <rtl/bootstrap.hxx> |
39 | | #include <rtl/ustring.hxx> |
40 | | #include <rtl/string.hxx> |
41 | | #include <rtl/tencinfo.h> |
42 | | #include <linguistic/misc.hxx> |
43 | | |
44 | | #include <set> |
45 | | #include <vector> |
46 | | #include <string.h> |
47 | | |
48 | | #include "lingutil.hxx" |
49 | | |
50 | | #include <sal/macros.h> |
51 | | |
52 | | using namespace ::com::sun::star; |
53 | | |
54 | | #if defined(_WIN32) |
55 | | OString Win_AddLongPathPrefix( const OString &rPathName ) |
56 | | { |
57 | | constexpr OString WIN32_LONG_PATH_PREFIX = "\\\\?\\"_ostr; |
58 | | if (!rPathName.match(WIN32_LONG_PATH_PREFIX)) return WIN32_LONG_PATH_PREFIX + rPathName; |
59 | | return rPathName; |
60 | | } |
61 | | #endif //defined(_WIN32) |
62 | | |
63 | | #if defined SYSTEM_DICTS || defined IOS |
64 | | // find old style dictionaries in system directories |
65 | | static void GetOldStyleDicsInDir( |
66 | | OUString const & aSystemDir, OUString const & aFormatName, |
67 | | std::u16string_view aSystemSuffix, std::u16string_view aSystemPrefix, |
68 | | std::set< OUString >& aDicLangInUse, |
69 | | std::vector< SvtLinguConfigDictionaryEntry >& aRes ) |
70 | 0 | { |
71 | 0 | osl::Directory aSystemDicts(aSystemDir); |
72 | 0 | if (aSystemDicts.open() != osl::FileBase::E_None) |
73 | 0 | return; |
74 | | |
75 | 0 | osl::DirectoryItem aItem; |
76 | 0 | osl::FileStatus aFileStatus(osl_FileStatus_Mask_FileURL); |
77 | 0 | while (aSystemDicts.getNextItem(aItem) == osl::FileBase::E_None) |
78 | 0 | { |
79 | 0 | aItem.getFileStatus(aFileStatus); |
80 | 0 | OUString sPath = aFileStatus.getFileURL(); |
81 | 0 | if (sPath.endsWith(aSystemSuffix)) |
82 | 0 | { |
83 | 0 | sal_Int32 nStartIndex = sPath.lastIndexOf('/') + 1; |
84 | 0 | if (!sPath.match(aSystemPrefix, nStartIndex)) |
85 | 0 | continue; |
86 | 0 | OUString sChunk = sPath.copy(nStartIndex + aSystemPrefix.size(), |
87 | 0 | sPath.getLength() - aSystemSuffix.size() - |
88 | 0 | nStartIndex - aSystemPrefix.size()); |
89 | 0 | if (sChunk.isEmpty()) |
90 | 0 | continue; |
91 | | |
92 | | // We prefer (now) to use language tags. |
93 | | // Avoid feeding in the older LANG_REGION scheme to the BCP47 |
94 | | // ctor as that triggers use of liblangtag and initializes its |
95 | | // database which we do not want during startup. Convert |
96 | | // instead. |
97 | 0 | sChunk = sChunk.replace( '_', '-'); |
98 | | |
99 | | // There's a known exception to the rule, the dreaded |
100 | | // hu_HU_u8.dic of the myspell-hu package, see |
101 | | // http://packages.debian.org/search?arch=any&searchon=contents&keywords=hu_HU_u8.dic |
102 | | // This was ignored because unknown in the old implementation, |
103 | | // truncate to the known locale and either insert because hu_HU |
104 | | // wasn't encountered yet, or skip because it was. It doesn't |
105 | | // really matter because the proper new-style hu_HU dictionary |
106 | | // will take precedence anyway if installed with a Hungarian |
107 | | // languagepack. Again, this is only to not pull in all |
108 | | // liblangtag and stuff during startup, the result would be |
109 | | // !isValidBcp47() and the dictionary ignored. |
110 | 0 | if (sChunk == "hu-HU-u8") |
111 | 0 | sChunk = "hu-HU"; |
112 | |
|
113 | 0 | LanguageTag aLangTag(sChunk, true); |
114 | 0 | if (!aLangTag.isValidBcp47()) |
115 | 0 | continue; |
116 | | |
117 | | // Thus we first get the language of the dictionary |
118 | 0 | const OUString& aLocaleName(aLangTag.getBcp47()); |
119 | |
|
120 | 0 | if (aDicLangInUse.insert(aLocaleName).second) |
121 | 0 | { |
122 | | // add the dictionary to the resulting vector |
123 | 0 | SvtLinguConfigDictionaryEntry aDicEntry; |
124 | 0 | aDicEntry.aLocations = { sPath }; |
125 | 0 | aDicEntry.aFormatName = aFormatName; |
126 | 0 | if (aLocaleName == u"ar") |
127 | 0 | aDicEntry.aLocaleNames = { |
128 | 0 | aLocaleName, |
129 | 0 | u"ar-AE"_ustr, u"ar-BH"_ustr, u"ar-DJ"_ustr, u"ar-DZ"_ustr, u"ar-EG"_ustr, |
130 | 0 | u"ar-ER"_ustr, u"ar-IL"_ustr, u"ar-IQ"_ustr, u"ar-JO"_ustr, u"ar-KM"_ustr, |
131 | 0 | u"ar-KW"_ustr, u"ar-LB"_ustr, u"ar-LY"_ustr, u"ar-MA"_ustr, u"ar-MR"_ustr, |
132 | 0 | u"ar-OM"_ustr, u"ar-PS"_ustr, u"ar-QA"_ustr, u"ar-SA"_ustr, u"ar-SD"_ustr, |
133 | 0 | u"ar-SO"_ustr, u"ar-SY"_ustr, u"ar-TD"_ustr, u"ar-TN"_ustr, u"ar-YE"_ustr |
134 | 0 | }; |
135 | 0 | else |
136 | 0 | aDicEntry.aLocaleNames = { aLocaleName }; |
137 | 0 | aRes.push_back(std::move(aDicEntry)); |
138 | 0 | } |
139 | 0 | } |
140 | 0 | } |
141 | 0 | } |
142 | | #endif |
143 | | |
144 | | // build list of old style dictionaries (not as extensions) to use. |
145 | | // User installed dictionaries (the ones residing in the user paths) |
146 | | // will get precedence over system installed ones for the same language. |
147 | | std::vector< SvtLinguConfigDictionaryEntry > GetOldStyleDics( const char *pDicType ) |
148 | 0 | { |
149 | 0 | std::vector< SvtLinguConfigDictionaryEntry > aRes; |
150 | |
|
151 | 0 | if (!pDicType) |
152 | 0 | return aRes; |
153 | | |
154 | 0 | OUString aFormatName; |
155 | 0 | OUString aDicExtension; |
156 | 0 | #if defined SYSTEM_DICTS || defined IOS |
157 | 0 | OUString aSystemDir; |
158 | 0 | OUString aSystemPrefix; |
159 | 0 | OUString aSystemSuffix; |
160 | 0 | #endif |
161 | 0 | if (strcmp( pDicType, "DICT" ) == 0) |
162 | 0 | { |
163 | 0 | aFormatName = "DICT_SPELL"; |
164 | 0 | aDicExtension = ".dic"; |
165 | 0 | #ifdef SYSTEM_DICTS |
166 | 0 | aSystemDir = DICT_SYSTEM_DIR; |
167 | 0 | aSystemSuffix = aDicExtension; |
168 | | #elif defined IOS |
169 | | aSystemDir = "$BRAND_BASE_DIR/share/spell"; |
170 | | rtl::Bootstrap::expandMacros(aSystemDir); |
171 | | aSystemSuffix = ".dic"; |
172 | | #endif |
173 | 0 | } |
174 | 0 | else if (strcmp( pDicType, "HYPH" ) == 0) |
175 | 0 | { |
176 | 0 | aFormatName = "DICT_HYPH"; |
177 | 0 | aDicExtension = ".dic"; |
178 | 0 | #ifdef SYSTEM_DICTS |
179 | 0 | aSystemDir = HYPH_SYSTEM_DIR; |
180 | 0 | aSystemPrefix = "hyph_"; |
181 | 0 | aSystemSuffix = aDicExtension; |
182 | 0 | #endif |
183 | 0 | } |
184 | 0 | else if (strcmp( pDicType, "THES" ) == 0) |
185 | 0 | { |
186 | 0 | aFormatName = "DICT_THES"; |
187 | 0 | aDicExtension = ".dat"; |
188 | 0 | #ifdef SYSTEM_DICTS |
189 | 0 | aSystemDir = THES_SYSTEM_DIR; |
190 | 0 | aSystemPrefix = "th_"; |
191 | 0 | aSystemSuffix = "_v2.dat"; |
192 | | #elif defined IOS |
193 | | aSystemDir = "$BRAND_BASE_DIR/share/thes"; |
194 | | rtl::Bootstrap::expandMacros(aSystemDir); |
195 | | aSystemPrefix = "th_"; |
196 | | aSystemSuffix = "_v2.dat"; |
197 | | #endif |
198 | 0 | } |
199 | |
|
200 | 0 | if (aFormatName.isEmpty() || aDicExtension.isEmpty()) |
201 | 0 | return aRes; |
202 | | |
203 | 0 | #if defined SYSTEM_DICTS || defined IOS |
204 | | // set of languages to remember the language where it is already |
205 | | // decided to make use of the dictionary. |
206 | 0 | std::set< OUString > aDicLangInUse; |
207 | |
|
208 | 0 | #ifndef IOS |
209 | | // follow the hunspell tool's example and check DICPATH for preferred dictionaries |
210 | 0 | rtl_uString * pSearchPath = nullptr; |
211 | 0 | osl_getEnvironment(u"DICPATH"_ustr.pData, &pSearchPath); |
212 | |
|
213 | 0 | if (pSearchPath) |
214 | 0 | { |
215 | 0 | OUString aSearchPath(pSearchPath); |
216 | 0 | rtl_uString_release(pSearchPath); |
217 | |
|
218 | 0 | sal_Int32 nIndex = 0; |
219 | 0 | do |
220 | 0 | { |
221 | 0 | OUString aSystem( aSearchPath.getToken(0, ':', nIndex) ); |
222 | 0 | OUString aCWD; |
223 | 0 | OUString aRelative; |
224 | 0 | OUString aAbsolute; |
225 | |
|
226 | 0 | if (!utl::Bootstrap::getProcessWorkingDir(aCWD)) |
227 | 0 | continue; |
228 | 0 | if (osl::FileBase::getFileURLFromSystemPath(aSystem, aRelative) |
229 | 0 | != osl::FileBase::E_None) |
230 | 0 | continue; |
231 | 0 | if (osl::FileBase::getAbsoluteFileURL(aCWD, aRelative, aAbsolute) |
232 | 0 | != osl::FileBase::E_None) |
233 | 0 | continue; |
234 | | |
235 | | // GetOldStyleDicsInDir will make sure the dictionary is the right |
236 | | // type based on its prefix, that way hyphen, mythes and regular |
237 | | // dictionaries can live in one directory |
238 | 0 | GetOldStyleDicsInDir(aAbsolute, aFormatName, aSystemSuffix, |
239 | 0 | aSystemPrefix, aDicLangInUse, aRes); |
240 | 0 | } |
241 | 0 | while (nIndex != -1); |
242 | 0 | } |
243 | 0 | #endif |
244 | | |
245 | | // load system directories last so that DICPATH prevails |
246 | 0 | GetOldStyleDicsInDir(aSystemDir, aFormatName, aSystemSuffix, aSystemPrefix, |
247 | 0 | aDicLangInUse, aRes); |
248 | 0 | #endif |
249 | |
|
250 | 0 | return aRes; |
251 | 0 | } |
252 | | |
253 | | void MergeNewStyleDicsAndOldStyleDics( |
254 | | std::vector< SvtLinguConfigDictionaryEntry > &rNewStyleDics, |
255 | | const std::vector< SvtLinguConfigDictionaryEntry > &rOldStyleDics ) |
256 | 0 | { |
257 | | // get list of languages supported by new style dictionaries |
258 | 0 | std::set< OUString > aNewStyleLanguages; |
259 | 0 | for (auto const& newStyleDic : rNewStyleDics) |
260 | 0 | { |
261 | 0 | const uno::Sequence< OUString > aLocaleNames(newStyleDic.aLocaleNames); |
262 | 0 | sal_Int32 nLocaleNames = aLocaleNames.getLength(); |
263 | 0 | for (sal_Int32 k = 0; k < nLocaleNames; ++k) |
264 | 0 | { |
265 | 0 | aNewStyleLanguages.insert( aLocaleNames[k] ); |
266 | 0 | } |
267 | 0 | } |
268 | | |
269 | | // now check all old style dictionaries if they will add a not yet |
270 | | // added language. If so add them to the resulting vector |
271 | 0 | for (auto const& oldStyleDic : rOldStyleDics) |
272 | 0 | { |
273 | 0 | sal_Int32 nOldStyleDics = oldStyleDic.aLocaleNames.getLength(); |
274 | | |
275 | | // old style dics should only have one language listed... |
276 | 0 | DBG_ASSERT( nOldStyleDics, "old style dictionary with more than one language found!"); |
277 | 0 | if (nOldStyleDics > 0) |
278 | 0 | { |
279 | 0 | if (linguistic::LinguIsUnspecified( oldStyleDic.aLocaleNames[0])) |
280 | 0 | { |
281 | 0 | OSL_FAIL( "old style dictionary with invalid language found!" ); |
282 | 0 | continue; |
283 | 0 | } |
284 | | |
285 | | // language not yet added? |
286 | 0 | if (aNewStyleLanguages.find( oldStyleDic.aLocaleNames[0] ) == aNewStyleLanguages.end()) |
287 | 0 | rNewStyleDics.push_back(oldStyleDic); |
288 | 0 | } |
289 | 0 | else |
290 | 0 | { |
291 | 0 | OSL_FAIL( "old style dictionary with no language found!" ); |
292 | 0 | } |
293 | 0 | } |
294 | 0 | } |
295 | | |
296 | | rtl_TextEncoding getTextEncodingFromCharset(const char* pCharset) |
297 | 0 | { |
298 | | // default result: used to indicate that we failed to get the proper encoding |
299 | 0 | rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW; |
300 | |
|
301 | 0 | if (pCharset) |
302 | 0 | { |
303 | 0 | eRet = rtl_getTextEncodingFromMimeCharset(pCharset); |
304 | 0 | if (eRet == RTL_TEXTENCODING_DONTKNOW) |
305 | 0 | eRet = rtl_getTextEncodingFromUnixCharset(pCharset); |
306 | 0 | if (eRet == RTL_TEXTENCODING_DONTKNOW) |
307 | 0 | { |
308 | 0 | if (strcmp("ISCII-DEVANAGARI", pCharset) == 0) |
309 | 0 | eRet = RTL_TEXTENCODING_ISCII_DEVANAGARI; |
310 | 0 | } |
311 | 0 | } |
312 | 0 | return eRet; |
313 | 0 | } |
314 | | |
315 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |