/src/libreoffice/ucb/source/regexp/regexp.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | #include <regexp.hxx> |
21 | | |
22 | | #include <cstddef> |
23 | | |
24 | | #include <osl/diagnose.h> |
25 | | #include <com/sun/star/lang/IllegalArgumentException.hpp> |
26 | | #include <rtl/character.hxx> |
27 | | #include <rtl/ustrbuf.hxx> |
28 | | #include <rtl/ustring.hxx> |
29 | | #include <utility> |
30 | | |
31 | | using namespace com::sun::star; |
32 | | using namespace ucb_impl; |
33 | | |
34 | | |
35 | | // Regexp |
36 | | |
37 | | |
38 | | inline Regexp::Regexp(Kind eTheKind, OUString aThePrefix, |
39 | | bool bTheEmptyDomain, OUString aTheInfix, |
40 | | bool bTheTranslation, |
41 | | OUString aTheReversePrefix): |
42 | 24 | m_eKind(eTheKind), |
43 | 24 | m_aPrefix(std::move(aThePrefix)), |
44 | 24 | m_aInfix(std::move(aTheInfix)), |
45 | 24 | m_aReversePrefix(std::move(aTheReversePrefix)), |
46 | 24 | m_bEmptyDomain(bTheEmptyDomain), |
47 | 24 | m_bTranslation(bTheTranslation) |
48 | 24 | { |
49 | 24 | OSL_ASSERT(m_eKind == KIND_DOMAIN |
50 | 24 | || (!m_bEmptyDomain && m_aInfix.isEmpty())); |
51 | 24 | OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty()); |
52 | 24 | } |
53 | | |
54 | | |
55 | | namespace { |
56 | | |
57 | | bool matchStringIgnoreCase(sal_Unicode const ** pBegin, |
58 | | sal_Unicode const * pEnd, |
59 | | OUString const & rString) |
60 | 386k | { |
61 | 386k | sal_Unicode const * p = *pBegin; |
62 | | |
63 | 386k | sal_Unicode const * q = rString.getStr(); |
64 | 386k | sal_Unicode const * qEnd = q + rString.getLength(); |
65 | | |
66 | 386k | if (pEnd - p < qEnd - q) |
67 | 27.4k | return false; |
68 | | |
69 | 2.15M | while (q != qEnd) |
70 | 1.79M | { |
71 | 1.79M | if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0) |
72 | 471 | return false; |
73 | 1.79M | } |
74 | | |
75 | 358k | *pBegin = p; |
76 | 358k | return true; |
77 | 359k | } |
78 | | |
79 | | } |
80 | | |
81 | | bool Regexp::matches(OUString const & rString) const |
82 | 386k | { |
83 | 386k | sal_Unicode const * pBegin = rString.getStr(); |
84 | 386k | sal_Unicode const * pEnd = pBegin + rString.getLength(); |
85 | | |
86 | 386k | bool bMatches = false; |
87 | | |
88 | 386k | sal_Unicode const * p = pBegin; |
89 | 386k | if (matchStringIgnoreCase(&p, pEnd, m_aPrefix)) |
90 | 358k | { |
91 | 358k | switch (m_eKind) |
92 | 358k | { |
93 | 358k | case KIND_PREFIX: |
94 | 358k | bMatches = true; |
95 | 358k | break; |
96 | | |
97 | 0 | case KIND_AUTHORITY: |
98 | 0 | bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#'; |
99 | 0 | break; |
100 | | |
101 | 0 | case KIND_DOMAIN: |
102 | 0 | if (!m_bEmptyDomain) |
103 | 0 | { |
104 | 0 | if (p == pEnd || *p == '/' || *p == '?' || *p == '#') |
105 | 0 | break; |
106 | 0 | ++p; |
107 | 0 | } |
108 | 0 | for (;;) |
109 | 0 | { |
110 | 0 | sal_Unicode const * q = p; |
111 | 0 | if (matchStringIgnoreCase(&q, pEnd, m_aInfix) |
112 | 0 | && (q == pEnd || *q == '/' || *q == '?' || *q == '#')) |
113 | 0 | { |
114 | 0 | bMatches = true; |
115 | 0 | break; |
116 | 0 | } |
117 | | |
118 | 0 | if (p == pEnd) |
119 | 0 | break; |
120 | | |
121 | 0 | sal_Unicode c = *p++; |
122 | 0 | if (c == '/' || c == '?' || c == '#') |
123 | 0 | break; |
124 | 0 | } |
125 | 0 | break; |
126 | 358k | } |
127 | 358k | } |
128 | | |
129 | 386k | return bMatches; |
130 | 386k | } |
131 | | |
132 | | |
133 | | namespace { |
134 | | |
135 | | bool isScheme(OUString const & rString, bool bColon) |
136 | 24 | { |
137 | | // Return true if rString matches <scheme> (plus a trailing ":" if bColon |
138 | | // is true) from RFC 2396: |
139 | 24 | sal_Unicode const * p = rString.getStr(); |
140 | 24 | sal_Unicode const * pEnd = p + rString.getLength(); |
141 | 24 | if (p != pEnd && rtl::isAsciiAlpha(*p)) |
142 | 24 | for (++p;;) |
143 | 96 | { |
144 | 96 | if (p == pEnd) |
145 | 24 | return !bColon; |
146 | 72 | sal_Unicode c = *p++; |
147 | 72 | if (!(rtl::isAsciiAlphanumeric(c) |
148 | 0 | || c == '+' || c == '-' || c == '.')) |
149 | 0 | return bColon && c == ':' && p == pEnd; |
150 | 72 | } |
151 | 0 | return false; |
152 | 24 | } |
153 | | |
154 | | void appendStringLiteral(OUStringBuffer * pBuffer, |
155 | | OUString const & rString) |
156 | 0 | { |
157 | 0 | OSL_ASSERT(pBuffer); |
158 | |
|
159 | 0 | pBuffer->append('"'); |
160 | 0 | sal_Unicode const * p = rString.getStr(); |
161 | 0 | sal_Unicode const * pEnd = p + rString.getLength(); |
162 | 0 | while (p != pEnd) |
163 | 0 | { |
164 | 0 | sal_Unicode c = *p++; |
165 | 0 | if (c == '"' || c == '\\') |
166 | 0 | pBuffer->append('\\'); |
167 | 0 | pBuffer->append(c); |
168 | 0 | } |
169 | 0 | pBuffer->append('"'); |
170 | 0 | } |
171 | | |
172 | | } |
173 | | |
174 | | OUString Regexp::getRegexp() const |
175 | 0 | { |
176 | 0 | if (m_bTranslation) |
177 | 0 | { |
178 | 0 | OUStringBuffer aBuffer; |
179 | 0 | if (!m_aPrefix.isEmpty()) |
180 | 0 | appendStringLiteral(&aBuffer, m_aPrefix); |
181 | 0 | switch (m_eKind) |
182 | 0 | { |
183 | 0 | case KIND_PREFIX: |
184 | 0 | aBuffer.append("(.*)"); |
185 | 0 | break; |
186 | | |
187 | 0 | case KIND_AUTHORITY: |
188 | 0 | aBuffer.append("(([/?#].*)?)"); |
189 | 0 | break; |
190 | | |
191 | 0 | case KIND_DOMAIN: |
192 | 0 | aBuffer.append("([^/?#]" + OUStringChar(sal_Unicode(m_bEmptyDomain ? '*' : '+'))); |
193 | 0 | if (!m_aInfix.isEmpty()) |
194 | 0 | appendStringLiteral(&aBuffer, m_aInfix); |
195 | 0 | aBuffer.append("([/?#].*)?)"); |
196 | 0 | break; |
197 | 0 | } |
198 | 0 | aBuffer.append("->"); |
199 | 0 | if (!m_aReversePrefix.isEmpty()) |
200 | 0 | appendStringLiteral(&aBuffer, m_aReversePrefix); |
201 | 0 | aBuffer.append("\\1"); |
202 | 0 | return aBuffer.makeStringAndClear(); |
203 | 0 | } |
204 | 0 | else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true)) |
205 | 0 | return m_aPrefix.copy(0, m_aPrefix.getLength() - 1); |
206 | 0 | else |
207 | 0 | { |
208 | 0 | OUStringBuffer aBuffer; |
209 | 0 | if (!m_aPrefix.isEmpty()) |
210 | 0 | appendStringLiteral(&aBuffer, m_aPrefix); |
211 | 0 | switch (m_eKind) |
212 | 0 | { |
213 | 0 | case KIND_PREFIX: |
214 | 0 | aBuffer.append(".*"); |
215 | 0 | break; |
216 | | |
217 | 0 | case KIND_AUTHORITY: |
218 | 0 | aBuffer.append("([/?#].*)?"); |
219 | 0 | break; |
220 | | |
221 | 0 | case KIND_DOMAIN: |
222 | 0 | aBuffer.append("[^/?#]" + OUStringChar( m_bEmptyDomain ? '*' : '+' )); |
223 | 0 | if (!m_aInfix.isEmpty()) |
224 | 0 | appendStringLiteral(&aBuffer, m_aInfix); |
225 | 0 | aBuffer.append("([/?#].*)?"); |
226 | 0 | break; |
227 | 0 | } |
228 | 0 | return aBuffer.makeStringAndClear(); |
229 | 0 | } |
230 | 0 | } |
231 | | |
232 | | |
233 | | namespace { |
234 | | |
235 | | bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd, |
236 | | char const * pString, size_t nStringLength) |
237 | 0 | { |
238 | 0 | sal_Unicode const * p = *pBegin; |
239 | |
|
240 | 0 | unsigned char const * q = reinterpret_cast< unsigned char const * >(pString); |
241 | 0 | unsigned char const * qEnd = q + nStringLength; |
242 | |
|
243 | 0 | if (pEnd - p < qEnd - q) |
244 | 0 | return false; |
245 | | |
246 | 0 | while (q != qEnd) |
247 | 0 | { |
248 | 0 | sal_Unicode c1 = *p++; |
249 | 0 | sal_Unicode c2 = *q++; |
250 | 0 | if (c1 != c2) |
251 | 0 | return false; |
252 | 0 | } |
253 | | |
254 | 0 | *pBegin = p; |
255 | 0 | return true; |
256 | 0 | } |
257 | | |
258 | | bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd, |
259 | | OUString * pString) |
260 | 0 | { |
261 | 0 | sal_Unicode const * p = *pBegin; |
262 | |
|
263 | 0 | if (p == pEnd || *p++ != '"') |
264 | 0 | return false; |
265 | | |
266 | 0 | OUStringBuffer aBuffer; |
267 | 0 | for (;;) |
268 | 0 | { |
269 | 0 | if (p == pEnd) |
270 | 0 | return false; |
271 | 0 | sal_Unicode c = *p++; |
272 | 0 | if (c == '"') |
273 | 0 | break; |
274 | 0 | if (c == '\\') |
275 | 0 | { |
276 | 0 | if (p == pEnd) |
277 | 0 | return false; |
278 | 0 | c = *p++; |
279 | 0 | if (c != '"' && c != '\\') |
280 | 0 | return false; |
281 | 0 | } |
282 | 0 | aBuffer.append(c); |
283 | 0 | } |
284 | | |
285 | 0 | *pBegin = p; |
286 | 0 | *pString = aBuffer.makeStringAndClear(); |
287 | 0 | return true; |
288 | 0 | } |
289 | | |
290 | | } |
291 | | |
292 | | Regexp Regexp::parse(OUString const & rRegexp) |
293 | 24 | { |
294 | | // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*' |
295 | | // where <scheme> is as defined in RFC 2396: |
296 | 24 | if (isScheme(rRegexp, false)) |
297 | 24 | return Regexp(Regexp::KIND_PREFIX, |
298 | 24 | rRegexp + ":", |
299 | 24 | false, |
300 | 24 | OUString(), |
301 | 24 | false, |
302 | 24 | OUString()); |
303 | | |
304 | 0 | sal_Unicode const * p = rRegexp.getStr(); |
305 | 0 | sal_Unicode const * pEnd = p + rRegexp.getLength(); |
306 | |
|
307 | 0 | OUString aPrefix; |
308 | 0 | scanStringLiteral(&p, pEnd, &aPrefix); |
309 | |
|
310 | 0 | if (p == pEnd) |
311 | 0 | throw lang::IllegalArgumentException(); |
312 | | |
313 | | // This and the matchString() calls below are some of the few places where |
314 | | // RTL_CONSTASCII_STRINGPARAM() should NOT be removed. |
315 | | // (c.f. https://gerrit.libreoffice.org/3117) |
316 | 0 | if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*"))) |
317 | 0 | { |
318 | 0 | if (p != pEnd) |
319 | 0 | throw lang::IllegalArgumentException(); |
320 | | |
321 | 0 | return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(), |
322 | 0 | false, OUString()); |
323 | 0 | } |
324 | 0 | else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->"))) |
325 | 0 | { |
326 | 0 | OUString aReversePrefix; |
327 | 0 | scanStringLiteral(&p, pEnd, &aReversePrefix); |
328 | |
|
329 | 0 | if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1")) |
330 | 0 | || p != pEnd) |
331 | 0 | throw lang::IllegalArgumentException(); |
332 | | |
333 | 0 | return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(), |
334 | 0 | true, aReversePrefix); |
335 | 0 | } |
336 | 0 | else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"))) |
337 | 0 | { |
338 | 0 | if (p != pEnd) |
339 | 0 | throw lang::IllegalArgumentException(); |
340 | | |
341 | 0 | return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(), |
342 | 0 | false, OUString()); |
343 | 0 | } |
344 | 0 | else if (matchString(&p, pEnd, |
345 | 0 | RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->"))) |
346 | 0 | { |
347 | 0 | OUString aReversePrefix; |
348 | 0 | if (!(scanStringLiteral(&p, pEnd, &aReversePrefix) |
349 | 0 | && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1")) |
350 | 0 | && p == pEnd)) |
351 | 0 | throw lang::IllegalArgumentException(); |
352 | | |
353 | 0 | return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(), |
354 | 0 | true, aReversePrefix); |
355 | 0 | } |
356 | 0 | else |
357 | 0 | { |
358 | 0 | bool bOpen = false; |
359 | 0 | if (p != pEnd && *p == '(') |
360 | 0 | { |
361 | 0 | ++p; |
362 | 0 | bOpen = true; |
363 | 0 | } |
364 | |
|
365 | 0 | if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]"))) |
366 | 0 | throw lang::IllegalArgumentException(); |
367 | | |
368 | 0 | if (p == pEnd || (*p != '*' && *p != '+')) |
369 | 0 | throw lang::IllegalArgumentException(); |
370 | 0 | bool bEmptyDomain = *p++ == '*'; |
371 | |
|
372 | 0 | OUString aInfix; |
373 | 0 | scanStringLiteral(&p, pEnd, &aInfix); |
374 | |
|
375 | 0 | if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"))) |
376 | 0 | throw lang::IllegalArgumentException(); |
377 | | |
378 | 0 | OUString aReversePrefix; |
379 | 0 | if (bOpen |
380 | 0 | && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->")) |
381 | 0 | && scanStringLiteral(&p, pEnd, &aReversePrefix) |
382 | 0 | && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1")))) |
383 | 0 | throw lang::IllegalArgumentException(); |
384 | | |
385 | 0 | if (p != pEnd) |
386 | 0 | throw lang::IllegalArgumentException(); |
387 | | |
388 | 0 | return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix, |
389 | 0 | bOpen, aReversePrefix); |
390 | 0 | } |
391 | 0 | } |
392 | | |
393 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |