Coverage Report

Created: 2025-12-31 10:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libreoffice/ucb/source/regexp/regexp.cxx
Line
Count
Source
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/*
3
 * This file is part of the LibreOffice project.
4
 *
5
 * This Source Code Form is subject to the terms of the Mozilla Public
6
 * License, v. 2.0. If a copy of the MPL was not distributed with this
7
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
 *
9
 * This file incorporates work covered by the following license notice:
10
 *
11
 *   Licensed to the Apache Software Foundation (ASF) under one or more
12
 *   contributor license agreements. See the NOTICE file distributed
13
 *   with this work for additional information regarding copyright
14
 *   ownership. The ASF licenses this file to you under the Apache
15
 *   License, Version 2.0 (the "License"); you may not use this file
16
 *   except in compliance with the License. You may obtain a copy of
17
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18
 */
19
20
#include <regexp.hxx>
21
22
#include <cstddef>
23
24
#include <osl/diagnose.h>
25
#include <com/sun/star/lang/IllegalArgumentException.hpp>
26
#include <rtl/character.hxx>
27
#include <rtl/ustrbuf.hxx>
28
#include <rtl/ustring.hxx>
29
#include <utility>
30
31
using namespace com::sun::star;
32
using namespace ucb_impl;
33
34
35
//  Regexp
36
37
38
// Builds a Regexp from its already-parsed components.
//
// eTheKind           which of the KIND_* forms this rule has
// aThePrefix         literal text the input has to start with
// bTheEmptyDomain    for KIND_DOMAIN: whether the "[^/?#]" run used '*'
// aTheInfix          for KIND_DOMAIN: literal text following the run
// bTheTranslation    whether the rule carries a "->...\1" part
// aTheReversePrefix  literal text of that "->" part
//
// The assertions record the combinations the rest of this file relies on:
// only KIND_DOMAIN may have an empty-domain flag or an infix, and a reverse
// prefix is only meaningful together with a translation.
inline Regexp::Regexp(
    Kind eTheKind,
    OUString aThePrefix,
    bool bTheEmptyDomain,
    OUString aTheInfix,
    bool bTheTranslation,
    OUString aTheReversePrefix)
    : m_eKind(eTheKind)
    , m_aPrefix(std::move(aThePrefix))
    , m_aInfix(std::move(aTheInfix))
    , m_aReversePrefix(std::move(aTheReversePrefix))
    , m_bEmptyDomain(bTheEmptyDomain)
    , m_bTranslation(bTheTranslation)
{
    OSL_ASSERT(m_eKind == KIND_DOMAIN
               || (!m_bEmptyDomain && m_aInfix.isEmpty()));
    OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
}
53
54
55
namespace {
56
57
bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
58
                           sal_Unicode const * pEnd,
59
                           OUString const & rString)
60
386k
{
61
386k
    sal_Unicode const * p = *pBegin;
62
63
386k
    sal_Unicode const * q = rString.getStr();
64
386k
    sal_Unicode const * qEnd = q + rString.getLength();
65
66
386k
    if (pEnd - p < qEnd - q)
67
27.4k
        return false;
68
69
2.15M
    while (q != qEnd)
70
1.79M
    {
71
1.79M
        if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0)
72
471
            return false;
73
1.79M
    }
74
75
358k
    *pBegin = p;
76
358k
    return true;
77
359k
}
78
79
}
80
81
// Tests whether rString is matched by this rule.
//
// The prefix has to match first (via matchStringIgnoreCase). What may follow
// depends on the kind:
//  - KIND_PREFIX:    anything;
//  - KIND_AUTHORITY: end of input or one of '/', '?', '#';
//  - KIND_DOMAIN:    a run of characters other than '/', '?', '#' (at least
//                    one unless m_bEmptyDomain), then the infix, then end of
//                    input or one of '/', '?', '#'.
bool Regexp::matches(OUString const & rString) const
{
    sal_Unicode const * pCur = rString.getStr();
    sal_Unicode const * const pEnd = pCur + rString.getLength();

    if (!matchStringIgnoreCase(&pCur, pEnd, m_aPrefix))
        return false;

    // True at end of input or at a character that ends the authority part.
    auto const atBoundary = [pEnd](sal_Unicode const * pPos)
    {
        return pPos == pEnd || *pPos == '/' || *pPos == '?' || *pPos == '#';
    };

    switch (m_eKind)
    {
        case KIND_PREFIX:
            return true;

        case KIND_AUTHORITY:
            return atBoundary(pCur);

        case KIND_DOMAIN:
            if (!m_bEmptyDomain)
            {
                // The domain run needs at least one character.
                if (atBoundary(pCur))
                    return false;
                ++pCur;
            }
            // Try the infix at every position of the domain run.
            for (;; ++pCur)
            {
                sal_Unicode const * pAfterInfix = pCur;
                if (matchStringIgnoreCase(&pAfterInfix, pEnd, m_aInfix)
                    && atBoundary(pAfterInfix))
                {
                    return true;
                }

                // The run cannot extend over the end or over a delimiter.
                if (atBoundary(pCur))
                    return false;
            }
    }

    return false;
}
131
132
133
namespace {
134
135
// Return true if rString matches <scheme> (plus a trailing ":" if bColon
// is true) from RFC 2396: an ASCII letter followed by any number of ASCII
// letters, digits, '+', '-' or '.'.
bool isScheme(OUString const & rString, bool bColon)
{
    sal_Unicode const * pCur = rString.getStr();
    sal_Unicode const * const pLimit = pCur + rString.getLength();

    // The first character has to be a letter.
    if (pCur == pLimit || !rtl::isAsciiAlpha(*pCur))
        return false;
    ++pCur;

    while (pCur != pLimit)
    {
        sal_Unicode const c = *pCur;
        ++pCur;

        bool const bSchemeChar
            = rtl::isAsciiAlphanumeric(c) || c == '+' || c == '-' || c == '.';
        if (!bSchemeChar)
        {
            // The only other character tolerated is a final ':' and only
            // when the caller asked for one.
            return bColon && c == ':' && pCur == pLimit;
        }
    }

    // Ran off the end without seeing a ':'.
    return !bColon;
}
153
154
void appendStringLiteral(OUStringBuffer * pBuffer,
155
                         OUString const & rString)
156
0
{
157
0
    OSL_ASSERT(pBuffer);
158
159
0
    pBuffer->append('"');
160
0
    sal_Unicode const * p = rString.getStr();
161
0
    sal_Unicode const * pEnd = p + rString.getLength();
162
0
    while (p != pEnd)
163
0
    {
164
0
        sal_Unicode c = *p++;
165
0
        if (c == '"' || c == '\\')
166
0
            pBuffer->append('\\');
167
0
        pBuffer->append(c);
168
0
    }
169
0
    pBuffer->append('"');
170
0
}
171
172
}
173
174
// Produces the textual form of this rule.
//
// A KIND_PREFIX rule without translation whose prefix is "<scheme>:" is
// written as the bare <scheme>. Otherwise the text is the quoted prefix (if
// not empty) followed by the pattern for the kind; rules with a translation
// additionally wrap the pattern in parentheses and end with "->", the quoted
// reverse prefix (if not empty) and "\1".
OUString Regexp::getRegexp() const
{
    // Abbreviated form: drop the trailing ':' of the prefix.
    if (!m_bTranslation && m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
        return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);

    OUStringBuffer aText;
    if (!m_aPrefix.isEmpty())
        appendStringLiteral(&aText, m_aPrefix);

    // Repetition operator of the "[^/?#]" run in KIND_DOMAIN rules.
    sal_Unicode const cRepeat = m_bEmptyDomain ? '*' : '+';

    if (!m_bTranslation)
    {
        if (m_eKind == KIND_PREFIX)
        {
            aText.append(".*");
        }
        else if (m_eKind == KIND_AUTHORITY)
        {
            aText.append("([/?#].*)?");
        }
        else if (m_eKind == KIND_DOMAIN)
        {
            aText.append("[^/?#]" + OUStringChar(cRepeat));
            if (!m_aInfix.isEmpty())
                appendStringLiteral(&aText, m_aInfix);
            aText.append("([/?#].*)?");
        }
        return aText.makeStringAndClear();
    }

    // Translation form: the pattern is captured as group 1 ...
    if (m_eKind == KIND_PREFIX)
    {
        aText.append("(.*)");
    }
    else if (m_eKind == KIND_AUTHORITY)
    {
        aText.append("(([/?#].*)?)");
    }
    else if (m_eKind == KIND_DOMAIN)
    {
        aText.append("([^/?#]" + OUStringChar(cRepeat));
        if (!m_aInfix.isEmpty())
            appendStringLiteral(&aText, m_aInfix);
        aText.append("([/?#].*)?)");
    }

    // ... and referenced again after the reverse prefix.
    aText.append("->");
    if (!m_aReversePrefix.isEmpty())
        appendStringLiteral(&aText, m_aReversePrefix);
    aText.append("\\1");
    return aText.makeStringAndClear();
}
231
232
233
namespace {
234
235
bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
236
                 char const * pString, size_t nStringLength)
237
0
{
238
0
    sal_Unicode const * p = *pBegin;
239
240
0
    unsigned char const * q = reinterpret_cast< unsigned char const * >(pString);
241
0
    unsigned char const * qEnd = q + nStringLength;
242
243
0
    if (pEnd - p < qEnd - q)
244
0
        return false;
245
246
0
    while (q != qEnd)
247
0
    {
248
0
        sal_Unicode c1 = *p++;
249
0
        sal_Unicode c2 = *q++;
250
0
        if (c1 != c2)
251
0
            return false;
252
0
    }
253
254
0
    *pBegin = p;
255
0
    return true;
256
0
}
257
258
bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
259
                       OUString * pString)
260
0
{
261
0
    sal_Unicode const * p = *pBegin;
262
263
0
    if (p == pEnd || *p++ != '"')
264
0
        return false;
265
266
0
    OUStringBuffer aBuffer;
267
0
    for (;;)
268
0
    {
269
0
        if (p == pEnd)
270
0
            return false;
271
0
        sal_Unicode c = *p++;
272
0
        if (c == '"')
273
0
            break;
274
0
        if (c == '\\')
275
0
        {
276
0
            if (p == pEnd)
277
0
                return false;
278
0
            c = *p++;
279
0
            if (c != '"' && c != '\\')
280
0
                return false;
281
0
        }
282
0
        aBuffer.append(c);
283
0
    }
284
285
0
    *pBegin = p;
286
0
    *pString = aBuffer.makeStringAndClear();
287
0
    return true;
288
0
}
289
290
}
291
292
// Parses the textual form of a rule.
//
// Accepted inputs, where P, I and R stand for optional double-quoted
// literals and Q is '*' or '+':
//   <scheme>                          short for "<scheme>:".*
//   P.*                               KIND_PREFIX
//   P(.*)->R\1                        KIND_PREFIX with translation
//   P([/?#].*)?                       KIND_AUTHORITY
//   P(([/?#].*)?)->R\1                KIND_AUTHORITY with translation
//   P[^/?#]QI([/?#].*)?               KIND_DOMAIN
//   P([^/?#]QI([/?#].*)?)->R\1        KIND_DOMAIN with translation
// In the KIND_AUTHORITY and KIND_DOMAIN translation forms R has to be
// present (it may be the empty literal ""); in the KIND_PREFIX translation
// form it may be left out.
//
// Throws lang::IllegalArgumentException for anything else.
Regexp Regexp::parse(OUString const & rRegexp)
{
    // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
    // where <scheme> is as defined in RFC 2396:
    if (isScheme(rRegexp, false))
    {
        return Regexp(Regexp::KIND_PREFIX, rRegexp + ":", false, OUString(),
                      false, OUString());
    }

    sal_Unicode const * pCur = rRegexp.getStr();
    sal_Unicode const * const pEnd = pCur + rRegexp.getLength();

    // The leading literal is optional, so a failed scan is not an error.
    OUString aPrefix;
    scanStringLiteral(&pCur, pEnd, &aPrefix);

    // A pattern has to follow the prefix.
    if (pCur == pEnd)
        throw lang::IllegalArgumentException();

    // This and the matchString() calls below are some of the few places where
    // RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
    // (c.f. https://gerrit.libreoffice.org/3117)
    if (matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
    {
        if (pCur != pEnd)
            throw lang::IllegalArgumentException();

        return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
                      false, OUString());
    }

    if (matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
    {
        // The reverse prefix literal may be absent here.
        OUString aReversePrefix;
        scanStringLiteral(&pCur, pEnd, &aReversePrefix);

        bool const bBackReference
            = matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"));
        if (!bBackReference || pCur != pEnd)
            throw lang::IllegalArgumentException();

        return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
                      true, aReversePrefix);
    }

    if (matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
    {
        if (pCur != pEnd)
            throw lang::IllegalArgumentException();

        return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
                      false, OUString());
    }

    if (matchString(&pCur, pEnd,
                    RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
    {
        OUString aReversePrefix;
        if (!(scanStringLiteral(&pCur, pEnd, &aReversePrefix)
              && matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
              && pCur == pEnd))
        {
            throw lang::IllegalArgumentException();
        }

        return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
                      true, aReversePrefix);
    }

    // Remaining possibility: a KIND_DOMAIN rule. An opening parenthesis
    // announces the translation form.
    bool bOpen = false;
    if (pCur != pEnd && *pCur == '(')
    {
        bOpen = true;
        ++pCur;
    }

    if (!matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
        throw lang::IllegalArgumentException();

    if (pCur == pEnd || (*pCur != '*' && *pCur != '+'))
        throw lang::IllegalArgumentException();
    bool const bEmptyDomain = *pCur == '*';
    ++pCur;

    // The infix literal is optional.
    OUString aInfix;
    scanStringLiteral(&pCur, pEnd, &aInfix);

    if (!matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
        throw lang::IllegalArgumentException();

    OUString aReversePrefix;
    if (bOpen)
    {
        bool const bTranslationTail
            = matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
              && scanStringLiteral(&pCur, pEnd, &aReversePrefix)
              && matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"));
        if (!bTranslationTail)
            throw lang::IllegalArgumentException();
    }

    if (pCur != pEnd)
        throw lang::IllegalArgumentException();

    return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
                  bOpen, aReversePrefix);
}
392
393
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */