Coverage Report

Created: 2024-02-25 06:22

/src/poco/Foundation/src/RegularExpression.cpp
Line
Count
Source (jump to first uncovered line)
1
//
2
// RegularExpression.cpp
3
//
4
// Library: Foundation
5
// Package: RegExp
6
// Module:  RegularExpression
7
//
8
// Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH.
9
// and Contributors.
10
//
11
// SPDX-License-Identifier: BSL-1.0
12
//
13
14
15
#include "Poco/RegularExpression.h"
16
#include "Poco/Exception.h"
17
#include <sstream>
18
#if defined(POCO_UNBUNDLED)
19
#define PCRE2_CODE_UNIT_WIDTH 8
20
#include <pcre2.h>
21
#else
22
#include "pcre2_config.h"
23
#include "pcre2.h"
24
#endif
25
26
27
namespace
28
{
29
  class MatchData
30
  {
31
  public:
32
    MatchData(pcre2_code_8* code):
33
      _match(pcre2_match_data_create_from_pattern_8(reinterpret_cast<pcre2_code_8*>(code), nullptr))
34
0
    {
35
0
      if (!_match) throw Poco::RegularExpressionException("cannot create match data");
36
0
    }
37
38
    ~MatchData()
39
0
    {
40
0
      if (_match) pcre2_match_data_free_8(_match);
41
0
    }
42
43
    std::uint32_t count() const
44
0
    {
45
0
      return pcre2_get_ovector_count_8(_match);
46
0
    }
47
48
    const PCRE2_SIZE* data() const
49
0
    {
50
0
      return pcre2_get_ovector_pointer_8(_match);
51
0
    }
52
53
    operator pcre2_match_data_8*()
54
0
    {
55
0
      return _match;
56
0
    }
57
58
  private:
59
    pcre2_match_data_8* _match;
60
  };
61
}
62
63
64
namespace Poco {
65
66
67
// Compiles pattern with the given RE_* options and records the names of
// all named capture groups in _groups, keyed by group number.
//
// The newline convention is taken from the RE_NEWLINE_* bits in options;
// when none of LF/CRLF/ANY/ANYCRLF is set, CR is used.
// The third (study) argument is ignored.
//
// Throws RegularExpressionException if the compile context cannot be
// created or if the pattern does not compile (the message contains the
// PCRE2 error text and the offset of the error in the pattern).
RegularExpression::RegularExpression(const std::string& pattern, int options, bool /*study*/): _pcre(nullptr)
{
  pcre2_compile_context_8* context = pcre2_compile_context_create_8(nullptr);
  if (!context) throw Poco::RegularExpressionException("cannot create compile context");

  if (options & RE_NEWLINE_LF)
    pcre2_set_newline_8(context, PCRE2_NEWLINE_LF);
  else if (options & RE_NEWLINE_CRLF)
    pcre2_set_newline_8(context, PCRE2_NEWLINE_CRLF);
  else if (options & RE_NEWLINE_ANY)
    pcre2_set_newline_8(context, PCRE2_NEWLINE_ANY);
  else if (options & RE_NEWLINE_ANYCRLF)
    pcre2_set_newline_8(context, PCRE2_NEWLINE_ANYCRLF);
  else // default RE_NEWLINE_CR
    pcre2_set_newline_8(context, PCRE2_NEWLINE_CR);

  int errorCode = 0;
  PCRE2_SIZE errorOffset = 0;
  _pcre = pcre2_compile_8(reinterpret_cast<PCRE2_SPTR>(pattern.c_str()), pattern.length(), compileOptions(options), &errorCode, &errorOffset, context);
  pcre2_compile_context_free_8(context);

  if (!_pcre)
  {
    PCRE2_UCHAR buffer[256];
    pcre2_get_error_message_8(errorCode, buffer, sizeof(buffer));
    std::ostringstream msg;
    msg << reinterpret_cast<char*>(buffer) << " (at offset " << errorOffset << ")";
    throw RegularExpressionException(msg.str());
  }

  // The destructor does not run if this constructor throws, so the compiled
  // pattern has to be released here should filling _groups fail.
  try
  {
    pcre2_code_8* code = reinterpret_cast<pcre2_code_8*>(_pcre);

    // Zero-initialised so that a failing pcre2_pattern_info call simply
    // results in no names being recorded.
    std::uint32_t nameCount = 0;
    std::uint32_t nameEntrySize = 0;
    PCRE2_SPTR nameTable = nullptr;
    pcre2_pattern_info_8(code, PCRE2_INFO_NAMECOUNT, &nameCount);
    pcre2_pattern_info_8(code, PCRE2_INFO_NAMEENTRYSIZE, &nameEntrySize);
    pcre2_pattern_info_8(code, PCRE2_INFO_NAMETABLE, &nameTable);

    for (std::uint32_t i = 0; nameTable && i < nameCount; ++i)
    {
      // Each 8-bit name table entry is the group number (two bytes, most
      // significant first) followed by the zero-terminated group name.
      // Reading the number from the entry also works for duplicate names
      // (RE_DUPNAMES), where a lookup by name has no unique answer.
      PCRE2_SPTR entry = nameTable + static_cast<std::size_t>(nameEntrySize) * i;
      const int n = (entry[0] << 8) | entry[1];
      _groups[n] = std::string(reinterpret_cast<const char*>(entry + 2));
    }
  }
  catch (...)
  {
    pcre2_code_free_8(reinterpret_cast<pcre2_code_8*>(_pcre));
    _pcre = nullptr;
    throw;
  }
}
112
113
114
// Frees the compiled pattern, if there is one.
RegularExpression::~RegularExpression()
{
  if (!_pcre) return;

  pcre2_code_free_8(reinterpret_cast<pcre2_code_8*>(_pcre));
}
118
119
120
int RegularExpression::match(const std::string& subject, std::string::size_type offset, Match& mtch, int options) const
121
0
{
122
0
  poco_assert (offset <= subject.length());
123
124
0
  MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre));
125
0
  int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
126
0
  if (rc == PCRE2_ERROR_NOMATCH)
127
0
  {
128
0
    mtch.offset = std::string::npos;
129
0
    mtch.length = 0;
130
0
    return 0;
131
0
  }
132
0
  else if (rc == PCRE2_ERROR_BADOPTION)
133
0
  {
134
0
    throw RegularExpressionException("bad option");
135
0
  }
136
0
  else if (rc == 0)
137
0
  {
138
0
    throw RegularExpressionException("too many captured substrings");
139
0
  }
140
0
  else if (rc < 0)
141
0
  {
142
0
    PCRE2_UCHAR buffer[256];
143
0
    pcre2_get_error_message_8(rc, buffer, sizeof(buffer));
144
0
    throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer)));
145
0
  }
146
0
  const PCRE2_SIZE* ovec = matchData.data();
147
0
  mtch.offset = ovec[0] < 0 ? std::string::npos : ovec[0];
148
0
  mtch.length = ovec[1] - mtch.offset;
149
0
  return rc;
150
0
}
151
152
153
int RegularExpression::match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options) const
154
0
{
155
0
  poco_assert (offset <= subject.length());
156
157
0
  matches.clear();
158
159
0
  MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre));
160
0
  int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, options & 0xFFFF, matchData, nullptr);
161
0
  if (rc == PCRE2_ERROR_NOMATCH)
162
0
  {
163
0
    return 0;
164
0
  }
165
0
  else if (rc == PCRE2_ERROR_BADOPTION)
166
0
  {
167
0
    throw RegularExpressionException("bad option");
168
0
  }
169
0
  else if (rc == 0)
170
0
  {
171
0
    throw RegularExpressionException("too many captured substrings");
172
0
  }
173
0
  else if (rc < 0)
174
0
  {
175
0
    PCRE2_UCHAR buffer[256];
176
0
    pcre2_get_error_message_8(rc, buffer, sizeof(buffer));
177
0
    throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer)));
178
0
  }
179
0
  matches.reserve(rc);
180
0
  const PCRE2_SIZE* ovec = matchData.data();
181
0
  for (int i = 0; i < rc; ++i)
182
0
  {
183
0
    Match m;
184
0
    GroupMap::const_iterator it;
185
186
0
    m.offset = ovec[i*2] < 0 ? std::string::npos : ovec[i*2] ;
187
0
    m.length = ovec[i*2 + 1] - m.offset;
188
189
0
    it = _groups.find(i);
190
0
    if (it != _groups.end())
191
0
    {
192
0
      m.name = (*it).second;
193
0
    }
194
195
0
    matches.push_back(m);
196
0
  }
197
0
  return rc;
198
0
}
199
200
201
bool RegularExpression::match(const std::string& subject, std::string::size_type offset) const
202
0
{
203
0
  Match mtch;
204
0
  match(subject, offset, mtch, RE_ANCHORED | RE_NOTEMPTY);
205
0
  return mtch.offset == offset && mtch.length == subject.length() - offset;
206
0
}
207
208
209
bool RegularExpression::match(const std::string& subject, std::string::size_type offset, int options) const
210
0
{
211
0
  Match mtch;
212
0
  match(subject, offset, mtch, options);
213
0
  return mtch.offset == offset && mtch.length == subject.length() - offset;
214
0
}
215
216
217
int RegularExpression::extract(const std::string& subject, std::string& str, int options) const
218
0
{
219
0
  Match mtch;
220
0
  int rc = match(subject, 0, mtch, options);
221
0
  if (mtch.offset != std::string::npos)
222
0
    str.assign(subject, mtch.offset, mtch.length);
223
0
  else
224
0
    str.clear();
225
0
  return rc;
226
0
}
227
228
229
int RegularExpression::extract(const std::string& subject, std::string::size_type offset, std::string& str, int options) const
230
0
{
231
0
  Match mtch;
232
0
  int rc = match(subject, offset, mtch, options);
233
0
  if (mtch.offset != std::string::npos)
234
0
    str.assign(subject, mtch.offset, mtch.length);
235
0
  else
236
0
    str.clear();
237
0
  return rc;
238
0
}
239
240
241
int RegularExpression::split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options) const
242
0
{
243
0
  MatchVec matches;
244
0
  strings.clear();
245
0
  int rc = match(subject, offset, matches, options);
246
0
  strings.reserve(matches.size());
247
0
  for (const auto& m: matches)
248
0
  {
249
0
    if (m.offset != std::string::npos)
250
0
      strings.push_back(subject.substr(m.offset, m.length));
251
0
    else
252
0
      strings.push_back(std::string());
253
0
  }
254
0
  return rc;
255
0
}
256
257
258
int RegularExpression::subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const
259
0
{
260
0
  if (options & RE_GLOBAL)
261
0
  {
262
0
    int rc = 0;
263
0
    std::string::size_type pos = substOne(subject, offset, replacement, options);
264
0
    while (pos != std::string::npos)
265
0
    {
266
0
      ++rc;
267
0
      pos = substOne(subject, pos, replacement, options);
268
0
    }
269
0
    return rc;
270
0
  }
271
0
  else
272
0
  {
273
0
    return substOne(subject, offset, replacement, options) != std::string::npos ? 1 : 0;
274
0
  }
275
0
}
276
277
278
std::string::size_type RegularExpression::substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const
279
0
{
280
0
  if (offset >= subject.length()) return std::string::npos;
281
282
0
  MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre));
283
0
  int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
284
0
  if (rc == PCRE2_ERROR_NOMATCH)
285
0
  {
286
0
    return std::string::npos;
287
0
  }
288
0
  else if (rc == PCRE2_ERROR_BADOPTION)
289
0
  {
290
0
    throw RegularExpressionException("bad option");
291
0
  }
292
0
  else if (rc == 0)
293
0
  {
294
0
    throw RegularExpressionException("too many captured substrings");
295
0
  }
296
0
  else if (rc < 0)
297
0
  {
298
0
    PCRE2_UCHAR buffer[256];
299
0
    pcre2_get_error_message_8(rc, buffer, sizeof(buffer));
300
0
    throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer)));
301
0
  }
302
0
  const PCRE2_SIZE* ovec = matchData.data();
303
0
  std::string result;
304
0
  std::string::size_type len = subject.length();
305
0
  std::string::size_type pos = 0;
306
0
  std::string::size_type rp = std::string::npos;
307
0
  while (pos < len)
308
0
  {
309
0
    if (ovec[0] == pos)
310
0
    {
311
0
      std::string::const_iterator it  = replacement.begin();
312
0
      std::string::const_iterator end = replacement.end();
313
0
      while (it != end)
314
0
      {
315
0
        if (*it == '$' && !(options & RE_NO_VARS))
316
0
        {
317
0
          ++it;
318
0
          if (it != end)
319
0
          {
320
0
            char d = *it;
321
0
            if (d >= '0' && d <= '9')
322
0
            {
323
0
              int c = d - '0';
324
0
              if (c < rc)
325
0
              {
326
0
                std::size_t o = ovec[c*2];
327
0
                std::size_t l = ovec[c*2 + 1] - o;
328
0
                result.append(subject, o, l);
329
0
              }
330
0
            }
331
0
            else
332
0
            {
333
0
              result += '$';
334
0
              result += d;
335
0
            }
336
0
            ++it;
337
0
          }
338
0
          else result += '$';
339
0
        }
340
0
        else result += *it++;
341
0
      }
342
0
      pos = ovec[1];
343
0
      rp = result.length();
344
0
    }
345
0
    else result += subject[pos++];
346
0
  }
347
0
  subject = result;
348
0
  return rp;
349
0
}
350
351
352
bool RegularExpression::match(const std::string& subject, const std::string& pattern, int options)
353
0
{
354
0
  int ctorOptions = options & (RE_CASELESS | RE_MULTILINE | RE_DOTALL | RE_EXTENDED | RE_ANCHORED | RE_DOLLAR_ENDONLY | RE_EXTRA | RE_UNGREEDY | RE_UTF8 | RE_NO_AUTO_CAPTURE);
355
0
  int mtchOptions = options & (RE_ANCHORED | RE_NOTBOL | RE_NOTEOL | RE_NOTEMPTY | RE_NO_AUTO_CAPTURE | RE_NO_UTF8_CHECK);
356
0
  RegularExpression re(pattern, ctorOptions, false);
357
0
  return re.match(subject, 0, mtchOptions);
358
0
}
359
360
361
int RegularExpression::compileOptions(int options)
362
0
{
363
0
  int pcreOptions = 0;
364
365
0
  if (options & RE_CASELESS)
366
0
    pcreOptions |= PCRE2_CASELESS;
367
0
  if (options & RE_MULTILINE)
368
0
    pcreOptions |= PCRE2_MULTILINE;
369
0
  if (options & RE_DOTALL)
370
0
    pcreOptions |= PCRE2_DOTALL;
371
0
  if (options & RE_EXTENDED)
372
0
    pcreOptions |= PCRE2_EXTENDED;
373
0
  if (options & RE_ANCHORED)
374
0
    pcreOptions |= PCRE2_ANCHORED;
375
0
  if (options & RE_DOLLAR_ENDONLY)
376
0
    pcreOptions |= PCRE2_DOLLAR_ENDONLY;
377
0
  if (options & RE_UNGREEDY)
378
0
    pcreOptions |= PCRE2_UNGREEDY;
379
0
  if (options & RE_UTF8)
380
0
    pcreOptions |= PCRE2_UTF | PCRE2_UCP;
381
0
  if (options & RE_NO_AUTO_CAPTURE)
382
0
    pcreOptions |= PCRE2_NO_AUTO_CAPTURE;
383
0
  if (options & RE_FIRSTLINE)
384
0
    pcreOptions |= PCRE2_FIRSTLINE;
385
0
  if (options & RE_DUPNAMES)
386
0
    pcreOptions |= PCRE2_DUPNAMES;
387
388
0
  return pcreOptions;
389
0
}
390
391
392
int RegularExpression::matchOptions(int options)
393
0
{
394
0
  int pcreOptions = 0;
395
396
0
  if (options & RE_ANCHORED)
397
0
    pcreOptions |= PCRE2_ANCHORED;
398
0
  if (options & RE_NOTBOL)
399
0
    pcreOptions |= PCRE2_NOTBOL;
400
0
  if (options & RE_NOTEOL)
401
0
    pcreOptions |= PCRE2_NOTEOL;
402
0
  if (options & RE_NOTEMPTY)
403
0
    pcreOptions |= PCRE2_NOTEMPTY;
404
0
  if (options & RE_NO_AUTO_CAPTURE)
405
0
    pcreOptions |= PCRE2_NO_AUTO_CAPTURE;
406
0
  if (options & RE_NO_UTF8_CHECK)
407
0
    pcreOptions |= PCRE2_NO_UTF_CHECK;
408
409
0
  return pcreOptions;
410
0
}
411
412
413
} // namespace Poco