# Copyright 2019 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
r"""A drop-in replacement for the re module.

It uses RE2 under the hood, of course, so various PCRE features
(e.g. backreferences, look-around assertions) are not supported.
See https://github.com/google/re2/wiki/Syntax for the canonical
reference, but known syntactic "gotchas" relative to Python are:

  * PCRE supports \Z and \z; RE2 supports \z; Python supports \z,
    but calls it \Z. You must rewrite \Z to \z in pattern strings.

Known differences between this module's API and the re module's API:

  * The error class does not provide any error information as attributes.
  * The Options class replaces the re module's flags with RE2's options as
    gettable/settable properties. Please see re2.h for their documentation.
  * The pattern string and the input string do not have to be the same type.
    Any str will be encoded to UTF-8.
  * The pattern string cannot be str if the options specify Latin-1 encoding.

This module's LRU cache contains a maximum of 128 regular expression objects.
Each regular expression object's underlying RE2 object uses a maximum of 8MiB
of memory (by default). Hence, this module's LRU cache uses a maximum of 1GiB
of memory (by default), but in most cases, it should use much less than that.
"""

import codecs
import functools
import itertools

import _re2


# pybind11 translates C++ exceptions to Python exceptions.
# We use that same Python exception class for consistency.
error = _re2.Error


class Options(_re2.RE2.Options):

  __slots__ = ()

  NAMES = (
      'max_mem',
      'encoding',
      'posix_syntax',
      'longest_match',
      'log_errors',
      'literal',
      'never_nl',
      'dot_nl',
      'never_capture',
      'case_sensitive',
      'perl_classes',
      'word_boundary',
      'one_line',
  )


def compile(pattern, options=None):
  if isinstance(pattern, _Regexp):
    if options:
      raise error('pattern is already compiled, so '
                  'options may not be specified')
    pattern = pattern._pattern
  options = options or Options()
  values = tuple(getattr(options, name) for name in Options.NAMES)
  return _Regexp._make(pattern, values)


def search(pattern, text, options=None):
  return compile(pattern, options=options).search(text)


def match(pattern, text, options=None):
  return compile(pattern, options=options).match(text)


def fullmatch(pattern, text, options=None):
  return compile(pattern, options=options).fullmatch(text)


def finditer(pattern, text, options=None):
  return compile(pattern, options=options).finditer(text)


def findall(pattern, text, options=None):
  return compile(pattern, options=options).findall(text)


def split(pattern, text, maxsplit=0, options=None):
  return compile(pattern, options=options).split(text, maxsplit)


def subn(pattern, repl, text, count=0, options=None):
  return compile(pattern, options=options).subn(repl, text, count)


def sub(pattern, repl, text, count=0, options=None):
  return compile(pattern, options=options).sub(repl, text, count)
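
# A minimal sketch of the module-level helpers above, kept in comments. Each
# one delegates to compile(), which memoises the compiled _Regexp in the LRU
# cache described in the module docstring:
#
#   re2.findall(r'\d+', '10 jumps, 3 falls')     # expected: ['10', '3']
#   re2.split(r',\s*', 'a, b, c')                # expected: ['a', 'b', 'c']
#   re2.sub(r'\s+', ' ', 'too   many   spaces')  # expected: 'too many spaces'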


def _encode(t):
  return t.encode(encoding='utf-8')


def _decode(b):
  return b.decode(encoding='utf-8')


def escape(pattern):
  if isinstance(pattern, str):
    encoded_pattern = _encode(pattern)
    escaped = _re2.RE2.QuoteMeta(encoded_pattern)
    decoded_escaped = _decode(escaped)
    return decoded_escaped
  else:
    escaped = _re2.RE2.QuoteMeta(pattern)
    return escaped


def purge():
  return _Regexp._make.cache_clear()


_Anchor = _re2.RE2.Anchor
_NULL_SPAN = (-1, -1)


class _Regexp(object):

  __slots__ = ('_pattern', '_regexp')

  @classmethod
  @functools.lru_cache(typed=True)
  def _make(cls, pattern, values):
    options = Options()
    for name, value in zip(Options.NAMES, values):
      setattr(options, name, value)
    return cls(pattern, options)

  def __init__(self, pattern, options):
    self._pattern = pattern
    if isinstance(self._pattern, str):
      if options.encoding == Options.Encoding.LATIN1:
        raise error('string type of pattern is str, but '
                    'encoding specified in options is LATIN1')
      encoded_pattern = _encode(self._pattern)
      self._regexp = _re2.RE2(encoded_pattern, options)
    else:
      self._regexp = _re2.RE2(self._pattern, options)
    if not self._regexp.ok():
      raise error(self._regexp.error())

  def __getstate__(self):
    options = {name: getattr(self.options, name) for name in Options.NAMES}
    return self._pattern, options

  def __setstate__(self, state):
    pattern, options = state
    values = tuple(options[name] for name in Options.NAMES)
    other = _Regexp._make(pattern, values)
    self._pattern = other._pattern
    self._regexp = other._regexp

  def _match(self, anchor, text, pos=None, endpos=None):
    pos = 0 if pos is None else max(0, min(pos, len(text)))
    endpos = len(text) if endpos is None else max(0, min(endpos, len(text)))
    if pos > endpos:
      return
    if isinstance(text, str):
      encoded_text = _encode(text)
      encoded_pos = _re2.CharLenToBytes(encoded_text, 0, pos)
      if endpos == len(text):
        # This is the common case.
        encoded_endpos = len(encoded_text)
      else:
        encoded_endpos = encoded_pos + _re2.CharLenToBytes(
            encoded_text, encoded_pos, endpos - pos)
      decoded_offsets = {0: 0}
      last_offset = 0
      while True:
        spans = self._regexp.Match(anchor, encoded_text, encoded_pos,
                                   encoded_endpos)
        if spans[0] == _NULL_SPAN:
          break

        # This algorithm is linear in the length of encoded_text. Specifically,
        # no matter how many groups there are for a given regular expression or
        # how many iterations through the loop there are for a given generator,
        # this algorithm uses a single, straightforward pass over encoded_text.
        offsets = sorted(set(itertools.chain(*spans)))
        if offsets[0] == -1:
          offsets = offsets[1:]
        # Discard the rest of the items because they are useless now - and we
        # could accumulate one item per str offset in the pathological case!
        decoded_offsets = {last_offset: decoded_offsets[last_offset]}
        for offset in offsets:
          decoded_offsets[offset] = (
              decoded_offsets[last_offset] +
              _re2.BytesToCharLen(encoded_text, last_offset, offset))
          last_offset = offset

        def decode(span):
          if span == _NULL_SPAN:
            return span
          return decoded_offsets[span[0]], decoded_offsets[span[1]]

        decoded_spans = [decode(span) for span in spans]
        yield _Match(self, text, pos, endpos, decoded_spans)
        if encoded_pos == encoded_endpos:
          break
        elif encoded_pos == spans[0][1]:
          # We matched the empty string at encoded_pos and would be stuck, so
          # in order to make forward progress, increment the str offset.
          encoded_pos += _re2.CharLenToBytes(encoded_text, encoded_pos, 1)
        else:
          encoded_pos = spans[0][1]
    else:
      while True:
        spans = self._regexp.Match(anchor, text, pos, endpos)
        if spans[0] == _NULL_SPAN:
          break
        yield _Match(self, text, pos, endpos, spans)
        if pos == endpos:
          break
        elif pos == spans[0][1]:
          # We matched the empty string at pos and would be stuck, so in order
          # to make forward progress, increment the bytes offset.
          pos += 1
        else:
          pos = spans[0][1]

  def search(self, text, pos=None, endpos=None):
    return next(self._match(_Anchor.UNANCHORED, text, pos, endpos), None)

  def match(self, text, pos=None, endpos=None):
    return next(self._match(_Anchor.ANCHOR_START, text, pos, endpos), None)

  def fullmatch(self, text, pos=None, endpos=None):
    return next(self._match(_Anchor.ANCHOR_BOTH, text, pos, endpos), None)

  def finditer(self, text, pos=None, endpos=None):
    return self._match(_Anchor.UNANCHORED, text, pos, endpos)

  def findall(self, text, pos=None, endpos=None):
    empty = type(text)()
    items = []
    for match in self.finditer(text, pos, endpos):
      if not self.groups:
        item = match.group()
      elif self.groups == 1:
        item = match.groups(default=empty)[0]
      else:
        item = match.groups(default=empty)
      items.append(item)
    return items

  def _split(self, cb, text, maxsplit=0):
    if maxsplit < 0:
      return [text], 0
    elif maxsplit > 0:
      matchiter = itertools.islice(self.finditer(text), maxsplit)
    else:
      matchiter = self.finditer(text)
    pieces = []
    end = 0
    numsplit = 0
    for match in matchiter:
      pieces.append(text[end:match.start()])
      pieces.extend(cb(match))
      end = match.end()
      numsplit += 1
    pieces.append(text[end:])
    return pieces, numsplit

  def split(self, text, maxsplit=0):
    cb = lambda match: [match[group] for group in range(1, self.groups + 1)]
    pieces, _ = self._split(cb, text, maxsplit)
    return pieces

  def subn(self, repl, text, count=0):
    cb = lambda match: [repl(match) if callable(repl) else match.expand(repl)]
    empty = type(text)()
    pieces, numsplit = self._split(cb, text, count)
    joined_pieces = empty.join(pieces)
    return joined_pieces, numsplit

  def sub(self, repl, text, count=0):
    joined_pieces, _ = self.subn(repl, text, count)
    return joined_pieces

  @property
  def pattern(self):
    return self._pattern

  @property
  def options(self):
    return self._regexp.options()

  @property
  def groups(self):
    return self._regexp.NumberOfCapturingGroups()

  @property
  def groupindex(self):
    groups = self._regexp.NamedCapturingGroups()
    if isinstance(self._pattern, str):
      decoded_groups = [(_decode(group), index) for group, index in groups]
      return dict(decoded_groups)
    else:
      return dict(groups)

  @property
  def programsize(self):
    return self._regexp.ProgramSize()

  @property
  def reverseprogramsize(self):
    return self._regexp.ReverseProgramSize()

  @property
  def programfanout(self):
    return self._regexp.ProgramFanout()

  @property
  def reverseprogramfanout(self):
    return self._regexp.ReverseProgramFanout()

  def possiblematchrange(self, maxlen):
    ok, min, max = self._regexp.PossibleMatchRange(maxlen)
    if not ok:
      raise error('failed to compute match range')
    return min, max
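
# A minimal sketch of working with a compiled pattern, kept in comments. The
# pos/endpos arguments behave like the re module's, and programsize is one of
# the RE2-specific introspection properties defined above:
#
#   pattern = re2.compile(r'(?P<word>\w+)')
#   pattern.groups                 # expected: 1
#   pattern.groupindex             # expected: {'word': 1}
#   pattern.search('a bc', pos=2)  # matches 'bc'
#   pattern.programsize            # size of the compiled (forward) program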


class _Match(object):

  __slots__ = ('_regexp', '_text', '_pos', '_endpos', '_spans')

  def __init__(self, regexp, text, pos, endpos, spans):
    self._regexp = regexp
    self._text = text
    self._pos = pos
    self._endpos = endpos
    self._spans = spans

  # Python prioritises three-digit octal numbers over group escapes.
  # For example, \100 should not be handled the same way as \g<10>0.
  _OCTAL_RE = compile('\\\\[0-7][0-7][0-7]')

  # Python supports \1 through \99 (inclusive) and \g<...> syntax.
  _GROUP_RE = compile('\\\\[1-9][0-9]?|\\\\g<\\w+>')

  @classmethod
  @functools.lru_cache(typed=True)
  def _split(cls, template):
    if isinstance(template, str):
      backslash = '\\'
    else:
      backslash = b'\\'
    empty = type(template)()
    pieces = [empty]
    index = template.find(backslash)
    while index != -1:
      piece, template = template[:index], template[index:]
      pieces[-1] += piece
      octal_match = cls._OCTAL_RE.match(template)
      group_match = cls._GROUP_RE.match(template)
      if (not octal_match) and group_match:
        index = group_match.end()
        piece, template = template[:index], template[index:]
        pieces.extend((piece, empty))
      else:
        # 2 isn't enough for \o, \x, \N, \u and \U escapes, but none of those
        # should contain backslashes, so break them here and then fix them at
        # the beginning of the next loop iteration or right before returning.
        index = 2
        piece, template = template[:index], template[index:]
        pieces[-1] += piece
      index = template.find(backslash)
    pieces[-1] += template
    return pieces

  def expand(self, template):
    if isinstance(template, str):
      unescape = codecs.unicode_escape_decode
    else:
      unescape = codecs.escape_decode
    empty = type(template)()
    # Make a copy so that we don't clobber the cached pieces!
    pieces = list(self._split(template))
    for index, piece in enumerate(pieces):
      if not index % 2:
        pieces[index], _ = unescape(piece)
      else:
        if len(piece) <= 3:  # \1 through \99 (inclusive)
          group = int(piece[1:])
        else:  # \g<...>
          group = piece[3:-1]
          try:
            group = int(group)
          except ValueError:
            pass
        pieces[index] = self.__getitem__(group) or empty
    joined_pieces = empty.join(pieces)
    return joined_pieces

  def __getitem__(self, group):
    if not isinstance(group, int):
      try:
        group = self._regexp.groupindex[group]
      except KeyError:
        raise IndexError('bad group name')
    if not 0 <= group <= self._regexp.groups:
      raise IndexError('bad group index')
    span = self._spans[group]
    if span == _NULL_SPAN:
      return None
    return self._text[span[0]:span[1]]

  def group(self, *groups):
    if not groups:
      groups = (0,)
    items = (self.__getitem__(group) for group in groups)
    return next(items) if len(groups) == 1 else tuple(items)

  def groups(self, default=None):
    items = []
    for group in range(1, self._regexp.groups + 1):
      item = self.__getitem__(group)
      items.append(default if item is None else item)
    return tuple(items)

  def groupdict(self, default=None):
    items = []
    for group, index in self._regexp.groupindex.items():
      item = self.__getitem__(index)
      items.append((group, default) if item is None else (group, item))
    return dict(items)

  def start(self, group=0):
    if not 0 <= group <= self._regexp.groups:
      raise IndexError('bad group index')
    return self._spans[group][0]

  def end(self, group=0):
    if not 0 <= group <= self._regexp.groups:
      raise IndexError('bad group index')
    return self._spans[group][1]

  def span(self, group=0):
    if not 0 <= group <= self._regexp.groups:
      raise IndexError('bad group index')
    return self._spans[group]

  @property
  def re(self):
    return self._regexp

  @property
  def string(self):
    return self._text

  @property
  def pos(self):
    return self._pos

  @property
  def endpos(self):
    return self._endpos

  @property
  def lastindex(self):
    max_end = -1
    max_group = None
    # We look for the rightmost right parenthesis by keeping the first group
    # that ends at max_end because that is the leftmost/outermost group when
    # there are nested groups!
    for group in range(1, self._regexp.groups + 1):
      end = self._spans[group][1]
      if max_end < end:
        max_end = end
        max_group = group
    return max_group

  @property
  def lastgroup(self):
    max_group = self.lastindex
    if not max_group:
      return None
    for group, index in self._regexp.groupindex.items():
      if max_group == index:
        return group
    return None
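
# A minimal sketch of the match object API above, kept in comments:
#
#   m = re2.match(r'(?P<key>\w+)=(?P<value>\w+)', 'top=10')
#   m['key'], m[2]              # expected: ('top', '10')
#   m.span(1)                   # expected: (0, 3)
#   m.expand(r'\g<value>: \1')  # expected: '10: top'
#   m.lastgroup                 # expected: 'value'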


class Set(object):
  """A Pythonic wrapper around RE2::Set."""

  __slots__ = ('_set')

  def __init__(self, anchor, options=None):
    options = options or Options()
    self._set = _re2.Set(anchor, options)

  @classmethod
  def SearchSet(cls, options=None):
    return cls(_Anchor.UNANCHORED, options=options)

  @classmethod
  def MatchSet(cls, options=None):
    return cls(_Anchor.ANCHOR_START, options=options)

  @classmethod
  def FullMatchSet(cls, options=None):
    return cls(_Anchor.ANCHOR_BOTH, options=options)

  def Add(self, pattern):
    if isinstance(pattern, str):
      encoded_pattern = _encode(pattern)
      index = self._set.Add(encoded_pattern)
    else:
      index = self._set.Add(pattern)
    if index == -1:
      raise error('failed to add %r to Set' % pattern)
    return index

  def Compile(self):
    if not self._set.Compile():
      raise error('failed to compile Set')

  def Match(self, text):
    if isinstance(text, str):
      encoded_text = _encode(text)
      matches = self._set.Match(encoded_text)
    else:
      matches = self._set.Match(text)
    return matches or None
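
# A minimal sketch of the Set API above, kept in comments. Match() returns the
# indices of the patterns that matched, or None if none did (the exact sequence
# type comes from the underlying _re2 binding):
#
#   s = re2.Set.SearchSet()
#   s.Add('foo')         # expected: 0
#   s.Add('bar')         # expected: 1
#   s.Compile()
#   s.Match('barbecue')  # expected: indices containing 1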


class Filter(object):
  """A Pythonic wrapper around FilteredRE2."""

  __slots__ = ('_filter', '_patterns')

  def __init__(self):
    self._filter = _re2.Filter()
    self._patterns = []

  def Add(self, pattern, options=None):
    options = options or Options()
    if isinstance(pattern, str):
      encoded_pattern = _encode(pattern)
      index = self._filter.Add(encoded_pattern, options)
    else:
      index = self._filter.Add(pattern, options)
    if index == -1:
      raise error('failed to add %r to Filter' % pattern)
    self._patterns.append(pattern)
    return index

  def Compile(self):
    if not self._filter.Compile():
      raise error('failed to compile Filter')

  def Match(self, text, potential=False):
    if isinstance(text, str):
      encoded_text = _encode(text)
      matches = self._filter.Match(encoded_text, potential)
    else:
      matches = self._filter.Match(text, potential)
    return matches or None

  def re(self, index):
    if not 0 <= index < len(self._patterns):
      raise IndexError('bad index')
    proxy = object.__new__(_Regexp)
    proxy._pattern = self._patterns[index]
    proxy._regexp = self._filter.GetRE2(index)
    return proxy
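
# A minimal sketch of the Filter API above, kept in comments. With
# potential=True, Match() reports patterns that might match (a prefilter pass,
# so false positives are possible); re(index) recovers a _Regexp proxy that can
# confirm each candidate:
#
#   f = re2.Filter()
#   f.Add(r'\bfoo\b')    # expected: 0
#   f.Add(r'\bbar\b')    # expected: 1
#   f.Compile()
#   for i in f.Match('foo baz', potential=True) or ():
#     if f.re(i).search('foo baz'):
#       pass  # pattern i genuinely matches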