Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/boltons/strutils.py: 20%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2013, Mahmoud Hashemi
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are
5# met:
6#
7# * Redistributions of source code must retain the above copyright
8# notice, this list of conditions and the following disclaimer.
9#
10# * Redistributions in binary form must reproduce the above
11# copyright notice, this list of conditions and the following
12# disclaimer in the documentation and/or other materials provided
13# with the distribution.
14#
15# * The names of the contributors may not be used to endorse or
16# promote products derived from this software without specific
17# prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31"""So much practical programming involves string manipulation, which
32Python readily accommodates. Still, there are dozens of basic and
33common capabilities missing from the standard library, several of them
34provided by ``strutils``.
35"""
38import builtins
39import re
40import sys
41import uuid
42import zlib
43import string
44import unicodedata
45import collections
46from collections.abc import Mapping
47from gzip import GzipFile
48from html.parser import HTMLParser
49from html import entities as htmlentitydefs
50from io import BytesIO as StringIO
# Explicit public API of this module; names not listed here are internal.
__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list',
           'complement_int_list', 'int_ranges_from_int_list', 'MultiReplace',
           'multi_replace', 'unwrap_text']
# All punctuation and whitespace characters, used to build the splitter
# class below. NOTE: this relies on string.punctuation's ordering putting
# backslash immediately before ']' so the bracket is escaped in the class.
_punct_ws_str = string.punctuation + string.whitespace
# Matches one or more consecutive punctuation/whitespace characters.
_punct_re = re.compile('[' + _punct_ws_str + ']+')
# Finds camelCase word boundaries: a capital after a lowercase/digit, or a
# non-leading capital followed by a lowercase (handles acronym runs).
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
def camel2under(camel_string):
    """Converts a camelcased string to underscores. Useful for turning a
    class name into a function name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    # Insert an underscore at every detected word boundary, then lowercase.
    underscored = _camel2under_re.sub(r'_\1', camel_string)
    return underscored.lower()
78def under2camel(under_string):
79 """Converts an underscored string to camelcased. Useful for turning a
80 function name into a class name.
82 >>> under2camel('complex_tokenizer')
83 'ComplexTokenizer'
84 """
85 return ''.join(w.capitalize() or '_' for w in under_string.split('_'))
def slugify(text, delim='_', lower=True, ascii=False):
    """
    A basic function that turns text full of scary characters
    (i.e., punctuation and whitespace), into a relatively safe
    lowercased string separated only by the delimiter specified
    by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1    ')
    'first_post_hi_1'

    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True

    """
    if not text:
        ret = ''
    else:
        # A non-empty input that yields no words still produces *delim*.
        ret = delim.join(split_punct_ws(text)) or delim
    if ascii:
        ret = asciify(ret)
    if lower:
        ret = ret.lower()
    return ret
def split_punct_ws(text):
    """While :meth:`str.split` will split on whitespace,
    :func:`split_punct_ws` will split on punctuation and
    whitespace. This used internally by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1    ')
    ['First', 'post', 'Hi', '1']
    """
    # Splitting can yield empty chunks at the edges; drop them.
    return [chunk for chunk in _punct_re.split(text) if chunk]
def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Returns a plain-English description of an iterable's
    :func:`len()`, conditionally pluralized with :func:`cardinalize`,
    detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    noun = cardinalize(unit_noun, count)
    # A zero count reads as "No <plural>" rather than "0 <plural>".
    return f'{count} {noun}' if count else f'No {noun}'
# Ordinal suffix keyed by final digit; everything else (0, 4-9) gets 'th'.
_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default


def ordinalize(number, ext_only=False):
    """Turns *number* into its cardinal form, i.e., 1st, 2nd,
    3rd, 4th, etc. If the last character isn't a digit, it returns the
    string value unchanged.

    Args:
        number (int or str): Number to be cardinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr, ext = str(number), ''
    if numstr and numstr[-1] in string.digits:
        # Teens (11th, 12th, 13th, and e.g. 111th) always take 'th', so
        # check the second-to-last digit explicitly instead of relying on
        # an IndexError for single-digit inputs, as before.
        if len(numstr) > 1 and numstr[-2] == '1':
            ext = 'th'
        else:
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    if ext_only:
        return ext
    return numstr + ext
def cardinalize(unit_noun, count):
    """Conditionally pluralizes a singular word *unit_noun* if
    *count* is not one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    # Exactly one keeps the singular; everything else (incl. 0) pluralizes.
    return unit_noun if count == 1 else pluralize(unit_noun)
def singularize(word):
    """Semi-intelligently converts an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'

    """
    original = word
    word = word.strip().lower()
    # Already-singular irregulars (and empty input) pass through untouched.
    if not word or word in _IRR_S2P:
        return original

    singular = _IRR_P2S.get(word)
    if singular is None:
        if not word.endswith('s'):
            return original  # doesn't look plural at all
        if len(word) == 2:
            singular = word[:-1]  # or just return word?
        elif word.endswith('ies') and word[-4:-3] not in 'aeiou':
            # e.g. 'activities' -> 'activity'
            singular = word[:-3] + 'y'
        elif word.endswith('es') and word[-3] == 's':
            # e.g. 'glasses' -> 'glass'
            singular = word[:-2]
        else:
            singular = word[:-1]
    return _match_case(original, singular)
def pluralize(word):
    """Semi-intelligently converts an English *word* from singular form to
    plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    original = word
    word = word.strip().lower()
    # Already-plural irregulars (and empty input) pass through untouched.
    if not word or word in _IRR_P2S:
        return original

    plural = _IRR_S2P.get(word)
    if plural is None:
        if word.endswith('y') and word[-2:-1] not in 'aeiou':
            # consonant + y: 'enemy' -> 'enemies'
            plural = word[:-1] + 'ies'
        elif word[-1] == 's' or word.endswith('ch') or word.endswith('sh'):
            # sibilant endings take 'es' (unless already there)
            plural = word if word.endswith('es') else word + 'es'
        else:
            plural = word + 's'
    return _match_case(original, plural)
260def _match_case(master, disciple):
261 if not master.strip():
262 return disciple
263 if master.lower() == master:
264 return disciple.lower()
265 elif master.upper() == master:
266 return disciple.upper()
267 elif master.title() == master:
268 return disciple.title()
269 return disciple
272# Singular to plural map of irregular pluralizations
273_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
274 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
275 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
276 'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
277 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
278 'calf': 'calves', 'child': 'children', 'corps': 'corps',
279 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
280 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
281 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
282 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
283 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
284 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
285 'foot': 'feet', 'formula': 'formulae', 'formula': 'formulas',
286 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
287 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
288 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
289 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
290 'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
291 'matrix': 'matrices', 'means': 'means', 'medium': 'media',
292 'memorandum': 'memoranda', 'millennium': 'milennia', 'moose': 'moose',
293 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
294 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
295 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
296 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses',
297 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes',
298 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
299 'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep':
300 'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus':
301 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium':
302 'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses',
303 'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
304 'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth':
305 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto':
306 'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives',
307 'wolf': 'wolves', 'woman': 'women'}
310# Reverse index of the above
311_IRR_P2S = {v: k for k, v in _IRR_S2P.items()}
# A hashmark (ASCII '#' or full-width '＃') preceded by start-of-string or
# whitespace, capturing the following word characters.
HASHTAG_RE = re.compile(r"(?:^|\s)[##]{1}(\w+)", re.UNICODE)


def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """
    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    return HASHTAG_RE.findall(string)
def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    # Fewer than three characters leaves nothing to abbreviate.
    if len(string) < 3:
        return string
    # First char + count of interior chars + last char.
    return f'{string[0]}{len(string) - 2}{string[-1]}'
# Based on https://en.wikipedia.org/wiki/ANSI_escape_code#Escape_sequences
ANSI_SEQUENCES = re.compile(r'''
    \x1B            # Sequence starts with ESC, i.e. hex 0x1B
    (?:
        [@-Z\\-_]   # Second byte:
                    #   all 0x40-0x5F range but CSI char, i.e ASCII @A-Z\]^_
    |               # Or
        \[          # CSI sequences, starting with [
        [0-?]*      # Parameter bytes:
                    #   range 0x30-0x3F, ASCII 0-9:;<=>?
        [ -/]*      # Intermediate bytes:
                    #   range 0x20-0x2F, ASCII space and !"#$%&'()*+,-./
        [@-~]       # Final byte
                    #   range 0x40-0x7E, ASCII @A-Z[\]^_`a-z{|}~
    )
''', re.VERBOSE)


def strip_ansi(text):
    """Strips ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m')
    'art'

    Supports str, bytes and bytearray content as input. Returns the
    same type as the input.

    There's a lot of ANSI art available for testing on `sixteencolors.net`_.
    This function does not interpret or render ANSI art, but you can do so with
    `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py

    # The regex works on str, so decode byte input first and remember the
    # caller's type to convert the result back afterwards.
    original_type = None
    if isinstance(text, (bytes, bytearray)):
        original_type = type(text)
        text = text.decode('utf-8')

    stripped = ANSI_SEQUENCES.sub('', text)

    # Re-encode so the caller gets back the same type it passed in.
    if original_type is not None and not isinstance(stripped, original_type):
        stripped = original_type(stripped, 'utf-8')

    return stripped
def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str): The string to be asciified.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified string instead of replacing it.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            # Pure-ASCII input encodes directly.
            return text.encode('ascii')
        except UnicodeDecodeError:
            # this usually means you passed in a non-unicode string
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII characters: deaccent via DEACCENT_MAP, decompose with
        # NFKD, then encode, replacing (or ignoring) whatever remains.
        mode = 'ignore' if ignore else 'replace'
        transliterated = unicodedata.normalize('NFKD',
                                               text.translate(DEACCENT_MAP))
        return transliterated.encode('ascii', mode)
def is_ascii(text):
    """Check if a string or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, str):
        try:
            text.encode('ascii')
            return True
        except UnicodeEncodeError:
            return False
    if isinstance(text, bytes):
        try:
            text.decode('ascii')
            return True
        except UnicodeDecodeError:
            return False
    raise ValueError('expected text or bytes, not %r' % type(text))
class DeaccenterDict(dict):
    "A small caching dictionary for deaccenting."
    def __missing__(self, key):
        # *key* is an int codepoint, as used by str.translate. Missing
        # lookups are computed once, cached into the dict, and returned.
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            # e.g. decomposition('é') == '0065 0301' (base char + combiner)
            de = unicodedata.decomposition(chr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                # Combining diaeresis: no generic fallback here — dict.get
                # on a missing key yields None, which str.translate treats
                # as "delete this character". The common diaeresis chars
                # (ä/ö/ü etc.) are preseeded in _BASE_DEACCENT_MAP below,
                # so this branch only affects unseeded ones.
                ch = self.get(key)
            else:
                # Use the base character's codepoint from the decomposition.
                ch = int(p1, 16)
        except (IndexError, ValueError):
            # No decomposition available: map the key to itself (no-op).
            ch = self.get(key, key)
        self[key] = ch
        return ch
# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.
# Keys are int codepoints (str.translate convention); values are the
# replacement strings. German-style digraphs (Ae/Oe/Ue/ss) are used for
# the umlaut vowels and sharp s.
_BASE_DEACCENT_MAP = {
    0xc6: "AE", # Æ LATIN CAPITAL LETTER AE
    0xd0: "D",  # Ð LATIN CAPITAL LETTER ETH
    0xd8: "OE", # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: "Th", # Þ LATIN CAPITAL LETTER THORN
    0xc4: 'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: 'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: 'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: "A",  # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: "A",  # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: "A",  # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: "C",  # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: "E",  # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: "E",  # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: "E",  # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: "I",  # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: "I",  # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: "O",  # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: "O",  # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: "O",  # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: "U",  # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: "U",  # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: "ss", # ß LATIN SMALL LETTER SHARP S
    0xe6: "ae", # æ LATIN SMALL LETTER AE
    0xf0: "d",  # ð LATIN SMALL LETTER ETH
    0xf8: "oe", # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: "th", # þ LATIN SMALL LETTER THORN,
    0xe4: 'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: 'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: 'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: "a",  # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: "a",  # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: "a",  # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: "c",  # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: "e",  # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: "e",  # é LATIN SMALL LETTER E WITH ACUTE
    0xea: "e",  # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: "i",  # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: "i",  # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: "o",  # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: "o",  # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: "o",  # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: "u",  # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: "u",  # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: "'", # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: "'", # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: '"', # “ LEFT DOUBLE QUOTATION MARK
    0x201d: '"', # ” RIGHT DOUBLE QUOTATION MARK
    }


# Wraps the base map so additional decompositions are computed lazily
# and cached on first lookup (see DeaccenterDict.__missing__ above).
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)
545_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
546_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
547_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))
550def bytes2human(nbytes, ndigits=0):
551 """Turns an integer value of *nbytes* into a human readable format. Set
552 *ndigits* to control how many digits after the decimal point
553 should be shown (default ``0``).
555 >>> bytes2human(128991)
556 '126K'
557 >>> bytes2human(100001221)
558 '95M'
559 >>> bytes2human(0, 2)
560 '0.00B'
561 """
562 abs_bytes = abs(nbytes)
563 for (size, symbol), (next_size, next_symbol) in _SIZE_RANGES:
564 if abs_bytes <= next_size:
565 break
566 hnbytes = float(nbytes) / size
567 return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes,
568 ndigits=ndigits,
569 symbol=symbol)
class HTMLTextExtractor(HTMLParser):
    # Accumulates the text content of an HTML document into self.result,
    # resolving character/entity references; used by html2text() below.
    def __init__(self):
        # NOTE(review): deliberately skips HTMLParser.__init__ and calls
        # reset() directly, setting strict/convert_charrefs by hand —
        # presumably to stay compatible across HTMLParser versions; confirm
        # before changing.
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.result = []

    def handle_data(self, d):
        # Plain text between tags.
        self.result.append(d)

    def handle_charref(self, number):
        # Numeric character reference, decimal (&#916;) or hex (&#x394;).
        if number[0] == 'x' or number[0] == 'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(chr(codepoint))

    def handle_entityref(self, name):
        # Named entity, e.g. &amp;. Unknown names are passed through
        # verbatim rather than raising.
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append('&' + name + ';')
        else:
            self.result.append(chr(codepoint))

    def get_text(self):
        # Join all accumulated fragments into the final text.
        return ''.join(self.result)
def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like "&nbsp;"

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394ημώ)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
615_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
616_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'
619def gunzip_bytes(bytestring):
620 """The :mod:`gzip` module is great if you have a file or file-like
621 object, but what if you just have bytes. StringIO is one
622 possibility, but it's often faster, easier, and simpler to just
623 use this one-liner. Use this tried-and-true utility function to
624 decompress gzip from bytes.
626 >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
627 True
628 >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
629 True
630 """
631 return zlib.decompress(bytestring, 16 + zlib.MAX_WBITS)
def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
            speed/compression. 1 is fastest, least compressed, 9 is
            slowest, but most compressed.

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    buf = StringIO()
    # Closing the GzipFile flushes the gzip trailer into the buffer.
    with GzipFile(fileobj=buf, mode='wb', compresslevel=level) as gz:
        gz.write(bytestring)
    return buf.getvalue()
# All standard line terminators: CRLF, LF, VT, FF, CR, NEL (U+0085),
# LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029).
# Bugfix: the last two were previously written r'\x2028'/r'\x2029', which
# the re module parses as '\x20' (a space) followed by literal '28'/'29',
# wrongly splitting text on " 28" / " 29". \u escapes are correct.
_line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\u2028|\u2029)',
                             re.UNICODE)


def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end, len_text = 0, len(text)
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        # Yield the segment between the previous terminator and this one
        # (possibly empty, for consecutive terminators).
        if prev_end <= start:
            yield text[prev_end:start]
        # A terminator at the very end contributes one trailing empty line.
        if end == len_text:
            yield ''
        prev_end = end
    # Any unterminated final segment.
    tail = text[prev_end:]
    if tail:
        yield tail
    return
def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
          indent it. Default: :class:`bool`, to ensure that empty lines do
          not get whitespace added.
    """
    out = []
    for line in iter_splitlines(text):
        out.append(margin + line if key(line) else line)
    return newline.join(out)
def is_uuid(obj, version=4):
    """Check the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    if not isinstance(obj, uuid.UUID):
        try:
            obj = uuid.UUID(obj)
        except (TypeError, ValueError, AttributeError):
            # Not a parseable UUID string (or not a string at all).
            return False
    # version=0 (or other falsy) skips the version check entirely.
    return not version or obj.version == int(version)
def escape_shell_args(args, sep=' ', style=None):
    """Returns an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
          ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
          respectively. If *style* is ``None``, then it is picked
          according to the system platform.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    # Default the style from the current platform when not given.
    chosen = style or ('cmd' if sys.platform == 'win32' else 'sh')
    if chosen == 'sh':
        return args2sh(args, sep=sep)
    if chosen == 'cmd':
        return args2cmd(args, sep=sep)
    raise ValueError("style expected one of 'cmd' or 'sh', not %r" % chosen)
# Finds the first character that is NOT shell-safe; safe args need no quoting.
_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search


def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    Args:
        args (list): A list of argument strings to escape.
        sep (str): Separator joining the escaped arguments (default ``' '``).

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.
    """
    ret_list = []

    for arg in args:
        if not arg:
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            # Entirely safe characters: no quoting needed.
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # Bugfix: previously joined with a hard-coded ' ', silently ignoring
    # the *sep* parameter.
    return sep.join(ret_list)
def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    Args:
        args (list): A list of argument strings to escape.
        sep (str): Separator joining the escaped arguments (default ``' '``).

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.
    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within. A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash. If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        bs_buf = []

        # Separate this argument from the previous one. Bugfix: this
        # previously appended a literal ' ' and ignored *sep*.
        if result:
            result.append(sep)

        # Arguments with whitespace, and empty arguments, must be quoted.
        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double preceding backslashes, then escape the quote.
                result.append('\\' * len(bs_buf)*2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char: pending backslashes are literal.
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            # Trailing backslashes must be doubled before the closing quote
            # (rule 5 above).
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)
def parse_int_list(range_string, delim=',', range_delim='-'):
    """Returns a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]

    """
    ints = []
    for token in range_string.strip().split(delim):
        if not token:
            # skip empty tokens, e.g. from a trailing delimiter
            continue
        if range_delim in token:
            # Contiguous range: expand inclusively, tolerating reversed
            # bounds like '8-5'.
            bounds = [int(part) for part in token.split(range_delim)]
            ints.extend(range(min(bounds), max(bounds) + 1))
        else:
            ints.append(int(token))
    return sorted(ints)
def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Returns a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
           into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
           contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
           range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
           space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'

    """
    output = []
    # contig_range holds the run of contiguous values currently being built.
    contig_range = collections.deque()

    for x in sorted(int_list):

        # Handle current (and first) value.
        if len(contig_range) < 1:
            contig_range.append(x)

        # Handle current value, given multiple previous values are contiguous.
        elif len(contig_range) > 1:
            delta = x - contig_range[-1]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                range_substr = '{:d}{}{:d}'.format(min(contig_range),
                                                   range_delim,
                                                   max(contig_range))
                output.append(range_substr)
                contig_range.clear()
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

        # Handle current value, given no previous contiguous integers
        else:
            delta = x - contig_range[0]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                output.append(f'{contig_range.popleft():d}')
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

    # Handle the last value. NOTE: this is a for/else — the body always
    # runs after the loop (there is no break), flushing whatever remains
    # in contig_range.
    else:

        # Last value is non-contiguous.
        if len(contig_range) == 1:
            output.append(f'{contig_range.popleft():d}')
            contig_range.clear()

        # Last value is part of contiguous range.
        elif len(contig_range) > 1:
            range_substr = '{:d}{}{:d}'.format(min(contig_range),
                                               range_delim,
                                               max(contig_range))
            output.append(range_substr)
            contig_range.clear()

    if delim_space:
        output_str = (delim+' ').join(output)
    else:
        output_str = delim.join(output)

    return output_str
def complement_int_list(
        range_string, range_start=0, range_end=None,
        delim=',', range_delim='-'):
    """ Returns range string that is the complement of the one provided as
    *range_string* parameter.

    These range strings are of the kind produce by :func:`format_int_list`, and
    parseable by :func:`parse_int_list`.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        range_start (int): A positive integer from which to start the resulting
            range. Value is inclusive. Defaults to ``0``.
        range_end (int): A positive integer from which the produced range is
            stopped. Value is exclusive. Defaults to the maximum value found in
            the provided ``range_string``.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> complement_int_list('1,3,5-8,10-11,15')
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=0)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=1)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=2)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=3)
    '4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=15)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=14)
    '0,2,4,9,12-13'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=13)
    '0,2,4,9,12'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=20)
    '0,2,4,9,12-14,16-19'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=0)
    ''

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=-1)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=-1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=1, range_end=1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=2)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=3)
    '2'

    >>> complement_int_list('1,3,5-8', range_start=-10, range_end=-5)
    ''

    >>> complement_int_list('1,3,5-8', range_start=20, range_end=10)
    ''

    >>> complement_int_list('')
    ''
    """
    covered = set(parse_int_list(range_string, delim, range_delim))

    # Default the exclusive upper bound to just past the largest listed
    # value; an empty input yields an empty complement.
    if range_end is None:
        range_end = max(covered) + 1 if covered else range_start

    # Only non-negative integers are ever produced, so a negative
    # range_start is clamped to 0 (mirrors the empty range(range_start)).
    lower_bound = max(range_start, 0)
    missing = {i for i in range(lower_bound, range_end) if i not in covered}
    return format_int_list(missing, delim, range_delim)
def int_ranges_from_int_list(range_string, delim=',', range_delim='-'):
    """ Transform a string of ranges (*range_string*) into a tuple of tuples.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> int_ranges_from_int_list('1,3,5-8,10-11,15')
    ((1, 1), (3, 3), (5, 8), (10, 11), (15, 15))

    >>> int_ranges_from_int_list('1')
    ((1, 1),)

    >>> int_ranges_from_int_list('')
    ()
    """
    # Round-trip through parse/format to get a canonical ','/'-' delimited
    # string regardless of the delimiters the caller used.
    normalized = format_int_list(
        parse_int_list(range_string, delim, range_delim))
    if not normalized:
        return ()

    bounds = []
    for chunk in normalized.split(','):
        # 'lo-hi' splits into both ends; a lone integer has no '-' and
        # partition leaves hi empty, so it serves as both ends.
        lo, _, hi = chunk.partition('-')
        bounds.append((int(lo), int(hi or lo)))
    return tuple(bounds)
class MultiReplace:
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced it allows for all of the matching
    values to be replaced in a single pass which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken')
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order will be non-deterministic::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention and
    if you need to rely on order then consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed.

        Args:
            sub_map: A mapping or iterable of ``(pattern, replacement)``
                pairs. Patterns may be strings or pre-compiled regex
                objects (whose ``.pattern`` attribute is used).
        """
        options = {
            'regex': False,
            'flags': 0,
        }
        options.update(kwargs)
        self.group_map = {}
        regex_values = []

        if isinstance(sub_map, Mapping):
            sub_map = sub_map.items()

        for idx, vals in enumerate(sub_map):
            # Each pattern is wrapped in a uniquely named group so the
            # matched alternative can be identified at substitution time.
            group_name = f'group{idx}'
            if isinstance(vals[0], str):
                # If we're not treating input strings like a regex, escape it
                if not options['regex']:
                    exp = re.escape(vals[0])
                else:
                    exp = vals[0]
            else:
                exp = vals[0].pattern

            regex_values.append(f'(?P<{group_name}>{exp})')
            self.group_map[group_name] = vals[1]

        # All patterns are joined into one alternation; exactly one named
        # group participates in any given match.
        self.combined_pattern = re.compile(
            '|'.join(regex_values),
            flags=options['flags']
        )

    def _get_value(self, match):
        """Given a match object find replacement value."""
        group_dict = match.groupdict()
        # Use an identity check against None rather than truthiness: a
        # regex pattern may legitimately match the empty string, in which
        # case the matched group's value is '' (falsy) and a truthiness
        # test would find no group at all, raising IndexError.
        key = [x for x in group_dict if group_dict[x] is not None][0]
        return self.group_map[key]

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.
        """
        return self.combined_pattern.sub(self._get_value, text)
def multi_replace(text, sub_map, **kwargs):
    """
    Shortcut function to invoke MultiReplace in a single call.

    Example Usage::

        from boltons.stringutils import multi_replace
        new = multi_replace(
            'The foo bar cat ate a bat',
            {'foo': 'zoo', 'cat': 'hat', 'bat': 'kraken'}
        )
        new == 'The zoo bar hat ate a kraken'
    """
    # One-shot convenience wrapper: build the replacer and apply it.
    return MultiReplace(sub_map, **kwargs).sub(text)
def unwrap_text(text, ending='\n\n'):
    r"""
    Unwrap text, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
            by. Pass ``None`` to get the list. Defaults to '\n\n' for
            compatibility with Markdown and RST.

    """
    paragraphs, current = [], []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # A blank line ends the current paragraph (even if it's empty,
            # so consecutive blank lines are preserved in the output).
            paragraphs.append(' '.join(current))
            current = []
        else:
            current.append(stripped)
    # Flush a trailing paragraph that wasn't terminated by a blank line.
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs if ending is None else ending.join(paragraphs)