Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/boltons/strutils.py: 22%
405 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:13 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:13 +0000
1# -*- coding: utf-8 -*-
3# Copyright (c) 2013, Mahmoud Hashemi
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9# * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#
12# * Redistributions in binary form must reproduce the above
13# copyright notice, this list of conditions and the following
14# disclaimer in the documentation and/or other materials provided
15# with the distribution.
16#
17# * The names of the contributors may not be used to endorse or
18# promote products derived from this software without specific
19# prior written permission.
20#
21# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33"""So much practical programming involves string manipulation, which
34Python readily accommodates. Still, there are dozens of basic and
35common capabilities missing from the standard library, several of them
36provided by ``strutils``.
37"""
39from __future__ import print_function
41import re
42import sys
43import uuid
44import zlib
45import string
46import unicodedata
47import collections
48from gzip import GzipFile
50try:
51 from cStringIO import cStringIO as StringIO
52except ImportError:
53 from io import BytesIO as StringIO
55try:
56 from collections.abc import Mapping
57except ImportError:
58 from collections import Mapping
60try:
61 unicode, str, bytes, basestring = unicode, str, str, basestring
62 from HTMLParser import HTMLParser
63 import htmlentitydefs
64except NameError: # basestring not defined in Python 3
65 unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes)
66 unichr = chr
67 from html.parser import HTMLParser
68 from html import entities as htmlentitydefs
70try:
71 import __builtin__ as builtins
72except ImportError:
73 import builtins
# Public names exported by `from boltons.strutils import *`.
# NOTE(review): 'int_list_complement' and 'int_list_to_int_tuples' do not
# match any definition visible in this file (cf. `complement_int_list`,
# defined below) -- confirm these names actually resolve at import time,
# otherwise star-imports will raise AttributeError.
__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list',
           'int_list_complement', 'int_list_to_int_tuples', 'MultiReplace',
           'multi_replace', 'unwrap_text']
# Characters treated as word separators by slugify/split_punct_ws below.
_punct_ws_str = string.punctuation + string.whitespace
_punct_re = re.compile('[' + _punct_ws_str + ']+')
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')


def camel2under(camel_string):
    """Converts a camelcased string to underscores. Useful for turning a
    class name into a function name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    # insert an underscore before each case transition, then lowercase
    underscored = _camel2under_re.sub(r'_\1', camel_string)
    return underscored.lower()
def under2camel(under_string):
    """Converts an underscored string to camelcased. Useful for turning a
    function name into a class name.

    >>> under2camel('complex_tokenizer')
    'ComplexTokenizer'
    """
    # empty segments (from leading/doubled underscores) are kept as '_'
    parts = under_string.split('_')
    return ''.join([p.capitalize() or '_' for p in parts])
def slugify(text, delim='_', lower=True, ascii=False):
    """
    A basic function that turns text full of scary characters
    (i.e., punctuation and whitespace), into a relatively safe
    lowercased string separated only by the delimiter specified
    by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1    ')
    'first_post_hi_1'
    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True

    """
    if text:
        # all-punctuation input would join to '', so fall back to delim
        ret = delim.join(split_punct_ws(text)) or delim
    else:
        ret = ''
    if ascii:
        ret = asciify(ret)
    if lower:
        ret = ret.lower()
    return ret
def split_punct_ws(text):
    """While :meth:`str.split` will split on whitespace,
    :func:`split_punct_ws` will split on punctuation and
    whitespace. This is used internally by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1    ')
    ['First', 'post', 'Hi', '1']
    """
    # filter(None, ...) discards the empty strings left by leading,
    # trailing, or adjacent separator runs
    return list(filter(None, _punct_re.split(text)))
def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Returns a plain-English description of an iterable's
    :func:`len()`, conditionally pluralized with :func:`cardinalize`,
    detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    noun = cardinalize(unit_noun, count)
    # zero reads as "No <plural>" rather than "0 <plural>"
    quantity = u'%s' % (count,) if count else u'No'
    return u'%s %s' % (quantity, noun)
# Ordinal suffixes for final digits 1, 2, 3; every other digit (and the
# teens, handled separately in ordinalize) takes 'th'
_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default
def ordinalize(number, ext_only=False):
    """Turns *number* into its ordinal form, i.e., 1st, 2nd,
    3rd, 4th, etc. If the last character isn't a digit, it returns the
    string value unchanged.

    Args:
        number (int or str): Number to be ordinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr = unicode(number)
    ext = ''
    if numstr and numstr[-1] in string.digits:
        # the teens (11th, 12th, 13th, 1512th, ...) always take 'th';
        # the slice is '' for single-digit input, which safely skips this
        if numstr[-2:-1] == '1':
            ext = 'th'
        else:
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    if ext_only:
        return ext
    return numstr + ext
def cardinalize(unit_noun, count):
    """Conditionally pluralizes a singular word *unit_noun* if
    *count* is not one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    # note: 0 and negatives pluralize, matching English usage
    return unit_noun if count == 1 else pluralize(unit_noun)
def singularize(word):
    """Semi-intelligently converts an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'

    """
    # normalize for lookups, but keep the original for case restoration
    orig_word, word = word, word.strip().lower()
    if not word or word in _IRR_S2P:
        # empty input, or already a known singular -- return untouched
        return orig_word

    irr_singular = _IRR_P2S.get(word)
    if irr_singular:
        singular = irr_singular
    elif not word.endswith('s'):
        # doesn't look plural at all
        return orig_word
    elif len(word) == 2:
        singular = word[:-1]  # or just return word?
    elif word.endswith('ies') and word[-4:-3] not in 'aeiou':
        # consonant + 'ies' -> 'y' (e.g., 'activities' -> 'activity').
        # Note word[-4:-3] is '' for 3-letter words, and '' is "in" every
        # string, so those fall through to the generic strip-'s' branch.
        singular = word[:-3] + 'y'
    elif word.endswith('es') and word[-3] == 's':
        # 'ss' + 'es' endings: 'glasses' -> 'glass'
        singular = word[:-2]
    else:
        singular = word[:-1]
    return _match_case(orig_word, singular)
def pluralize(word):
    """Semi-intelligently converts an English *word* from singular form to
    plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    orig_word = word
    word = word.strip().lower()
    # empty input, or already a known plural: pass through untouched
    if not word or word in _IRR_P2S:
        return orig_word
    plural = _IRR_S2P.get(word)
    if not plural:
        if word.endswith('y') and word[-2:-1] not in 'aeiou':
            # consonant + 'y' -> 'ies'
            plural = word[:-1] + 'ies'
        elif word.endswith(('s', 'ch', 'sh')):
            # sibilant endings take 'es' (unless they already have it)
            plural = word if word.endswith('es') else word + 'es'
        else:
            plural = word + 's'
    return _match_case(orig_word, plural)
282def _match_case(master, disciple):
283 if not master.strip():
284 return disciple
285 if master.lower() == master:
286 return disciple.lower()
287 elif master.upper() == master:
288 return disciple.upper()
289 elif master.title() == master:
290 return disciple.title()
291 return disciple
294# Singular to plural map of irregular pluralizations
295_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
296 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
297 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
298 'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
299 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
300 'calf': 'calves', 'child': 'children', 'corps': 'corps',
301 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
302 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
303 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
304 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
305 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
306 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
307 'foot': 'feet', 'formula': 'formulae', 'formula': 'formulas',
308 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
309 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
310 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
311 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
312 'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
313 'matrix': 'matrices', 'means': 'means', 'medium': 'media',
314 'memorandum': 'memoranda', 'millennium': 'milennia', 'moose': 'moose',
315 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
316 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
317 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
318 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses',
319 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes',
320 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
321 'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep':
322 'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus':
323 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium':
324 'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses',
325 'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
326 'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth':
327 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto':
328 'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives',
329 'wolf': 'wolves', 'woman': 'women'}
# Reverse index of the above: plural form -> singular form
_IRR_P2S = dict((plural, singular) for singular, plural in _IRR_S2P.items())

# A hashmark (ASCII '#' or full-width '＃') at start-of-string or after
# whitespace, capturing the tag's word characters
HASHTAG_RE = re.compile(r"(?:^|\s)[##]{1}(\w+)", re.UNICODE)
def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """
    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    tags = HASHTAG_RE.findall(string)
    return tags
def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    # too short to abbreviate
    if len(string) < 3:
        return string
    interior_len = len(string) - 2
    return '%s%s%s' % (string[0], interior_len, string[-1])
# Based on https://en.wikipedia.org/wiki/ANSI_escape_code#Escape_sequences
# Matches one complete ANSI escape sequence; consumed by strip_ansi() below.
ANSI_SEQUENCES = re.compile(r'''
    \x1B             # Sequence starts with ESC, i.e. hex 0x1B
    (?:
        [@-Z\\-_]    # Second byte:
                     #   all 0x40-0x5F range but CSI char, i.e ASCII @A-Z\]^_
    |                # Or
        \[           # CSI sequences, starting with [
        [0-?]*       # Parameter bytes:
                     #   range 0x30-0x3F, ASCII 0-9:;<=>?
        [ -/]*       # Intermediate bytes:
                     #   range 0x20-0x2F, ASCII space and !"#$%&'()*+,-./
        [@-~]        # Final byte
                     #   range 0x40-0x7E, ASCII @A-Z[\]^_`a-z{|}~
    )
''', re.VERBOSE)
def strip_ansi(text):
    """Strips ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m')
    'art'

    Supports unicode, str, bytes and bytearray content as input. Returns the
    same type as the input.

    There's a lot of ANSI art available for testing on `sixteencolors.net`_.
    This function does not interpret or render ANSI art, but you can do so with
    `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py
    # Binary input is decoded so the (text) regex can run; remember the
    # original type so it can be restored on the way out.
    target_type = None
    # on Python 3 the module-level `unicode` alias points at builtins.str
    if (unicode == builtins.str) and isinstance(text, (bytes, bytearray)):
        target_type = type(text)
        text = text.decode('utf-8')

    cleaned = ANSI_SEQUENCES.sub('', text)

    # re-encode back to the bytes/bytearray type the caller passed in
    if target_type and target_type != type(cleaned):
        cleaned = target_type(cleaned, 'utf-8')
    return cleaned
def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str or unicode): The string to be asciified.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified unicode instead of replacing it.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            # fast path: already pure ascii
            return text.encode('ascii')
        except UnicodeDecodeError:
            # this usually means you passed in a non-unicode string
            # NOTE(review): this branch looks py2-specific; on py3, bytes
            # input would raise AttributeError on .encode -- confirm
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        # non-ascii present: deaccent via DEACCENT_MAP, then NFKD-decompose
        # so remaining combining marks get dropped/replaced by the encode
        mode = 'replace'
        if ignore:
            mode = 'ignore'
        transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP))
        ret = transd.encode('ascii', mode)
        return ret
def is_ascii(text):
    """Check if a unicode or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str or unicode): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, unicode):
        try:
            text.encode('ascii')
            return True
        except UnicodeEncodeError:
            return False
    if isinstance(text, bytes):
        try:
            text.decode('ascii')
            return True
        except UnicodeDecodeError:
            return False
    raise ValueError('expected text or bytes, not %r' % type(text))
class DeaccenterDict(dict):
    """A small caching dictionary for deaccenting.

    Maps an int codepoint to a replacement (a codepoint int, a string,
    or ``None`` to delete), suitable for :meth:`unicode.translate`.
    Unknown keys are computed lazily from Unicode decomposition data and
    cached in the dict itself.
    """
    def __missing__(self, key):
        # an explicitly seeded value (see _BASE_DEACCENT_MAP) wins
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            # e.g. u'é' decomposes to '0065 0301' (base char + combining)
            de = unicodedata.decomposition(unichr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                # combining diaeresis: deliberately NOT reduced to the bare
                # base letter; seeded umlaut entries handle the common cases.
                # NOTE(review): this leaves ch as None, which translate()
                # treats as "delete this char" -- confirm that's intended.
                ch = self.get(key)
            else:
                # use the base character's codepoint
                ch = int(p1, 16)
        except (IndexError, ValueError):
            # no decomposition data (or a non-hex '<compat>' tag):
            # leave the character as-is
            ch = self.get(key, key)
        self[key] = ch  # cache the computed mapping for next time
        return ch

    # defaultdict's availability doubles as a probe for __missing__
    # support; on ancient Pythons without it, emulate via __getitem__.
    try:
        from collections import defaultdict
    except ImportError:
        # no defaultdict means that __missing__ isn't supported in
        # this version of python, so we define __getitem__
        def __getitem__(self, key):
            try:
                return super(DeaccenterDict, self).__getitem__(key)
            except KeyError:
                return self.__missing__(key)
    else:
        del defaultdict
# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.
# Keys are int codepoints; values are replacement strings. Entries here
# take precedence over the decomposition-based fallback in DeaccenterDict.
_BASE_DEACCENT_MAP = {
    0xc6: u"AE", # Æ LATIN CAPITAL LETTER AE
    0xd0: u"D", # Ð LATIN CAPITAL LETTER ETH
    0xd8: u"OE", # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th", # Þ LATIN CAPITAL LETTER THORN
    0xc4: u'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: u'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: u'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: u"A", # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: u"A", # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: u"A", # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: u"C", # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: u"E", # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: u"E", # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: u"E", # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: u"I", # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: u"I", # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: u"O", # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: u"O", # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: u"O", # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: u"U", # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: u"U", # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: u"ss", # ß LATIN SMALL LETTER SHARP S
    0xe6: u"ae", # æ LATIN SMALL LETTER AE
    0xf0: u"d", # ð LATIN SMALL LETTER ETH
    0xf8: u"oe", # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th", # þ LATIN SMALL LETTER THORN,
    0xe4: u'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: u'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: u'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: u"a", # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: u"a", # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: u"a", # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: u"c", # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: u"e", # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: u"e", # é LATIN SMALL LETTER E WITH ACUTE
    0xea: u"e", # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: u"i", # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: u"i", # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: u"o", # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: u"o", # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: u"o", # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: u"u", # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: u"u", # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: u"'", # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: u"'", # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: u'"', # “ LEFT DOUBLE QUOTATION MARK
    0x201d: u'"', # ” RIGHT DOUBLE QUOTATION MARK
    }

# The live, lazily-growing map used by asciify()'s translate() call
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)
# Human-readable size unit symbols, ascending powers of 1024
_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
# (1024**i, symbol) pairs: (1, 'B'), (1024, 'K'), (1048576, 'M'), ...
_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
# adjacent (lower, upper) bound pairs; bytes2human() scans these to pick
# the bucket a value falls into
_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))
def bytes2human(nbytes, ndigits=0):
    """Turns an integer value of *nbytes* into a human readable format. Set
    *ndigits* to control how many digits after the decimal point
    should be shown (default ``0``).

    >>> bytes2human(128991)
    '126K'
    >>> bytes2human(100001221)
    '95M'
    >>> bytes2human(0, 2)
    '0.00B'
    """
    abs_bytes = abs(nbytes)
    # first bucket whose upper bound holds the value; values beyond all
    # bounds fall back to the last range's lower bucket
    chosen = next((head for head, (upper, _sym) in _SIZE_RANGES
                   if abs_bytes <= upper),
                  _SIZE_RANGES[-1][0])
    size, symbol = chosen
    hnbytes = float(nbytes) / size
    return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes,
                                                  ndigits=ndigits,
                                                  symbol=symbol)
class HTMLTextExtractor(HTMLParser):
    # Accumulates character data and resolved entity/char references
    # while discarding all markup; used by html2text() below.
    def __init__(self):
        # NOTE(review): HTMLParser.__init__ is deliberately not called;
        # reset() plus the attributes below stand in for it -- confirm
        # against both the py2 and py3 HTMLParser implementations.
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.result = []

    def handle_data(self, d):
        # plain text between tags
        self.result.append(d)

    def handle_charref(self, number):
        # numeric character reference, decimal (&#916;) or hex (&#x394;)
        if number[0] == u'x' or number[0] == u'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        # named entity, e.g. &amp;; unknown names pass through verbatim
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append(u'&' + name + u';')
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        # all collected fragments, concatenated in document order
        return u''.join(self.result)
def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like "&nbsp;"

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394\u03b7\u03bc\u03ce)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
# Canned gzip streams (with embedded filename headers) used by the
# gunzip_bytes doctests below
_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'
def gunzip_bytes(bytestring):
    """The :mod:`gzip` module is great if you have a file or file-like
    object, but what if you just have bytes. StringIO is one
    possibility, but it's often faster, easier, and simpler to just
    use this one-liner. Use this tried-and-true utility function to
    decompress gzip from bytes.

    >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
    True
    >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
    True
    """
    # a wbits of MAX_WBITS + 16 tells zlib to expect a gzip container
    gzip_wbits = 16 + zlib.MAX_WBITS
    return zlib.decompress(bytestring, gzip_wbits)
def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
            speed/compression. 1 is fastest, least compressed, 9 is
            slowest, but most compressed.

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    buf = StringIO()  # BytesIO on py3; see the module-level import shim
    gzf = GzipFile(fileobj=buf, mode='wb', compresslevel=level)
    gzf.write(bytestring)
    gzf.close()  # close flushes the gzip trailer into the buffer
    return buf.getvalue()
693_line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\x2028|\x2029)',
694 re.UNICODE)
def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end, len_text = 0, len(text)
    # print('last: %r' % last_idx)
    # start, end = None, None
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        # print(start, end)
        # yield the text between the previous line ending and this one
        # (adjacent endings yield an empty line)
        if prev_end <= start:
            yield text[prev_end:start]
        # unlike str.splitlines, a line ending at the very end of the
        # text yields one final empty line (see doctests above)
        if end == len_text:
            yield ''
        prev_end = end
    # whatever trails the last line ending, if anything
    tail = text[prev_end:]
    if tail:
        yield tail
    return
def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
            indent it. Default: :class:`bool`, to ensure that empty lines do
            not get whitespace added.
    """
    out_lines = []
    for line in iter_splitlines(text):
        out_lines.append(margin + line if key(line) else line)
    return newline.join(out_lines)
def is_uuid(obj, version=4):
    """Check the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    if not isinstance(obj, uuid.UUID):
        try:
            obj = uuid.UUID(obj)
        except (TypeError, ValueError, AttributeError):
            # unparseable input of any kind is simply "not a uuid"
            return False
    # a falsy *version* (e.g. 0) skips the version check entirely
    return not version or obj.version == int(version)
def escape_shell_args(args, sep=' ', style=None):
    """Returns an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
            ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
            respectively. If *style* is ``None``, then it is picked
            according to the system platform.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    if not style:
        # default to the convention of the current platform
        style = 'cmd' if sys.platform == 'win32' else 'sh'

    if style == 'sh':
        return args2sh(args, sep=sep)
    if style == 'cmd':
        return args2cmd(args, sep=sep)
    raise ValueError("style expected one of 'cmd' or 'sh', not %r" % style)
# Any character outside this safe set forces an argument to be quoted
_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search


def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.

    Args:
        args (list): Strings to escape.
        sep (str): Separator joined between the escaped arguments.
    """
    ret_list = []

    for arg in args:
        if not arg:
            # empty arguments must still be visible, as ''
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            # nothing needing quotes
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # bugfix: previously hardcoded ' ', silently ignoring the *sep* argument
    return sep.join(ret_list)
def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.

    Args:
        args (list): Strings to escape.
        sep (str): Separator joined between the escaped arguments.
    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within. A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash. If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        bs_buf = []  # run of backslashes pending classification

        # Add the separator before every argument but the first
        # (bugfix: previously hardcoded ' ', ignoring the *sep* argument)
        if result:
            result.append(sep)

        # whitespace (or emptiness) forces the whole argument into quotes
        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double backslashes.
                result.append('\\' * len(bs_buf)*2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            # double trailing backslashes so the closing quote survives
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)
def parse_int_list(range_string, delim=',', range_delim='-'):
    """Returns a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]

    """
    nums = []
    for chunk in range_string.strip().split(delim):
        if not chunk:
            # skip empty tokens from stray/trailing delimiters
            continue
        if range_delim in chunk:
            # a contiguous range; bounds may appear in either order
            bounds = [int(b) for b in chunk.split(range_delim)]
            nums.extend(range(min(bounds), max(bounds) + 1))
        else:
            nums.append(int(chunk))
    return sorted(nums)
def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Returns a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
           into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
           contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
           range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
           space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'

    """
    output = []
    # holds the run of contiguous values currently being accumulated
    contig_range = collections.deque()

    for x in sorted(int_list):

        # Handle current (and first) value.
        if len(contig_range) < 1:
            contig_range.append(x)

        # Handle current value, given multiple previous values are contiguous.
        elif len(contig_range) > 1:
            delta = x - contig_range[-1]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                # flush the completed run as 'min-max'
                range_substr = '{0:d}{1}{2:d}'.format(min(contig_range),
                                                      range_delim,
                                                      max(contig_range))
                output.append(range_substr)
                contig_range.clear()
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

        # Handle current value, given no previous contiguous integers
        else:
            delta = x - contig_range[0]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                # flush the lone value as a plain integer
                output.append('{0:d}'.format(contig_range.popleft()))
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

    # Handle the last value.
    # (for/else: this runs once the loop finishes -- there is no break
    # above -- flushing whatever remains in contig_range)
    else:

        # Last value is non-contiguous.
        if len(contig_range) == 1:
            output.append('{0:d}'.format(contig_range.popleft()))
            contig_range.clear()

        # Last value is part of contiguous range.
        elif len(contig_range) > 1:
            range_substr = '{0:d}{1}{2:d}'.format(min(contig_range),
                                                  range_delim,
                                                  max(contig_range))
            output.append(range_substr)
            contig_range.clear()

    if delim_space:
        output_str = (delim+' ').join(output)
    else:
        output_str = delim.join(output)

    return output_str
def complement_int_list(
        range_string, range_start=0, range_end=None,
        delim=',', range_delim='-'):
    """ Returns range string that is the complement of the one provided as
    *range_string* parameter.

    These range strings are of the kind produce by :func:`format_int_list`, and
    parseable by :func:`parse_int_list`.

    Args:
        range_string (str): String of comma separated positive integers or
           ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
           used in printer dialogs.
        range_start (int): A positive integer from which to start the resulting
           range. Value is inclusive. Defaults to ``0``.
        range_end (int): A positive integer from which the produced range is
           stopped. Value is exclusive. Defaults to the maximum value found in
           the provided ``range_string``.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
           of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
           integers.

    >>> complement_int_list('1,3,5-8,10-11,15')
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=0)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=1)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=2)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=3)
    '4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=15)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=14)
    '0,2,4,9,12-13'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=13)
    '0,2,4,9,12'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=20)
    '0,2,4,9,12-14,16-19'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=0)
    ''

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=-1)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=-1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=1, range_end=1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=2)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=3)
    '2'

    >>> complement_int_list('1,3,5-8', range_start=-10, range_end=-5)
    ''

    >>> complement_int_list('1,3,5-8', range_start=20, range_end=10)
    ''

    >>> complement_int_list('')
    ''
    """
    present = set(parse_int_list(range_string, delim, range_delim))

    # Default end bound: one past the largest value found in the input
    # (so that value itself is still in range); degenerate if input is empty.
    if range_end is None:
        range_end = max(present) + 1 if present else range_start

    # Negative starts behave like 0; a reversed or empty interval yields ''.
    lower = max(range_start, 0)
    missing = set(range(lower, range_end)) - present
    return format_int_list(missing, delim, range_delim)
def int_ranges_from_int_list(range_string, delim=',', range_delim='-'):
    """ Transform a string of ranges (*range_string*) into a tuple of tuples.

    Args:
        range_string (str): String of comma separated positive integers or
           ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
           used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
           of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
           integers.

    >>> int_ranges_from_int_list('1,3,5-8,10-11,15')
    ((1, 1), (3, 3), (5, 8), (10, 11), (15, 15))

    >>> int_ranges_from_int_list('1')
    ((1, 1),)

    >>> int_ranges_from_int_list('')
    ()
    """
    bounds = []
    # Walk the deduplicated, ascending values and either extend the current
    # contiguous (start, end) pair or open a fresh single-value pair.
    for value in sorted(set(parse_int_list(range_string, delim, range_delim))):
        if bounds and value == bounds[-1][1] + 1:
            bounds[-1] = (bounds[-1][0], value)
        else:
            bounds.append((value, value))
    return tuple(bounds)
class MultiReplace(object):
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced it allows for all of the matching
    values to be replaced in a single pass which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken)'
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order will be non-deterministic::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention and
    if you need to rely on order then consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed."""
        opts = {'regex': False, 'flags': 0}
        opts.update(kwargs)
        self.group_map = {}
        alternatives = []

        # Normalize a mapping to (search, replacement) pairs; an iterable of
        # pairs is consumed in its given order.
        pairs = sub_map.items() if isinstance(sub_map, Mapping) else sub_map

        for idx, pair in enumerate(pairs):
            group_name = 'group{0}'.format(idx)
            target = pair[0]
            if isinstance(target, basestring):
                # Plain strings are escaped unless regex mode is requested.
                exp = target if opts['regex'] else re.escape(target)
            else:
                # Pre-compiled pattern objects contribute their raw pattern.
                exp = target.pattern
            alternatives.append('(?P<{}>{})'.format(group_name, exp))
            self.group_map[group_name] = pair[1]

        # One alternation covers every search key, so .sub runs a single pass.
        self.combined_pattern = re.compile('|'.join(alternatives),
                                           flags=opts['flags'])

    def _get_value(self, match):
        """Given a match object find replacement value."""
        groups = match.groupdict()
        # Exactly one alternative matched; its group holds non-empty text.
        matched_name = [name for name in groups if groups[name]][0]
        return self.group_map[matched_name]

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.
        """
        return self.combined_pattern.sub(self._get_value, text)
def multi_replace(text, sub_map, **kwargs):
    """
    Shortcut function to invoke MultiReplace in a single call.

    Example Usage::

        from boltons.stringutils import multi_replace
        new = multi_replace(
            'The foo bar cat ate a bat',
            {'foo': 'zoo', 'cat': 'hat', 'bat': 'kraken'}
        )
        new == 'The zoo bar hat ate a kraken'
    """
    # Build a throwaway MultiReplace and apply it immediately.
    return MultiReplace(sub_map, **kwargs).sub(text)
def unwrap_text(text, ending='\n\n'):
    r"""
    Unwrap text, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
           by. Pass ``None`` to get the list. Defaults to '\n\n' for
           compatibility with Markdown and RST.

    """
    paragraphs = []
    current_lines = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            # Still inside a paragraph: accumulate its wrapped lines.
            current_lines.append(stripped)
        else:
            # Blank line ends the paragraph (possibly an empty one).
            paragraphs.append(' '.join(current_lines))
            current_lines = []
    if current_lines:
        # Text that does not end with a blank line still forms a paragraph.
        paragraphs.append(' '.join(current_lines))
    return paragraphs if ending is None else ending.join(paragraphs)