Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/boltons/strutils.py: 22%

405 statements  


1# -*- coding: utf-8 -*- 

2 

3# Copyright (c) 2013, Mahmoud Hashemi 

4# 

5# Redistribution and use in source and binary forms, with or without 

6# modification, are permitted provided that the following conditions are 

7# met: 

8# 

9# * Redistributions of source code must retain the above copyright 

10# notice, this list of conditions and the following disclaimer. 

11# 

12# * Redistributions in binary form must reproduce the above 

13# copyright notice, this list of conditions and the following 

14# disclaimer in the documentation and/or other materials provided 

15# with the distribution. 

16# 

17# * The names of the contributors may not be used to endorse or 

18# promote products derived from this software without specific 

19# prior written permission. 

20# 

21# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 

22# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 

23# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 

24# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 

25# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 

26# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 

27# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 

28# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 

29# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 

30# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 

31# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

32 

33"""So much practical programming involves string manipulation, which 

34Python readily accommodates. Still, there are dozens of basic and 

35common capabilities missing from the standard library, several of them 

36provided by ``strutils``. 

37""" 

38 

39from __future__ import print_function 

40 

41import re 

42import sys 

43import uuid 

44import zlib 

45import string 

46import unicodedata 

47import collections 

48from gzip import GzipFile 

49 

50try: 

51 from cStringIO import StringIO 

52except ImportError: 

53 from io import BytesIO as StringIO 

54 

55try: 

56 from collections.abc import Mapping 

57except ImportError: 

58 from collections import Mapping 

59 

60try: 

61 unicode, str, bytes, basestring = unicode, str, str, basestring 

62 from HTMLParser import HTMLParser 

63 import htmlentitydefs 

64except NameError: # basestring not defined in Python 3 

65 unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes) 

66 unichr = chr 

67 from html.parser import HTMLParser 

68 from html import entities as htmlentitydefs 

69 

70try: 

71 import __builtin__ as builtins 

72except ImportError: 

73 import builtins 

74 

75__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws', 

76 'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize', 

77 'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi', 

78 'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes', 

79 'iter_splitlines', 'indent', 'escape_shell_args', 

80 'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list', 

81 'complement_int_list', 'int_ranges_from_int_list', 'MultiReplace', 

82 'multi_replace', 'unwrap_text'] 

83 

84 

85_punct_ws_str = string.punctuation + string.whitespace 

86_punct_re = re.compile('[' + _punct_ws_str + ']+') 

87_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))') 

88 

89 

90def camel2under(camel_string): 

91 """Converts a camelcased string to underscores. Useful for turning a 

92 class name into a function name. 

93 

94 >>> camel2under('BasicParseTest') 

95 'basic_parse_test' 

96 """ 

97 return _camel2under_re.sub(r'_\1', camel_string).lower() 

98 

99 

100def under2camel(under_string): 

101 """Converts an underscored string to camelcased. Useful for turning a 

102 function name into a class name. 

103 

104 >>> under2camel('complex_tokenizer') 

105 'ComplexTokenizer' 

106 """ 

107 return ''.join(w.capitalize() or '_' for w in under_string.split('_')) 

108 

109 

110def slugify(text, delim='_', lower=True, ascii=False): 

111 """ 

112 A basic function that turns text full of scary characters 

113 (i.e., punctuation and whitespace) into a relatively safe 

114 lowercased string separated only by the delimiter specified 

115 by *delim*, which defaults to ``_``. 

116 

117 The *ascii* convenience flag will :func:`asciify` the slug if 

118 you require ascii-only slugs. 

119 

120 >>> slugify('First post! Hi!!!!~1 ') 

121 'first_post_hi_1' 

122 

123 >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \ 

124 b'kurt_goedel_s_pretty_cool' 

125 True 

126 

127 """ 

128 ret = delim.join(split_punct_ws(text)) or delim if text else '' 

129 if ascii: 

130 ret = asciify(ret) 

131 if lower: 

132 ret = ret.lower() 

133 return ret 

134 

135 

136def split_punct_ws(text): 

137 """While :meth:`str.split` will split on whitespace, 

138 :func:`split_punct_ws` will split on punctuation and 

139 whitespace. This is used internally by :func:`slugify`, above. 

140 

141 >>> split_punct_ws('First post! Hi!!!!~1 ') 

142 ['First', 'post', 'Hi', '1'] 

143 """ 

144 return [w for w in _punct_re.split(text) if w] 

145 

146 

147def unit_len(sized_iterable, unit_noun='item'): # TODO: len_units()/unitize()? 

148 """Returns a plain-English description of an iterable's 

149 :func:`len()`, conditionally pluralized with :func:`cardinalize`, 

150 detailed below. 

151 

152 >>> print(unit_len(range(10), 'number')) 

153 10 numbers 

154 >>> print(unit_len('aeiou', 'vowel')) 

155 5 vowels 

156 >>> print(unit_len([], 'worry')) 

157 No worries 

158 """ 

159 count = len(sized_iterable) 

160 units = cardinalize(unit_noun, count) 

161 if count: 

162 return u'%s %s' % (count, units) 

163 return u'No %s' % (units,) 

164 

165 

166_ORDINAL_MAP = {'1': 'st', 

167 '2': 'nd', 

168 '3': 'rd'} # 'th' is the default 

169 

170 

171def ordinalize(number, ext_only=False): 

172 """Turns *number* into its cardinal form, i.e., 1st, 2nd, 

173 3rd, 4th, etc. If the last character isn't a digit, it returns the 

174 string value unchanged. 

175 

176 Args: 

177 number (int or str): Number to be ordinalized. 

178 ext_only (bool): Whether to return only the suffix. Default ``False``. 

179 

180 >>> print(ordinalize(1)) 

181 1st 

182 >>> print(ordinalize(3694839230)) 

183 3694839230th 

184 >>> print(ordinalize('hi')) 

185 hi 

186 >>> print(ordinalize(1515)) 

187 1515th 

188 """ 

189 numstr, ext = unicode(number), '' 

190 if numstr and numstr[-1] in string.digits: 

191 try: 

192 # first check for teens 

193 if numstr[-2] == '1': 

194 ext = 'th' 

195 else: 

196 # all other cases 

197 ext = _ORDINAL_MAP.get(numstr[-1], 'th') 

198 except IndexError: 

199 # single digit numbers (will reach here based on [-2] above) 

200 ext = _ORDINAL_MAP.get(numstr[-1], 'th') 

201 if ext_only: 

202 return ext 

203 else: 

204 return numstr + ext 

205 
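# Illustrative doctest-style sketch (not part of the original docstring):
# the "teens" branch above is what keeps 11-13 from getting 'st'/'nd'/'rd'.
#
#   >>> print(ordinalize(11), ordinalize(12), ordinalize(13), ordinalize(22))
#   11th 12th 13th 22nd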

206 

207def cardinalize(unit_noun, count): 

208 """Conditionally pluralizes a singular word *unit_noun* if 

209 *count* is not one, preserving case when possible. 

210 

211 >>> vowels = 'aeiou' 

212 >>> print(len(vowels), cardinalize('vowel', len(vowels))) 

213 5 vowels 

214 >>> print(3, cardinalize('Wish', 3)) 

215 3 Wishes 

216 """ 

217 if count == 1: 

218 return unit_noun 

219 return pluralize(unit_noun) 

220 

221 

222def singularize(word): 

223 """Semi-intelligently converts an English plural *word* to its 

224 singular form, preserving case pattern. 

225 

226 >>> singularize('chances') 

227 'chance' 

228 >>> singularize('Activities') 

229 'Activity' 

230 >>> singularize('Glasses') 

231 'Glass' 

232 >>> singularize('FEET') 

233 'FOOT' 

234 

235 """ 

236 orig_word, word = word, word.strip().lower() 

237 if not word or word in _IRR_S2P: 

238 return orig_word 

239 

240 irr_singular = _IRR_P2S.get(word) 

241 if irr_singular: 

242 singular = irr_singular 

243 elif not word.endswith('s'): 

244 return orig_word 

245 elif len(word) == 2: 

246 singular = word[:-1] # or just return word? 

247 elif word.endswith('ies') and word[-4:-3] not in 'aeiou': 

248 singular = word[:-3] + 'y' 

249 elif word.endswith('es') and word[-3] == 's': 

250 singular = word[:-2] 

251 else: 

252 singular = word[:-1] 

253 return _match_case(orig_word, singular) 

254 

255 

256def pluralize(word): 

257 """Semi-intelligently converts an English *word* from singular form to 

258 plural, preserving case pattern. 

259 

260 >>> pluralize('friend') 

261 'friends' 

262 >>> pluralize('enemy') 

263 'enemies' 

264 >>> pluralize('Sheep') 

265 'Sheep' 

266 """ 

267 orig_word, word = word, word.strip().lower() 

268 if not word or word in _IRR_P2S: 

269 return orig_word 

270 irr_plural = _IRR_S2P.get(word) 

271 if irr_plural: 

272 plural = irr_plural 

273 elif word.endswith('y') and word[-2:-1] not in 'aeiou': 

274 plural = word[:-1] + 'ies' 

275 elif word[-1] == 's' or word.endswith('ch') or word.endswith('sh'): 

276 plural = word if word.endswith('es') else word + 'es' 

277 else: 

278 plural = word + 's' 

279 return _match_case(orig_word, plural) 

280 

281 

282def _match_case(master, disciple): 

283 if not master.strip(): 

284 return disciple 

285 if master.lower() == master: 

286 return disciple.lower() 

287 elif master.upper() == master: 

288 return disciple.upper() 

289 elif master.title() == master: 

290 return disciple.title() 

291 return disciple 

292 

293 

294# Singular to plural map of irregular pluralizations 

295_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae', 

296 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae', 

297 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli', 

298 'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux', 

299 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti', 

300 'calf': 'calves', 'child': 'children', 'corps': 'corps', 

301 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria', 

302 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer', 

303 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves', 

304 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses', 

305 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata', 

306 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci', 

307 'foot': 'feet', 'formula': 'formulas', 

308 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese', 

309 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami', 

310 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices', 

311 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives', 

312 'loaf': 'loaves', 'louse': 'lice', 'man': 'men', 

313 'matrix': 'matrices', 'means': 'means', 'medium': 'media', 

314 'memorandum': 'memoranda', 'millennium': 'millennia', 'moose': 'moose', 

315 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae', 

316 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases', 

317 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova', 

318 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses', 

319 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes', 

320 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors', 

321 'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep': 

322 'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus': 

323 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium': 

324 'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses', 

325 'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses', 

326 'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth': 

327 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto': 

328 'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives', 

329 'wolf': 'wolves', 'woman': 'women'} 

330 

331 

332# Reverse index of the above 

333_IRR_P2S = dict([(v, k) for k, v in _IRR_S2P.items()]) 

334 

335HASHTAG_RE = re.compile(r"(?:^|\s)[##]{1}(\w+)", re.UNICODE) 

336 

337 

338def find_hashtags(string): 

339 """Finds and returns all hashtags in a string, with the hashmark 

340 removed. Supports full-width hashmarks for Asian languages and 

341 does not false-positive on URL anchors. 

342 

343 >>> find_hashtags('#atag http://asite/#ananchor') 

344 ['atag'] 

345 

346 ``find_hashtags`` also works with unicode hashtags. 

347 """ 

348 

349 # the following works, doctest just struggles with it 

350 # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo") 

351 # [u'\u80af\u5fb7\u57fa'] 

352 return HASHTAG_RE.findall(string) 

353 

354 

355def a10n(string): 

356 """That thing where "internationalization" becomes "i18n", what's it 

357 called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form 

358 of `numeronym`_.) 

359 

360 >>> a10n('abbreviation') 

361 'a10n' 

362 >>> a10n('internationalization') 

363 'i18n' 

364 >>> a10n('') 

365 '' 

366 

367 .. _numeronym: http://en.wikipedia.org/wiki/Numeronym 

368 """ 

369 if len(string) < 3: 

370 return string 

371 return '%s%s%s' % (string[0], len(string[1:-1]), string[-1]) 

372 

373 

374# Based on https://en.wikipedia.org/wiki/ANSI_escape_code#Escape_sequences 

375ANSI_SEQUENCES = re.compile(r''' 

376 \x1B # Sequence starts with ESC, i.e. hex 0x1B 

377 (?: 

378 [@-Z\\-_] # Second byte: 

379 # all 0x40–0x5F range but CSI char, i.e ASCII @A–Z\]^_ 

380 | # Or 

381 \[ # CSI sequences, starting with [ 

382 [0-?]* # Parameter bytes: 

383 # range 0x30–0x3F, ASCII 0–9:;<=>? 

384 [ -/]* # Intermediate bytes: 

385 # range 0x20–0x2F, ASCII space and !"#$%&'()*+,-./ 

386 [@-~] # Final byte 

387 # range 0x40–0x7E, ASCII @A–Z[\]^_`a–z{|}~ 

388 ) 

389''', re.VERBOSE) 

390 

391 

392def strip_ansi(text): 

393 """Strips ANSI escape codes from *text*. Useful for the occasional 

394 time when a log or redirected output accidentally captures console 

395 color codes and the like. 

396 

397 >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m') 

398 'art' 

399 

400 Supports unicode, str, bytes and bytearray content as input. Returns the 

401 same type as the input. 

402 

403 There's a lot of ANSI art available for testing on `sixteencolors.net`_. 

404 This function does not interpret or render ANSI art, but you can do so with 

405 `ansi2img`_ or `escapes.js`_. 

406 

407 .. _sixteencolors.net: http://sixteencolors.net 

408 .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img 

409 .. _escapes.js: https://github.com/atdt/escapes.js 

410 """ 

411 # TODO: move to cliutils.py 

412 

413 # Transform any bytes-like content to unicode to allow the regex to match, and 

414 # save the input type for later. 

415 target_type = None 

416 # unicode is aliased to the builtin str only in a Python 3 environment. 

417 is_py3 = (unicode == builtins.str) 

418 if is_py3 and isinstance(text, (bytes, bytearray)): 

419 target_type = type(text) 

420 text = text.decode('utf-8') 

421 

422 cleaned = ANSI_SEQUENCES.sub('', text) 

423 

424 # Transform the result back to the same bytes/bytearray type the user provided. 

425 if target_type and target_type != type(cleaned): 

426 cleaned = target_type(cleaned, 'utf-8') 

427 

428 return cleaned 

429 
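# Illustrative sketch (not part of the original module) of the bytes
# handling described in the docstring above: byte input is decoded,
# cleaned, and re-encoded back to the same type.
#
#   >>> strip_ansi(b'\x1b[1;36mart\x1b[0m') == b'art'
#   True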

430 

431def asciify(text, ignore=False): 

432 """Converts a unicode or bytestring, *text*, into a bytestring with 

433 just ascii characters. Performs basic deaccenting for all you 

434 Europhiles out there. 

435 

436 Also, a gentle reminder that this is a **utility**, primarily meant 

437 for slugification. Whenever possible, make your application work 

438 **with** unicode, not against it. 

439 

440 Args: 

441 text (str or unicode): The string to be asciified. 

442 ignore (bool): Configures final encoding to ignore remaining 

443 unasciified unicode instead of replacing it. 

444 

445 >>> asciify('Beyoncé') == b'Beyonce' 

446 True 

447 """ 

448 try: 

449 try: 

450 return text.encode('ascii') 

451 except UnicodeDecodeError: 

452 # this usually means you passed in a non-unicode string 

453 text = text.decode('utf-8') 

454 return text.encode('ascii') 

455 except UnicodeEncodeError: 

456 mode = 'replace' 

457 if ignore: 

458 mode = 'ignore' 

459 transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP)) 

460 ret = transd.encode('ascii', mode) 

461 return ret 

462 

463 

464def is_ascii(text): 

465 """Check if a unicode or bytestring, *text*, is composed of ascii 

466 characters only. Raises :exc:`ValueError` if argument is not text. 

467 

468 Args: 

469 text (str or unicode): The string to be checked. 

470 

471 >>> is_ascii('Beyoncé') 

472 False 

473 >>> is_ascii('Beyonce') 

474 True 

475 """ 

476 if isinstance(text, unicode): 

477 try: 

478 text.encode('ascii') 

479 except UnicodeEncodeError: 

480 return False 

481 elif isinstance(text, bytes): 

482 try: 

483 text.decode('ascii') 

484 except UnicodeDecodeError: 

485 return False 

486 else: 

487 raise ValueError('expected text or bytes, not %r' % type(text)) 

488 return True 

489 

490 

491class DeaccenterDict(dict): 

492 "A small caching dictionary for deaccenting." 

493 def __missing__(self, key): 

494 ch = self.get(key) 

495 if ch is not None: 

496 return ch 

497 try: 

498 de = unicodedata.decomposition(unichr(key)) 

499 p1, _, p2 = de.rpartition(' ') 

500 if int(p2, 16) == 0x308: 

501 ch = self.get(key) 

502 else: 

503 ch = int(p1, 16) 

504 except (IndexError, ValueError): 

505 ch = self.get(key, key) 

506 self[key] = ch 

507 return ch 

508 

509 try: 

510 from collections import defaultdict 

511 except ImportError: 

512 # no defaultdict means that __missing__ isn't supported in 

513 # this version of python, so we define __getitem__ 

514 def __getitem__(self, key): 

515 try: 

516 return super(DeaccenterDict, self).__getitem__(key) 

517 except KeyError: 

518 return self.__missing__(key) 

519 else: 

520 del defaultdict 

521 

522 

523# http://chmullig.com/2009/12/python-unicode-ascii-ifier/ 

524# For something more complete, investigate the unidecode 

525# or isounidecode packages, which are capable of performing 

526# crude transliteration. 

527_BASE_DEACCENT_MAP = { 

528 0xc6: u"AE", # Æ LATIN CAPITAL LETTER AE 

529 0xd0: u"D", # Ð LATIN CAPITAL LETTER ETH 

530 0xd8: u"OE", # Ø LATIN CAPITAL LETTER O WITH STROKE 

531 0xde: u"Th", # Þ LATIN CAPITAL LETTER THORN 

532 0xc4: u'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS 

533 0xd6: u'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS 

534 0xdc: u'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS 

535 0xc0: u"A", # À LATIN CAPITAL LETTER A WITH GRAVE 

536 0xc1: u"A", # Á LATIN CAPITAL LETTER A WITH ACUTE 

537 0xc3: u"A", # Ã LATIN CAPITAL LETTER A WITH TILDE 

538 0xc7: u"C", # Ç LATIN CAPITAL LETTER C WITH CEDILLA 

539 0xc8: u"E", # È LATIN CAPITAL LETTER E WITH GRAVE 

540 0xc9: u"E", # É LATIN CAPITAL LETTER E WITH ACUTE 

541 0xca: u"E", # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX 

542 0xcc: u"I", # Ì LATIN CAPITAL LETTER I WITH GRAVE 

543 0xcd: u"I", # Í LATIN CAPITAL LETTER I WITH ACUTE 

544 0xd2: u"O", # Ò LATIN CAPITAL LETTER O WITH GRAVE 

545 0xd3: u"O", # Ó LATIN CAPITAL LETTER O WITH ACUTE 

546 0xd5: u"O", # Õ LATIN CAPITAL LETTER O WITH TILDE 

547 0xd9: u"U", # Ù LATIN CAPITAL LETTER U WITH GRAVE 

548 0xda: u"U", # Ú LATIN CAPITAL LETTER U WITH ACUTE 

549 0xdf: u"ss", # ß LATIN SMALL LETTER SHARP S 

550 0xe6: u"ae", # æ LATIN SMALL LETTER AE 

551 0xf0: u"d", # ð LATIN SMALL LETTER ETH 

552 0xf8: u"oe", # ø LATIN SMALL LETTER O WITH STROKE 

553 0xfe: u"th", # þ LATIN SMALL LETTER THORN, 

554 0xe4: u'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS 

555 0xf6: u'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS 

556 0xfc: u'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS 

557 0xe0: u"a", # à LATIN SMALL LETTER A WITH GRAVE 

558 0xe1: u"a", # á LATIN SMALL LETTER A WITH ACUTE 

559 0xe3: u"a", # ã LATIN SMALL LETTER A WITH TILDE 

560 0xe7: u"c", # ç LATIN SMALL LETTER C WITH CEDILLA 

561 0xe8: u"e", # è LATIN SMALL LETTER E WITH GRAVE 

562 0xe9: u"e", # é LATIN SMALL LETTER E WITH ACUTE 

563 0xea: u"e", # ê LATIN SMALL LETTER E WITH CIRCUMFLEX 

564 0xec: u"i", # ì LATIN SMALL LETTER I WITH GRAVE 

565 0xed: u"i", # í LATIN SMALL LETTER I WITH ACUTE 

566 0xf2: u"o", # ò LATIN SMALL LETTER O WITH GRAVE 

567 0xf3: u"o", # ó LATIN SMALL LETTER O WITH ACUTE 

568 0xf5: u"o", # õ LATIN SMALL LETTER O WITH TILDE 

569 0xf9: u"u", # ù LATIN SMALL LETTER U WITH GRAVE 

570 0xfa: u"u", # ú LATIN SMALL LETTER U WITH ACUTE 

571 0x2018: u"'", # ‘ LEFT SINGLE QUOTATION MARK 

572 0x2019: u"'", # ’ RIGHT SINGLE QUOTATION MARK 

573 0x201c: u'"', # “ LEFT DOUBLE QUOTATION MARK 

574 0x201d: u'"', # ” RIGHT DOUBLE QUOTATION MARK 

575 } 

576 

577 

578DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP) 

579 
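# Illustrative sketch of the fallback path (not from the original source):
# characters absent from _BASE_DEACCENT_MAP, e.g. u'\xf1' (n with tilde),
# are resolved lazily by DeaccenterDict.__missing__ via their Unicode
# decomposition, so asciify() still yields plain ASCII for them.
#
#   >>> asciify(u'ma\xf1ana') == b'manana'
#   True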

580 

581_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y') 

582_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)] 

583_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:])) 

584 

585 

586def bytes2human(nbytes, ndigits=0): 

587 """Turns an integer value of *nbytes* into a human readable format. Set 

588 *ndigits* to control how many digits after the decimal point 

589 should be shown (default ``0``). 

590 

591 >>> bytes2human(128991) 

592 '126K' 

593 >>> bytes2human(100001221) 

594 '95M' 

595 >>> bytes2human(0, 2) 

596 '0.00B' 

597 """ 

598 abs_bytes = abs(nbytes) 

599 for (size, symbol), (next_size, next_symbol) in _SIZE_RANGES: 

600 if abs_bytes <= next_size: 

601 break 

602 hnbytes = float(nbytes) / size 

603 return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes, 

604 ndigits=ndigits, 

605 symbol=symbol) 

606 

607 

608class HTMLTextExtractor(HTMLParser): 

609 def __init__(self): 

610 self.reset() 

611 self.strict = False 

612 self.convert_charrefs = True 

613 self.result = [] 

614 

615 def handle_data(self, d): 

616 self.result.append(d) 

617 

618 def handle_charref(self, number): 

619 if number[0] == u'x' or number[0] == u'X': 

620 codepoint = int(number[1:], 16) 

621 else: 

622 codepoint = int(number) 

623 self.result.append(unichr(codepoint)) 

624 

625 def handle_entityref(self, name): 

626 try: 

627 codepoint = htmlentitydefs.name2codepoint[name] 

628 except KeyError: 

629 self.result.append(u'&' + name + u';') 

630 else: 

631 self.result.append(unichr(codepoint)) 

632 

633 def get_text(self): 

634 return u''.join(self.result) 

635 

636 

637def html2text(html): 

638 """Strips tags from HTML text, returning markup-free text. Also, does 

639 a best-effort replacement of entities like "&nbsp;". 

640 

641 >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394&#x03b7;&#956;&#x03CE;)</em></a>') 

642 >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)' 

643 True 

644 """ 

645 # based on answers to http://stackoverflow.com/questions/753052/ 

646 s = HTMLTextExtractor() 

647 s.feed(html) 

648 return s.get_text() 

649 

650 

651_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00' 

652_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00' 

653 

654 

655def gunzip_bytes(bytestring): 

656 """The :mod:`gzip` module is great if you have a file or file-like 

657 object, but what if you just have bytes? StringIO is one 

658 possibility, but it's often faster, easier, and simpler to just 

659 use this one-liner. Use this tried-and-true utility function to 

660 decompress gzip from bytes. 

661 

662 >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b'' 

663 True 

664 >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!' 

665 True 

666 """ 

667 return zlib.decompress(bytestring, 16 + zlib.MAX_WBITS) 

668 

669 

670def gzip_bytes(bytestring, level=6): 

671 """Turn some bytes into some compressed bytes. 

672 

673 >>> len(gzip_bytes(b'a' * 10000)) 

674 46 

675 

676 Args: 

677 bytestring (bytes): Bytes to be compressed 

678 level (int): An integer, 1-9, controlling the 

679 speed/compression. 1 is fastest, least compressed, 9 is 

680 slowest, but most compressed. 

681 

682 Note that all levels of gzip are pretty fast these days, though 

683 it's not really a competitor in compression, at any level. 

684 """ 

685 out = StringIO() 

686 f = GzipFile(fileobj=out, mode='wb', compresslevel=level) 

687 f.write(bytestring) 

688 f.close() 

689 return out.getvalue() 

690 
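# Illustrative round-trip sketch (not from the original docstrings):
# gzip_bytes() and gunzip_bytes() above are inverses of each other.
#
#   >>> gunzip_bytes(gzip_bytes(b'bytes ahoy')) == b'bytes ahoy'
#   True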

691 

692 

693_line_ending_re = re.compile(u'(\r\n|\n|\x0b|\f|\r|\x85|\u2028|\u2029)', 

694 re.UNICODE) 

695 

696 

697def iter_splitlines(text): 

698 r"""Like :meth:`str.splitlines`, but returns an iterator of lines 

699 instead of a list. Also similar to :meth:`file.next`, as that also 

700 lazily reads and yields lines from a file. 

701 

702 This function works with a variety of line endings, but as always, 

703 be careful when mixing line endings within a file. 

704 

705 >>> list(iter_splitlines('\nhi\nbye\n')) 

706 ['', 'hi', 'bye', ''] 

707 >>> list(iter_splitlines('\r\nhi\rbye\r\n')) 

708 ['', 'hi', 'bye', ''] 

709 >>> list(iter_splitlines('')) 

710 [] 

711 """ 

712 prev_end, len_text = 0, len(text) 

713 # print('last: %r' % last_idx) 

714 # start, end = None, None 

715 for match in _line_ending_re.finditer(text): 

716 start, end = match.start(1), match.end(1) 

717 # print(start, end) 

718 if prev_end <= start: 

719 yield text[prev_end:start] 

720 if end == len_text: 

721 yield '' 

722 prev_end = end 

723 tail = text[prev_end:] 

724 if tail: 

725 yield tail 

726 return 

727 

728 

729def indent(text, margin, newline='\n', key=bool): 

730 """The missing counterpart to the built-in :func:`textwrap.dedent`. 

731 

732 Args: 

733 text (str): The text to indent. 

734 margin (str): The string to prepend to each line. 

735 newline (str): The newline used to rejoin the lines (default: ``\\n``) 

736 key (callable): Called on each line to determine whether to 

737 indent it. Default: :class:`bool`, to ensure that empty lines do 

738 not get whitespace added. 

739 """ 

740 indented_lines = [(margin + line if key(line) else line) 

741 for line in iter_splitlines(text)] 

742 return newline.join(indented_lines) 

743 
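# Illustrative usage sketch (not from the original docstring): each
# non-empty line gets *margin* prepended, while blank lines pass through
# untouched thanks to the default key=bool.
#
#   >>> indent('hello\n\nworld', '    ')
#   '    hello\n\n    world'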

744 

745def is_uuid(obj, version=4): 

746 """Check the argument is either a valid UUID object or string. 

747 

748 Args: 

749 obj (object): The test target. Strings and UUID objects supported. 

750 version (int): The target UUID version, set to 0 to skip version check. 

751 

752 >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea') 

753 True 

754 >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9') 

755 False 

756 >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1) 

757 True 

758 """ 

759 if not isinstance(obj, uuid.UUID): 

760 try: 

761 obj = uuid.UUID(obj) 

762 except (TypeError, ValueError, AttributeError): 

763 return False 

764 if version and obj.version != int(version): 

765 return False 

766 return True 

767 
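# Illustrative doctest-style notes (not from the original docstring):
# version=0 disables the version check documented above, and non-UUID
# input simply returns False.
#
#   >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=0)
#   True
#   >>> is_uuid(42)
#   False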

768 

769def escape_shell_args(args, sep=' ', style=None): 

770 """Returns an escaped version of each string in *args*, according to 

771 according to *style* and joined by *sep*. 

772 

773 Args: 

774 args (list): A list of arguments to escape and join together 

775 sep (str): The separator used to join the escaped arguments. 

776 style (str): The style of escaping to use. Can be one of 

777 ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc., 

778 respectively. If *style* is ``None``, then it is picked 

779 according to the system platform. 

780 

781 See :func:`args2cmd` and :func:`args2sh` for details and example 

782 output for each style. 

783 """ 

784 if not style: 

785 style = 'cmd' if sys.platform == 'win32' else 'sh' 

786 

787 if style == 'sh': 

788 return args2sh(args, sep=sep) 

789 elif style == 'cmd': 

790 return args2cmd(args, sep=sep) 

791 

792 raise ValueError("style expected one of 'cmd' or 'sh', not %r" % style) 

793 
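# Illustrative sketch of the dispatch described above (assumes the default
# separator; 'sh' output shown POSIX-style): style='sh' routes through
# args2sh, while style='cmd' produces the Windows-style quoting of args2cmd.
#
#   >>> escape_shell_args(['ls', '-l', 'my file.txt'], style='sh')
#   "ls -l 'my file.txt'"
#   >>> escape_shell_args(['ls', '-l', 'my file.txt'], style='cmd')
#   'ls -l "my file.txt"'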

794 

795_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search 

796 

797 

798def args2sh(args, sep=' '): 

799 """Return a shell-escaped string version of *args*, separated by 

800 *sep*, based on the rules of sh, bash, and other shells in the 

801 Linux/BSD/MacOS ecosystem. 

802 

803 >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd'])) 

804 aa '[bb]' 'cc'"'"'cc' 'dd"dd' 

805 

806 As you can see, arguments with no special characters are not 

807 escaped, arguments with special characters are quoted with single 

808 quotes, and single quotes themselves are quoted with double 

809 quotes. Double quotes are handled like any other special 

810 character. 

811 

812 Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also 

813 note that :mod:`shlex` and :mod:`argparse` have functions to split 

814 and parse strings escaped in this manner. 

815 """ 

816 ret_list = [] 

817 

818 for arg in args: 

819 if not arg: 

820 ret_list.append("''") 

821 continue 

822 if _find_sh_unsafe(arg) is None: 

823 ret_list.append(arg) 

824 continue 

825 # use single quotes, and put single quotes into double quotes 

826 # the string $'b is then quoted as '$'"'"'b' 

827 ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'") 

828 

829 return sep.join(ret_list) 

830 

831 

832def args2cmd(args, sep=' '): 

833 r"""Return a shell-escaped string version of *args*, separated by 

834 *sep*, using the same rules as the Microsoft C runtime. 

835 

836 >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd'])) 

837 aa [bb] cc'cc dd\"dd 

838 

839 As you can see, escaping is through backslashing and not quoting, 

840 and double quotes are the only special character. See the comment 

841 in the code for more details. Based on internal code from the 

842 :mod:`subprocess` module. 

843 

844 """ 

845 # technique description from subprocess below 

846 """ 

847 1) Arguments are delimited by white space, which is either a 

848 space or a tab. 

849 

850 2) A string surrounded by double quotation marks is 

851 interpreted as a single argument, regardless of white space 

852 contained within. A quoted string can be embedded in an 

853 argument. 

854 

855 3) A double quotation mark preceded by a backslash is 

856 interpreted as a literal double quotation mark. 

857 

858 4) Backslashes are interpreted literally, unless they 

859 immediately precede a double quotation mark. 

860 

861 5) If backslashes immediately precede a double quotation mark, 

862 every pair of backslashes is interpreted as a literal 

863 backslash. If the number of backslashes is odd, the last 

864 backslash escapes the next double quotation mark as 

865 described in rule 3. 

866 

867 See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx 

868 or search http://msdn.microsoft.com for 

869 "Parsing C++ Command-Line Arguments" 

870 """ 

871 result = [] 

872 needquote = False 

873 for arg in args: 

874 bs_buf = [] 

875 

876 # Add the separator to set this argument off from the previous one 

877 if result: 

878 result.append(sep) 

879 

880 needquote = (" " in arg) or ("\t" in arg) or not arg 

881 if needquote: 

882 result.append('"') 

883 

884 for c in arg: 

885 if c == '\\': 

886 # Don't know if we need to double yet. 

887 bs_buf.append(c) 

888 elif c == '"': 

889 # Double backslashes. 

890 result.append('\\' * len(bs_buf)*2) 

891 bs_buf = [] 

892 result.append('\\"') 

893 else: 

894 # Normal char 

895 if bs_buf: 

896 result.extend(bs_buf) 

897 bs_buf = [] 

898 result.append(c) 

899 

900 # Add remaining backslashes, if any. 

901 if bs_buf: 

902 result.extend(bs_buf) 

903 

904 if needquote: 

905 result.extend(bs_buf) 

906 result.append('"') 

907 

908 return ''.join(result) 

909 

910 

911def parse_int_list(range_string, delim=',', range_delim='-'): 

912 """Returns a sorted list of positive integers based on 

913 *range_string*. Reverse of :func:`format_int_list`. 

914 

915 Args: 

916 range_string (str): String of comma separated positive 

917 integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom 

918 page range string used in printer dialogs. 

919 delim (char): Defaults to ','. Separates integers and 

920 contiguous ranges of integers. 

921 range_delim (char): Defaults to '-'. Indicates a contiguous 

922 range of integers. 

923 

924 >>> parse_int_list('1,3,5-8,10-11,15') 

925 [1, 3, 5, 6, 7, 8, 10, 11, 15] 

926 

927 """ 

928 output = [] 

929 

930 for x in range_string.strip().split(delim): 

931 

932 # Range 

933 if range_delim in x: 

934 range_limits = list(map(int, x.split(range_delim))) 

935 output += list(range(min(range_limits), max(range_limits)+1)) 

936 

937 # Empty String 

938 elif not x: 

939 continue 

940 

941 # Integer 

942 else: 

943 output.append(int(x)) 

944 

945 return sorted(output) 

946 

947 

948def format_int_list(int_list, delim=',', range_delim='-', delim_space=False): 

949 """Returns a sorted range string from a list of positive integers 

950 (*int_list*). Contiguous ranges of integers are collapsed to min 

951 and max values. Reverse of :func:`parse_int_list`. 

952 

953 Args: 

954 int_list (list): List of positive integers to be converted 

955 into a range string (e.g. [1,2,4,5,6,8]). 

956 delim (char): Defaults to ','. Separates integers and 

957 contiguous ranges of integers. 

958 range_delim (char): Defaults to '-'. Indicates a contiguous 

959 range of integers. 

960 delim_space (bool): Defaults to ``False``. If ``True``, adds a 

961 space after all *delim* characters. 

962 

963 >>> format_int_list([1,3,5,6,7,8,10,11,15]) 

964 '1,3,5-8,10-11,15' 

965 

966 """ 

967 output = [] 

968 contig_range = collections.deque() 

969 

970 for x in sorted(int_list): 

971 

972 # Handle current (and first) value. 

973 if len(contig_range) < 1: 

974 contig_range.append(x) 

975 

976 # Handle current value, given multiple previous values are contiguous. 

977 elif len(contig_range) > 1: 

978 delta = x - contig_range[-1] 

979 

980 # Current value is contiguous. 

981 if delta == 1: 

982 contig_range.append(x) 

983 

984 # Current value is non-contiguous. 

985 elif delta > 1: 

986 range_substr = '{0:d}{1}{2:d}'.format(min(contig_range), 

987 range_delim, 

988 max(contig_range)) 

989 output.append(range_substr) 

990 contig_range.clear() 

991 contig_range.append(x) 

992 

993 # Current value repeated. 

994 else: 

995 continue 

996 

997 # Handle current value, given no previous contiguous integers 

998 else: 

999 delta = x - contig_range[0] 

1000 

1001 # Current value is contiguous. 

1002 if delta == 1: 

1003 contig_range.append(x) 

1004 

1005 # Current value is non-contiguous. 

1006 elif delta > 1: 

1007 output.append('{0:d}'.format(contig_range.popleft())) 

1008 contig_range.append(x) 

1009 

1010 # Current value repeated. 

1011 else: 

1012 continue 

1013 

1014 # Handle the last value. 

1015 else: 

1016 

1017 # Last value is non-contiguous. 

1018 if len(contig_range) == 1: 

1019 output.append('{0:d}'.format(contig_range.popleft())) 

1020 contig_range.clear() 

1021 

1022 # Last value is part of contiguous range. 

1023 elif len(contig_range) > 1: 

1024 range_substr = '{0:d}{1}{2:d}'.format(min(contig_range), 

1025 range_delim, 

1026 max(contig_range)) 

1027 output.append(range_substr) 

1028 contig_range.clear() 

1029 

1030 if delim_space: 

1031 output_str = (delim+' ').join(output) 

1032 else: 

1033 output_str = delim.join(output) 

1034 

1035 return output_str 

1036 

1037 

1038def complement_int_list( 

1039 range_string, range_start=0, range_end=None, 

1040 delim=',', range_delim='-'): 

1041 """ Returns range string that is the complement of the one provided as 

1042 the *range_string* parameter. 

1043 

1044 These range strings are of the kind produced by :func:`format_int_list`, and 

1045 parseable by :func:`parse_int_list`. 

1046 

1047 Args: 

1048 range_string (str): String of comma separated positive integers or 

1049 ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string 

1050 used in printer dialogs. 

1051 range_start (int): A positive integer from which to start the resulting 

1052 range. Value is inclusive. Defaults to ``0``. 

1053 range_end (int): A positive integer from which the produced range is 

1054 stopped. Value is exclusive. Defaults to the maximum value found in 

1055 the provided ``range_string``. 

1056 delim (char): Defaults to ','. Separates integers and contiguous ranges 

1057 of integers. 

1058 range_delim (char): Defaults to '-'. Indicates a contiguous range of 

1059 integers. 

1060 

1061 >>> complement_int_list('1,3,5-8,10-11,15') 

1062 '0,2,4,9,12-14' 

1063 

1064 >>> complement_int_list('1,3,5-8,10-11,15', range_start=0) 

1065 '0,2,4,9,12-14' 

1066 

1067 >>> complement_int_list('1,3,5-8,10-11,15', range_start=1) 

1068 '2,4,9,12-14' 

1069 

1070 >>> complement_int_list('1,3,5-8,10-11,15', range_start=2) 

1071 '2,4,9,12-14' 

1072 

1073 >>> complement_int_list('1,3,5-8,10-11,15', range_start=3) 

1074 '4,9,12-14' 

1075 

1076 >>> complement_int_list('1,3,5-8,10-11,15', range_end=15) 

1077 '0,2,4,9,12-14' 

1078 

1079 >>> complement_int_list('1,3,5-8,10-11,15', range_end=14) 

1080 '0,2,4,9,12-13' 

1081 

1082 >>> complement_int_list('1,3,5-8,10-11,15', range_end=13) 

1083 '0,2,4,9,12' 

1084 

1085 >>> complement_int_list('1,3,5-8,10-11,15', range_end=20) 

1086 '0,2,4,9,12-14,16-19' 

1087 

1088 >>> complement_int_list('1,3,5-8,10-11,15', range_end=0) 

1089 '' 

1090 

1091 >>> complement_int_list('1,3,5-8,10-11,15', range_start=-1) 

1092 '0,2,4,9,12-14' 

1093 

1094 >>> complement_int_list('1,3,5-8,10-11,15', range_end=-1) 

1095 '' 

1096 

1097 >>> complement_int_list('1,3,5-8', range_start=1, range_end=1) 

1098 '' 

1099 

1100 >>> complement_int_list('1,3,5-8', range_start=2, range_end=2) 

1101 '' 

1102 

1103 >>> complement_int_list('1,3,5-8', range_start=2, range_end=3) 

1104 '2' 

1105 

1106 >>> complement_int_list('1,3,5-8', range_start=-10, range_end=-5) 

1107 '' 

1108 

1109 >>> complement_int_list('1,3,5-8', range_start=20, range_end=10) 

1110 '' 

1111 

1112 >>> complement_int_list('') 

1113 '' 

1114 """ 

1115 int_list = set(parse_int_list(range_string, delim, range_delim)) 

1116 if range_end is None: 

1117 if int_list: 

1118 range_end = max(int_list) + 1 

1119 else: 

1120 range_end = range_start 

1121 complement_values = set( 

1122 range(range_end)) - int_list - set(range(range_start)) 

1123 return format_int_list(complement_values, delim, range_delim) 

1124 

1125 

1126def int_ranges_from_int_list(range_string, delim=',', range_delim='-'): 

1127 """ Transform a string of ranges (*range_string*) into a tuple of tuples. 

1128 

1129 Args: 

1130 range_string (str): String of comma separated positive integers or 

1131 ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string 

1132 used in printer dialogs. 

1133 delim (char): Defaults to ','. Separates integers and contiguous ranges 

1134 of integers. 

1135 range_delim (char): Defaults to '-'. Indicates a contiguous range of 

1136 integers. 

1137 

1138 >>> int_ranges_from_int_list('1,3,5-8,10-11,15') 

1139 ((1, 1), (3, 3), (5, 8), (10, 11), (15, 15)) 

1140 

1141 >>> int_ranges_from_int_list('1') 

1142 ((1, 1),) 

1143 

1144 >>> int_ranges_from_int_list('') 

1145 () 

1146 """ 

1147 int_tuples = [] 

1148 # Normalize the range string to our internal format for processing. 

1149 range_string = format_int_list( 

1150 parse_int_list(range_string, delim, range_delim)) 

1151 if range_string: 

1152 for bounds in range_string.split(','): 

1153 if '-' in bounds: 

1154 start, end = bounds.split('-') 

1155 else: 

1156 start, end = bounds, bounds 

1157 int_tuples.append((int(start), int(end))) 

1158 return tuple(int_tuples) 

1159 

1160 

1161class MultiReplace(object): 

1162 """ 

1163 MultiReplace is a tool for doing multiple find/replace actions in one pass. 

1164 

1165 Given a mapping of values to be replaced it allows for all of the matching 

1166 values to be replaced in a single pass which can save a lot of performance 

1167 on very large strings. In addition to simple replace, it also allows for 

1168 replacing based on regular expressions. 

1169 

1170 Keyword Arguments: 

1171 

1172 :type regex: bool 

1173 :param regex: Treat search keys as regular expressions [Default: False] 

1174 :type flags: int 

1175 :param flags: flags to pass to the regex engine during compile 

1176 

1177 Dictionary Usage:: 

1178 

1179 from boltons import strutils 

1180 s = strutils.MultiReplace({ 

1181 'foo': 'zoo', 

1182 'cat': 'hat', 

1183 'bat': 'kraken' 

1184 }) 

1185 new = s.sub('The foo bar cat ate a bat') 

1186 new == 'The zoo bar hat ate a kraken' 

1187 

1188 Iterable Usage:: 

1189 

1190 from boltons import strutils 

1191 s = strutils.MultiReplace([ 

1192 ('foo', 'zoo'), 

1193 ('cat', 'hat'), 

1194 ('bat', 'kraken') 

1195 ]) 

1196 new = s.sub('The foo bar cat ate a bat') 

1197 new == 'The zoo bar hat ate a kraken' 

1198 

1199 

1200 The constructor can be passed a dictionary or other mapping as well as 

1201 an iterable of tuples. If given an iterable, the substitution will be run 

1202 in the order the replacement values are specified in the iterable. This is 

1203 also true if it is given an OrderedDict. If given a dictionary then the 

1204 order will be non-deterministic:: 

1205 

1206 >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar') 

1207 'bar bar bar' 

1208 >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'}) 

1209 >>> m.sub('foo bar baz') 

1210 'baz bar bar' 

1211 

1212 This is because the order of replacement can matter if you're inserting 

1213 something that might be replaced by a later substitution. Pay attention and 

1214 if you need to rely on order then consider using a list of tuples instead 

1215 of a dictionary. 

1216 """ 

1217 

1218 def __init__(self, sub_map, **kwargs): 

1219 """Compile any regular expressions that have been passed.""" 

1220 options = { 

1221 'regex': False, 

1222 'flags': 0, 

1223 } 

1224 options.update(kwargs) 

1225 self.group_map = {} 

1226 regex_values = [] 

1227 

1228 if isinstance(sub_map, Mapping): 

1229 sub_map = sub_map.items() 

1230 

1231 for idx, vals in enumerate(sub_map): 

1232 group_name = 'group{0}'.format(idx) 

1233 if isinstance(vals[0], basestring): 

1234 # If we're not treating input strings like a regex, escape it 

1235 if not options['regex']: 

1236 exp = re.escape(vals[0]) 

1237 else: 

1238 exp = vals[0] 

1239 else: 

1240 exp = vals[0].pattern 

1241 

1242 regex_values.append('(?P<{}>{})'.format(group_name, exp)) 

1243 self.group_map[group_name] = vals[1] 

1244 

1245 self.combined_pattern = re.compile( 

1246 '|'.join(regex_values), 

1247 flags=options['flags'] 

1248 ) 

1249 

1250 def _get_value(self, match): 

1251 """Given a match object find replacement value.""" 

1252 group_dict = match.groupdict() 

1253 key = [x for x in group_dict if group_dict[x]][0] 

1254 return self.group_map[key] 

1255 

1256 def sub(self, text): 

1257 """ 

1258 Run substitutions on the input text. 

1259 

1260 Given an input string, run all substitutions given in the 

1261 constructor. 

1262 """ 

1263 return self.combined_pattern.sub(self._get_value, text) 

1264 
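# Illustrative sketch of the regex mode mentioned in the class docstring
# (not from the original): with regex=True the keys are compiled as
# patterns rather than escaped, and *flags* is forwarded to re.compile.
#
#   >>> m = MultiReplace([(r'\d+', '#'), ('cat', 'hat')], regex=True)
#   >>> m.sub('cat 123 cat 4')
#   'hat # hat #'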

1265 

1266def multi_replace(text, sub_map, **kwargs): 

1267 """ 

1268 Shortcut function to invoke MultiReplace in a single call. 

1269 

1270 Example Usage:: 

1271 

1272 from boltons.strutils import multi_replace 

1273 new = multi_replace( 

1274 'The foo bar cat ate a bat', 

1275 {'foo': 'zoo', 'cat': 'hat', 'bat': 'kraken'} 

1276 ) 

1277 new == 'The zoo bar hat ate a kraken' 

1278 """ 

1279 m = MultiReplace(sub_map, **kwargs) 

1280 return m.sub(text) 

1281 

1282 

1283def unwrap_text(text, ending='\n\n'): 

1284 r""" 

1285 Unwrap text, the natural complement to :func:`textwrap.wrap`. 

1286 

1287 >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph." 

1288 >>> unwrap_text(text) 

1289 'Short lines wrapped small.\n\nAnother paragraph.' 

1290 

1291 Args: 

1292 text: A string to unwrap. 

1293 ending (str): The string to join all unwrapped paragraphs 

1294 by. Pass ``None`` to get the list. Defaults to '\n\n' for 

1295 compatibility with Markdown and RST. 

1296 

1297 """ 

1298 all_grafs = [] 

1299 cur_graf = [] 

1300 for line in text.splitlines(): 

1301 line = line.strip() 

1302 if line: 

1303 cur_graf.append(line) 

1304 else: 

1305 all_grafs.append(' '.join(cur_graf)) 

1306 cur_graf = [] 

1307 if cur_graf: 

1308 all_grafs.append(' '.join(cur_graf)) 

1309 if ending is None: 

1310 return all_grafs 

1311 return ending.join(all_grafs)