Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/boltons/strutils.py: 20%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Copyright (c) 2013, Mahmoud Hashemi
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are
5# met:
6#
7# * Redistributions of source code must retain the above copyright
8# notice, this list of conditions and the following disclaimer.
9#
10# * Redistributions in binary form must reproduce the above
11# copyright notice, this list of conditions and the following
12# disclaimer in the documentation and/or other materials provided
13# with the distribution.
14#
15# * The names of the contributors may not be used to endorse or
16# promote products derived from this software without specific
17# prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31"""So much practical programming involves string manipulation, which
32Python readily accommodates. Still, there are dozens of basic and
33common capabilities missing from the standard library, several of them
34provided by ``strutils``.
35"""
38import builtins
39import re
40import sys
41import uuid
42import zlib
43import string
44import unicodedata
45import collections
46from collections.abc import Mapping
47from gzip import GzipFile
48from html.parser import HTMLParser
49from html import entities as htmlentitydefs
50from io import BytesIO as StringIO
# Explicit public API of this module; names not listed here are internal.
__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list',
           'complement_int_list', 'int_ranges_from_int_list', 'MultiReplace',
           'multi_replace', 'unwrap_text']
# All punctuation and whitespace characters, used to build the splitter
# class below. NOTE: this relies on string.punctuation's ordering putting
# backslash immediately before ']' so the bracket is escaped in the class.
_punct_ws_str = string.punctuation + string.whitespace
# Matches one or more consecutive punctuation/whitespace characters.
_punct_re = re.compile('[' + _punct_ws_str + ']+')
# Finds camelCase word boundaries: a capital after a lowercase/digit, or a
# non-leading capital followed by a lowercase (handles acronym runs).
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
def camel2under(camel_string):
    """Converts a camelcased string to underscores. Useful for turning a
    class name into a function name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    # Insert an underscore at every detected word boundary, then lowercase.
    underscored = _camel2under_re.sub(r'_\1', camel_string)
    return underscored.lower()
78def under2camel(under_string):
79 """Converts an underscored string to camelcased. Useful for turning a
80 function name into a class name.
82 >>> under2camel('complex_tokenizer')
83 'ComplexTokenizer'
84 """
85 return ''.join(w.capitalize() or '_' for w in under_string.split('_'))
def slugify(text, delim='_', lower=True, ascii=False):
    """
    A basic function that turns text full of scary characters
    (i.e., punctuation and whitespace), into a relatively safe
    lowercased string separated only by the delimiter specified
    by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1    ')
    'first_post_hi_1'

    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True

    """
    if not text:
        ret = ''
    else:
        # A non-empty input that yields no words still produces *delim*.
        ret = delim.join(split_punct_ws(text)) or delim
    if ascii:
        ret = asciify(ret)
    if lower:
        ret = ret.lower()
    return ret
def split_punct_ws(text):
    """While :meth:`str.split` will split on whitespace,
    :func:`split_punct_ws` will split on punctuation and
    whitespace. This used internally by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1    ')
    ['First', 'post', 'Hi', '1']
    """
    # Splitting can yield empty chunks at the edges; drop them.
    return [chunk for chunk in _punct_re.split(text) if chunk]
def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Returns a plain-English description of an iterable's
    :func:`len()`, conditionally pluralized with :func:`cardinalize`,
    detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    noun = cardinalize(unit_noun, count)
    # A zero count reads as "No <plural>" rather than "0 <plural>".
    return f'{count} {noun}' if count else f'No {noun}'
# Ordinal suffix keyed by final digit; everything else (0, 4-9) gets 'th'.
_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default


def ordinalize(number, ext_only=False):
    """Turns *number* into its cardinal form, i.e., 1st, 2nd,
    3rd, 4th, etc. If the last character isn't a digit, it returns the
    string value unchanged.

    Args:
        number (int or str): Number to be cardinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr, ext = str(number), ''
    if numstr and numstr[-1] in string.digits:
        # Teens (11th, 12th, 13th, and e.g. 111th) always take 'th', so
        # check the second-to-last digit explicitly instead of relying on
        # an IndexError for single-digit inputs, as before.
        if len(numstr) > 1 and numstr[-2] == '1':
            ext = 'th'
        else:
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    if ext_only:
        return ext
    return numstr + ext
def cardinalize(unit_noun, count):
    """Conditionally pluralizes a singular word *unit_noun* if
    *count* is not one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    # Exactly one keeps the singular; everything else (incl. 0) pluralizes.
    return unit_noun if count == 1 else pluralize(unit_noun)
def singularize(word):
    """Semi-intelligently converts an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'

    """
    original = word
    word = word.strip().lower()
    # Already-singular irregulars (and empty input) pass through untouched.
    if not word or word in _IRR_S2P:
        return original

    singular = _IRR_P2S.get(word)
    if singular is None:
        if not word.endswith('s'):
            return original  # doesn't look plural at all
        if len(word) == 2:
            singular = word[:-1]  # or just return word?
        elif word.endswith('ies') and word[-4:-3] not in 'aeiou':
            # e.g. 'activities' -> 'activity'
            singular = word[:-3] + 'y'
        elif word.endswith('es') and word[-3] == 's':
            # e.g. 'glasses' -> 'glass'
            singular = word[:-2]
        else:
            singular = word[:-1]
    return _match_case(original, singular)
def pluralize(word):
    """Semi-intelligently converts an English *word* from singular form to
    plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    original = word
    word = word.strip().lower()
    # Already-plural irregulars (and empty input) pass through untouched.
    if not word or word in _IRR_P2S:
        return original

    plural = _IRR_S2P.get(word)
    if plural is None:
        if word.endswith('y') and word[-2:-1] not in 'aeiou':
            # consonant + y: 'enemy' -> 'enemies'
            plural = word[:-1] + 'ies'
        elif word[-1] == 's' or word.endswith('ch') or word.endswith('sh'):
            # sibilant endings take 'es' (unless already there)
            plural = word if word.endswith('es') else word + 'es'
        else:
            plural = word + 's'
    return _match_case(original, plural)
260def _match_case(master, disciple):
261 if not master.strip():
262 return disciple
263 if master.lower() == master:
264 return disciple.lower()
265 elif master.upper() == master:
266 return disciple.upper()
267 elif master.title() == master:
268 return disciple.title()
269 return disciple
272# Singular to plural map of irregular pluralizations
273_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
274 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
275 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
276 'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
277 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
278 'calf': 'calves', 'child': 'children', 'corps': 'corps',
279 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
280 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
281 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
282 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
283 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
284 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
285 'foot': 'feet', 'formula': 'formulae', 'formula': 'formulas',
286 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
287 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
288 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
289 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
290 'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
291 'matrix': 'matrices', 'means': 'means', 'medium': 'media',
292 'memorandum': 'memoranda', 'millennium': 'milennia', 'moose': 'moose',
293 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
294 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
295 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
296 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses',
297 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes',
298 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
299 'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep':
300 'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus':
301 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium':
302 'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses',
303 'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
304 'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth':
305 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto':
306 'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives',
307 'wolf': 'wolves', 'woman': 'women'}
310# Reverse index of the above
311_IRR_P2S = {v: k for k, v in _IRR_S2P.items()}
# A hashmark (ASCII '#' or full-width '＃') preceded by start-of-string or
# whitespace, capturing the following word characters.
HASHTAG_RE = re.compile(r"(?:^|\s)[##]{1}(\w+)", re.UNICODE)


def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """
    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    return HASHTAG_RE.findall(string)
def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    # Fewer than three characters leaves nothing to abbreviate.
    if len(string) < 3:
        return string
    # First char + count of interior chars + last char.
    return f'{string[0]}{len(string) - 2}{string[-1]}'
# Based on https://en.wikipedia.org/wiki/ANSI_escape_code#Escape_sequences
ANSI_SEQUENCES = re.compile(r'''
    \x1B            # Sequence starts with ESC, i.e. hex 0x1B
    (?:
        [@-Z\\-_]   # Second byte:
                    #   all 0x40-0x5F range but CSI char, i.e ASCII @A-Z\]^_
    |               # Or
        \[          # CSI sequences, starting with [
        [0-?]*      # Parameter bytes:
                    #   range 0x30-0x3F, ASCII 0-9:;<=>?
        [ -/]*      # Intermediate bytes:
                    #   range 0x20-0x2F, ASCII space and !"#$%&'()*+,-./
        [@-~]       # Final byte
                    #   range 0x40-0x7E, ASCII @A-Z[\]^_`a-z{|}~
    )
''', re.VERBOSE)


def strip_ansi(text):
    """Strips ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m')
    'art'

    Supports str, bytes and bytearray content as input. Returns the
    same type as the input.

    There's a lot of ANSI art available for testing on `sixteencolors.net`_.
    This function does not interpret or render ANSI art, but you can do so with
    `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py

    # The regex works on str, so decode byte input first and remember the
    # caller's type to convert the result back afterwards.
    original_type = None
    if isinstance(text, (bytes, bytearray)):
        original_type = type(text)
        text = text.decode('utf-8')

    stripped = ANSI_SEQUENCES.sub('', text)

    # Re-encode so the caller gets back the same type it passed in.
    if original_type is not None and not isinstance(stripped, original_type):
        stripped = original_type(stripped, 'utf-8')

    return stripped
def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str): The string to be asciified.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified string instead of replacing it.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            # Pure-ASCII input encodes directly.
            return text.encode('ascii')
        except UnicodeDecodeError:
            # this usually means you passed in a non-unicode string
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII characters: deaccent via DEACCENT_MAP, decompose with
        # NFKD, then encode, replacing (or ignoring) whatever remains.
        mode = 'ignore' if ignore else 'replace'
        transliterated = unicodedata.normalize('NFKD',
                                               text.translate(DEACCENT_MAP))
        return transliterated.encode('ascii', mode)
def is_ascii(text):
    """Check if a string or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, str):
        try:
            text.encode('ascii')
            return True
        except UnicodeEncodeError:
            return False
    if isinstance(text, bytes):
        try:
            text.decode('ascii')
            return True
        except UnicodeDecodeError:
            return False
    raise ValueError('expected text or bytes, not %r' % type(text))
class DeaccenterDict(dict):
    "A small caching dictionary for deaccenting."
    def __missing__(self, key):
        # *key* is an int codepoint, as used by str.translate. Missing
        # lookups are computed once, cached into the dict, and returned.
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            # e.g. decomposition('é') == '0065 0301' (base char + combiner)
            de = unicodedata.decomposition(chr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                # Combining diaeresis: no generic fallback here — dict.get
                # on a missing key yields None, which str.translate treats
                # as "delete this character". The common diaeresis chars
                # (ä/ö/ü etc.) are preseeded in _BASE_DEACCENT_MAP below,
                # so this branch only affects unseeded ones.
                ch = self.get(key)
            else:
                # Use the base character's codepoint from the decomposition.
                ch = int(p1, 16)
        except (IndexError, ValueError):
            # No decomposition available: map the key to itself (no-op).
            ch = self.get(key, key)
        self[key] = ch
        return ch
# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.
# Keys are int codepoints (str.translate convention); values are the
# replacement strings. German-style digraphs (Ae/Oe/Ue/ss) are used for
# the umlaut vowels and sharp s.
_BASE_DEACCENT_MAP = {
    0xc6: "AE", # Æ LATIN CAPITAL LETTER AE
    0xd0: "D",  # Ð LATIN CAPITAL LETTER ETH
    0xd8: "OE", # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: "Th", # Þ LATIN CAPITAL LETTER THORN
    0xc4: 'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: 'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: 'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: "A",  # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: "A",  # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: "A",  # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: "C",  # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: "E",  # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: "E",  # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: "E",  # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: "I",  # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: "I",  # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: "O",  # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: "O",  # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: "O",  # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: "U",  # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: "U",  # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: "ss", # ß LATIN SMALL LETTER SHARP S
    0xe6: "ae", # æ LATIN SMALL LETTER AE
    0xf0: "d",  # ð LATIN SMALL LETTER ETH
    0xf8: "oe", # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: "th", # þ LATIN SMALL LETTER THORN,
    0xe4: 'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: 'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: 'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: "a",  # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: "a",  # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: "a",  # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: "c",  # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: "e",  # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: "e",  # é LATIN SMALL LETTER E WITH ACUTE
    0xea: "e",  # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: "i",  # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: "i",  # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: "o",  # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: "o",  # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: "o",  # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: "u",  # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: "u",  # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: "'", # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: "'", # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: '"', # “ LEFT DOUBLE QUOTATION MARK
    0x201d: '"', # ” RIGHT DOUBLE QUOTATION MARK
    }


# Wraps the base map so additional decompositions are computed lazily
# and cached on first lookup (see DeaccenterDict.__missing__ above).
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)
545_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
546_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
547_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))
550def bytes2human(nbytes, ndigits=0):
551 """Turns an integer value of *nbytes* into a human readable format. Set
552 *ndigits* to control how many digits after the decimal point
553 should be shown (default ``0``).
555 >>> bytes2human(128991)
556 '126K'
557 >>> bytes2human(100001221)
558 '95M'
559 >>> bytes2human(0, 2)
560 '0.00B'
561 """
562 abs_bytes = abs(nbytes)
563 for (size, symbol), (next_size, next_symbol) in _SIZE_RANGES:
564 if abs_bytes <= next_size:
565 break
566 hnbytes = float(nbytes) / size
567 return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes,
568 ndigits=ndigits,
569 symbol=symbol)
class HTMLTextExtractor(HTMLParser):
    # Accumulates the text content of an HTML document into self.result,
    # resolving character/entity references; used by html2text() below.
    def __init__(self):
        # NOTE(review): deliberately skips HTMLParser.__init__ and calls
        # reset() directly, setting strict/convert_charrefs by hand —
        # presumably to stay compatible across HTMLParser versions; confirm
        # before changing.
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.result = []

    def handle_data(self, d):
        # Plain text between tags.
        self.result.append(d)

    def handle_charref(self, number):
        # Numeric character reference, decimal (&#916;) or hex (&#x394;).
        if number[0] == 'x' or number[0] == 'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(chr(codepoint))

    def handle_entityref(self, name):
        # Named entity, e.g. &amp;. Unknown names are passed through
        # verbatim rather than raising.
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append('&' + name + ';')
        else:
            self.result.append(chr(codepoint))

    def get_text(self):
        # Join all accumulated fragments into the final text.
        return ''.join(self.result)
def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like "&nbsp;"

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394ημώ)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
615_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
616_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'
619def gunzip_bytes(bytestring):
620 """The :mod:`gzip` module is great if you have a file or file-like
621 object, but what if you just have bytes. StringIO is one
622 possibility, but it's often faster, easier, and simpler to just
623 use this one-liner. Use this tried-and-true utility function to
624 decompress gzip from bytes.
626 >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
627 True
628 >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
629 True
630 """
631 return zlib.decompress(bytestring, 16 + zlib.MAX_WBITS)
def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
            speed/compression. 1 is fastest, least compressed, 9 is
            slowest, but most compressed.

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    buf = StringIO()
    # Closing the GzipFile flushes the gzip trailer into the buffer.
    with GzipFile(fileobj=buf, mode='wb', compresslevel=level) as gz:
        gz.write(bytestring)
    return buf.getvalue()
# All standard line terminators: CRLF, LF, VT, FF, CR, NEL (U+0085),
# LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029).
# Bugfix: the last two were previously written r'\x2028'/r'\x2029', which
# the re module parses as '\x20' (a space) followed by literal '28'/'29',
# wrongly splitting text on " 28" / " 29". \u escapes are correct.
_line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\u2028|\u2029)',
                             re.UNICODE)


def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end, len_text = 0, len(text)
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        # Yield the segment between the previous terminator and this one
        # (possibly empty, for consecutive terminators).
        if prev_end <= start:
            yield text[prev_end:start]
        # A terminator at the very end contributes one trailing empty line.
        if end == len_text:
            yield ''
        prev_end = end
    # Any unterminated final segment.
    tail = text[prev_end:]
    if tail:
        yield tail
    return
def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
          indent it. Default: :class:`bool`, to ensure that empty lines do
          not get whitespace added.
    """
    out = []
    for line in iter_splitlines(text):
        out.append(margin + line if key(line) else line)
    return newline.join(out)
def is_uuid(obj, version=4):
    """Check the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    if not isinstance(obj, uuid.UUID):
        try:
            obj = uuid.UUID(obj)
        except (TypeError, ValueError, AttributeError):
            # Not a parseable UUID string (or not a string at all).
            return False
    # version=0 (or other falsy) skips the version check entirely.
    return not version or obj.version == int(version)
def escape_shell_args(args, sep=' ', style=None):
    """Returns an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
          ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
          respectively. If *style* is ``None``, then it is picked
          according to the system platform.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    # Default the style from the current platform when not given.
    chosen = style or ('cmd' if sys.platform == 'win32' else 'sh')
    if chosen == 'sh':
        return args2sh(args, sep=sep)
    if chosen == 'cmd':
        return args2cmd(args, sep=sep)
    raise ValueError("style expected one of 'cmd' or 'sh', not %r" % chosen)
# Finds the first character that is NOT shell-safe; safe args need no quoting.
_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search


def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    Args:
        args (list): A list of argument strings to escape.
        sep (str): Separator joining the escaped arguments (default ``' '``).

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.
    """
    ret_list = []

    for arg in args:
        if not arg:
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            # Entirely safe characters: no quoting needed.
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # Bugfix: previously joined with a hard-coded ' ', silently ignoring
    # the *sep* parameter.
    return sep.join(ret_list)
def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    Args:
        args (list): A list of argument strings to escape.
        sep (str): Separator joining the escaped arguments (default ``' '``).

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.
    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within. A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash. If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        bs_buf = []

        # Separate this argument from the previous one. Bugfix: this
        # previously appended a literal ' ' and ignored *sep*.
        if result:
            result.append(sep)

        # Arguments with whitespace, and empty arguments, must be quoted.
        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double preceding backslashes, then escape the quote.
                result.append('\\' * len(bs_buf)*2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char: pending backslashes are literal.
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            # Trailing backslashes must be doubled before the closing quote
            # (rule 5 above).
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)
def parse_int_list(range_string, delim=',', range_delim='-'):
    """Returns a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]

    """
    ints = []
    for token in range_string.strip().split(delim):
        if not token:
            # skip empty tokens, e.g. from a trailing delimiter
            continue
        if range_delim in token:
            # Contiguous range: expand inclusively, tolerating reversed
            # bounds like '8-5'.
            bounds = [int(part) for part in token.split(range_delim)]
            ints.extend(range(min(bounds), max(bounds) + 1))
        else:
            ints.append(int(token))
    return sorted(ints)
def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Returns a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
           into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
           contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
           range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
           space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'

    """
    output = []
    # contig_range holds the run of contiguous values currently being built.
    contig_range = collections.deque()

    for x in sorted(int_list):

        # Handle current (and first) value.
        if len(contig_range) < 1:
            contig_range.append(x)

        # Handle current value, given multiple previous values are contiguous.
        elif len(contig_range) > 1:
            delta = x - contig_range[-1]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                range_substr = '{:d}{}{:d}'.format(min(contig_range),
                                                   range_delim,
                                                   max(contig_range))
                output.append(range_substr)
                contig_range.clear()
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

        # Handle current value, given no previous contiguous integers
        else:
            delta = x - contig_range[0]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                output.append(f'{contig_range.popleft():d}')
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

    # Handle the last value. NOTE: this is a for/else — the body always
    # runs after the loop (there is no break), flushing whatever remains
    # in contig_range.
    else:

        # Last value is non-contiguous.
        if len(contig_range) == 1:
            output.append(f'{contig_range.popleft():d}')
            contig_range.clear()

        # Last value is part of contiguous range.
        elif len(contig_range) > 1:
            range_substr = '{:d}{}{:d}'.format(min(contig_range),
                                               range_delim,
                                               max(contig_range))
            output.append(range_substr)
            contig_range.clear()

    if delim_space:
        output_str = (delim+' ').join(output)
    else:
        output_str = delim.join(output)

    return output_str
def complement_int_list(
        range_string, range_start=0, range_end=None,
        delim=',', range_delim='-'):
    """ Returns range string that is the complement of the one provided as
    *range_string* parameter.

    These range strings are of the kind produce by :func:`format_int_list`, and
    parseable by :func:`parse_int_list`.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        range_start (int): A positive integer from which to start the resulting
            range. Value is inclusive. Defaults to ``0``.
        range_end (int): A positive integer from which the produced range is
            stopped. Value is exclusive. Defaults to the maximum value found in
            the provided ``range_string``.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> complement_int_list('1,3,5-8,10-11,15')
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=0)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=1)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=2)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=3)
    '4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=15)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=14)
    '0,2,4,9,12-13'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=13)
    '0,2,4,9,12'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=20)
    '0,2,4,9,12-14,16-19'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=0)
    ''

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=-1)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=-1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=1, range_end=1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=2)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=3)
    '2'

    >>> complement_int_list('1,3,5-8', range_start=-10, range_end=-5)
    ''

    >>> complement_int_list('1,3,5-8', range_start=20, range_end=10)
    ''

    >>> complement_int_list('')
    ''
    """
    covered = set(parse_int_list(range_string, delim, range_delim))

    # Default the exclusive upper bound to just past the largest listed
    # value; an empty input yields an empty complement.
    if range_end is None:
        range_end = max(covered) + 1 if covered else range_start

    # Only non-negative integers are ever produced, so a negative
    # range_start is clamped to 0 (mirrors the empty range(range_start)).
    lower_bound = max(range_start, 0)
    missing = {i for i in range(lower_bound, range_end) if i not in covered}
    return format_int_list(missing, delim, range_delim)
def int_ranges_from_int_list(range_string, delim=',', range_delim='-'):
    """ Transform a string of ranges (*range_string*) into a tuple of tuples.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> int_ranges_from_int_list('1,3,5-8,10-11,15')
    ((1, 1), (3, 3), (5, 8), (10, 11), (15, 15))

    >>> int_ranges_from_int_list('1')
    ((1, 1),)

    >>> int_ranges_from_int_list('')
    ()
    """
    # Round-trip through parse/format to get a canonical ','/'-' delimited
    # string regardless of the delimiters the caller used.
    normalized = format_int_list(
        parse_int_list(range_string, delim, range_delim))
    if not normalized:
        return ()

    bounds = []
    for chunk in normalized.split(','):
        # 'lo-hi' splits into both ends; a lone integer has no '-' and
        # partition leaves hi empty, so it serves as both ends.
        lo, _, hi = chunk.partition('-')
        bounds.append((int(lo), int(hi or lo)))
    return tuple(bounds)
class MultiReplace:
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced it allows for all of the matching
    values to be replaced in a single pass which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken')
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order will be non-deterministic::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention and
    if you need to rely on order then consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed.

        Args:
            sub_map: A mapping or iterable of ``(pattern, replacement)``
                pairs. Patterns may be strings or pre-compiled regex
                objects (whose ``.pattern`` attribute is used).
        """
        options = {
            'regex': False,
            'flags': 0,
        }
        options.update(kwargs)
        self.group_map = {}
        regex_values = []

        if isinstance(sub_map, Mapping):
            sub_map = sub_map.items()

        for idx, vals in enumerate(sub_map):
            # Each pattern is wrapped in a uniquely named group so the
            # matched alternative can be identified at substitution time.
            group_name = f'group{idx}'
            if isinstance(vals[0], str):
                # If we're not treating input strings like a regex, escape it
                if not options['regex']:
                    exp = re.escape(vals[0])
                else:
                    exp = vals[0]
            else:
                exp = vals[0].pattern

            regex_values.append(f'(?P<{group_name}>{exp})')
            self.group_map[group_name] = vals[1]

        # All patterns are joined into one alternation; exactly one named
        # group participates in any given match.
        self.combined_pattern = re.compile(
            '|'.join(regex_values),
            flags=options['flags']
        )

    def _get_value(self, match):
        """Given a match object find replacement value."""
        group_dict = match.groupdict()
        # Use an identity check against None rather than truthiness: a
        # regex pattern may legitimately match the empty string, in which
        # case the matched group's value is '' (falsy) and a truthiness
        # test would find no group at all, raising IndexError.
        key = [x for x in group_dict if group_dict[x] is not None][0]
        return self.group_map[key]

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.
        """
        return self.combined_pattern.sub(self._get_value, text)
def multi_replace(text, sub_map, **kwargs):
    """
    Shortcut function to invoke MultiReplace in a single call.

    Example Usage::

        from boltons.stringutils import multi_replace
        new = multi_replace(
            'The foo bar cat ate a bat',
            {'foo': 'zoo', 'cat': 'hat', 'bat': 'kraken'}
        )
        new == 'The zoo bar hat ate a kraken'
    """
    # One-shot convenience wrapper: build the replacer and apply it.
    return MultiReplace(sub_map, **kwargs).sub(text)
def unwrap_text(text, ending='\n\n'):
    r"""
    Unwrap text, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
            by. Pass ``None`` to get the list. Defaults to '\n\n' for
            compatibility with Markdown and RST.

    """
    paragraphs, current = [], []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # A blank line ends the current paragraph (even if it's empty,
            # so consecutive blank lines are preserved in the output).
            paragraphs.append(' '.join(current))
            current = []
        else:
            current.append(stripped)
    # Flush a trailing paragraph that wasn't terminated by a blank line.
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs if ending is None else ending.join(paragraphs)