Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/boltons/strutils.py: 22%
405 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:13 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:13 +0000
1# -*- coding: utf-8 -*-
3# Copyright (c) 2013, Mahmoud Hashemi
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9# * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#
12# * Redistributions in binary form must reproduce the above
13# copyright notice, this list of conditions and the following
14# disclaimer in the documentation and/or other materials provided
15# with the distribution.
16#
17# * The names of the contributors may not be used to endorse or
18# promote products derived from this software without specific
19# prior written permission.
20#
21# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33"""So much practical programming involves string manipulation, which
34Python readily accommodates. Still, there are dozens of basic and
35common capabilities missing from the standard library, several of them
36provided by ``strutils``.
37"""
39from __future__ import print_function
41import re
42import sys
43import uuid
44import zlib
45import string
46import unicodedata
47import collections
48from gzip import GzipFile
50try:
51 from cStringIO import cStringIO as StringIO
52except ImportError:
53 from io import BytesIO as StringIO
55try:
56 from collections.abc import Mapping
57except ImportError:
58 from collections import Mapping
60try:
61 unicode, str, bytes, basestring = unicode, str, str, basestring
62 from HTMLParser import HTMLParser
63 import htmlentitydefs
64except NameError: # basestring not defined in Python 3
65 unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes)
66 unichr = chr
67 from html.parser import HTMLParser
68 from html import entities as htmlentitydefs
70try:
71 import __builtin__ as builtins
72except ImportError:
73 import builtins
# Public names exported by `from boltons.strutils import *`.
# NOTE(review): 'int_list_complement' and 'int_list_to_int_tuples' do not
# match any definition visible in this file (cf. `complement_int_list`,
# defined below) -- confirm these names actually resolve at import time,
# otherwise star-imports will raise AttributeError.
__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list',
           'int_list_complement', 'int_list_to_int_tuples', 'MultiReplace',
           'multi_replace', 'unwrap_text']
# Characters treated as word separators by slugify/split_punct_ws below.
_punct_ws_str = string.punctuation + string.whitespace
_punct_re = re.compile('[' + _punct_ws_str + ']+')
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')


def camel2under(camel_string):
    """Converts a camelcased string to underscores. Useful for turning a
    class name into a function name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    # insert an underscore before each case transition, then lowercase
    underscored = _camel2under_re.sub(r'_\1', camel_string)
    return underscored.lower()
def under2camel(under_string):
    """Converts an underscored string to camelcased. Useful for turning a
    function name into a class name.

    >>> under2camel('complex_tokenizer')
    'ComplexTokenizer'
    """
    # empty segments (from leading/doubled underscores) are kept as '_'
    parts = under_string.split('_')
    return ''.join([p.capitalize() or '_' for p in parts])
def slugify(text, delim='_', lower=True, ascii=False):
    """
    A basic function that turns text full of scary characters
    (i.e., punctuation and whitespace), into a relatively safe
    lowercased string separated only by the delimiter specified
    by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1    ')
    'first_post_hi_1'
    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True

    """
    if text:
        # all-punctuation input would join to '', so fall back to delim
        ret = delim.join(split_punct_ws(text)) or delim
    else:
        ret = ''
    if ascii:
        ret = asciify(ret)
    if lower:
        ret = ret.lower()
    return ret
def split_punct_ws(text):
    """While :meth:`str.split` will split on whitespace,
    :func:`split_punct_ws` will split on punctuation and
    whitespace. This is used internally by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1    ')
    ['First', 'post', 'Hi', '1']
    """
    # filter(None, ...) discards the empty strings left by leading,
    # trailing, or adjacent separator runs
    return list(filter(None, _punct_re.split(text)))
def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Returns a plain-English description of an iterable's
    :func:`len()`, conditionally pluralized with :func:`cardinalize`,
    detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    noun = cardinalize(unit_noun, count)
    # zero reads as "No <plural>" rather than "0 <plural>"
    quantity = u'%s' % (count,) if count else u'No'
    return u'%s %s' % (quantity, noun)
# Ordinal suffixes for final digits 1, 2, 3; every other digit (and the
# teens, handled separately in ordinalize) takes 'th'
_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default
def ordinalize(number, ext_only=False):
    """Turns *number* into its ordinal form, i.e., 1st, 2nd,
    3rd, 4th, etc. If the last character isn't a digit, it returns the
    string value unchanged.

    Args:
        number (int or str): Number to be ordinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr = unicode(number)
    ext = ''
    if numstr and numstr[-1] in string.digits:
        # the teens (11th, 12th, 13th, 1512th, ...) always take 'th';
        # the slice is '' for single-digit input, which safely skips this
        if numstr[-2:-1] == '1':
            ext = 'th'
        else:
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    if ext_only:
        return ext
    return numstr + ext
def cardinalize(unit_noun, count):
    """Conditionally pluralizes a singular word *unit_noun* if
    *count* is not one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    # note: 0 and negatives pluralize, matching English usage
    return unit_noun if count == 1 else pluralize(unit_noun)
def singularize(word):
    """Semi-intelligently converts an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'

    """
    # normalize for lookups, but keep the original for case restoration
    orig_word, word = word, word.strip().lower()
    if not word or word in _IRR_S2P:
        # empty input, or already a known singular -- return untouched
        return orig_word

    irr_singular = _IRR_P2S.get(word)
    if irr_singular:
        singular = irr_singular
    elif not word.endswith('s'):
        # doesn't look plural at all
        return orig_word
    elif len(word) == 2:
        singular = word[:-1]  # or just return word?
    elif word.endswith('ies') and word[-4:-3] not in 'aeiou':
        # consonant + 'ies' -> 'y' (e.g., 'activities' -> 'activity').
        # Note word[-4:-3] is '' for 3-letter words, and '' is "in" every
        # string, so those fall through to the generic strip-'s' branch.
        singular = word[:-3] + 'y'
    elif word.endswith('es') and word[-3] == 's':
        # 'ss' + 'es' endings: 'glasses' -> 'glass'
        singular = word[:-2]
    else:
        singular = word[:-1]
    return _match_case(orig_word, singular)
def pluralize(word):
    """Semi-intelligently converts an English *word* from singular form to
    plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    orig_word = word
    word = word.strip().lower()
    # empty input, or already a known plural: pass through untouched
    if not word or word in _IRR_P2S:
        return orig_word
    plural = _IRR_S2P.get(word)
    if not plural:
        if word.endswith('y') and word[-2:-1] not in 'aeiou':
            # consonant + 'y' -> 'ies'
            plural = word[:-1] + 'ies'
        elif word.endswith(('s', 'ch', 'sh')):
            # sibilant endings take 'es' (unless they already have it)
            plural = word if word.endswith('es') else word + 'es'
        else:
            plural = word + 's'
    return _match_case(orig_word, plural)
282def _match_case(master, disciple):
283 if not master.strip():
284 return disciple
285 if master.lower() == master:
286 return disciple.lower()
287 elif master.upper() == master:
288 return disciple.upper()
289 elif master.title() == master:
290 return disciple.title()
291 return disciple
294# Singular to plural map of irregular pluralizations
295_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
296 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
297 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
298 'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
299 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
300 'calf': 'calves', 'child': 'children', 'corps': 'corps',
301 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
302 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
303 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
304 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
305 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
306 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
307 'foot': 'feet', 'formula': 'formulae', 'formula': 'formulas',
308 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
309 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
310 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
311 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
312 'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
313 'matrix': 'matrices', 'means': 'means', 'medium': 'media',
314 'memorandum': 'memoranda', 'millennium': 'milennia', 'moose': 'moose',
315 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
316 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
317 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
318 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses',
319 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes',
320 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
321 'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep':
322 'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus':
323 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium':
324 'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses',
325 'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
326 'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth':
327 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto':
328 'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives',
329 'wolf': 'wolves', 'woman': 'women'}
# Reverse index of the above: plural form -> singular form
_IRR_P2S = dict((plural, singular) for singular, plural in _IRR_S2P.items())

# A hashmark (ASCII '#' or full-width '＃') at start-of-string or after
# whitespace, capturing the tag's word characters
HASHTAG_RE = re.compile(r"(?:^|\s)[##]{1}(\w+)", re.UNICODE)
def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """
    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    tags = HASHTAG_RE.findall(string)
    return tags
def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    # too short to abbreviate
    if len(string) < 3:
        return string
    interior_len = len(string) - 2
    return '%s%s%s' % (string[0], interior_len, string[-1])
# Based on https://en.wikipedia.org/wiki/ANSI_escape_code#Escape_sequences
# Matches one complete ANSI escape sequence; consumed by strip_ansi() below.
ANSI_SEQUENCES = re.compile(r'''
    \x1B             # Sequence starts with ESC, i.e. hex 0x1B
    (?:
        [@-Z\\-_]    # Second byte:
                     #   all 0x40-0x5F range but CSI char, i.e ASCII @A-Z\]^_
    |                # Or
        \[           # CSI sequences, starting with [
        [0-?]*       # Parameter bytes:
                     #   range 0x30-0x3F, ASCII 0-9:;<=>?
        [ -/]*       # Intermediate bytes:
                     #   range 0x20-0x2F, ASCII space and !"#$%&'()*+,-./
        [@-~]        # Final byte
                     #   range 0x40-0x7E, ASCII @A-Z[\]^_`a-z{|}~
    )
''', re.VERBOSE)
def strip_ansi(text):
    """Strips ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m')
    'art'

    Supports unicode, str, bytes and bytearray content as input. Returns the
    same type as the input.

    There's a lot of ANSI art available for testing on `sixteencolors.net`_.
    This function does not interpret or render ANSI art, but you can do so with
    `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py
    # Binary input is decoded so the (text) regex can run; remember the
    # original type so it can be restored on the way out.
    target_type = None
    # on Python 3 the module-level `unicode` alias points at builtins.str
    if (unicode == builtins.str) and isinstance(text, (bytes, bytearray)):
        target_type = type(text)
        text = text.decode('utf-8')

    cleaned = ANSI_SEQUENCES.sub('', text)

    # re-encode back to the bytes/bytearray type the caller passed in
    if target_type and target_type != type(cleaned):
        cleaned = target_type(cleaned, 'utf-8')
    return cleaned
def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str or unicode): The string to be asciified.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified unicode instead of replacing it.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            # fast path: already pure ascii
            return text.encode('ascii')
        except UnicodeDecodeError:
            # this usually means you passed in a non-unicode string
            # NOTE(review): this branch looks py2-specific; on py3, bytes
            # input would raise AttributeError on .encode -- confirm
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        # non-ascii present: deaccent via DEACCENT_MAP, then NFKD-decompose
        # so remaining combining marks get dropped/replaced by the encode
        mode = 'replace'
        if ignore:
            mode = 'ignore'
        transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP))
        ret = transd.encode('ascii', mode)
        return ret
def is_ascii(text):
    """Check if a unicode or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str or unicode): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, unicode):
        try:
            text.encode('ascii')
            return True
        except UnicodeEncodeError:
            return False
    if isinstance(text, bytes):
        try:
            text.decode('ascii')
            return True
        except UnicodeDecodeError:
            return False
    raise ValueError('expected text or bytes, not %r' % type(text))
class DeaccenterDict(dict):
    """A small caching dictionary for deaccenting.

    Maps an int codepoint to a replacement (a codepoint int, a string,
    or ``None`` to delete), suitable for :meth:`unicode.translate`.
    Unknown keys are computed lazily from Unicode decomposition data and
    cached in the dict itself.
    """
    def __missing__(self, key):
        # an explicitly seeded value (see _BASE_DEACCENT_MAP) wins
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            # e.g. u'é' decomposes to '0065 0301' (base char + combining)
            de = unicodedata.decomposition(unichr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                # combining diaeresis: deliberately NOT reduced to the bare
                # base letter; seeded umlaut entries handle the common cases.
                # NOTE(review): this leaves ch as None, which translate()
                # treats as "delete this char" -- confirm that's intended.
                ch = self.get(key)
            else:
                # use the base character's codepoint
                ch = int(p1, 16)
        except (IndexError, ValueError):
            # no decomposition data (or a non-hex '<compat>' tag):
            # leave the character as-is
            ch = self.get(key, key)
        self[key] = ch  # cache the computed mapping for next time
        return ch

    # defaultdict's availability doubles as a probe for __missing__
    # support; on ancient Pythons without it, emulate via __getitem__.
    try:
        from collections import defaultdict
    except ImportError:
        # no defaultdict means that __missing__ isn't supported in
        # this version of python, so we define __getitem__
        def __getitem__(self, key):
            try:
                return super(DeaccenterDict, self).__getitem__(key)
            except KeyError:
                return self.__missing__(key)
    else:
        del defaultdict
# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.
# Keys are int codepoints; values are replacement strings. Entries here
# take precedence over the decomposition-based fallback in DeaccenterDict.
_BASE_DEACCENT_MAP = {
    0xc6: u"AE", # Æ LATIN CAPITAL LETTER AE
    0xd0: u"D", # Ð LATIN CAPITAL LETTER ETH
    0xd8: u"OE", # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th", # Þ LATIN CAPITAL LETTER THORN
    0xc4: u'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: u'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: u'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: u"A", # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: u"A", # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: u"A", # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: u"C", # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: u"E", # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: u"E", # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: u"E", # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: u"I", # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: u"I", # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: u"O", # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: u"O", # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: u"O", # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: u"U", # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: u"U", # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: u"ss", # ß LATIN SMALL LETTER SHARP S
    0xe6: u"ae", # æ LATIN SMALL LETTER AE
    0xf0: u"d", # ð LATIN SMALL LETTER ETH
    0xf8: u"oe", # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th", # þ LATIN SMALL LETTER THORN,
    0xe4: u'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: u'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: u'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: u"a", # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: u"a", # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: u"a", # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: u"c", # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: u"e", # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: u"e", # é LATIN SMALL LETTER E WITH ACUTE
    0xea: u"e", # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: u"i", # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: u"i", # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: u"o", # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: u"o", # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: u"o", # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: u"u", # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: u"u", # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: u"'", # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: u"'", # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: u'"', # “ LEFT DOUBLE QUOTATION MARK
    0x201d: u'"', # ” RIGHT DOUBLE QUOTATION MARK
    }

# The live, lazily-growing map used by asciify()'s translate() call
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)
# Human-readable size unit symbols, ascending powers of 1024
_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
# (1024**i, symbol) pairs: (1, 'B'), (1024, 'K'), (1048576, 'M'), ...
_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
# adjacent (lower, upper) bound pairs; bytes2human() scans these to pick
# the bucket a value falls into
_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))
def bytes2human(nbytes, ndigits=0):
    """Turns an integer value of *nbytes* into a human readable format. Set
    *ndigits* to control how many digits after the decimal point
    should be shown (default ``0``).

    >>> bytes2human(128991)
    '126K'
    >>> bytes2human(100001221)
    '95M'
    >>> bytes2human(0, 2)
    '0.00B'
    """
    abs_bytes = abs(nbytes)
    # first bucket whose upper bound holds the value; values beyond all
    # bounds fall back to the last range's lower bucket
    chosen = next((head for head, (upper, _sym) in _SIZE_RANGES
                   if abs_bytes <= upper),
                  _SIZE_RANGES[-1][0])
    size, symbol = chosen
    hnbytes = float(nbytes) / size
    return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes,
                                                  ndigits=ndigits,
                                                  symbol=symbol)
class HTMLTextExtractor(HTMLParser):
    # Accumulates character data and resolved entity/char references
    # while discarding all markup; used by html2text() below.
    def __init__(self):
        # NOTE(review): HTMLParser.__init__ is deliberately not called;
        # reset() plus the attributes below stand in for it -- confirm
        # against both the py2 and py3 HTMLParser implementations.
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.result = []

    def handle_data(self, d):
        # plain text between tags
        self.result.append(d)

    def handle_charref(self, number):
        # numeric character reference, decimal (&#916;) or hex (&#x394;)
        if number[0] == u'x' or number[0] == u'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        # named entity, e.g. &amp;; unknown names pass through verbatim
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append(u'&' + name + u';')
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        # all collected fragments, concatenated in document order
        return u''.join(self.result)
def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like "&nbsp;"

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394\u03b7\u03bc\u03ce)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
# Canned gzip streams (with embedded filename headers) used by the
# gunzip_bytes doctests below
_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'
def gunzip_bytes(bytestring):
    """The :mod:`gzip` module is great if you have a file or file-like
    object, but what if you just have bytes. StringIO is one
    possibility, but it's often faster, easier, and simpler to just
    use this one-liner. Use this tried-and-true utility function to
    decompress gzip from bytes.

    >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
    True
    >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
    True
    """
    # a wbits of MAX_WBITS + 16 tells zlib to expect a gzip container
    gzip_wbits = 16 + zlib.MAX_WBITS
    return zlib.decompress(bytestring, gzip_wbits)
def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
            speed/compression. 1 is fastest, least compressed, 9 is
            slowest, but most compressed.

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    buf = StringIO()  # BytesIO on py3; see the module-level import shim
    gzf = GzipFile(fileobj=buf, mode='wb', compresslevel=level)
    gzf.write(bytestring)
    gzf.close()  # close flushes the gzip trailer into the buffer
    return buf.getvalue()
693_line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\x2028|\x2029)',
694 re.UNICODE)
def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end, len_text = 0, len(text)
    # print('last: %r' % last_idx)
    # start, end = None, None
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        # print(start, end)
        # yield the text between the previous line ending and this one
        # (adjacent endings yield an empty line)
        if prev_end <= start:
            yield text[prev_end:start]
        # unlike str.splitlines, a line ending at the very end of the
        # text yields one final empty line (see doctests above)
        if end == len_text:
            yield ''
        prev_end = end
    # whatever trails the last line ending, if anything
    tail = text[prev_end:]
    if tail:
        yield tail
    return
def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
            indent it. Default: :class:`bool`, to ensure that empty lines do
            not get whitespace added.
    """
    out_lines = []
    for line in iter_splitlines(text):
        out_lines.append(margin + line if key(line) else line)
    return newline.join(out_lines)
def is_uuid(obj, version=4):
    """Check the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    if not isinstance(obj, uuid.UUID):
        try:
            obj = uuid.UUID(obj)
        except (TypeError, ValueError, AttributeError):
            # unparseable input of any kind is simply "not a uuid"
            return False
    # a falsy *version* (e.g. 0) skips the version check entirely
    return not version or obj.version == int(version)
def escape_shell_args(args, sep=' ', style=None):
    """Returns an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
            ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
            respectively. If *style* is ``None``, then it is picked
            according to the system platform.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    if not style:
        # default to the convention of the current platform
        style = 'cmd' if sys.platform == 'win32' else 'sh'

    if style == 'sh':
        return args2sh(args, sep=sep)
    if style == 'cmd':
        return args2cmd(args, sep=sep)
    raise ValueError("style expected one of 'cmd' or 'sh', not %r" % style)
# Any character outside this safe set forces an argument to be quoted
_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search


def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.

    Args:
        args (list): Strings to escape.
        sep (str): Separator joined between the escaped arguments.
    """
    ret_list = []

    for arg in args:
        if not arg:
            # empty arguments must still be visible, as ''
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            # nothing needing quotes
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # bugfix: previously hardcoded ' ', silently ignoring the *sep* argument
    return sep.join(ret_list)
def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.

    Args:
        args (list): Strings to escape.
        sep (str): Separator joined between the escaped arguments.
    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within. A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash. If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        bs_buf = []  # run of backslashes pending classification

        # Add the separator before every argument but the first
        # (bugfix: previously hardcoded ' ', ignoring the *sep* argument)
        if result:
            result.append(sep)

        # whitespace (or emptiness) forces the whole argument into quotes
        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double backslashes.
                result.append('\\' * len(bs_buf)*2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            # double trailing backslashes so the closing quote survives
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)
def parse_int_list(range_string, delim=',', range_delim='-'):
    """Returns a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]

    """
    nums = []
    for chunk in range_string.strip().split(delim):
        if not chunk:
            # skip empty tokens from stray/trailing delimiters
            continue
        if range_delim in chunk:
            # a contiguous range; bounds may appear in either order
            bounds = [int(b) for b in chunk.split(range_delim)]
            nums.extend(range(min(bounds), max(bounds) + 1))
        else:
            nums.append(int(chunk))
    return sorted(nums)
def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Returns a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
           into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
           contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
           range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
           space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'

    """
    output = []
    # holds the run of contiguous values currently being accumulated
    contig_range = collections.deque()

    for x in sorted(int_list):

        # Handle current (and first) value.
        if len(contig_range) < 1:
            contig_range.append(x)

        # Handle current value, given multiple previous values are contiguous.
        elif len(contig_range) > 1:
            delta = x - contig_range[-1]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                # flush the completed run as 'min-max'
                range_substr = '{0:d}{1}{2:d}'.format(min(contig_range),
                                                      range_delim,
                                                      max(contig_range))
                output.append(range_substr)
                contig_range.clear()
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

        # Handle current value, given no previous contiguous integers
        else:
            delta = x - contig_range[0]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                # flush the lone value as a plain integer
                output.append('{0:d}'.format(contig_range.popleft()))
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

    # Handle the last value.
    # (for/else: this runs once the loop finishes -- there is no break
    # above -- flushing whatever remains in contig_range)
    else:

        # Last value is non-contiguous.
        if len(contig_range) == 1:
            output.append('{0:d}'.format(contig_range.popleft()))
            contig_range.clear()

        # Last value is part of contiguous range.
        elif len(contig_range) > 1:
            range_substr = '{0:d}{1}{2:d}'.format(min(contig_range),
                                                  range_delim,
                                                  max(contig_range))
            output.append(range_substr)
            contig_range.clear()

    if delim_space:
        output_str = (delim+' ').join(output)
    else:
        output_str = delim.join(output)

    return output_str
def complement_int_list(
        range_string, range_start=0, range_end=None,
        delim=',', range_delim='-'):
    """ Returns range string that is the complement of the one provided as
    *range_string* parameter.

    These range strings are of the kind produce by :func:`format_int_list`, and
    parseable by :func:`parse_int_list`.

    Args:
        range_string (str): String of comma separated positive integers or
           ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
           used in printer dialogs.
        range_start (int): A positive integer from which to start the resulting
           range. Value is inclusive. Defaults to ``0``.
        range_end (int): A positive integer from which the produced range is
           stopped. Value is exclusive. Defaults to the maximum value found in
           the provided ``range_string``.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
           of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
           integers.

    >>> complement_int_list('1,3,5-8,10-11,15')
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=0)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=1)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=2)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=3)
    '4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=15)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=14)
    '0,2,4,9,12-13'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=13)
    '0,2,4,9,12'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=20)
    '0,2,4,9,12-14,16-19'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=0)
    ''

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=-1)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=-1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=1, range_end=1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=2)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=3)
    '2'

    >>> complement_int_list('1,3,5-8', range_start=-10, range_end=-5)
    ''

    >>> complement_int_list('1,3,5-8', range_start=20, range_end=10)
    ''

    >>> complement_int_list('')
    ''
    """
    present = set(parse_int_list(range_string, delim, range_delim))

    # Default end bound: one past the largest value found in the input
    # (so that value itself is still in range); degenerate if input is empty.
    if range_end is None:
        range_end = max(present) + 1 if present else range_start

    # Negative starts behave like 0; a reversed or empty interval yields ''.
    lower = max(range_start, 0)
    missing = set(range(lower, range_end)) - present
    return format_int_list(missing, delim, range_delim)
def int_ranges_from_int_list(range_string, delim=',', range_delim='-'):
    """ Transform a string of ranges (*range_string*) into a tuple of tuples.

    Args:
        range_string (str): String of comma separated positive integers or
           ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
           used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
           of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
           integers.

    >>> int_ranges_from_int_list('1,3,5-8,10-11,15')
    ((1, 1), (3, 3), (5, 8), (10, 11), (15, 15))

    >>> int_ranges_from_int_list('1')
    ((1, 1),)

    >>> int_ranges_from_int_list('')
    ()
    """
    bounds = []
    # Walk the deduplicated, ascending values and either extend the current
    # contiguous (start, end) pair or open a fresh single-value pair.
    for value in sorted(set(parse_int_list(range_string, delim, range_delim))):
        if bounds and value == bounds[-1][1] + 1:
            bounds[-1] = (bounds[-1][0], value)
        else:
            bounds.append((value, value))
    return tuple(bounds)
class MultiReplace(object):
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced it allows for all of the matching
    values to be replaced in a single pass which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken)'
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order will be non-deterministic::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention and
    if you need to rely on order then consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed."""
        opts = {'regex': False, 'flags': 0}
        opts.update(kwargs)
        self.group_map = {}
        alternatives = []

        # Normalize a mapping to (search, replacement) pairs; an iterable of
        # pairs is consumed in its given order.
        pairs = sub_map.items() if isinstance(sub_map, Mapping) else sub_map

        for idx, pair in enumerate(pairs):
            group_name = 'group{0}'.format(idx)
            target = pair[0]
            if isinstance(target, basestring):
                # Plain strings are escaped unless regex mode is requested.
                exp = target if opts['regex'] else re.escape(target)
            else:
                # Pre-compiled pattern objects contribute their raw pattern.
                exp = target.pattern
            alternatives.append('(?P<{}>{})'.format(group_name, exp))
            self.group_map[group_name] = pair[1]

        # One alternation covers every search key, so .sub runs a single pass.
        self.combined_pattern = re.compile('|'.join(alternatives),
                                           flags=opts['flags'])

    def _get_value(self, match):
        """Given a match object find replacement value."""
        groups = match.groupdict()
        # Exactly one alternative matched; its group holds non-empty text.
        matched_name = [name for name in groups if groups[name]][0]
        return self.group_map[matched_name]

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.
        """
        return self.combined_pattern.sub(self._get_value, text)
def multi_replace(text, sub_map, **kwargs):
    """
    Shortcut function to invoke MultiReplace in a single call.

    Example Usage::

        from boltons.stringutils import multi_replace
        new = multi_replace(
            'The foo bar cat ate a bat',
            {'foo': 'zoo', 'cat': 'hat', 'bat': 'kraken'}
        )
        new == 'The zoo bar hat ate a kraken'
    """
    # Build a throwaway MultiReplace and apply it immediately.
    return MultiReplace(sub_map, **kwargs).sub(text)
def unwrap_text(text, ending='\n\n'):
    r"""
    Unwrap text, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
           by. Pass ``None`` to get the list. Defaults to '\n\n' for
           compatibility with Markdown and RST.

    """
    paragraphs = []
    current_lines = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            # Still inside a paragraph: accumulate its wrapped lines.
            current_lines.append(stripped)
        else:
            # Blank line ends the paragraph (possibly an empty one).
            paragraphs.append(' '.join(current_lines))
            current_lines = []
    if current_lines:
        # Text that does not end with a blank line still forms a paragraph.
        paragraphs.append(' '.join(current_lines))
    return paragraphs if ending is None else ending.join(paragraphs)