1# Copyright (c) 2013, Mahmoud Hashemi
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are
5# met:
6#
7# * Redistributions of source code must retain the above copyright
8# notice, this list of conditions and the following disclaimer.
9#
10# * Redistributions in binary form must reproduce the above
11# copyright notice, this list of conditions and the following
12# disclaimer in the documentation and/or other materials provided
13# with the distribution.
14#
15# * The names of the contributors may not be used to endorse or
16# promote products derived from this software without specific
17# prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""So much practical programming involves string manipulation, which
32Python readily accommodates. Still, there are dozens of basic and
33common capabilities missing from the standard library, several of them
34provided by ``strutils``.
35"""
36
37
38import builtins
39import re
40import sys
41import uuid
42import zlib
43import string
44import unicodedata
45import collections
46from collections.abc import Mapping
47from gzip import GzipFile
48from html.parser import HTMLParser
49from html import entities as htmlentitydefs
50from io import BytesIO as StringIO
51
52
# Explicit public API; star-imports and the docs build both rely on this
# list, so new utilities must be added here as well.
__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list',
           'complement_int_list', 'int_ranges_from_int_list', 'MultiReplace',
           'multi_replace', 'unwrap_text', 'removeprefix']
61
62
# Characters treated as word separators by split_punct_ws/slugify:
# all ASCII punctuation plus whitespace.
_punct_ws_str = string.punctuation + string.whitespace
_punct_re = re.compile('[' + _punct_ws_str + ']+')
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')


def camel2under(camel_string):
    """Converts a camelcased string to underscores. Useful for turning a
    class name into a function name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    # insert an underscore before each camel-case word boundary,
    # then lowercase the whole result
    underscored = _camel2under_re.sub(r'_\1', camel_string)
    return underscored.lower()
76
77
def under2camel(under_string):
    """Converts an underscored string to camelcased. Useful for turning a
    function name into a class name.

    >>> under2camel('complex_tokenizer')
    'ComplexTokenizer'
    """
    parts = []
    for segment in under_string.split('_'):
        # an empty segment (from leading/trailing/doubled underscores)
        # is preserved as a literal underscore
        parts.append(segment.capitalize() or '_')
    return ''.join(parts)
86
87
def slugify(text, delim='_', lower=True, ascii=False):
    """
    A basic function that turns text full of scary characters
    (i.e., punctuation and whitespace), into a relatively safe
    lowercased string separated only by the delimiter specified
    by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1    ')
    'first_post_hi_1'

    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True

    """
    if not text:
        return ''
    # an input of pure punctuation/whitespace collapses to a bare delim
    ret = delim.join(split_punct_ws(text)) or delim
    if ascii:
        ret = asciify(ret)
    if lower:
        ret = ret.lower()
    return ret
112
113
def split_punct_ws(text):
    """While :meth:`str.split` will split on whitespace,
    :func:`split_punct_ws` will split on punctuation and
    whitespace. This is used internally by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1    ')
    ['First', 'post', 'Hi', '1']
    """
    # filter(None, ...) drops the empty strings produced by leading or
    # trailing separators
    return list(filter(None, _punct_re.split(text)))
123
124
def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Returns a plain-English description of an iterable's
    :func:`len()`, conditionally pluralized with :func:`cardinalize`,
    detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    units = cardinalize(unit_noun, count)
    # a zero count reads as "No <plural>" rather than "0 <plural>"
    prefix = str(count) if count else 'No'
    return f'{prefix} {units}'
142
143
# Suffixes for ordinals ending in 1, 2, 3; every other digit takes 'th'.
_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default


def ordinalize(number, ext_only=False):
    """Turns *number* into its ordinal form, i.e., 1st, 2nd,
    3rd, 4th, etc. If the last character isn't a digit, it returns the
    string value unchanged.

    Args:
        number (int or str): Number to be ordinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr = str(number)
    ext = ''
    if numstr and numstr[-1] in string.digits:
        # the teens (11, 12, 13 — and 111, 212, ...) all take 'th'
        if len(numstr) > 1 and numstr[-2] == '1':
            ext = 'th'
        else:
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    return ext if ext_only else numstr + ext
183
184
def cardinalize(unit_noun, count):
    """Conditionally pluralizes a singular word *unit_noun* if
    *count* is not one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    # exactly one keeps the noun singular; any other count pluralizes
    return unit_noun if count == 1 else pluralize(unit_noun)
198
199
def singularize(word):
    """Semi-intelligently converts an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'

    """
    orig_word = word
    word = word.strip().lower()
    # already-singular irregulars pass through untouched
    if not word or word in _IRR_S2P:
        return orig_word

    irr_singular = _IRR_P2S.get(word)
    if irr_singular:
        singular = irr_singular
    elif not word.endswith('s'):
        # not a recognizable plural form
        return orig_word
    elif len(word) == 2:
        singular = word[:-1]  # or just return word?
    elif word.endswith('ies') and word[-4:-3] not in 'aeiou':
        # consonant + 'ies', e.g. 'activities' -> 'activity'
        singular = word[:-3] + 'y'
    elif word.endswith('es') and word[-3] == 's':
        # sibilant + 'es', e.g. 'glasses' -> 'glass'
        singular = word[:-2]
    else:
        singular = word[:-1]
    return _match_case(orig_word, singular)
232
233
def pluralize(word):
    """Semi-intelligently converts an English *word* from singular form to
    plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    orig_word = word
    word = word.strip().lower()
    # already-plural irregulars pass through untouched
    if not word or word in _IRR_P2S:
        return orig_word
    irr_plural = _IRR_S2P.get(word)
    if irr_plural:
        plural = irr_plural
    elif word.endswith('y') and word[-2:-1] not in 'aeiou':
        # consonant + 'y' -> 'ies', e.g. 'enemy' -> 'enemies'
        plural = word[:-1] + 'ies'
    elif word[-1] == 's' or word.endswith('ch') or word.endswith('sh'):
        # sibilant endings take 'es' (unless the word already ends in 'es')
        plural = word if word.endswith('es') else word + 'es'
    else:
        plural = word + 's'
    return _match_case(orig_word, plural)
258
259
260def _match_case(master, disciple):
261 if not master.strip():
262 return disciple
263 if master.lower() == master:
264 return disciple.lower()
265 elif master.upper() == master:
266 return disciple.upper()
267 elif master.title() == master:
268 return disciple.title()
269 return disciple
270
271
272# Singular to plural map of irregular pluralizations
273_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
274 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
275 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
276 'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
277 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
278 'calf': 'calves', 'child': 'children', 'corps': 'corps',
279 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
280 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
281 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
282 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
283 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
284 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
285 'foot': 'feet', 'formula': 'formulae', 'formula': 'formulas',
286 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
287 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
288 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
289 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
290 'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
291 'matrix': 'matrices', 'means': 'means', 'medium': 'media',
292 'memorandum': 'memoranda', 'millennium': 'milennia', 'moose': 'moose',
293 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
294 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
295 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
296 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses',
297 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes',
298 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
299 'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep':
300 'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus':
301 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium':
302 'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses',
303 'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
304 'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth':
305 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto':
306 'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives',
307 'wolf': 'wolves', 'woman': 'women'}
308
309
310# Reverse index of the above
311_IRR_P2S = {v: k for k, v in _IRR_S2P.items()}
312
# Matches a hashmark (ASCII '#' or full-width '＃', for Asian-language
# text) at the start of the string or after whitespace, capturing the
# word characters that follow. The whitespace anchor is what keeps URL
# fragments like 'http://site/#anchor' from matching.
HASHTAG_RE = re.compile(r"(?:^|\s)[#＃]{1}(\w+)", re.UNICODE)


def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """

    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    return HASHTAG_RE.findall(string)
331
332
def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    # strings shorter than three characters have no interior to abbreviate
    if len(string) < 3:
        return string
    # first letter + count of interior letters + last letter
    return string[0] + str(len(string) - 2) + string[-1]
350
351
# Based on https://en.wikipedia.org/wiki/ANSI_escape_code#Escape_sequences
ANSI_SEQUENCES = re.compile(r'''
    \x1B            # Sequence starts with ESC, i.e. hex 0x1B
    (?:
        [@-Z\\-_]   # Second byte:
                    #   all 0x40-0x5F range but CSI char, i.e ASCII @A-Z\]^_
    |               # Or
        \[          # CSI sequences, starting with [
        [0-?]*      # Parameter bytes:
                    #   range 0x30-0x3F, ASCII 0-9:;<=>?
        [ -/]*      # Intermediate bytes:
                    #   range 0x20-0x2F, ASCII space and !"#$%&'()*+,-./
        [@-~]       # Final byte
                    #   range 0x40-0x7E, ASCII @A-Z[\]^_`a-z{|}~
    )
''', re.VERBOSE)


def strip_ansi(text):
    """Strips ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m')
    'art'

    Supports str, bytes and bytearray content as input. Returns the
    same type as the input.

    There's a lot of ANSI art available for testing on `sixteencolors.net`_.
    This function does not interpret or render ANSI art, but you can do so with
    `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py

    # Binary inputs are decoded so the regex can run on text, then
    # re-encoded into the original type before returning.
    orig_type = type(text) if isinstance(text, (bytes, bytearray)) else None
    if orig_type is not None:
        text = text.decode('utf-8')

    cleaned = ANSI_SEQUENCES.sub('', text)

    if orig_type is not None and not isinstance(cleaned, orig_type):
        cleaned = orig_type(cleaned, 'utf-8')

    return cleaned
406
407
def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str): The string to be asciified.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified string instead of replacing it.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            return text.encode('ascii')
        except UnicodeDecodeError:
            # this usually means you passed in a non-unicode string
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        # non-ascii characters present: deaccent via the translation
        # table, NFKD-decompose, then drop or replace what remains
        mode = 'ignore' if ignore else 'replace'
        transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP))
        return transd.encode('ascii', mode)
439
440
def is_ascii(text):
    """Check if a string or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, (str, bytes)):
        # str.isascii()/bytes.isascii() (3.7+) do in one C-level pass
        # what the previous encode/decode round-trip probed for
        return text.isascii()
    raise ValueError('expected text or bytes, not %r' % type(text))
466
467
class DeaccenterDict(dict):
    "A small caching dictionary for deaccenting."
    def __missing__(self, key):
        # Called by str.translate() (via __getitem__) for codepoints not
        # yet cached. NOTE: dict.get() never invokes __missing__, so
        # self.get(key) below returns None for unseen keys, and
        # str.translate() deletes characters mapped to None.
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            # e.g. decomposition('é') == '0065 0301' (base + combining mark)
            de = unicodedata.decomposition(chr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                # U+0308 COMBINING DIAERESIS: defer to the seeded base
                # map (so umlauts become 'ae'/'oe'/'ue' rather than the
                # bare base letter); unseeded diaeresis chars get None
                ch = self.get(key)
            else:
                # map to the base character's codepoint
                ch = int(p1, 16)
        except (IndexError, ValueError):
            # no decomposition available: map the codepoint to itself
            ch = self.get(key, key)
        self[key] = ch  # cache for subsequent lookups
        return ch
485
486
487# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
488# For something more complete, investigate the unidecode
489# or isounidecode packages, which are capable of performing
490# crude transliteration.
# Codepoint -> replacement table seeding the deaccenting cache below.
# Multi-character replacements handle ligatures and letters with no
# single-letter ASCII equivalent; smart quotes map to ASCII quotes.
_BASE_DEACCENT_MAP = {
    0xc6: "AE", # Æ LATIN CAPITAL LETTER AE
    0xd0: "D",  # Ð LATIN CAPITAL LETTER ETH
    0xd8: "OE", # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: "Th", # Þ LATIN CAPITAL LETTER THORN
    0xc4: 'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: 'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: 'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: "A",  # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: "A",  # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: "A",  # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: "C",  # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: "E",  # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: "E",  # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: "E",  # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: "I",  # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: "I",  # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: "O",  # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: "O",  # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: "O",  # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: "U",  # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: "U",  # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: "ss", # ß LATIN SMALL LETTER SHARP S
    0xe6: "ae", # æ LATIN SMALL LETTER AE
    0xf0: "d",  # ð LATIN SMALL LETTER ETH
    0xf8: "oe", # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: "th", # þ LATIN SMALL LETTER THORN,
    0xe4: 'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: 'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: 'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: "a",  # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: "a",  # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: "a",  # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: "c",  # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: "e",  # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: "e",  # é LATIN SMALL LETTER E WITH ACUTE
    0xea: "e",  # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: "i",  # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: "i",  # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: "o",  # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: "o",  # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: "o",  # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: "u",  # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: "u",  # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: "'",  # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: "'",  # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: '"',  # “ LEFT DOUBLE QUOTATION MARK
    0x201d: '"',  # ” RIGHT DOUBLE QUOTATION MARK
    }


# Shared, lazily-populated translation table used by asciify(); unseen
# codepoints are resolved and cached by DeaccenterDict.__missing__.
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)
543
544
# Unit symbols and their 1024**i byte bounds, paired into
# (bound, next_bound) ranges for the unit search in bytes2human().
_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))


def bytes2human(nbytes, ndigits=0):
    """Turns an integer value of *nbytes* into a human readable format. Set
    *ndigits* to control how many digits after the decimal point
    should be shown (default ``0``).

    >>> bytes2human(128991)
    '126K'
    >>> bytes2human(100001221)
    '95M'
    >>> bytes2human(0, 2)
    '0.00B'
    """
    abs_bytes = abs(nbytes)
    # select the largest unit whose successor's bound still covers the value
    for (size, symbol), (next_size, _next_symbol) in _SIZE_RANGES:
        if abs_bytes <= next_size:
            break
    scaled = float(nbytes) / size
    return f'{scaled:.{ndigits}f}{symbol}'
570
571
class HTMLTextExtractor(HTMLParser):
    """Parser that collects only the text content fed to it; used by
    :func:`html2text` below."""

    def __init__(self):
        # NOTE(review): HTMLParser.__init__ is deliberately not called;
        # reset() plus the assignments below set up the parser state
        # directly — confirm against the stdlib before changing.
        self.reset()
        self.strict = False
        # with convert_charrefs=True the parser decodes character and
        # entity references itself, so handle_charref/handle_entityref
        # below are normally bypassed (kept for safety)
        self.convert_charrefs = True
        self.result = []  # accumulated text fragments, joined by get_text()

    def handle_data(self, d):
        # plain text between tags
        self.result.append(d)

    def handle_charref(self, number):
        # numeric reference, decimal ('&#916;') or hex ('&#x394;')
        if number[0] == 'x' or number[0] == 'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(chr(codepoint))

    def handle_entityref(self, name):
        # named reference, e.g. '&amp;'; unknown names pass through verbatim
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append('&' + name + ';')
        else:
            self.result.append(chr(codepoint))

    def get_text(self):
        # join all collected fragments into the final text
        return ''.join(self.result)
599
600
def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like "&nbsp;"

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394ημώ)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
613
614
# Pre-baked gzip streams (carrying original-filename headers "empty" and
# "not_empty") used by the gunzip_bytes doctests below.
_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'
617
618
def gunzip_bytes(bytestring):
    """The :mod:`gzip` module is great if you have a file or file-like
    object, but what if you just have bytes. StringIO is one
    possibility, but it's often faster, easier, and simpler to just
    use this one-liner. Use this tried-and-true utility function to
    decompress gzip from bytes.

    >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
    True
    >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
    True
    """
    # adding 16 to the window size tells zlib to expect a gzip header
    return zlib.decompress(bytestring, zlib.MAX_WBITS | 16)
632
633
def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
            speed/compression. 1 is fastest, least compressed, 9 is
            slowest, but most compressed.

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    buf = StringIO()
    # closing the GzipFile (via the with-block) flushes the gzip trailer
    # into the in-memory buffer
    with GzipFile(fileobj=buf, mode='wb', compresslevel=level) as gz:
        gz.write(bytestring)
    return buf.getvalue()
654
655
656
657_line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\x2028|\x2029)',
658 re.UNICODE)
659
660
def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    Note that unlike :meth:`str.splitlines`, a trailing line ending
    produces a final empty line (see the first doctest below).

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    # prev_end tracks the index just past the previous line ending
    prev_end, len_text = 0, len(text)
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        if prev_end <= start:
            # yield the text between the previous ending and this one
            # (possibly empty, for consecutive line endings)
            yield text[prev_end:start]
        if end == len_text:
            # text ends with a line ending: emit the trailing empty line
            yield ''
        prev_end = end
    # whatever follows the last line ending is the final line
    tail = text[prev_end:]
    if tail:
        yield tail
    return
691
692
def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
          indent it. Default: :class:`bool`, to ensure that empty lines do
          not get whitespace added.
    """
    prefixed = []
    for line in iter_splitlines(text):
        # only lines selected by *key* receive the margin
        prefixed.append(margin + line if key(line) else line)
    return newline.join(prefixed)
707
708
def is_uuid(obj, version=4):
    """Check the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    if not isinstance(obj, uuid.UUID):
        # not already a UUID object: try parsing; any failure means "no"
        try:
            obj = uuid.UUID(obj)
        except (TypeError, ValueError, AttributeError):
            return False
    # a falsy *version* (e.g. 0) skips the version comparison entirely
    return not (version and obj.version != int(version))
731
732
def escape_shell_args(args, sep=' ', style=None):
    """Returns an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
          ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
          respectively. If *style* is ``None``, then it is picked
          according to the system platform.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    if not style:
        # default to the convention of the current platform
        style = 'cmd' if sys.platform == 'win32' else 'sh'

    if style == 'sh':
        return args2sh(args, sep=sep)
    if style == 'cmd':
        return args2cmd(args, sep=sep)

    raise ValueError("style expected one of 'cmd' or 'sh', not %r" % style)
757
758
# Any character outside this safe set forces single-quoting of the argument.
_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search


def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.

    Args:
        args (list): A list of argument strings to escape.
        sep (str): The separator used to join the escaped arguments.
    """
    ret_list = []

    for arg in args:
        if not arg:
            # an empty argument must be represented explicitly
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # BUGFIX: previously joined with a hardcoded ' ', silently ignoring
    # the documented *sep* parameter
    return sep.join(ret_list)
794
795
def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.

    Args:
        args (list): A list of argument strings to escape.
        sep (str): The separator used to join the escaped arguments.
    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within. A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash. If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        # pending run of backslashes; their meaning depends on what follows
        bs_buf = []

        # Add the separator before this argument (BUGFIX: previously a
        # hardcoded ' ', silently ignoring the documented *sep* parameter)
        if result:
            result.append(sep)

        # arguments containing whitespace (and empty ones) must be quoted
        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double preceding backslashes, then escape the quote (rule 5/3).
                result.append('\\' * len(bs_buf)*2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char: flush pending backslashes literally (rule 4).
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            # double trailing backslashes so the closing quote isn't escaped
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)
873
874
def parse_int_list(range_string, delim=',', range_delim='-'):
    """Returns a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]

    """
    output = []

    for token in range_string.strip().split(delim):
        if not token:
            # skip empty tokens, e.g. from '1,,2' or a trailing delimiter
            continue
        if range_delim in token:
            # expand 'lo-hi' (order-insensitive, bounds inclusive)
            bounds = [int(part) for part in token.split(range_delim)]
            output.extend(range(min(bounds), max(bounds) + 1))
        else:
            output.append(int(token))

    return sorted(output)
910
911
def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Returns a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
           into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
           contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
           range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
           space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'

    """
    output = []
    # contig_range accumulates the current run of contiguous integers
    contig_range = collections.deque()

    for x in sorted(int_list):

        # Handle current (and first) value.
        if len(contig_range) < 1:
            contig_range.append(x)

        # Handle current value, given multiple previous values are contiguous.
        elif len(contig_range) > 1:
            delta = x - contig_range[-1]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous: flush the run as 'min-max'.
            elif delta > 1:
                range_substr = '{:d}{}{:d}'.format(min(contig_range),
                                                   range_delim,
                                                   max(contig_range))
                output.append(range_substr)
                contig_range.clear()
                contig_range.append(x)

            # Current value repeated (delta == 0): skip the duplicate.
            else:
                continue

        # Handle current value, given no previous contiguous integers
        else:
            delta = x - contig_range[0]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous: flush the lone integer.
            elif delta > 1:
                output.append(f'{contig_range.popleft():d}')
                contig_range.append(x)

            # Current value repeated (delta == 0): skip the duplicate.
            else:
                continue

    # Handle the last value (for/else: runs once the loop completes).
    else:

        # Last value is non-contiguous.
        if len(contig_range) == 1:
            output.append(f'{contig_range.popleft():d}')
            contig_range.clear()

        # Last value is part of contiguous range.
        elif len(contig_range) > 1:
            range_substr = '{:d}{}{:d}'.format(min(contig_range),
                                               range_delim,
                                               max(contig_range))
            output.append(range_substr)
            contig_range.clear()

    if delim_space:
        # e.g. '1, 3, 5-8' rather than '1,3,5-8'
        output_str = (delim+' ').join(output)
    else:
        output_str = delim.join(output)

    return output_str
1000
1001
def complement_int_list(
        range_string, range_start=0, range_end=None,
        delim=',', range_delim='-'):
    """ Returns range string that is the complement of the one provided as
    *range_string* parameter.

    These range strings are of the kind produce by :func:`format_int_list`, and
    parseable by :func:`parse_int_list`.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        range_start (int): A positive integer from which to start the resulting
            range. Value is inclusive. Defaults to ``0``.
        range_end (int): A positive integer from which the produced range is
            stopped. Value is exclusive. Defaults to the maximum value found in
            the provided ``range_string``.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> complement_int_list('1,3,5-8,10-11,15')
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=0)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=1)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=2)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=3)
    '4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=15)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=14)
    '0,2,4,9,12-13'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=13)
    '0,2,4,9,12'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=20)
    '0,2,4,9,12-14,16-19'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=0)
    ''

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=-1)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=-1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=1, range_end=1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=2)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=3)
    '2'

    >>> complement_int_list('1,3,5-8', range_start=-10, range_end=-5)
    ''

    >>> complement_int_list('1,3,5-8', range_start=20, range_end=10)
    ''

    >>> complement_int_list('')
    ''
    """
    present = set(parse_int_list(range_string, delim, range_delim))

    # Default the exclusive upper bound to just past the largest value
    # present; an empty input yields an empty complement.
    if range_end is None:
        range_end = max(present) + 1 if present else range_start

    # Negative starts clamp to zero; a start >= end produces an empty range.
    lo = max(range_start, 0)
    missing = {i for i in range(lo, range_end) if i not in present}
    return format_int_list(missing, delim, range_delim)
1088
1089
def int_ranges_from_int_list(range_string, delim=',', range_delim='-'):
    """ Transform a string of ranges (*range_string*) into a tuple of tuples.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> int_ranges_from_int_list('1,3,5-8,10-11,15')
    ((1, 1), (3, 3), (5, 8), (10, 11), (15, 15))

    >>> int_ranges_from_int_list('1')
    ((1, 1),)

    >>> int_ranges_from_int_list('')
    ()
    """
    # Round-trip through parse/format to get a canonical, sorted,
    # comma/hyphen-delimited form regardless of the caller's delimiters.
    canonical = format_int_list(
        parse_int_list(range_string, delim, range_delim))

    ranges = []
    if canonical:
        for chunk in canonical.split(','):
            lo, sep, hi = chunk.partition('-')
            # A bare integer represents a range of one.
            ranges.append((int(lo), int(hi) if sep else int(lo)))
    return tuple(ranges)
1123
1124
class MultiReplace:
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced it allows for all of the matching
    values to be replaced in a single pass which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from boltons import strutils
        s = strutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from boltons import strutils
        s = strutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken')
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'


    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order will be non-deterministic::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention and
    if you need to rely on order then consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed."""
        options = {
            'regex': False,
            'flags': 0,
        }
        options.update(kwargs)
        self.group_map = {}
        regex_values = []

        if isinstance(sub_map, Mapping):
            sub_map = sub_map.items()

        for idx, vals in enumerate(sub_map):
            group_name = f'group{idx}'
            if isinstance(vals[0], str):
                # If we're not treating input strings like a regex, escape it
                if not options['regex']:
                    exp = re.escape(vals[0])
                else:
                    exp = vals[0]
            else:
                # Assume a precompiled pattern object and reuse its source.
                exp = vals[0].pattern

            # Each search expression is wrapped in a uniquely named group so
            # _get_value() can recover which replacement applies.
            regex_values.append(f'(?P<{group_name}>{exp})')
            self.group_map[group_name] = vals[1]

        self.combined_pattern = re.compile(
            '|'.join(regex_values),
            flags=options['flags']
        )

    def _get_value(self, match):
        """Given a match object find replacement value."""
        group_dict = match.groupdict()
        # Compare against None rather than relying on truthiness: a group
        # that participated in the match may have matched the empty string
        # (e.g. a regex like 'x*'), which is falsy but still a real match.
        # Groups that did not participate are None.
        key = [x for x in group_dict if group_dict[x] is not None][0]
        return self.group_map[key]

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.
        """
        return self.combined_pattern.sub(self._get_value, text)
1228
1229
def multi_replace(text, sub_map, **kwargs):
    """
    Shortcut function to invoke MultiReplace in a single call.

    Example Usage::

        from boltons.strutils import multi_replace
        new = multi_replace(
            'The foo bar cat ate a bat',
            {'foo': 'zoo', 'cat': 'hat', 'bat': 'kraken'}
        )
        new == 'The zoo bar hat ate a kraken'
    """
    # Build the replacer and apply it in one shot; all keyword options
    # are forwarded to the MultiReplace constructor.
    return MultiReplace(sub_map, **kwargs).sub(text)
1245
1246
def unwrap_text(text, ending='\n\n'):
    r"""
    Unwrap text, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
            by. Pass ``None`` to get the list. Defaults to '\n\n' for
            compatibility with Markdown and RST.

    """
    paragraphs = []
    buf = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # Blank line: close out the paragraph accumulated so far.
            paragraphs.append(' '.join(buf))
            buf = []
        else:
            buf.append(stripped)
    # Flush any trailing paragraph not terminated by a blank line.
    if buf:
        paragraphs.append(' '.join(buf))
    return paragraphs if ending is None else ending.join(paragraphs)
1276
def removeprefix(text: str, prefix: str) -> str:
    r"""
    Remove `prefix` from start of `text` if present.

    Backport of `str.removeprefix` for Python versions less than 3.9.

    Args:
        text: A string to remove the prefix from.
        prefix: The string to remove from the beginning of `text`.
    """
    if not text.startswith(prefix):
        return text
    return text[len(prefix):]