Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/python_slugify-8.0.1-py3.8.egg/slugify/slugify.py: 60%
91 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-03 06:25 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-03 06:25 +0000
1import re
2import sys
3import unicodedata
4from html.entities import name2codepoint
6try:
7 import unidecode
8except ImportError:
9 import text_unidecode as unidecode
11__all__ = ['slugify', 'smart_truncate']
# Named HTML character references such as "&amp;"; group 1 is the entity name
# and is one of the keys of html.entities.name2codepoint.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
# Decimal numeric character references such as "&#381;"; group 1 is the digits.
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
# Hexadecimal numeric character references such as "&#x17D;"; group 1 is the hex digits.
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
# One or more consecutive apostrophes.
QUOTE_PATTERN = re.compile(r'[\']+')
# Runs of anything that is not an ASCII letter, an ASCII digit or a dash.
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
# Runs of non-word characters or underscores (Unicode-aware for str input).
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
# Two or more consecutive dashes.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
# A comma sitting directly between two digits (thousands separator, e.g. "1,000").
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
# Separator used internally while building a slug.
DEFAULT_SEPARATOR = '-'
def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False):
    """
    Truncate a string.

    The string is first stripped of leading/trailing ``separator`` characters.
    If ``max_length`` is falsy or negative, or the stripped string is shorter
    than ``max_length``, the stripped string is returned as is.

    :param string (str): string for modification
    :param max_length (int): output string length; 0, None or a negative value means "no limit"
    :param word_boundary (bool): if True, only whole words (pieces between separators) are kept
    :param save_order (bool): if True, stop at the first word that does not fit instead of
        skipping it and trying later, shorter words
    :param separator (str): separator between words
    :return (str): the truncated string, without leading/trailing separators
    """
    string = string.strip(separator)

    # A non-positive limit means "do not truncate". Without the explicit
    # negative check, string[:max_length] would silently drop characters
    # from the end of the string.
    if not max_length or max_length < 0:
        return string

    if len(string) < max_length:
        return string

    if not word_boundary:
        return string[:max_length].strip(separator)

    # Only one word: there is no boundary to honour, cut by characters.
    if separator not in string:
        return string[:max_length]

    truncated = ''
    for word in string.split(separator):
        if not word:
            # Consecutive separators produce empty pieces; ignore them.
            continue
        next_len = len(truncated) + len(word)
        if next_len < max_length:
            truncated += word + separator
        elif next_len == max_length:
            # Exact fit: nothing else can be appended.
            truncated += word
            break
        elif save_order:
            # The word does not fit and words may not be skipped.
            break

    # No word fitted at all: fall back to a plain character cut.
    if not truncated:
        truncated = string[:max_length]
    return truncated.strip(separator)
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
            separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
            replacements=(), allow_unicode=False):
    """
    Make a slug from the given text.

    :param text (str): initial text; non-str input (e.g. bytes) is decoded as UTF-8, ignoring errors
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): lowercase the output; set it to False to preserve case
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (str):
    """

    # ensure text is unicode -- done before the replacements so that bytes
    # input can be combined with str replacement rules
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # decode unicode
    if not allow_unicode:
        text = unidecode.unidecode(text)

    # ensure text is still in unicode (defensive: depends on the unidecode backend)
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        try:
            text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
        except (ValueError, OverflowError):
            # Best effort: a reference outside the valid code point range
            # leaves the text as it was before this step.
            pass

    # hexadecimal character reference
    if hexadecimal:
        try:
            text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
        except (ValueError, OverflowError):
            # Best effort, same as for decimal references.
            pass

    # translate
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        if lowercase:
            # The text is already lowercase, so compare against lowercased
            # stopwords; a set keeps each membership test O(1).
            stopwords_lower = {s.lower() for s in stopwords}
            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
        else:
            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    # the slug is built with DEFAULT_SEPARATOR; swap in the caller's separator last
    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text