1from __future__ import annotations
2
3import re
4import unicodedata
5from collections.abc import Iterable
6from html.entities import name2codepoint
7
8try:
9 import unidecode
10except ImportError:
11 import text_unidecode as unidecode
12
13__all__ = ['slugify', 'smart_truncate']
14
15
# Alternation of every named HTML entity known to the standard library.
_ENTITY_NAMES = '|'.join(name2codepoint)

# Named character references such as "&amp;"; group 1 is the entity name.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % _ENTITY_NAMES)
# Decimal character references such as "&#381;"; group 1 is the digits.
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
# Hexadecimal character references such as "&#x17D;"; group 1 is the hex digits.
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
# One or more consecutive single quotes.
QUOTE_PATTERN = re.compile(r'[\']+')
# Runs of anything that is not an ASCII letter, an ASCII digit or a dash.
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
# Runs of non-word characters or underscores (Unicode-aware for str patterns).
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
# Two or more dashes in a row.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
# A comma sitting directly between two digits, e.g. the one in "1,000".
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
# Separator used internally while a slug is being built.
DEFAULT_SEPARATOR = '-'
25
26
def smart_truncate(
    string: str,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = " ",
    save_order: bool = False,
) -> str:
    """
    Truncate a string, optionally only at word boundaries.
    :param string (str): string to shorten; leading/trailing separator characters are stripped first
    :param max_length (int): length limit; a falsy value disables truncation
    :param word_boundary (bool): if True, build the result from whole words instead of cutting mid-word
    :param separator (str): separator between words
    :param save_order (bool): if True, stop at the first word that does not fit instead of
        skipping it and trying later words
    :return (str): the truncated string with separator characters stripped from both ends
    """

    trimmed = string.strip(separator)

    # No limit requested, or the text is already shorter than the limit.
    if not max_length or len(trimmed) < max_length:
        return trimmed

    # Plain cut: slice and tidy up a separator left dangling at the edge.
    if not word_boundary:
        return trimmed[:max_length].strip(separator)

    # A single word cannot be split on a boundary, so cut it as-is.
    if separator not in trimmed:
        return trimmed[:max_length]

    kept = ''
    for token in trimmed.split(separator):
        if not token:
            # consecutive separators produce empty tokens; ignore them
            continue
        candidate_len = len(kept) + len(token)
        if candidate_len == max_length:
            # exact fit: take the word without a trailing separator and stop
            kept += token
            break
        if candidate_len < max_length:
            kept += token + separator
        elif save_order:
            # the word does not fit and later words may not jump the queue
            break

    # Nothing fitted at all: fall back to a plain cut.
    if not kept:
        kept = trimmed[:max_length]
    return kept.strip(separator)
73
74
def _char_reference(match: re.Match[str], base: int) -> str:
    """
    Decode a single numeric character reference.
    :param match (re.Match): match whose group 1 holds the digits of the reference
    :param base (int): numeric base of the digits (10 or 16)
    :return (str): the referenced character, or the untouched reference text when the
        number is not a valid code point
    """
    try:
        return chr(int(match.group(1), base))
    except (ValueError, OverflowError):
        # Out-of-range or absurdly long number: leave this reference as it was
        # so the other references in the same text are still decoded.
        return match.group(0)


def slugify(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
) -> str:
    """
    Make a slug from the given text.
    :param text (str): initial text; bytes are decoded as UTF-8, ignoring undecodable bytes
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length, applied before a custom separator is substituted
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str or compiled pattern): regex pattern for disallowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (str):
    """

    # ensure text is unicode before any str-only operation touches it
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # The rules are applied twice (before and after cleanup), so materialise
    # them once; a one-shot iterator would otherwise be empty the second time.
    replacement_pairs = [tuple(pair) for pair in replacements] if replacements else []

    # user-specific replacements
    for old, new in replacement_pairs:
        text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # normalize text, convert to unicode if required
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)
        text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character reference (invalid references are left untouched)
    if decimal:
        text = DECIMAL_PATTERN.sub(lambda m: _char_reference(m, 10), text)

    # hexadecimal character reference (invalid references are left untouched)
    if hexadecimal:
        text = HEX_PATTERN.sub(lambda m: _char_reference(m, 16), text)

    # re normalize text, since decoded references may have introduced new characters
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        # Build the lookup once: constant-time membership tests, and an
        # iterator argument is not exhausted after the first word.
        if lowercase:
            blocked = {s.lower() for s in stopwords}
        else:
            blocked = set(stopwords)
        words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in blocked]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    for old, new in replacement_pairs:
        text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    # the slug is built with the default separator; swap in the custom one last
    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text