1import gzip
2import re
3import secrets
4import unicodedata
5from collections import deque
6from gzip import GzipFile
7from gzip import compress as gzip_compress
8from html import escape
9from html.parser import HTMLParser
10from io import BytesIO
11
12from django.core.exceptions import SuspiciousFileOperation
13from django.utils.functional import (
14 SimpleLazyObject,
15 cached_property,
16 keep_lazy_text,
17 lazy,
18)
19from django.utils.regex_helper import _lazy_re_compile
20from django.utils.translation import gettext as _
21from django.utils.translation import gettext_lazy, pgettext
22
23
24@keep_lazy_text
def capfirst(x):
    """
    Return ``x`` with its first character upper-cased.

    Falsy values are handed back untouched; any other non-string value is
    converted with ``str()`` before its first character is capitalized.
    """
    if not x:
        return x
    text = x if isinstance(x, str) else str(x)
    head, tail = text[0], text[1:]
    return head.upper() + tail
32
33
# Set up regular expressions
# Matches a CRLF pair or a lone CR.
re_newlines = _lazy_re_compile(r"\r\n|\r") # Used in normalize_newlines
# Matches an uppercase letter that follows a lowercase letter, or an uppercase
# letter that is not followed by another uppercase letter or the end of the
# string. Used in camel_case_to_spaces.
re_camel_case = _lazy_re_compile(r"(((?<=[a-z])[A-Z])|([A-Z](?![A-Z]|$)))")
37
38
39@keep_lazy_text
def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks. Expects that
    existing line breaks are posix newlines.

    Preserve all white space except added line breaks consume the space on
    which they break the line. A line's own trailing newline counts toward
    its length when it is compared against ``width``.

    Don't wrap long words, thus the output text may have lines longer than
    ``width``.
    """

    def _generator():
        for line in text.splitlines(True):  # True keeps trailing linebreaks
            while len(line) > width:
                # Prefer the last space that keeps the chunk within `width`.
                space = line[: width + 1].rfind(" ") + 1
                if space == 0:
                    # No such space: break after the first (over-long) word.
                    space = line.find(" ") + 1
                    if space == 0:
                        # No space left at all; emit the remainder unbroken.
                        yield line
                        line = ""
                        break
                # The space itself is replaced by the inserted newline.
                yield f"{line[: space - 1]}\n"
                line = line[space:]
            if line:
                yield line

    return "".join(_generator())
70
71
def add_truncation_text(text, truncate=None):
    """
    Combine ``text`` with the truncation marker ``truncate``.

    When ``truncate`` is None, a translatable default marker is used. A
    marker containing the ``%(truncated_text)s`` placeholder acts as a
    template into which ``text`` is substituted. Any other marker is appended
    to ``text``, unless ``text`` already ends with it.
    """
    if truncate is None:
        truncate = pgettext(
            "String to return when truncating text", "%(truncated_text)s…"
        )
    if "%(truncated_text)s" not in truncate:
        # Plain marker: append it, but never twice in a row.
        already_marked = text.endswith(truncate)
        return text if already_marked else f"{text}{truncate}"
    return truncate % {"truncated_text": text}
86
87
def calculate_truncate_chars_length(length, replacement):
    """
    Return ``length`` reduced by the size of the truncation marker.

    The marker is what ``add_truncation_text("", replacement)`` produces.
    Each of its characters that is not a Unicode combining character lowers
    the result by one, and counting stops as soon as the result hits zero.
    """
    remaining = length
    for char in add_truncation_text("", replacement):
        if unicodedata.combining(char):
            # Combining characters do not take up a position of their own.
            continue
        remaining -= 1
        if remaining == 0:
            break
    return remaining
96
97
class TruncateHTMLParser(HTMLParser):
    """
    Base HTML parser that copies markup into ``output`` until a budget of
    ``length`` units is used up.

    Subclasses provide ``process(data)``, returning a ``(units, text)`` pair:
    ``len(units)`` is charged against ``remaining`` and ``text`` is what gets
    written to ``output`` for that run of character data.
    """

    class TruncationCompleted(Exception):
        """Raised to stop parsing once the truncated output is complete."""

    def __init__(self, *, length, replacement, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)
        # Currently open non-void tags, most recently opened first.
        self.tags = deque()
        self.output = ""
        self.remaining = length
        self.replacement = replacement

    @cached_property
    def void_elements(self):
        # Imported on first use rather than at module import time.
        from django.utils.html import VOID_ELEMENTS

        return VOID_ELEMENTS

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        if tag in self.void_elements:
            return
        # A self-closed non-void tag still gets an explicit closing tag.
        self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        # Copy the start tag exactly as it appeared in the input.
        self.output += self.get_starttag_text()
        if tag in self.void_elements:
            return
        self.tags.appendleft(tag)

    def handle_endtag(self, tag):
        if tag in self.void_elements:
            return
        self.output += f"</{tag}>"
        # Closing tags without a matching open tag are emitted but otherwise
        # ignored.
        if tag in self.tags:
            self.tags.remove(tag)

    def handle_data(self, data):
        units, rendered = self.process(data)
        cost = len(units)
        if cost > self.remaining:
            # Budget exceeded: emit the shortened text plus the truncation
            # marker and stop parsing.
            self.remaining = 0
            self.output += add_truncation_text(rendered, self.replacement)
            raise self.TruncationCompleted
        self.output += rendered
        self.remaining -= cost

    def feed(self, data):
        try:
            super().feed(data)
        except self.TruncationCompleted:
            # Close every tag that is still open, innermost first.
            self.output += "".join(f"</{tag}>" for tag in self.tags)
            self.tags.clear()
        # Reached both after truncation and after an uninterrupted parse.
        self.reset()
153
154
class TruncateCharsHTMLParser(TruncateHTMLParser):
    """
    Truncating HTML parser whose budget is counted in characters.

    The budget handed to the base class is ``length`` minus the room taken by
    the truncation marker, as computed by
    ``calculate_truncate_chars_length()``.
    """

    def __init__(self, *, length, replacement, convert_charrefs=True):
        self.length = length
        self.processed_chars = 0
        budget = calculate_truncate_chars_length(length, replacement)
        super().__init__(
            length=budget,
            replacement=replacement,
            convert_charrefs=convert_charrefs,
        )

    def process(self, data):
        self.processed_chars += len(data)
        reached_length = self.processed_chars == self.length
        if reached_length and (
            len(self.output) + len(data) == len(self.rawdata)
        ):
            # The text seen so far is exactly `length` characters and the
            # output plus this data is as long as the raw input: emit the
            # data as-is, with no truncation marker.
            self.output += data
            raise self.TruncationCompleted
        # `data` is the text passed in by the parser; slicing keeps at most
        # `remaining` characters of it.
        return data, escape(data[: self.remaining])
174
175
class TruncateWordsHTMLParser(TruncateHTMLParser):
    """Truncating HTML parser whose budget is counted in words."""

    def process(self, data):
        # Split on whitespace runs that sit between two non-space characters,
        # so leading and trailing whitespace stays attached to a word.
        words = re.split(r"(?<=\S)\s+(?=\S)", data)
        kept = words[: self.remaining]
        return words, escape(" ".join(kept))
181
182
class Truncator(SimpleLazyObject):
    """
    Lazily wrap ``str(text)`` and truncate it, either by characters or words.

    NOTE(review): ``MAX_LENGTH_HTML`` is declared on this class, but none of
    the methods defined here read it -- confirm where (or whether) the cap on
    HTML input is enforced.
    """

    # 5 million characters are approximately 4000 text pages or 3 web pages.
    MAX_LENGTH_HTML = 5_000_000

    def __init__(self, text):
        # Conversion to str is deferred until the wrapped value is needed.
        super().__init__(lambda: str(text))

    def chars(self, num, truncate=None, html=False):
        """
        Return the text shortened to at most ``num`` characters.

        ``truncate`` is the marker used to signal that the text was cut; when
        None, a translatable ellipsis string is used. With ``html=True`` the
        text is run through ``TruncateCharsHTMLParser``. A ``num`` that is
        zero or negative yields an empty string.
        """
        self._setup()
        length = int(num)
        if length <= 0:
            return ""
        text = unicodedata.normalize("NFC", self._wrapped)

        if not html:
            return self._text_chars(length, truncate, text)
        parser = TruncateCharsHTMLParser(length=length, replacement=truncate)
        parser.feed(text)
        parser.close()
        return parser.output

    def _text_chars(self, length, truncate, text):
        """Truncate plain ``text`` after ``length`` characters."""
        truncate_len = calculate_truncate_chars_length(length, truncate)
        visible = 0
        cut_at = None
        for index, char in enumerate(text):
            if unicodedata.combining(char):
                # Combining characters don't add to the string length.
                continue
            visible += 1
            if cut_at is None and visible > truncate_len:
                # First position that no longer fits beside the marker.
                cut_at = index
            if visible > length:
                # Too long: return the truncated string.
                return add_truncation_text(text[: cut_at or 0], truncate)

        # Nothing had to be cut; return the text unchanged.
        return text

    def words(self, num, truncate=None, html=False):
        """
        Return the text shortened to at most ``num`` words.

        ``truncate`` is the marker used to signal that the text was cut,
        defaulting to ellipsis. With ``html=True`` the text is run through
        ``TruncateWordsHTMLParser``. A ``num`` that is zero or negative
        yields an empty string.
        """
        self._setup()
        length = int(num)
        if length <= 0:
            return ""
        if not html:
            return self._text_words(length, truncate)
        parser = TruncateWordsHTMLParser(length=length, replacement=truncate)
        parser.feed(self._wrapped)
        parser.close()
        return parser.output

    def _text_words(self, length, truncate):
        """
        Truncate the plain text after ``length`` words.

        Words are split on any whitespace and re-joined with single spaces,
        so newlines in the string are stripped.
        """
        words = self._wrapped.split()
        joined = " ".join(words[:length])
        if len(words) > length:
            return add_truncation_text(joined, truncate)
        return joined
266
267
268@keep_lazy_text
def get_valid_filename(name):
    """
    Turn ``name`` into a string that is safe to use as a clean filename.

    Leading and trailing whitespace is removed, remaining spaces become
    underscores, and every character that is not alphanumeric, a dash, an
    underscore, or a dot is dropped. Raise SuspiciousFileOperation when the
    result is empty, ``"."`` or ``".."``.
    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'
    """
    underscored = str(name).strip().replace(" ", "_")
    cleaned = re.sub(r"(?u)[^-\w.]", "", underscored)
    if cleaned in ("", ".", ".."):
        raise SuspiciousFileOperation("Could not derive file name from '%s'" % name)
    return cleaned
283
284
285@keep_lazy_text
def get_text_list(list_, last_word=gettext_lazy("or")):
    """
    Join the items of ``list_`` into readable text: all but the last item
    are separated by a translated comma, and ``last_word`` goes before the
    final item.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([])
    ''
    """
    if not list_:
        return ""
    if len(list_) == 1:
        return str(list_[0])
    leading, final = list_[:-1], list_[-1]
    return "%s %s %s" % (
        # Translators: This string is used as a separator between list elements
        _(", ").join(map(str, leading)),
        str(last_word),
        str(final),
    )
309
310
311@keep_lazy_text
def normalize_newlines(text):
    """Return ``str(text)`` with every CRLF and lone CR replaced by LF."""
    as_text = str(text)
    return re_newlines.sub("\n", as_text)
315
316
317@keep_lazy_text
def phone2numeric(phone):
    """
    Convert a phone number with letters into its numeric equivalent.

    The input is lower-cased first; characters other than a-z are kept as
    they are.
    """
    # Letter-to-digit table: abc->2, def->3, ghi->4, jkl->5, mno->6,
    # pqrs->7, tuv->8, wxyz->9.
    keypad = str.maketrans(
        "abcdefghijklmnopqrstuvwxyz",
        "22233344455566677778889999",
    )
    return phone.lower().translate(keypad)
349
350
351def _get_random_filename(max_random_bytes):
352 return b"a" * secrets.randbelow(max_random_bytes)
353
354
def compress_string(s, *, max_random_bytes=None):
    """
    Gzip-compress the bytes ``s`` (compression level 6, mtime fixed at 0).

    When ``max_random_bytes`` is truthy, a filename of random length, made by
    ``_get_random_filename()``, is inserted into the gzip header, so the size
    of the result varies between calls.
    """
    compressed_data = gzip_compress(s, compresslevel=6, mtime=0)

    if max_random_bytes:
        compressed_view = memoryview(compressed_data)
        # The first 10 bytes are the header; byte 3 holds the flags. Setting
        # FNAME declares that a NUL-terminated filename follows the header.
        header = bytearray(compressed_view[:10])
        header[3] = gzip.FNAME
        filename = _get_random_filename(max_random_bytes) + b"\x00"
        return bytes(header) + filename + compressed_view[10:]

    return compressed_data
368
369
class StreamingBuffer(BytesIO):
    """In-memory byte buffer whose ``read()`` drains it completely."""

    def read(self):
        """Return everything currently buffered and leave the buffer empty."""
        contents = self.getvalue()
        # Rewind, then cut the buffer off at the (now zero) position.
        self.seek(0)
        self.truncate()
        return contents
376
377
def compress_sequence(sequence, *, max_random_bytes=None):
    """
    Like compress_string, but for iterators: yield gzip output piece by piece.

    Each item of ``sequence`` is passed to ``GzipFile.write()``. When
    ``max_random_bytes`` is truthy, a filename of random length, made by
    ``_get_random_filename()``, is given to the GzipFile.
    """
    filename = None
    if max_random_bytes:
        filename = _get_random_filename(max_random_bytes)
    buf = StreamingBuffer()
    with GzipFile(
        filename=filename, mode="wb", compresslevel=6, fileobj=buf, mtime=0
    ) as zfile:
        # Output headers...
        yield buf.read()
        for item in sequence:
            zfile.write(item)
            # Only pass on a chunk when the compressor actually emitted one.
            if chunk := buf.read():
                yield chunk
    # Whatever was written while the GzipFile was being closed.
    yield buf.read()
393
394
# Expression to match some_token and some_token="with spaces" (and similarly
# for single-quoted strings).
# The first alternative matches a token that contains at least one quoted
# string (double- or single-quoted, with backslash escapes allowed inside),
# optionally surrounded by characters that are neither whitespace nor quotes.
# The second alternative falls back to any run of non-whitespace characters.
# Used in smart_split.
smart_split_re = _lazy_re_compile(
    r"""
 ((?:
 [^\s'"]*
 (?:
 (?:"(?:[^"\\]|\\.)*" | '(?:[^'\\]|\\.)*')
 [^\s'"]*
 )+
 ) | \S+)
""",
    re.VERBOSE,
)
409
410
def smart_split(text):
    r"""
    Yield the pieces of ``str(text)`` split on whitespace, while keeping each
    quoted phrase in one piece. Both single and double quotes are supported,
    and quotes may be escaped with backslashes. Pieces keep their surrounding
    quote marks and their escapes (use unescape_string_literal() to process
    them further).

    >>> list(smart_split(r'This is "a person\'s" test.'))
    ['This', 'is', '"a person\\\'s"', 'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    ['Another', "'person\\'s'", 'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    ['A', '"\\"funky\\" style"', 'test.']
    """
    for match in smart_split_re.finditer(str(text)):
        yield match.group(0)
428
429
430@keep_lazy_text
def unescape_string_literal(s):
    r"""
    Strip the surrounding quotes from a quoted string literal and unescape
    the escaped quotes and backslashes inside it::

        >>> unescape_string_literal('"abc"')
        'abc'
        >>> unescape_string_literal("'abc'")
        'abc'
        >>> unescape_string_literal('"a \"bc\""')
        'a "bc"'
        >>> unescape_string_literal("'\'ab\' c'")
        "'ab' c"

    Raise ValueError when ``s`` is empty, does not start with a quote, or
    does not end with the same quote it starts with.
    """
    is_literal = bool(s) and s[0] in "\"'" and s[-1] == s[0]
    if not is_literal:
        raise ValueError("Not a string literal: %r" % s)
    quote = s[0]
    inner = s[1:-1]
    # Unescape the quote character first, then doubled backslashes.
    inner = inner.replace(r"\%s" % quote, quote)
    return inner.replace(r"\\", "\\")
449
450
451@keep_lazy_text
def slugify(value, allow_unicode=False):
    """
    Build a slug from ``value``.

    Unless ``allow_unicode`` is True, the text is reduced to ASCII. It is
    then lower-cased, stripped of every character that is not alphanumeric,
    an underscore, a hyphen, or whitespace, and runs of whitespace and
    hyphens are collapsed into single hyphens. Leading and trailing hyphens
    and underscores are removed from the result.
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize("NFKC", text)
    else:
        # Decompose accented characters, then drop whatever is not ASCII.
        decomposed = unicodedata.normalize("NFKD", text)
        text = decomposed.encode("ascii", "ignore").decode("ascii")
    lowered = text.lower()
    filtered = re.sub(r"[^\w\s-]", "", lowered)
    hyphenated = re.sub(r"[-\s]+", "-", filtered)
    return hyphenated.strip("-_")
470
471
def camel_case_to_spaces(value):
    """
    Insert a space before each CamelCase word boundary matched by
    ``re_camel_case``, then strip surrounding whitespace and lowercase the
    result.
    """
    spaced = re_camel_case.sub(r" \1", value)
    return spaced.strip().lower()
477
478
479def _format_lazy(format_string, *args, **kwargs):
480 """
481 Apply str.format() on 'format_string' where format_string, args,
482 and/or kwargs might be lazy.
483 """
484 return format_string.format(*args, **kwargs)
485
486
# Wrap _format_lazy with lazy(), declaring str as its result class.
format_lazy = lazy(_format_lazy, str)