Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/python_slugify-8.0.4-py3.11.egg/slugify/slugify.py: 23%

1from __future__ import annotations

3import re

4import unicodedata

5from collections.abc import Iterable

6from html.entities import name2codepoint

8try:

9 import unidecode

10except ImportError:

11 import text_unidecode as unidecode

13__all__ = ['slugify', 'smart_truncate']

# Named HTML character references such as "&amp;"; group 1 is the entity name.
CHAR_ENTITY_PATTERN = re.compile('&(' + '|'.join(name2codepoint) + ');')

# Decimal numeric character references such as "&#65;"; group 1 is the digits.
DECIMAL_PATTERN = re.compile(r'&#(\d+);')

# Hexadecimal numeric character references such as "&#x41;"; group 1 is the hex digits.
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')

# One or more consecutive single-quote characters.
QUOTE_PATTERN = re.compile(r'[\']+')

# Runs of anything other than ASCII letters, digits and the dash.
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')

# Runs of non-word characters or underscores (Unicode-aware for str patterns).
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')

# Two or more dashes in a row.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')

# A comma sitting directly between two digits, e.g. the commas in "1,000,000".
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')

# Separator used internally while a slug is being built.
DEFAULT_SEPARATOR = '-'

def smart_truncate(
    string: str,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = " ",
    save_order: bool = False,
) -> str:
    """
    Truncate a string.

    The input is first stripped of leading/trailing separator characters.
    It is returned as-is when ``max_length`` is falsy or the stripped text
    is shorter than ``max_length``.

    :param string (str): string for modification
    :param max_length (int): output string length
    :param word_boundary (bool): if True, build the result from whole words
        (pieces between separators); a text without any separator is still
        cut at ``max_length``
    :param save_order (bool): if True, stop at the first word that does not
        fit; if False, skip that word and keep trying the following ones
    :param separator (str): separator between words
    :return (str): the truncated string, stripped of separator characters
    """
    string = string.strip(separator)

    # No limit requested, or the text is already short enough.
    if not max_length or len(string) < max_length:
        return string

    # Plain character cut.
    if not word_boundary:
        return string[:max_length].strip(separator)

    # A single word cannot be split on a boundary; cut it hard.
    if separator not in string:
        return string[:max_length]

    collected = ''
    for token in string.split(separator):
        if not token:
            # Empty pieces come from consecutive separators.
            continue
        projected = len(collected) + len(token)
        if projected == max_length:
            # Exact fit: take the word and stop.
            collected += token
            break
        if projected < max_length:
            collected += token + separator
        elif save_order:
            break

    if not collected:  # pragma: no cover
        collected = string[:max_length]
    return collected.strip(separator)

def slugify(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
) -> str:
    """
    Make a slug from the given text.

    :param text (str): initial text
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount; any iterable, including a one-shot iterator
    :param regex_pattern (str): regex pattern (string or compiled) for disallowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']];
        applied once before and once after slugification
    :param allow_unicode (bool): allow unicode characters
    :return (str):
    """

    # The rules are applied twice (pre- and post-processing), so materialize
    # them once; a one-shot iterator would be empty on the second pass.
    rules = [tuple(rule) for rule in replacements] if replacements else []

    # ensure text is unicode before anything calls str methods on it
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # user-specific replacements
    for old, new in rules:
        text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # normalize text, convert to unicode if required
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)
        text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        try:
            text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
        except (ValueError, OverflowError):
            # A reference is outside the valid code point range: best effort,
            # leave the decimal references in the text unconverted.
            pass

    # hexadecimal character reference
    if hexadecimal:
        try:
            text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
        except (ValueError, OverflowError):
            # Same best-effort policy as for decimal references.
            pass

    # re normalize text
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        # Build the lookup once: membership tests against a one-shot iterator
        # would consume it, and a set avoids rescanning for every word.
        if lowercase:
            blocked = frozenset(s.lower() for s in stopwords)
        else:
            blocked = frozenset(stopwords)
        words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in blocked]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    for old, new in rules:
        text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    # the slug is built with the default separator; swap it only at the very end
    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text