Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/python_slugify-8.0.4-py3.11.egg/slugify/slugify.py: 23%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

92 statements  

1from __future__ import annotations 

2 

3import re 

4import unicodedata 

5from collections.abc import Iterable 

6from html.entities import name2codepoint 

7 

8try: 

9 import unidecode 

10except ImportError: 

11 import text_unidecode as unidecode 

12 

13__all__ = ['slugify', 'smart_truncate'] 

14 

15 

# Named HTML entity reference such as "&amp;"; group 1 captures the entity
# name, built as an alternation of every key in html.entities.name2codepoint.
CHAR_ENTITY_PATTERN = re.compile('&(' + '|'.join(name2codepoint) + ');')

# Decimal numeric character reference such as "&#65;"; group 1 is the digits.
DECIMAL_PATTERN = re.compile(r"&#(\d+);")

# Hexadecimal numeric character reference such as "&#x41;"; group 1 is the
# hex digits (without the leading "x").
HEX_PATTERN = re.compile(r"&#x([\da-fA-F]+);")

# One or more consecutive apostrophes.
QUOTE_PATTERN = re.compile(r"[\']+")

# Runs of anything that is not an ASCII letter, an ASCII digit or a dash.
DISALLOWED_CHARS_PATTERN = re.compile(r"[^-a-zA-Z0-9]+")

# Runs of non-word characters or underscores (used when unicode is allowed).
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r"[\W_]+")

# Two or more dashes in a row.
DUPLICATE_DASH_PATTERN = re.compile(r"-{2,}")

# A comma sitting between two digits, e.g. the one in "1,000".
NUMBERS_PATTERN = re.compile(r"(?<=\d),(?=\d)")

# Separator used internally while building a slug.
DEFAULT_SEPARATOR = "-"

25 

26 

def smart_truncate(
    string: str,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = " ",
    save_order: bool = False,
) -> str:
    """
    Truncate a string.
    :param string (str): string for modification
    :param max_length (int): maximum output length; 0 or a negative value means no truncation
    :param word_boundary (bool): if True, cut only between whole words instead of mid-word
    :param save_order (bool): if True then word order of output string is like input string
        (stop at the first word that does not fit instead of skipping it)
    :param separator (str): separator between words
    :return (str): the truncated string
    """

    string = string.strip(separator)

    # A non-positive limit means "no limit". Without this guard a negative
    # value would fall through to string[:max_length] and silently drop
    # characters from the end of the string.
    if max_length <= 0:
        return string

    if len(string) < max_length:
        return string

    if not word_boundary:
        return string[:max_length].strip(separator)

    # No usable word boundaries: either the separator is empty (str.split
    # would raise ValueError on it) or it does not occur in the string.
    if not separator or separator not in string:
        return string[:max_length]

    kept = []
    # Length the joined result would have if every kept word were followed
    # by a separator; this is what the next candidate word is measured against.
    used = 0
    for word in string.split(separator):
        if not word:
            continue
        candidate_len = used + len(word)
        if candidate_len < max_length:
            kept.append(word)
            used = candidate_len + len(separator)
        elif candidate_len == max_length:
            # Exact fit: nothing more can be added after this word.
            kept.append(word)
            break
        elif save_order:
            # Skipping this word would reorder the output, so stop here.
            break
        # Otherwise the word is too long: skip it and try the following ones.

    if not kept:  # pragma: no cover
        # Not a single whole word fits; fall back to a plain cut.
        return string[:max_length].strip(separator)
    return separator.join(kept).strip(separator)

73 

74 

def slugify(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
) -> str:
    """
    Make a slug from the given text.
    :param text (str): initial text; non-str input is decoded as UTF-8, ignoring errors
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length; values <= 0 disable truncation
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount; compared case-insensitively when lowercase is True
    :param regex_pattern (str): regex pattern (string or compiled) for disallowed characters;
        every match is replaced by a dash before the final separator is applied
    :param lowercase (bool): lowercase the output; set to False to preserve case
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']];
        applied once before and once after the slug is built
    :param allow_unicode (bool): allow unicode characters
    :return (str): the slug
    """

    # The rules are applied twice (before and after cleanup), so materialize
    # them once; otherwise a one-shot iterable would be empty on the second pass.
    replacement_rules = [tuple(rule) for rule in replacements]

    # ensure text is unicode -- done before the replacements so that bytes
    # input does not hit str.replace/bytes.replace type mismatches
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # user-specific replacements
    for old, new in replacement_rules:
        text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # normalize text, convert to unicode if required
    normal_form = 'NFKC' if allow_unicode else 'NFKD'
    text = unicodedata.normalize(normal_form, text)
    if not allow_unicode:
        text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        try:
            text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
        except (ValueError, OverflowError):
            # A reference outside the valid code point range (or too large to
            # convert) leaves the text untouched for this pass -- best effort.
            pass

    # hexadecimal character reference
    if hexadecimal:
        try:
            text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
        except (ValueError, OverflowError):
            # Same best-effort policy as for decimal references.
            pass

    # re normalize text (decoded references may have introduced new characters)
    text = unicodedata.normalize(normal_form, text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords -- build the lookup set once so that one-shot
    # iterables work and each membership test is O(1)
    if lowercase:
        excluded_words = {word.lower() for word in stopwords}
    else:
        excluded_words = set(stopwords)
    if excluded_words:
        text = DEFAULT_SEPARATOR.join(
            word for word in text.split(DEFAULT_SEPARATOR) if word not in excluded_words
        )

    # finalize user-specific replacements
    for old, new in replacement_rules:
        text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    # the slug is built with the default dash; swap in the caller's separator last
    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text