Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/python_slugify-8.0.1-py3.8.egg/slugify/slugify.py: 60%

91 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-03 06:25 +0000

1import re 

2import sys 

3import unicodedata 

4from html.entities import name2codepoint 

5 

6try: 

7 import unidecode 

8except ImportError: 

9 import text_unidecode as unidecode 

10 

11__all__ = ['slugify', 'smart_truncate'] 

12 

13 

# Named HTML character references such as "&amp;"; group 1 is the entity name.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))

# Decimal numeric character references such as "&#381;"; group 1 is the digits.
DECIMAL_PATTERN = re.compile(r'&#(\d+);')

# Hexadecimal numeric character references such as "&#x17D;"; group 1 is the hex digits.
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')

# One or more consecutive apostrophes.
QUOTE_PATTERN = re.compile(r'[\']+')

# Runs of anything that is not an ASCII letter, an ASCII digit or a hyphen.
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')

# Runs of non-word characters or underscores (used when unicode is allowed).
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')

# Two or more consecutive hyphens.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')

# A comma sitting directly between two digits, e.g. the one in "1,000".
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')

# Separator used internally while the slug is being built.
DEFAULT_SEPARATOR = '-'

23 

24 

def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False):
    """
    Truncate a string.
    :param string (str): string for modification
    :param max_length (int): output string length; zero, negative or None means no limit
    :param word_boundary (bool): if True, truncate on whole words instead of cutting mid-word
    :param save_order (bool): if True then word order of output string is like input string
        (stop at the first word that does not fit instead of trying later, shorter words)
    :param separator (str): separator between words
    :return (str): the truncated string with leading/trailing separator characters stripped
    """

    string = string.strip(separator)

    # Only a positive limit requests truncation. A negative limit used to fall
    # through to ``string[:max_length]`` and silently chop the end of the text.
    if not max_length or max_length < 0:
        return string

    if len(string) < max_length:
        return string

    if not word_boundary:
        return string[:max_length].strip(separator)

    if separator not in string:
        return string[:max_length]

    truncated = ''
    for word in string.split(separator):
        if not word:
            # empty pieces come from consecutive separators
            continue
        next_len = len(truncated) + len(word)
        if next_len < max_length:
            truncated += word + separator
        elif next_len == max_length:
            truncated += word
            break
        elif save_order:
            # the word does not fit and later words must not jump ahead of it
            break

    if not truncated:  # pragma: no cover
        # not even the first word fits: fall back to a hard cut
        truncated = string[:max_length]
    return truncated.strip(separator)

65 

66 

def _decode_char_reference(match, base):
    """Return the character for one numeric HTML reference, or the raw reference if it is invalid."""
    try:
        return chr(int(match.group(1), base))
    except (ValueError, OverflowError):
        # Code point outside what chr() accepts: keep this reference as written
        # so the other, valid references in the text are still decoded.
        return match.group(0)


def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
            separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
            replacements=(), allow_unicode=False):
    """
    Make a slug from the given text.
    :param text (str): initial text; bytes are decoded as UTF-8, ignoring undecodable bytes
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length; only values greater than 0 truncate.
        Truncation happens before ``separator`` is substituted, so a separator longer
        than one character can make the result longer than max_length.
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (str): the slug
    """

    # ensure text is unicode -- done first so bytes input also works with the
    # str-based replacement rules applied below
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # the rules are applied twice (before and after cleanup), so materialise
    # them once in case a one-shot iterator was passed
    replacements = list(replacements) if replacements else []

    # user-specific replacements
    for old, new in replacements:
        text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # decode unicode
    if not allow_unicode:
        text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        text = DECIMAL_PATTERN.sub(lambda m: _decode_char_reference(m, 10), text)

    # hexadecimal character reference
    if hexadecimal:
        text = HEX_PATTERN.sub(lambda m: _decode_char_reference(m, 16), text)

    # translate
    text = unicodedata.normalize('NFKC' if allow_unicode else 'NFKD', text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers: drop commas between digits ("1,000" -> "1000")
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        # build the lookup once: constant-time membership tests, and safe when
        # stopwords is an iterator that can only be consumed a single time
        if lowercase:
            blocked = {s.lower() for s in stopwords}
        else:
            blocked = set(stopwords)
        words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in blocked]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    for old, new in replacements:
        text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text