Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/python_slugify-8.0.1-py3.8.egg/slugify/slugify.py: 60%

1import re

2import sys

3import unicodedata

4from html.entities import name2codepoint

6try:

7 import unidecode

8except ImportError:

9 import text_unidecode as unidecode

11__all__ = ['slugify', 'smart_truncate']

# Named HTML character references such as ``&amp;``. The alternation is built
# from every entity name listed in ``html.entities.name2codepoint``.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))

# Numeric character references: decimal (``&#38;``) and hexadecimal (``&#x26;``).
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')

# One or more consecutive apostrophes.
QUOTE_PATTERN = re.compile(r'[\']+')

# Runs of characters that may not appear in a slug: anything but ASCII
# letters, digits and dashes (ASCII mode), or any non-word character plus the
# underscore (unicode mode).
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')

# Two or more dashes in a row.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')

# A comma sitting directly between two digits, e.g. the one in ``1,000``.
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')

# Separator used internally while the slug is being built.
DEFAULT_SEPARATOR = '-'

def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False):
    """
    Shorten a string to roughly ``max_length`` characters.

    :param string (str): text to shorten; leading/trailing separators are stripped first
    :param max_length (int): length limit; a falsy value disables truncation
    :param word_boundary (bool): if True, keep only whole words instead of cutting mid-word
    :param separator (str): separator between words
    :param save_order (bool): if True, stop at the first word that does not fit;
        if False, skip that word and keep trying the following ones
    :return (str): the shortened string without leading/trailing separators
    """
    trimmed = string.strip(separator)

    # No limit requested, or the text is already shorter than the limit.
    if not max_length or len(trimmed) < max_length:
        return trimmed

    # Plain character cut.
    if not word_boundary:
        return trimmed[:max_length].strip(separator)

    # A single word cannot be split on a boundary: cut it by characters.
    if separator not in trimmed:
        return trimmed[:max_length]

    pieces = []
    used = 0  # length of everything collected so far, separators included
    for token in trimmed.split(separator):
        if not token:
            # consecutive separators produce empty tokens
            continue
        candidate = used + len(token)
        if candidate == max_length:
            # exact fit: take the word and stop
            pieces.append('{}'.format(token))
            break
        if candidate < max_length:
            pieces.append('{}{}'.format(token, separator))
            used = candidate + len(separator)
        elif save_order:
            # word is too long and order must be preserved: stop here
            break

    # Fall back to a character cut when not a single word fitted.
    result = ''.join(pieces) or trimmed[:max_length]
    return result.strip(separator)

def _numeric_reference_to_char(match, base):
    """Return the character for a numeric HTML reference, or the reference text itself if invalid."""
    try:
        return chr(int(match.group(1), base))
    except (ValueError, OverflowError):
        # The number is not a valid code point (or cannot be converted):
        # leave this reference as-is so the other references still convert.
        return match.group(0)


def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
            separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
            replacements=(), allow_unicode=False):
    """
    Make a slug from the given text.

    :param text (str): initial text; non-str input is decoded as UTF-8, ignoring errors
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): lowercase the slug; set to False to keep the original case
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (str):
    """

    # ensure text is unicode before any str-based processing, so that the
    # replacement rules below can be applied to decoded bytes input as well
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # the rules are applied twice (before and after cleaning); materialise
    # them so a one-shot iterator is not exhausted by the first pass
    replacements = tuple(replacements) if replacements else ()

    # user-specific replacements
    for old, new in replacements:
        text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # decode unicode
    if not allow_unicode:
        text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character reference (invalid references are left untouched)
    if decimal:
        text = DECIMAL_PATTERN.sub(lambda m: _numeric_reference_to_char(m, 10), text)

    # hexadecimal character reference (invalid references are left untouched)
    if hexadecimal:
        text = HEX_PATTERN.sub(lambda m: _numeric_reference_to_char(m, 16), text)

    # translate
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        # build the lookup once: membership tests are O(1) and a one-shot
        # iterable is not consumed by the first word that is checked
        if lowercase:
            excluded = {s.lower() for s in stopwords}
        else:
            excluded = set(stopwords)
        words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in excluded]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    for old, new in replacements:
        text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text