Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdown

1"""Utilities for parsing source text

2"""

3import html

4import re

5from typing import Any

7from .entities import entities

def charCodeAt(src: str, pos: int) -> Any:
    """Return the Unicode code point of the character at ``pos`` in ``src``.

    Compatibility helper named after the JavaScript string method.
    Unlike JavaScript, which yields ``NaN`` for a missing character,
    this function returns ``None``.

    :param src: the string to index
    :param pos: zero-based index of the desired character; normal Python
        indexing applies, so a negative value counts from the end
    :return: ``ord()`` of the character, or ``None`` when indexing raises
        ``IndexError``
    """
    try:
        char = src[pos]
    except IndexError:
        return None
    return ord(char)

# Merge objects
#
def assign(obj):
    """Merge objects into ``obj`` (not implemented in this port).

    The JavaScript reference took ``obj`` followed by any number of source
    objects, skipped falsy sources, threw ``TypeError`` for a source that
    was not an object, copied every key of each source onto ``obj`` and
    returned ``obj``.  None of that has been ported.

    :raises NotImplementedError: always
    """
    raise NotImplementedError

def arrayReplaceAt(src: list, pos: int, newElements: list) -> list:
    """Return a new list with the item at ``pos`` replaced by ``newElements``.

    ``src`` is not modified; the result is built from slices of it.
    Handy when one token has to be swapped for several.

    :param src: the original list
    :param pos: index of the single element to drop
    :param newElements: items spliced in at that position
    :return: the elements before ``pos``, then ``newElements``, then the
        elements after ``pos``
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after

55######################################################################

def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced by a numeric entity.

    :param c: the candidate code point
    :return: ``False`` for the rejected ranges listed below, for any value
        whose low 16 bits are ``0xFFFE``/``0xFFFF``, and for values above
        ``0x10FFFF``; ``True`` otherwise
    """
    rejected_ranges = (
        (0xD800, 0xDFFF),  # broken sequence
        (0xFDD0, 0xFDEF),  # never used
        # control codes
        (0x00, 0x08),
        (0x0B, 0x0B),
        (0x0E, 0x1F),
        (0x7F, 0x9F),
    )
    if any(low <= c <= high for low, high in rejected_ranges):
        return False
    if (c & 0xFFFF) in (0xFFFF, 0xFFFE):
        return False
    # anything left is valid unless it is out of range
    return c <= 0x10FFFF

def fromCodePoint(c: int) -> str:
    """Return the one-character string for code point ``c``.

    The JavaScript original needed two string characters for code points
    above ``0xFFFF``; a Python 3 string holds any code point in a single
    character, so this is a direct ``chr()`` call.

    :param c: the code point to convert
    :return: the corresponding character
    """
    character = chr(c)
    return character

# A backslash followed by one ASCII punctuation character; group 1 is the
# escaped character.
UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
# Either a backslash escape (group 1) or an entity reference ``&...;``
# (group 2 holds the text between "&" and ";").
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Body of a numeric entity: "#" plus 1-8 decimal digits, or "#x" plus 1-8
# hex digits.  The trailing "$" is required: UNESCAPE_ALL_RE can capture
# names such as "#12ab", and without the anchor those passed this test and
# then made int() raise ValueError.
DIGITAL_ENTITY_TEST_RE = re.compile(
    r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))$", re.IGNORECASE
)

100

def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve an HTML entity reference to the character(s) it denotes.

    Named entities are looked up in the ``entities`` table.  Numeric
    entities are decoded from their decimal or hexadecimal digits::

        &#35;  -> #
        &#x23; -> #

    :param match: the complete matched text, ``&`` and ``;`` included;
        returned untouched when the entity cannot be resolved
    :param name: the text between ``&`` and ``;``
    :return: the replacement text, or ``match`` for an unknown name, a
        malformed numeric body, or a code point rejected by
        ``isValidEntityCode``
    """
    if name in entities:
        return entities[name]

    if name.startswith("#"):
        # fullmatch, not search: a body with trailing junk ("#12ab", "#x1g")
        # must be rejected here, otherwise int() below raises ValueError.
        numeric = DIGITAL_ENTITY_TEST_RE.fullmatch(name)
        if numeric:
            digits = numeric.group(1)
            if digits[0].lower() == "x":
                code = int(digits[1:], 16)
            else:
                code = int(digits, 10)
            if isValidEntityCode(code):
                return fromCodePoint(code)

    return match

120

121

122# def replaceEntities(string):

123# if (string.indexOf('&') < 0):

124# return string

125# return string.replace(ENTITY_RE, replaceEntityPattern)

126

127

def unescapeMd(string: str) -> str:
    """Remove markdown backslash escapes (not implemented in this port).

    The commented-out draft this replaces substituted each
    ``UNESCAPE_MD_RE`` match with its captured character; it was never
    finished.

    :param string: text that may contain backslash escapes
    :raises NotImplementedError: always
    """
    raise NotImplementedError

133

134

def unescapeAll(string: str) -> str:
    """Undo backslash escapes and resolve entity references in ``string``.

    :param string: the text to process
    :return: ``string`` itself when it contains neither a backslash nor an
        ampersand; otherwise a copy in which every ``UNESCAPE_ALL_RE``
        match is replaced: an escape by its bare character, an entity by
        the result of ``replaceEntityPattern``
    """
    # Fast path: neither kind of match is possible without one of these.
    if not ("\\" in string or "&" in string):
        return string

    def _substitute(found):
        # Group 1 is set for a backslash escape, group 2 for an entity.
        bare_char = found.group(1)
        if bare_char:
            return bare_char
        return replaceEntityPattern(found.group(), found.group(2))

    return UNESCAPE_ALL_RE.sub(_substitute, string)

146

147

# Characters that a backslash may escape, written so the string can be
# dropped straight into a regex character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured as group 1).
ESCAPE_CHAR = re.compile(rf"\\([{ESCAPABLE}])")


def stripEscape(string: str) -> str:
    """Drop the backslash from every backslash-escaped character.

    Only characters listed in ``ESCAPABLE`` count as escaped; a backslash
    in front of anything else is left in place.

    :param string: the text to process
    :return: the text with each escape replaced by its bare character
    """
    return ESCAPE_CHAR.sub(lambda found: found.group(1), string)

155

156

157# //////////////////////////////////////////////////////////////////////////////

158

159# TODO This section changed quite a lot, should re-check

160

161# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")

162# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")

163# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')

164

165

166# def escapeHtml(string: str):

167

168# if HTML_ESCAPE_REPLACE_RE.search(string):

169

170# string = UNESCAPE_HTML_RE.sub("&", string)

# string = ESCAPE_AND_HTML.sub("&amp;", string)

# for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():

173# string = string.replace(k, v)

174

175# return string

176

177

def escapeHtml(raw: str) -> str:
    """Escape ``&``, ``<``, ``>`` and ``"`` for inclusion in HTML.

    Apostrophes are left as they are.

    :param raw: the text to escape
    :return: the escaped text
    """
    # return html.escape(html.unescape(raw)).replace("&#x27;", "'")
    # html.escape() also rewrites "'" as "&#x27;"; turn that back.  A literal
    # "&#x27;" already present in ``raw`` is not affected, because its "&"
    # has become "&amp;" by the time replace() runs.
    return html.escape(raw).replace("&#x27;", "'")

181

182

183# //////////////////////////////////////////////////////////////////////////////

184

# Characters that carry special meaning in a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape the regex metacharacters in ``string``.

    :param string: literal text destined for a regular expression
    :return: the text with every ``REGEXP_ESCAPE_RE`` match prefixed by a
        backslash
    """
    # "$&" is JavaScript's spelling of "the whole match"; Python's re module
    # would insert it literally.  The Python equivalent is "\g<0>", preceded
    # here by an escaped backslash.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)

191

192

193# //////////////////////////////////////////////////////////////////////////////

194

195

def isSpace(code: object) -> bool:
    """Tell whether ``code`` is the code point of a tab or a space.

    :param code: a character code
    :return: ``True`` for ``0x09`` or ``0x20``, ``False`` for anything else
    """
    tab_and_space = {0x09, 0x20}
    return code in tab_and_space

198

199

# Individual whitespace code points; the contiguous block 0x2000-0x200A is
# tested as a range inside isWhiteSpace().
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]

    :param code: the code point to classify
    :return: ``True`` when ``code`` lies in ``0x2000``-``0x200A`` or is a
        member of ``MD_WHITESPACE``
    """
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE

220

221

222# //////////////////////////////////////////////////////////////////////////////

223

# The pattern text is split over adjacent raw literals for readability only;
# they concatenate to a single expression.
UNICODE_PUNCT_RE = re.compile(
    # One character class covering code points up to U+FFFF.
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}"
    r"\xA1\xA7\xAB\xB6\xB7\xBB\xBF"
    r"\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4"
    r"\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4"
    r"\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E"
    r"\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4"
    r"\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA"
    r"\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED"
    r"\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945"
    r"\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF"
    r"\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3"
    r"\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E"
    r"\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF"
    r"\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70"
    r"\u2E00-\u2E2E\u2E30-\u2E4E"
    r"\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB"
    r"\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC"
    r"\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB"
    r"\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B"
    r"\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]"
    # Alternatives written as two-unit sequences in the U+D800-U+DFFF range.
    r"|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]"
    r"|\uD801\uDD6F"
    r"|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]"
    r"|\uD803[\uDF55-\uDF59]"
    r"|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]"
    r"|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]"
    r"|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]"
    r"|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]"
    r"|\uD809[\uDC70-\uDC74]"
    r"|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]"
    r"|\uD81B[\uDE97-\uDE9A]"
    r"|\uD82F\uDC9F"
    r"|\uD836[\uDE87-\uDE8B]"
    r"|\uD83A[\uDD5E\uDD5F]"  # noqa: E501
)


# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Tell whether ``ch`` contains a character matched by ``UNICODE_PUNCT_RE``.

    :param ch: the text to test (a single character in normal use)
    :return: ``True`` when the pattern is found anywhere in ``ch``
    """
    return bool(UNICODE_PUNCT_RE.search(ch))

232

233

# Code points of the 32 ASCII punctuation characters:
# 0x21-0x2F, 0x3A-0x40, 0x5B-0x60 and 0x7B-0x7E.
MD_ASCII_PUNCT = {
    ord(symbol) for symbol in r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
}


def isMdAsciiPunct(ch: int) -> bool:
    """Tell whether code point ``ch`` is a Markdown ASCII punctuation character.

    The characters are::

        ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \\ ] ^ _ ` { | } ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    This is not the same test as the unicode punctuation check, which
    lacks some characters in the ascii range.

    :param ch: a character code (not a one-character string)
    :return: ``True`` when ``ch`` is in ``MD_ASCII_PUNCT``
    """
    return ch in MD_ASCII_PUNCT

283

284

def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    :param string: the raw label text
    :return: the label with surrounding whitespace removed, inner
        whitespace runs collapsed to one space, and case differences
        folded away (the result is uppercase)
    """
    # Trim and collapse whitespace.
    collapsed = " ".join(string.split())

    # Notes carried over from the JavaScript implementation:
    #
    # Lowercasing and then uppercasing removes the differences between
    # letter variants that a single case conversion leaves behind.  A lone
    # lowercase pass misses 125 code points; a lone uppercase pass misses 6
    # (İ, ϴ, ẞ, Ω, K, Å are already uppercase but have differently
    # uppercased versions).
    #
    # Example with the Greek letter theta: uppercase U+0398 (Θ) and
    # U+03F4 (ϴ), lowercase U+03B8 (θ) and U+03D1 (ϑ).
    #
    # Unicode entries:
    # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # A case-insensitive comparison should treat all four as equal, but
    # lowercasing leaves ϑ alone and uppercasing leaves ϴ alone.  Doing
    # both, lower first, maps all of them to U+0398.
    #
    # This is equivalent to unicode case folding; unicode normalization is
    # a different step that is not required here.
    #
    # The JavaScript code ends on uppercase because the label is later
    # used as an object key, and uppercase avoids a clash with
    # Object.prototype members such as `__proto__`.  The same order is
    # kept here.
    lowered = collapsed.lower()
    return lowered.upper()

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdown_it/common/utils.py: 92%

83 statements