Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/wcwidth/wcwidth.py: 24%

1"""

This is a python implementation of wcwidth() and wcswidth().

https://github.com/jquast/wcwidth

from Markus Kuhn's C code, retrieved from:

    http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c

This is an implementation of wcwidth() and wcswidth() (defined in
IEEE Std 1003.1-2001) for Unicode.

http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html

In fixed-width output devices, Latin characters all occupy a single
"cell" position of equal width, whereas ideographic CJK characters
occupy two such cells. Interoperability between terminal-line
applications and (teletype-style) character terminals using the
UTF-8 encoding requires agreement on which character should advance
the cursor by how many cell positions. No established formal
standards exist at present on which Unicode character shall occupy
how many cell positions on character terminals. These routines are
a first attempt at defining such behavior based on simple rules
applied to data provided by the Unicode Consortium.

For some graphical characters, the Unicode standard explicitly
defines a character-cell width via the definition of the East Asian
FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
In all these cases, there is no ambiguity about which width a
terminal shall use. For characters in the East Asian Ambiguous (A)
class, the width choice depends purely on a preference of backward
compatibility with either historic CJK or Western practice.
Choosing single-width for these characters is easy to justify as
the appropriate long-term solution, as the CJK practice of
displaying these characters as double-width comes from historic
implementation simplicity (8-bit encoded characters were displayed
single-width and 16-bit ones double-width, even for Greek,
Cyrillic, etc.) and not any typographic considerations.

Much less clear is the choice of width for the Not East Asian
(Neutral) class. Existing practice does not dictate a width for any
of these characters. It would nevertheless make sense
typographically to allocate two character cells to characters such
as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
represented adequately with a single-width glyph. The following
routines at present merely assign a single-cell width to all
neutral characters, in the interest of simplicity. This is not
entirely satisfactory and should be reconsidered before
establishing a formal standard in this area. At the moment, the
decision which Not East Asian (Neutral) characters should be
represented by double-width glyphs cannot yet be answered by
applying a simple rule from the Unicode database content. Setting
up a proper standard for the behavior of UTF-8 character terminals
will require a careful analysis not only of each Unicode character,
but also of each presentation form, something the author of these
routines has avoided doing so far.

http://www.unicode.org/unicode/reports/tr11/

Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c

62"""

63from __future__ import division

65# std imports

66import os

67import sys

68import warnings

70# local

71from .table_wide import WIDE_EASTASIAN

72from .table_zero import ZERO_WIDTH

73from .unicode_versions import list_versions

75try:

76 # std imports

77 from functools import lru_cache

78except ImportError:

79 # lru_cache was added in Python 3.2

80 # 3rd party

81 from backports.functools_lru_cache import lru_cache

# global cache slot, initially empty
# NOTE(review): nothing in the visible part of this module reads or
# assigns it again -- confirm whether it is still needed.
_UNICODE_CMPTABLE = None

# True when the running interpreter is Python 3 or newer.
_PY3 = sys.version_info[0] >= 3

# NOTE: this table is maintained by hand.  Apart from the general category
# code Cf there is nothing that identifies these code points, and Cf alone
# is not sufficient because some Cf characters have a non-zero width.
# A few Cc, Mn, Zl, and Zp characters are included as well.
ZERO_WIDTH_CF = {
    0x0000,  # Null (Cc)
    0x034F,  # Combining grapheme joiner (Mn)
    0x200B,  # Zero width space
    0x200C,  # Zero width non-joiner
    0x200D,  # Zero width joiner
    0x200E,  # Left-to-right mark
    0x200F,  # Right-to-left mark
    0x2028,  # Line separator (Zl)
    0x2029,  # Paragraph separator (Zp)
    0x202A,  # Left-to-right embedding
    0x202B,  # Right-to-left embedding
    0x202C,  # Pop directional formatting
    0x202D,  # Left-to-right override
    0x202E,  # Right-to-left override
    0x2060,  # Word joiner
    0x2061,  # Function application
    0x2062,  # Invisible times
    0x2063,  # Invisible separator
}

112

113

def _bisearch(ucs, table):
    """
    Auxiliary function for binary search in interval table.

    :arg int ucs: Ordinal value of unicode character.
    :arg list table: List of starting and ending ranges of ordinal values,
        in form of ``[(start, end), ...]``.  Both ends of each range are
        inclusive.  The ranges must be in ascending order for the binary
        search to give a meaningful answer.  An empty table matches
        nothing.
    :rtype: int
    :returns: 1 if ordinal value ucs is found within lookup table, else 0.
    """
    # An empty table cannot contain anything; without this guard the
    # boundary check below would raise IndexError on ``table[0]``.
    if not table:
        return 0

    lbound = 0
    ubound = len(table) - 1

    # Fast rejection: below the first range or above the last one.
    if ucs < table[0][0] or ucs > table[ubound][1]:
        return 0

    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        if ucs > table[mid][1]:
            # value lies past the end of the middle range
            lbound = mid + 1
        elif ucs < table[mid][0]:
            # value lies before the start of the middle range
            ubound = mid - 1
        else:
            # start <= ucs <= end
            return 1

    return 0

139

140

@lru_cache(maxsize=1000)
def wcwidth(wc, unicode_version='auto'):
    r"""
    Given one Unicode character, return its printable length on a terminal.

    :param str wc: A single Unicode character.
    :param str unicode_version: A Unicode version number, such as
        ``'6.0.0'``, the list of available version levels may be
        listed by pairing function :func:`list_versions`.

        Any version string may be specified without error -- the nearest
        matching version is selected.  When ``auto`` (default), the
        Environment Variable ``UNICODE_VERSION`` is used if defined,
        otherwise the highest Unicode version level is used.
    :return: The width, in cells, necessary to display the character of
        Unicode string character, ``wc``.  Returns 0 if the ``wc`` argument has
        no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is
        not printable, or has an indeterminate effect on the terminal, such as
        a control character.  Otherwise, the number of column positions the
        character occupies on a graphic terminal (1 or 2) is returned.
    :rtype: int

    The following have a column width of -1:

        - C0 control characters (``U+0001`` through ``U+001F``).

        - DEL and C1 control characters (``U+007F`` through ``U+009F``).

    The following have a column width of 0:

        - Non-spacing and enclosing combining characters (general
          category code Mn or Me in the Unicode database).

        - NULL (``U+0000``).

        - COMBINING GRAPHEME JOINER (``U+034F``).

        - ZERO WIDTH SPACE (``U+200B``) *through*
          RIGHT-TO-LEFT MARK (``U+200F``).

        - LINE SEPARATOR (``U+2028``) *and*
          PARAGRAPH SEPARATOR (``U+2029``).

        - LEFT-TO-RIGHT EMBEDDING (``U+202A``) *through*
          RIGHT-TO-LEFT OVERRIDE (``U+202E``).

        - WORD JOINER (``U+2060``) *through*
          INVISIBLE SEPARATOR (``U+2063``).

    The following have a column width of 1:

        - SOFT HYPHEN (``U+00AD``).

        - All remaining characters, including all printable ISO 8859-1
          and WGL4 characters, Unicode control characters, etc.

    The following have a column width of 2:

        - Spacing characters in the East Asian Wide (W) or East Asian
          Full-width (F) category as defined in Unicode Technical
          Report #11 have a column width of 2.

        - Some kinds of Emoji or symbols.
    """
    ucs = ord(wc)

    # Hand-maintained zero-width code points are checked first, so that
    # NUL (U+0000) yields 0 rather than falling into the C0 range below.
    if ucs in ZERO_WIDTH_CF:
        return 0

    # C0 control characters, DEL, and C1 control characters
    if ucs < 32 or 0x07F <= ucs < 0x0A0:
        return -1

    _unicode_version = _wcmatch_version(unicode_version)

    # combining characters with zero width
    if _bisearch(ucs, ZERO_WIDTH[_unicode_version]):
        return 0

    # "Wide East Asian" (and emojis): _bisearch() returns 1 for a match,
    # giving a width of 2; every other character has a width of 1.
    return 1 + _bisearch(ucs, WIDE_EASTASIAN[_unicode_version])

223

224

def wcswidth(pwcs, n=None, unicode_version='auto'):
    """
    Given a unicode string, return its printable length on a terminal.

    :param str pwcs: Measure width of given unicode string.
    :param int n: When ``n`` is None (default), measure the entire
        string, otherwise only the first ``n`` characters.
    :param str unicode_version: An explicit definition of the unicode version
        level to use for determination, may be ``auto`` (default), which uses
        the Environment Variable, ``UNICODE_VERSION`` if defined, or the latest
        available unicode version, otherwise.
    :rtype: int
    :returns: The width, in cells, necessary to display the first ``n``
        characters of the unicode string ``pwcs``.  Returns ``-1`` if
        a non-printable character is encountered.
    """
    # pylint: disable=C0103
    #         Invalid argument name "n"

    total = 0
    # a stop of ``None`` slices through to the end of the string
    for char in pwcs[:n]:
        cells = wcwidth(char, unicode_version)
        if cells < 0:
            # a single non-printable character makes the whole width unknown
            return -1
        total += cells
    return total

253

254

@lru_cache(maxsize=128)
def _wcversion_value(ver_string):
    """
    Integer-mapped value of given dotted version string.

    :param str ver_string: Unicode version string, of form ``n.n.n``.
    :rtype: tuple(int)
    :returns: tuple of integers, one for each dot-separated component,
        ``tuple(int, [...])``.
    :raises ValueError: if any component cannot be converted by ``int()``.
    """
    return tuple(int(part) for part in ver_string.split('.'))

266

267

@lru_cache(maxsize=8)
def _wcmatch_version(given_version):
    """
    Return nearest matching supported Unicode version level.

    If an exact match is not determined, the nearest lowest version level is
    returned after a warning is emitted.  For example, given supported levels
    ``4.1.0`` and ``5.0.0``, and a version string of ``4.9.9``, then ``4.1.0``
    is selected and returned:

    >>> _wcmatch_version('4.9.9')
    '4.1.0'
    >>> _wcmatch_version('8.0')
    '8.0.0'
    >>> _wcmatch_version('1')
    '4.1.0'

    Results are memoized by ``lru_cache``, so for a repeated argument
    (including ``auto``) the environment is consulted only on the first,
    uncached call.

    :param str given_version: given version for compare, may be ``auto``
        (default), to select Unicode Version from Environment Variable,
        ``UNICODE_VERSION``. If the environment variable is not set, then the
        latest is used.
    :rtype: str
    :returns: unicode string, or non-unicode ``str`` type for python 2
        when given ``version`` is also type ``str``.
    """
    # Design note: the choice to return the same type that is given certainly
    # complicates it for python 2 str-type, but allows us to define an api
    # that uses 'string-type' for unicode version level definitions, so all
    # of our example code works with all versions of python.  That, along
    # with the string-to-numeric and comparisons of earliest, latest,
    # matching, or nearest, greatly complicates this function.
    _return_str = not _PY3 and isinstance(given_version, str)

    def _as_caller_type(version):
        # encode only when a python 2 byte ``str`` was given
        return version.encode() if _return_str else version

    unicode_versions = list_versions()
    if _return_str:
        unicode_versions = [ucs.encode() for ucs in unicode_versions]
    latest_version = unicode_versions[-1]

    if given_version in (u'auto', 'auto'):
        # defer to the environment, falling back to the latest version
        fallback = latest_version.encode() if _return_str else 'latest'
        given_version = os.environ.get('UNICODE_VERSION', fallback)

    if given_version in (u'latest', 'latest'):
        # default match, when given as 'latest', use the latest unicode
        # version specification level supported.
        return _as_caller_type(latest_version)

    if given_version in unicode_versions:
        # exact match, downstream has specified an explicit matching version
        # matching any value of list_versions().
        return _as_caller_type(given_version)

    # The user's version is not supported by ours. We return the newest
    # unicode version level that we support below their given value.
    try:
        cmp_given = _wcversion_value(given_version)
    except ValueError:
        # submitted value raises ValueError in int(), warn and use latest.
        warnings.warn("UNICODE_VERSION value, {given_version!r}, is invalid. "
                      "Value should be in form of `integer[.]+', the latest "
                      "supported unicode version {latest_version!r} has been "
                      "inferred.".format(given_version=given_version,
                                         latest_version=latest_version))
        return _as_caller_type(latest_version)

    # given version is less than any available version, return earliest
    # version.  A tuple that is a strict prefix of the earliest version's
    # tuple also compares lower and is handled by this branch.
    earliest_version = unicode_versions[0]
    cmp_earliest_version = _wcversion_value(earliest_version)

    if cmp_given <= cmp_earliest_version:
        # this probably isn't what you wanted, the oldest wcwidth.c you will
        # find in the wild is likely version 5 or 6, which we both support,
        # but it's better than not saying anything at all.
        warnings.warn("UNICODE_VERSION value, {given_version!r}, is lower "
                      "than any available unicode version. Returning lowest "
                      "version level, {earliest_version!r}".format(
                          given_version=given_version,
                          earliest_version=earliest_version))
        return _as_caller_type(earliest_version)

    # Walk each supported version together with its successor, returning
    # the highest level that is less than or equal to the given version.
    for current_version, next_version in zip(unicode_versions,
                                             unicode_versions[1:]):
        cmp_next_version = _wcversion_value(next_version)

        # Maybe our given version has less parts, as in tuple(8, 0), than the
        # next compare version tuple(8, 0, 0). Test for an exact match by
        # comparison of only the leading dotted piece(s): (8, 0) == (8, 0).
        if cmp_given == cmp_next_version[:len(cmp_given)]:
            return next_version

        # Or, if the next value is greater than our given support level
        # version, return the current value.  Even though it must be less
        # than the given value, it is our closest possible match.  That is,
        # 4.1 is returned for given 4.9.9, where 4.1 and 5.0 are available.
        if cmp_next_version > cmp_given:
            return current_version

    # no successor exceeded the given version: it is higher than anything
    # supported, so the latest version is used.
    return _as_caller_type(latest_version)