Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/wcwidth/wcwidth.py: 19%
100 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-20 06:09 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-20 06:09 +0000
1"""
This is a python implementation of wcwidth() and wcswidth().

https://github.com/jquast/wcwidth

from Markus Kuhn's C code, retrieved from:

    http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
This is an implementation of wcwidth() and wcswidth() (defined in
IEEE Std 1003.1-2001) for Unicode.

http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
In fixed-width output devices, Latin characters all occupy a single
"cell" position of equal width, whereas ideographic CJK characters
occupy two such cells. Interoperability between terminal-line
applications and (teletype-style) character terminals using the
UTF-8 encoding requires agreement on which character should advance
the cursor by how many cell positions. No established formal
standards exist at present on which Unicode character shall occupy
how many cell positions on character terminals. These routines are
a first attempt at defining such behavior based on simple rules
applied to data provided by the Unicode Consortium.
For some graphical characters, the Unicode standard explicitly
defines a character-cell width via the definition of the East Asian
Fullwidth (F), Wide (W), Halfwidth (H), and Narrow (Na) classes.
In all these cases, there is no ambiguity about which width a
terminal shall use. For characters in the East Asian Ambiguous (A)
class, the width choice depends purely on a preference of backward
compatibility with either historic CJK or Western practice.
Choosing single-width for these characters is easy to justify as
the appropriate long-term solution, as the CJK practice of
displaying these characters as double-width comes from historic
implementation simplicity (8-bit encoded characters were displayed
single-width and 16-bit ones double-width, even for Greek,
Cyrillic, etc.) and not any typographic considerations.
Much less clear is the choice of width for the Not East Asian
(Neutral) class. Existing practice does not dictate a width for any
of these characters. It would nevertheless make sense
typographically to allocate two character cells to characters such
as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
represented adequately with a single-width glyph. The following
routines at present merely assign a single-cell width to all
neutral characters, in the interest of simplicity. This is not
entirely satisfactory and should be reconsidered before
establishing a formal standard in this area. At the moment, the
question of which Not East Asian (Neutral) characters should be
represented by double-width glyphs cannot yet be answered by
applying a simple rule from the Unicode database content. Setting
up a proper standard for the behavior of UTF-8 character terminals
will require a careful analysis not only of each Unicode character,
but also of each presentation form, something the author of these
routines has so far avoided doing.
http://www.unicode.org/unicode/reports/tr11/

Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
62"""
63from __future__ import division
65# std imports
66import os
67import sys
68import warnings
70# local
71from .table_vs16 import VS16_NARROW_TO_WIDE
72from .table_wide import WIDE_EASTASIAN
73from .table_zero import ZERO_WIDTH
74from .unicode_versions import list_versions
76try:
77 # std imports
78 from functools import lru_cache
79except ImportError:
80 # lru_cache was added in Python 3.2
81 # 3rd party
82 from backports.functools_lru_cache import lru_cache
# Interpreter flag: True when running under Python 3 or later.  It is read by
# _wcmatch_version() to decide whether a Python 2 byte ``str`` must be returned.
_PY3 = sys.version_info[0] >= 3
def _bisearch(ucs, table):
    """
    Auxiliary function for binary search in interval table.

    :arg int ucs: Ordinal value of unicode character.
    :arg list table: List of starting and ending ranges of ordinal values,
        in form of ``[(start, end), ...]``.  Both ends of each range are
        inclusive, and the ranges must be sorted in ascending order for the
        binary search to be correct.  An empty table matches nothing.
    :rtype: int
    :returns: 1 if ordinal value ucs is found within lookup table, else 0.
    """
    # An empty table contains no ranges; return before indexing its ends.
    if not table:
        return 0

    lbound = 0
    ubound = len(table) - 1

    # Fast rejection of any value outside of the span covered by the table.
    if ucs < table[0][0] or ucs > table[ubound][1]:
        return 0

    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        if ucs > table[mid][1]:
            # value lies above the middle range, discard the lower half
            lbound = mid + 1
        elif ucs < table[mid][0]:
            # value lies below the middle range, discard the upper half
            ubound = mid - 1
        else:
            # start <= ucs <= end of the middle range
            return 1

    return 0
@lru_cache(maxsize=1000)
def wcwidth(wc, unicode_version='auto'):
    r"""
    Given one Unicode character, return its printable length on a terminal.

    :param str wc: A single Unicode character.
    :param str unicode_version: A Unicode version number, such as
        ``'6.0.0'``. A list of version levels supported by wcwidth
        is returned by :func:`list_versions`.

        Any version string may be specified without error -- the nearest
        matching version is selected.  When ``'auto'`` (default), the level
        named by the ``UNICODE_VERSION`` environment variable is used if it
        is set, otherwise the highest supported Unicode version level.
    :return: The width, in cells, necessary to display the character of
        Unicode string character, ``wc``.  Returns 0 if the ``wc`` argument has
        no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is
        not printable, or has an indeterminate effect on the terminal, such as
        a control character.  Otherwise, the number of column positions the
        character occupies on a graphic terminal (1 or 2) is returned.
    :rtype: int

    See :ref:`Specification` for details of cell measurement.
    """
    # an empty string is measured as though it were NUL
    ucs = ord(wc) if wc else 0

    # small optimization: early return of 1 for printable ASCII, this provides
    # approximately 40% performance improvement for mostly-ascii documents, with
    # less than 1% impact to others.
    if 32 <= ucs < 0x7f:
        return 1

    # C0/C1 control characters are -1 for compatibility with POSIX-like calls;
    # NUL (0) is deliberately excluded and continues to the table lookups.
    if (ucs and ucs < 32) or 0x07F <= ucs < 0x0A0:
        return -1

    _unicode_version = _wcmatch_version(unicode_version)

    # Zero width
    if _bisearch(ucs, ZERO_WIDTH[_unicode_version]):
        return 0

    # 1 or 2 width
    return 1 + _bisearch(ucs, WIDE_EASTASIAN[_unicode_version])
def wcswidth(pwcs, n=None, unicode_version='auto'):
    """
    Given a unicode string, return its printable length on a terminal.

    :param str pwcs: Measure width of given unicode string.
    :param int n: When ``n`` is None (default), return the length of the entire
        string, otherwise only the first ``n`` characters are measured.  A
        value greater than ``len(pwcs)`` measures the entire string.  This
        argument exists only for compatibility with the C POSIX function
        signature. It is suggested instead to use python's string slicing
        capability, ``wcswidth(pwcs[:n])``
    :param str unicode_version: An explicit definition of the unicode version
        level to use for determination, may be ``auto`` (default), which uses
        the Environment Variable, ``UNICODE_VERSION`` if defined, or the latest
        available unicode version, otherwise.
    :rtype: int
    :returns: The width, in cells, needed to display the first ``n`` characters
        of the unicode string ``pwcs``.  Returns ``-1`` as soon as a C0 or C1
        control character is measured.

    See :ref:`Specification` for details of cell measurement.
    """
    # this 'n' argument is a holdover for POSIX function.  It is clamped to the
    # length of the string so that an oversized value cannot index past the end.
    length = len(pwcs)
    end = length if n is None else min(n, length)

    # resolved lazily, only when a VS16 sequence is actually encountered
    _unicode_version = None
    width = 0
    idx = 0
    last_measured_char = None
    while idx < end:
        char = pwcs[idx]
        if char == u'\u200D':
            # Zero Width Joiner, do not measure this or next character
            idx += 2
            continue
        if char == u'\uFE0F' and last_measured_char:
            # on variation selector 16 (VS16) following another character,
            # conditionally add '1' to the measured width if that character is
            # known to be converted from narrow to wide by the VS16 character.
            if _unicode_version is None:
                _unicode_version = _wcversion_value(_wcmatch_version(unicode_version))
            if _unicode_version >= (9, 0, 0):
                width += _bisearch(ord(last_measured_char), VS16_NARROW_TO_WIDE["9.0.0"])
                # a second VS16 must not widen the same character again
                last_measured_char = None
            idx += 1
            continue
        # measure character at current index
        wcw = wcwidth(char, unicode_version)
        if wcw < 0:
            # early return -1 on C0 and C1 control characters
            return wcw
        if wcw > 0:
            # track last character measured to contain a cell, so that
            # subsequent VS-16 modifiers may be understood
            last_measured_char = char
        width += wcw
        idx += 1
    return width
@lru_cache(maxsize=128)
def _wcversion_value(ver_string):
    """
    Integer-mapped value of given dotted version string.

    :param str ver_string: Unicode version string, of form ``n.n.n``.
    :rtype: tuple(int)
    :returns: tuple of integers, one for each dotted component, such as
        ``(9, 0, 0)`` for ``'9.0.0'``.  Such tuples compare in version order.
    :raises ValueError: when any dotted component is not accepted by ``int()``.
    """
    return tuple(map(int, ver_string.split('.')))
@lru_cache(maxsize=8)
def _wcmatch_version(given_version):
    """
    Return nearest matching supported Unicode version level.

    If an exact match is not determined, the nearest lowest version level is
    returned.  For example, given supported levels ``4.1.0`` and ``5.0.0``,
    and a version string of ``4.9.9``, then ``4.1.0`` is selected and
    returned:

    >>> _wcmatch_version('4.9.9')
    '4.1.0'
    >>> _wcmatch_version('8.0')
    '8.0.0'
    >>> _wcmatch_version('1')
    '4.1.0'

    A warning is emitted in only two cases: when the value cannot be parsed
    as a dotted integer version (the latest level is then returned), and when
    the value is lower than every supported level (the earliest level is then
    returned).  A shortened form of a supported level, such as ``8.0`` for
    ``8.0.0``, is treated as an exact match and never warns.

    :param str given_version: given version for compare, may be ``auto``
        (default), to select Unicode Version from Environment Variable,
        ``UNICODE_VERSION``. If the environment variable is not set, then the
        latest is used.
    :rtype: str
    :returns: unicode string, or non-unicode ``str`` type for python 2
        when given ``version`` is also type ``str``.
    """
    # Design note: the choice to return the same type that is given certainly
    # complicates it for python 2 str-type, but allows us to define an api that
    # uses 'string-type' for unicode version level definitions, so all of our
    # example code works with all versions of python.
    #
    # That, along with the string-to-numeric and comparisons of earliest,
    # latest, matching, or nearest, greatly complicates this function.
    # Performance is somewhat curbed by memoization.
    _return_str = not _PY3 and isinstance(given_version, str)

    if _return_str:
        # avoid list-comprehension to work around a coverage issue:
        # https://github.com/nedbat/coveragepy/issues/753
        unicode_versions = list(map(lambda ucs: ucs.encode(), list_versions()))
    else:
        unicode_versions = list_versions()
    latest_version = unicode_versions[-1]

    if given_version in (u'auto', 'auto'):
        given_version = os.environ.get(
            'UNICODE_VERSION',
            'latest' if not _return_str else latest_version.encode())

    if given_version in (u'latest', 'latest'):
        # default match, when given as 'latest', use the most latest unicode
        # version specification level supported.
        return latest_version if not _return_str else latest_version.encode()

    if given_version in unicode_versions:
        # exact match, downstream has specified an explicit matching version
        # matching any value of list_versions().
        return given_version if not _return_str else given_version.encode()

    # The user's version is not supported by ours. We return the newest unicode
    # version level that we support below their given value.
    try:
        cmp_given = _wcversion_value(given_version)

    except ValueError:
        # submitted value raises ValueError in int(), warn and use latest.
        warnings.warn("UNICODE_VERSION value, {given_version!r}, is invalid. "
                      "Value should be in form of `integer[.]+', the latest "
                      "supported unicode version {latest_version!r} has been "
                      "inferred.".format(given_version=given_version,
                                         latest_version=latest_version))
        return latest_version if not _return_str else latest_version.encode()

    earliest_version = unicode_versions[0]
    cmp_earliest_version = _wcversion_value(earliest_version)

    # The loop further below only ever tests the *next* level for a match of
    # leading dotted pieces, so the earliest level is never tested there.  Test
    # it here, otherwise a shortened form of the earliest level would compare
    # as "lower" and emit a misleading warning.
    if cmp_given == cmp_earliest_version[:len(cmp_given)]:
        return earliest_version if not _return_str else earliest_version.encode()

    # given version is less than any available version, return earliest
    # version.
    if cmp_given <= cmp_earliest_version:
        # this probably isn't what you wanted, the oldest wcwidth.c you will
        # find in the wild is likely version 5 or 6, which we both support,
        # but it's better than not saying anything at all.
        warnings.warn("UNICODE_VERSION value, {given_version!r}, is lower "
                      "than any available unicode version. Returning lowest "
                      "version level, {earliest_version!r}".format(
                          given_version=given_version,
                          earliest_version=earliest_version))
        return earliest_version if not _return_str else earliest_version.encode()

    # create list of versions which are less than or equal to given version,
    # and return the tail value, which is the highest level we may support,
    # or the latest value we support, when completely unmatched or higher
    # than any supported version.
    #
    # the loop never runs to completion, every path through it returns.
    for idx, unicode_version in enumerate(unicode_versions):
        # look ahead to next value
        try:
            cmp_next_version = _wcversion_value(unicode_versions[idx + 1])
        except IndexError:
            # at end of list, return latest version
            return latest_version if not _return_str else latest_version.encode()

        # Maybe our given version has less parts, as in tuple(8, 0), than the
        # next compare version tuple(8, 0, 0). Test for an exact match by
        # comparison of only the leading dotted piece(s): (8, 0) == (8, 0).
        if cmp_given == cmp_next_version[:len(cmp_given)]:
            return unicode_versions[idx + 1]

        # Or, if any next value is greater than our given support level
        # version, return the current value in index.  Even though it must
        # be less than the given value, it's our closest possible match. That
        # is, 4.1 is returned for given 4.9.9, where 4.1 and 5.0 are available.
        if cmp_next_version > cmp_given:
            return unicode_version
    assert False, ("Code path unreachable", given_version, unicode_versions)  # pragma: no cover