Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/_wcwidth.py: 43%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2This is a python implementation of wcwidth() and wcswidth().
4https://github.com/jquast/wcwidth
6Derived from Markus Kuhn's C code,
8This is an implementation of wcwidth() and wcswidth() (defined in
9IEEE Std 1003.1-2001) for Unicode.
11http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
12http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
14In fixed-width output devices, Latin characters all occupy a single
15"cell" position of equal width, whereas ideographic CJK characters
16occupy two such cells. Interoperability between terminal-line
17applications and (teletype-style) character terminals using the
18UTF-8 encoding requires agreement on which character should advance
19the cursor by how many cell positions. No established formal
20standards exist at present on which Unicode character shall occupy
21how many cell positions on character terminals. These routines are
22a first attempt at defining such behavior based on simple rules
23applied to data provided by the Unicode Consortium.
25For some graphical characters, the Unicode standard explicitly
26defines a character-cell width via the definition of the East Asian
27FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
28In all these cases, there is no ambiguity about which width a
29terminal shall use. For characters in the East Asian Ambiguous (A)
30class, the width choice depends purely on a preference of backward
31compatibility with either historic CJK or Western practice.
32Choosing single-width for these characters is easy to justify as
33the appropriate long-term solution, as the CJK practice of
34displaying these characters as double-width comes from historic
35implementation simplicity (8-bit encoded characters were displayed
36single-width and 16-bit ones double-width, even for Greek,
37Cyrillic, etc.) and not any typographic considerations.
39Much less clear is the choice of width for the Not East Asian
40(Neutral) class. Existing practice does not dictate a width for any
41of these characters. It would nevertheless make sense
42typographically to allocate two character cells to characters such
43as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
44represented adequately with a single-width glyph. The following
45routines at present merely assign a single-cell width to all
46neutral characters, in the interest of simplicity. This is not
47entirely satisfactory and should be reconsidered before
48establishing a formal standard in this area. At the moment, the
49decision which Not East Asian (Neutral) characters should be
50represented by double-width glyphs cannot yet be answered by
51applying a simple rule from the Unicode database content. Setting
52up a proper standard for the behavior of UTF-8 character terminals
53will require a careful analysis not only of each Unicode character,
54but also of each presentation form, something the author of these
55routines has so far avoided doing.
57http://www.unicode.org/unicode/reports/tr11/
59Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
60"""
62from __future__ import annotations
64# std imports
65from functools import lru_cache
67# local
68from .bisearch import bisearch
69from ._constants import _LATEST_VERSION, _AMBIGUOUS_TABLE, _ZERO_WIDTH_TABLE, _WIDE_EASTASIAN_TABLE
@lru_cache(maxsize=128)
def _wcversion_value(ver_string: str) -> tuple[int, ...]:  # pragma: no cover
    """
    Convert a dotted version string into a tuple of integers.

    .. deprecated:: 0.3.0

        wcwidth itself no longer calls this function; it is kept only so
        that external tools relying on it continue to work.

    :param ver_string: Unicode version string, of form ``n.n.n``.
    :returns: one ``int`` per dot-separated component, in order, for
        example ``'13.0.0'`` becomes ``(13, 0, 0)``.
    :raises ValueError: if any dot-separated component is not an integer.
    """
    return tuple(int(component) for component in ver_string.split('.'))
@lru_cache(maxsize=8)
def _wcmatch_version(given_version: str) -> str:  # pylint: disable=unused-argument
    """
    Report the Unicode version level supported by this module.

    .. deprecated:: 0.3.0

        The argument is no longer consulted: the result is always the
        module-level ``_LATEST_VERSION`` constant.

    wcwidth itself no longer calls this function; it is kept only so that
    external tools relying on it continue to work.

    :param given_version: Ignored. Any value is accepted for compatibility.
    :returns: The latest unicode version string.
    """
    return _LATEST_VERSION
106# maxsize=1024: western scripts need ~64 unique codepoints per session, but
107# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates
108# heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss.
@lru_cache(maxsize=1024)
def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int:  # pylint: disable=unused-argument
    r"""
    Return the number of terminal cells needed to display one codepoint.

    :param wc: A single Unicode character. A falsy value (such as the empty
        string) is measured as codepoint 0.
    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0

            Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts
        where ambiguous characters display as double-width. Only the value
        ``2`` changes the result. See :ref:`ambiguous_width` for details.
    :returns: The width, in cells, necessary to display ``wc``:

        - ``-1`` for C0 and C1 control characters, whose effect on the
          terminal is indeterminate;
        - ``0`` for characters with no printable effect on a terminal
          (such as NUL '\0') and other zero-width codepoints;
        - ``2`` for East Asian Wide and Fullwidth characters, and for
          Ambiguous characters when ``ambiguous_width`` is ``2``;
        - ``1`` for everything else.

    See :ref:`Specification` for details of cell measurement.
    """
    codepoint = ord(wc) if wc else 0

    # Fast path: printable ASCII is always one cell. Answering here, before
    # any table lookup, is a large win for mostly-ASCII text and costs other
    # input almost nothing.
    if 0x20 <= codepoint <= 0x7E:
        return 1

    # C0 controls (NUL excluded), DEL and C1 controls report -1, for
    # compatibility with POSIX-like calls.
    if 0 < codepoint < 0x20 or 0x7F <= codepoint <= 0x9F:
        return -1

    # Zero-width codepoints are checked before the wide tables.
    if bisearch(codepoint, _ZERO_WIDTH_TABLE):
        return 0

    # Wide (F/W) always occupies two cells; Ambiguous (A) is only consulted,
    # and only counts as wide, when the caller asked for ambiguous_width=2.
    occupies_two_cells = bisearch(codepoint, _WIDE_EASTASIAN_TABLE) or (
        ambiguous_width == 2 and bisearch(codepoint, _AMBIGUOUS_TABLE)
    )
    return 2 if occupies_two_cells else 1